aboutsummaryrefslogtreecommitdiff
path: root/libbutl/unicode.mxx
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2020-02-26 17:16:45 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2020-02-26 17:17:49 +0300
commit5ae9686adac1508873f2d980e84becd3496244c2 (patch)
treed7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8 /libbutl/unicode.mxx
parentafb726d2d59b3715960a8647738860f40e37cf4f (diff)
Add notion of validator to char_scanner and make sure manifest is UTF-8
This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting.
Diffstat (limited to 'libbutl/unicode.mxx')
-rw-r--r--libbutl/unicode.mxx82
1 files changed, 82 insertions, 0 deletions
diff --git a/libbutl/unicode.mxx b/libbutl/unicode.mxx
new file mode 100644
index 0000000..b846476
--- /dev/null
+++ b/libbutl/unicode.mxx
@@ -0,0 +1,82 @@
+// file : libbutl/unicode.mxx -*- C++ -*-
+// license : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules_ts
+#pragma once
+#endif
+
+// C includes.
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#include <ostream>
+#include <cstdint> // uint16_t
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules_ts
+export module butl.unicode;
+#ifdef __cpp_lib_modules_ts
+import std.core;
+import std.io;
+#endif
+#endif
+
+#include <libbutl/export.hxx>
+
+LIBBUTL_MODEXPORT namespace butl
+{
+ // Note that the Unicode Standard requires the surrogates ([D800 DFFF]) to
+ // only be used in the context of the UTF-16 character encoding form. Thus,
+ // we omit the surrogate codepoint type and assume surrogates as invalid
+ // codepoints.
+ //
+ enum class codepoint_types: std::uint16_t
+ {
+ // Useful to denote invalid codepoints or when building the type set
+ // incrementally.
+ //
+ none = 0x00,
+
+ graphic = 0x01, // L(etter), M(ark), N(number), P(uncturation),
+ // S(symbol), Zs(separator, space)
+ format = 0x02,
+ control = 0x04,
+ private_use = 0x08,
+ non_character = 0x10,
+ reserved = 0x20,
+
+ any = 0x3f
+ };
+
+ codepoint_types operator& (codepoint_types, codepoint_types);
+ codepoint_types operator| (codepoint_types, codepoint_types);
+ codepoint_types operator&= (codepoint_types&, codepoint_types);
+ codepoint_types operator|= (codepoint_types&, codepoint_types);
+
+ // Return the codepoint type for a valid codepoint value and none otherwise.
+ //
+ // Note that the valid codepoint ranges are [0 D800) and (DFFF 10FFFF].
+ //
+ codepoint_types
+ codepoint_type (char32_t);
+
+ // Return the type name for a single codepoint type and empty string for
+ // `none` and `any`.
+ //
+ // Potential future improvements:
+ // - add the none value name parameter ("invalid" by default)
+ // - produce names for type masks ("graphic, format", "any", etc)
+ //
+ std::string
+ to_string (codepoint_types);
+
+ inline std::ostream&
+ operator<< (std::ostream& os, codepoint_types ts)
+ {
+ return os << to_string (ts);
+ }
+}
+
+#include <libbutl/unicode.ixx>