diff options
author | Karen Arutyunov <karen@codesynthesis.com> | 2020-02-26 17:16:45 +0300 |
---|---|---|
committer | Karen Arutyunov <karen@codesynthesis.com> | 2020-02-26 17:17:49 +0300 |
commit | 5ae9686adac1508873f2d980e84becd3496244c2 (patch) | |
tree | d7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8 /libbutl/unicode.ixx | |
parent | afb726d2d59b3715960a8647738860f40e37cf4f (diff) |
Add notion of validator to char_scanner and make sure manifest is UTF-8
This involves implementing utf8_validator and UTF-8 utility functions and
using them during the manifest parsing, serialization, and rewriting.
Diffstat (limited to 'libbutl/unicode.ixx')
-rw-r--r-- | libbutl/unicode.ixx | 72 |
1 files changed, 72 insertions, 0 deletions
diff --git a/libbutl/unicode.ixx b/libbutl/unicode.ixx new file mode 100644 index 0000000..cba4fd2 --- /dev/null +++ b/libbutl/unicode.ixx @@ -0,0 +1,72 @@ +// file : libbutl/unicode.ixx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +namespace butl +{ + inline codepoint_types + operator&= (codepoint_types& x, codepoint_types y) + { + return x = static_cast<codepoint_types> ( + static_cast<std::uint16_t> (x) & + static_cast<std::uint16_t> (y)); + } + + inline codepoint_types + operator|= (codepoint_types& x, codepoint_types y) + { + return x = static_cast<codepoint_types> ( + static_cast<std::uint16_t> (x) | + static_cast<std::uint16_t> (y)); + } + + inline codepoint_types + operator& (codepoint_types x, codepoint_types y) + { + return x &= y; + } + + inline codepoint_types + operator| (codepoint_types x, codepoint_types y) + { + return x |= y; + } + + LIBBUTL_SYMEXPORT codepoint_types + codepoint_type_lookup (char32_t); + + inline codepoint_types + codepoint_type (char32_t c) + { + // Optimize for the common case (printable ASCII characters). + // + if (c >= 0x20 && c <= 0x7E) // Printable ASCII? + return codepoint_types::graphic; + else if (c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) // Invalid? + return codepoint_types::none; + else if ((c & 0xFFFF) >= 0xFFFE) // Non-range based? + return codepoint_types::non_character; + else + return codepoint_type_lookup (c); + } + + inline std::string + to_string (codepoint_types t) + { + // Note that we use the terms from the Unicode standard ("private-use" + // rather than "private use", "noncharacter" rather than "non-character"). + // + switch (t) + { + case codepoint_types::graphic: return "graphic"; + case codepoint_types::format: return "format"; + case codepoint_types::control: return "control"; + case codepoint_types::private_use: return "private-use"; + case codepoint_types::non_character: return "noncharacter"; // No dash. + case codepoint_types::reserved: return "reserved"; + case codepoint_types::none: + case codepoint_types::any: return ""; + } + + return ""; // Types combination. + } +} |