diff options
author | Karen Arutyunov <karen@codesynthesis.com> | 2020-02-26 17:16:45 +0300 |
---|---|---|
committer | Karen Arutyunov <karen@codesynthesis.com> | 2020-02-26 17:17:49 +0300 |
commit | 5ae9686adac1508873f2d980e84becd3496244c2 (patch) | |
tree | d7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8 /libbutl/unicode.mxx | |
parent | afb726d2d59b3715960a8647738860f40e37cf4f (diff) |
Add notion of validator to char_scanner and make sure manifest is UTF-8
This involves implementing utf8_validator and UTF-8 utility functions and
using them during the manifest parsing, serialization, and rewriting.
Diffstat (limited to 'libbutl/unicode.mxx')
-rw-r--r-- | libbutl/unicode.mxx | 82 |
1 files changed, 82 insertions, 0 deletions
diff --git a/libbutl/unicode.mxx b/libbutl/unicode.mxx new file mode 100644 index 0000000..b846476 --- /dev/null +++ b/libbutl/unicode.mxx @@ -0,0 +1,82 @@ +// file : libbutl/unicode.mxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules_ts +#pragma once +#endif + +// C includes. + +#ifndef __cpp_lib_modules_ts +#include <string> +#include <ostream> +#include <cstdint> // uint16_t +#endif + +// Other includes. + +#ifdef __cpp_modules_ts +export module butl.unicode; +#ifdef __cpp_lib_modules_ts +import std.core; +import std.io; +#endif +#endif + +#include <libbutl/export.hxx> + +LIBBUTL_MODEXPORT namespace butl +{ + // Note that the Unicode Standard requires the surrogates ([D800 DFFF]) to + // only be used in the context of the UTF-16 character encoding form. Thus, + // we omit the surrogate codepoint type and assume surrogates as invalid + // codepoints. + // + enum class codepoint_types: std::uint16_t + { + // Useful to denote invalid codepoints or when building the type set + // incrementally. + // + none = 0x00, + + graphic = 0x01, // L(etter), M(ark), N(number), P(uncturation), + // S(symbol), Zs(separator, space) + format = 0x02, + control = 0x04, + private_use = 0x08, + non_character = 0x10, + reserved = 0x20, + + any = 0x3f + }; + + codepoint_types operator& (codepoint_types, codepoint_types); + codepoint_types operator| (codepoint_types, codepoint_types); + codepoint_types operator&= (codepoint_types&, codepoint_types); + codepoint_types operator|= (codepoint_types&, codepoint_types); + + // Return the codepoint type for a valid codepoint value and none otherwise. + // + // Note that the valid codepoint ranges are [0 D800) and (DFFF 10FFFF]. + // + codepoint_types + codepoint_type (char32_t); + + // Return the type name for a single codepoint type and empty string for + // `none` and `any`. + // + // Potential future improvements: + // - add the none value name parameter ("invalid" by default) + // - produce names for type masks ("graphic, format", "any", etc) + // + std::string + to_string (codepoint_types); + + inline std::ostream& + operator<< (std::ostream& os, codepoint_types ts) + { + return os << to_string (ts); + } +} + +#include <libbutl/unicode.ixx> |