From 5ae9686adac1508873f2d980e84becd3496244c2 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Wed, 26 Feb 2020 17:16:45 +0300 Subject: Add notion of validator to char_scanner and make sure manifest is UTF-8 This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting. --- libbutl/unicode.mxx | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 libbutl/unicode.mxx (limited to 'libbutl/unicode.mxx') diff --git a/libbutl/unicode.mxx b/libbutl/unicode.mxx new file mode 100644 index 0000000..b846476 --- /dev/null +++ b/libbutl/unicode.mxx @@ -0,0 +1,82 @@ +// file : libbutl/unicode.mxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules_ts +#pragma once +#endif + +// C includes. + +#ifndef __cpp_lib_modules_ts +#include <string> +#include <ostream> +#include <cstdint> // uint16_t +#endif + +// Other includes. + +#ifdef __cpp_modules_ts +export module butl.unicode; +#ifdef __cpp_lib_modules_ts +import std.core; +import std.io; +#endif +#endif + +#include <libbutl/export.hxx> + +LIBBUTL_MODEXPORT namespace butl +{ + // Note that the Unicode Standard requires the surrogates ([D800 DFFF]) to + // only be used in the context of the UTF-16 character encoding form. Thus, + // we omit the surrogate codepoint type and assume surrogates as invalid + // codepoints. + // + enum class codepoint_types: std::uint16_t + { + // Useful to denote invalid codepoints or when building the type set + // incrementally. 
+ // + none = 0x00, + + graphic = 0x01, // L(etter), M(ark), N(umber), P(unctuation), + // S(ymbol), Zs(separator, space) + format = 0x02, + control = 0x04, + private_use = 0x08, + non_character = 0x10, + reserved = 0x20, + + any = 0x3f + }; + + codepoint_types operator& (codepoint_types, codepoint_types); + codepoint_types operator| (codepoint_types, codepoint_types); + codepoint_types operator&= (codepoint_types&, codepoint_types); + codepoint_types operator|= (codepoint_types&, codepoint_types); + + // Return the codepoint type for a valid codepoint value and none otherwise. + // + // Note that the valid codepoint ranges are [0 D800) and (DFFF 10FFFF]. + // + codepoint_types + codepoint_type (char32_t); + + // Return the type name for a single codepoint type and empty string for + // `none` and `any`. + // + // Potential future improvements: + // - add the none value name parameter ("invalid" by default) + // - produce names for type masks ("graphic, format", "any", etc) + // + std::string + to_string (codepoint_types); + + inline std::ostream& + operator<< (std::ostream& os, codepoint_types ts) + { + return os << to_string (ts); + } +} + +#include <libbutl/unicode.ixx> -- cgit v1.1