aboutsummaryrefslogtreecommitdiff
path: root/libbutl/utf8.mxx
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2021-09-28 19:24:31 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2021-09-28 20:29:59 +0300
commitdf1ef68cd8e8582724ce1192bfc202e0b9aeaf0c (patch)
treeb731ca4c68e60c00c7e7d499dbf4868ee7b71f44 /libbutl/utf8.mxx
parent7a4fc37f264cdb67f2f83fa92703c869215bbc86 (diff)
Get rid of C++ modules related code and rename *.mxx files to *.hxx
Diffstat (limited to 'libbutl/utf8.mxx')
-rw-r--r--libbutl/utf8.mxx130
1 files changed, 0 insertions, 130 deletions
diff --git a/libbutl/utf8.mxx b/libbutl/utf8.mxx
deleted file mode 100644
index 15e8ded..0000000
--- a/libbutl/utf8.mxx
+++ /dev/null
@@ -1,130 +0,0 @@
-// file : libbutl/utf8.mxx -*- C++ -*-
-// license : MIT; see accompanying LICENSE file
-
-#ifndef __cpp_modules_ts
-#pragma once
-#endif
-
-// C includes.
-
-#ifndef __cpp_lib_modules_ts
-#include <string>
-#include <cstdint> // uint8_t
-#include <utility> // pair
-#endif
-
-// Other includes.
-
-#ifdef __cpp_modules_ts
-export module butl.utf8;
-#ifdef __cpp_lib_modules_ts
-import std.core;
-#endif
-import butl.unicode;
-#else
-#include <libbutl/unicode.mxx>
-#endif
-
-#include <libbutl/export.hxx>
-
-LIBBUTL_MODEXPORT namespace butl
-{
- // Here and below we will refer to bytes that encode a single Unicode
- // codepoint as "UTF-8 byte sequence" ("UTF-8 sequence" or "byte sequence"
- // for short) and a sequence of such sequences as "UTF-8 encoded byte
- // string" ("byte string" for short).
- //
-
- // Validate a UTF-8 encoded byte string one byte at a time. Optionally, also
- // validate that its decoded codepoints belong to the specified types or
- // codepoint whitelist.
- //
- class utf8_validator
- {
- public:
- // Note: the whitelist is used via shallow copy (only the pointer is stored).
- //
- explicit
- utf8_validator (codepoint_types = codepoint_types::any,
- const char32_t* whitelist = nullptr);
-
- // Validate the next byte returning true if it is valid (first) and
- // whether it is the last byte of a codepoint (second). The {false, true}
- // result indicates a byte sequence decoded into a codepoint of undesired
- // type rather than an invalid byte that happens to be the last in the
- // sequence (and may well be a valid starting byte of the next sequence).
- //
- // Note that in case the byte is invalid, calling this function again
- // without recovery is illegal.
- //
- std::pair<bool, bool>
- validate (char);
-
- // As above but in case of an invalid byte also return the description of
- // why it is invalid.
- //
- // Note that the description only contains the reason why the specified
- // byte is not part of a valid UTF-8 sequence or the desired codepoint
- // type, for example:
- //
- // "invalid UTF-8 sequence first byte (0xB0)"
- // "invalid Unicode codepoint (reserved)"
- //
- // It can be used to form complete diagnostics along these lines:
- //
- // cerr << "invalid manifest value " << name << ": " << what << endl;
- //
- std::pair<bool, bool>
- validate (char, std::string& what);
-
- // As above but decide whether the description is needed at runtime (what
- // may be NULL).
- //
- std::pair<bool, bool>
- validate (char, std::string* what);
-
- // Recover from an invalid byte.
- //
- // This function must be called with the first invalid and then subsequent
- // bytes until it signals that the specified byte is valid. Note that it
- // shall not be called if the sequence is decoded into a codepoint of an
- // undesired type.
- //
- // Note also that a byte being invalid in the middle of a UTF-8 sequence
- // may be valid as a first byte of the next sequence.
- //
- std::pair<bool, bool>
- recover (char);
-
- // Return the codepoint of the last byte sequence.
- //
- // This function can only be legally called after validate() or recover()
- // signal that the preceding byte is valid and last.
- //
- char32_t
- codepoint () const;
-
- private:
- codepoint_types types_;
- const char32_t* whitelist_;
-
- // State machine.
- //
- uint8_t seq_size_; // [1 4]; calculated at the first byte validation.
- uint8_t seq_index_ = 0; // [0 3]
-
- // Last byte sequence decoded codepoint (built incrementally).
- //
- char32_t codepoint_;
-
- // The byte range a valid UTF-8 sequence second byte must belong to as
- // calculated during the first byte validation.
- //
- // Note that the subsequent (third and fourth) bytes must belong to the
- // [80 BF] range regardless of the previous bytes.
- //
- std::pair<unsigned char, unsigned char> byte2_range_;
- };
-}
-
-#include <libbutl/utf8.ixx>