diff options
author | Karen Arutyunov <karen@codesynthesis.com> | 2020-02-26 17:16:45 +0300 |
---|---|---|
committer | Karen Arutyunov <karen@codesynthesis.com> | 2020-02-26 17:17:49 +0300 |
commit | 5ae9686adac1508873f2d980e84becd3496244c2 (patch) | |
tree | d7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8 /libbutl/utility.ixx | |
parent | afb726d2d59b3715960a8647738860f40e37cf4f (diff) |
Add notion of validator to char_scanner and make sure manifest is UTF-8
This involves implementing utf8_validator and UTF-8 utility functions and
using them during the manifest parsing, serialization, and rewriting.
Diffstat (limited to 'libbutl/utility.ixx')
-rw-r--r-- | libbutl/utility.ixx | 95 |
1 file changed, 69 insertions, 26 deletions
diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx index c5fdbac..27ef7fb 100644 --- a/libbutl/utility.ixx +++ b/libbutl/utility.ixx @@ -2,8 +2,11 @@ // license : MIT; see accompanying LICENSE file #ifndef __cpp_lib_modules_ts -#include <cstdlib> // getenv() -#include <algorithm> +#include <cctype> // toupper(), tolower(), is*() +#include <cwctype> // isw*() +#include <cstdlib> // getenv() +#include <algorithm> // for_each() +#include <stdexcept> // invalid_argument #endif namespace butl @@ -216,44 +219,84 @@ namespace butl return sanitize_identifier (std::string (s)); } - inline codepoint_types - operator&= (codepoint_types& x, codepoint_types y) + inline bool + eof (std::istream& is) { - return x = static_cast<codepoint_types> ( - static_cast<std::uint16_t> (x) & - static_cast<std::uint16_t> (y)); + if (!is.fail ()) + return false; + + if (is.eof ()) + return true; + + throw std::istream::failure (""); } - inline codepoint_types - operator|= (codepoint_types& x, codepoint_types y) + inline optional<std::size_t> + utf8_length_impl (const std::string& s, + std::string* what, + codepoint_types ts, + const char32_t* wl) { - return x = static_cast<codepoint_types> ( - static_cast<std::uint16_t> (x) | - static_cast<std::uint16_t> (y)); + using namespace std; + + // Optimize for an empty string. + // + if (s.empty ()) + return 0; + + size_t r (0); + pair<bool, bool> v; + utf8_validator val (ts, wl); + + for (char c: s) + { + v = val.validate (c, what); + + if (!v.first) // Invalid byte? + return nullopt; + + if (v.second) // Last byte in the sequence? + ++r; + } + + // Make sure that the last UTF-8 sequence is complete. 
+ // + if (!v.second) + { + if (what != nullptr) + *what = "incomplete UTF-8 sequence"; + + return nullopt; + } + + return r; } - inline codepoint_types - operator& (codepoint_types x, codepoint_types y) + inline std::size_t + utf8_length (const std::string& s, codepoint_types ts, const char32_t* wl) { - return x &= y; + using namespace std; + + string what; + if (optional<size_t> r = utf8_length_impl (s, &what, ts, wl)) + return *r; + + throw invalid_argument (what); } - inline codepoint_types - operator| (codepoint_types x, codepoint_types y) + inline bool + utf8 (const std::string& s, + std::string& what, + codepoint_types ts, + const char32_t* wl) { - return x |= y; + return utf8_length_impl (s, &what, ts, wl).has_value (); } inline bool - eof (std::istream& is) + utf8 (const std::string& s, codepoint_types ts, const char32_t* wl) { - if (!is.fail ()) - return false; - - if (is.eof ()) - return true; - - throw std::istream::failure (""); + return utf8_length_impl (s, nullptr, ts, wl).has_value (); } inline optional<std::string> |