Add notion of validator to char_scanner and make sure manifest is UTF-8

This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting.
author: Karen Arutyunov <karen@codesynthesis.com> 2020-02-26 17:16:45 +0300
committer: Karen Arutyunov <karen@codesynthesis.com> 2020-02-26 17:17:49 +0300
commit: 5ae9686adac1508873f2d980e84becd3496244c2 (patch)
tree: d7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8
parent: afb726d2d59b3715960a8647738860f40e37cf4f (diff)
24 files changed, 1696 insertions, 596 deletions
diff --git a/libbutl/char-scanner.cxx b/libbutl/char-scanner.cxx
deleted file mode 100644
index 85416e5..0000000
--- a/libbutl/char-scanner.cxx
+++ /dev/null
@@ -1,126 +0,0 @@
-// file      : libbutl/char-scanner.cxx -*- C++ -*-
-// license   : MIT; see accompanying LICENSE file
-
-#ifndef __cpp_modules_ts
-#include <libbutl/char-scanner.mxx>
-#endif
-
-// C includes.
-
-#ifndef __cpp_lib_modules_ts
-#include <string>  // char_traits
-#include <cstdint> // uint64_t
-#include <istream>
-#endif
-
-// Other includes.
-
-#ifdef __cpp_modules_ts
-module butl.char_scanner;
-
-// Only imports additional to interface.
-#ifdef __clang__
-#ifdef __cpp_lib_modules_ts
-import std.core;
-import std.io;
-#endif
-import butl.fdstream;
-#endif
-
-#endif
-
-using namespace std;
-
-namespace butl
-{
-  char_scanner::
-  char_scanner (istream& is, bool crlf, uint64_t l, uint64_t p)
-      : line (l),
-        column (1),
-        position (p),
-        is_ (is),
-        buf_ (dynamic_cast<fdbuf*> (is.rdbuf ())),
-        gptr_ (nullptr),
-        egptr_ (nullptr),
-        crlf_ (crlf)
-  {
-  }
-
-  auto char_scanner::
-  peek () -> xchar
-  {
-    if (unget_)
-      return ungetc_;
-
-    if (unpeek_)
-      return unpeekc_;
-
-    if (eos_)
-      return xchar (xchar::traits_type::eof (), line, column, position);
-
-    int_type v (peek_ ());
-
-    if (v == xchar::traits_type::eof ())
-      eos_ = true;
-    else if (crlf_ && v == '\r')
-    {
-      int_type v1;
-      do
-      {
-        get_ ();
-        v1 = peek_ ();
-      }
-      while (v1 == '\r');
-
-      if (v1 != '\n')
-      {
-        // We need to make sure subsequent calls to peek() return newline.
-        //
-        unpeek_ = true;
-        unpeekc_ = xchar ('\n', line, column, position);
-
-        if (v1 == xchar::traits_type::eof ())
-          eos_ = true;
-      }
-
-      v = '\n';
-    }
-
-    return xchar (v, line, column, position);
-  }
-
-  void char_scanner::
-  get (const xchar& c)
-  {
-    if (unget_)
-      unget_ = false;
-    else
-    {
-      if (unpeek_)
-      {
-        unpeek_ = false;
-      }
-      // When is_.get () returns eof, the failbit is also set (stupid,
-      // isn't?) which may trigger an exception. To work around this
-      // we will call peek() first and only call get() if it is not
-      // eof. But we can only call peek() on eof once; any subsequent
-      // calls will spoil the failbit (even more stupid).
-      //
-      else if (!eos (c))
-        get_ ();
-
-      if (!eos (c))
-      {
-        if (c == '\n')
-        {
-          line++;
-          column = 1;
-        }
-        else
-          column++;
-
-        position = pos_ ();
-      }
-    }
-  }
-}
diff --git a/libbutl/char-scanner.ixx b/libbutl/char-scanner.ixx
index 36cc93d..7e9c4b0 100644
--- a/libbutl/char-scanner.ixx
+++ b/libbutl/char-scanner.ixx
@@ -3,8 +3,30 @@
 
 namespace butl
 {
-  inline auto char_scanner::
-  get () -> xchar
+  template <typename V>
+  inline char_scanner<V>::
+  char_scanner (std::istream& is, bool crlf, std::uint64_t l, std::uint64_t p)
+      : char_scanner (is, validator_type (), crlf, l, p)
+  {
+  }
+
+  template <typename V>
+  inline auto char_scanner<V>::
+  peek (std::string& what) -> xchar
+  {
+    return peek (&what);
+  }
+
+  template <typename V>
+  inline auto char_scanner<V>::
+  peek () -> xchar
+  {
+    return peek (nullptr /* what */);
+  }
+
+  template <typename V>
+  inline auto char_scanner<V>::
+  get (std::string* what) -> xchar
   {
     if (unget_)
     {
@@ -13,13 +35,28 @@ namespace butl
     }
     else
     {
-      xchar c (peek ());
+      xchar c (peek (what));
       get (c);
       return c;
     }
   }
 
-  inline void char_scanner::
+  template <typename V>
+  inline auto char_scanner<V>::
+  get (std::string& what) -> xchar
+  {
+    return get (&what);
+  }
+
+  template <typename V>
+  inline auto char_scanner<V>::
+  get () -> xchar
+  {
+    return get (nullptr /* what */);
+  }
+
+  template <typename V>
+  inline void char_scanner<V>::
   unget (const xchar& c)
   {
     // Because iostream::unget cannot work once eos is reached, we have to
@@ -29,7 +66,8 @@ namespace butl
     ungetc_ = c;
   }
 
-  inline auto char_scanner::
+  template <typename V>
+  inline auto char_scanner<V>::
   peek_ () -> int_type
   {
     if (gptr_ != egptr_)
@@ -48,7 +86,8 @@ namespace butl
     return r;
   }
 
-  inline void char_scanner::
+  template <typename V>
+  inline void char_scanner<V>::
   get_ ()
   {
     int_type c;
@@ -61,11 +100,14 @@ namespace butl
     else
       c = is_.get (); // About as fast as ignore() and way faster than tellg().
 
+    validated_ = false;
+
     if (save_ != nullptr && c != xchar::traits_type::eof ())
       save_->push_back (static_cast<char_type> (c));
   }
 
-  inline std::uint64_t char_scanner::
+  template <typename V>
+  inline std::uint64_t char_scanner<V>::
   pos_ () const
   {
     return buf_ != nullptr ? buf_->tellg () : 0;
diff --git a/libbutl/char-scanner.mxx b/libbutl/char-scanner.mxx
index 5ad3d61..e57245b 100644
--- a/libbutl/char-scanner.mxx
+++ b/libbutl/char-scanner.mxx
@@ -10,6 +10,8 @@
 #ifndef __cpp_lib_modules_ts
 #include <string>  // char_traits
 #include <cstdint> // uint64_t
+#include <climits> // INT_*
+#include <utility> // pair, make_pair()
 #include <istream>
 #endif
 
@@ -30,12 +32,26 @@ import butl.fdstream;
 
 LIBBUTL_MODEXPORT namespace butl
 {
+  // Refer to utf8_validator for details.
+  //
+  struct noop_validator
+  {
+    std::pair<bool, bool>
+    validate (char) {return std::make_pair (true, true);}
+
+    std::pair<bool, bool>
+    validate (char c, std::string&) {return validate (c);}
+  };
+
   // Low-level character stream scanner. Normally used as a base for
   // higher-level lexers.
   //
-  class LIBBUTL_SYMEXPORT char_scanner
+  template <typename V = noop_validator>
+  class char_scanner
   {
   public:
+    using validator_type = V;
+
     // If the crlf argument is true, then recognize Windows newlines (0x0D
     // 0x0A) and convert them to just '\n' (0x0A). Note that a standalone
     // 0x0D is treated "as if" it was followed by 0x0A and multiple 0x0D
@@ -49,7 +65,13 @@ LIBBUTL_MODEXPORT namespace butl
     // and position in the stream (useful when re-scanning data saved with the
     // save_* facility).
     //
-    char_scanner (std::istream& is,
+    char_scanner (std::istream&,
+                  bool crlf = true,
+                  std::uint64_t line = 1,
+                  std::uint64_t position = 0);
+
+    char_scanner (std::istream&,
+                  validator_type,
                   bool crlf = true,
                   std::uint64_t line = 1,
                   std::uint64_t position = 0);
@@ -62,10 +84,10 @@ LIBBUTL_MODEXPORT namespace butl
   public:
 
     // Extended character. It includes line/column/position information and is
-    // capable of representing EOF.
+    // capable of representing EOF and invalid characters.
     //
-    // Note that implicit conversion of EOF to char_type results in NUL
-    // character (which means in most cases it is safe to compare xchar to
+    // Note that implicit conversion of EOF/invalid to char_type results in
+    // NUL character (which means in most cases it is safe to compare xchar to
     // char without checking for EOF).
     //
     class xchar
@@ -76,6 +98,9 @@ LIBBUTL_MODEXPORT namespace butl
       using char_type = traits_type::char_type;
 
       int_type value;
+
+      // Note that the column is of the codepoint this byte belongs to.
+      //
       std::uint64_t line;
       std::uint64_t column;
 
@@ -84,9 +109,12 @@ LIBBUTL_MODEXPORT namespace butl
       //
       std::uint64_t position;
 
+      static int_type
+      invalid () {return traits_type::eof () != INT_MIN ? INT_MIN : INT_MAX;}
+
       operator char_type () const
       {
-        return value != traits_type::eof ()
+        return value != traits_type::eof () && value != invalid ()
           ? static_cast<char_type> (value)
           : char_type (0);
       }
@@ -98,27 +126,44 @@ LIBBUTL_MODEXPORT namespace butl
           : value (v), line (l), column (c), position (p) {}
     };
 
+    // Note that if any of the get() or peek() functions return an invalid
+    // character, then the scanning has failed and none of them should be
+    // called again.
+
     xchar
     get ();
 
+    // As above but in case of an invalid character also return the
+    // description of why it is invalid.
+    //
+    xchar
+    get (std::string& what);
+
     void
     get (const xchar& peeked); // Get previously peeked character (faster).
 
     void
     unget (const xchar&);
 
-    // Note that if there is an "ungot" character, peek() will return
-    // that.
+    // Note that if there is an "ungot" character, peek() will return that.
     //
     xchar
     peek ();
 
-    // Tests. In the future we can add tests line alpha(), alnum(),
-    // etc.
+    // As above but in case of an invalid character also return the
+    // description of why it is invalid.
+    //
+    xchar
+    peek (std::string& what);
+
+    // Tests. In the future we can add tests like alpha(), alnum(), etc.
     //
     static bool
     eos (const xchar& c) {return c.value == xchar::traits_type::eof ();}
 
+    static bool
+    invalid (const xchar& c) {return c.value == xchar::invalid ();}
+
     // Line, column and position of the next character to be extracted from
     // the stream by peek() or get().
     //
@@ -159,8 +204,8 @@ LIBBUTL_MODEXPORT namespace butl
     };
 
   protected:
-    using int_type = xchar::int_type;
-    using char_type = xchar::char_type;
+    using int_type  = typename xchar::int_type;
+    using char_type = typename xchar::char_type;
 
     int_type
     peek_ ();
@@ -171,11 +216,27 @@ LIBBUTL_MODEXPORT namespace butl
     std::uint64_t
     pos_ () const;
 
+    xchar
+    get (std::string* what);
+
+    xchar
+    peek (std::string* what);
+
   protected:
     std::istream& is_;
 
-    // Note that if you are reading from the buffer directly, then it is
-    // also your responsibility to save the data.
+    validator_type val_;
+    bool decoded_   = true;  // The peeked character is last byte of sequence.
+    bool validated_ = false; // The peeked character has been validated.
+
+    // Note that if you are reading from the buffer directly, then it is also
+    // your responsibility to call the validator and save the data (see
+    // save_*()).
+    //
+    // Besides that, make sure that the peek() call preceding the scan is
+    // followed by the get() call (see validated_, decoded_, and unpeek_ for
+    // the hairy details; realistically, you would probably only direct-scan
+    // ASCII fragments).
     //
     fdbuf* buf_; // NULL if not ifdstream.
     const char_type* gptr_;
@@ -195,3 +256,4 @@ LIBBUTL_MODEXPORT namespace butl
 }
 
 #include <libbutl/char-scanner.ixx>
+#include <libbutl/char-scanner.txx>
diff --git a/libbutl/char-scanner.txx b/libbutl/char-scanner.txx
new file mode 100644
index 0000000..d4e2082
--- /dev/null
+++ b/libbutl/char-scanner.txx
@@ -0,0 +1,146 @@
+// file      : libbutl/char-scanner.txx -*- C++ -*-
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_lib_modules_ts
+#include <utility> // move
+#endif
+
+namespace butl
+{
+  template <typename V>
+  char_scanner<V>::
+  char_scanner (std::istream& is,
+                validator_type v,
+                bool crlf,
+                std::uint64_t l,
+                std::uint64_t p)
+      : line (l),
+        column (1),
+        position (p),
+        is_ (is),
+        val_ (std::move (v)),
+        buf_ (dynamic_cast<fdbuf*> (is.rdbuf ())),
+        gptr_ (nullptr),
+        egptr_ (nullptr),
+        crlf_ (crlf)
+  {
+  }
+
+  template <typename V>
+  auto char_scanner<V>::
+  peek (std::string* what) -> xchar
+  {
+    if (unget_)
+      return ungetc_;
+
+    if (unpeek_)
+      return unpeekc_;
+
+    if (eos_)
+      return xchar (xchar::traits_type::eof (), line, column, position);
+
+    int_type v (peek_ ());
+
+    if (v == xchar::traits_type::eof ())
+    {
+      if (!decoded_)
+      {
+        if (what != nullptr)
+          *what = "unexpected end of stream";
+
+        v = xchar::invalid ();
+      }
+
+      eos_ = true;
+    }
+    else
+    {
+      auto valid = [what, this] (int_type v)
+      {
+        if (validated_)
+          return true;
+
+        char c (xchar::traits_type::to_char_type (v));
+        std::pair<bool, bool> r (what != nullptr
+                                 ? val_.validate (c, *what)
+                                 : val_.validate (c));
+
+        decoded_ = r.second;
+        validated_ = true;
+        return r.first;
+      };
+
+      if (!valid (v))
+        v = xchar::invalid ();
+      else if (crlf_ && v == '\r')
+      {
+        // Note that '\r' is a valid character (otherwise we won't be here),
+        // so we don't validate it again below. We also postpone the
+        // validation of the next non-'\r' character (except EOF) until the
+        // next peek() call.
+        //
+        int_type v1;
+        do
+        {
+          get_ ();       // Sets validated_ to false.
+          v1 = peek_ ();
+        }
+        while (v1 == '\r');
+
+        if (v1 != '\n')
+        {
+          // We need to make sure subsequent calls to peek() return newline.
+          //
+          unpeek_ = true;
+          unpeekc_ = xchar ('\n', line, column, position);
+
+          // Note that the previous character is decoded ('\r') and so EOF is
+          // legitimate.
+          //
+          if (v1 == xchar::traits_type::eof ())
+            eos_ = true;
+        }
+
+        v = '\n';
+      }
+    }
+
+    return xchar (v, line, column, position);
+  }
+
+  template <typename V>
+  void char_scanner<V>::
+  get (const xchar& c)
+  {
+    if (unget_)
+      unget_ = false;
+    else
+    {
+      if (unpeek_)
+      {
+        unpeek_ = false;
+      }
+      // When is_.get () returns eof, the failbit is also set (stupid,
+      // isn't it?) which may trigger an exception. To work around this
+      // we will call peek() first and only call get() if it is not
+      // eof. But we can only call peek() on eof once; any subsequent
+      // calls will spoil the failbit (even more stupid).
+      //
+      else if (!eos (c))
+        get_ ();
+
+      if (!eos (c))
+      {
+        if (c == '\n')
+        {
+          line++;
+          column = 1;
+        }
+        else if (decoded_) // The character is the last in a sequence?
+          column++;
+
+        position = pos_ ();
+      }
+    }
+  }
+}
diff --git a/libbutl/manifest-parser.cxx b/libbutl/manifest-parser.cxx
index 4de59b7..9514bbd 100644
--- a/libbutl/manifest-parser.cxx
+++ b/libbutl/manifest-parser.cxx
@@ -89,7 +89,7 @@ namespace butl
     parse_name (r);
 
     skip_spaces ();
-    c = get ();
+    c = get ("manifest");
 
     if (eos (c))
     {
@@ -117,7 +117,7 @@ namespace butl
     skip_spaces ();
     parse_value (r);
 
-    c = peek ();
+    c = peek ("manifest");
 
     // The character after the value should be either a newline or eos.
     //
@@ -126,7 +126,7 @@ namespace butl
     r.end_pos = c.position;
 
     if (c == '\n')
-      get ();
+      get (c);
 
     // Now figure out whether what we've got makes sense, depending
     // on the state we are in.
@@ -217,6 +217,8 @@ namespace butl
   void manifest_parser::
   parse_name (name_value& r)
   {
+    auto peek = [this] () {return manifest_parser::peek ("manifest name");};
+
     xchar c (peek ());
 
     r.name_line = c.line;
@@ -228,13 +230,19 @@ namespace butl
         break;
 
       r.name += c;
-      get ();
+      get (c);
     }
   }
 
   void manifest_parser::
   parse_value (name_value& r)
   {
+    auto peek = [this] () {return manifest_parser::peek ("manifest value");};
+
+    // Here we don't always track the last peeked character.
+    //
+    auto get = [this] () {manifest_parser::get ("manifest value");};
+
     xchar c (peek ());
 
     r.value_line = c.line;
@@ -408,6 +416,8 @@ namespace butl
   pair<manifest_parser::xchar, uint64_t> manifest_parser::
   skip_spaces ()
   {
+    auto peek = [this] () {return manifest_parser::peek ("manifest");};
+
     xchar c (peek ());
     bool start (c.column == 1);
     uint64_t lp (c.position);
@@ -437,12 +447,12 @@ namespace butl
           if (!start)
             return make_pair (c, lp);
 
-          get ();
+          get (c);
 
           // Read until newline or eos.
           //
           for (c = peek (); !eos (c) && c != '\n'; c = peek ())
-            get ();
+            get (c);
 
           continue;
         }
@@ -450,7 +460,7 @@ namespace butl
         return make_pair (c, lp); // Not a space.
       }
 
-      get ();
+      get (c);
     }
 
     return make_pair (c, lp);
diff --git a/libbutl/manifest-parser.ixx b/libbutl/manifest-parser.ixx
index e616ad9..bc5246c 100644
--- a/libbutl/manifest-parser.ixx
+++ b/libbutl/manifest-parser.ixx
@@ -3,6 +3,37 @@
 
 namespace butl
 {
+
+  inline auto manifest_parser::
+  get (const char* what) -> xchar
+  {
+    xchar c (base::get (ebuf_));
+
+    if (invalid (c))
+      throw manifest_parsing (name_,
+                              c.line, c.column,
+                              std::string ("invalid ") + what + ": " + ebuf_);
+    return c;
+  }
+
+  inline void manifest_parser::
+  get (const xchar& peeked)
+  {
+    base::get (peeked);
+  }
+
+  inline auto manifest_parser::
+  peek (const char* what) -> xchar
+  {
+    xchar c (base::peek (ebuf_));
+
+    if (invalid (c))
+      throw manifest_parsing (name_,
+                              c.line, c.column,
+                              std::string ("invalid ") + what + ": " + ebuf_);
+    return c;
+  }
+
   inline manifest_name_value manifest_parser::
   next ()
   {
diff --git a/libbutl/manifest-parser.mxx b/libbutl/manifest-parser.mxx
index adf6181..77addff 100644
--- a/libbutl/manifest-parser.mxx
+++ b/libbutl/manifest-parser.mxx
@@ -25,10 +25,12 @@ export module butl.manifest_parser;
 import std.core;
 import std.io;
 #endif
+import butl.utf8;
 import butl.optional;
 import butl.char_scanner;
 import butl.manifest_types;
 #else
+#include <libbutl/utf8.mxx>
 #include <libbutl/optional.mxx>
 #include <libbutl/char-scanner.mxx>
 #include <libbutl/manifest-types.mxx>
@@ -54,7 +56,8 @@ LIBBUTL_MODEXPORT namespace butl
     std::string description;
   };
 
-  class LIBBUTL_SYMEXPORT manifest_parser: protected butl::char_scanner
+  class LIBBUTL_SYMEXPORT manifest_parser:
+    protected char_scanner<utf8_validator>
   {
   public:
     // The filter, if specified, is called by next() prior to returning the
@@ -69,7 +72,10 @@ LIBBUTL_MODEXPORT namespace butl
     manifest_parser (std::istream& is,
                      const std::string& name,
                      std::function<filter_function> filter = {})
-      : char_scanner (is), name_ (name), filter_ (std::move (filter)) {}
+      : char_scanner (is,
+                      utf8_validator (codepoint_types::graphic, U"\n\r\t")),
+        name_ (name),
+        filter_ (std::move (filter)) {}
 
     const std::string&
     name () const {return name_;}
@@ -97,6 +103,8 @@ LIBBUTL_MODEXPORT namespace butl
     split_comment (const std::string&);
 
   private:
+    using base = char_scanner<utf8_validator>;
+
     void
     parse_next (manifest_name_value&);
 
@@ -114,12 +122,33 @@ LIBBUTL_MODEXPORT namespace butl
     std::pair<xchar, std::uint64_t>
     skip_spaces ();
 
+    // As base::get() but in case of an invalid character throws
+    // manifest_parsing.
+    //
+    xchar
+    get (const char* what);
+
+    // Get previously peeked character (faster).
+    //
+    void
+    get (const xchar&);
+
+    // As base::peek() but in case of an invalid character throws
+    // manifest_parsing.
+    //
+    xchar
+    peek (const char* what);
+
   private:
     const std::string name_;
     const std::function<filter_function> filter_;
 
     enum {start, body, end} s_ = start;
     std::string version_; // Current format version.
+
+    // Buffer for a get()/peek() potential error.
+    //
+    std::string ebuf_;
   };
 
   // Parse and return a single manifest. Throw manifest_parsing in case of an
diff --git a/libbutl/manifest-rewriter.cxx b/libbutl/manifest-rewriter.cxx
index ba0c866..e38d5f4 100644
--- a/libbutl/manifest-rewriter.cxx
+++ b/libbutl/manifest-rewriter.cxx
@@ -30,8 +30,10 @@ import butl.fdstream;
 import butl.manifest_types;
 #endif
 
+import butl.utility;             // utf8_length()
 import butl.manifest_serializer;
 #else
+#include <libbutl/utility.mxx>
 #include <libbutl/manifest-serializer.mxx>
 #endif
 
@@ -101,8 +103,16 @@ namespace butl
 
       manifest_serializer s (os, path_.string (), long_lines_);
 
+      // Note that the name can be surrounded with the ASCII whitespace
+      // characters and the start_pos refers to the first character in the
+      // line.
+      //
+      // Also note that we assume the already serialized name to be a valid
+      // UTF-8 byte string and so utf8_length() may not throw.
+      //
       s.write_value (nv.value,
-                     static_cast<size_t> (nv.colon_pos - nv.start_pos + 2));
+                     static_cast<size_t> (nv.colon_pos - nv.start_pos) -
+                     (nv.name.size () - utf8_length (nv.name)) + 2);
     }
 
     os << suffix;
@@ -128,15 +138,21 @@ namespace butl
     os << '\n';
 
     manifest_serializer s (os, path_.string (), long_lines_);
-    s.write_name (nv.name);
+    size_t n (s.write_name (nv.name));
 
     os << ':';
 
     if (!nv.value.empty ())
     {
       os << ' ';
+
+      // Note that the name can be surrounded with the ASCII whitespace
+      // characters and the start_pos refers to the first character in the
+      // line.
+      //
       s.write_value (nv.value,
-                     static_cast<size_t> (nv.colon_pos - nv.start_pos + 2));
+                     static_cast<size_t> (nv.colon_pos - nv.start_pos) -
+                     (nv.name.size () - n) + 2);
     }
 
     os << suffix;
diff --git a/libbutl/manifest-serializer.cxx b/libbutl/manifest-serializer.cxx
index 0a81478..6a26a15 100644
--- a/libbutl/manifest-serializer.cxx
+++ b/libbutl/manifest-serializer.cxx
@@ -30,6 +30,11 @@ import std.io;
 import butl.manifest_types;
 #endif
 
+import butl.utf8;
+import butl.utility;
+#else
+#include <libbutl/utf8.mxx>
+#include <libbutl/utility.mxx>
 #endif
 
 using namespace std;
@@ -86,13 +91,13 @@ namespace butl
           break;
         }
 
-        write_name (n);
+        size_t l (write_name (n));
         os_ << ':';
 
         if (!v.empty ())
         {
           os_ << ' ';
-          write_value (v, n.size () + 2);
+          write_value (v, l + 2);
         }
 
         os_ << endl;
@@ -111,6 +116,10 @@ namespace butl
     if (s_ == end)
       throw serialization (name_, "serialization after eos");
 
+    string what;
+    if (!utf8 (t, what, codepoint_types::graphic, U"\n\r\t"))
+      throw serialization (name_, "invalid comment: " + what);
+
     os_ << '#';
 
     if (!t.empty ())
@@ -144,7 +153,7 @@ namespace butl
     return r;
   }
 
-  void manifest_serializer::
+  size_t manifest_serializer::
   write_name (const string& n)
   {
     if (n.empty ())
@@ -153,43 +162,76 @@ namespace butl
     if (n[0] == '#')
       throw serialization (name_, "name starts with '#'");
 
+    size_t r (0);
+    pair<bool, bool> v;
+    utf8_validator val (codepoint_types::graphic, U"\n\r\t");
+
+    string what;
     for (char c: n)
     {
-      switch (c)
+      v = val.validate (c, what);
+
+      if (!v.first)
+        throw serialization (name_, "invalid name: " + what);
+
+      if (v.second) // Sequence last byte?
       {
-      case ' ':
-      case '\t':
-      case '\r':
-      case '\n': throw serialization (name_, "name contains whitespace");
-      case ':':  throw serialization (name_, "name contains ':'");
-      default:   break;
+        // Note: ASCII characters cannot be a part of a multi-byte sequence.
+        //
+        switch (c)
+        {
+        case ' ':
+        case '\t':
+        case '\r':
+        case '\n': throw serialization (name_, "name contains whitespace");
+        case ':':  throw serialization (name_, "name contains ':'");
+        default:   break;
+        }
+
+        ++r;
       }
     }
 
+    // Make sure that the last UTF-8 sequence is complete.
+    //
+    if (!v.second)
+      throw serialization (name_, "invalid name: incomplete UTF-8 sequence");
+
     os_ << n;
+    return r;
   }
 
   void manifest_serializer::
   write_value (const char* s, size_t n, size_t cl)
   {
+    utf8_validator val (codepoint_types::graphic, U"\n\r\t");
+
     char c ('\0');
+    bool b (true); // Begin of UTF-8 byte sequence.
 
-    // The idea is to break on the 77th character (i.e., write it
-    // on the next line) which means we have written 76 characters
+    // The idea is to break on the 77th codepoint (i.e., write it
+    // on the next line) which means we have written 76 codepoints
     // on this line plus 2 for '\' and '\n', which gives us 78.
     //
-    for (const char* e (s + n); s != e; s++, cl++)
+    string what;
+    for (const char* e (s + n); s != e; s++)
     {
       char pc (c);
       c = *s;
 
+      pair<bool, bool> v (val.validate (c, what));
+
+      if (!v.first)
+        throw serialization (name_, "invalid value: " + what);
+
       // Note that even the "hard" break (see below) is not that hard when it
       // comes to breaking the line right after the backslash. Doing so would
       // inject the redundant newline character, as the line-terminating
       // backslash would be escaped. So we delay breaking till the next
-      // non-backslash character.
+      // non-backslash character. We also delay until the beginning of a UTF-8
+      // sequence.
       //
-      if (pc != '\\' && !long_lines_)
+      if (pc != '\\' && b && !long_lines_)
       {
         bool br (false); // Break the line.
 
@@ -237,8 +279,18 @@ namespace butl
       }
 
       os_ << c;
+
+      b = v.second;
+
+      if (b)
+        ++cl;
     }
 
+    // Make sure that the last UTF-8 sequence is complete.
+    //
+    if (!b)
+      throw serialization (name_, "invalid value: incomplete UTF-8 sequence");
+
     // What comes next is always a newline. If the last character that
     // we have written is a backslash, escape it.
     //
@@ -256,7 +308,7 @@ namespace butl
 
     // Use the multi-line mode in any of the following cases:
     //
-    // - column offset is too large (say greater than 39 (78/2) characters; we
+    // - column offset is too large (say greater than 39 (78/2) codepoints; we
     //   cannot start on the next line since that would start the multi-line
     //   mode)
     // - value contains newlines
diff --git a/libbutl/manifest-serializer.mxx b/libbutl/manifest-serializer.mxx
index f114ffb..b73c255 100644
--- a/libbutl/manifest-serializer.mxx
+++ b/libbutl/manifest-serializer.mxx
@@ -60,7 +60,7 @@ LIBBUTL_MODEXPORT namespace butl
                                   const std::string& value);
 
     // Unless long_lines is true, break lines in values (including multi-line)
-    // so that their length does not exceed 78 characters (including '\n').
+    // so that their length does not exceed 78 codepoints (including '\n').
     //
     manifest_serializer (std::ostream& os,
                          const std::string& name,
@@ -108,23 +108,23 @@ LIBBUTL_MODEXPORT namespace butl
     void
     write_next (const std::string& name, const std::string& value);
 
-    // Validate and write a name.
+    // Validate and write a name and return its length in codepoints.
     //
-    void
+    size_t
     write_name (const std::string&);
 
     // Write a value assuming the current line already has the specified
-    // offset. If the resulting line length would be too large then the
-    // multi-line representation will be used. It is assumed that the name,
-    // followed by the colon, is already written.
+    // codepoint offset. If the resulting line length would be too large then
+    // the multi-line representation will be used. It is assumed that the
+    // name, followed by the colon, is already written.
     //
     void
     write_value (const std::string&, std::size_t offset);
 
     // Write the specified number of characters from the specified string
     // (assuming there are no newlines) split into multiple lines at or near
-    // the 78 characters boundary. Assume the current line already has the
-    // specified offset.
+    // the 78-codepoint boundary. Assume the current line already has the
+    // specified codepoint offset.
     //
     void
     write_value (const char* s, std::size_t n, std::size_t offset);
diff --git a/libbutl/standard-version.cxx b/libbutl/standard-version.cxx
index c27b064..a9f5eb8 100644
--- a/libbutl/standard-version.cxx
+++ b/libbutl/standard-version.cxx
@@ -41,6 +41,8 @@ using namespace std;
 
 namespace butl
 {
+  using std::to_string;
+
   // Parse uint64_t from the specified string starting at the specified
   // position and check the min/max constraints. If successful, save the
   // result, update the position to point to the next character, and return
diff --git a/libbutl/unicode.cxx b/libbutl/unicode.cxx
new file mode 100644
index 0000000..4219846
--- /dev/null
+++ b/libbutl/unicode.cxx
@@ -0,0 +1,165 @@
+// file      : libbutl/unicode.cxx -*- C++ -*-
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules_ts
+#include <libbutl/unicode.mxx>
+#endif
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#include <ostream>
+#include <cstdint>
+
+#include <cstddef>   // size_t
+#include <utility>   // pair
+#include <algorithm> // lower_bound()
+#endif
+
+#ifdef __cpp_modules_ts
+module butl.unicode;
+
+// Only imports additional to interface.
+#ifdef __clang__
+#ifdef __cpp_lib_modules_ts
+import std.core;
+import std.io;
+#endif
+#endif
+
+#endif
+
+using namespace std;
+
+namespace butl
+{
+  // Sorted arrays of the Unicode codepoint ranges corresponding to the
+  // codepoint types (see the Types of Code Points table in the Unicode 12.0
+  // Standard for details). Note that the codepoint type range lists (but not
+  // the ranges themselves) may overlap.
+  //
+  // Also note that the graphic type codepoints are numerous and scattered.
+  // Thus, we will consider a codepoint to be of the graphic type if it is not
+  // of any other type.
+  //
+  using codepoint_range = pair<char32_t, char32_t>;
+
+  static const codepoint_range cn_rs[] = // Control.
+  {
+    {0x00, 0x1F},
+    {0x7F, 0x9F}
+  };
+
+  static const codepoint_range fr_rs[] = // Format.
+  {
+    {0x000AD, 0x000AD},
+    {0x00600, 0x00605},
+    {0x0061C, 0x0061C},
+    {0x006DD, 0x006DD},
+    {0x0070F, 0x0070F},
+    {0x008E2, 0x008E2},
+    {0x0180E, 0x0180E},
+    {0x0200B, 0x0200F},
+    {0x0202A, 0x0202E},
+    {0x02060, 0x02064},
+    {0x02066, 0x0206F},
+    {0x0FEFF, 0x0FEFF},
+    {0x0FFF9, 0x0FFFB},
+    {0x110BD, 0x110BD},
+    {0x110CD, 0x110CD},
+    {0x13430, 0x13438},
+    {0x1BCA0, 0x1BCA3},
+    {0x1D173, 0x1D17A},
+    {0xE0001, 0xE0001},
+    {0xE0020, 0xE007F}
+  };
+
+  static const codepoint_range pr_rs[] = // Private-use.
+  {
+    {0x00E000, 0x00F8FF},
+    {0x0F0000, 0x10FFFF}
+  };
+
+  static const codepoint_range nc_rs[] = // Non-character.
+  {
+    {0xFDD0, 0xFDEF}
+  };
+
+  static const codepoint_range rs_rs[] = // Reserved.
+  {
+    {0x30000, 0xE0000},
+    {0xE0002, 0xE001F},
+    {0xE0080, 0xE00FF},
+    {0xE01F0, 0xEFFFF}
+  };
+
+  struct codepoint_type_ranges
+  {
+    codepoint_types type;
+    const codepoint_range* begin;
+    const codepoint_range* end;
+  };
+
+  static const codepoint_type_ranges ct_ranges[] =
+  {
+    {
+      codepoint_types::control,
+      cn_rs,
+      cn_rs + sizeof (cn_rs) / sizeof (*cn_rs)
+    },
+    {
+      codepoint_types::format,
+      fr_rs,
+      fr_rs + sizeof (fr_rs) / sizeof (*fr_rs)
+    },
+    {
+      codepoint_types::private_use,
+      pr_rs,
+      pr_rs + sizeof (pr_rs) / sizeof (*pr_rs)
+    },
+    {
+      codepoint_types::non_character,
+      nc_rs,
+      nc_rs + sizeof (nc_rs) / sizeof (*nc_rs)
+    },
+    {
+      codepoint_types::reserved,
+      rs_rs,
+      rs_rs + sizeof (rs_rs) / sizeof (*rs_rs)
+    }
+  };
+
+  // Return the codepoint type of a range if the codepoint value falls into
+  // one and the graphic type otherwise.
+  //
+  // Note that this is a type detection fallback (see codepoint_type() for
+  // details).
+  //
+  codepoint_types
+  codepoint_type_lookup (char32_t c)
+  {
+    // Note that the codepoint type range lists may overlap. Thus, we iterate
+    // over all of them until there is a match.
+    //
+    for (size_t i (0); i != sizeof (ct_ranges) / sizeof (*ct_ranges); ++i)
+    {
+      const codepoint_type_ranges& rs (ct_ranges[i]);
+
+      // Find the range that either contains the codepoint or lies to the
+      // right of it. Note that here we assume a range to be less than a
+      // codepoint value if it lies to the left of the codepoint.
+      //
+      const codepoint_range* r (
+        lower_bound (rs.begin, rs.end,
+                     c,
+                     [] (const codepoint_range& r, char32_t c)
+                     {
+                       return r.second < c;
+                     }));
+
+      if (r != rs.end && r->first <= c) // Contains the codepoint?
+        return rs.type;
+    }
+
+    return codepoint_types::graphic;
+  }
+}
diff --git a/libbutl/unicode.ixx b/libbutl/unicode.ixx
new file mode 100644
index 0000000..cba4fd2
--- /dev/null
+++ b/libbutl/unicode.ixx
@@ -0,0 +1,72 @@
+// file      : libbutl/unicode.ixx -*- C++ -*-
+// license   : MIT; see accompanying LICENSE file
+
+namespace butl
+{
+  inline codepoint_types
+  operator&= (codepoint_types& x, codepoint_types y)
+  {
+    return x = static_cast<codepoint_types> (
+      static_cast<std::uint16_t> (x) &
+      static_cast<std::uint16_t> (y));
+  }
+
+  inline codepoint_types
+  operator|= (codepoint_types& x, codepoint_types y)
+  {
+    return x = static_cast<codepoint_types> (
+      static_cast<std::uint16_t> (x) |
+      static_cast<std::uint16_t> (y));
+  }
+
+  inline codepoint_types
+  operator& (codepoint_types x, codepoint_types y)
+  {
+    return x &= y;
+  }
+
+  inline codepoint_types
+  operator| (codepoint_types x, codepoint_types y)
+  {
+    return x |= y;
+  }
+
+  LIBBUTL_SYMEXPORT codepoint_types
+  codepoint_type_lookup (char32_t);
+
+  inline codepoint_types
+  codepoint_type (char32_t c)
+  {
+    // Optimize for the common case (printable ASCII characters).
+    //
+    if (c >= 0x20 && c <= 0x7E)                            // Printable ASCII?
+      return codepoint_types::graphic;
+    else if (c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) // Invalid?
+      return codepoint_types::none;
+    else if ((c & 0xFFFF) >= 0xFFFE)                       // Non-range based?
+      return codepoint_types::non_character;
+    else
+      return codepoint_type_lookup (c);
+  }
+
+  inline std::string
+  to_string (codepoint_types t)
+  {
+    // Note that we use the terms from the Unicode standard ("private-use"
+    // rather than "private use", "noncharacter" rather than "non-character").
+    //
+    switch (t)
+    {
+    case codepoint_types::graphic:       return "graphic";
+    case codepoint_types::format:        return "format";
+    case codepoint_types::control:       return "control";
+    case codepoint_types::private_use:   return "private-use";
+    case codepoint_types::non_character: return "noncharacter"; // No dash.
+    case codepoint_types::reserved:      return "reserved";
+    case codepoint_types::none:
+    case codepoint_types::any:           return "";
+    }
+
+    return ""; // Types combination.
+  }
+}
diff --git a/libbutl/unicode.mxx b/libbutl/unicode.mxx
new file mode 100644
index 0000000..b846476
--- /dev/null
+++ b/libbutl/unicode.mxx
@@ -0,0 +1,82 @@
+// file      : libbutl/unicode.mxx -*- C++ -*-
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules_ts
+#pragma once
+#endif
+
+// C includes.
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#include <ostream>
+#include <cstdint> // uint16_t
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules_ts
+export module butl.unicode;
+#ifdef __cpp_lib_modules_ts
+import std.core;
+import std.io;
+#endif
+#endif
+
+#include <libbutl/export.hxx>
+
+LIBBUTL_MODEXPORT namespace butl
+{
+  // Note that the Unicode Standard requires the surrogates ([D800 DFFF]) to
+  // only be used in the context of the UTF-16 character encoding form. Thus,
+  // we omit the surrogate codepoint type and treat surrogates as invalid
+  // codepoints.
+  //
+  enum class codepoint_types: std::uint16_t
+  {
+    // Useful to denote invalid codepoints or when building the type set
+    // incrementally.
+    //
+    none          = 0x00,
+
+    graphic       = 0x01, // L(etter), M(ark), N(umber), P(unctuation),
+                          // S(ymbol), Zs(separator, space)
+    format        = 0x02,
+    control       = 0x04,
+    private_use   = 0x08,
+    non_character = 0x10,
+    reserved      = 0x20,
+
+    any           = 0x3f
+  };
+
+  codepoint_types operator&  (codepoint_types,  codepoint_types);
+  codepoint_types operator|  (codepoint_types,  codepoint_types);
+  codepoint_types operator&= (codepoint_types&, codepoint_types);
+  codepoint_types operator|= (codepoint_types&, codepoint_types);
+
+  // Return the codepoint type for a valid codepoint value and none otherwise.
+  //
+  // Note that the valid codepoint ranges are [0 D800) and (DFFF 10FFFF].
+  //
+  codepoint_types
+  codepoint_type (char32_t);
+
+  // Return the type name for a single codepoint type and empty string for
+  // `none` and `any`.
+  //
+  // Potential future improvements:
+  //  - add the none value name parameter ("invalid" by default)
+  //  - produce names for type masks ("graphic, format", "any", etc)
+  //
+  std::string
+  to_string (codepoint_types);
+
+  inline std::ostream&
+  operator<< (std::ostream& os, codepoint_types ts)
+  {
+    return os << to_string (ts);
+  }
+}
+
+#include <libbutl/unicode.ixx>
diff --git a/libbutl/utf8.cxx b/libbutl/utf8.cxx
deleted file mode 100644
index 0f24559..0000000
--- a/libbutl/utf8.cxx
+++ /dev/null
@@ -1,342 +0,0 @@
-// file      : libbutl/utf8.cxx -*- C++ -*-
-// license   : MIT; see accompanying LICENSE file
-
-#ifndef __cpp_modules_ts
-#include <libbutl/utility.mxx>
-#endif
-
-#ifndef __cpp_lib_modules_ts
-#include <string>
-#include <cstddef>
-
-#include <algorithm>    // lower_bound()
-#endif
-
-#ifdef __cpp_modules_ts
-module butl.utility;
-
-// Only imports additional to interface.
-#ifdef __clang__
-#ifdef __cpp_lib_modules_ts
-import std.core;
-import std.io;
-#endif
-#endif
-
-#endif
-
-namespace butl
-{
-  using namespace std;
-
-  // Sorted arrays of the Unicode codepoint ranges corresponding to the
-  // codepoint types. Note that code type range lists (but not ranges
-  // themselves) may overlap.
-  //
-  // Note that the graphic type codepoints are numerous and scattered. Thus,
-  // we will consider a codepoint to be of the graphic type if it is not of
-  // any other type.
-  //
-  using codepoint_range = pair<char32_t, char32_t>;
-
-  static const codepoint_range cn_rs[] = // Control.
-  {
-    {0x00, 0x1F},
-    {0x7F, 0x9F}
-  };
-
-  static const codepoint_range fr_rs[] = // Format.
-  {
-    {0x000AD, 0x000AD},
-    {0x00600, 0x00605},
-    {0x0061C, 0x0061C},
-    {0x006DD, 0x006DD},
-    {0x0070F, 0x0070F},
-    {0x008E2, 0x008E2},
-    {0x0180E, 0x0180E},
-    {0x0200B, 0x0200F},
-    {0x0202A, 0x0202E},
-    {0x02060, 0x02064},
-    {0x02066, 0x0206F},
-    {0x0FEFF, 0x0FEFF},
-    {0x0FFF9, 0x0FFFB},
-    {0x110BD, 0x110BD},
-    {0x110CD, 0x110CD},
-    {0x13430, 0x13438},
-    {0x1BCA0, 0x1BCA3},
-    {0x1D173, 0x1D17A},
-    {0xE0001, 0xE0001},
-    {0xE0020, 0xE007F}
-  };
-
-  static const codepoint_range pr_rs[] = // Private-use.
-  {
-    {0x00E000, 0x00F8FF},
-    {0x0F0000, 0x10FFFF}
-  };
-
-  static const codepoint_range nc_rs[] = // Non-character.
-  {
-    {0xFDD0, 0xFDEF}
-  };
-
-  static const codepoint_range rs_rs[] = // Reserved.
-  {
-    {0x30000, 0xE0000},
-    {0xE0002, 0xE001F},
-    {0xE0080, 0xE00FF},
-    {0xE01F0, 0xEFFFF}
-  };
-
-  struct codepoint_type_ranges
-  {
-    codepoint_types type;
-    const codepoint_range* begin;
-    const codepoint_range* end;
-  };
-
-  static const codepoint_type_ranges ct_ranges[] =
-  {
-    {
-      codepoint_types::control,
-      cn_rs,
-      cn_rs + sizeof (cn_rs) / sizeof (*cn_rs)
-    },
-    {
-      codepoint_types::format,
-      fr_rs,
-      fr_rs + sizeof (fr_rs) / sizeof (*fr_rs)
-    },
-    {
-      codepoint_types::private_use,
-      pr_rs,
-      pr_rs + sizeof (pr_rs) / sizeof (*pr_rs)
-    },
-    {
-      codepoint_types::non_character,
-      nc_rs,
-      nc_rs + sizeof (nc_rs) / sizeof (*nc_rs)
-    },
-    {
-      codepoint_types::reserved,
-      rs_rs,
-      rs_rs + sizeof (rs_rs) / sizeof (*rs_rs)
-    }
-  };
-
-  bool
-  utf8 (const string& s, codepoint_types ts, const char32_t* wl)
-  {
-    // A UCS-4 character is encoded as the UTF-8 byte sequence as follows,
-    // depending on the value range it falls into:
-    //
-    // 0x00000000 - 0x0000007F:
-    //   0xxxxxxx
-    //
-    // 0x00000080 - 0x000007FF:
-    //   110xxxxx 10xxxxxx
-    //
-    // 0x00000800 - 0x0000FFFF:
-    //   1110xxxx 10xxxxxx 10xxxxxx
-    //
-    // 0x00010000 - 0x001FFFFF:
-    //   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-    //
-    // 0x00200000 - 0x03FFFFFF:
-    //   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-    //
-    // 0x04000000 - 0x7FFFFFFF:
-    //   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-    //
-    // Also note that the Unicode Standard (as of 12.1) specifies no
-    // codepoints above 0x10FFFF, so we consider 5- and 6-byte length UTF-8
-    // sequences as invalid (we could have added `unspecified` codepoint type
-    // except that there are no UTF-8 validation tables defined for these
-    // sequences).
-    //
-    size_t n (s.size ());
-
-    for (size_t i (0); i != n; )
-    {
-      // Detect the UTF-8 byte sequence length based on its first byte. While
-      // at it, start calculating the Unicode codepoint value.
-      //
-      size_t sn;
-      char32_t c;
-      unsigned char b1 (s[i]);
-
-      if (b1 < 0x80)
-      {
-        sn = 1;
-        c  = b1;
-      }
-      else if (b1 < 0xE0)
-      {
-        sn = 2;
-        c  = b1 & 0x1F; // Takes 5 rightmost bits of the first sequence byte.
-      }
-      else if (b1 < 0xF0)
-      {
-        sn = 3;
-        c  = b1 & 0xF; // Takes 4 rightmost bits of the first sequence byte.
-      }
-      else if (b1 < 0xF8)
-      {
-        sn = 4;
-        c  = b1 & 0x7; // Takes 3 rightmost bits of the first sequence byte.
-      }
-      else
-        return false; // The byte starts 5- or 6-byte length sequence.
-
-      // Bail out if the string doesn't contain all the requred codepoint
-      // encoding bytes.
-      //
-      if (sn > n - i)
-        return false;
-
-      // Note that while a codepoint may potentially be encoded with byte
-      // sequences of different lengths, only the shortest encoding sequence
-      // is considered well-formed. Also a well-formed sequence may not be
-      // decoded into a UTF-16 surrogate value ([D800 DFFF]) or a value that
-      // is greater than the max codepoint value (0x10FFFF). We will check all
-      // that using the Well-Formed UTF-8 Byte Sequences table (provided by
-      // the Unicode 12.0 Standard) which also takes care of the missing UTF-8
-      // sequence bytes.
-      //
-      // Return true if a byte value belongs to the specified range.
-      //
-      auto belongs = [] (unsigned char c, unsigned char l, unsigned char r)
-      {
-        return c >= l && c <= r;
-      };
-
-      switch (sn)
-      {
-      case 1: break; // Always well-formed by the definition (see above).
-      case 2:
-        {
-          // [000080 0007FF]: [C2 DF]  [80 BF]
-          //
-          // Check the first/second bytes combinations:
-          //
-          if (!(belongs (b1, 0xC2, 0xDF) && belongs (s[i + 1], 0x80, 0xBF)))
-            return false;
-
-          break;
-        }
-      case 3:
-        {
-          // [000800 000FFF]: E0       [A0 BF]  [80 BF]
-          // [001000 00CFFF]: [E1 EC]  [80 BF]  [80 BF]
-          // [00D000 00D7FF]: ED       [80 9F]  [80 BF] ; Excludes surrogates.
-          // [00E000 00FFFF]: [EE EF]  [80 BF]  [80 BF]
-          //
-          unsigned char b2 (s[i + 1]);
-
-          if (!((b1 == 0xE0               && belongs (b2, 0xA0, 0xBF)) ||
-                (belongs (b1, 0xE1, 0xEC) && belongs (b2, 0x80, 0xBF)) ||
-                (b1 == 0xED               && belongs (b2, 0x80, 0x9F)) ||
-                (belongs (b1, 0xEE, 0xEF) && belongs (b2, 0x80, 0xBF))) ||
-              !belongs (s[i + 2], 0x80, 0xBF))
-            return false;
-
-          break;
-        }
-      case 4:
-        {
-          // [010000 03FFFF]: F0       [90 BF]  [80 BF]  [80 BF]
-          // [040000 0FFFFF]: [F1 F3]  [80 BF]  [80 BF]  [80 BF]
-          // [100000 10FFFF]: F4       [80 8F]  [80 BF]  [80 BF]
-          //
-          unsigned char b2 (s[i + 1]);
-
-          if (!((b1 == 0xF0               && belongs (b2, 0x90, 0xBF)) ||
-                (belongs (b1, 0xF1, 0xF3) && belongs (b2, 0x80, 0xBF)) ||
-                (b1 == 0xF4               && belongs (b2, 0x80, 0x8F))) ||
-              !belongs (s[i + 2], 0x80, 0xBF)                           ||
-              !belongs (s[i + 3], 0x80, 0xBF))
-            return false;
-
-          break;
-        }
-      }
-
-      // For the remaining sequence bytes, "append" their 6 rightmost bits to
-      // the resulting codepoint value.
-      //
-      --sn;
-      ++i;
-
-      for (size_t n (i + sn); i != n; ++i)
-        c = (c << 6) | (s[i] & 0x3F);
-
-      // Check the decoded codepoint, unless any codepoint type is allowed.
-      //
-      if (ts == codepoint_types::any)
-        continue;
-
-      using traits = u32string::traits_type;
-
-      // Check if the decoded codepoint is whitelisted.
-      //
-      if (wl != nullptr &&
-          traits::find (wl, traits::length (wl), c) != nullptr)
-        continue;
-
-      // Match the decoded codepoint type against the specified type set.
-      //
-      // Detect the codepoint type (see the Types of Code Points table in the
-      // Unicode 12.0 Standard for details).
-      //
-      codepoint_types ct;
-
-      // Optimize for the common case (printable ASCII characters).
-      //
-      if (c >= 0x20 && c <= 0x7E)
-        ct = codepoint_types::graphic;
-      else if ((c & 0xFFFF) >= 0xFFFE) // Non-range based detection.
-        ct = codepoint_types::non_character;
-      else
-      {
-        // Note that we consider a codepoint to be of the graphic type if it
-        // is not of any other type (see above).
-        //
-        ct = codepoint_types::graphic;
-
-        // Note that the codepoint type range lists may overlap. Thus, we
-        // iterate over all of them until there is a match.
-        //
-        for (size_t i (0); i != sizeof (ct_ranges) / sizeof (*ct_ranges); ++i)
-        {
-          const codepoint_type_ranges& rs (ct_ranges[i]);
-
-          // Find the range that either contains the codepoint or lays to the
-          // right of it. Note that here we assume a range to be less than a
-          // codepoint if it lays to the left of the codepoint.
-          //
-          const codepoint_range* r (
-            lower_bound (rs.begin, rs.end,
-                         c,
-                         [] (const codepoint_range& r, char32_t c)
-                         {
-                           return r.second < c;
-                         }));
-
-          if (r != rs.end && r->first <= c) // Contains the codepoint?
-          {
-            ct = rs.type;
-            break;
-          }
-        }
-      }
-
-      // Now check if the codepoint type matches the specified set. Note: also
-      // covers the `ts == codepoint_types::none` case.
-      //
-      if ((ct & ts) == codepoint_types::none)
-        return false;
-    }
-
-    return true;
-  }
-}
diff --git a/libbutl/utf8.ixx b/libbutl/utf8.ixx
new file mode 100644
index 0000000..3d2e092
--- /dev/null
+++ b/libbutl/utf8.ixx
@@ -0,0 +1,305 @@
+// file      : libbutl/utf8.ixx -*- C++ -*-
+// license   : MIT; see accompanying LICENSE file
+
+namespace butl
+{
+  inline utf8_validator::
+  utf8_validator (codepoint_types ts, const char32_t* wl)
+      : types_ (ts),
+        whitelist_ (wl)
+  {
+  }
+
+  inline std::pair<bool, bool> utf8_validator::
+  validate (char c)
+  {
+    return validate (c, nullptr /* what */);
+  }
+
+  inline std::pair<bool, bool> utf8_validator::
+  validate (char c, std::string& what)
+  {
+    return validate (c, &what);
+  }
+
+  inline std::pair<bool, bool> utf8_validator::
+  validate (char c, std::string* what)
+  {
+    using namespace std;
+
+    // A UCS-4 character is encoded as the UTF-8 byte sequence as follows,
+    // depending on the value range it falls into:
+    //
+    // 0x00000000 - 0x0000007F:
+    //   0xxxxxxx
+    //
+    // 0x00000080 - 0x000007FF:
+    //   110xxxxx 10xxxxxx
+    //
+    // 0x00000800 - 0x0000FFFF:
+    //   1110xxxx 10xxxxxx 10xxxxxx
+    //
+    // 0x00010000 - 0x001FFFFF:
+    //   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    //
+    // 0x00200000 - 0x03FFFFFF:
+    //   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+    //
+    // 0x04000000 - 0x7FFFFFFF:
+    //   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+    //
+    // Also note that the Unicode Standard (as of 12.1) specifies no
+    // codepoints above 0x10FFFF, so we consider 5- and 6-byte length UTF-8
+    // sequences as invalid (we could have added `unspecified` codepoint type
+    // except that there are no UTF-8 validation tables defined for these
+    // sequences).
+    //
+    unsigned char b (c);
+
+    // Compose the detailed "invalid UTF-8 sequence byte" error.
+    //
+    auto byte_error = [c, b, this] ()
+    {
+      string s ("invalid UTF-8 sequence ");
+
+      const char* names[] = {"first", "second", "third", "forth"};
+      s += names[seq_index_];
+      s += " byte (0x";
+
+      const char digits[] = "0123456789ABCDEF";
+      s += digits[(b >> 4) & 0xF];
+      s += digits[b & 0xF];
+
+      // If the byte happens to be a printable ASCII character then let's
+      // print it as a character as well. This can help a bit with grepping
+      // through text while troubleshooting.
+      //
+      if (b >= 0x20 && b <= 0x7E)
+      {
+        s += " '";
+        s += c;
+        s += "'";
+      }
+
+      s += ")";
+      return s;
+    };
+
+    // Detect the byte sequence length based on its first byte. While at it,
+    // start calculating the resulting Unicode codepoint value.
+    //
+    if (seq_index_ == 0)
+    {
+      if (b < 0x80)
+      {
+        seq_size_ = 1;
+        codepoint_ = b;
+      }
+      else if (b < 0xE0)
+      {
+        seq_size_ = 2;
+        codepoint_ = b & 0x1F; // Takes 5 rightmost bits.
+      }
+      else if (b < 0xF0)
+      {
+        seq_size_ = 3;
+        codepoint_ = b & 0xF; // Takes 4 rightmost bits.
+      }
+      else if (b < 0xF8)
+      {
+        seq_size_ = 4;
+        codepoint_ = b & 0x7; // Takes 3 rightmost bits.
+      }
+      else
+      {
+        if (what != nullptr)
+        {
+          if (b < 0xFE)
+          {
+            *what  = b < 0xFC ? "5" : "6";
+            *what += "-byte length UTF-8 sequence";
+          }
+          else
+            *what = byte_error ();
+        }
+
+        return make_pair (false, false); // Invalid byte.
+      }
+    }
+
+    // Note that while a codepoint may potentially be encoded with byte
+    // sequences of different lengths, only the shortest encoding sequence is
+    // considered well-formed. Also a well-formed sequence may not be decoded
+    // into an invalid codepoint value (see codepoint_type() for details). We
+    // will check all that using the Well-Formed UTF-8 Byte Sequences table
+    // (provided by the Unicode 12.0 Standard) which also takes care of the
+    // missing UTF-8 sequence bytes.
+    //
+    bool valid (false);
+
+    // Return true if a byte value belongs to the specified range.
+    //
+    auto belongs = [] (unsigned char c, unsigned char l, unsigned char r)
+    {
+      return c >= l && c <= r;
+    };
+
+    switch (seq_size_)
+    {
+    case 1: valid = true; break; // Well-formed by the definition (see above).
+    case 2:
+      {
+        // [000080 0007FF]: [C2 DF]  [80 BF]
+        //
+        // Check the first byte and set the second byte range.
+        //
+        if (seq_index_ == 0)
+        {
+          if ((valid = belongs (b, 0xC2, 0xDF)))
+            byte2_range_ = make_pair (0x80, 0xBF);
+        }
+        else // Check the second byte.
+          valid = belongs (b, byte2_range_.first, byte2_range_.second);
+
+        break;
+      }
+    case 3:
+      {
+        // [000800 000FFF]: E0       [A0 BF]  [80 BF]
+        // [001000 00CFFF]: [E1 EC]  [80 BF]  [80 BF]
+        // [00D000 00D7FF]: ED       [80 9F]  [80 BF] ; Excludes surrogates.
+        // [00E000 00FFFF]: [EE EF]  [80 BF]  [80 BF]
+        //
+        // Check the first byte and set the second byte range.
+        //
+        if (seq_index_ == 0)
+        {
+          if ((valid = (b == 0xE0)))
+            byte2_range_ = make_pair (0xA0, 0xBF);
+          else if ((valid = belongs (b, 0xE1, 0xEC)))
+            byte2_range_ = make_pair (0x80, 0xBF);
+          else if ((valid = (b == 0xED)))
+            byte2_range_ = make_pair (0x80, 0x9F);
+          else if ((valid = belongs (b, 0xEE, 0xEF)))
+            byte2_range_ = make_pair (0x80, 0xBF);
+        }
+        else if (seq_index_ == 1) // Check the second byte.
+          valid = belongs (b, byte2_range_.first, byte2_range_.second);
+        else                      // Check the third byte.
+          valid = belongs (b, 0x80, 0xBF);
+
+        break;
+      }
+    case 4:
+      {
+        // [010000 03FFFF]: F0       [90 BF]  [80 BF]  [80 BF]
+        // [040000 0FFFFF]: [F1 F3]  [80 BF]  [80 BF]  [80 BF]
+        // [100000 10FFFF]: F4       [80 8F]  [80 BF]  [80 BF]
+        //
+        // Check the first byte and set the second byte range.
+        //
+        if (seq_index_ == 0)
+        {
+          if ((valid = (b == 0xF0)))
+            byte2_range_ = make_pair (0x90, 0xBF);
+          else if ((valid = belongs (b, 0xF1, 0xF3)))
+            byte2_range_ = make_pair (0x80, 0xBF);
+          else if ((valid = (b == 0xF4)))
+            byte2_range_ = make_pair (0x80, 0x8F);
+        }
+        else if (seq_index_ == 1) // Check the second byte.
+          valid = belongs (b, byte2_range_.first, byte2_range_.second);
+        else                      // Check the third and fourth bytes.
+          valid = belongs (b, 0x80, 0xBF);
+
+        break;
+      }
+    }
+
+    // Bail out if the current UTF-8 sequence byte is invalid.
+    //
+    if (!valid)
+    {
+      // We could probably distinguish "surrogate" and "exceed max value" from
+      // other ill-formedness cases (amend the well-formedness table, keep
+      // decoding the sequence, and test the codepoint in the end) and produce
+      // more specific error messages, but this doesn't seem worth the
+      // trouble.
+      //
+      if (what != nullptr)
+        *what = byte_error ();
+
+      return make_pair (false, false); // Invalid byte.
+    }
+
+    // "Append" the sequence byte's 6 rightmost bits to the resulting
+    // codepoint value, unless this is the first byte (whose value is already
+    // taken into account; see above).
+    //
+    if (seq_index_ != 0)
+      codepoint_ = (codepoint_ << 6) | (b & 0x3F);
+
+    // If we didn't get to the end of the UTF-8 sequence, then we are done
+    // with this byte.
+    //
+    if (++seq_index_ != seq_size_)
+      return make_pair (true, false); // Valid byte.
+
+    // Prepare for the next UTF-8 sequence validation, regardless of the
+    // decoded codepoint validity.
+    //
+    seq_index_ = 0;
+
+    // Check the decoded codepoint, unless any codepoint type is allowed.
+    //
+    // Note that the well-formedness sequence check guarantees that we decoded
+    // a valid Unicode codepoint (see above).
+    //
+    if (types_ == codepoint_types::any)
+      return make_pair (true, true); // Valid codepoint.
+
+    // Check if the decoded codepoint is whitelisted.
+    //
+    using traits = u32string::traits_type;
+
+    if (whitelist_ != nullptr &&
+        traits::find (whitelist_, traits::length (whitelist_), codepoint_) !=
+        nullptr)
+      return make_pair (true, true); // Valid codepoint.
+
+    // Now check if the codepoint type matches the specified set. Note: also
+    // covers the `types_ == codepoint_types::none` case.
+    //
+    codepoint_types t (codepoint_type (codepoint_));
+
+    if ((t & types_) != codepoint_types::none)
+      return make_pair (true, true); // Valid codepoint.
+
+    if (what != nullptr)
+      *what = "invalid Unicode codepoint (" + to_string (t) + ")";
+
+    return make_pair (false, true); // Invalid codepoint.
+  }
+
+  inline std::pair<bool, bool> utf8_validator::
+  recover (char c)
+  {
+    // We are recovered if the character can be interpreted as a sequence
+    // leading byte.
+    //
+    // As an optimization, bail out if the byte is a sequence trailing byte
+    // (10xxxxxx).
+    //
+    if ((c & 0xC0) == 0x80)
+      return std::make_pair (false, false); // Invalid byte.
+
+    seq_index_ = 0;
+    return validate (c);
+  }
+
+  inline char32_t utf8_validator::
+  codepoint () const
+  {
+    return codepoint_;
+  }
+}
diff --git a/libbutl/utf8.mxx b/libbutl/utf8.mxx
new file mode 100644
index 0000000..15e8ded
--- /dev/null
+++ b/libbutl/utf8.mxx
@@ -0,0 +1,130 @@
+// file      : libbutl/utf8.mxx -*- C++ -*-
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules_ts
+#pragma once
+#endif
+
+// C includes.
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#include <cstdint> // uint8_t
+#include <utility> // pair
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules_ts
+export module butl.utf8;
+#ifdef __cpp_lib_modules_ts
+import std.core;
+#endif
+import butl.unicode;
+#else
+#include <libbutl/unicode.mxx>
+#endif
+
+#include <libbutl/export.hxx>
+
+LIBBUTL_MODEXPORT namespace butl
+{
+  // Here and below we will refer to bytes that encode a single Unicode
+  // codepoint as "UTF-8 byte sequence" ("UTF-8 sequence" or "byte sequence"
+  // for short) and a sequence of such sequences as "UTF-8 encoded byte
+  // string" ("byte string" for short).
+  //
+
+  // Validate a UTF-8 encoded byte string one byte at a time. Optionally, also
+  // validate that its decoded codepoints belong to the specified types or
+  // codepoint whitelist.
+  //
+  class utf8_validator
+  {
+  public:
+    // Note: use whitelist via shallow copy.
+    //
+    explicit
+    utf8_validator (codepoint_types = codepoint_types::any,
+                    const char32_t* whitelist = nullptr);
+
+    // Validate the next byte returning true if it is valid (first) and
+    // whether it is the last byte of a codepoint (second). The {false, true}
+    // result indicates a byte sequence decoded into a codepoint of undesired
+    // type rather than an invalid byte that happens to be the last in the
+    // sequence (and may well be a valid starting byte of the next sequence).
+    //
+    // Note that in case the byte is invalid, calling this function again
+    // without recovery is illegal.
+    //
+    std::pair<bool, bool>
+    validate (char);
+
+    // As above but in case of an invalid byte also return the description of
+    // why it is invalid.
+    //
+    // Note that the description only contains the reason why the specified
+    // byte is not part of a valid UTF-8 sequence or the desired codepoint
+    // type, for example:
+    //
+    // "invalid UTF-8 sequence first byte (0xB0)"
+    // "invalid Unicode codepoint (reserved)"
+    //
+    // It can be used to form complete diagnostics along these lines:
+    //
+    // cerr << "invalid manifest value " << name << ": " << what << endl;
+    //
+    std::pair<bool, bool>
+    validate (char, std::string& what);
+
+    // As above but decide whether the description is needed at runtime (what
+    // may be NULL).
+    //
+    std::pair<bool, bool>
+    validate (char, std::string* what);
+
+    // Recover from an invalid byte.
+    //
+    // This function must be called with the first invalid and then subsequent
+    // bytes until it signals that the specified byte is valid. Note that it
+    // shall not be called if the sequence is decoded into a codepoint of an
+    // undesired type.
+    //
+    // Note also that a byte being invalid in the middle of a UTF-8 sequence
+    // may be valid as a first byte of the next sequence.
+    //
+    std::pair<bool, bool>
+    recover (char);
+
+    // Return the codepoint of the last byte sequence.
+    //
+    // This function can only be legally called after validate() or recover()
+    // signal that the preceding byte is valid and last.
+    //
+    char32_t
+    codepoint () const;
+
+  private:
+    codepoint_types types_;
+    const char32_t* whitelist_;
+
+    // State machine.
+    //
+    uint8_t seq_size_;      // [1 4]; calculated at the first byte validation.
+    uint8_t seq_index_ = 0; // [0 3]
+
+    // Last byte sequence decoded codepoint (built incrementally).
+    //
+    char32_t codepoint_;
+
+    // The byte range a valid UTF-8 sequence second byte must belong to as
+    // calculated during the first byte validation.
+    //
+    // Note that the subsequent (third and fourth) bytes must belong to the
+    // [80 BF] range regardless of the previous bytes.
+    //
+    std::pair<unsigned char, unsigned char> byte2_range_;
+  };
+}
+
+#include <libbutl/utf8.ixx>
diff --git a/libbutl/utility.cxx b/libbutl/utility.cxx
index ce78295..d6a21c6 100644
--- a/libbutl/utility.cxx
+++ b/libbutl/utility.cxx
@@ -35,6 +35,9 @@ import std.io;
 #endif
 #endif
 
+import butl.utf8;
+#else
+#include <libbutl/utf8.mxx>
 #endif
 
 namespace butl
@@ -191,6 +194,135 @@ namespace butl
   }
 
   void
+  to_utf8 (string& s, char repl, codepoint_types ts, const char32_t* wl)
+  {
+    using iterator = string::iterator;
+
+    utf8_validator val (ts, wl);
+
+    iterator i (s.begin ()); // Source current position.
+    iterator e (s.end ());   // Source end position.
+    iterator d (i);          // Destination current position.
+    iterator b (d);          // Begin of the current destination sequence.
+
+    // Emit the replacement at d and start the next sequence right after it.
+    //
+    auto replace_byte = [&d, &b, repl] ()
+    {
+      *d++ = repl;
+      b = d;
+    };
+
+    // Overwrite the bytes already copied for the current sequence, [b, d),
+    // with the replacement, leaving b == d (current source byte excluded).
+    //
+    auto replace_sequence = [&d, &b, repl] ()
+    {
+      for (; b != d; ++b)
+        *b = repl;
+    };
+
+    // Collapse the current sequence into a single replacement byte: rewind d
+    // to b and emit the replacement there (this shortens the result).
+    //
+    auto replace_codepoint = [&d, &b, &replace_byte] ()
+    {
+      d = b;           // Rewind to the beginning of the sequence.
+      replace_byte ();
+    };
+
+    // Iterate over the byte string appending valid bytes, replacing invalid
+    // bytes/codepoints, and recovering after invalid bytes.
+    //
+    for (; i != e; ++i)
+    {
+      char c (*i);
+      pair<bool, bool> v (val.validate (c));
+
+      // Append a valid byte, starting the next sequence if this was the last
+      // one. Captures v and c by reference so the recovery loop can reuse it.
+      //
+      auto append_byte = [&d, &b, &v, &c] ()
+      {
+        *d++ = c;
+
+        if (v.second) // Sequence last byte?
+          b = d;
+      };
+
+      // If this is a valid byte/codepoint, then append the byte and proceed
+      // to the next string byte.
+      //
+      if (v.first)
+      {
+        append_byte ();
+        continue;
+      }
+
+      // If this is an invalid codepoint, then replace the sequence with a
+      // single replacement character and proceed to the next byte sequence
+      // (no recovery is necessary).
+      //
+      if (v.second)
+      {
+        replace_codepoint ();
+        continue;
+      }
+
+      // Now, given this is an invalid byte, replace the current sequence
+      // bytes and recover.
+      //
+      replace_sequence ();
+
+      // Stay in the recovery cycle until a valid byte is encountered. Note
+      // that we start from where we left off, not from the next byte (see
+      // utf8_validator::recover() for details).
+      //
+      for (; i != e; ++i)
+      {
+        c = *i;
+        v = val.recover (c);
+
+        // End the recovery cycle for a valid byte.
+        //
+        if (v.first)
+        {
+          append_byte ();
+          break;
+        }
+
+        // End the recovery cycle for a decoded but invalid (ASCII-range)
+        // codepoint.
+        //
+        if (v.second)
+        {
+          replace_codepoint ();
+          break;
+        }
+
+        replace_byte ();
+      }
+
+      // Bail out if we reached the end of the byte string. Note that while we
+      // failed to recover (otherwise i != e), all the bytes are already
+      // replaced.
+      //
+      if (i == e)
+        break;
+    }
+
+    // If the last byte sequence is incomplete, then replace its bytes.
+    //
+    if (b != d)
+      replace_sequence ();
+
+    // Shrink the byte string if collapsed codepoints made the result shorter.
+    //
+    if (d != e)
+      s.resize (d - s.begin ());
+  }
+
+  void
   setenv (const string& name, const string& value)
   {
 #ifndef _WIN32
diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx
index c5fdbac..27ef7fb 100644
--- a/libbutl/utility.ixx
+++ b/libbutl/utility.ixx
@@ -2,8 +2,11 @@
 // license   : MIT; see accompanying LICENSE file
 
 #ifndef __cpp_lib_modules_ts
-#include <cstdlib> // getenv()
-#include <algorithm>
+#include <cctype>    // toupper(), tolower(), is*()
+#include <cwctype>   // isw*()
+#include <cstdlib>   // getenv()
+#include <algorithm> // for_each()
+#include <stdexcept> // invalid_argument
 #endif
 
 namespace butl
@@ -216,44 +219,84 @@ namespace butl
     return sanitize_identifier (std::string (s));
   }
 
-  inline codepoint_types
-  operator&= (codepoint_types& x, codepoint_types y)
+  inline bool
+  eof (std::istream& is)
   {
-    return x = static_cast<codepoint_types> (
-      static_cast<std::uint16_t> (x) &
-      static_cast<std::uint16_t> (y));
+    if (!is.fail ())
+      return false;
+
+    if (is.eof ())
+      return true;
+
+    throw std::istream::failure ("");
   }
 
-  inline codepoint_types
-  operator|= (codepoint_types& x, codepoint_types y)
+  inline optional<std::size_t>
+  utf8_length_impl (const std::string& s,
+                    std::string* what,
+                    codepoint_types ts,
+                    const char32_t* wl)
   {
-    return x = static_cast<codepoint_types> (
-      static_cast<std::uint16_t> (x) |
-      static_cast<std::uint16_t> (y));
+    using namespace std;
+
+    // Handle the empty string here: otherwise v stays {false, false} below.
+    //
+    if (s.empty ())
+      return 0;
+
+    size_t r (0);
+    pair<bool, bool> v;
+    utf8_validator val (ts, wl);
+
+    for (char c: s)
+    {
+      v = val.validate (c, what);
+
+      if (!v.first) // Invalid byte or codepoint?
+        return nullopt;
+
+      if (v.second) // Last byte in the sequence?
+        ++r;
+    }
+
+    // Make sure that the last UTF-8 sequence is complete.
+    //
+    if (!v.second)
+    {
+      if (what != nullptr)
+        *what = "incomplete UTF-8 sequence";
+
+      return nullopt;
+    }
+
+    return r;
   }
 
-  inline codepoint_types
-  operator& (codepoint_types x, codepoint_types y)
+  inline std::size_t
+  utf8_length (const std::string& s, codepoint_types ts, const char32_t* wl)
   {
-    return x &= y;
+    using namespace std;
+
+    string what;
+    if (optional<size_t> r = utf8_length_impl (s, &what, ts, wl))
+      return *r;
+
+    throw invalid_argument (what);
   }
 
-  inline codepoint_types
-  operator| (codepoint_types x, codepoint_types y)
+  inline bool
+  utf8 (const std::string& s,
+        std::string& what,
+        codepoint_types ts,
+        const char32_t* wl)
   {
-    return x |= y;
+    return utf8_length_impl (s, &what, ts, wl).has_value ();
   }
 
   inline bool
-  eof (std::istream& is)
+  utf8 (const std::string& s, codepoint_types ts, const char32_t* wl)
   {
-    if (!is.fail ())
-      return false;
-
-    if (is.eof ())
-      return true;
-
-    throw std::istream::failure ("");
+    return utf8_length_impl (s, nullptr, ts, wl).has_value ();
   }
 
   inline optional<std::string>
diff --git a/libbutl/utility.mxx b/libbutl/utility.mxx
index 71c2860..b84e731 100644
--- a/libbutl/utility.mxx
+++ b/libbutl/utility.mxx
@@ -16,13 +16,10 @@
 #include <iosfwd>       // ostream
 #include <istream>
 #include <cstddef>      // size_t
-#include <utility>      // move(), forward()
+#include <utility>      // move(), forward(), pair
 #include <cstring>      // strcmp(), strlen()
 #include <exception>    // exception, uncaught_exception[s]()
 //#include <functional> // hash
-
-#include <cctype>  // toupper(), tolower(), is*()
-#include <cwctype> // isw*()
 #endif
 
 #include <libbutl/ft/lang.hxx>      // thread_local
@@ -34,8 +31,12 @@ export module butl.utility;
 import std.core;
 import std.io;
 #endif
+import butl.utf8;
+import butl.unicode;
 import butl.optional;
 #else
+#include <libbutl/utf8.mxx>
+#include <libbutl/unicode.mxx>
 #include <libbutl/optional.mxx>
 #endif
 
@@ -194,41 +195,44 @@ LIBBUTL_MODEXPORT namespace butl
   std::string  sanitize_identifier (std::string&&);
   std::string  sanitize_identifier (const std::string&);
 
-  // Return true if the string is a valid UTF-8 encoded byte sequence and,
-  // optionally, its decoded codepoints belong to the specified types or to
-  // the codepoint whitelist.
-  //
-  // Note that the Unicode Standard considers a UTF-8 byte sequence decoded
-  // into a codepoint of the surrogate type as invalid. Thus, the surrogate
-  // type may not be specified.
+  // Return true if the string is a valid UTF-8 encoded byte string and,
+  // optionally, its decoded codepoints belong to the specified types or
+  // codepoint whitelist.
   //
-  enum class codepoint_types: std::uint16_t
-  {
-    // Useful to only allow the whitelisted codepoints or when building the
-    // type set incrementally.
-    //
-    none          = 0x00,
-
-    graphic       = 0x01, // L(etter), M(ark), N(number), P(uncturation),
-                          // S(symbol), Zs(separator, space)
-    format        = 0x02,
-    control       = 0x04,
-    private_use   = 0x08,
-    non_character = 0x10,
-    reserved      = 0x20,
-
-    any           = 0x3f
-  };
+  bool
+  utf8 (const std::string&,
+        codepoint_types = codepoint_types::any,
+        const char32_t* whitelist = nullptr);
 
-  LIBBUTL_SYMEXPORT bool
+  // As above but in case of an invalid sequence also return the description
+  // of why it is invalid.
+  //
+  bool
   utf8 (const std::string&,
+        std::string& what,
         codepoint_types = codepoint_types::any,
         const char32_t* whitelist = nullptr);
 
-  codepoint_types operator&  (codepoint_types,  codepoint_types);
-  codepoint_types operator|  (codepoint_types,  codepoint_types);
-  codepoint_types operator&= (codepoint_types&, codepoint_types);
-  codepoint_types operator|= (codepoint_types&, codepoint_types);
+  // Return UTF-8 byte string length in codepoints. Throw std::invalid_argument
+  // if it is not valid UTF-8 or contains a codepoint that is not allowed.
+  //
+  std::size_t
+  utf8_length (const std::string&,
+               codepoint_types = codepoint_types::any,
+               const char32_t* whitelist = nullptr);
+
+  // Fixup the specified string (in place) to be valid UTF-8 replacing invalid
+  // bytes and codepoints with the specified character, for example, '?'.
+  //
+  // Potential future improvements:
+  //  - char32_t replacement (will need UTF-8 encoding)
+  //  - different replacement for bytes and codepoints
+  //
+  LIBBUTL_SYMEXPORT void
+  to_utf8 (std::string&,
+           char replacement,
+           codepoint_types = codepoint_types::any,
+           const char32_t* whitelist = nullptr);
 
   // If an input stream is in a failed state, then return true if this is
   // because of the eof and throw istream::failure otherwise. If the stream
diff --git a/tests/manifest-parser/driver.cxx b/tests/manifest-parser/driver.cxx
index 57674cb..a34f2b7 100644
--- a/tests/manifest-parser/driver.cxx
+++ b/tests/manifest-parser/driver.cxx
@@ -40,6 +40,9 @@ namespace butl
   static bool
   equal (const optional<pairs>& actual, const optional<pairs>& expected);
 
+  static pairs
+  parse (const char* m, manifest_parser::filter_function f = {});
+
   // Test manifest as it is represented in the stream, including format
   // version and end-of-manifest values.
   //
@@ -188,6 +191,41 @@ namespace butl
       assert (p.first == "" && p.second == "comment");
     }
 
+    // UTF-8.
+    //
+    assert (test (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0",
+                  {{"","1"},
+                   {"\xD0\xB0y\xD0\xB0", "\xD0\xB0z\xD0\xB0"},
+                   {"",""},
+                   {"",""}}));
+
+    assert (fail (":1\n#\xD0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+    assert (fail (":1\n#\xD0\xB0\n\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+    assert (fail (":1\n#\xD0\xB0\n\xD0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+    assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0:\xD0\xB0z\xD0\xB0"));
+    assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0z\xD0\xB0"));
+    assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0"));
+    assert (fail (":1\r\r\xB0#\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+    assert (fail (":1\r\xD0#\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+    assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0\r\xD0"));
+
+    // Test parsing failure for manifest with multi-byte UTF-8 sequences
+    // (the column is properly reported, etc).
+    //
+    try
+    {
+      parse (":1\na\xD0\xB0\xD0\xB0\xFE");
+      assert (false);
+    }
+    catch (const manifest_parsing& e)
+    {
+      assert (e.line == 2   &&
+              e.column == 4 &&
+              e.description ==
+              "invalid manifest name: "
+              "invalid UTF-8 sequence first byte (0xFE)");
+    }
+
     // Filtering.
     //
     assert (test (":1\na: abc\nb: bca\nc: cab",
@@ -281,7 +319,7 @@ namespace butl
   }
 
   static pairs
-  parse (const char* m, manifest_parser::filter_function f = {})
+  parse (const char* m, manifest_parser::filter_function f)
   {
     istringstream is (m);
     is.exceptions (istream::failbit | istream::badbit);
diff --git a/tests/manifest-rewriter/driver.cxx b/tests/manifest-rewriter/driver.cxx
index fd76929..ec73d81 100644
--- a/tests/manifest-rewriter/driver.cxx
+++ b/tests/manifest-rewriter/driver.cxx
@@ -90,6 +90,10 @@ namespace butl
                   {{"abc", "xyz"}}) ==
             ":1\n                                     abc: \\\nxyz\n\\");
 
+    assert (edit (":1\n                                     a\xD0\xB0g : b",
+                  {{"a\xD0\xB0g", "xyz"}}) ==
+            ":1\n                                     a\xD0\xB0g : \\\nxyz\n\\");
+
     // Test editing of manifests that contains CR characters.
     //
     assert (edit (":1\r\na: b\r\r\n", {{"a", "xyz"}}) == ":1\r\na: xyz\r\r\n");
diff --git a/tests/manifest-serializer/driver.cxx b/tests/manifest-serializer/driver.cxx
index 148a281..c818b4a 100644
--- a/tests/manifest-serializer/driver.cxx
+++ b/tests/manifest-serializer/driver.cxx
@@ -46,6 +46,7 @@ main ()
   assert (test ({{"#", "x"}}, "# x\n"));
   assert (test ({{"#", "x"},{"#", "y"},{"#", ""}}, "# x\n# y\n#\n"));
   assert (fail ({{"",""},{"#", "x"}})); // serialization after eos
+  assert (fail ({{"#", "\xB0"}}));      // invalid UTF-8 sequence
 
   // Empty manifest stream.
   //
@@ -89,6 +90,12 @@ main ()
   assert (fail ({{"","1"},{"a b",""}}));
   assert (fail ({{"","1"},{"a\tb",""}}));
   assert (fail ({{"","1"},{"a\n",""}}));
+  assert (fail ({{"","1"},{"a\xB0",""}})); // invalid UTF-8 sequence
+
+  // Invalid value.
+  //
+  assert (fail ({{"","1"},{"a","\xB0"}})); // invalid UTF-8 sequence
+  assert (fail ({{"","1"},{"a","\xD0"}})); // incomplete UTF-8 sequence
 
   // Simple value.
   //
@@ -172,11 +179,22 @@ main ()
              "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\Y\\\n"
              "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy");
 
+  // Hard break after the UTF-8/delayed hard break.
+  //
+  string l6 ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+             "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xF0\x90\x8C\x82"
+             "\xF0\x90\x8C\x82yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy");
+
+  string e6 ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+             "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xF0\x90\x8C\x82\\\n"
+             "\xF0\x90\x8C\x82yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy");
+
   assert (test ({{"","1"},{"a",l1},{"",""},{"",""}}, ": 1\na: " + e1 + "\n"));
   assert (test ({{"","1"},{"a",l2},{"",""},{"",""}}, ": 1\na: " + e2 + "\n"));
   assert (test ({{"","1"},{"a",l3},{"",""},{"",""}}, ": 1\na: " + e3 + "\n"));
   assert (test ({{"","1"},{"a",l4},{"",""},{"",""}}, ": 1\na: " + e4 + "\n"));
   assert (test ({{"","1"},{"a",l5},{"",""},{"",""}}, ": 1\na: " + e5 + "\n"));
+  assert (test ({{"","1"},{"a",l6},{"",""},{"",""}}, ": 1\na: " + e6 + "\n"));
 
   // Multi-line value.
   //
diff --git a/tests/utf8/driver.cxx b/tests/utf8/driver.cxx
index 8480dec..f35e65e 100644
--- a/tests/utf8/driver.cxx
+++ b/tests/utf8/driver.cxx
@@ -13,8 +13,10 @@
 #ifdef __cpp_lib_modules_ts
 import std.core;
 #endif
+import butl.utf8;
 import butl.utility;
 #else
+#include <libbutl/utf8.mxx>
 #include <libbutl/utility.mxx>
 #endif
 
@@ -24,6 +26,17 @@ using namespace butl;
 int
 main ()
 {
+  // utf8() tests.
+  //
+  auto utf8_error = [] (const string& s,
+                        codepoint_types ts = codepoint_types::any,
+                        const char32_t* wl = nullptr)
+  {
+    string error;
+    assert (!utf8 (s, error, ts, wl));
+    return error;
+  };
+
   // Valid sequences.
   //
   // Empty.
@@ -43,18 +56,36 @@ main ()
 
   // Ill-formed sequences.
   //
+  // Long sequences.
+  //
+  assert (!utf8 ("\xF8")); // 5-byte sequence.
+  assert (!utf8 ("\xFC")); // 6-byte sequence.
+
+  assert (utf8_error ("\xF8") == "5-byte length UTF-8 sequence");
+  assert (utf8_error ("\xFC") == "6-byte length UTF-8 sequence");
+  assert (utf8_error ("\xFE") == "invalid UTF-8 sequence first byte (0xFE)");
+
   // 2-byte sequences.
   //
   assert (!utf8 ("\xC1\x80")); // Invalid first byte.
   assert (!utf8 ("\xD0y"));    // Invalid second byte.
 
+  assert (utf8_error ("\xC1\x80") ==
+          "invalid UTF-8 sequence first byte (0xC1)");
+
+  assert (utf8_error ("\xD0y") ==
+          "invalid UTF-8 sequence second byte (0x79 'y')");
+
   // 3-byte sequences.
   //
   assert (!utf8 ("\xE2\x70\x80")); // Invalid second byte.
   assert (!utf8 ("\xE2\x80\x70")); // Invalid third byte.
 
-  assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate value.
-  assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate value.
+  assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate.
+  assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate.
+
+  assert (utf8_error ("\xE2\x80\x70") ==
+          "invalid UTF-8 sequence third byte (0x70 'p')");
 
   // 4-byte sequences.
   //
@@ -63,9 +94,8 @@ main ()
   assert (!utf8 ("\xF0\x90\x70\x80")); // Invalid third byte.
   assert (!utf8 ("\xF1\x80\x80\xC0")); // Invalid forth byte.
 
-  // Out of the codepoint range (0x10ffff + 1).
-  //
-  assert (!utf8 ("\xF4\x90\x80\x80"));
+  assert (utf8_error ("\xF1\x80\x80\xC0") ==
+          "invalid UTF-8 sequence forth byte (0xC0)");
 
   // Incomplete sequences.
   //
@@ -73,14 +103,25 @@ main ()
   assert (!utf8 ("\xE4\xBA"));     // 3-byte sequence.
   assert (!utf8 ("\xF0\x90\x8C")); // 4-byte sequence.
 
+  assert (utf8_error ("\xD0") == "incomplete UTF-8 sequence");
+
   // Missing sequence leading bytes.
   //
-  assert (!utf8 ("\xB0xyz"));         // 2-byte sequence.
-  assert (!utf8 ("\xBA\x8Cxyz"));     // 3-byte sequence.
-  assert (!utf8 ("\x8Cxyz"));         // 3-byte sequence.
-  assert (!utf8 ("\x90\x8C\x82xyz")); // 4-byte sequence.
-  assert (!utf8 ("\x8C\x82xyz"));     // 4-byte sequence.
-  assert (!utf8 ("\x82xyz"));         // 4-byte sequence.
+  assert (!utf8 ("\xB0xyz"));            // 2-byte sequence.
+  assert (!utf8 ("\xBA\x8C\xD0\xB0yz")); // 3-byte sequence.
+  assert (!utf8 ("\x8Cxyz"));            // 3-byte sequence.
+  assert (!utf8 ("\x90\x8C\x82xyz"));    // 4-byte sequence.
+  assert (!utf8 ("\x8C\x82xyz"));        // 4-byte sequence.
+  assert (!utf8 ("\x82xyz"));            // 4-byte sequence.
+
+  assert (utf8_error ("\xB0") == "invalid UTF-8 sequence first byte (0xB0)");
+
+  // Above the valid codepoint range (0x10ffff + 1).
+  //
+  assert (!utf8 ("\xF4\x90\x80\x80"));
+
+  assert (utf8_error ("\xF4\x90\x80\x80") ==
+          "invalid UTF-8 sequence second byte (0x90)");
 
   // Whitelisting.
   //
@@ -145,6 +186,9 @@ main ()
   assert (!utf8 ("\xF3\xA1\x80\x80", codepoint_types::graphic)); // Reserved.
   assert (!utf8 ("\xF3\xA0\x81\xBF", codepoint_types::graphic)); // Format.
 
+  assert (utf8_error ("\xF3\xA0\x81\xBF", codepoint_types::graphic) ==
+          "invalid Unicode codepoint (format)");
+
   assert (!utf8 ("\xC2\xAC", codepoint_types::format)); // Graphic.
 
   // Private-use & Graphic.
@@ -153,4 +197,145 @@ main ()
                  codepoint_types::format));
 
   assert (!utf8 ("a", codepoint_types::none)); // None.
+
+  assert (utf8_error ("a", codepoint_types::none) ==
+          "invalid Unicode codepoint (graphic)");
+
+  // UTF-8 string length.
+  //
+  auto invalid_utf8 = [] (string s, codepoint_types ts = codepoint_types::any)
+  {
+    try
+    {
+      utf8_length (s, ts);
+      return false;
+    }
+    catch (const invalid_argument&)
+    {
+      return true;
+    }
+  };
+
+  assert (utf8_length ("") == 0);
+  assert (utf8_length ("x\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82y") == 5);
+
+  assert (invalid_utf8 ("\xFE"));                         // Invalid byte.
+  assert (invalid_utf8 ("\xD0"));                         // Incomplete.
+  assert (invalid_utf8 ("\n", codepoint_types::graphic)); // Invalid codepoint.
+
+  // to_utf8() tests.
+  //
+  auto roundtrip = [] (const char* s)
+  {
+    string r (s);
+    to_utf8 (r, '?');
+    return r == s;
+  };
+
+  auto sanitize = [] (string s, codepoint_types ts = codepoint_types::any)
+  {
+    to_utf8 (s, '?', ts);
+    return s;
+  };
+
+  // Empty.
+  //
+  assert (roundtrip (""));
+
+  // 1 code point.
+  //
+  assert (roundtrip ("a"));                // 1 byte.
+  assert (roundtrip ("\xD0\xB0"));         // 2 bytes.
+  assert (roundtrip ("\xE4\xBA\x8C"));     // 3 bytes.
+  assert (roundtrip ("\xF0\x90\x8C\x82")); // 4 bytes.
+
+  // Multiple code points.
+  //
+  assert (roundtrip ("a\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82"));
+
+  // Ill-formed sequences.
+  //
+  // Long sequence.
+  //
+  assert (sanitize ("\xF8") == "?"); // 5-byte sequence.
+
+  // Invalid first byte followed by a second byte which ...
+  //
+  assert (sanitize ("\xC1\x80")     == "??");        // is a trailing byte.
+  assert (sanitize ("\xC1y")        == "?y");        // starts 1-byte sequence.
+  assert (sanitize ("\xC1\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence.
+  assert (sanitize ("\xC1\xFE")     == "??");        // is not UTF-8.
+
+  // Invalid second byte which ...
+  //
+  assert (sanitize ("\xD0y")        == "?y");        // starts 1-byte sequence.
+  assert (sanitize ("\xD0\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence.
+  assert (sanitize ("\xD0\xFE")     == "??");        // is not UTF-8.
+
+  // Incomplete sequences.
+  //
+  assert (sanitize ("\xD0")     == "?");   // 2-byte sequence.
+  assert (sanitize ("y\xD0")    == "y?");  // 2-byte sequence.
+  assert (sanitize ("\xE4\xBA") == "??");  // 3-byte sequence.
+  assert (sanitize ("\xD0\xD0") == "??");  // 2-byte sequence.
+
+  // Incomplete recovery.
+  //
+  assert (sanitize ("\xD0\xFE")     == "??");  // 2-byte sequence.
+  assert (sanitize ("\xD0\xFE\xFE") == "???"); // 2-byte sequence.
+
+  assert (sanitize ("\xF4\x90\x80\x80") == "????"); // Above the range.
+  assert (sanitize ("\xED\xA0\x80")     == "???");  // Min UTF-16 surrogate.
+  assert (sanitize ("\xED\xBF\xBF")     == "???");  // Max UTF-16 surrogate.
+
+  // Invalid codepoints.
+  //
+  auto sanitize_g = [&sanitize] (string s)
+  {
+    return sanitize (move (s), codepoint_types::graphic);
+  };
+
+  assert (sanitize_g ("\xEF\xB7\x90")  == "?");
+  assert (sanitize_g ("y\xEF\xB7\x90") == "y?");
+  assert (sanitize_g ("\xEF\xB7\x90y") == "?y");
+
+  // Invalid during recovery.
+  //
+  assert (sanitize_g ("\xD0\n")     == "??");
+  assert (sanitize_g ("\xD0\ny")    == "??y");
+  assert (sanitize_g ("\xD0\xFE\n") == "???");
+
+  assert (sanitize_g ("\xD0\xEF\xB7\x90") == "??");
+
+  // utf8_validator::codepoint() tests.
+  //
+  {
+    u32string r;
+    size_t invalid_codepoints (0);
+
+    string s ("a"
+              "\xD0\xB0"
+              "\n"                 // Control.
+              "\xE4\xBA\x8C"
+              "\xEE\x80\x80"       // Private-use.
+              "\xF0\x90\x8C\x82");
+
+    utf8_validator val (codepoint_types::graphic);
+
+    for (char c: s)
+    {
+      pair<bool, bool> v (val.validate (c));
+
+      if (v.first)
+      {
+        if (v.second)
+          r.push_back (val.codepoint ());
+      }
+      else
+        ++invalid_codepoints;
+    }
+
+    assert (r == U"a\x430\x4E8C\x10302");
+    assert (invalid_codepoints == 2);
+  }
 }
author	Karen Arutyunov <karen@codesynthesis.com>	2020-02-26 17:16:45 +0300
committer	Karen Arutyunov <karen@codesynthesis.com>	2020-02-26 17:17:49 +0300
commit	5ae9686adac1508873f2d980e84becd3496244c2 (patch)
tree	d7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8
parent	afb726d2d59b3715960a8647738860f40e37cf4f (diff)