Fix lexer to fail on invalid UTF-8 sequences

author: Karen Arutyunov <karen@codesynthesis.com> 2020-06-18 16:40:00 +0300
committer: Boris Kolpackov <boris@codesynthesis.com> 2020-06-19 11:27:32 +0200
commit: 112a83c346a537f1a5eac6fc17ee2ce3143d625b (patch)
tree: 11ed26fb72a571299eba7e02a225eaf07e527c58
parent: 78ac6aee6dff1b608bc312fe7ada442ba83710e8 (diff)
4 files changed, 96 insertions, 0 deletions
diff --git a/libbuild2/lexer+utf8.test.testscript b/libbuild2/lexer+utf8.test.testscript
new file mode 100644
index 0000000..42c62ea
--- /dev/null
+++ b/libbuild2/lexer+utf8.test.testscript
@@ -0,0 +1,28 @@
+# file      : libbuild2/lexer+utf8.test.testscript
+# license   : MIT; see accompanying LICENSE file
+
+: valid
+:
+$* <<EOI >>EOO
+  Sommerzeit
+  Mitteleuropäische
+  EOI
+  'Sommerzeit'
+  <newline>
+  'Mitteleuropäische'
+  <newline>
+  EOO
+
+: invalid
+:
+: Here we spoil the UTF-8 sequence 'ä' by dropping its second byte.
+:
+cat <<EOI | sed -e 's/(rop.).(isc)/\1\2/' | $* >>EOO 2>>EOE != 0
+  Sommerzeit
+  Mitteleuropäische
+  EOI
+  'Sommerzeit'
+  <newline>
+  EOO
+  <stdin>:2:12: error: invalid UTF-8 sequence second byte (0x69 'i')
+  EOE
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx
index 4256422..ff7be02 100644
--- a/libbuild2/lexer.cxx
+++ b/libbuild2/lexer.cxx
@@ -11,6 +11,12 @@ namespace build2
 {
   using type = token_type;
 
+  [[noreturn]] void lexer::
+  fail_char (const xchar& c)
+  {
+    fail (c) << ebuf_ << endf; // Report the error buffered by get()/peek().
+  }
+
   pair<pair<char, char>, bool> lexer::
   peek_chars ()
   {
diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx
index 6c2b90b..cc42219 100644
--- a/libbuild2/lexer.hxx
+++ b/libbuild2/lexer.hxx
@@ -187,6 +187,23 @@ namespace build2
     pair<pair<char, char>, bool>
     peek_chars ();
 
+    // As base::get() but in case of an invalid character issue diagnostics
+    // and throw failed.
+    //
+    xchar
+    get ();
+
+    // Get previously peeked character (faster).
+    //
+    void
+    get (const xchar&);
+
+    // As base::peek() but in case of an invalid character issue diagnostics
+    // and throw failed.
+    //
+    xchar
+    peek ();
+
   protected:
     struct state
     {
@@ -243,6 +260,9 @@ namespace build2
   protected:
     fail_mark fail;
 
+    [[noreturn]] void
+    fail_char (const xchar&);
+
     // Lexer state.
     //
   protected:
@@ -266,6 +286,13 @@ namespace build2
     std::stack<state> state_;
 
     bool sep_; // True if we skipped spaces in peek().
+
+  private:
+    using base = char_scanner<butl::utf8_validator, 2>;
+
+    // Buffer for a get()/peek() potential error.
+    //
+    string ebuf_;
   };
 }
 
@@ -284,4 +311,6 @@ namespace butl // ADL
   }
 }
 
+#include <libbuild2/lexer.ixx>
+
 #endif // LIBBUILD2_LEXER_HXX
diff --git a/libbuild2/lexer.ixx b/libbuild2/lexer.ixx
new file mode 100644
index 0000000..04899f0
--- /dev/null
+++ b/libbuild2/lexer.ixx
@@ -0,0 +1,33 @@
+// file      : libbuild2/lexer.ixx -*- C++ -*-
+// license   : MIT; see accompanying LICENSE file
+
+namespace build2
+{
+  inline auto lexer::
+  get () -> xchar
+  {
+    xchar r (base::get (ebuf_));
+
+    if (!invalid (r))
+      return r;
+
+    fail_char (r); // Reports the error from ebuf_ and does not return.
+  }
+
+  inline void lexer::
+  get (const xchar& peeked)
+  {
+    base::get (peeked); // No invalid() check here, unlike get() and peek().
+  }
+
+  inline auto lexer::
+  peek () -> xchar
+  {
+    xchar r (base::peek (ebuf_));
+
+    if (!invalid (r))
+      return r;
+
+    fail_char (r); // Reports the error from ebuf_ and does not return.
+  }
+}
author	Karen Arutyunov <karen@codesynthesis.com>	2020-06-18 16:40:00 +0300
committer	Boris Kolpackov <boris@codesynthesis.com>	2020-06-19 11:27:32 +0200
commit	112a83c346a537f1a5eac6fc17ee2ce3143d625b (patch)
tree	11ed26fb72a571299eba7e02a225eaf07e527c58
parent	78ac6aee6dff1b608bc312fe7ada442ba83710e8 (diff)