aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Karen Arutyunov <karen@codesynthesis.com> 2020-06-18 16:40:00 +0300
committer Boris Kolpackov <boris@codesynthesis.com> 2020-06-19 11:27:32 +0200
commit 112a83c346a537f1a5eac6fc17ee2ce3143d625b (patch)
tree 11ed26fb72a571299eba7e02a225eaf07e527c58
parent 78ac6aee6dff1b608bc312fe7ada442ba83710e8 (diff)
Fix lexer to fail on invalid UTF-8 sequences
-rw-r--r-- libbuild2/lexer+utf8.test.testscript 28
-rw-r--r-- libbuild2/lexer.cxx 6
-rw-r--r-- libbuild2/lexer.hxx 29
-rw-r--r-- libbuild2/lexer.ixx 33
4 files changed, 96 insertions, 0 deletions
diff --git a/libbuild2/lexer+utf8.test.testscript b/libbuild2/lexer+utf8.test.testscript
new file mode 100644
index 0000000..42c62ea
--- /dev/null
+++ b/libbuild2/lexer+utf8.test.testscript
@@ -0,0 +1,28 @@
+# file : libbuild2/lexer+utf8.test.testscript
+# license : MIT; see accompanying LICENSE file
+
+: valid
+:
+$* <<EOI >>EOO
+ Sommerzeit
+ Mitteleuropäische
+ EOI
+ 'Sommerzeit'
+ <newline>
+ 'Mitteleuropäische'
+ <newline>
+ EOO
+
+: invalid
+:
+: Here we spoil the UTF-8 sequence 'ä' by dropping its second byte.
+:
+cat <<EOI | sed -e 's/(rop.).(isc)/\1\2/' | $* >>EOO 2>>EOE != 0
+ Sommerzeit
+ Mitteleuropäische
+ EOI
+ 'Sommerzeit'
+ <newline>
+ EOO
+ <stdin>:2:12: error: invalid UTF-8 sequence second byte (0x69 'i')
+ EOE
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx
index 4256422..ff7be02 100644
--- a/libbuild2/lexer.cxx
+++ b/libbuild2/lexer.cxx
@@ -11,6 +11,12 @@ namespace build2
{
using type = token_type;
+ [[noreturn]] void lexer:: // Issue diagnostics for character c using the error text in ebuf_.
+ fail_char (const xchar& c) // c: the character that get()/peek() found to be invalid.
+ {
+ fail (c) << ebuf_ << endf; // NOTE(review): endf presumably ends the record and throws failed; confirm.
+ }
+
pair<pair<char, char>, bool> lexer::
peek_chars ()
{
diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx
index 6c2b90b..cc42219 100644
--- a/libbuild2/lexer.hxx
+++ b/libbuild2/lexer.hxx
@@ -187,6 +187,23 @@ namespace build2
pair<pair<char, char>, bool>
peek_chars ();
+ // As base::get() but in case of an invalid character issue diagnostics
+ // and throw failed.
+ //
+ xchar
+ get ();
+
+ // Get previously peeked character (faster).
+ //
+ void
+ get (const xchar&);
+
+ // As base::peek() but in case of an invalid character issue diagnostics
+ // and throw failed.
+ //
+ xchar
+ peek ();
+
protected:
struct state
{
@@ -243,6 +260,9 @@ namespace build2
protected:
fail_mark fail;
+ [[noreturn]] void
+ fail_char (const xchar&);
+
// Lexer state.
//
protected:
@@ -266,6 +286,13 @@ namespace build2
std::stack<state> state_;
bool sep_; // True if we skipped spaces in peek().
+
+ private:
+ using base = char_scanner<butl::utf8_validator, 2>;
+
+ // Buffer for a get()/peek() potential error.
+ //
+ string ebuf_;
};
}
@@ -284,4 +311,6 @@ namespace butl // ADL
}
}
+#include <libbuild2/lexer.ixx>
+
#endif // LIBBUILD2_LEXER_HXX
diff --git a/libbuild2/lexer.ixx b/libbuild2/lexer.ixx
new file mode 100644
index 0000000..04899f0
--- /dev/null
+++ b/libbuild2/lexer.ixx
@@ -0,0 +1,33 @@
+// file : libbuild2/lexer.ixx -*- C++ -*-
+// license : MIT; see accompanying LICENSE file
+
+namespace build2
+{
+ inline auto lexer:: // As base::get() but fail via fail_char() on an invalid character.
+ get () -> xchar
+ {
+ xchar c (base::get (ebuf_)); // ebuf_ is passed so the base scanner can store an error description.
+
+ if (invalid (c)) // NOTE(review): presumably true when UTF-8 validation of c failed; confirm.
+ fail_char (c); // Declared [[noreturn]]; control does not continue past this call.
+
+ return c; // Reached only for a character that passed the check above.
+ }
+
+ inline void lexer:: // Get a previously peeked character by forwarding to base::get().
+ get (const xchar& peeked) // peeked: the character returned by an earlier peek().
+ {
+ base::get (peeked); // No validity check here. NOTE(review): presumably peek() already did it; confirm.
+ }
+
+ inline auto lexer:: // As base::peek() but fail via fail_char() on an invalid character.
+ peek () -> xchar
+ {
+ xchar c (base::peek (ebuf_)); // ebuf_ is passed so the base scanner can store an error description.
+
+ if (invalid (c)) // NOTE(review): presumably true when UTF-8 validation of c failed; confirm.
+ fail_char (c); // Declared [[noreturn]]; control does not continue past this call.
+
+ return c; // Reached only for a character that passed the check above.
+ }
+}