diff options
author | Karen Arutyunov <karen@codesynthesis.com> | 2020-06-18 16:40:00 +0300 |
---|---|---|
committer | Boris Kolpackov <boris@codesynthesis.com> | 2020-06-19 11:27:32 +0200 |
commit | 112a83c346a537f1a5eac6fc17ee2ce3143d625b (patch) | |
tree | 11ed26fb72a571299eba7e02a225eaf07e527c58 | |
parent | 78ac6aee6dff1b608bc312fe7ada442ba83710e8 (diff) |
Fix lexer to fail on invalid UTF-8 sequences
mode | file | lines changed
---|---|---
-rw-r--r-- | libbuild2/lexer+utf8.test.testscript | 28
-rw-r--r-- | libbuild2/lexer.cxx | 6
-rw-r--r-- | libbuild2/lexer.hxx | 29
-rw-r--r-- | libbuild2/lexer.ixx | 33
4 files changed, 96 insertions, 0 deletions
```diff
diff --git a/libbuild2/lexer+utf8.test.testscript b/libbuild2/lexer+utf8.test.testscript
new file mode 100644
index 0000000..42c62ea
--- /dev/null
+++ b/libbuild2/lexer+utf8.test.testscript
@@ -0,0 +1,28 @@
+# file : libbuild2/lexer+utf8.test.testscript
+# license : MIT; see accompanying LICENSE file
+
+: valid
+:
+$* <<EOI >>EOO
+Sommerzeit
+Mitteleuropäische
+EOI
+'Sommerzeit'
+<newline>
+'Mitteleuropäische'
+<newline>
+EOO
+
+: invalid
+:
+: Here we spoil the UTF-8 sequence 'ä' by dropping its second byte.
+:
+cat <<EOI | sed -e 's/(rop.).(isc)/\1\2/' | $* >>EOO 2>>EOE != 0
+Sommerzeit
+Mitteleuropäische
+EOI
+'Sommerzeit'
+<newline>
+EOO
+<stdin>:2:12: error: invalid UTF-8 sequence second byte (0x69 'i')
+EOE
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx
index 4256422..ff7be02 100644
--- a/libbuild2/lexer.cxx
+++ b/libbuild2/lexer.cxx
@@ -11,6 +11,12 @@ namespace build2
 {
   using type = token_type;
 
+  [[noreturn]] void lexer::
+  fail_char (const xchar& c)
+  {
+    fail (c) << ebuf_ << endf;
+  }
+
   pair<pair<char, char>, bool> lexer::
   peek_chars ()
   {
diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx
index 6c2b90b..cc42219 100644
--- a/libbuild2/lexer.hxx
+++ b/libbuild2/lexer.hxx
@@ -187,6 +187,23 @@ namespace build2
     pair<pair<char, char>, bool>
     peek_chars ();
 
+    // As base::get() but in case of an invalid character issue diagnostics
+    // and throw failed.
+    //
+    xchar
+    get ();
+
+    // Get previously peeked character (faster).
+    //
+    void
+    get (const xchar&);
+
+    // As base::peek() but in case of an invalid character issue diagnostics
+    // and throw failed.
+    //
+    xchar
+    peek ();
+
   protected:
     struct state
     {
@@ -243,6 +260,9 @@ namespace build2
   protected:
     fail_mark fail;
 
+    [[noreturn]] void
+    fail_char (const xchar&);
+
     // Lexer state.
     //
   protected:
@@ -266,6 +286,13 @@ namespace build2
     std::stack<state> state_;
 
     bool sep_; // True if we skipped spaces in peek().
+
+  private:
+    using base = char_scanner<butl::utf8_validator, 2>;
+
+    // Buffer for a get()/peek() potential error.
+    //
+    string ebuf_;
   };
 }
 
@@ -284,4 +311,6 @@ namespace butl // ADL
   }
 }
 
+#include <libbuild2/lexer.ixx>
+
 #endif // LIBBUILD2_LEXER_HXX
diff --git a/libbuild2/lexer.ixx b/libbuild2/lexer.ixx
new file mode 100644
index 0000000..04899f0
--- /dev/null
+++ b/libbuild2/lexer.ixx
@@ -0,0 +1,33 @@
+// file : libbuild2/lexer.ixx -*- C++ -*-
+// license : MIT; see accompanying LICENSE file
+
+namespace build2
+{
+  inline auto lexer::
+  get () -> xchar
+  {
+    xchar c (base::get (ebuf_));
+
+    if (invalid (c))
+      fail_char (c);
+
+    return c;
+  }
+
+  inline void lexer::
+  get (const xchar& peeked)
+  {
+    base::get (peeked);
+  }
+
+  inline auto lexer::
+  peek () -> xchar
+  {
+    xchar c (base::peek (ebuf_));
+
+    if (invalid (c))
+      fail_char (c);
+
+    return c;
+  }
+}
```