From 112a83c346a537f1a5eac6fc17ee2ce3143d625b Mon Sep 17 00:00:00 2001
From: Karen Arutyunov
Date: Thu, 18 Jun 2020 16:40:00 +0300
Subject: Fix lexer to fail on invalid UTF-8 sequences

---
 libbuild2/lexer+utf8.test.testscript | 28 ++++++++++++++++++++++++++++
 libbuild2/lexer.cxx                  |  6 ++++++
 libbuild2/lexer.hxx                  | 29 +++++++++++++++++++++++++++++
 libbuild2/lexer.ixx                  | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 96 insertions(+)
 create mode 100644 libbuild2/lexer+utf8.test.testscript
 create mode 100644 libbuild2/lexer.ixx

diff --git a/libbuild2/lexer+utf8.test.testscript b/libbuild2/lexer+utf8.test.testscript
new file mode 100644
index 0000000..42c62ea
--- /dev/null
+++ b/libbuild2/lexer+utf8.test.testscript
@@ -0,0 +1,28 @@
+# file      : libbuild2/lexer+utf8.test.testscript
+# license   : MIT; see accompanying LICENSE file
+
+: valid
+:
+$* <<EOI >>EOO
+  Sommerzeit
+  Mitteleuropäische
+  EOI
+  'Sommerzeit'
+
+  'Mitteleuropäische'
+
+  EOO
+
+: invalid
+:
+: Here we spoil the UTF-8 sequence 'ä' by dropping its second byte.
+:
+cat <>EOO 2>>EOE != 0
+  Sommerzeit
+  Mitteleuropäische
+  EOI
+  'Sommerzeit'
+
+  EOO
+  :2:12: error: invalid UTF-8 sequence second byte (0x69 'i')
+  EOE
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx
index 4256422..ff7be02 100644
--- a/libbuild2/lexer.cxx
+++ b/libbuild2/lexer.cxx
@@ -11,6 +11,12 @@ namespace build2
 {
   using type = token_type;
 
+  [[noreturn]] void lexer::
+  fail_char (const xchar& c)
+  {
+    fail (c) << ebuf_ << endf;
+  }
+
   pair, bool> lexer::
   peek_chars ()
   {
diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx
index 6c2b90b..cc42219 100644
--- a/libbuild2/lexer.hxx
+++ b/libbuild2/lexer.hxx
@@ -187,6 +187,23 @@ namespace build2
     pair, bool>
     peek_chars ();
 
+    // As base::get() but in case of an invalid character issue diagnostics
+    // and throw failed.
+    //
+    xchar
+    get ();
+
+    // Get previously peeked character (faster).
+    //
+    void
+    get (const xchar&);
+
+    // As base::peek() but in case of an invalid character issue diagnostics
+    // and throw failed.
+    //
+    xchar
+    peek ();
+
   protected:
     struct state
     {
@@ -243,6 +260,9 @@ namespace build2
   protected:
     fail_mark fail;
 
+    [[noreturn]] void
+    fail_char (const xchar&);
+
     // Lexer state.
     //
   protected:
@@ -266,6 +286,13 @@ namespace build2
     std::stack<state> state_;
 
     bool sep_; // True if we skipped spaces in peek().
+
+  private:
+    using base = char_scanner;
+
+    // Buffer for a get()/peek() potential error.
+    //
+    string ebuf_;
   };
 }
 
@@ -284,4 +311,6 @@ namespace butl // ADL
   }
 }
 
+#include <libbuild2/lexer.ixx>
+
 #endif // LIBBUILD2_LEXER_HXX
diff --git a/libbuild2/lexer.ixx b/libbuild2/lexer.ixx
new file mode 100644
index 0000000..04899f0
--- /dev/null
+++ b/libbuild2/lexer.ixx
@@ -0,0 +1,33 @@
+// file      : libbuild2/lexer.ixx -*- C++ -*-
+// license   : MIT; see accompanying LICENSE file
+
+namespace build2
+{
+  inline auto lexer::
+  get () -> xchar
+  {
+    xchar c (base::get (ebuf_));
+
+    if (invalid (c))
+      fail_char (c);
+
+    return c;
+  }
+
+  inline void lexer::
+  get (const xchar& peeked)
+  {
+    base::get (peeked);
+  }
+
+  inline auto lexer::
+  peek () -> xchar
+  {
+    xchar c (base::peek (ebuf_));
+
+    if (invalid (c))
+      fail_char (c);
+
+    return c;
+  }
+}
-- 
cgit v1.1