From 1270101f4267ecd187bb604190d004daaae341b7 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Fri, 4 Nov 2016 08:47:26 +0200 Subject: Various testscript lexer/parser fixes --- build2/lexer | 41 ++++++++++++++++++++++++----------------- build2/lexer.cxx | 30 +++++++++++++++++++----------- build2/parser | 2 ++ 3 files changed, 45 insertions(+), 28 deletions(-) (limited to 'build2') diff --git a/build2/lexer b/build2/lexer index c5c3857..f7f7b82 100644 --- a/build2/lexer +++ b/build2/lexer @@ -31,6 +31,12 @@ namespace build2 // automatically reset after the end of the line. The variable mode is reset // after the word token. And the eval mode is reset after the closing ')'. // + // Note that normally it is only safe to switch mode when the current token + // is not quoted (or, more generally, when you are not in the double-quoted + // mode) unless the mode treats the double-quote as a separator (e.g., + // variable name mode). Failing that, your mode (which now will be the top of + // the mode stack) will prevent proper recognition of the closing quote. + // // Extendable/inheritable enum-like class. // @@ -102,6 +108,23 @@ namespace build2 peek_char (); protected: + struct state + { + lexer_mode mode; + + char sep_pair; + bool sep_space; // Are whitespaces separators (see skip_spaces())? + + // Word separator characters. For two-character sequence put the first + // one in sep_first and the second one in the corresponding position of + // sep_second. If it's a single-character sequence, then put space in + // sep_second. If there are multiple sequences that start with the same + // character, then repeat the first character in sep_first. + // + const char* sep_first; + const char* sep_second; + }; + // If you extend the lexer and add a custom lexer mode, then you must // override next_impl() and handle the custom mode there. 
// @@ -115,7 +138,7 @@ namespace build2 next_quoted (); virtual token - word (bool separated); + word (state, bool separated); // Return true if we have seen any spaces. Skipped empty lines // don't count. In other words, we are only interested in spaces @@ -161,22 +184,6 @@ namespace build2 const char* escapes_; void (*processor_) (token&, const lexer&); - struct state - { - lexer_mode mode; - - char sep_pair; - bool sep_space; // Are whitespaces separators (see skip_spaces())? - - // Word separator characters. For two-character sequence put the first - // one in sep_first and the second one in the corresponding position of - // sep_second. If it's a single-character sequence, then put space in - // sep_second. If there are multiple sequences that start with the same - // character, then repeat the first character in sep_first. - // - const char* sep_first; - const char* sep_second; - }; std::stack state_; bool sep_; // True if we skipped spaces in peek(). diff --git a/build2/lexer.cxx b/build2/lexer.cxx index b188396..c84b102 100644 --- a/build2/lexer.cxx +++ b/build2/lexer.cxx @@ -78,7 +78,8 @@ namespace build2 token lexer:: next_impl () { - lexer_mode m (state_.top ().mode); + const state& st (state_.top ()); + lexer_mode m (st.mode); // For some modes we have dedicated imlementations of next(). // @@ -108,7 +109,7 @@ namespace build2 // Handle pair separator. // if ((m == lexer_mode::normal || m == lexer_mode::value) && - c == state_.top ().sep_pair) + c == st.sep_pair) return make_token (type::pair_separator); switch (c) @@ -168,7 +169,7 @@ namespace build2 // Otherwise it is a word. // unget (c); - return word (sep); + return word (st, sep); } token lexer:: @@ -180,6 +181,8 @@ namespace build2 if (eos (c)) fail (c) << "unterminated evaluation context"; + const state& st (state_.top ()); + uint64_t ln (c.line), cn (c.column); auto make_token = [sep, ln, cn] (type t) @@ -193,7 +196,7 @@ namespace build2 // Handle pair separator. 
// - if (c == state_.top ().sep_pair) + if (c == st.sep_pair) return make_token (type::pair_separator); // Note: we don't treat [ and ] as special here. Maybe can use them for @@ -242,7 +245,7 @@ namespace build2 // Otherwise it is a word. // unget (c); - return word (sep); + return word (st, sep); } token lexer:: @@ -264,13 +267,13 @@ namespace build2 // Otherwise it is a word. // unget (c); - return word (false); + return word (state_.top (), false); } token lexer:: - word (bool sep) + word (state st, bool sep) { - lexer_mode m (state_.top ().mode); + lexer_mode m (st.mode); xchar c (peek ()); assert (!eos (c)); @@ -337,7 +340,9 @@ namespace build2 { get (); state_.pop (); - m = state_.top ().mode; + + st = state_.top (); + m = st.mode; continue; } } @@ -366,7 +371,6 @@ namespace build2 { // First check if it's a pair separator. // - const state& st (state_.top ()); if (c == st.sep_pair) done = true; else @@ -421,7 +425,11 @@ namespace build2 case '\"': { get (); - mode ((m = lexer_mode::double_quoted)); + + mode (lexer_mode::double_quoted); + st = state_.top (); + m = st.mode; + quoted = true; continue; } diff --git a/build2/parser b/build2/parser index 89f42b8..f038b97 100644 --- a/build2/parser +++ b/build2/parser @@ -445,6 +445,8 @@ namespace build2 const fail_mark fail; protected: + bool pre_parse_ = false; + bool boot_; const path* path_; // Current path. -- cgit v1.1