From e3b6dc455ab5c98606e38983bd19426ae346f469 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Wed, 9 Sep 2015 10:20:52 +0200 Subject: Reimplement double quote lexing to avoid "implied quote" trick --- build/lexer | 9 +-- build/lexer.cxx | 191 +++++++++++++++++++++++++++----------------------------- 2 files changed, 96 insertions(+), 104 deletions(-) (limited to 'build') diff --git a/build/lexer b/build/lexer index 0740f14..37c7807 100644 --- a/build/lexer +++ b/build/lexer @@ -69,13 +69,10 @@ namespace build private: token - name (bool separated); - - void - single_quote (std::string&); + next_quoted (); - bool - double_quote (std::string&); + token + name (bool separated); // Return true we have seen any spaces. Skipped empty lines don't // count. In other words, we are only interested in spaces that diff --git a/build/lexer.cxx b/build/lexer.cxx index 9c76377..6da18eb 100644 --- a/build/lexer.cxx +++ b/build/lexer.cxx @@ -13,29 +13,12 @@ namespace build { lexer_mode m (mode_.top ()); - // If we are in the quoted mode, then this means we have seen a - // variable expansion ($) and had to "break" the quoted sequence - // into multiple "concatenated" tokens. So what we have now is - // the "tail" of that quoted sequence which we need to continue - // scanning. To make this work auto-magically (well, almost) we - // are going to use a little trick: we will "pretend" that the - // next character is the opening quote. After all, a sequence - // like "$foo bar" is semantically equivalent to "$foo"" bar". + // For some modes we have dedicated implementations of next(). // - if (m == lexer_mode::quoted) + switch (m) { - xchar c (peek ()); - - // Detect the beginning of the "break". After that, we rely - // on the caller switching to the variable mode. - // - if (c != '$') - { - mode_.pop (); // As if we saw closing quote. - c.value = '"'; // Keep line/column information. 
- unget (c); - return name (false); - } + case lexer_mode::quoted: return next_quoted (); + default: break; } bool sep (skip_spaces ()); @@ -120,6 +103,23 @@ namespace build } token lexer:: + next_quoted () + { + xchar c (peek ()); + + if (eos (c)) + fail (c) << "unterminated double-quoted sequence"; + + uint64_t ln (c.line), cn (c.column); + + switch (c) + { + case '$': get (); return token (token_type::dollar, false, ln, cn); + default: return name (false); + } + } + + token lexer:: name (bool sep) { xchar c (peek ()); @@ -140,9 +140,11 @@ namespace build break; // The following characters are not treated as special in the - // value or pairs mode. + // value/pairs and quoted modes. // - if (m != lexer_mode::value && m != lexer_mode::pairs) + if (m != lexer_mode::value && + m != lexer_mode::pairs && + m != lexer_mode::quoted) { switch (c) { @@ -178,61 +180,104 @@ namespace build break; } + // If we are quoted, these are ordinary characters. + // + if (m != lexer_mode::quoted) + { + switch (c) + { + case ' ': + case '\t': + case '\n': + case '#': + case '{': + case '}': + case '(': + case ')': + { + done = true; + break; + } + case '\\': + { + get (); + lexeme += escape (); + continue; + } + case '\'': + { + // If we are in the variable mode, then treat quote as just + // another separator. + // + if (m == lexer_mode::variable) + { + done = true; + break; + } + else + { + get (); + + for (c = get (); !eos (c) && c != '\''; c = get ()) + lexeme += c; + + if (eos (c)) + fail (c) << "unterminated single-quoted sequence"; + + continue; + } + } + } + + if (done) + break; + } + switch (c) { - case ' ': - case '\t': - case '\n': - case '#': - case '{': - case '}': case '$': - case '(': - case ')': { done = true; break; } - case '\\': - { - get (); - lexeme += escape (); - break; - } - case '\'': case '\"': { - // If we are in the variable mode, then treat quotes as just + // If we are in the variable mode, then treat quote as just // another separator. 
// if (m == lexer_mode::variable) + { done = true; + break; + } else { get (); - if (c == '\'') - single_quote (lexeme); + if (m == lexer_mode::quoted) + mode_.pop (); else - { mode_.push (lexer_mode::quoted); - done = double_quote (lexeme); - } + + m = mode_.top (); + continue; } - break; } default: { get (); lexeme += c; - break; + continue; } } - if (done) - break; + assert (done); + break; } + if (m == lexer_mode::quoted && eos (c)) + fail (c) << "unterminated double-quoted sequence"; + // Expire variable mode at the end of the name. // if (m == lexer_mode::variable) @@ -241,56 +286,6 @@ namespace build return token (lexeme, sep, ln, cn); } - // Assuming the previous character is the opening single quote, scan - // the stream until the closing quote or eos, accumulating characters - // in between in lexeme. Fail if eos is reached before the closing - // quote. - // - void lexer:: - single_quote (string& lexeme) - { - xchar c (get ()); - - for (; !eos (c) && c != '\''; c = get ()) - lexeme += c; - - if (eos (c)) - fail (c) << "unterminated single-quoted sequence"; - } - - // Assuming the previous character is the opening double quote, scan - // the stream until the closing quote, $, or eos, accumulating - // characters in between in lexeme. Return false if we stopped - // because of the closing quote (which means the normal name - // scanning can continue) and true if we stopped at $ (meaning this - // name is done and what follows is another token). Fail if eos is - // reached before the closing quote. - // - bool lexer:: - double_quote (string& lexeme) - { - xchar c (peek ()); - - for (; !eos (c); c = peek ()) - { - if (c == '$') - return true; - - get (); - - if (c == '"') - { - mode_.pop (); // Expire quoted mode. - return false; - } - - lexeme += c; - } - - fail (c) << "unterminated double-quoted sequence"; - return false; // Never reached. - } - bool lexer:: skip_spaces () { -- cgit v1.1