From bb02e152dc036879ab0b2d1d8aa2cb19084b8e16 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Tue, 25 May 2021 13:42:41 +0200 Subject: Recognize quoting of first character in token Use this to relax the pattern inclusion/exclusion syntax to only require unquoted +/-. --- libbuild2/lexer+quoting.test.testscript | 30 ++++++++++++++---- libbuild2/lexer.cxx | 55 ++++++++++++++++++++++----------- libbuild2/lexer.test.cxx | 6 +++- libbuild2/parser.cxx | 48 +++++++++++++++++----------- libbuild2/script/lexer.cxx | 4 +-- libbuild2/test/script/lexer.cxx | 5 ++- libbuild2/token.hxx | 25 ++++++++++----- 7 files changed, 118 insertions(+), 55 deletions(-) diff --git a/libbuild2/lexer+quoting.test.testscript b/libbuild2/lexer+quoting.test.testscript index 0143c90..ddfb0d0 100644 --- a/libbuild2/lexer+quoting.test.testscript +++ b/libbuild2/lexer+quoting.test.testscript @@ -56,9 +56,9 @@ EOO : Token start already quoted : $* <'"$foo"' >>EOO - '' [D/P] + '' [D/P/F] $ [D/C] - 'foo' [D/P] + 'foo' [D/P/F] EOO @@ -66,7 +66,7 @@ EOO : Token end still quoted : $* <'"foo$"' >>EOO - 'foo' [D/P] + 'foo' [D/P/F] $ [D/C] '' [D/P] @@ -87,7 +87,7 @@ EOO : Token continous with unquoted character : $* <'"fo"o' >>EOO - 'foo' [D/P] + 'foo' [D/P/F] EOO @@ -95,7 +95,7 @@ EOO : Token continous with unquoted escaped character : $* <'"fo"\"' >>EOO - 'fo"' [D/P] + 'fo"' [D/P/F] EOO } @@ -104,6 +104,24 @@ EOO : mixed : $* <"\"fo\"'o'" >>EOO -'foo' [M/P] +'foo' [M/P/F] EOO + +: first +: +{ + : empty-single + : + $* <"''+foo" >>EOO + '+foo' [S/P/F] + + EOO + + : empty-double + : + $* <'""+foo' >>EOO + '+foo' [D/P/F] + + EOO +} diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx index 9b7d01e..0b6f96d 100644 --- a/libbuild2/lexer.cxx +++ b/libbuild2/lexer.cxx @@ -202,9 +202,10 @@ namespace build2 auto make_token = [&sep, ln, cn] (type t, string v = string ()) { - return token (t, move (v), - sep, quote_type::unquoted, false, - ln, cn, token_printer); + return token (t, move (v), sep, + quote_type::unquoted, false, false, + ln, cn, + token_printer); }; // Handle `[` (do it first to make sure the flag is cleared regardless of @@ -446,9 +447,10 @@ namespace build2 auto make_token = [sep, ln, cn] (type t, string v = string ()) { - return token (t, move (v), - sep, quote_type::unquoted, false, - ln, cn, token_printer); + return token (t, move (v), sep, + quote_type::unquoted, false, false, + ln, cn, + token_printer); }; // Handle `[` (do it first to make sure the flag is cleared regardless of @@ -620,15 +622,14 @@ namespace build2 if (c == '\n' || c == '#' || eos (c)) { - st.hold = token (type::multi_rcbrace, - string (count, '}'), - false, quote_type::unquoted, false, + st.hold = token (type::multi_rcbrace, string (count, '}'), false, + quote_type::unquoted, false, false, bln, bcn, token_printer); lexeme.resize (chop); - return token (move (lexeme), - false, quote_type::unquoted, false, + return token (move (lexeme), false, + quote_type::unquoted, false, false, ln, cn); } @@ -671,15 +672,22 @@ namespace build2 // quote character. // bool qcomp (false); + bool qfirst (false); - auto append = [&lexeme, &m, &qcomp] (char c) + auto append = [&lexeme, &m, &qcomp, &qfirst] (char c) { - lexeme += c; + if (m == lexer_mode::double_quoted) + { + if (lexeme.empty ()) // First character. + qfirst = true; + } + else + { + if (qcomp) // An unquoted character after a quoted fragment. + qcomp = false; + } - // An unquoted character after a quoted fragment. - // - if (qcomp && m != lexer_mode::double_quoted) - qcomp = false; + lexeme += c; }; for (; !eos (c); c = peek ()) @@ -840,6 +848,12 @@ namespace build2 break; } + // Note that we will treat plus in ''+ as quoted. This is + // probably the better option considering the "$empty"+ case + // + if (lexeme.empty ()) + qfirst = true; + get (); for (c = get (); !eos (c) && c != '\''; c = get ()) lexeme += c; @@ -875,6 +889,11 @@ namespace build2 break; } + // The same reasoning as above. + // + if (lexeme.empty ()) + qfirst = true; + continue; } } @@ -905,7 +924,7 @@ namespace build2 if (m == lexer_mode::variable) state_.pop (); - return token (move (lexeme), sep, qtype, qcomp, ln, cn); + return token (move (lexeme), sep, qtype, qcomp, qfirst, ln, cn); } pair lexer:: diff --git a/libbuild2/lexer.test.cxx b/libbuild2/lexer.test.cxx index 24f0528..6d48885 100644 --- a/libbuild2/lexer.test.cxx +++ b/libbuild2/lexer.test.cxx @@ -84,7 +84,11 @@ namespace build2 } if (q != '\0') - cout << " [" << q << (t.qcomp ? "/C" : "/P") << ']'; + cout << " [" + << q + << (t.qcomp ? "/C" : "/P") + << (!t.qcomp && t.qfirst ? "/F" : "") + << ']'; } cout << endl; diff --git a/libbuild2/parser.cxx b/libbuild2/parser.cxx index f152b17..a9646d5 100644 --- a/libbuild2/parser.cxx +++ b/libbuild2/parser.cxx @@ -5394,6 +5394,7 @@ namespace build2 // bool concat (false); bool concat_quoted (false); + bool concat_quoted_first (false); name concat_data; auto concat_typed = [&vnull, &vtype, &concat, &concat_data, this] @@ -5492,21 +5493,13 @@ namespace build2 // Return '+' or '-' if a token can start an inclusion or exclusion // (pattern or group), '\0' otherwise. The result can be used as bool. // - // @@ Note that we only need to make sure that the leading '+' or '-' - // characters are unquoted. We could consider some partially quoted - // tokens as starting inclusion or exclusion as well, for example - // +'foo*'. However, currently we can not determine which part of a - // token is quoted, and so can't distinguish the above token from - // '+'foo*. This is why we end up with a criteria that is stricter than - // is really required. - // auto pattern_prefix = [] (const token& t) -> char { char c; - return t.type == type::word && ((c = t.value[0]) == '+' || c == '-') && - t.qtype == quote_type::unquoted - ? c - : '\0'; + return (t.type == type::word && !t.qfirst && + ((c = t.value[0]) == '+' || c == '-') + ? c + : '\0'); }; // A name sequence potentially starts with a pattern if it starts with a @@ -5586,9 +5579,11 @@ namespace build2 assert (!pre_parse_); bool quoted (concat_quoted); + bool quoted_first (concat_quoted_first); concat = false; concat_quoted = false; + concat_quoted_first = false; // If this is a result of typed concatenation, then don't inject. For // one we don't want any of the "interpretations" performed in the @@ -5671,7 +5666,7 @@ namespace build2 t = token (move (concat_data.value), true, quoted ? quote_type::mixed : quote_type::unquoted, - false, + false, quoted_first, t.line, t.column); } else if (!first) @@ -5713,6 +5708,7 @@ namespace build2 string val (move (t.value)); const location loc (get_location (t)); bool quoted (t.qtype != quote_type::unquoted); + bool quoted_first (t.qfirst); // Should we accumulate? If the buffer is not empty, then we continue // accumulating (the case where we are separated should have been @@ -5723,6 +5719,8 @@ namespace build2 if (concat || // Continue. !last_concat ()) // Start. { + bool e (val.empty ()); + // If LHS is typed then do typed concatenation. // if (concat && vtype != nullptr) @@ -5743,8 +5741,17 @@ namespace build2 v += val; } - concat = true; + // Consider something like this: ""$foo where foo='+foo'. Should we + // treat the plus as a first (unquoted) character? Feels like we + // should not. The way we achieve this is a bit hackish: we make it + // look like a quoted first character. Note that there is a second + // half of this in expansion case which deals with $empty+foo. + // + if (!concat) // First. + concat_quoted_first = quoted_first || e; + concat_quoted = quoted || concat_quoted; + concat = true; continue; } @@ -6451,7 +6458,7 @@ namespace build2 // else if (!result->null && !result->empty ()) { - // This can only an untyped value. + // This can only be an untyped value. // // @@ Could move if result == &result_data. // @@ -6487,8 +6494,13 @@ namespace build2 concat_data.value += n.value; } - concat = true; + // The same little hack as in the word case ($empty+foo). + // + if (!concat) // First. + concat_quoted_first = true; + concat_quoted = quoted || concat_quoted; + concat = true; } else { @@ -6703,8 +6715,8 @@ namespace build2 // // print +foo // - // So wepeek at one more character since what we expect next ('=') can't - // be whitespace-separated. + // So we peek at one more character since what we expect next ('=') + // can't be whitespace-separated. // return c0 == '\n' || c0 == '\0' || c0 == '(' || (p.second && diff --git a/libbuild2/script/lexer.cxx b/libbuild2/script/lexer.cxx index a18c1df..7577149 100644 --- a/libbuild2/script/lexer.cxx +++ b/libbuild2/script/lexer.cxx @@ -127,7 +127,7 @@ namespace build2 bool q (m == lexer_mode::here_line_double); return token (t, string (), sep, - (q ? quote_type::double_ : quote_type::unquoted), q, + (q ? quote_type::double_ : quote_type::unquoted), q, q, ln, cn, token_printer); }; @@ -180,7 +180,7 @@ namespace build2 auto make_token = [&sep, &c] (type t, string v = string ()) { return token (t, move (v), sep, - quote_type::unquoted, false, + quote_type::unquoted, false, false, c.line, c.column, token_printer); }; diff --git a/libbuild2/test/script/lexer.cxx b/libbuild2/test/script/lexer.cxx index c23dea4..f9c8ac6 100644 --- a/libbuild2/test/script/lexer.cxx +++ b/libbuild2/test/script/lexer.cxx @@ -324,9 +324,8 @@ namespace build2 lexeme += c; } - return token (move (lexeme), - false, - quote_type::unquoted, false, + return token (move (lexeme), false, + quote_type::unquoted, false, false, ln, cn); } diff --git a/libbuild2/token.hxx b/libbuild2/token.hxx index 030ab48..faae466 100644 --- a/libbuild2/token.hxx +++ b/libbuild2/token.hxx @@ -119,10 +119,12 @@ namespace build2 // Quoting can be complete, where the token starts and ends with the quote // characters and quoting is contiguous or partial where only some part(s) - // of the token are quoted or quoting continues to the next token. + // of the token are quoted or quoting continues to the next token. We also + // keep track whether the first character of a token is quoted. // quote_type qtype; bool qcomp; + bool qfirst; // Normally only used for word, but can also be used to store "modifiers" // or some such for other tokens. @@ -139,26 +141,35 @@ namespace build2 : token (token_type::eos, false, 0, 0, token_printer) {} token (token_type t, bool s, uint64_t l, uint64_t c, printer_type* p) - : token (t, string (), s, quote_type::unquoted, false, l, c, p) {} + : token (t, string (), s, + quote_type::unquoted, false, false, + l, c, + p) {} token (token_type t, bool s, quote_type qt, uint64_t l, uint64_t c, printer_type* p) - : token (t, string (), s, qt, qt != quote_type::unquoted, l, c, p) {} + : token (t, string (), s, + qt, qt != quote_type::unquoted, qt != quote_type::unquoted, + l, c, + p) {} token (string v, bool s, - quote_type qt, bool qc, + quote_type qt, bool qc, bool qf, uint64_t l, uint64_t c) - : token (token_type::word, move (v), s, qt, qc, l, c, &token_printer){} + : token (token_type::word, move (v), s, + qt, qc, qf, + l, c, + &token_printer) {} token (token_type t, string v, bool s, - quote_type qt, bool qc, + quote_type qt, bool qc, bool qf, uint64_t l, uint64_t c, printer_type* p) : type (t), separated (s), - qtype (qt), qcomp (qc), + qtype (qt), qcomp (qc), qfirst (qf), value (move (v)), line (l), column (c), printer (p) {} -- cgit v1.1