aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Boris Kolpackov <boris@codesynthesis.com> 2021-05-25 13:42:41 +0200
committer Boris Kolpackov <boris@codesynthesis.com> 2021-05-28 10:10:44 +0200
commit bb02e152dc036879ab0b2d1d8aa2cb19084b8e16 (patch)
tree ca3f3e950351f5bfadeec5b0ecb31d6c5bae084f
parent af5fa9e744acf6da12f2eab7f44810195c0d3ecd (diff)
Recognize quoting of first character in token
Use this to relax the pattern inclusion/exclusion syntax to only require unquoted +/-.
-rw-r--r-- libbuild2/lexer+quoting.test.testscript 30
-rw-r--r-- libbuild2/lexer.cxx 55
-rw-r--r-- libbuild2/lexer.test.cxx 6
-rw-r--r-- libbuild2/parser.cxx 48
-rw-r--r-- libbuild2/script/lexer.cxx 4
-rw-r--r-- libbuild2/test/script/lexer.cxx 5
-rw-r--r-- libbuild2/token.hxx 25
7 files changed, 118 insertions, 55 deletions
diff --git a/libbuild2/lexer+quoting.test.testscript b/libbuild2/lexer+quoting.test.testscript
index 0143c90..ddfb0d0 100644
--- a/libbuild2/lexer+quoting.test.testscript
+++ b/libbuild2/lexer+quoting.test.testscript
@@ -56,9 +56,9 @@ EOO
: Token start already quoted
:
$* <'"$foo"' >>EOO
- '' [D/P]
+ '' [D/P/F]
$ [D/C]
- 'foo' [D/P]
+ 'foo' [D/P/F]
<newline>
EOO
@@ -66,7 +66,7 @@ EOO
: Token end still quoted
:
$* <'"foo$"' >>EOO
- 'foo' [D/P]
+ 'foo' [D/P/F]
$ [D/C]
'' [D/P]
<newline>
@@ -87,7 +87,7 @@ EOO
: Token continous with unquoted character
:
$* <'"fo"o' >>EOO
- 'foo' [D/P]
+ 'foo' [D/P/F]
<newline>
EOO
@@ -95,7 +95,7 @@ EOO
: Token continous with unquoted escaped character
:
$* <'"fo"\"' >>EOO
- 'fo"' [D/P]
+ 'fo"' [D/P/F]
<newline>
EOO
}
@@ -104,6 +104,24 @@ EOO
: mixed
:
$* <"\"fo\"'o'" >>EOO
-'foo' [M/P]
+'foo' [M/P/F]
<newline>
EOO
+
+: first
+:
+{
+ : empty-single
+ :
+ $* <"''+foo" >>EOO
+ '+foo' [S/P/F]
+ <newline>
+ EOO
+
+ : empty-double
+ :
+ $* <'""+foo' >>EOO
+ '+foo' [D/P/F]
+ <newline>
+ EOO
+}
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx
index 9b7d01e..0b6f96d 100644
--- a/libbuild2/lexer.cxx
+++ b/libbuild2/lexer.cxx
@@ -202,9 +202,10 @@ namespace build2
auto make_token = [&sep, ln, cn] (type t, string v = string ())
{
- return token (t, move (v),
- sep, quote_type::unquoted, false,
- ln, cn, token_printer);
+ return token (t, move (v), sep,
+ quote_type::unquoted, false, false,
+ ln, cn,
+ token_printer);
};
// Handle `[` (do it first to make sure the flag is cleared regardless of
@@ -446,9 +447,10 @@ namespace build2
auto make_token = [sep, ln, cn] (type t, string v = string ())
{
- return token (t, move (v),
- sep, quote_type::unquoted, false,
- ln, cn, token_printer);
+ return token (t, move (v), sep,
+ quote_type::unquoted, false, false,
+ ln, cn,
+ token_printer);
};
// Handle `[` (do it first to make sure the flag is cleared regardless of
@@ -620,15 +622,14 @@ namespace build2
if (c == '\n' || c == '#' || eos (c))
{
- st.hold = token (type::multi_rcbrace,
- string (count, '}'),
- false, quote_type::unquoted, false,
+ st.hold = token (type::multi_rcbrace, string (count, '}'), false,
+ quote_type::unquoted, false, false,
bln, bcn,
token_printer);
lexeme.resize (chop);
- return token (move (lexeme),
- false, quote_type::unquoted, false,
+ return token (move (lexeme), false,
+ quote_type::unquoted, false, false,
ln, cn);
}
@@ -671,15 +672,22 @@ namespace build2
// quote character.
//
bool qcomp (false);
+ bool qfirst (false);
- auto append = [&lexeme, &m, &qcomp] (char c)
+ auto append = [&lexeme, &m, &qcomp, &qfirst] (char c)
{
- lexeme += c;
+ if (m == lexer_mode::double_quoted)
+ {
+ if (lexeme.empty ()) // First character.
+ qfirst = true;
+ }
+ else
+ {
+ if (qcomp) // An unquoted character after a quoted fragment.
+ qcomp = false;
+ }
- // An unquoted character after a quoted fragment.
- //
- if (qcomp && m != lexer_mode::double_quoted)
- qcomp = false;
+ lexeme += c;
};
for (; !eos (c); c = peek ())
@@ -840,6 +848,12 @@ namespace build2
break;
}
+ // Note that we will treat plus in ''+ as quoted. This is
+ // probably the better option considering the "$empty"+ case.
+ //
+ if (lexeme.empty ())
+ qfirst = true;
+
get ();
for (c = get (); !eos (c) && c != '\''; c = get ())
lexeme += c;
@@ -875,6 +889,11 @@ namespace build2
break;
}
+ // The same reasoning as above.
+ //
+ if (lexeme.empty ())
+ qfirst = true;
+
continue;
}
}
@@ -905,7 +924,7 @@ namespace build2
if (m == lexer_mode::variable)
state_.pop ();
- return token (move (lexeme), sep, qtype, qcomp, ln, cn);
+ return token (move (lexeme), sep, qtype, qcomp, qfirst, ln, cn);
}
pair<bool, bool> lexer::
diff --git a/libbuild2/lexer.test.cxx b/libbuild2/lexer.test.cxx
index 24f0528..6d48885 100644
--- a/libbuild2/lexer.test.cxx
+++ b/libbuild2/lexer.test.cxx
@@ -84,7 +84,11 @@ namespace build2
}
if (q != '\0')
- cout << " [" << q << (t.qcomp ? "/C" : "/P") << ']';
+ cout << " ["
+ << q
+ << (t.qcomp ? "/C" : "/P")
+ << (!t.qcomp && t.qfirst ? "/F" : "")
+ << ']';
}
cout << endl;
diff --git a/libbuild2/parser.cxx b/libbuild2/parser.cxx
index f152b17..a9646d5 100644
--- a/libbuild2/parser.cxx
+++ b/libbuild2/parser.cxx
@@ -5394,6 +5394,7 @@ namespace build2
//
bool concat (false);
bool concat_quoted (false);
+ bool concat_quoted_first (false);
name concat_data;
auto concat_typed = [&vnull, &vtype, &concat, &concat_data, this]
@@ -5492,21 +5493,13 @@ namespace build2
// Return '+' or '-' if a token can start an inclusion or exclusion
// (pattern or group), '\0' otherwise. The result can be used as bool.
//
- // @@ Note that we only need to make sure that the leading '+' or '-'
- // characters are unquoted. We could consider some partially quoted
- // tokens as starting inclusion or exclusion as well, for example
- // +'foo*'. However, currently we can not determine which part of a
- // token is quoted, and so can't distinguish the above token from
- // '+'foo*. This is why we end up with a criteria that is stricter than
- // is really required.
- //
auto pattern_prefix = [] (const token& t) -> char
{
char c;
- return t.type == type::word && ((c = t.value[0]) == '+' || c == '-') &&
- t.qtype == quote_type::unquoted
- ? c
- : '\0';
+ return (t.type == type::word && !t.qfirst &&
+ ((c = t.value[0]) == '+' || c == '-')
+ ? c
+ : '\0');
};
// A name sequence potentially starts with a pattern if it starts with a
@@ -5586,9 +5579,11 @@ namespace build2
assert (!pre_parse_);
bool quoted (concat_quoted);
+ bool quoted_first (concat_quoted_first);
concat = false;
concat_quoted = false;
+ concat_quoted_first = false;
// If this is a result of typed concatenation, then don't inject. For
// one we don't want any of the "interpretations" performed in the
@@ -5671,7 +5666,7 @@ namespace build2
t = token (move (concat_data.value),
true,
quoted ? quote_type::mixed : quote_type::unquoted,
- false,
+ false, quoted_first,
t.line, t.column);
}
else if (!first)
@@ -5713,6 +5708,7 @@ namespace build2
string val (move (t.value));
const location loc (get_location (t));
bool quoted (t.qtype != quote_type::unquoted);
+ bool quoted_first (t.qfirst);
// Should we accumulate? If the buffer is not empty, then we continue
// accumulating (the case where we are separated should have been
@@ -5723,6 +5719,8 @@ namespace build2
if (concat || // Continue.
!last_concat ()) // Start.
{
+ bool e (val.empty ());
+
// If LHS is typed then do typed concatenation.
//
if (concat && vtype != nullptr)
@@ -5743,8 +5741,17 @@ namespace build2
v += val;
}
- concat = true;
+ // Consider something like this: ""$foo where foo='+foo'. Should we
+ // treat the plus as a first (unquoted) character? Feels like we
+ // should not. The way we achieve this is a bit hackish: we make it
+ // look like a quoted first character. Note that there is a second
+ // half of this in the expansion case which deals with $empty+foo.
+ //
+ if (!concat) // First.
+ concat_quoted_first = quoted_first || e;
+
concat_quoted = quoted || concat_quoted;
+ concat = true;
continue;
}
@@ -6451,7 +6458,7 @@ namespace build2
//
else if (!result->null && !result->empty ())
{
- // This can only an untyped value.
+ // This can only be an untyped value.
//
// @@ Could move if result == &result_data.
//
@@ -6487,8 +6494,13 @@ namespace build2
concat_data.value += n.value;
}
- concat = true;
+ // The same little hack as in the word case ($empty+foo).
+ //
+ if (!concat) // First.
+ concat_quoted_first = true;
+
concat_quoted = quoted || concat_quoted;
+ concat = true;
}
else
{
@@ -6703,8 +6715,8 @@ namespace build2
//
// print +foo
//
- // So wepeek at one more character since what we expect next ('=') can't
- // be whitespace-separated.
+ // So we peek at one more character since what we expect next ('=')
+ // can't be whitespace-separated.
//
return c0 == '\n' || c0 == '\0' || c0 == '(' ||
(p.second &&
diff --git a/libbuild2/script/lexer.cxx b/libbuild2/script/lexer.cxx
index a18c1df..7577149 100644
--- a/libbuild2/script/lexer.cxx
+++ b/libbuild2/script/lexer.cxx
@@ -127,7 +127,7 @@ namespace build2
bool q (m == lexer_mode::here_line_double);
return token (t, string (), sep,
- (q ? quote_type::double_ : quote_type::unquoted), q,
+ (q ? quote_type::double_ : quote_type::unquoted), q, q,
ln, cn,
token_printer);
};
@@ -180,7 +180,7 @@ namespace build2
auto make_token = [&sep, &c] (type t, string v = string ())
{
return token (t, move (v), sep,
- quote_type::unquoted, false,
+ quote_type::unquoted, false, false,
c.line, c.column,
token_printer);
};
diff --git a/libbuild2/test/script/lexer.cxx b/libbuild2/test/script/lexer.cxx
index c23dea4..f9c8ac6 100644
--- a/libbuild2/test/script/lexer.cxx
+++ b/libbuild2/test/script/lexer.cxx
@@ -324,9 +324,8 @@ namespace build2
lexeme += c;
}
- return token (move (lexeme),
- false,
- quote_type::unquoted, false,
+ return token (move (lexeme), false,
+ quote_type::unquoted, false, false,
ln, cn);
}
diff --git a/libbuild2/token.hxx b/libbuild2/token.hxx
index 030ab48..faae466 100644
--- a/libbuild2/token.hxx
+++ b/libbuild2/token.hxx
@@ -119,10 +119,12 @@ namespace build2
// Quoting can be complete, where the token starts and ends with the quote
// characters and quoting is contiguous or partial where only some part(s)
- // of the token are quoted or quoting continues to the next token.
+ // of the token are quoted or quoting continues to the next token. We also
+ // keep track of whether the first character of a token is quoted.
//
quote_type qtype;
bool qcomp;
+ bool qfirst;
// Normally only used for word, but can also be used to store "modifiers"
// or some such for other tokens.
@@ -139,26 +141,35 @@ namespace build2
: token (token_type::eos, false, 0, 0, token_printer) {}
token (token_type t, bool s, uint64_t l, uint64_t c, printer_type* p)
- : token (t, string (), s, quote_type::unquoted, false, l, c, p) {}
+ : token (t, string (), s,
+ quote_type::unquoted, false, false,
+ l, c,
+ p) {}
token (token_type t, bool s,
quote_type qt,
uint64_t l, uint64_t c,
printer_type* p)
- : token (t, string (), s, qt, qt != quote_type::unquoted, l, c, p) {}
+ : token (t, string (), s,
+ qt, qt != quote_type::unquoted, qt != quote_type::unquoted,
+ l, c,
+ p) {}
token (string v, bool s,
- quote_type qt, bool qc,
+ quote_type qt, bool qc, bool qf,
uint64_t l, uint64_t c)
- : token (token_type::word, move (v), s, qt, qc, l, c, &token_printer){}
+ : token (token_type::word, move (v), s,
+ qt, qc, qf,
+ l, c,
+ &token_printer) {}
token (token_type t,
string v, bool s,
- quote_type qt, bool qc,
+ quote_type qt, bool qc, bool qf,
uint64_t l, uint64_t c,
printer_type* p)
: type (t), separated (s),
- qtype (qt), qcomp (qc),
+ qtype (qt), qcomp (qc), qfirst (qf),
value (move (v)),
line (l), column (c),
printer (p) {}