From ba628f6f90e7412245dcebdecd9cfa7e4bbf989c Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Mon, 25 May 2020 12:12:13 +0200 Subject: Add support for value subscript after expansions Value subscript is only recognized in evaluation contexts (due to ambiguity with wildcard patterns; consider: $x[123].txt) and should be unseparated from the previous token. For example: x = ($y[1]) x = (($f ? $y : $z)[1]) x = ($identity($x)[$z]) --- libbuild2/build/script/lexer.cxx | 15 ++-- libbuild2/lexer.cxx | 59 +++++++++----- libbuild2/lexer.hxx | 38 +++++---- libbuild2/parser.cxx | 145 ++++++++++++++++++++++++++++------- libbuild2/parser.hxx | 9 ++- libbuild2/script/lexer.cxx | 5 +- libbuild2/test/script/lexer.cxx | 15 ++-- libbuild2/variable.cxx | 8 +- tests/expansion/concat.testscript | 2 +- tests/expansion/subscript.testscript | 97 +++++++++++++++++++++++ 10 files changed, 308 insertions(+), 85 deletions(-) create mode 100644 tests/expansion/subscript.testscript diff --git a/libbuild2/build/script/lexer.cxx b/libbuild2/build/script/lexer.cxx index 7b8bdd4..a58f794 100644 --- a/libbuild2/build/script/lexer.cxx +++ b/libbuild2/build/script/lexer.cxx @@ -27,8 +27,6 @@ namespace build2 optional esc, uintptr_t data) { - bool a (false); // attributes - const char* s1 (nullptr); const char* s2 (nullptr); @@ -88,7 +86,8 @@ namespace build2 } assert (ps == '\0'); - state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2}); + state_.push ( + state {m, data, nullopt, false, false, ps, s, n, q, *esc, s1, s2}); } token lexer:: @@ -129,16 +128,16 @@ namespace build2 return token (t, sep, ln, cn, token_printer); }; - // Handle attributes (do it first to make sure the flag is cleared - // regardless of what we return). + // Handle `[` (do it first to make sure the flag is cleared regardless + // of what we return). 
// - if (st.attributes) + if (st.lsbrace) { assert (m == lexer_mode::variable_line); - state_.top ().attributes = false; + state_.top ().lsbrace = false; // Note: st is a copy. - if (c == '[') + if (c == '[' && (!st.lsbrace_unsep || !sep)) return make_token (type::lsbrace); } diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx index 6d3504c..7149d45 100644 --- a/libbuild2/lexer.cxx +++ b/libbuild2/lexer.cxx @@ -39,7 +39,7 @@ namespace build2 void lexer:: mode (lexer_mode m, char ps, optional esc, uintptr_t data) { - bool a (false); // attributes + bool lsb (false); // Enable `[` recognition. const char* s1 (nullptr); const char* s2 (nullptr); @@ -62,9 +62,9 @@ namespace build2 // Note: `%` is only recognized at the beginning of the line so it // should not be included here. // - a = true; s1 = ":<>=+? $(){}#\t\n"; s2 = " == "; + lsb = true; break; } case lexer_mode::value: @@ -103,6 +103,12 @@ namespace build2 s2 = " "; break; } + case lexer_mode::subscript: + { + s1 = " $()]#\t\n"; + s2 = " "; + break; + } case lexer_mode::eval: { s1 = ":<>=!&|?, $(){}#\t\n"; @@ -147,7 +153,8 @@ namespace build2 default: assert (false); // Unhandled custom mode. } - state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2}); + state_.push ( + state {m, data, nullopt, lsb, false, ps, s, n, q, *esc, s1, s2}); } token lexer:: @@ -168,6 +175,7 @@ namespace build2 case lexer_mode::case_patterns: case lexer_mode::attributes: case lexer_mode::attribute_value: + case lexer_mode::subscript: case lexer_mode::variable: case lexer_mode::buildspec: break; case lexer_mode::eval: return next_eval (); @@ -190,14 +198,14 @@ namespace build2 ln, cn, token_printer); }; - // Handle attributes (do it first to make sure the flag is cleared - // regardless of what we return). + // Handle `[` (do it first to make sure the flag is cleared regardless of + // what we return). 
// - if (st.attributes) + if (st.lsbrace) { - st.attributes = false; + st.lsbrace = false; - if (c == '[') + if (c == '[' && (!st.lsbrace_unsep || !sep)) return make_token (type::lsbrace); } @@ -226,11 +234,15 @@ namespace build2 m == lexer_mode::case_patterns) state_.pop (); - // Re-enable attributes in the normal mode (should never be needed in - // cmdvar). + // Re-enable `[` recognition (attributes) in the normal mode (should + // never be needed in cmdvar). // - if (state_.top ().mode == lexer_mode::normal) - state_.top ().attributes = true; + state& st (state_.top ()); + if (st.mode == lexer_mode::normal) + { + st.lsbrace = true; + st.lsbrace_unsep = false; + } sep = true; // Treat newline as always separated. return make_token (type::newline); @@ -274,9 +286,12 @@ namespace build2 } } - // The following characters are special in all modes except attributes. + // The following characters are special in all modes except attributes + // and subscript. // - if (m != lexer_mode::attributes && m != lexer_mode::attribute_value) + if (m != lexer_mode::attributes && + m != lexer_mode::attribute_value && + m != lexer_mode::subscript) { switch (c) { @@ -295,13 +310,15 @@ namespace build2 } } - if (m == lexer_mode::attributes || m == lexer_mode::attribute_value) + if (m == lexer_mode::attributes || + m == lexer_mode::attribute_value || + m == lexer_mode::subscript) { switch (c) { case ']': { - state_.pop (); // Expire the attributes mode after closing `]`. + state_.pop (); // Expire the mode after closing `]`. return make_token (type::rsbrace); } } @@ -425,14 +442,14 @@ namespace build2 ln, cn, token_printer); }; - // Handle attributes (do it first to make sure the flag is cleared - // regardless of what we return). + // Handle `[` (do it first to make sure the flag is cleared regardless of + // what we return). 
// - if (st.attributes) + if (st.lsbrace) { - st.attributes = false; + st.lsbrace = false; - if (c == '[') + if (c == '[' && (!st.lsbrace_unsep || !sep)) return make_token (type::lsbrace); } diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx index 749668e..d5f1c99 100644 --- a/libbuild2/lexer.hxx +++ b/libbuild2/lexer.hxx @@ -31,7 +31,9 @@ namespace build2 // values). The attributes/attribute_value modes are like values where each // value is potentially a variable assignment; they don't treat `{` and `}` // as special (so we cannot have name groups in attributes) as well as - // recognizes `=` and `]`. The eval mode is used in the evaluation context. + // recognizes `=` and `]`. The subscript mode is like value but doesn't + // treat `{` and `}` as special and recognizes `]`. The eval mode is used in + // the evaluation context. // // A number of modes are "derived" from the value/values mode by recognizing // a few extra characters: @@ -55,10 +57,10 @@ namespace build2 // mode data. // // The alternative modes must be set manually. The value/values and derived - // modes automatically expires after the end of the line. The attribute mode - // expires after the closing `]`. The variable mode expires after the word - // token. The eval mode expires after the closing `)`. And the foreign mode - // expires after the closing braces. + // modes automatically expires after the end of the line. The attribute and + // subscript modes expires after the closing `]`. The variable mode expires + // after the word token. The eval mode expires after the closing `)`. And + // the foreign mode expires after the closing braces. // // Note that normally it is only safe to switch mode when the current token // is not quoted (or, more generally, when you are not in the double-quoted @@ -66,13 +68,13 @@ namespace build2 // variable name mode). Failed that your mode (which now will be the top of // the mode stack) will prevent proper recognition of the closing quote. 
// - // Finally, attributes recognition (the `[` token) cuts across most of the - // modes and is handled with a flag. In the normal mode it is automatically - // set at the beginning and after each newline. In all other modes it must - // be explicitly set at points where attributes are recognized. In all the - // cases it is automatically reset after lexing the next token (whether `[` - // or not). - // + // The `[` token is used for attributes (where it cuts across most of the + // modes) as well as for value subscript (where it is only recognized after + // expansions). It is handled with a flag. In the normal mode it is + // automatically set at the beginning and after each newline. In all other + // modes it must be explicitly set at points where attribute/subscript is + // recognized. In all the cases it is automatically reset after lexing the + // next token (whether `[` or not). // Extendable/inheritable enum-like class. // @@ -91,6 +93,7 @@ namespace build2 switch_expressions, attributes, attribute_value, + subscript, eval, single_quoted, double_quoted, @@ -134,10 +137,14 @@ namespace build2 optional escapes = nullopt, uintptr_t data = 0); - // Enable attributes recognition for the next token. + // Enable `[` recognition for the next token. // void - enable_attributes () {state_.top ().attributes = true;} + enable_lsbrace (bool unsep = false) + { + state_.top ().lsbrace = true; + state_.top ().lsbrace_unsep = unsep; + } // Expire the current mode early. // @@ -177,7 +184,8 @@ namespace build2 uintptr_t data; optional hold; - bool attributes; + bool lsbrace; // Recognize `[`. + bool lsbrace_unsep; // Recognize it only if unseparated. char sep_pair; bool sep_space; // Are whitespaces separators (see skip_spaces())? 
diff --git a/libbuild2/parser.cxx b/libbuild2/parser.cxx index 94f597d..c359ce0 100644 --- a/libbuild2/parser.cxx +++ b/libbuild2/parser.cxx @@ -4204,8 +4204,9 @@ namespace build2 tt == type::dollar || tt == type::lparen || tt == type::lcbrace)) - fail (t) << "whitespace required after attributes" << - info << "use the '\\[' escape sequence if this is a wildcard pattern"; + fail (t) << "whitespace required after attributes" << + info (l) << "use the '\\[' escape sequence if this is a wildcard " + << "pattern"; return make_pair (has, l); } @@ -5520,7 +5521,7 @@ namespace build2 continue; } - // Variable expansion, function call, or eval context. + // Expansions: variable expansion, function call, or eval context. // if (tt == type::dollar || tt == type::lparen) { @@ -5533,6 +5534,11 @@ namespace build2 const char* what; // Variable, function, or evaluation context. bool quoted (t.qtype != quote_type::unquoted); + // We only recognize value subscripts inside eval contexts due to the + // ambiguity with wildcard patterns (consider: $x[123].txt). + // + bool sub (mode () == lexer_mode::eval); + if (tt == type::dollar) { // Switch to the variable name mode. We want to use this mode for @@ -5625,9 +5631,10 @@ namespace build2 if (!pre_parse_ && name.empty ()) fail (loc) << "empty variable/function name"; - // Figure out whether this is a variable expansion or a function - // call. + // Figure out whether this is a variable expansion with potential + // subscript or a function call. // + if (sub) enable_subscript (); tt = peek (); // Note that we require function call opening paren to be @@ -5645,15 +5652,17 @@ namespace build2 // context in which to call the function? Hm, interesting... // values args (parse_eval (t, tt, pmode)); - tt = peek (); - if (pre_parse_) - continue; // As if empty result. + if (sub) enable_subscript (); + tt = peek (); // Note that we "move" args to call(). 
// - result_data = ctx.functions.call (scope_, name, args, loc); - what = "function call"; + if (!pre_parse_) + { + result_data = ctx.functions.call (scope_, name, args, loc); + what = "function call"; + } } else { @@ -5661,42 +5670,124 @@ namespace build2 // lookup l (lookup_variable (move (qual), move (name), loc)); - if (pre_parse_) - continue; // As if empty value. - - if (l.defined ()) - result = l.value; // Otherwise leave as NULL result_data. + if (!pre_parse_) + { + if (l.defined ()) + result = l.value; // Otherwise leave as NULL result_data. - what = "variable expansion"; + what = "variable expansion"; + } } } else { - // Context evaluation. + // Evaluation context. // loc = get_location (t); mode (lexer_mode::eval, '@'); next_with_attributes (t, tt); values vs (parse_eval (t, tt, pmode)); + + if (sub) enable_subscript (); tt = peek (); - if (pre_parse_) - continue; // As if empty result. + if (!pre_parse_) + { + switch (vs.size ()) + { + case 0: result_data = value (names ()); break; + case 1: result_data = move (vs[0]); break; + default: fail (loc) << "expected single value"; + } - switch (vs.size ()) + what = "context evaluation"; + } + } + + // Handle value subscript. + // + if (tt == type::lsbrace) + { + location bl (get_location (t)); + next (t, tt); // `[` + mode (lexer_mode::subscript, '\0' /* pair */); + next (t, tt); + + location l (get_location (t)); + value v ( + tt != type::rsbrace + ? parse_value (t, tt, pattern_mode::ignore, "value subscript") + : value (names ())); + + if (tt != type::rsbrace) { - case 0: result_data = value (names ()); break; - case 1: result_data = move (vs[0]); break; - default: fail (loc) << "expected single value"; + // Note: wildcard pattern should have `]` as well so no escaping + // suggestion. 
+ // + fail (t) << "expected ']' instead of " << t; } - what = "context evaluation"; + if (!pre_parse_) + { + uint64_t j; + try + { + j = convert (move (v)); + } + catch (const invalid_argument& e) + { + fail (l) << "invalid value subscript: " << e << + info (bl) << "use the '\\[' escape sequence if this is a " + << "wildcard pattern"; + } + + // Similar to expanding an undefined variable, we return NULL if + // the index is out of bounds. + // + // Note that result may or may not point to result_data. + // + if (result->type == nullptr) + { + const names& ns (result->as ()); + + // Pair-aware subscript. + // + names r; + for (auto i (ns.begin ()); i != ns.end (); ++i, --j) + { + if (j == 0) + { + r.push_back (*i); + if (i->pair) + r.push_back (*++i); + break; + } + + if (i->pair) + ++i; + } + + result_data = r.empty () ? value () : value (move (r)); + } + else + { + // @@ TODO: we would want to return a value with element type. + // + //result_data = ... + fail (l) << "typed value subscript not yet supported" << + info (bl) << "use the '\\[' escape sequence if this is a " + << "wildcard pattern"; + } + + result = &result_data; + } + + tt = peek (); } - // We never end up here during pre-parsing. - // - assert (!pre_parse_); + if (pre_parse_) + continue; // As if empty result. // Should we accumulate? 
If the buffer is not empty, then we continue // accumulating (the case where we are separated should have been diff --git a/libbuild2/parser.hxx b/libbuild2/parser.hxx index bc01e08..2f67c31 100644 --- a/libbuild2/parser.hxx +++ b/libbuild2/parser.hxx @@ -619,7 +619,14 @@ namespace build2 enable_attributes () { if (replay_ != replay::play) - lexer_->enable_attributes (); + lexer_->enable_lsbrace (); + } + + void + enable_subscript () + { + if (replay_ != replay::play) + lexer_->enable_lsbrace (true /* unseparated */); } void diff --git a/libbuild2/script/lexer.cxx b/libbuild2/script/lexer.cxx index d78e999..ce409c1 100644 --- a/libbuild2/script/lexer.cxx +++ b/libbuild2/script/lexer.cxx @@ -16,8 +16,6 @@ namespace build2 void lexer:: mode (base_mode m, char ps, optional esc, uintptr_t data) { - bool a (false); // attributes - const char* s1 (nullptr); const char* s2 (nullptr); @@ -86,7 +84,8 @@ namespace build2 } assert (ps == '\0'); - state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2}); + state_.push ( + state {m, data, nullopt, false, false, ps, s, n, q, *esc, s1, s2}); } token lexer:: diff --git a/libbuild2/test/script/lexer.cxx b/libbuild2/test/script/lexer.cxx index a94109b..e895d4a 100644 --- a/libbuild2/test/script/lexer.cxx +++ b/libbuild2/test/script/lexer.cxx @@ -26,8 +26,6 @@ namespace build2 void lexer:: mode (base_mode m, char ps, optional esc, uintptr_t data) { - bool a (false); // attributes - const char* s1 (nullptr); const char* s2 (nullptr); @@ -109,7 +107,8 @@ namespace build2 } assert (ps == '\0'); - state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2}); + state_.push ( + state {m, data, nullopt, false, false, ps, s, n, q, *esc, s1, s2}); } token lexer:: @@ -153,16 +152,16 @@ namespace build2 return token (t, sep, ln, cn, token_printer); }; - // Handle attributes (do it first to make sure the flag is cleared - // regardless of what we return). 
+ // Handle `[` (do it first to make sure the flag is cleared regardless + // of what we return). // - if (st.attributes) + if (st.lsbrace) { assert (m == lexer_mode::variable_line); - state_.top ().attributes = false; + state_.top ().lsbrace = false; // Note: st is a copy. - if (c == '[') + if (c == '[' && (!st.lsbrace_unsep || !sep)) return make_token (type::lsbrace); } diff --git a/libbuild2/variable.cxx b/libbuild2/variable.cxx index d16fcb4..206eb54 100644 --- a/libbuild2/variable.cxx +++ b/libbuild2/variable.cxx @@ -491,7 +491,13 @@ namespace build2 { // May throw invalid_argument or out_of_range. // - return stoull (n.value); + size_t i; + uint64_t r (stoull (n.value, &i)); + + if (i == n.value.size ()) + return r; + + // Fall through. } catch (const std::exception&) { diff --git a/tests/expansion/concat.testscript b/tests/expansion/concat.testscript index 181a738..bec48ce 100644 --- a/tests/expansion/concat.testscript +++ b/tests/expansion/concat.testscript @@ -1,4 +1,4 @@ -# file : tests/expansion/type.testscript +# file : tests/expansion/concat.testscript # license : MIT; see accompanying LICENSE file # Test concatenated expansion. diff --git a/tests/expansion/subscript.testscript b/tests/expansion/subscript.testscript new file mode 100644 index 0000000..0c06394 --- /dev/null +++ b/tests/expansion/subscript.testscript @@ -0,0 +1,97 @@ +# file : tests/expansion/subscript.testscript +# license : MIT; see accompanying LICENSE file + +# Test subscript expansion. 
+ +.include ../common.testscript + +: basics +: +$* <>EOO +x = zero one two three +y = zero@one two@three +i = 2 + +print ($x[1]) +print ($x[4]) +print (($x)[1]) +print (($x)[4]) +print ($identity($x)[1]) +print ($identity($x)[4]) + +print + +print ($y[1]) +print ($y[4]) +print (($y)[1]) +print (($y)[4]) +print ($identity($y)[1]) +print ($identity($y)[4]) + +print + +print ($x[$i]) + +EOI +one +[null] +one +[null] +one +[null] + +two@three +[null] +two@three +[null] +two@three +[null] + +two +EOO + +: unseparated +: +$* <>EOO +x = zero one +print ($x [1]) +EOI +zero one +EOO + +: escape +: +$* <>EOO +x = zero +print ($x\[abc]) +EOI +EOO + +: preparse +: +$* <>EOO +x = zero one two three +print (true ? $x[1] : $x[]) +EOI +one +EOO + +: missing-rsbrace +: +$* <'print ($x[1)' 2>>EOE != 0 +:1:12: error: expected ']' instead of ')' +EOE + +: invalid-subscript +: +$* <'print ($x[1a])' 2>>EOE != 0 +:1:11: error: invalid value subscript: invalid uint64 value: '1a' + :1:9: info: use the '\[' escape sequence if this is a wildcard pattern +EOE + +: empty-subscript +: +$* <'print ($x[])' 2>>EOE != 0 +:1:11: error: invalid value subscript: invalid uint64 value: empty + :1:9: info: use the '\[' escape sequence if this is a wildcard pattern +EOE -- cgit v1.1