From 3ca670b7b7c71ca67d70cac9dffb2ba6120b2e36 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Thu, 15 Dec 2022 11:24:18 +0200 Subject: Improve escape sequence support Specifically: 1. In the double-quoted strings we now only do effective escaping of the special `$("\` characters plus `)` for symmetry. 2. There is now support for "escape sequence expansion" in the form $\X where \X can be any of the C/C++ simple escape sequences (\n, \t, etc) plus \0 (which in C/C++ is an octal escape sequence). For example: info "foo$\n$\tbar$\n$\tbaz" Will print: buildfile:1:1: info: foo bar baz --- libbuild2/lexer.cxx | 96 ++++++++++--- libbuild2/lexer.hxx | 19 +-- libbuild2/parser.cxx | 292 ++++++++++++++++++++++---------------- libbuild2/test/script/lexer.cxx | 8 +- libbuild2/test/script/lexer.hxx | 2 +- libbuild2/token.cxx | 21 ++- libbuild2/token.hxx | 8 +- tests/expansion/escape.testscript | 17 +++ 8 files changed, 291 insertions(+), 172 deletions(-) create mode 100644 tests/expansion/escape.testscript diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx index 9176422..d82c135 100644 --- a/libbuild2/lexer.cxx +++ b/libbuild2/lexer.cxx @@ -713,9 +713,9 @@ namespace build2 } token lexer:: - word (state st, bool sep) + word (const state& rst, bool sep) { - lexer_mode m (st.mode); + lexer_mode m (rst.mode); xchar c (peek ()); assert (!eos (c)); @@ -746,22 +746,66 @@ namespace build2 lexeme += c; }; - for (; !eos (c); c = peek ()) + const state* st (&rst); + for (bool first (true); !eos (c); first = false, c = peek ()) { // First handle escape sequences. // if (c == '\\') { - // In the variable mode we treat the beginning of the escape sequence - // as a separator (think \"$foo\"). + // In the variable mode we treat immediate `\` as the escape sequence + // literal and any following as a separator (think \"$foo\"). 
// if (m == lexer_mode::variable) - break; + { + if (!first) + break; + + get (); + c = get (); + + if (eos (c)) + fail (c) << "unterminated escape sequence"; + + // For now we only support all the simple C/C++ escape sequences + // plus \0 (which in C/C++ is an octal escape sequence). + // + // In the future we may decide to support more elaborate sequences + // such as \xNN, \uNNNN, etc. + // + // Note: we return it in the literal form instead of translating for + // easier printing. + // + switch (c) + { + case '\'': + case '"': + case '?': + case '\\': + case '0': + case 'a': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + case 'v': lexeme = c; break; + default: + fail (c) << "unknown escape sequence \\" << c; + } + + state_.pop (); + return token (type::escape, + move (lexeme), + sep, + qtype, qcomp, qfirst, + ln, cn); + } get (); xchar p (peek ()); - const char* esc (st.escapes); + const char* esc (st->escapes); if (esc == nullptr || (*esc != '\0' && !eos (p) && strchr (esc, p) != nullptr)) @@ -777,7 +821,7 @@ namespace build2 continue; } else - unget (c); // Treat as a normal character. + unget (c); // Fall through to treat as a normal character. } bool done (false); @@ -806,8 +850,8 @@ namespace build2 get (); state_.pop (); - st = state_.top (); - m = st.mode; + st = &state_.top (); + m = st->mode; continue; } } @@ -816,19 +860,17 @@ namespace build2 // else if (m == lexer_mode::variable) { - bool first (lexeme.empty ()); - // Handle special variable names, if any. // - if (first && - st.data != 0 && - strchr (reinterpret_cast (st.data), c) != nullptr) + if (first && + st->data != 0 && + strchr (reinterpret_cast (st->data), c) != nullptr) { get (); lexeme += c; done = true; } - else if (c != '_' && !(first ? alpha (c) : alnum (c))) + else if (c != '_' && !(lexeme.empty () ? alpha (c) : alnum (c))) { if (c != '.') done = true; @@ -848,17 +890,17 @@ namespace build2 { // First check if it's a pair separator. 
// - if (c == st.sep_pair) + if (c == st->sep_pair) done = true; else { // Then see if this character or character sequence is a separator. // - for (const char* p (strchr (st.sep_first, c)); + for (const char* p (strchr (st->sep_first, c)); p != nullptr; p = done ? nullptr : strchr (p + 1, c)) { - char s (st.sep_second[p - st.sep_first]); + char s (st->sep_second[p - st->sep_first]); // See if it has a second. // @@ -876,13 +918,19 @@ namespace build2 // Handle single and double quotes if enabled for this mode and unless // they were considered separators. // - if (st.quotes && !done) + if (st->quotes && !done) { auto quoted_mode = [this] (lexer_mode m) { + // In the double-quoted mode we only do effective escaping of the + // special `$("\` characters plus `)` for symmetry. Nothing can be + // escaped in single-quoted. + // + const char* esc (m == lexer_mode::double_quoted ? "$()\"\\" : ""); + state_.push (state { m, 0, nullopt, false, false, '\0', false, true, true, - state_.top ().escapes, nullptr, nullptr}); + esc, nullptr, nullptr}); }; switch (c) @@ -933,8 +981,8 @@ namespace build2 quoted_mode (lexer_mode::double_quoted); - st = state_.top (); - m = st.mode; + st = &state_.top (); + m = st->mode; switch (qtype) { @@ -1090,6 +1138,8 @@ namespace build2 } case '\\': { + // See if this is line continuation. + // get (); if (peek () == '\n') diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx index 4371206..e913829 100644 --- a/libbuild2/lexer.hxx +++ b/libbuild2/lexer.hxx @@ -26,14 +26,15 @@ namespace build2 // mode we don't treat certain characters (e.g., `+`, `=`) as special so // that we can use them in the variable values, e.g., `foo = g++`. In // contrast, in the variable mode, we restrict certain character (e.g., `/`) - // from appearing in the name. The values mode is like value but recogizes - // `,` as special (used in contexts where we need to list multiple - // values). 
The attributes/attribute_value modes are like values where each - // value is potentially a variable assignment; they don't treat `{` and `}` - // as special (so we cannot have name groups in attributes) as well as - // recognizes `=` and `]`. The subscript mode is like value but doesn't - // treat `{` and `}` as special and recognizes `]`. The eval mode is used in - // the evaluation context. + // from appearing in the name. Additionally, in the variable mode we + // recognize leading `\` as the beginning of the escape sequence ($\n). The + // values mode is like value but recognizes `,` as special (used in contexts + // where we need to list multiple values). The attributes/attribute_value + // modes are like values where each value is potentially a variable + // assignment; they don't treat `{` and `}` as special (so we cannot have + // name groups in attributes) as well as recognizes `=` and `]`. The + // subscript mode is like value but doesn't treat `{` and `}` as special and + // recognizes `]`. The eval mode is used in the evaluation context. // // A number of modes are "derived" from the value/values mode by recognizing // a few extra characters: @@ -262,7 +263,7 @@ namespace build2 // been "expired" from the top). // virtual token - word (state current, bool separated); + word (const state& current, bool separated); // Return true in first if we have seen any spaces. Skipped empty lines // don't count. In other words, we are only interested in spaces that are diff --git a/libbuild2/parser.cxx b/libbuild2/parser.cxx index b118cee..2507a02 100644 --- a/libbuild2/parser.cxx +++ b/libbuild2/parser.cxx @@ -7357,11 +7357,15 @@ namespace build2 // token is a paren or a word, we turn it on and switch to the eval // mode if what we get next is a paren. // - // Also sniff out the special variables string from mode data for - // the ad hoc $() handling below. 
- // mode (lexer_mode::variable); + // Sniff out the special variables string from mode data and use + // that to recognize special variables in the ad hoc $() handling + // below. + // + // Note: must be done before calling next() which may expire the + // mode. + // auto special = [s = reinterpret_cast (mode_data ())] (const token& t) -> char { @@ -7400,164 +7404,202 @@ namespace build2 next (t, tt); loc = get_location (t); - names qual; - string name; - - if (t.separated) - ; // Leave the name empty to fail below. - else if (tt == type::word) + if (tt == type::escape) { - name = move (t.value); + // For now we only support all the simple C/C++ escape sequences + // plus \0 (which in C/C++ is an octal escape sequence). See the + // lexer part for details. + // + // Note: cannot be subscripted. + // + if (!pre_parse_) + { + string s; + switch (char c = t.value[0]) + { + case '\'': + case '"': + case '?': + case '\\': s = c; break; + case '0': s = '\0'; break; + case 'a': s = '\a'; break; + case 'b': s = '\b'; break; + case 'f': s = '\f'; break; + case 'n': s = '\n'; break; + case 'r': s = '\r'; break; + case 't': s = '\t'; break; + case 'v': s = '\v'; break; + default: + assert (false); + } + + result_data = name (move (s)); + what = "escape sequence expansion"; + } + + tt = peek (); } - else if (tt == type::lparen) + else { - expire_mode (); - mode (lexer_mode::eval, '@'); - next_with_attributes (t, tt); + names qual; + string name; - // Handle the $(x) case ad hoc. We do it this way in order to get - // the variable name even during pre-parse. It should also be - // faster. - // - char c; - if ((tt == type::word - ? path_traits::rfind_separator (t.value) == string::npos - : (c = special (t))) && - peek () == type::rparen) + if (t.separated) + ; // Leave the name empty to fail below. + else if (tt == type::word) { - name = (tt == type::word ? move (t.value) : string (1, c)); - next (t, tt); // Get `)`. 
+ name = move (t.value); } - else + else if (tt == type::lparen) { - using name_type = build2::name; - - values vs (parse_eval (t, tt, pmode)); + expire_mode (); + mode (lexer_mode::eval, '@'); + next_with_attributes (t, tt); - if (!pre_parse_) + // Handle the $(x) case ad hoc. We do it this way in order to + // get the variable name even during pre-parse. It should also + // be faster. + // + char c; + if ((tt == type::word + ? path_traits::rfind_separator (t.value) == string::npos + : (c = special (t))) && + peek () == type::rparen) { - if (vs.size () != 1) - fail (loc) << "expected single variable/function name"; + name = (tt == type::word ? move (t.value) : string (1, c)); + next (t, tt); // Get `)`. + } + else + { + using name_type = build2::name; - value& v (vs[0]); + values vs (parse_eval (t, tt, pmode)); - if (!v) - fail (loc) << "null variable/function name"; + if (!pre_parse_) + { + if (vs.size () != 1) + fail (loc) << "expected single variable/function name"; - names storage; - vector_view ns ( - reverse (v, storage, true /* reduce */)); // Movable. - size_t n (ns.size ()); + value& v (vs[0]); - // We cannot handle scope-qualification in the eval context as - // we do for target-qualification (see eval-qual) since then - // we would be treating all paths as qualified variables. So - // we have to do it here. - // - if (n >= 2 && ns[0].pair == ':') // $(foo: x) - { - // Note: name is first (see eval for details). + if (!v) + fail (loc) << "null variable/function name"; + + names storage; + vector_view ns ( + reverse (v, storage, true /* reduce */)); // Movable. + size_t n (ns.size ()); + + // We cannot handle scope-qualification in the eval context + // as we do for target-qualification (see eval-qual) since + // then we would be treating all paths as qualified + // variables. So we have to do it here. // - qual.push_back (move (ns[1])); + if (n >= 2 && ns[0].pair == ':') // $(foo: x) + { + // Note: name is first (see eval for details). 
+ // + qual.push_back (move (ns[1])); - if (qual.back ().empty ()) - fail (loc) << "empty variable/function qualification"; + if (qual.back ().empty ()) + fail (loc) << "empty variable/function qualification"; - if (n > 2) - qual.push_back (move (ns[2])); + if (n > 2) + qual.push_back (move (ns[2])); - // Move name to the last position (see below). - // - swap (ns[0], ns[n - 1]); - } - else if (n == 2 && ns[0].directory ()) // $(foo/ x) - { - qual.push_back (move (ns[0])); - qual.back ().pair = '/'; - } - else if (n > 1) - fail (loc) << "expected variable/function name instead of '" - << ns << "'"; + // Move name to the last position (see below). + // + swap (ns[0], ns[n - 1]); + } + else if (n == 2 && ns[0].directory ()) // $(foo/ x) + { + qual.push_back (move (ns[0])); + qual.back ().pair = '/'; + } + else if (n > 1) + fail (loc) << "expected variable/function name instead of '" + << ns << "'"; - // Note: checked for empty below. - // - if (!ns[n - 1].simple ()) - fail (loc) << "expected variable/function name instead of '" - << ns[n - 1] << "'"; + // Note: checked for empty below. + // + if (!ns[n - 1].simple ()) + fail (loc) << "expected variable/function name instead of '" + << ns[n - 1] << "'"; - size_t p; - if (n == 1 && // $(foo/x) - (p = path_traits::rfind_separator (ns[0].value)) != + size_t p; + if (n == 1 && // $(foo/x) + (p = path_traits::rfind_separator (ns[0].value)) != string::npos) - { - // Note that p cannot point to the last character since then - // it would have been a directory, not a simple name. - // - string& s (ns[0].value); + { + // Note that p cannot point to the last character since + // then it would have been a directory, not a simple name. 
+ // + string& s (ns[0].value); - name = string (s, p + 1); - s.resize (p + 1); - qual.push_back (name_type (dir_path (move (s)))); - qual.back ().pair = '/'; + name = string (s, p + 1); + s.resize (p + 1); + qual.push_back (name_type (dir_path (move (s)))); + qual.back ().pair = '/'; + } + else + name = move (ns[n - 1].value); } - else - name = move (ns[n - 1].value); } } - } - else - fail (t) << "expected variable/function name instead of " << t; - - if (!pre_parse_ && name.empty ()) - fail (loc) << "empty variable/function name"; - - // Figure out whether this is a variable expansion with potential - // subscript or a function call. - // - if (sub) enable_subscript (); - tt = peek (); + else + fail (t) << "expected variable/function name instead of " << t; - // Note that we require function call opening paren to be - // unseparated; consider: $x ($x == 'foo' ? 'FOO' : 'BAR'). - // - if (tt == type::lparen && !peeked ().separated) - { - // Function call. - // - next (t, tt); // Get '('. - mode (lexer_mode::eval, '@'); - next_with_attributes (t, tt); + if (!pre_parse_ && name.empty ()) + fail (loc) << "empty variable/function name"; - // @@ Should we use (target/scope) qualification (of name) as the - // context in which to call the function? Hm, interesting... + // Figure out whether this is a variable expansion with potential + // subscript or a function call. // - values args (parse_eval (t, tt, pmode)); - if (sub) enable_subscript (); tt = peek (); - // Note that we "move" args to call(). + // Note that we require function call opening paren to be + // unseparated; consider: $x ($x == 'foo' ? 'FOO' : 'BAR'). // - if (!pre_parse_) + if (tt == type::lparen && !peeked ().separated) { - result_data = ctx->functions.call (scope_, name, args, loc); - what = "function call"; + // Function call. + // + next (t, tt); // Get '('. 
+ mode (lexer_mode::eval, '@'); + next_with_attributes (t, tt); + + // @@ Should we use (target/scope) qualification (of name) as + // the context in which to call the function? Hm, interesting... + // + values args (parse_eval (t, tt, pmode)); + + if (sub) enable_subscript (); + tt = peek (); + + // Note that we "move" args to call(). + // + if (!pre_parse_) + { + result_data = ctx->functions.call (scope_, name, args, loc); + what = "function call"; + } + else + lookup_function (move (name), loc); } else - lookup_function (move (name), loc); - } - else - { - // Variable expansion. - // - lookup l (lookup_variable (move (qual), move (name), loc)); - - if (!pre_parse_) { - if (l.defined ()) - result = l.value; // Otherwise leave as NULL result_data. + // Variable expansion. + // + lookup l (lookup_variable (move (qual), move (name), loc)); - what = "variable expansion"; + if (!pre_parse_) + { + if (l.defined ()) + result = l.value; // Otherwise leave as NULL result_data. + + what = "variable expansion"; + } } } } diff --git a/libbuild2/test/script/lexer.cxx b/libbuild2/test/script/lexer.cxx index b470d25..aec91fc 100644 --- a/libbuild2/test/script/lexer.cxx +++ b/libbuild2/test/script/lexer.cxx @@ -339,15 +339,17 @@ namespace build2 } token lexer:: - word (state st, bool sep) + word (const state& st, bool sep) { - lexer_mode m (st.mode); + lexer_mode m (st.mode); // Save. 
token r (base_lexer::word (st, sep)); if (m == lexer_mode::variable) { - if (r.value.size () == 1 && digit (r.value[0])) // $N + if (r.type == type::word && + r.value.size () == 1 && + digit (r.value[0])) // $N { xchar c (peek ()); diff --git a/libbuild2/test/script/lexer.hxx b/libbuild2/test/script/lexer.hxx index 993a9db..39b950a 100644 --- a/libbuild2/test/script/lexer.hxx +++ b/libbuild2/test/script/lexer.hxx @@ -77,7 +77,7 @@ namespace build2 next_description (); virtual token - word (state, bool) override; + word (const state&, bool) override; }; } } diff --git a/libbuild2/token.cxx b/libbuild2/token.cxx index ab14388..cc102cc 100644 --- a/libbuild2/token.cxx +++ b/libbuild2/token.cxx @@ -29,21 +29,30 @@ namespace build2 os << (r ? "\n" : ""); break; } - case token_type::pair_separator: + case token_type::word: { if (r) - os << t.value[0]; + os << t.value; else - os << ""; + os << '\'' << t.value << '\''; break; } - case token_type::word: + case token_type::escape: { if (r) - os << t.value; + os << '\\' << t.value; else - os << '\'' << t.value << '\''; + os << ""; + + break; + } + case token_type::pair_separator: + { + if (r) + os << t.value[0]; + else + os << ""; break; } diff --git a/libbuild2/token.hxx b/libbuild2/token.hxx index fca888c..f9ede65 100644 --- a/libbuild2/token.hxx +++ b/libbuild2/token.hxx @@ -30,6 +30,7 @@ namespace build2 eos, newline, word, + escape, // token::value is <...> in $\<...> pair_separator, // token::value[0] is the pair separator char. 
colon, // : @@ -159,16 +160,13 @@ namespace build2 token (string v, bool s, quote_type qt, bool qc, bool qf, uint64_t l, uint64_t c) - : token (token_type::word, move (v), s, - qt, qc, qf, - l, c, - &token_printer) {} + : token (token_type::word, move (v), s, qt, qc, qf, l, c) {} token (token_type t, string v, bool s, quote_type qt, bool qc, bool qf, uint64_t l, uint64_t c, - printer_type* p) + printer_type* p = &token_printer) : type (t), separated (s), qtype (qt), qcomp (qc), qfirst (qf), value (move (v)), diff --git a/tests/expansion/escape.testscript b/tests/expansion/escape.testscript new file mode 100644 index 0000000..1140032 --- /dev/null +++ b/tests/expansion/escape.testscript @@ -0,0 +1,17 @@ +# file : tests/expansion/escape.testscript +# license : MIT; see accompanying LICENSE file + +# Test escape sequence expansion. + +.include ../common.testscript + +: simple +: +$* <>EOO +print "foo$\nbar" +print $size([string] "foo$\0bar") +EOI +foo +bar +7 +EOO -- cgit v1.1