From fce9782a330e8f701a8df0b5200e5b78e97ec4b5 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Wed, 6 May 2020 06:58:34 +0200 Subject: Handle multi-curly-brace tokens in lexer --- libbuild2/lexer+foreign.test.testscript | 96 +++++++++++++++++++++++++ libbuild2/lexer+normal.test.testscript | 18 +++++ libbuild2/lexer.cxx | 124 +++++++++++++++++++++++++++++++- libbuild2/lexer.hxx | 25 +++++-- libbuild2/lexer.test.cxx | 14 +++- libbuild2/test/script/lexer.cxx | 2 +- libbuild2/token.cxx | 69 +++++++++--------- libbuild2/token.hxx | 3 + 8 files changed, 306 insertions(+), 45 deletions(-) create mode 100644 libbuild2/lexer+foreign.test.testscript diff --git a/libbuild2/lexer+foreign.test.testscript b/libbuild2/lexer+foreign.test.testscript new file mode 100644 index 0000000..94c83c1 --- /dev/null +++ b/libbuild2/lexer+foreign.test.testscript @@ -0,0 +1,96 @@ +# file : libbuild2/lexer+foreign.test.testscript +# license : MIT; see accompanying LICENSE file + +test.arguments = foreign=2 + +: basics +: +$* <>EOO +echo foo +}} +EOI +'echo foo +' +}} + +EOO + +: empty +: +$* <>EOO +}} +EOI +'' +}} + +EOO + +: braces +: +$* <>EOO +} +}}} +{{}} +}} } +}} +EOI +'} +}}} +{{}} +}} } +' +}} + +EOO + +: whitespaces +: +$* <' }} ' >>EOO # Note: there are TABs. +'' +}} + +EOO + +: comment +: +$* <'}} # comment' >>EOO +'' +}} + +EOO + +: eos +: +$* <:'}}' >>EOO +'' +}} +EOO + +: missing +: Note that we get eos right away (i.e., there is no word token). +: +$* <>EOO + echo foo + }}} + EOI + ' echo foo + ' + }}} + + EOO +} diff --git a/libbuild2/lexer+normal.test.testscript b/libbuild2/lexer+normal.test.testscript index e66b81e..e2780a2 100644 --- a/libbuild2/lexer+normal.test.testscript +++ b/libbuild2/lexer+normal.test.testscript @@ -70,3 +70,21 @@ EOO 'x%' EOO } + +: multi-lcbrace +: Leading multi-curly-brace recognition. +: +{ + : two + : + $* <:'{{' >>EOO + {{ + EOO + + : three + : + $* <:'{{{c++' >>EOO + {{{ + 'c++' + EOO +} diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx index e970437..6d3504c 100644 --- a/libbuild2/lexer.cxx +++ b/libbuild2/lexer.cxx @@ -128,10 +128,16 @@ namespace build2 n = false; break; } + case lexer_mode::foreign: + assert (data > 1); + // Fall through. case lexer_mode::single_quoted: case lexer_mode::double_quoted: - s = false; - // Fall through. + { + assert (ps == '\0'); + s = false; + break; + } case lexer_mode::variable: { // These are handled in an ad hoc way in word(). @@ -141,7 +147,7 @@ namespace build2 default: assert (false); // Unhandled custom mode. } - state_.push (state {m, data, a, ps, s, n, q, *esc, s1, s2}); + state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2}); } token lexer:: @@ -166,6 +172,7 @@ namespace build2 case lexer_mode::buildspec: break; case lexer_mode::eval: return next_eval (); case lexer_mode::double_quoted: return next_quoted (); + case lexer_mode::foreign: return next_foreign (); default: assert (false); // Unhandled custom mode. } @@ -241,11 +248,29 @@ namespace build2 } } + // Line-leading tokens in the normal mode. + // + // Note: must come before any other (e.g., `{`) tests below. + // if (m == lexer_mode::normal && first) { switch (c) { case '%': return make_token (type::percent); + case '{': + { + string v; + while (peek () == '{') + v += get (); + + if (!v.empty ()) + { + v += '{'; + return make_token (type::multi_lcbrace, move (v)); + } + + break; + } } } @@ -507,6 +532,99 @@ namespace build2 } token lexer:: + next_foreign () + { + state& st (state_.top ()); + + if (st.hold) + { + token r (move (*st.hold)); + state_.pop (); // Expire foreign mode. + return r; + } + + auto count (state_.top ().data); // Number of closing braces to expect. + + xchar c (get ()); // First character of first line after `{{...`. + uint64_t ln (c.line), cn (c.column); + + string lexeme; + for (bool first (true); !eos (c); c = get ()) + { + // If this is the first character of a line, recognize closing braces. + // + if (first) + { + first = false; + + // If this turns not to be the closing braces, we need to add any + // characters we have extracted to lexeme. Instead of saving these + // characters in a temporary we speculatively add them to the lexeme + // but then chop them off if this turned out to be the closing braces. + // + size_t chop (lexeme.size ()); + + // Skip leading whitespaces, if any. + // + for (; c == ' ' || c == '\t'; c = get ()) + lexeme += c; + + uint64_t bln (c.line), bcn (c.column); // Position of first `}`. + + // Count braces. + // + auto i (count); + for (; c == '}'; c = get ()) + { + lexeme += c; + + if (--i == 0) + break; + } + + if (i == 0) // Got enough braces. + { + // Make sure there are only whitespaces/comments after. Note that + // now we must start peeking since newline is not "ours". + // + for (c = peek (); c == ' ' || c == '\t'; c = peek ()) + lexeme += get (); + + if (c == '\n' || c == '#' || eos (c)) + { + st.hold = token (type::multi_rcbrace, + string (count, '}'), + false, quote_type::unquoted, false, + bln, bcn, + token_printer); + + lexeme.resize (chop); + return token (move (lexeme), + false, quote_type::unquoted, false, + ln, cn); + } + + get (); // And fall through (not eos). + } + else + { + if (eos (c)) + break; + + // Fall through. + } + } + + if (c == '\n') + first = true; + + lexeme += c; + } + + return token (type::eos, false, c.line, c.column, token_printer); + } + + token lexer:: word (state st, bool sep) { lexer_mode m (st.mode); diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx index 6dc5027..8dd58c8 100644 --- a/libbuild2/lexer.hxx +++ b/libbuild2/lexer.hxx @@ -43,13 +43,22 @@ namespace build2 // split words separated by the pair character (to disable pairs one can // pass `\0` as a pair character). // - // The normal mode recognizes `%` at the beginning of the line as special. - // The cmdvar mode is like normal but does not treat `%` as special. + // The normal mode recognizes `%` and `{{...` at the beginning of the line + // as special. The cmdvar mode is like normal but does not treat these + // character sequences as special. + // + // Finally, the foreign mode reads everything until encountering a line that + // contains nothing (besides whitespaces) other than the closing multi- + // curly-brace (`}}...`) (or eos) returning the contents as the word token + // followed by the multi_rcbrace (or eos). In a way it is similar to the + // single-quote mode. The number of closing braces to expect is passed as + // mode data. // // The alternative modes must be set manually. The value/values and derived // modes automatically expires after the end of the line. The attribute mode // expires after the closing `]`. The variable mode expires after the word - // token. And the eval mode expires after the closing `)`. + // token. The eval mode expires after the closing `)`. And the foreign mode + // expires after the closing braces. // // Note that normally it is only safe to switch mode when the current token // is not quoted (or, more generally, when you are not in the double-quoted @@ -85,6 +94,7 @@ namespace build2 eval, single_quoted, double_quoted, + foreign, buildspec, value_next @@ -163,8 +173,10 @@ namespace build2 protected: struct state { - lexer_mode mode; - uintptr_t data; + lexer_mode mode; + uintptr_t data; + optional hold; + bool attributes; char sep_pair; @@ -190,6 +202,9 @@ namespace build2 token next_quoted (); + token + next_foreign (); + // Lex a word assuming current is the top state (which may already have // been "expired" from the top). // diff --git a/libbuild2/lexer.test.cxx b/libbuild2/lexer.test.cxx index 5e39e43..3458f56 100644 --- a/libbuild2/lexer.test.cxx +++ b/libbuild2/lexer.test.cxx @@ -1,6 +1,7 @@ // file : libbuild2/lexer.test.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file +#include // strtoul() #include #include @@ -14,13 +15,15 @@ using namespace std; namespace build2 { - // Usage: argv[0] [-q] [] + // Usage: argv[0] [-q] [[=]] // int main (int argc, char* argv[]) { bool quote (false); + lexer_mode m (lexer_mode::normal); + uintptr_t d (0); for (int i (1); i != argc; ++i) { @@ -36,7 +39,12 @@ namespace build2 else if (a == "attributes") m = lexer_mode::attributes; else if (a == "eval") m = lexer_mode::eval; else if (a == "buildspec") m = lexer_mode::buildspec; - else assert (false); + else if (a.compare (0, 8, "foreign=") == 0) + { + m = lexer_mode::foreign; + d = strtoul (a.c_str () + 8, nullptr, 10); + } + else assert (false); break; } } @@ -51,7 +59,7 @@ namespace build2 lexer l (cin, in); if (m != lexer_mode::normal) - l.mode (m); + l.mode (m, '\0', nullopt, d); // No use printing eos since we will either get it or loop forever. // diff --git a/libbuild2/test/script/lexer.cxx b/libbuild2/test/script/lexer.cxx index 1eeb0be..32c1cf4 100644 --- a/libbuild2/test/script/lexer.cxx +++ b/libbuild2/test/script/lexer.cxx @@ -138,7 +138,7 @@ namespace build2 } assert (ps == '\0'); - state_.push (state {m, data, a, ps, s, n, q, *esc, s1, s2}); + state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2}); } token lexer:: diff --git a/libbuild2/token.cxx b/libbuild2/token.cxx index 11b080e..cfdc6bd 100644 --- a/libbuild2/token.cxx +++ b/libbuild2/token.cxx @@ -21,39 +21,42 @@ namespace build2 case token_type::pair_separator: os << ""; break; case token_type::word: os << '\'' << t.value << '\''; break; - case token_type::colon: os << q << ':' << q; break; - case token_type::dollar: os << q << '$' << q; break; - case token_type::question: os << q << '?' << q; break; - case token_type::percent: os << q << '%' << q; break; - case token_type::comma: os << q << ',' << q; break; - - case token_type::lparen: os << q << '(' << q; break; - case token_type::rparen: os << q << ')' << q; break; - - case token_type::lcbrace: os << q << '{' << q; break; - case token_type::rcbrace: os << q << '}' << q; break; - - case token_type::lsbrace: os << q << '[' << q; break; - case token_type::rsbrace: os << q << ']' << q; break; - - case token_type::labrace: os << q << '<' << q; break; - case token_type::rabrace: os << q << '>' << q; break; - - case token_type::assign: os << q << '=' << q; break; - case token_type::prepend: os << q << "=+" << q; break; - case token_type::append: os << q << "+=" << q; break; - case token_type::default_assign: os << q << "?=" << q; break; - - case token_type::equal: os << q << "==" << q; break; - case token_type::not_equal: os << q << "!=" << q; break; - case token_type::less: os << q << '<' << q; break; - case token_type::greater: os << q << '>' << q; break; - case token_type::less_equal: os << q << "<=" << q; break; - case token_type::greater_equal: os << q << ">=" << q; break; - - case token_type::log_or: os << q << "||" << q; break; - case token_type::log_and: os << q << "&&" << q; break; - case token_type::log_not: os << q << '!' << q; break; + case token_type::colon: os << q << ':' << q; break; + case token_type::dollar: os << q << '$' << q; break; + case token_type::question: os << q << '?' << q; break; + case token_type::percent: os << q << '%' << q; break; + case token_type::comma: os << q << ',' << q; break; + + case token_type::lparen: os << q << '(' << q; break; + case token_type::rparen: os << q << ')' << q; break; + + case token_type::lcbrace: os << q << '{' << q; break; + case token_type::rcbrace: os << q << '}' << q; break; + + case token_type::multi_lcbrace: os << q << t.value << q; break; + case token_type::multi_rcbrace: os << q << t.value << q; break; + + case token_type::lsbrace: os << q << '[' << q; break; + case token_type::rsbrace: os << q << ']' << q; break; + + case token_type::labrace: os << q << '<' << q; break; + case token_type::rabrace: os << q << '>' << q; break; + + case token_type::assign: os << q << '=' << q; break; + case token_type::prepend: os << q << "=+" << q; break; + case token_type::append: os << q << "+=" << q; break; + case token_type::default_assign: os << q << "?=" << q; break; + + case token_type::equal: os << q << "==" << q; break; + case token_type::not_equal: os << q << "!=" << q; break; + case token_type::less: os << q << '<' << q; break; + case token_type::greater: os << q << '>' << q; break; + case token_type::less_equal: os << q << "<=" << q; break; + case token_type::greater_equal: os << q << ">=" << q; break; + + case token_type::log_or: os << q << "||" << q; break; + case token_type::log_and: os << q << "&&" << q; break; + case token_type::log_not: os << q << '!' << q; break; default: assert (false); // Unhandled extended token. } diff --git a/libbuild2/token.hxx b/libbuild2/token.hxx index 8dad4ba..e11b880 100644 --- a/libbuild2/token.hxx +++ b/libbuild2/token.hxx @@ -45,6 +45,9 @@ namespace build2 lcbrace, // { rcbrace, // } + multi_lcbrace, // {{... (value contains the braces) + multi_rcbrace, // }}... (value contains the braces) + lsbrace, // [ rsbrace, // ] -- cgit v1.1