From 73c7f8615ebfaf76063207fbd071b2ff7b6b5a3f Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Sat, 26 Nov 2016 16:19:28 +0200 Subject: Spec testscript regex, add support in token/lexer --- build2/test/script/lexer.cxx | 110 ++++++++++++++++++++++--------------- build2/test/script/parser | 6 +- build2/test/script/parser.cxx | 124 ++++++++++++++++-------------------------- build2/test/script/token | 19 ++----- build2/test/script/token.cxx | 55 +++++++++---------- build2/token | 20 +++++-- doc/testscript.cli | 104 +++++++++++++++++++++++++++++++++-- 7 files changed, 262 insertions(+), 176 deletions(-) diff --git a/build2/test/script/lexer.cxx b/build2/test/script/lexer.cxx index 72fa85b..cdf726b 100644 --- a/build2/test/script/lexer.cxx +++ b/build2/test/script/lexer.cxx @@ -4,6 +4,8 @@ #include +#include // strchr() + using namespace std; namespace build2 @@ -176,6 +178,33 @@ namespace build2 if (eos (c)) return make_token (type::eos); + auto make_token_with_modifiers = + [&sep, ln, cn, this] (type t, const char* mods, bool exc = false) + { + string v; + if (mods != nullptr) + { + for (xchar p (peek ()); + (strchr (mods, p) != nullptr && // Modifier. + strchr (v.c_str (), p) == nullptr); // Not already seen. + p = peek ()) + { + get (); + v += p; + + // If mutually exclusive, then we are done. + // + if (exc) + break; + } + } + + return token (t, move (v), sep, + quote_type::unquoted, false, + ln, cn, + token_printer); + }; + state st (state_.top ()); // Make copy (see first/second_token). lexer_mode m (st.mode); @@ -299,27 +328,22 @@ namespace build2 { xchar p (peek ()); - if (p == '?' || p == '!' 
|| p == '&') + if (p == '&') { get (); - - switch (p) - { - case '?': return make_token (type::clean_maybe); - case '!': return make_token (type::clean_never); - case '&': return make_token (type::log_and); - } + return make_token (type::log_and); } - else - return make_token (type::clean_always); + + return make_token_with_modifiers (type::clean, "!?", true); } // < // case '<': { + type r (type::in_str); xchar p (peek ()); - if (p == '+' || p == '-' || p == ':' || p == '<') + if (p == '+' || p == '-' || p == '<') { get (); @@ -327,35 +351,40 @@ namespace build2 { case '+': return make_token (type::in_pass); case '-': return make_token (type::in_null); - case ':': return make_token (type::in_str_nn); case '<': { + r = type::in_doc; p = peek (); - if (p == ':' || p == '<') + if (p == '<') { get (); - - return make_token (p == ':' - ? type::in_doc_nn - : type::in_file); + r = type::in_file; } - else - return make_token (type::in_doc); + break; } } } - else - return make_token (type::in_str); + // Handle modifiers. + // + const char* mod (nullptr); + switch (r) + { + case type::in_str: + case type::in_doc: mod = ":"; break; + } + + return make_token_with_modifiers (r, mod); } // > // case '>': { + type r (type::out_str); xchar p (peek ()); - if (p == '+' || p == '-' || p == '&' || p == ':' || p == '>') + if (p == '+' || p == '-' || p == '&' || p == '>') { get (); @@ -364,37 +393,32 @@ namespace build2 case '+': return make_token (type::out_pass); case '-': return make_token (type::out_null); case '&': return make_token (type::out_merge); - case ':': return make_token (type::out_str_nn); case '>': { + r = type::out_doc; p = peek (); - if (p == ':' || p == '>') + if (p == '>') { get (); - - if (p == ':') - return make_token (type::out_doc_nn); - - // File redirect. 
- // - p = peek (); - - if (p == '&') - { - get (); - return make_token (type::out_file_app); - } - else - return make_token (type::out_file); + r = type::out_file; } - else - return make_token (type::out_doc); + break; } } } - else - return make_token (type::out_str); + + // Handle modifiers. + // + const char* mod (nullptr); + switch (r) + { + case type::out_str: + case type::out_doc: mod = "~:"; break; + case type::out_file: mod = "&"; break; + } + + return make_token_with_modifiers (r, mod); } } } diff --git a/build2/test/script/parser b/build2/test/script/parser index da82df2..ee270d8 100644 --- a/build2/test/script/parser +++ b/build2/test/script/parser @@ -102,7 +102,7 @@ namespace build2 int fd; // Redirect fd (0 - in, 1 - out, 2 - err). string end; bool literal; // Literal (single-quote). - bool no_newline; // No final newline. + string modifiers; }; using here_docs = vector; @@ -116,7 +116,9 @@ namespace build2 parse_here_documents (token&, token_type&, pair&); string - parse_here_document (token&, token_type&, const string&, bool); + parse_here_document (token&, token_type&, + const string&, + const string&); // Execute. Issue diagnostics and throw failed in case of an error. // diff --git a/build2/test/script/parser.cxx b/build2/test/script/parser.cxx index 9e2018f..fae138b 100644 --- a/build2/test/script/parser.cxx +++ b/build2/test/script/parser.cxx @@ -1256,16 +1256,13 @@ namespace build2 clean }; pending p (pending::program); - bool nn (false); // True if pending here-{str,doc} is "no-newline". - bool app (false); // True if to append to pending file. - cleanup_type ct; // Pending cleanup type. - here_docs hd; // Expected here-documents. + string mod; // Modifiers for pending in_* and out_* positions. + here_docs hd; // Expected here-documents. // Add the next word to either one of the pending positions or to // program arguments by default. 
// - auto add_word = - [&c, &p, &nn, &app, &ct, this] (string&& w, const location& l) + auto add_word = [&c, &p, &mod, this] (string&& w, const location& l) { auto add_merge = [&l, this] (redirect& r, const string& w, int fd) { @@ -1284,9 +1281,10 @@ namespace build2 << "file descriptor must be " << fd; }; - auto add_here_str = [&nn] (redirect& r, string&& w) + auto add_here_str = [&mod] (redirect& r, string&& w) { - if (!nn) w += '\n'; + if (mod.find (':') == string::npos) + w += '\n'; r.str = move (w); }; @@ -1310,7 +1308,7 @@ namespace build2 } }; - auto add_file = [&app, &parse_path] (redirect& r, int fd, string&& w) + auto add_file = [&mod, &parse_path] (redirect& r, int fd, string&& w) { const char* what (nullptr); switch (fd) @@ -1321,7 +1319,7 @@ namespace build2 } r.file.path = parse_path (move (w), what); - r.file.append = app; + r.file.append = mod.find ('&') != string::npos; }; switch (p) @@ -1349,13 +1347,23 @@ namespace build2 case pending::err_file: add_file (c.err, 2, move (w)); break; case pending::clean: - c.cleanups.push_back ({ct, parse_path (move (w), "cleanup path")}); - break; + { + cleanup_type t; + switch (mod[0]) // Ok, if empty + { + case '!': t = cleanup_type::never; break; + case '?': t = cleanup_type::maybe; break; + default: t = cleanup_type::always; break; + } + + c.cleanups.push_back ( + {t, parse_path (move (w), "cleanup path")}); + break; + } } p = pending::none; - nn = false; - app = false; + mod.clear (); }; // Make sure we don't have any pending positions to fill. @@ -1389,11 +1397,11 @@ namespace build2 // Parse the redirect operator. // auto parse_redirect = - [&c, &p, &nn, &app, this] (const token& t, const location& l) + [&c, &p, &mod, this] (token& t, const location& l) { // Our semantics is the last redirect seen takes effect. // - assert (p == pending::none && !nn && !app); + assert (p == pending::none && mod.empty ()); // See if we have the file descriptor. 
// @@ -1430,9 +1438,7 @@ namespace build2 case type::in_pass: case type::in_null: case type::in_str: - case type::in_str_nn: case type::in_doc: - case type::in_doc_nn: case type::in_file: { if ((fd = fd == 3 ? 0 : fd) != 0) @@ -1444,11 +1450,8 @@ namespace build2 case type::out_null: case type::out_merge: case type::out_str: - case type::out_str_nn: case type::out_doc: - case type::out_doc_nn: case type::out_file: - case type::out_file_app: { if ((fd = fd == 3 ? 1 : fd) == 0) fail (l) << "invalid out redirect file descriptor " << fd; @@ -1468,17 +1471,12 @@ namespace build2 case type::out_merge: rt = redirect_type::merge; break; - case type::in_str_nn: - case type::out_str_nn: nn = true; // Fall through. case type::in_str: case type::out_str: rt = redirect_type::here_string; break; - case type::in_doc_nn: - case type::out_doc_nn: nn = true; // Fall through. case type::in_doc: case type::out_doc: rt = redirect_type::here_document; break; - case type::out_file_app: app = true; // Fall through. case type::in_file: case type::out_file: rt = redirect_type::file; break; } @@ -1525,20 +1523,16 @@ namespace build2 } break; } + + mod = move (t.value); }; // Set pending cleanup type. // - auto parse_clean = [&p, &ct] (type tt) + auto parse_clean = [&p, &mod] (token& t) { - switch (tt) - { - case type::clean_always: ct = cleanup_type::always; break; - case type::clean_maybe: ct = cleanup_type::maybe; break; - case type::clean_never: ct = cleanup_type::never; break; - } - p = pending::clean; + mod = move (t.value); }; const location ll (get_location (t)); // Line location. 
@@ -1606,18 +1600,10 @@ namespace build2 case type::out_str: case type::out_doc: - case type::in_str_nn: - case type::in_doc_nn: - case type::out_str_nn: - case type::out_doc_nn: - case type::in_file: case type::out_file: - case type::out_file_app: - case type::clean_always: - case type::clean_maybe: - case type::clean_never: + case type::clean: { if (pre_parse_) { @@ -1625,16 +1611,12 @@ namespace build2 // end markers since we need to know how many of them to pre- // parse after the command. // - nn = false; - switch (tt) { - case type::in_doc_nn: - case type::out_doc_nn: - nn = true; - // Fall through. case type::in_doc: case type::out_doc: + mod = move (t.value); + // We require the end marker to be a literal, unquoted word. // In particularm, we don't allow quoted because of cases // like foo"$bar" (where we will see word 'foo'). @@ -1683,7 +1665,10 @@ namespace build2 hd.push_back ( here_doc { - 0, 0, 0, move (t.value), qt == quote_type::single, nn}); + 0, 0, 0, + move (t.value), + qt == quote_type::single, + move (mod)}); break; } @@ -1736,24 +1721,16 @@ namespace build2 case type::out_str: case type::out_doc: - case type::in_str_nn: - case type::in_doc_nn: - case type::out_str_nn: - case type::out_doc_nn: - case type::in_file: case type::out_file: - case type::out_file_app: { parse_redirect (t, l); break; } - case type::clean_always: - case type::clean_maybe: - case type::clean_never: + case type::clean: { - parse_clean (tt); + parse_clean (t); break; } @@ -1789,10 +1766,10 @@ namespace build2 move (t.value), (t.qtype == quote_type::unquoted || t.qtype == quote_type::single), - nn}); + move (mod)}); p = pending::none; - nn = false; + mod.clear (); next (t, tt); break; @@ -1975,30 +1952,21 @@ namespace build2 case type::in_str: case type::out_str: - case type::in_str_nn: - case type::out_str_nn: - case type::in_file: case type::out_file: - case type::out_file_app: { parse_redirect (t, l); break; } - case type::clean_always: - case type::clean_maybe: - case 
type::clean_never: + case type::clean: { - parse_clean (tt); + parse_clean (t); break; } case type::in_doc: case type::out_doc: - - case type::in_doc_nn: - case type::out_doc_nn: { fail (l) << "here-document redirect in expansion"; break; @@ -2093,7 +2061,7 @@ namespace build2 : lexer_mode::here_line_double); next (t, tt); - string v (parse_here_document (t, tt, h.end, h.no_newline)); + string v (parse_here_document (t, tt, h.end, h.modifiers)); if (!pre_parse_) { @@ -2109,7 +2077,9 @@ namespace build2 } string parser:: - parse_here_document (token& t, type& tt, const string& em, bool nn) + parse_here_document (token& t, type& tt, + const string& em, + const string& mod) { // enter: first token on first line // leave: newline (after end marker) @@ -2259,9 +2229,9 @@ namespace build2 } else { - // Add final newline if requested. + // Add final newline unless suppressed. // - if (!nn) + if (mod.find (':') == string::npos) r += '\n'; } diff --git a/build2/test/script/token b/build2/test/script/token index d4f6eec..7f79746 100644 --- a/build2/test/script/token +++ b/build2/test/script/token @@ -30,29 +30,22 @@ namespace build2 minus, // - pipe, // | - clean_always, // & - clean_maybe, // &? - clean_never, // &! 
+ clean, // &{?!} (modifiers in value) log_and, // && log_or, // || in_pass, // <+ in_null, // <- - in_str, // < - in_str_nn, // <: - in_doc, // << - in_doc_nn, // <<: + in_str, // <{:} (modifiers in value) + in_doc, // <<{:} (modifiers in value) in_file, // <<< out_pass, // >+ out_null, // >- out_merge, // >& - out_str, // > - out_str_nn, // >: - out_doc, // >> - out_doc_nn, // >>: - out_file, // >>> - out_file_app // >>>& + out_str, // >{:~} (modifiers in value) + out_doc, // >>{:~} (modifiers in value) + out_file // >>>{&} (modifiers in value) }; token_type () = default; diff --git a/build2/test/script/token.cxx b/build2/test/script/token.cxx index 79e64de..a8ef5b4 100644 --- a/build2/test/script/token.cxx +++ b/build2/test/script/token.cxx @@ -15,42 +15,37 @@ namespace build2 void token_printer (ostream& os, const token& t, bool d) { + const string& v (t.value); + // Only quote non-name tokens for diagnostics. // const char* q (d ? "'" : ""); switch (t.type) { - case token_type::semi: os << q << ';' << q; break; - - case token_type::plus: os << q << '+' << q; break; - case token_type::minus: os << q << '-' << q; break; - - case token_type::clean_always: os << q << '&' << q; break; - case token_type::clean_maybe: os << q << "&?" << q; break; - case token_type::clean_never: os << q << "&!" 
<< q; break; - - case token_type::pipe: os << q << '|' << q; break; - case token_type::log_and: os << q << "&&" << q; break; - case token_type::log_or: os << q << "||" << q; break; - - case token_type::in_pass: os << q << "<+" << q; break; - case token_type::in_null: os << q << "<-" << q; break; - case token_type::in_str: os << q << '<' << q; break; - case token_type::in_str_nn: os << q << "<:" << q; break; - case token_type::in_doc: os << q << "<<" << q; break; - case token_type::in_doc_nn: os << q << "<<:" << q; break; - case token_type::in_file: os << q << "<<<" << q; break; - - case token_type::out_pass: os << q << ">+" << q; break; - case token_type::out_null: os << q << ">-" << q; break; - case token_type::out_merge: os << q << ">&" << q; break; - case token_type::out_str: os << q << '>' << q; break; - case token_type::out_str_nn: os << q << ">:" << q; break; - case token_type::out_doc: os << q << ">>" << q; break; - case token_type::out_doc_nn: os << q << ">>:" << q; break; - case token_type::out_file: os << q << ">>>" << q; break; - case token_type::out_file_app: os << q << ">>>&" << q; break; + case token_type::semi: os << q << ';' << q; break; + + case token_type::plus: os << q << '+' << q; break; + case token_type::minus: os << q << '-' << q; break; + + case token_type::clean: os << q << '&' << v << q; break; + + case token_type::pipe: os << q << '|' << q; break; + case token_type::log_and: os << q << "&&" << q; break; + case token_type::log_or: os << q << "||" << q; break; + + case token_type::in_pass: os << q << "<+" << q; break; + case token_type::in_null: os << q << "<-" << q; break; + case token_type::in_str: os << q << '<' << v << q; break; + case token_type::in_doc: os << q << "<<" << v << q; break; + case token_type::in_file: os << q << "<<<" << q; break; + + case token_type::out_pass: os << q << ">+" << q; break; + case token_type::out_null: os << q << ">-" << q; break; + case token_type::out_merge: os << q << ">&" << q; break; + case 
token_type::out_str: os << q << '>' << v << q; break; + case token_type::out_doc: os << q << ">>" << v << q; break; + case token_type::out_file: os << q << ">>>" << v << q; break; default: build2::token_printer (os, t, d); } diff --git a/build2/token b/build2/token index df25d4c..0dc914f 100644 --- a/build2/token +++ b/build2/token @@ -81,7 +81,10 @@ namespace build2 quote_type qtype; bool qcomp; - string value; // Only valid for word. + // Normally only used for word, but can also be used to store "modifiers" + // or some such for other tokens. + // + string value; uint64_t line; uint64_t column; @@ -93,18 +96,23 @@ namespace build2 : token (token_type::eos, false, 0, 0, token_printer) {} token (token_type t, bool s, uint64_t l, uint64_t c, printer_type* p) - : type (t), separated (s), qtype (quote_type::unquoted), - line (l), column (c), - printer (p) {} + : token (t, string (), s, quote_type::unquoted, false, l, c, p) {} token (string v, bool s, quote_type qt, bool qc, uint64_t l, uint64_t c) - : type (token_type::word), separated (s), + : token (token_type::word, move (v), s, qt, qc, l, c, &token_printer){} + + token (token_type t, + string v, bool s, + quote_type qt, bool qc, + uint64_t l, uint64_t c, + printer_type* p) + : type (t), separated (s), qtype (qt), qcomp (qc), value (move (v)), line (l), column (c), - printer (&token_printer) {} + printer (p) {} }; // Output the token value in a format suitable for diagnostics. 
diff --git a/doc/testscript.cli b/doc/testscript.cli index 79c6836..a9ba608 100644 --- a/doc/testscript.cli +++ b/doc/testscript.cli @@ -792,16 +792,16 @@ stderr: '2'(out-redirect) in-redirect: '<-'|\ '<+'|\ - ('<'|'<:') |\ - ('<<'|'<<:') |\ + '<'{':'?} |\ + '<<'{':'?} |\ '<<<' out-redirect: '>-'|\ '>+'|\ '>&' ('1'|'2')|\ - ('>'|'>:') |\ - ('>>'|'>>:') |\ - ('>>>'|'>>>&') + '>'{':'?'~'?} |\ + '>>'{':'?'~'?} |\ + '>>>'{'&'?} cleanup: ('&'|'&!'|'&?') (|) @@ -1463,6 +1463,100 @@ EOI The leading whitespace stripping does not apply to line continuations. +\h#here-regex|Output Regex| + +The expected result in output here-strings and here-documents can be specified +as a regular expression instead of plain text. To signal the use of regular +expressions the redirect must include the \c{~} modifier, for example: + +\ +$* >~'/fo+/' 2>>~/EOE/ +/ba+r/ +baz +EOE +\ + +The regular expression used for output matching has two levels. At the outer +level the expression is over lines with each line treated as a single +character. We will refer to this outer expression as \i{line-regex} and +to its characters as \i{line-char}. + +A line-char can be a literal line (like \c{baz} in the example above) in +which case it will only be equal to an identical line in the output. Or a +line-char can be an inner level regex (like \c{ba+r} above) in which +case it will be equal to any line in the output that matches this regex. +Where not clear from context we will refer to this inner expression as +\i{char-regex} and its characters as \c{char}. + +A line is treated as literal unless it starts with the \i{regex introducer +character} (\c{/} in the above example). In contrast, the line-regex is always +in effect (in a sense, the \c{~} modifier is its introducer). Note that the +here-string regex naturally must always start with an introducer. + +A char-regex line that starts with an introducer must also end with one +optionally followed by \i{match flags}. 
Currently the only supported flag is +\c{i} for case-insensitive match. For example: + +\ +$* >>~/EOO/ +/ba+r/i +/ba+z/i +EOO +\ + +Any character can act as a regex introducer. For here-strings it is the first +character in the string. For here-documents the introducer is specified as +part of the end marker. In this case the first character is the introducer, +everything after that and until the second occurrence of the introducer is the +actual end marker, and everything after that are global match flags. Global +match flags apply to every char-regex (but not literal line) in this +here-document. Note that there is no way to escape the introducer character +inside the regex. + +As an example, here is a shorter version of the previous example that also +uses a different introducer character: + +\ +$* >>~%EOO%i +%ba+r% +%ba+z% +EOO +\ + +By default a line-char is treated as an ordinary, non-syntax character with +regard to line-regex. Lines that start with a regex introducer but do not end +with one are used to specify syntax line-chars. Such syntax line-chars can +also be specified after (or instead of) match flags. For example: + +\ +$* >>~/EOO/ +/( +/fo+x/| +/ba+r/| +/ba+z/ +/)+ +EOO +\ + +As an illustration, if we call the \c{/fo+x/} expression \c{A}, \c{/ba+r/} \- +\c{B}, and \c{/ba+z/} \- \c{C}, then we can represent the above line-regex in +the following more traditional form: + +\ +(A|B|C)+ +\ + +Only characters from the \c{()|*+?{\}0123456789,=!} set are allowed as +syntax line-chars with the presence of any other character being an error. + +A blank line as well as the \c{//} sequence (assuming \c{/} is the introducer) +are treated as an empty line-char. For the purpose of matching, newlines are +viewed as separators rather than being part of a line. In particular, in this +model, the customary trailing newline at the end of the output introduces a +trailing empty line-char.
As a result, unless the \c{:} (no newline) redirect +modifier is used, an empty line-char is implicitly added to line-regex. + + \h1#style|Style Guide| This section describes the Testscript style that is used in the \c{build2} -- cgit v1.1