From 5381b25c51475c0c7a2f39e9f6efa623f621ef3e Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Thu, 13 Oct 2016 13:08:31 +0200 Subject: Continue work on testscript parser --- build2/test/script/lexer | 3 +- build2/test/script/lexer.cxx | 28 ++- build2/test/script/parser | 21 +-- build2/test/script/parser.cxx | 403 +++++++++++++++++++++++++++++++++++++----- build2/test/script/script | 39 ++++ doc/testscript.cli | 66 ++++--- 6 files changed, 467 insertions(+), 93 deletions(-) diff --git a/build2/test/script/lexer b/build2/test/script/lexer index de4c84e..f3d8fa9 100644 --- a/build2/test/script/lexer +++ b/build2/test/script/lexer @@ -27,7 +27,8 @@ namespace build2 script_line = base_type::value_next, variable_line, test_line, - command_line + command_line, + here_line }; using base_type::base_type; diff --git a/build2/test/script/lexer.cxx b/build2/test/script/lexer.cxx index 84be7c1..e0a3272 100644 --- a/build2/test/script/lexer.cxx +++ b/build2/test/script/lexer.cxx @@ -25,24 +25,24 @@ namespace build2 { case lexer_mode::script_line: { - s1 = "=+!|&<> $()#\t\n"; - s2 = " == "; + s1 = "=+!|&<> $(#\t\n"; + s2 = " == "; break; } case lexer_mode::variable_line: { // Like value except we don't recognize {. // - s1 = " $()[]#\t\n"; - s2 = " "; + s1 = " $([]#\t\n"; + s2 = " "; break; } case lexer_mode::test_line: { // As script_line but without variable assignments. // - s1 = "=!|&<> $()#\t\n"; - s2 = "== "; + s1 = "=!|&<> $(#\t\n"; + s2 = "== "; break; } case lexer_mode::command_line: @@ -54,6 +54,16 @@ namespace build2 s = false; break; } + case lexer_mode::here_line: + { + // This one is like a double-quoted string except it treats + // newlines as a separator. 
+ // + s1 = "$(\n"; + s2 = " "; + s = false; + break; + } case lexer_mode::single_quoted: case lexer_mode::double_quoted: quoted_ = true; @@ -77,8 +87,9 @@ namespace build2 case lexer_mode::script_line: case lexer_mode::variable_line: case lexer_mode::test_line: - case lexer_mode::command_line: return next_line (); - default: return base_lexer::next_impl (); + case lexer_mode::command_line: + case lexer_mode::here_line: return next_line (); + default: return base_lexer::next_impl (); } } @@ -110,7 +121,6 @@ namespace build2 // case '$': return token (type::dollar, sep, ln, cn); case '(': return token (type::lparen, sep, ln, cn); - case ')': return token (type::rparen, sep, ln, cn); } } diff --git a/build2/test/script/parser b/build2/test/script/parser index 720a077..0ba4710 100644 --- a/build2/test/script/parser +++ b/build2/test/script/parser @@ -25,11 +25,9 @@ namespace build2 class parser: protected build2::parser { public: - using script_type = test::script::script; - // Issue diagnostics and throw failed in case of an error. // - script_type + script parse (istream&, const path& name, target& test, target& script); // Recursive descent parser. 
@@ -40,25 +38,28 @@ namespace build2 // protected: void - script (token&, token_type&); + parse_script (token&, token_type&); void - script_line (token&, token_type&); + parse_script_line (token&, token_type&); void - variable_line (token&, token_type&, string); + parse_variable_line (token&, token_type&, string); void - test_line (token&, token_type&, names_type, location); + parse_test_line (token&, token_type&, names_type, location); - void - command_exit (token&, token_type&); + command_exit + parse_command_exit (token&, token_type&); + + string + parse_here_document (token&, token_type&, const string&); protected: using base_parser = build2::parser; lexer* lexer_; - script_type* script_; + script* script_; }; } } diff --git a/build2/test/script/parser.cxx b/build2/test/script/parser.cxx index aba9f9a..ce867b3 100644 --- a/build2/test/script/parser.cxx +++ b/build2/test/script/parser.cxx @@ -25,14 +25,14 @@ namespace build2 lexer_ = &l; base_parser::lexer_ = &l; - script_type r (test_t, script_t); + script r (test_t, script_t); script_ = &r; token t (type::eos, false, 0, 0); type tt; next (t, tt); - script (t, tt); + parse_script (t, tt); if (tt != type::eos) fail (t) << "unexpected " << t; @@ -41,16 +41,16 @@ namespace build2 } void parser:: - script (token& t, token_type& tt) + parse_script (token& t, token_type& tt) { while (tt != type::eos) { - script_line (t, tt); + parse_script_line (t, tt); } } void parser:: - script_line (token& t, token_type& tt) + parse_script_line (token& t, token_type& tt) { // Parse first chunk. Keep track of whether anything in it was quoted. 
// @@ -98,14 +98,14 @@ namespace build2 if (ns.size () != 1 || !ns[0].simple () || ns[0].empty ()) fail (nl) << "variable name expected instead of '" << ns << "'"; - variable_line (t, tt, move (ns[0].value)); + parse_variable_line (t, tt, move (ns[0].value)); } else - test_line (t, tt, move (ns), move (nl)); + parse_test_line (t, tt, move (ns), move (nl)); } void parser:: - variable_line (token& t, token_type& tt, string name) + parse_variable_line (token& t, token_type& tt, string name) { type kind (tt); // Assignment kind. const variable_type& var (script_->var_pool.insert (move (name))); @@ -125,18 +125,196 @@ namespace build2 } void parser:: - test_line (token& t, token_type& tt, names_type ns, location nl) + parse_test_line (token& t, token_type& tt, names_type ns, location nl) { // Stop recognizing variable assignments. // mode (lexer_mode::test_line); + test ts; + + // Pending positions where the next word should go. + // + enum class pending + { + none, + program, + in_string, + in_document, + out_string, + out_document, + err_string, + err_document + }; + pending p (pending::program); + + // Ordered sequence of here-document redirects that we can expect to + // see after the command line. We temporarily store the end marker + // as the redirect's value. + // + vector> hd; + + // Add the next word to either one of the pending positions or + // to program arguments by default. + // + auto add_word = [&ts, &p, &hd, this] (string&& w, const location& l) + { + switch (p) + { + case pending::none: ts.arguments.push_back (move (w)); break; + case pending::program: + { + try + { + ts.program = path (move (w)); + + if (ts.program.empty ()) + fail (l) << "empty program path"; + } + catch (const invalid_path& e) + { + fail (l) << "invalid program path '" << e.path << "'"; + } + break; + } + case pending::in_document: hd.push_back (ts.in); // Fall through. 
+ case pending::in_string: ts.in.value = move (w); break; + + case pending::out_document: hd.push_back (ts.out); // Fall through. + case pending::out_string: ts.out.value = move (w); break; + + case pending::err_document: hd.push_back (ts.err); // Fall through. + case pending::err_string: ts.err.value = move (w); break; + } + + p = pending::none; + }; + + // Make sure we don't have any pending positions to fill. + // + auto check_pending = [p, this] (const location& l) + { + const char* what (nullptr); + + switch (p) + { + case pending::none: break; + case pending::program: what = "program"; break; + case pending::in_string: what = "stdin here-string"; break; + case pending::in_document: what = "stdin here-document end"; break; + case pending::out_string: what = "stdout here-string"; break; + case pending::out_document: what = "stdout here-document end"; break; + case pending::err_string: what = "stderr here-string"; break; + case pending::err_document: what = "stderr here-document end"; break; + } + + if (what != nullptr) + fail (l) << "missing " << what; + }; + + // Parse the redirect operator. + // + auto parse_redirect = + [&ts, &p, this] (const token& t, const location& l) + { + // Our semantics is the last redirect seen takes effect. + // + assert (p == pending::none); + + // See if we have the file descriptor. + // + unsigned long fd (3); + if (!t.separated) + { + if (!ts.arguments.empty ()) + fail (l) << "missing redirect file descriptor"; + + const string& s (ts.arguments.back ()); + + try + { + size_t n; + fd = stoul (s, &n); + + if (n != s.size () || fd > 2) + throw invalid_argument (string ()); + } + catch (const exception&) + { + fail (l) << "invalid redirect file descriptor '" << s << "'"; + } + + ts.arguments.pop_back (); + } + + type tt (t.type); + + // Validate/set default file descriptor. + // + switch (tt) + { + case type::in_null: + case type::in_string: + case type::in_document: + { + if ((fd = fd == 3 ? 
0 : fd) != 0) + fail (l) << "invalid in redirect file descriptor " << fd; + + break; + } + case type::out_null: + case type::out_string: + case type::out_document: + { + if ((fd = fd == 3 ? 1 : fd) == 0) + fail (l) << "invalid out redirect file descriptor " << fd; + + break; + } + } + + redirect_type rt; + switch (tt) + { + case type::in_null: + case type::out_null: rt = redirect_type::null; break; + case type::in_string: + case type::out_string: rt = redirect_type::here_string; break; + case type::in_document: + case type::out_document: rt = redirect_type::here_document; break; + } + + redirect& r (fd == 0 ? ts.in : fd == 1 ? ts.out : ts.err); + r.type = rt; + + switch (rt) + { + case redirect_type::none: + case redirect_type::null: + break; + case redirect_type::here_string: + switch (fd) + { + case 0: p = pending::in_string; break; + case 1: p = pending::out_string; break; + case 2: p = pending::err_string; break; + } + break; + case redirect_type::here_document: + switch (fd) + { + case 0: p = pending::in_document; break; + case 1: p = pending::out_document; break; + case 2: p = pending::err_document; break; + } + break; + } + }; + // Keep parsing chunks of the command line until we see the newline or // the exit status comparison. // - strings cmd; - - do + for (bool done (false); !done; ) { // Process words that we already have. // @@ -161,7 +339,7 @@ namespace build2 // re-lex. // if (q || s.find_first_of ("|&<>\'\"\\") == string::npos) - cmd.push_back (move (s)); + add_word (move (s), nl); else { // Come up with a "path" that contains both the original @@ -188,19 +366,28 @@ namespace build2 string w; bool f (true); // In case the whole thing is empty. - for (token t (lex.next ()); t.type != type::eos; t = lex.next ()) + + // Treat the first "sub-token" as always separated from what we + // saw earlier. + // + // Note that this is not "our" token so we cannot do fail(t). + // Rather we should do fail(l). 
+ // + token t (lex.next ()); + location l (build2::get_location (t, name)); + t.separated = true; + + for (; t.type != type::eos; t = lex.next ()) { - // Note that this is not "our" token so we cannot do fail(t). - // Rather we should do fail(l). - // - location l (build2::get_location (t, lex.name ())); + type tt (t.type); + l = build2::get_location (t, name); // Re-lexing double-quotes will recognize $, ( inside as // tokens so we have to reverse them back. Since we don't // treat spaces as separators we can be sure we will get it // right. // - switch (t.type) + switch (tt) { case type::dollar: w += '$'; continue; case type::lparen: w += '('; continue; @@ -211,28 +398,33 @@ namespace build2 // if (!w.empty () || f) { - cmd.push_back (move (w)); + add_word (move (w), l); f = false; } - switch (t.type) + if (tt == type::name) { - case type::name: w = move (t.value); f = true; break; + w = move (t.value); + f = true; + continue; + } - // @@ TODO - // - case type::pipe: - case type::clean: - case type::log_and: - case type::log_or: + // If this is one of the operators/separators, check that we + // don't have any pending locations to be filled. + // + check_pending (l); + // Note: there is another one in the outer loop below. + // + switch (tt) + { case type::in_null: case type::in_string: case type::in_document: - case type::out_null: case type::out_string: case type::out_document: + parse_redirect (t, l); break; } } @@ -240,43 +432,164 @@ namespace build2 // Don't forget the last word. // if (!w.empty () || f) - cmd.push_back (move (w)); + add_word (move (w), l); } } - if (tt == type::newline || - tt == type::equal || - tt == type::not_equal) - break; + switch (tt) + { + case type::equal: + case type::not_equal: + case type::newline: + { + done = true; + break; + } + default: + { + // Parse the next chunk. + // + ns.clear (); + lexer_->reset_quoted (t.quoted); + nl = get_location (t); + names (t, tt, ns, true); + continue; + } + } - // Parse the next chunk. 
+ // If this is one of the operators/separators, check that we don't + // have any pending locations to be filled. // - ns.clear (); - lexer_->reset_quoted (t.quoted); - names (t, tt, ns, true); + check_pending (nl); - } while (true); + // Note: there is another one in the inner loop above. + // + switch (tt) + { + case type::in_null: + case type::in_string: + case type::in_document: + case type::out_null: + case type::out_string: + case type::out_document: + parse_redirect (t, get_location (t)); + next (t, tt); + break; + } + } - //@@ switch mode (we no longer want to recognize command operators)? + // Verify we don't have anything pending to be filled. + // + check_pending (nl); + // While we no longer need to recognize command line operators, we + // also don't expect a valid test trailer to contain them. So we are + // going to continue lexing in the test_line mode. + // if (tt == type::equal || tt == type::not_equal) { - command_exit (t, tt); + next (t, tt); + ts.exit = parse_command_exit (t, tt); } - // here-document + if (tt != type::newline) + fail (t) << "unexpected " << t; + + expire_mode (); // Done parsing test-line. + + // Parse here-document fragments in the order they were mentioned on + // the command line. The end marker is temporarily stored as the + // redirect's value. + // + if (!hd.empty ()) + { + // Switch to the here-line mode which is like double-quoted but + // recognized the newline as a separator. + // + mode (lexer_mode::here_line); + next (t, tt); + + for (redirect& r: hd) + r.value = parse_here_document (t, tt, r.value); + + expire_mode (); + } } - void parser:: - command_exit (token& t, token_type& tt) + command_exit parser:: + parse_command_exit (token& t, token_type& tt) { // The next chunk should be the exit status. // - next (t, tt); names_type ns (names (t, tt, true)); //@@ TODO: validate to be single, simple, non-empty name that // converts to integer (is exit status always non-negative). 
+ + return command_exit {exit_comparison::eq, 0}; + } + + string parser:: + parse_here_document (token& t, token_type& tt, const string& em) + { + string r; + + while (tt != type::eos) + { + // Check if this is the end marker. + // + if (tt == type::name && + !t.quoted && + t.value == em && + peek () == type::newline) + { + next (t, tt); // Get the newline. + break; + } + + // Expand the line. + // + names_type ns (names (t, tt)); + + // What shall we do if the expansion results in multiple names? For, + // example if the line contains just the variable expansion and it + // is of type strings. Adding all the elements space-separated seems + // like the natural thing to do. + // + for (auto b (ns.begin ()), i (b); i != ns.end (); ++i) + { + string s; + + try + { + s = value_traits::convert (move (*i), nullptr); + } + catch (const invalid_argument&) + { + fail (t) << "invalid string value '" << *i << "'"; + } + + if (i != b) + r += ' '; + + r += s; + r += '\n'; // Here-document line always includes a newline. + } + + // We should expand the whole line at once so this would normally be + // a newline but can also be an end-of-stream. + // + if (tt == type::newline) + next (t, tt); + else + assert (tt == type::eos); + } + + if (tt == type::eos) + fail (t) << "missing here-document end marker '" << em << "'"; + + next (t, tt); + return r; } } } diff --git a/build2/test/script/script b/build2/test/script/script index de81fa6..cda4feb 100644 --- a/build2/test/script/script +++ b/build2/test/script/script @@ -18,6 +18,45 @@ namespace build2 { namespace script { + enum class redirect_type + { + none, + null, + here_string, // Value is the string. + here_document // Value is the document. 
+ }; + + struct redirect + { + redirect_type type = redirect_type::none; + string value; + }; + + struct command + { + path program; + strings arguments; + + redirect in; + redirect out; + redirect err; + }; + + enum class exit_comparison {eq, ne}; + + struct command_exit + { + // @@ Need to understand what type we should use for status. + + exit_comparison comparison = exit_comparison::eq; + uint16_t status = 0; + }; + + struct test: command + { + command_exit exit; + }; + class script { public: diff --git a/doc/testscript.cli b/doc/testscript.cli index 1aff571..a5aef9d 100644 --- a/doc/testscript.cli +++ b/doc/testscript.cli @@ -711,23 +711,33 @@ command-exit: ('=='|'!=') command: (' '+ )* {stdin? stdout? stderr?} -stdin: '0'? ('|\ - '<<' ) +stdin: '0'?('|\ + '<<' ) -stdout: '1'? ('>!'|\ - '>' |\ - '>>' ) +stdout: '1'?('>!'|\ + '>' |\ + '>>' ) -stderr: '2' ('>!'|\ - '>' |\ - '>>' ) +stderr: '2'('>!'|\ + '>' |\ + '>>' ) here-document: * \ +Note that if specified, file descriptors must not be separated from the +redirect operator with whitespaces. In other words, the following command +has \c{2} as an argument and redirects \c{stdout}, not \c{stderr}. + +Here-line is like a double-quoted string but recognizes newlines. + +\ +$* 2 >! +\ + \ script: (script-scope|script-line)* @@ -782,25 +792,25 @@ command-pipe: command ('|' command)* command: (' '+ )* {stdin? stdout? stderr? cleanup*} -stdin: '0'? ('|\ - '<<' |\ - '<<<' ) - -stdout: '1'? ('>!'|\ - '>?'|\ - '>&' '2'|\ - '>' |\ - '>>' |\ - ('>>>'|'>>>&') ) - -stderr: '2' ('>!'|\ - '>?'|\ - '>&' '1' |\ - '>' |\ - '>>' |\ - ('>>>'|'>>>&') ) +stdin: ('|\ + '<<' |\ + '<<<' ) + +stdout: ('>!'|\ + '>?'|\ + '>&' '2'|\ + '>' |\ + '>>' |\ + ('>>>'|'>>>&') ) + +stderr: '2' ('>!'|\ + '>?'|\ + '>&' '1' |\ + '>' |\ + '>>' |\ + ('>>>'|'>>>&') ) cleanup: '&' (|) -- cgit v1.1