// file : build2/test/script/parser.cxx -*- C++ -*- // copyright : Copyright (c) 2014-2016 Code Synthesis Ltd // license : MIT; see accompanying LICENSE file #include #include #include using namespace std; namespace build2 { namespace test { namespace script { using type = token_type; void parser:: parse (istream& is, const path& p, target& test_t, target& script_t, runner& r) { path_ = &p; lexer l (is, *path_, lexer_mode::script_line); lexer_ = &l; base_parser::lexer_ = &l; script s (test_t, script_t); script_ = &s; runner_ = &r; token t; type tt; next (t, tt); parse_script (t, tt); if (tt != type::eos) fail (t) << "unexpected " << t; } void parser:: parse_script (token& t, token_type& tt) { for (; tt != type::eos; next (t, tt)) { parse_script_line (t, tt); } } void parser:: parse_script_line (token& t, token_type& tt) { // Parse first chunk. Keep track of whether anything in it was quoted. // names_type ns; location nl (get_location (t)); lexer_->reset_quoted (t.quoted); names (t, tt, ns, true, "variable or program name"); // See if this is a variable assignment or a test command. // if (tt == type::assign || tt == type::prepend || tt == type::append) { // We need to strike a balance between recognizing command lines // that contain the assignment operator and variable assignments. // // If we choose to treat these tokens literally (for example, if we // have several names on the LHS), then we have the reversibility // problem: we need to restore original whitespaces before and after // the assignment operator (e.g., foo=bar vs foo = bar). // // To keep things simple we will start with the following rule: if // the token after the first chunk of input is assignment, then it // must be a variable assignment. After all, command lines like this // are not expected to be common: // // $* =x // // It will also be easy to get the desired behavior with quoting: // // $* "=x" // // The only issue here is if $* above expands to a single, simple // name (e.g., an executable name) in which case it will be treated // as a variable name. One way to resolve it would be to detect // "funny" variable names and require that they be quoted (this // won't help with built-in commands; maybe we could warn if it's // the same as built-in). Note that currently we have no way of // knowing it's quoted. // // Or perhaps we should just let people learn that first assignment // needs to be quoted? // if (ns.size () != 1 || !ns[0].simple () || ns[0].empty ()) fail (nl) << "variable name expected instead of '" << ns << "'"; parse_variable_line (t, tt, move (ns[0].value)); } else parse_test_line (t, tt, move (ns), move (nl)); } void parser:: parse_variable_line (token& t, token_type& tt, string name) { type kind (tt); // Assignment kind. const variable_type& var (script_->var_pool.insert (move (name))); // We cannot reuse the value mode since it will recognize { which // we want to treat as a literal. // value rhs (variable_value (t, tt, lexer_mode::variable_line)); if (tt != type::newline) fail (t) << "unexpected " << t; value& lhs (kind == type::assign ? script_->assign (var) : script_->append (var)); // @@ Need to adjust to make strings the default type. // value_attributes (&var, lhs, move (rhs), kind); } void parser:: parse_test_line (token& t, token_type& tt, names_type ns, location nl) { // Stop recognizing variable assignments. // mode (lexer_mode::test_line); test ts; // Pending positions where the next word should go. // enum class pending { none, program, in_string, in_document, out_string, out_document, err_string, err_document }; pending p (pending::program); // Ordered sequence of here-document redirects that we can expect to // see after the command line. We temporarily store the end marker // as the redirect's value. // vector> hd; // Add the next word to either one of the pending positions or // to program arguments by default. // auto add_word = [&ts, &p, &hd, this] (string&& w, const location& l) { switch (p) { case pending::none: ts.arguments.push_back (move (w)); break; case pending::program: { try { ts.program = path (move (w)); if (ts.program.empty ()) fail (l) << "empty program path"; } catch (const invalid_path& e) { fail (l) << "invalid program path '" << e.path << "'"; } break; } case pending::in_document: hd.push_back (ts.in); // Fall through. case pending::in_string: ts.in.value = move (w); break; case pending::out_document: hd.push_back (ts.out); // Fall through. case pending::out_string: ts.out.value = move (w); break; case pending::err_document: hd.push_back (ts.err); // Fall through. case pending::err_string: ts.err.value = move (w); break; } p = pending::none; }; // Make sure we don't have any pending positions to fill. // auto check_pending = [&p, this] (const location& l) { const char* what (nullptr); switch (p) { case pending::none: break; case pending::program: what = "program"; break; case pending::in_string: what = "stdin here-string"; break; case pending::in_document: what = "stdin here-document end"; break; case pending::out_string: what = "stdout here-string"; break; case pending::out_document: what = "stdout here-document end"; break; case pending::err_string: what = "stderr here-string"; break; case pending::err_document: what = "stderr here-document end"; break; } if (what != nullptr) fail (l) << "missing " << what; }; // Parse the redirect operator. // auto parse_redirect = [&ts, &p, this] (const token& t, const location& l) { // Our semantics is the last redirect seen takes effect. // assert (p == pending::none); // See if we have the file descriptor. // unsigned long fd (3); if (!t.separated) { if (!ts.arguments.empty ()) fail (l) << "missing redirect file descriptor"; const string& s (ts.arguments.back ()); try { size_t n; fd = stoul (s, &n); if (n != s.size () || fd > 2) throw invalid_argument (string ()); } catch (const exception&) { fail (l) << "invalid redirect file descriptor '" << s << "'"; } ts.arguments.pop_back (); } type tt (t.type); // Validate/set default file descriptor. // switch (tt) { case type::in_null: case type::in_string: case type::in_document: { if ((fd = fd == 3 ? 0 : fd) != 0) fail (l) << "invalid in redirect file descriptor " << fd; break; } case type::out_null: case type::out_string: case type::out_document: { if ((fd = fd == 3 ? 1 : fd) == 0) fail (l) << "invalid out redirect file descriptor " << fd; break; } } redirect_type rt; switch (tt) { case type::in_null: case type::out_null: rt = redirect_type::null; break; case type::in_string: case type::out_string: rt = redirect_type::here_string; break; case type::in_document: case type::out_document: rt = redirect_type::here_document; break; } redirect& r (fd == 0 ? ts.in : fd == 1 ? ts.out : ts.err); r.type = rt; switch (rt) { case redirect_type::none: case redirect_type::null: break; case redirect_type::here_string: switch (fd) { case 0: p = pending::in_string; break; case 1: p = pending::out_string; break; case 2: p = pending::err_string; break; } break; case redirect_type::here_document: switch (fd) { case 0: p = pending::in_document; break; case 1: p = pending::out_document; break; case 2: p = pending::err_document; break; } break; } }; // Keep parsing chunks of the command line until we see the newline or // the exit status comparison. // for (bool done (false); !done; ) { // Process words that we already have. // bool q (lexer_->quoted ()); for (name& n: ns) { string s; try { s = value_traits::convert (move (n), nullptr); } catch (const invalid_argument&) { fail (nl) << "invalid string value '" << n << "'"; } // If it is a quoted chunk, then we add the word as is. Otherwise // we re-lex it. But if the word doesn't contain any interesting // characters (operators plus quotes/escapes), then no need to // re-lex. // if (q || s.find_first_of ("|&<>\'\"\\") == string::npos) add_word (move (s), nl); else { // Come up with a "path" that contains both the original // location as well as the expanded string. The resulting // diagnostics will look like this: // // testscript:10:1 ('abc): unterminated single quote // path name; { string n (nl.file->string ()); n += ':'; n += to_string (nl.line); n += ':'; n += to_string (nl.column); n += ": ("; n += s; n += ')'; name = path (move (n)); } istringstream is (s); lexer lex (is, name, lexer_mode::command_line); string w; bool f (true); // In case the whole thing is empty. // Treat the first "sub-token" as always separated from what we // saw earlier. // // Note that this is not "our" token so we cannot do fail(t). // Rather we should do fail(l). // token t (lex.next ()); location l (build2::get_location (t, name)); t.separated = true; for (; t.type != type::eos; t = lex.next ()) { type tt (t.type); l = build2::get_location (t, name); // Re-lexing double-quotes will recognize $, ( inside as // tokens so we have to reverse them back. Since we don't // treat spaces as separators we can be sure we will get it // right. // switch (tt) { case type::dollar: w += '$'; continue; case type::lparen: w += '('; continue; } // Retire the current word. We need to distinguish between // empty and non-existent (e.g., > vs >""). // if (!w.empty () || f) { add_word (move (w), l); f = false; } if (tt == type::name) { w = move (t.value); f = true; continue; } // If this is one of the operators/separators, check that we // don't have any pending locations to be filled. // check_pending (l); // Note: there is another one in the outer loop below. // switch (tt) { case type::in_null: case type::in_string: case type::in_document: case type::out_null: case type::out_string: case type::out_document: parse_redirect (t, l); break; } } // Don't forget the last word. // if (!w.empty () || f) add_word (move (w), l); } } switch (tt) { case type::equal: case type::not_equal: case type::newline: { done = true; continue; } default: { // Parse the next chunk. // ns.clear (); lexer_->reset_quoted (t.quoted); nl = get_location (t); names (t, tt, ns, true, "command"); continue; } } // If this is one of the operators/separators, check that we don't // have any pending locations to be filled. // check_pending (nl); // Note: there is another one in the inner loop above. // switch (tt) { case type::in_null: case type::in_string: case type::in_document: case type::out_null: case type::out_string: case type::out_document: parse_redirect (t, get_location (t)); next (t, tt); break; } } // Verify we don't have anything pending to be filled. // check_pending (nl); // While we no longer need to recognize command line operators, we // also don't expect a valid test trailer to contain them. So we are // going to continue lexing in the test_line mode. // if (tt == type::equal || tt == type::not_equal) { next (t, tt); ts.exit = parse_command_exit (t, tt); } if (tt != type::newline) fail (t) << "unexpected " << t; expire_mode (); // Done parsing test-line. // Parse here-document fragments in the order they were mentioned on // the command line. // for (redirect& r: hd) { // Switch to the here-line mode which is like double-quoted but // recognized the newline as a separator. // mode (lexer_mode::here_line); next (t, tt); // The end marker is temporarily stored as the redirect's value. // r.value = parse_here_document (t, tt, r.value); expire_mode (); } // Now that we have all the pieces, run the test. // runner_->run (ts); } command_exit parser:: parse_command_exit (token& t, token_type& tt) { // The next chunk should be the exit status. // names_type ns (names (t, tt, true, "exit status")); //@@ TODO: validate to be single, simple, non-empty name that // converts to integer (is exit status always non-negative). return command_exit {exit_comparison::eq, 0}; } string parser:: parse_here_document (token& t, token_type& tt, const string& em) { string r; while (tt != type::eos) { // Check if this is the end marker. // if (tt == type::name && !t.quoted && t.value == em && peek () == type::newline) { next (t, tt); // Get the newline. break; } // Expand the line. // names_type ns (names (t, tt, false, "here-document line")); // What shall we do if the expansion results in multiple names? For, // example if the line contains just the variable expansion and it // is of type strings. Adding all the elements space-separated seems // like the natural thing to do. // for (auto b (ns.begin ()), i (b); i != ns.end (); ++i) { string s; try { s = value_traits::convert (move (*i), nullptr); } catch (const invalid_argument&) { fail (t) << "invalid string value '" << *i << "'"; } if (i != b) r += ' '; r += s; r += '\n'; // Here-document line always includes a newline. } // We should expand the whole line at once so this would normally be // a newline but can also be an end-of-stream. // if (tt == type::newline) next (t, tt); else assert (tt == type::eos); } if (tt == type::eos) fail (t) << "missing here-document end marker '" << em << "'"; return r; } lookup parser:: lookup_variable (name&& qual, string&& name, const location& l) { if (!qual.empty ()) fail (l) << "qualified variable name"; const variable& var (script_->var_pool.insert (move (name))); return script_->find (var); } } } }