From 5381b25c51475c0c7a2f39e9f6efa623f621ef3e Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Thu, 13 Oct 2016 13:08:31 +0200
Subject: Continue work on testscript parser

---
 build2/test/script/lexer      |   3 +-
 build2/test/script/lexer.cxx  |  28 ++-
 build2/test/script/parser     |  21 +--
 build2/test/script/parser.cxx | 403 +++++++++++++++++++++++++++++++++++++-----
 build2/test/script/script     |  39 ++++
 doc/testscript.cli            |  66 ++++---
 6 files changed, 467 insertions(+), 93 deletions(-)

diff --git a/build2/test/script/lexer b/build2/test/script/lexer
index de4c84e..f3d8fa9 100644
--- a/build2/test/script/lexer
+++ b/build2/test/script/lexer
@@ -27,7 +27,8 @@ namespace build2
           script_line = base_type::value_next,
           variable_line,
           test_line,
-          command_line
+          command_line,
+          here_line
         };
 
         using base_type::base_type;
diff --git a/build2/test/script/lexer.cxx b/build2/test/script/lexer.cxx
index 84be7c1..e0a3272 100644
--- a/build2/test/script/lexer.cxx
+++ b/build2/test/script/lexer.cxx
@@ -25,24 +25,24 @@ namespace build2
         {
         case lexer_mode::script_line:
           {
-            s1 = "=+!|&<> $()#\t\n";
-            s2 = " ==           ";
+            s1 = "=+!|&<> $(#\t\n";
+            s2 = " ==          ";
             break;
           }
         case lexer_mode::variable_line:
           {
             // Like value except we don't recognize {.
             //
-            s1 = " $()[]#\t\n";
-            s2 = "         ";
+            s1 = " $([]#\t\n";
+            s2 = "        ";
             break;
           }
         case lexer_mode::test_line:
           {
             // As script_line but without variable assignments.
             //
-            s1 = "=!|&<> $()#\t\n";
-            s2 = "==           ";
+            s1 = "=!|&<> $(#\t\n";
+            s2 = "==          ";
             break;
           }
         case lexer_mode::command_line:
@@ -54,6 +54,16 @@ namespace build2
             s = false;
             break;
           }
+        case lexer_mode::here_line:
+          {
+            // This one is like a double-quoted string except it treats
+            // newlines as a separator.
+            //
+            s1 = "$(\n";
+            s2 = "   ";
+            s = false;
+            break;
+          }
         case lexer_mode::single_quoted:
         case lexer_mode::double_quoted:
           quoted_ = true;
@@ -77,8 +87,9 @@ namespace build2
         case lexer_mode::script_line:
         case lexer_mode::variable_line:
         case lexer_mode::test_line:
-        case lexer_mode::command_line: return next_line ();
-        default:                       return base_lexer::next_impl ();
+        case lexer_mode::command_line:
+        case lexer_mode::here_line:      return next_line ();
+        default:                         return base_lexer::next_impl ();
         }
       }
 
@@ -110,7 +121,6 @@ namespace build2
             //
           case '$': return token (type::dollar, sep, ln, cn);
           case '(': return token (type::lparen, sep, ln, cn);
-          case ')': return token (type::rparen, sep, ln, cn);
           }
         }
 
diff --git a/build2/test/script/parser b/build2/test/script/parser
index 720a077..0ba4710 100644
--- a/build2/test/script/parser
+++ b/build2/test/script/parser
@@ -25,11 +25,9 @@ namespace build2
       class parser: protected build2::parser
       {
       public:
-        using script_type = test::script::script;
-
         // Issue diagnostics and throw failed in case of an error.
         //
-        script_type
+        script
         parse (istream&, const path& name, target& test, target& script);
 
         // Recursive descent parser.
@@ -40,25 +38,28 @@ namespace build2
         //
       protected:
         void
-        script (token&, token_type&);
+        parse_script (token&, token_type&);
 
         void
-        script_line (token&, token_type&);
+        parse_script_line (token&, token_type&);
 
         void
-        variable_line (token&, token_type&, string);
+        parse_variable_line (token&, token_type&, string);
 
         void
-        test_line (token&, token_type&, names_type, location);
+        parse_test_line (token&, token_type&, names_type, location);
 
-        void
-        command_exit (token&, token_type&);
+        command_exit
+        parse_command_exit (token&, token_type&);
+
+        string
+        parse_here_document (token&, token_type&, const string&);
 
       protected:
         using base_parser = build2::parser;
 
         lexer* lexer_;
-        script_type* script_;
+        script* script_;
       };
     }
   }
diff --git a/build2/test/script/parser.cxx b/build2/test/script/parser.cxx
index aba9f9a..ce867b3 100644
--- a/build2/test/script/parser.cxx
+++ b/build2/test/script/parser.cxx
@@ -25,14 +25,14 @@ namespace build2
         lexer_ = &l;
         base_parser::lexer_ = &l;
 
-        script_type r (test_t, script_t);
+        script r (test_t, script_t);
         script_ = &r;
 
         token t (type::eos, false, 0, 0);
         type tt;
         next (t, tt);
 
-        script (t, tt);
+        parse_script (t, tt);
 
         if (tt != type::eos)
           fail (t) << "unexpected " << t;
@@ -41,16 +41,16 @@ namespace build2
       }
 
       void parser::
-      script (token& t, token_type& tt)
+      parse_script (token& t, token_type& tt)
       {
         while (tt != type::eos)
         {
-          script_line (t, tt);
+          parse_script_line (t, tt);
         }
       }
 
       void parser::
-      script_line (token& t, token_type& tt)
+      parse_script_line (token& t, token_type& tt)
       {
         // Parse first chunk. Keep track of whether anything in it was quoted.
         //
@@ -98,14 +98,14 @@ namespace build2
           if (ns.size () != 1 || !ns[0].simple () || ns[0].empty ())
             fail (nl) << "variable name expected instead of '" << ns << "'";
 
-          variable_line (t, tt, move (ns[0].value));
+          parse_variable_line (t, tt, move (ns[0].value));
         }
         else
-          test_line (t, tt, move (ns), move (nl));
+          parse_test_line (t, tt, move (ns), move (nl));
       }
 
       void parser::
-      variable_line (token& t, token_type& tt, string name)
+      parse_variable_line (token& t, token_type& tt, string name)
       {
         type kind (tt); // Assignment kind.
         const variable_type& var (script_->var_pool.insert (move (name)));
@@ -125,18 +125,196 @@ namespace build2
       }
 
       void parser::
-      test_line (token& t, token_type& tt, names_type ns, location nl)
+      parse_test_line (token& t, token_type& tt, names_type ns, location nl)
       {
         // Stop recognizing variable assignments.
         //
         mode (lexer_mode::test_line);
 
+        test ts;
+
+        // Pending positions where the next word should go.
+        //
+        enum class pending
+        {
+          none,
+          program,
+          in_string,
+          in_document,
+          out_string,
+          out_document,
+          err_string,
+          err_document
+        };
+        pending p (pending::program);
+
+        // Ordered sequence of here-document redirects that we can expect to
+        // see after the command line. We temporarily store the end marker
+        // as the redirect's value.
+        //
+        vector<reference_wrapper<redirect>> hd;
+
+        // Store the word in the pending position (also queueing here-document
+        // redirects in hd) or, if none, as an argument; then reset pending.
+        //
+        auto add_word = [&ts, &p, &hd, this] (string&& w, const location& l)
+        {
+          switch (p)
+          {
+          case pending::none: ts.arguments.push_back (move (w)); break;
+          case pending::program:
+          {
+            try
+            {
+              ts.program = path (move (w));
+
+              if (ts.program.empty ())
+                fail (l) << "empty program path";
+            }
+            catch (const invalid_path& e)
+            {
+              fail (l) << "invalid program path '" << e.path << "'";
+            }
+            break;
+          }
+          case pending::in_document: hd.push_back (ts.in); // Fall through.
+          case pending::in_string: ts.in.value = move (w); break;
+
+          case pending::out_document: hd.push_back (ts.out); // Fall through.
+          case pending::out_string: ts.out.value = move (w); break;
+
+          case pending::err_document: hd.push_back (ts.err); // Fall through.
+          case pending::err_string: ts.err.value = move (w); break;
+          }
+
+          p = pending::none;
+        };
+
+        // Make sure we don't have any pending positions to fill.
+        //
+        auto check_pending = [&p, this] (const location& l)
+        {
+          const char* what (nullptr);
+
+          switch (p) // Current state (p is captured by reference).
+          {
+          case pending::none:                                            break;
+          case pending::program:      what = "program";                  break;
+          case pending::in_string:    what = "stdin here-string";        break;
+          case pending::in_document:  what = "stdin here-document end";  break;
+          case pending::out_string:   what = "stdout here-string";       break;
+          case pending::out_document: what = "stdout here-document end"; break;
+          case pending::err_string:   what = "stderr here-string";       break;
+          case pending::err_document: what = "stderr here-document end"; break;
+          }
+
+          if (what != nullptr)
+            fail (l) << "missing " << what;
+        };
+
+        // Parse the redirect operator t (located at l): work out its file
+        // descriptor, set the redirect type, and arm pending for its value.
+        auto parse_redirect =
+          [&ts, &p, this] (const token& t, const location& l)
+        {
+          // Our semantics is the last redirect seen takes effect.
+          //
+          assert (p == pending::none);
+
+          // See if we have the file descriptor (the preceding, unseparated
+          // word which was added as an argument). 3 means not specified.
+          unsigned long fd (3);
+          if (!t.separated)
+          {
+            if (ts.arguments.empty ()) // Nothing that could be a descriptor.
+              fail (l) << "missing redirect file descriptor";
+
+            const string& s (ts.arguments.back ());
+
+            try
+            {
+              size_t n;
+              fd = stoul (s, &n);
+
+              if (n != s.size () || fd > 2)
+                throw invalid_argument (string ());
+            }
+            catch (const exception&)
+            {
+              fail (l) << "invalid redirect file descriptor '" << s << "'";
+            }
+
+            ts.arguments.pop_back ();
+          }
+
+          type tt (t.type);
+
+          // Validate/set default file descriptor.
+          //
+          switch (tt)
+          {
+          case type::in_null:
+          case type::in_string:
+          case type::in_document:
+            {
+              if ((fd = fd == 3 ? 0 : fd) != 0)
+                fail (l) << "invalid in redirect file descriptor " << fd;
+
+              break;
+            }
+          case type::out_null:
+          case type::out_string:
+          case type::out_document:
+            {
+              if ((fd = fd == 3 ? 1 : fd) == 0)
+                fail (l) << "invalid out redirect file descriptor " << fd;
+
+              break;
+            }
+          }
+
+          redirect_type rt (redirect_type::none); // In case of no match below.
+          switch (tt)
+          {
+          case type::in_null:
+          case type::out_null:     rt = redirect_type::null;          break;
+          case type::in_string:
+          case type::out_string:   rt = redirect_type::here_string;   break;
+          case type::in_document:
+          case type::out_document: rt = redirect_type::here_document; break;
+          }
+
+          redirect& r (fd == 0 ? ts.in : fd == 1 ? ts.out : ts.err);
+          r.type = rt;
+
+          switch (rt)
+          {
+          case redirect_type::none:
+          case redirect_type::null:
+            break;
+          case redirect_type::here_string:
+            switch (fd)
+            {
+            case 0: p = pending::in_string;  break;
+            case 1: p = pending::out_string; break;
+            case 2: p = pending::err_string; break;
+            }
+            break;
+          case redirect_type::here_document:
+            switch (fd)
+            {
+            case 0: p = pending::in_document;  break;
+            case 1: p = pending::out_document; break;
+            case 2: p = pending::err_document; break;
+            }
+            break;
+          }
+        };
+
         // Keep parsing chunks of the command line until we see the newline or
         // the exit status comparison.
         //
-        strings cmd;
-
-        do
+        for (bool done (false); !done; )
         {
           // Process words that we already have.
           //
@@ -161,7 +339,7 @@ namespace build2
             // re-lex.
             //
             if (q || s.find_first_of ("|&<>\'\"\\") == string::npos)
-              cmd.push_back (move (s));
+              add_word (move (s), nl);
             else
             {
               // Come up with a "path" that contains both the original
@@ -188,19 +366,28 @@ namespace build2
 
               string w;
               bool f (true); // In case the whole thing is empty.
-              for (token t (lex.next ()); t.type != type::eos; t = lex.next ())
+
+              // Treat the first "sub-token" as always separated from what we
+              // saw earlier.
+              //
+              // Note that this is not "our" token so we cannot do fail(t).
+              // Rather we should do fail(l).
+              //
+              token t (lex.next ());
+              location l (build2::get_location (t, name));
+              t.separated = true;
+
+              for (; t.type != type::eos; t = lex.next ())
               {
-                // Note that this is not "our" token so we cannot do fail(t).
-                // Rather we should do fail(l).
-                //
-                location l (build2::get_location (t, lex.name ()));
+                type tt (t.type);
+                l = build2::get_location (t, name);
 
                 // Re-lexing double-quotes will recognize $, ( inside as
                 // tokens so we have to reverse them back. Since we don't
                 // treat spaces as separators we can be sure we will get it
                 // right.
                 //
-                switch (t.type)
+                switch (tt)
                 {
                 case type::dollar: w += '$'; continue;
                 case type::lparen: w += '('; continue;
@@ -211,28 +398,33 @@ namespace build2
                 //
                 if (!w.empty () || f)
                 {
-                  cmd.push_back (move (w));
+                  add_word (move (w), l);
                   f = false;
                 }
 
-                switch (t.type)
+                if (tt == type::name)
                 {
-                case type::name: w = move (t.value); f = true; break;
+                  w = move (t.value);
+                  f = true;
+                  continue;
+                }
 
-                  // @@ TODO
-                  //
-                case type::pipe:
-                case type::clean:
-                case type::log_and:
-                case type::log_or:
+                // If this is one of the operators/separators, check that we
+                // don't have any pending locations to be filled.
+                //
+                check_pending (l);
 
+                // Note: there is another one in the outer loop below.
+                //
+                switch (tt)
+                {
                 case type::in_null:
                 case type::in_string:
                 case type::in_document:
-
                 case type::out_null:
                 case type::out_string:
                 case type::out_document:
+                  parse_redirect (t, l);
                   break;
                 }
               }
@@ -240,43 +432,164 @@ namespace build2
               // Don't forget the last word.
               //
               if (!w.empty () || f)
-                cmd.push_back (move (w));
+                add_word (move (w), l);
             }
           }
 
-          if (tt == type::newline ||
-              tt == type::equal   ||
-              tt == type::not_equal)
-            break;
+          switch (tt)
+          {
+          case type::equal:
+          case type::not_equal:
+          case type::newline:
+            {
+              done = true;
+              break;
+            }
+          default:
+            {
+              // Parse the next chunk.
+              //
+              ns.clear ();
+              lexer_->reset_quoted (t.quoted);
+              nl = get_location (t);
+              names (t, tt, ns, true);
+              continue;
+            }
+          }
 
-          // Parse the next chunk.
+          // If this is one of the operators/separators, check that we don't
+          // have any pending locations to be filled.
           //
-          ns.clear ();
-          lexer_->reset_quoted (t.quoted);
-          names (t, tt, ns, true);
+          check_pending (nl);
 
-        } while (true);
+          // Note: there is another one in the inner loop above.
+          //
+          switch (tt)
+          {
+          case type::in_null:
+          case type::in_string:
+          case type::in_document:
+          case type::out_null:
+          case type::out_string:
+          case type::out_document:
+            parse_redirect (t, get_location (t));
+            next (t, tt);
+            break;
+          }
+        }
 
-        //@@ switch mode (we no longer want to recognize command operators)?
+        // Verify we don't have anything pending to be filled.
+        //
+        check_pending (nl);
 
+        // While we no longer need to recognize command line operators, we
+        // also don't expect a valid test trailer to contain them. So we are
+        // going to continue lexing in the test_line mode.
+        //
         if (tt == type::equal || tt == type::not_equal)
         {
-          command_exit (t, tt);
+          next (t, tt);
+          ts.exit = parse_command_exit (t, tt);
         }
 
-        // here-document
+        if (tt != type::newline)
+          fail (t) << "unexpected " << t;
+
+        expire_mode (); // Done parsing test-line.
+
+        // Parse here-document fragments in the order they were mentioned on
+        // the command line. The end marker is temporarily stored as the
+        // redirect's value.
+        //
+        if (!hd.empty ())
+        {
+          // Switch to the here-line mode which is like double-quoted but
+          // recognizes the newline as a separator.
+          //
+          mode (lexer_mode::here_line);
+          next (t, tt);
+
+          for (redirect& r: hd)
+            r.value = parse_here_document (t, tt, r.value);
+
+          expire_mode ();
+        }
       }
 
-      void parser::
-      command_exit (token& t, token_type& tt)
+      command_exit parser::
+      parse_command_exit (token& t, token_type& tt)
       {
         // The next chunk should be the exit status.
         //
-        next (t, tt);
         names_type ns (names (t, tt, true));
 
         //@@ TODO: validate to be single, simple, non-empty name that
         //         converts to integer (is exit status always non-negative).
+
+        return command_exit {exit_comparison::eq, 0};
+      }
+
+      string parser::
+      parse_here_document (token& t, token_type& tt, const string& em)
+      {
+        string r; // Expanded document text, each line terminated with '\n'.
+
+        while (tt != type::eos)
+        {
+          // Check if this is the end marker (an unquoted word equal to em
+          // that is immediately followed by a newline).
+          if (tt == type::name &&
+              !t.quoted        &&
+              t.value == em    &&
+              peek () == type::newline)
+          {
+            next (t, tt); // Get the newline.
+            break;
+          }
+
+          // Expand the line.
+          //
+          names_type ns (names (t, tt));
+
+          // What shall we do if the expansion results in multiple names? For
+          // example, if the line contains just the variable expansion and it
+          // is of type strings. Adding all the elements space-separated seems
+          // like the natural thing to do.
+          //
+          for (auto b (ns.begin ()), i (b); i != ns.end (); ++i)
+          {
+            string s;
+
+            try
+            {
+              s = value_traits<string>::convert (move (*i), nullptr);
+            }
+            catch (const invalid_argument&)
+            {
+              fail (t) << "invalid string value '" << *i << "'";
+            }
+
+            if (i != b)
+              r += ' ';
+
+            r += s;
+          }
+          r += '\n'; // Here-document line always includes a newline.
+
+          // We should expand the whole line at once so this would normally be
+          // a newline but can also be an end-of-stream.
+          //
+          if (tt == type::newline)
+            next (t, tt);
+          else
+            assert (tt == type::eos);
+        }
+
+        if (tt == type::eos)
+          fail (t) << "missing here-document end marker '" << em << "'";
+
+        next (t, tt);
+        return r;
       }
     }
   }
diff --git a/build2/test/script/script b/build2/test/script/script
index de81fa6..cda4feb 100644
--- a/build2/test/script/script
+++ b/build2/test/script/script
@@ -18,6 +18,45 @@ namespace build2
   {
     namespace script
     {
+      enum class redirect_type
+      {
+        none,         // Default (see redirect::type).
+        null,         // Set for the in_null/out_null tokens; no value.
+        here_string,  // Value is the string.
+        here_document // Value is the document (end marker while parsing).
+      };
+
+      struct redirect
+      {
+        redirect_type type = redirect_type::none;
+        string value; // Meaning depends on type (see redirect_type).
+      };
+
+      struct command
+      {
+        path program;      // Set from the first word of the command line.
+        strings arguments; // Subsequent words not consumed by redirects.
+
+        redirect in;  // File descriptor 0.
+        redirect out; // File descriptor 1.
+        redirect err; // File descriptor 2.
+      };
+
+      enum class exit_comparison {eq, ne}; // Presumably '==' and '!='.
+
+      struct command_exit
+      {
+        // @@ Need to understand what type we should use for status.
+
+        exit_comparison comparison = exit_comparison::eq;
+        uint16_t status = 0; // Status to compare the exit status to.
+      };
+
+      struct test: command
+      {
+        command_exit exit; // Exit status comparison (by default == 0).
+      };
+
       class script
       {
       public:
diff --git a/doc/testscript.cli b/doc/testscript.cli
index 1aff571..a5aef9d 100644
--- a/doc/testscript.cli
+++ b/doc/testscript.cli
@@ -711,23 +711,33 @@ command-exit: ('=='|'!=') <exit-status>
 
 command: <path> (' '+ <arg>)* {stdin? stdout? stderr?}
 
-stdin:  '0'? ('<!'|\
-              '<' <text>|\
-              '<<' <here-end>)
+stdin:  '0'?('<!'|\
+             '<' <text>|\
+             '<<' <here-end>)
 
-stdout: '1'? ('>!'|\
-              '>' <text>|\
-              '>>' <here-end>)
+stdout: '1'?('>!'|\
+             '>' <text>|\
+             '>>' <here-end>)
 
-stderr: '2'  ('>!'|\
-              '>' <text>|\
-              '>>' <here-end>)
+stderr:  '2'('>!'|\
+             '>' <text>|\
+             '>>' <here-end>)
 
 here-document:
   <text>*
   <here-end>
 \
 
+Note that if specified, file descriptors must not be separated from the
+redirect operator with whitespace. For example, the following command
+has \c{2} as an argument and redirects \c{stdout}, not \c{stderr}.
+
+Here-line is like a double-quoted string but recognizes newlines as separators.
+
+\
+$* 2 >!
+\
+
 \
 script:
   (script-scope|script-line)*
@@ -782,25 +792,25 @@ command-pipe: command ('|' command)*
 
 command: <path> (' '+ <arg>)* {stdin? stdout? stderr? cleanup*}
 
-stdin:  '0'? ('<!'|\
-              '<?'|\
-              '<' <text>|\
-              '<<' <here-end>|\
-              '<<<' <file>)
-
-stdout: '1'? ('>!'|\
-              '>?'|\
-              '>&' '2'|\
-              '>' <text>|\
-              '>>' <here-end>|\
-              ('>>>'|'>>>&') <file>)
-
-stderr: '2'  ('>!'|\
-              '>?'|\
-              '>&' '1' |\
-              '>' <text>|\
-              '>>' <here-end>|\
-              ('>>>'|'>>>&') <file>)
+stdin:      ('<!'|\
+             '<?'|\
+             '<' <text>|\
+             '<<' <here-end>|\
+             '<<<' <file>)
+
+stdout:     ('>!'|\
+             '>?'|\
+             '>&' '2'|\
+             '>' <text>|\
+             '>>' <here-end>|\
+             ('>>>'|'>>>&') <file>)
+
+stderr: '2' ('>!'|\
+             '>?'|\
+             '>&' '1' |\
+             '>' <text>|\
+             '>>' <here-end>|\
+             ('>>>'|'>>>&') <file>)
 
 cleanup: '&' (<file>|<dir>)
 
-- 
cgit v1.1