Add support for portable path modifier and dot character escaping inversion

author: Karen Arutyunov <karen@codesynthesis.com> 2017-01-11 01:43:09 +0300
committer: Karen Arutyunov <karen@codesynthesis.com> 2017-01-19 17:56:07 +0300
commit: a83f3866667bca073c4d4c5d80b4deb5ac05906c (patch)
tree: 479464203f6be4535c8f165a20d21322a88a2751 /build2/test/script/parser.cxx
parent: ba99b60aeb8ccdeffc777589b99728395cd28f95 (diff)
1 files changed, 187 insertions, 199 deletions
diff --git a/build2/test/script/parser.cxx b/build2/test/script/parser.cxx
index f381118..4b1c777 100644
--- a/build2/test/script/parser.cxx
+++ b/build2/test/script/parser.cxx
@@ -5,7 +5,6 @@
 #include <build2/test/script/parser>
 
 #include <sstream>
-#include <cstring> // strstr()
 
 #include <build2/scheduler>
 
@@ -14,39 +13,6 @@
 
 using namespace std;
 
-namespace std
-{
-  // Print regex error description but only if it is meaningful (this is also
-  // why we have to print leading colon here).
-  //
-  // Currently libstdc++ just returns the name of the exception (bug #67361).
-  // So we check that the description contains at least one space character.
-  //
-  // While VC's description is meaningful, it has an undesired prefix that
-  // resembles the following: 'regex_error(error_badrepeat): '. So we skip it.
-  //
-  static ostream&
-  operator<< (ostream& o, const regex_error& e)
-  {
-    const char* d (e.what ());
-
-#if defined(_MSC_VER) && _MSC_VER <= 1910
-    const char* rd (strstr (d, "): "));
-    if (rd != nullptr)
-      d = rd + 3;
-#endif
-
-    ostringstream os;
-    os << runtime_error (d); // Sanitize the description.
-
-    string s (os.str ());
-    if (s.find (' ') != string::npos)
-      o << ": " << s;
-
-    return o;
-  }
-}
-
 namespace build2
 {
   namespace test
@@ -1340,23 +1306,22 @@ namespace build2
 
       // Parse the regular expression representation (non-empty string value
       // framed with introducer characters and optionally followed by flag
-      // characters from the {i} set, for example '/foo/i') into
+      // characters from the {di} set, for example '/foo/id') into
       // components. Also return end-of-parsing position if requested,
       // otherwise treat any unparsed characters left as an error.
       //
       struct regex_parts
       {
         string value;
-        char introducer;
-        regex::char_flags flags; // {icase}
+        char   intro;
+        string flags; // Combination of characters from {di} set.
 
         // Create a special empty object.
         //
-        regex_parts ()
-            : introducer ('\0'), flags (regex::char_flags ()) {}
+        regex_parts (): intro ('\0') {}
 
-        regex_parts (string v, char i, regex::char_flags f)
-            : value (move (v)), introducer (i), flags (f) {}
+        regex_parts (string v, char i, string f)
+            : value (move (v)), intro (i), flags (move (f)) {}
       };
 
       static regex_parts
@@ -1377,10 +1342,10 @@ namespace build2
         if (rn == 0)
           fail (l) << what << " is empty";
 
-        bool icase (s[++p] == 'i'); // Note: s[++p] can be '\0' (no flags).
-
-        if (icase)
-          ++p;
+        // Find end-of-flags position.
+        //
+        size_t fp (++p); // Save flags starting position.
+        for (char c; (c = s[p]) == 'd' || c == 'i'; ++p) ;
 
         // If string end is not reached then report invalid flags, unless
         // end-of-parsing position is requested (which means regex is just a
@@ -1392,11 +1357,7 @@ namespace build2
         if (end != nullptr)
           *end = p;
 
-        return regex_parts (string (s, 1, rn),
-                            s[0],
-                            icase
-                            ? regex::char_regex::icase
-                            : regex::char_flags ());
+        return regex_parts (string (s, 1, rn), s[0], string (s, fp, p - fp));
       }
 
       pair<command_expr, parser::here_docs> parser::
@@ -1419,6 +1380,27 @@ namespace build2
             fail (l) << "stdout and stderr redirected to each other";
         };
 
+        // Check that the introducer character differs from '/' if the
+        // portable path modifier is specified. Must be called before
+        // parse_regex() (see below) to make sure its diagnostics is
+        // meaningful.
+        //
+        // Note that the portable path modifier assumes '/' to be a valid
+        // regex character and so makes it indistinguishable from the
+        // terminating introducer.
+        //
+        auto check_regex_mod = [this] (const string& mod,
+                                       const string& re,
+                                       const location& l,
+                                       const char* what)
+        {
+          // Handles empty regex properly.
+          //
+          if (mod.find ('/') != string::npos && re[0] == '/')
+            fail (l) << "portable path modifier and '/' introducer in "
+                     << what;
+        };
+
         // Pending positions where the next word should go.
         //
         enum class pending
@@ -1449,7 +1431,8 @@ namespace build2
         // Add the next word to either one of the pending positions or to
         // program arguments by default.
         //
-        auto add_word = [&c, &p, &mod, this] (string&& w, const location& l)
+        auto add_word = [&c, &p, &mod, &check_regex_mod, this] (
+          string&& w, const location& l)
         {
           auto add_merge = [&l, this] (redirect& r, const string& w, int fd)
           {
@@ -1468,18 +1451,16 @@ namespace build2
                      << "file descriptor must be " << fd;
           };
 
-          auto add_here_str = [&mod] (redirect& r, string&& w)
+          auto add_here_str = [] (redirect& r, string&& w)
           {
-            if (mod.find (':') == string::npos)
+            if (r.modifiers.find (':') == string::npos)
               w += '\n';
             r.str = move (w);
           };
 
-          auto add_here_str_regex = [&l, &mod, this] (
+          auto add_here_str_regex = [&l, &check_regex_mod, this] (
             redirect& r, int fd, string&& w)
           {
-            using namespace regex;
-
             const char* what (nullptr);
             switch (fd)
             {
@@ -1487,36 +1468,23 @@ namespace build2
             case 2: what = "stderr regex redirect"; break;
             }
 
-            line_pool pool;
-            line_string s;
+            check_regex_mod (r.modifiers, w, l, what);
 
-            try
-            {
-              regex_parts re (parse_regex (w, l, what));
-              s += line_char (char_regex (re.value,
-                                          char_regex::ECMAScript | re.flags),
-                              pool);
-            }
-            catch (const regex_error& e)
-            {
-              // Print regex_error description if meaningful.
-              //
-              fail (l) << "invalid " << what << e <<
-                info << "regex: " << w;
-            }
+            regex_parts rp (parse_regex (w, l, what));
 
-            if (mod.find (':') == string::npos)
-            {
-              w += '\n';
-              s += line_char ("", pool);
-            }
+            regex_lines& re (r.regex);
+            re.intro = rp.intro;
 
-            r.regex.str = move (w);
+            re.lines.emplace_back (
+              l.line, l.column, move (rp.value), move (rp.flags));
 
-            // No special line-chars, so no way to try to create a malformed
-            // expression, and so can't throw.
+            // Add final blank line unless suppressed.
             //
-            r.regex.regex = line_regex (move (s), move (pool));
+            // Note that the position is synthetic, but that's ok as we don't
+            // expect any diagnostics to refer to this line.
+            //
+            if (r.modifiers.find (':') == string::npos)
+              re.lines.emplace_back (l.line, l.column, string (), false);
           };
 
           auto parse_path = [&l, this] (string&& w, const char* what) -> path
@@ -1539,7 +1507,7 @@ namespace build2
             }
           };
 
-          auto add_file = [&mod, &parse_path] (redirect& r, int fd, string&& w)
+          auto add_file = [&parse_path] (redirect& r, int fd, string&& w)
           {
             const char* what (nullptr);
             switch (fd)
@@ -1550,7 +1518,7 @@ namespace build2
             }
 
             r.file.path = parse_path (move (w), what);
-            r.file.append = mod.find ('&') != string::npos;
+            r.file.append = r.modifiers.find ('&') != string::npos;
           };
 
           switch (p)
@@ -1771,6 +1739,11 @@ namespace build2
           redirect& r (fd == 0 ? c.in : fd == 1 ? c.out : c.err);
           r = redirect (rt);
 
+          // Don't move as still may be used for pending here-document end
+          // marker processing.
+          //
+          r.modifiers = mod;
+
           switch (rt)
           {
           case redirect_type::none:
@@ -1974,6 +1947,8 @@ namespace build2
 
                   if (re)
                   {
+                    check_regex_mod (mod, end, l, what);
+
                     r = parse_regex (end, l, what);
                     end = move (r.value); // The "cleared" end marker.
                   }
@@ -1984,7 +1959,7 @@ namespace build2
                       move (end),
                       qt == quote_type::single,
                       move (mod),
-                      r.introducer, r.flags});
+                      r.intro, move (r.flags)});
                   break;
                 }
 
@@ -2099,7 +2074,7 @@ namespace build2
                       (t.qtype == quote_type::unquoted ||
                        t.qtype == quote_type::single),
                       move (mod),
-                      r.introducer, r.flags});
+                      r.intro, move (r.flags)});
 
                   p = pending::none;
                   mod.clear ();
@@ -2396,54 +2371,43 @@ namespace build2
                 : lexer_mode::here_line_double);
           next (t, tt);
 
-          pair<string, regex::line_regex> v (
-            parse_here_document (
-              t, tt, h.end, h.modifiers, h.regex, h.regex_flags));
+          parsed_doc v (
+            parse_here_document (t, tt, h.end, h.modifiers, h.regex));
 
           if (!pre_parse_)
           {
             command& c (p.first[h.expr].pipe[h.pipe]);
             redirect& r (h.fd == 0 ? c.in : h.fd == 1 ? c.out : c.err);
 
-            if (h.regex)
+            if (v.re)
             {
-              r.regex.str   = move (v.first);
-              r.regex.regex = move (v.second);
-
-              // Restore the original end marker.
-              //
-              r.end = h.regex + h.end + h.regex;
-              if ((h.regex_flags & regex::char_regex::icase) != 0)
-                r.end += 'i';
+              r.regex = move (v.regex);
+              r.regex.flags = move (h.regex_flags);
             }
             else
-            {
-              r.str = move (v.first);
-              r.end = move (h.end);
-            }
+              r.str = move (v.str);
+
+            r.end        = move (h.end);
+            r.end_line   = v.end_line;
+            r.end_column = v.end_column;
           }
 
           expire_mode ();
         }
       }
 
-      pair<string, regex::line_regex> parser::
+      parser::parsed_doc parser::
       parse_here_document (token& t, type& tt,
                            const string& em,
                            const string& mod,
-                           char re,
-                           regex::char_flags refl)
+                           char re)
       {
         // enter: first token on first line
         // leave: newline (after end marker)
 
-        using namespace regex;
-
-        string rs; // String or regex literal.
+        string rs; // String literal.
 
-        line_pool pool;
-        line_string ls;
-        line_regex rre;
+        regex_lines rre;
 
         // Here-documents can be indented. The leading whitespaces of the end
         // marker line (called strip prefix) determine the indentation. Every
@@ -2465,8 +2429,7 @@ namespace build2
 
         // We will use the location of the first token on the line for the
         // regex diagnostics. At the end of the loop it will point to the
-        // beginning of the end marker which we use for diagnostics of the
-        // line_regex object creation.
+        // beginning of the end marker.
         //
         location l;
 
@@ -2543,97 +2506,93 @@ namespace build2
               }
             }
 
-            // Add newline after previous line.
-            //
-            if (!rs.empty ())
-              rs += '\n';
-
-            rs += s;
+            if (!re)
+            {
+              // Add newline after previous line.
+              //
+              if (!rs.empty ())
+                rs += '\n';
 
-            if (re)
+              rs += s;
+            }
+            else
             {
-              if (s[0] == re) // Line starts with the regex introducer.
+              // Due to expansion we can end up with multiple lines. If empty
+              // then will add a blank textual literal.
+              //
+              for (size_t p (0); p != string::npos; )
               {
-                size_t n (s.size ());
+                string ln;
+                size_t np (s.find ('\n', p));
 
-                // Handle the empty line-regex characters.
-                //
-                if (n == 1)
-                  fail (l) << "regex introducer without regex" <<
-                    info << "consider changing regex introducer '" << re
-                           << "' in here-document end marker";
-
-                // This is a char-regex, or a sequence of line-regex syntax
-                // characters or both (in this specific order). So we will add
-                // the char-regex first (if present), and then sequentially
-                // add the line-regex syntax characters (if present).
-                //
-                size_t p (s.find (re, 1));
-                if (p == string::npos)
+                if (np != string::npos)
                 {
-                  // No char-regex, just a sequence of line-regex syntax
-                  // characters. Prepare to parse them starting from the
-                  // position right after the introducer.
-                  //
-                  p = 1;
+                  ln = string (s, p, np - p);
+                  p = np + 1;
                 }
                 else
                 {
-                  // Add regex line-char, and then position to the end of the
-                  // regex (that includes terminating introducer and the
-                  // optional flags). This is the first line-regex syntax
-                  // character position (if present).
-                  //
-                  line_char c;
+                  ln = string (s, p);
+                  p = np;
+                }
 
-                  // Empty regex is a special case repesenting the blank line.
+                if (ln[0] != re) // Line doesn't start with regex introducer.
+                {
+                  // This is a line-char literal (covers blank lines as well).
                   //
-                  if (p == 1)
+                  // Append textual literal.
+                  //
+                  rre.lines.emplace_back (l.line, l.column, move (ln), false);
+                }
+                else // Line starts with the regex introducer.
+                {
+                  // This is a char-regex, or a sequence of line-regex syntax
+                  // characters or both (in this specific order). So we will
+                  // add regex (with optional special characters) or special
+                  // literal.
+                  //
+                  size_t p (ln.find (re, 1));
+                  if (p == string::npos)
                   {
-                    c = line_char ("", pool);
-                    ++p;
+                    // No regex, just a sequence of syntax characters.
+                    //
+                    string spec (ln, 1);
+                    if (spec.empty ())
+                      fail (l) << "no syntax line characters";
+
+                    // Append special literal.
+                    //
+                    rre.lines.emplace_back (
+                      l.line, l.column, move (spec), true);
                   }
                   else
                   {
-                    // Can't fail as all the pre-conditions verified (non-empty
-                    // with both introducers in place), so no description
-                    // required.
+                    // Regex (probably with syntax characters).
                     //
-                    regex_parts re (parse_regex (s, l, "", &p));
+                    regex_parts re;
 
-                    try
-                    {
-                      c = line_char (
-                        char_regex (re.value,
-                                    char_regex::ECMAScript | re.flags | refl),
-                        pool);
-                    }
-                    catch (const regex_error& e)
-                    {
-                      // Print regex_error description if meaningful.
+                    // Empty regex is a special case representing a blank line.
+                    //
+                    if (p == 1)
+                      // Position to optional special characters of an empty
+                      // regex.
                       //
-                      fail (l) << "invalid regex" << e;
-                    }
-                  }
-
-                  ls += c;
-                }
+                      ++p;
+                    else
+                      // Can't fail as all the pre-conditions verified
+                      // (non-empty with both introducers in place), so no
+                      // description required.
+                      //
+                      re = parse_regex (ln, l, "", &p);
 
-                while (p != n)
-                {
-                  char c (s[p++]);
-                  if (line_char::syntax (c))
-                    ls += line_char (c);
-                  else
-                    fail (l) << "invalid line-regex syntax character '" << c
-                             << "'";
+                    // Append regex with optional special characters.
+                    //
+                    rre.lines.emplace_back (l.line, l.column,
+                                            move (re.value), move (re.flags),
+                                            string (ln, p));
+                  }
                 }
               }
-              else
-                // Line doesn't start with regex introducer. Add line-char
-                // literal (handles blank lines as well).
-                //
-                ls += line_char (move (s), pool);
             }
           }
 
@@ -2695,35 +2654,31 @@ namespace build2
           //
           if (mod.find (':') == string::npos)
           {
-            rs += '\n';
-
             if (re)
-              ls += line_char ("", pool);
+              // Note that the position is synthetic, but that's ok as we don't
+              // expect any diagnostics to refer to this line.
+              //
+              rre.lines.emplace_back (l.line, l.column, string (), false);
+            else
+              rs += '\n';
           }
 
-          // Parse line-regex.
+          // Finalize regex lines.
           //
           if (re)
           {
             // Empty regex matches nothing, so not of much use.
             //
-            if (ls.empty ())
+            if (rre.lines.empty ())
               fail (l) << "empty here-document regex";
 
-            try
-            {
-              rre = line_regex (move (ls), move (pool));
-            }
-            catch (const regex_error& e)
-            {
-              // Print regex_error description if meaningful.
-              //
-              fail (l) << "invalid here-document regex" << e;
-            }
+            rre.intro  = re;
           }
         }
 
-        return make_pair (move (rs), move (rre));
+        return re
+          ? parsed_doc (move (rre), l.line, l.column)
+          : parsed_doc (move (rs), l.line, l.column);
       }
 
       //
@@ -3184,6 +3139,39 @@ namespace build2
         lexer_ = l;
         base_parser::lexer_ = l;
       }
+
+      // parser::parsed_doc
+      //
+      parser::parsed_doc::
+      parsed_doc (string s, uint64_t l, uint64_t c)
+          : str (move (s)), re (false), end_line (l), end_column (c)
+      {
+      }
+
+      parser::parsed_doc::
+      parsed_doc (regex_lines r, uint64_t l, uint64_t c)
+          : regex (move (r)), re (true), end_line (l), end_column (c)
+      {
+      }
+
+      parser::parsed_doc::
+      parsed_doc (parsed_doc&& d)
+          : re (d.re), end_line (d.end_line), end_column (d.end_column)
+      {
+        if (re)
+          new (&regex) regex_lines (move (d.regex));
+        else
+          new (&str) string (move (d.str));
+      }
+
+      parser::parsed_doc::
+      ~parsed_doc ()
+      {
+        if (re)
+          regex.~regex_lines ();
+        else
+          str.~string ();
+      }
     }
   }
 }
author	Karen Arutyunov <karen@codesynthesis.com>	2017-01-11 01:43:09 +0300
committer	Karen Arutyunov <karen@codesynthesis.com>	2017-01-19 17:56:07 +0300
commit	a83f3866667bca073c4d4c5d80b4deb5ac05906c (patch)
tree	479464203f6be4535c8f165a20d21322a88a2751 /build2/test/script/parser.cxx
parent	ba99b60aeb8ccdeffc777589b99728395cd28f95 (diff)