Add regex_replace_parse() overloads

author: Karen Arutyunov <karen@codesynthesis.com> 2021-03-23 18:50:55 +0300
committer: Karen Arutyunov <karen@codesynthesis.com> 2021-03-26 18:13:32 +0300
commit: 95c579df686f115c0fd3697f2723fa73476c4584 (patch)
tree: 5d76adbcf75692d278b4085c6e996ab58a3e4e25
parent: 5ecdb9a3b5cb85418f69126226b2636caed2e4da (diff)
7 files changed, 211 insertions, 79 deletions
diff --git a/libbutl/builtin.cxx b/libbutl/builtin.cxx
index 79ff968..a6bb94b 100644
--- a/libbutl/builtin.cxx
+++ b/libbutl/builtin.cxx
@@ -1632,15 +1632,6 @@ namespace butl
         string replacement;
         bool global;
         bool print;
-
-        subst (const string& re, bool ic, string rp, bool gl, bool pr)
-            //
-            // Note that ECMAScript is implied if no grammar flag is specified.
-            //
-            : regex (re, ic ? regex::icase : regex::ECMAScript),
-              replacement (move (rp)),
-              global (gl),
-              print (pr) {}
       };
 
       small_vector<subst, 1> substs;
@@ -1663,57 +1654,59 @@ namespace butl
         if (delim == '\\' || delim == '\n')
           fail () << "invalid delimiter for 's' command in '" << v << "'";
 
-        size_t p (v.find (delim, 2));
-        if (p == string::npos)
-          fail () << "unterminated 's' command regex in '" << v << "'";
-
-        string regex (v, 2, p - 2);
-
-        // Empty regex matches nothing, so not of much use.
-        //
-        if (regex.empty ())
-          fail () << "empty regex in 's' command in '" << v << "'";
-
-        size_t b (p + 1);
-        p = v.find (delim, b);
-        if (p == string::npos)
-          fail () << "unterminated 's' command replacement in '" << v << "'";
-
-        string replacement (v, b, p - b);
-
-        // Parse the substitute command flags.
+        // Parse the substitute command regex (as string), replacement, and
+        // flags.
         //
+        pair<string, string> rf;
         bool icase  (false);
         bool global (false);
         bool print  (false);
 
-        char c;
-        for (++p; (c = v[p]) != '\0'; ++p)
+        try
         {
-          switch (c)
+          size_t e;
+          rf = regex_replace_parse (v.c_str () + 1, v.size () - 1, e);
+
+          char c;
+          for (size_t i (e + 1); (c = v[i]) != '\0'; ++i)
           {
-          case 'i': icase  = true; break;
-          case 'g': global = true; break;
-          case 'p': print  = true; break;
-          default:
+            switch (c)
             {
-              fail () << "invalid 's' command flag '" << c << "' in '" << v
-                      << "'";
+            case 'i': icase  = true; break;
+            case 'g': global = true; break;
+            case 'p': print  = true; break;
+            default:
+              {
+                fail () << "invalid 's' command flag '" << c << "' in '" << v
+                        << "'";
+              }
             }
           }
         }
+        catch (const invalid_argument& e)
+        {
+          fail () << "invalid 's' command '" << v << "': " << e;
+        }
 
+        // Parse the regex and add the substitution to the list.
+        //
         try
         {
-          substs.emplace_back (regex, icase,
-                               move (replacement),
-                               global, print);
+          // Note that ECMAScript is implied if no grammar flag is specified.
+          //
+          regex re (rf.first, icase ? regex::icase : regex::ECMAScript);
+
+          substs.push_back ({move (re),
+                             move (rf.second),
+                             global,
+                             print});
         }
         catch (const regex_error& e)
         {
           // Print regex_error description if meaningful (no space).
           //
-          fail () << "invalid regex '" << regex << "' in '" << v << "'" << e;
+          fail () << "invalid regex '" << rf.first << "' in '" << v << "'"
+                  << e;
         }
       }
 
diff --git a/libbutl/regex.ixx b/libbutl/regex.ixx
index dec15d1..805acd1 100644
--- a/libbutl/regex.ixx
+++ b/libbutl/regex.ixx
@@ -21,4 +21,21 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
 
     return make_pair (move (r), match);
   }
+
+  template <typename C>
+  inline std::pair<std::basic_regex<C>, std::basic_string<C>>
+  regex_replace_parse (const std::basic_string<C>& s,
+                       std::regex_constants::syntax_option_type f)
+  {
+    return regex_replace_parse (s.data (), s.size (), f); // (ptr, size) form.
+  }
+
+  template <typename C>
+  inline std::pair<std::basic_regex<C>, std::basic_string<C>>
+  regex_replace_parse (const C* s,
+                       std::regex_constants::syntax_option_type f)
+  {
+    using traits = typename std::basic_string<C>::traits_type;
+    return regex_replace_parse (s, traits::length (s), f); // Up to the null.
+  }
 }
diff --git a/libbutl/regex.mxx b/libbutl/regex.mxx
index 84b024f..b5490b1 100644
--- a/libbutl/regex.mxx
+++ b/libbutl/regex.mxx
@@ -14,8 +14,9 @@
 #include <utility> // pair
 
 #include <locale>
-#include <cstddef> // size_t
-#include <utility> // move(), make_pair()
+#include <cstddef>   // size_t
+#include <utility>   // move(), make_pair()
+#include <stdexcept> // invalid_argument
 #endif
 
 #if defined(__clang__)
@@ -93,6 +94,37 @@ LIBBUTL_MODEXPORT namespace butl
   regex_replace_match (const std::basic_string<C>&,
                        const std::basic_regex<C>&,
                        const std::basic_string<C>& fmt);
+
+  // Parse the '/<regex>/<format>/' replacement string into the regex/format
+  // pair. Other characters can be used as a delimiter instead of '/'. Throw
+  // std::invalid_argument or std::regex_error on parsing error.
+  //
+  // Note: escaping of the delimiter character is not (yet) supported.
+  //
+  template <typename C>
+  std::pair<std::basic_regex<C>, std::basic_string<C>>
+  regex_replace_parse (const std::basic_string<C>&,
+                       std::regex_constants::syntax_option_type =
+                         std::regex_constants::ECMAScript);
+
+  template <typename C>
+  std::pair<std::basic_regex<C>, std::basic_string<C>>
+  regex_replace_parse (const C*,
+                       std::regex_constants::syntax_option_type =
+                         std::regex_constants::ECMAScript);
+
+  template <typename C>
+  std::pair<std::basic_regex<C>, std::basic_string<C>>
+  regex_replace_parse (const C*, size_t,
+                       std::regex_constants::syntax_option_type =
+                         std::regex_constants::ECMAScript);
+
+  // As above but return the regex as a string and, instead of failing if
+  // there is text after the last delimiter, return its position in end.
+  //
+  template <typename C>
+  std::pair<std::basic_string<C>, std::basic_string<C>>
+  regex_replace_parse (const C*, size_t, size_t& end);
 }
 
 LIBBUTL_MODEXPORT namespace std
diff --git a/libbutl/regex.txx b/libbutl/regex.txx
index b785708..aa845be 100644
--- a/libbutl/regex.txx
+++ b/libbutl/regex.txx
@@ -278,4 +278,71 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
 
     return match;
   }
+
+  template <typename C>
+  std::pair<std::basic_regex<C>, std::basic_string<C>>
+  regex_replace_parse (const C* s, size_t n,
+                       std::regex_constants::syntax_option_type f)
+  {
+    using namespace std;
+
+    // Split into the regex and format strings (end: past-the-delimiter pos).
+    //
+    size_t end;
+    auto parts (regex_replace_parse (s, n, end));
+
+    if (end != n) // Nothing is allowed after the trailing delimiter.
+      throw invalid_argument ("junk after trailing delimiter");
+
+    return {basic_regex<C> (parts.first, f), move (parts.second)};
+  }
+
+  // Split the '<delim><regex><delim><format><delim>[<rest>]' string (s, n)
+  // into the regex and format strings. Set e to the position right after the
+  // trailing delimiter (relative to s). Throw std::invalid_argument if the
+  // string is empty, a delimiter is missing, or the regex is empty.
+  //
+  template <typename C>
+  std::pair<std::basic_string<C>, std::basic_string<C>>
+  regex_replace_parse (const C* s, size_t n, size_t& e)
+  {
+    using namespace std;
+    using string_type = basic_string<C>;
+
+    if (n == 0)
+      throw invalid_argument ("no leading delimiter");
+
+    const C* b (s); // Save the beginning of the string.
+    C delim (s[0]); // Note: must be C, not char, or wide delimiters truncate.
+
+    // Position to the regex first character and find the regex-terminating
+    // delimiter.
+    //
+    --n;
+    ++s;
+
+    const C* p (string_type::traits_type::find (s, n, delim));
+
+    if (p == nullptr)
+      throw invalid_argument ("no delimiter after regex");
+
+    // An empty regex is not of much use, so reject it.
+    //
+    if (p == s)
+      throw invalid_argument ("empty regex");
+
+    string_type re (s, p - s);
+
+    // Position to the format first character and find the trailing delimiter.
+    //
+    n -= p - s + 1;
+    s  = p + 1;
+    p  = string_type::traits_type::find (s, n, delim);
+
+    if (p == nullptr)
+      throw invalid_argument ("no delimiter after replacement");
+
+    e = p - b + 1; // Position past the trailing delimiter.
+    return make_pair (move (re), string_type (s, p - s));
+  }
 }
diff --git a/tests/builtin/sed.testscript b/tests/builtin/sed.testscript
index 7fbc9b2..2ed3088 100644
--- a/tests/builtin/sed.testscript
+++ b/tests/builtin/sed.testscript
@@ -166,13 +166,13 @@ test.options += -c
         : unterminated
         :
         $* -e 's/foo' 2>>EOE != 0
-          sed: unterminated 's' command regex in 's/foo'
+          sed: invalid 's' command 's/foo': no delimiter after regex
           EOE
 
         : empty
         :
         $* -e 's///' 2>>EOE != 0
-          sed: empty regex in 's' command in 's///'
+          sed: invalid 's' command 's///': empty regex
           EOE
 
         : invalid
@@ -188,7 +188,7 @@ test.options += -c
       : unterminated-replacement
       :
       $* -e 's/foo/bar' 2>>EOE != 0
-        sed: unterminated 's' command replacement in 's/foo/bar'
+        sed: invalid 's' command 's/foo/bar': no delimiter after replacement
         EOE
 
       : invalid-flags
diff --git a/tests/regex/driver.cxx b/tests/regex/driver.cxx
index f78a100..cb59cd8 100644
--- a/tests/regex/driver.cxx
+++ b/tests/regex/driver.cxx
@@ -4,8 +4,11 @@
 #include <cassert>
 
 #ifndef __cpp_lib_modules_ts
+#include <regex>
 #include <string>
+#include <utility>   // pair
 #include <iostream>
+#include <stdexcept> // invalid_argument
 #include <exception>
 #endif
 
@@ -27,7 +30,7 @@ import butl.utility; // operator<<(ostream, exception)
 using namespace std;
 using namespace butl;
 
-// Usage: argv[0] [-ffo] [-fnc] [-m] <string> <regex> <format>
+// Usage: argv[0] [-ffo] [-fnc] [-m] <string> "/<regex>/<format>/"
 //
 // Perform substitution of matched substrings with formatted replacement
 // strings using regex_replace_*() functions. If the string matches the regex
@@ -66,11 +69,13 @@ try
       break;
   }
 
-  assert (i + 3 == argc);
+  assert (i + 2 == argc);
 
-  string s   (argv[i++]);
-  regex  re  (argv[i++]);
-  string fmt (argv[i]);
+  string s (argv[i++]);
+  pair<regex, string> rf (regex_replace_parse (argv[i]));
+
+  const regex&  re  (rf.first);
+  const string& fmt (rf.second);
 
   auto r (match
           ? regex_replace_match (s, re, fmt)
@@ -86,8 +91,13 @@ catch (const regex_error& e)
   cerr << "invalid regex" << e << endl; // Print sanitized.
   return 2;
 }
-catch (const exception& e)
+catch (const invalid_argument& e)
 {
   cerr << e << endl;
   return 2;
 }
+catch (const exception&)
+{
+  assert (false);
+  return 2;
+}
diff --git a/tests/regex/testscript b/tests/regex/testscript
index fbee1d6..93ad4b6 100644
--- a/tests/regex/testscript
+++ b/tests/regex/testscript
@@ -4,38 +4,38 @@
 : replace-search
 :
 {
-  $*      abcbd b x >axcxd : all
-  $* -ffo abcbd b x >axcbd : first-only
-  $* -fnc abcbd b x >xx    : no-copy
+  $*      abcbd /b/x/ >axcxd : all
+  $* -ffo abcbd /b/x/ >axcbd : first-only
+  $* -fnc abcbd /b/x/ >xx    : no-copy
 
   : ecma-escape
   :
   {
-    $* xay a '$b'      >'x$by'  : none
-    $* xay a '$'       >'x$y'   : none-term
-    $* xay a '$$'      >'x$y'   : self
-    $* xay a 'b$&c'    >'xbacy' : match
-    $* xay a 'b$`c'    >'xbxcy' : match-precede
-    $* xay a "b\\\$'c" >'xbycy' : match-follow
+    $* xay '/a/$b/'      >'x$by'  : none
+    $* xay '/a/$/'       >'x$y'   : none-term
+    $* xay '/a/$$/'      >'x$y'   : self
+    $* xay '/a/b$&c/'    >'xbacy' : match
+    $* xay '/a/b$`c/'    >'xbxcy' : match-precede
+    $* xay "/a/b\\\$'c/" >'xbycy' : match-follow
 
     : capture
     :
     {
-      $* abcdefghij '(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)' '$1$10'  >aj : matched
-      $* a          '(a)|(b)'                        '$1$2$3' >a  : unmatched
+      $* abcdefghij '/(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)/$1$10/'  >aj : matched
+      $* a          '/(a)|(b)/$1$2$3/'                        >a  : unmatched
     }
   }
 
   : perl-escape
   :
   {
-    $* xay a '\b' >'xby' : none
-    $* xay a '\'  >'xy'  : none-term
-    $* xay a '\\' >'x\y' : self
+    $* xay '/a/\b/' >'xby' : none
+    $* xay '/a/\/'  >'xy'  : none-term
+    $* xay '/a/\\/' >'x\y' : self
 
     : newline
     :
-    $* xay a '\n' >>EOO
+    $* xay '/a/\n/' >>EOO
       x
       y
       EOO
@@ -43,25 +43,25 @@
     : capture
     :
     {
-      $* abcdefghij '(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)' '\1\10'  >aa0 : matched
-      $* a          '(a)|(b)'                        '\1\2\3' >a   : unmatched
+      $* abcdefghij '/(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)/\1\10/'  >aa0 : matched
+      $* a          '/(a)|(b)/\1\2\3/'                        >a   : unmatched
     }
 
     : upper
     :
     {
-      $* xay a       '\U'     >xy   : none
-      $* xay a       '\Uvz'   >xVZy : repl
-      $* xay a       '\Uv\Ez' >xVzy : end
-      $* aa  a       'v\Uz'   >vZvZ : locality
-      $* xay '(a)'   '\U\1'   >xAy  : capt
-      $* x-y '(a?)-' '\U\1z'  >xZy  : capt-empty
-      $* xay a       '\uvz'   >xVzy : once
+      $* xay '/a/\U/'        >xy   : none
+      $* xay '/a/\Uvz/'      >xVZy : repl
+      $* xay '/a/\Uv\Ez/'    >xVzy : end
+      $* aa  '/a/v\Uz/'      >vZvZ : locality
+      $* xay '/(a)/\U\1/'    >xAy  : capt
+      $* x-y '/(a?)-/\U\1z/' >xZy  : capt-empty
+      $* xay '/a/\uvz/'      >xVzy : once
     }
 
     : lower
     :
-    $* xay a '\lVZ' >xvZy
+    $* xay '/a/\lVZ/' >xvZy
   }
 }
 
@@ -70,6 +70,19 @@
 {
   test.options += -m
 
-  $* abc  'a(b)c' 'x\1y'  >xby : match
-  $* abcd 'a(b)c' 'x\1yd' == 1 : no-match
+  $* abc  '/a(b)c/x\1y/'  >xby : match
+  $* abcd '/a(b)c/x\1yd/' == 1 : no-match
+}
+
+: invalid-regex-fmt
+:
+{
+  test.arguments += '' # Note: we will fail before the matching.
+
+  $* ''        2> 'no leading delimiter'           != 0 : no-leading-delim
+  $* '/a'      2> 'no delimiter after regex'       != 0 : no-mid-delim
+  $* '//'      2> 'empty regex'                    != 0 : no-regex
+  $* '/a[b/c/' 2>~'/invalid regex.*/'              != 0 : regex
+  $* '/a/b'    2> 'no delimiter after replacement' != 0 : no-trailing-delim
+  $* '/a/b/s'  2> 'junk after trailing delimiter'  != 0 : junk
 }
author	Karen Arutyunov <karen@codesynthesis.com>	2021-03-23 18:50:55 +0300
committer	Karen Arutyunov <karen@codesynthesis.com>	2021-03-26 18:13:32 +0300
commit	95c579df686f115c0fd3697f2723fa73476c4584 (patch)
tree	5d76adbcf75692d278b4085c6e996ab58a3e4e25
parent	5ecdb9a3b5cb85418f69126226b2636caed2e4da (diff)