Add regex_replace_match() and rename regex_replace_ex() to regex_replace_search()

author: Karen Arutyunov <karen@codesynthesis.com> 2018-06-19 15:30:22 +0300
committer: Karen Arutyunov <karen@codesynthesis.com> 2018-06-19 15:30:22 +0300
commit: 06e915be138b0638e30083f84cecda0eb1bfc895 (patch)
tree: 50f7eca40de25033116c6f6f75524ae5801dcc78
parent: 338d8065f1b681da841fa0d79cc9265776ff1e1e (diff)
5 files changed, 262 insertions, 204 deletions
diff --git a/libbutl/regex.ixx b/libbutl/regex.ixx
index 15189fb..8e3286f 100644
--- a/libbutl/regex.ixx
+++ b/libbutl/regex.ixx
@@ -6,19 +6,19 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
 {
   template <typename C>
   inline std::pair<std::basic_string<C>, bool>
-  regex_replace_ex (const std::basic_string<C>& s,
-                    const std::basic_regex<C>& re,
-                    const std::basic_string<C>& fmt,
-                    std::regex_constants::match_flag_type flags)
+  regex_replace_search (const std::basic_string<C>& s,
+                        const std::basic_regex<C>& re,
+                        const std::basic_string<C>& fmt,
+                        std::regex_constants::match_flag_type flags)
   {
     using namespace std;
 
     using it = typename basic_string<C>::const_iterator;
 
     basic_string<C> r;
-    bool match (regex_replace_ex (s, re, fmt,
-                                  [&r] (it b, it e) {r.append (b, e);},
-                                  flags));
+    bool match (regex_replace_search (s, re, fmt,
+                                      [&r] (it b, it e) {r.append (b, e);},
+                                      flags));
 
     return make_pair (move (r), match);
   }
diff --git a/libbutl/regex.mxx b/libbutl/regex.mxx
index 741b818..7fa0155 100644
--- a/libbutl/regex.mxx
+++ b/libbutl/regex.mxx
@@ -40,16 +40,9 @@ import std.regex; // @@ MOD TODO should probably be re-exported.
 
 LIBBUTL_MODEXPORT namespace butl
 {
-  // Call specified append() function for non-matched substrings and matched
-  // substring replacements returning true if search succeeded. The function
-  // must be callable with the following signature:
-  //
-  // void
-  // append(basic_string<C>::iterator begin, basic_string<C>::iterator end);
-  //
-  // The regex semantics is like that of std::regex_replace() extended the
-  // standard ECMA-262 substitution escape sequences with a subset of Perl
-  // sequences:
+  // The regex semantics for the following functions is like that of
+  // std::regex_replace(), with the standard ECMA-262 substitution escape
+  // sequences extended with a subset of Perl sequences:
   //
   // \\, \u, \l, \U, \L, \E, \1, ..., \9
   //
@@ -65,14 +58,22 @@ LIBBUTL_MODEXPORT namespace butl
   //   C++ locale (which is, unless changed, is the same as C locale and
   //   both default to the POSIX locale aka "C").
   //
+
+  // Call specified append() function for non-matched substrings and matched
+  // substring replacements returning true if search succeeded. The function
+  // must be callable with the following signature:
+  //
+  // void
+  // append(basic_string<C>::iterator begin, basic_string<C>::iterator end);
+  //
   template <typename C, typename F>
   bool
-  regex_replace_ex (const std::basic_string<C>&,
-                    const std::basic_regex<C>&,
-                    const std::basic_string<C>& fmt,
-                    F&& append,
-                    std::regex_constants::match_flag_type =
-                      std::regex_constants::match_default);
+  regex_replace_search (const std::basic_string<C>&,
+                        const std::basic_regex<C>&,
+                        const std::basic_string<C>& fmt,
+                        F&& append,
+                        std::regex_constants::match_flag_type =
+                        std::regex_constants::match_default);
 
   // As above but concatenate non-matched substrings and matched substring
   // replacements into a string returning it as well as whether the search
@@ -80,11 +81,19 @@ LIBBUTL_MODEXPORT namespace butl
   //
   template <typename C>
   std::pair<std::basic_string<C>, bool>
-  regex_replace_ex (const std::basic_string<C>&,
-                    const std::basic_regex<C>&,
-                    const std::basic_string<C>& fmt,
-                    std::regex_constants::match_flag_type =
-                      std::regex_constants::match_default);
+  regex_replace_search (const std::basic_string<C>&,
+                        const std::basic_regex<C>&,
+                        const std::basic_string<C>& fmt,
+                        std::regex_constants::match_flag_type =
+                          std::regex_constants::match_default);
+
+  // Match the entire string and, if it matches, return the string replacement.
+  //
+  template <typename C>
+  std::pair<std::basic_string<C>, bool>
+  regex_replace_match (const std::basic_string<C>&,
+                       const std::basic_regex<C>&,
+                       const std::basic_string<C>& fmt);
 }
 
 LIBBUTL_MODEXPORT namespace std
diff --git a/libbutl/regex.txx b/libbutl/regex.txx
index a7a6c9a..fbe2885 100644
--- a/libbutl/regex.txx
+++ b/libbutl/regex.txx
@@ -4,13 +4,209 @@
 
 LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
 {
+  // Replace the regex match results using the format string.
+  //
+  template <typename C>
+  std::basic_string<C>
+  regex_replace_match_results (
+    const std::match_results<typename std::basic_string<C>::const_iterator>& m,
+    const std::basic_string<C>& fmt)
+  {
+    using namespace std;
+
+    using string_type = basic_string<C>;
+    using str_it      = typename string_type::const_iterator;
+
+    string_type r;
+
+    // Note that we are using char type literals with the assumption that
+    // being ASCII characters they will be properly "widened" to the
+    // corresponding literals of the C template parameter type.
+    //
+    auto digit = [] (C c) -> int
+    {
+      return c >= '0' && c <= '9' ? c - '0' : -1;
+    };
+
+    enum class case_conv {none, upper, lower, upper_once, lower_once}
+    mode (case_conv::none);
+
+    locale cl; // Copy of the global C++ locale.
+
+    auto conv_chr = [&mode, &cl] (C c) -> C
+    {
+      switch (mode)
+      {
+      case case_conv::upper_once: mode = case_conv::none; // Fall through.
+      case case_conv::upper:      c = toupper (c, cl); break;
+      case case_conv::lower_once: mode = case_conv::none; // Fall through.
+      case case_conv::lower:      c = tolower (c, cl); break;
+      case case_conv::none:       break;
+      }
+      return c;
+    };
+
+    auto append_chr = [&r, &conv_chr] (C c) {r.push_back (conv_chr (c));};
+
+    auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e)
+    {
+      // Optimize for the common case.
+      //
+      if (mode == case_conv::none)
+        r.append (b, e);
+      else
+      {
+        for (str_it i (b); i != e; ++i)
+          r.push_back (conv_chr (*i));
+      }
+    };
+
+    size_t n (fmt.size ());
+    for (size_t i (0); i < n; ++i)
+    {
+      C c (fmt[i]);
+
+      switch (c)
+      {
+      case '$':
+        {
+          // Check if this is a $-based escape sequence. Interpret it
+          // accordingly if that's the case, treat '$' as a regular character
+          // otherwise.
+          //
+          c = fmt[++i]; // '\0' if last.
+
+          switch (c)
+          {
+          case '$': append_chr (c); break;
+          case '&': append_str (m[0].first, m[0].second); break;
+          case '`':
+            {
+              append_str (m.prefix ().first, m.prefix ().second);
+              break;
+            }
+          case '\'':
+            {
+              append_str (m.suffix ().first, m.suffix ().second);
+              break;
+            }
+          default:
+            {
+              // Check if this is a sub-expression 1-based index ($n or $nn).
+              // Append the matching substring if that's the case. Treat '$'
+              // as a regular character otherwise. Index greater than the
+              // sub-expression count is silently ignored.
+              //
+              int si (digit (c));
+              if (si >= 0)
+              {
+                int d;
+                if ((d = digit (fmt[i + 1])) >= 0) // '\0' if last.
+                {
+                  si = si * 10 + d;
+                  ++i;
+                }
+              }
+
+              if (si > 0)
+              {
+                // m[0] refers to the matched substring. Note that we ignore
+                // unmatched sub-expression references.
+                //
+                if (static_cast<size_t> (si) < m.size () && m[si].matched)
+                  append_str (m[si].first, m[si].second);
+              }
+              else
+              {
+                // Not a $-based escape sequence so treat '$' as a regular
+                // character.
+                //
+                --i;
+                append_chr ('$');
+              }
+
+              break;
+            }
+          }
+
+          break;
+        }
+      case '\\':
+        {
+          c = fmt[++i]; // '\0' if last.
+
+          switch (c)
+          {
+          case '\\': append_chr (c); break;
+
+          case 'u': mode = case_conv::upper_once; break;
+          case 'l': mode = case_conv::lower_once; break;
+          case 'U': mode = case_conv::upper;      break;
+          case 'L': mode = case_conv::lower;      break;
+          case 'E': mode = case_conv::none;       break;
+          default:
+            {
+              // Check if this is a sub-expression 1-based index. Append the
+              // matching substring if that's the case. Skip '\\' otherwise.
+              // Index greater than the sub-expression count is silently
+              // ignored.
+              //
+              int si (digit (c));
+              if (si > 0)
+              {
+                // m[0] refers to the matched substring. Note that we ignore
+                // unmatched sub-expression references.
+                //
+                if (static_cast<size_t> (si) < m.size () && m[si].matched)
+                  append_str (m[si].first, m[si].second);
+              }
+              else
+                --i;
+
+              break;
+            }
+          }
+
+          break;
+        }
+      default:
+        {
+          // Append a regular character.
+          //
+          append_chr (c);
+          break;
+        }
+      }
+    }
+
+    return r;
+  }
+
+  template <typename C>
+  std::pair<std::basic_string<C>, bool>
+  regex_replace_match (const std::basic_string<C>& s,
+                       const std::basic_regex<C>& re,
+                       const std::basic_string<C>& fmt)
+  {
+    using namespace std;
+
+    match_results<typename basic_string<C>::const_iterator> m;
+    bool match (regex_match (s, m, re));
+
+    // Use basic_string<C> (not string) so this compiles for non-char C.
+    //
+    return make_pair (
+      match ? regex_replace_match_results (m, fmt) : basic_string<C> (),
+      match);
+  }
+
   template <typename C, typename F>
   bool
-  regex_replace_ex (const std::basic_string<C>& s,
-                    const std::basic_regex<C>& re,
-                    const std::basic_string<C>& fmt,
-                    F&& append,
-                    std::regex_constants::match_flag_type flags)
+  regex_replace_search (const std::basic_string<C>& s,
+                        const std::basic_regex<C>& re,
+                        const std::basic_string<C>& fmt,
+                        F&& append,
+                        std::regex_constants::match_flag_type flags)
   {
     using namespace std;
 
@@ -18,10 +214,8 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
     using str_it      = typename string_type::const_iterator;
     using regex_it    = regex_iterator<str_it>;
 
-    bool first_only ((flags & std::regex_constants::format_first_only) != 0);
-    bool no_copy ((flags & std::regex_constants::format_no_copy) != 0);
-
-    locale cl; // Copy of the global C++ locale.
+    bool first_only ((flags & regex_constants::format_first_only) != 0);
+    bool no_copy ((flags & regex_constants::format_no_copy) != 0);
 
     // Beginning of the last unmatched substring.
     //
@@ -72,169 +266,7 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
         // The standard implementation calls m.format() here. We perform our
         // own formatting.
         //
-        // Note that we are using char type literals with the assumption that
-        // being ASCII characters they will be properly "widened" to the
-        // corresponding literals of the C template parameter type.
-        //
-        auto digit = [] (C c) -> int
-        {
-          return c >= '0' && c <= '9' ? c - '0' : -1;
-        };
-
-        enum class case_conv {none, upper, lower, upper_once, lower_once}
-        mode (case_conv::none);
-
-        auto conv_chr = [&mode, &cl] (C c) -> C
-        {
-          switch (mode)
-          {
-          case case_conv::upper_once: mode = case_conv::none; // Fall through.
-          case case_conv::upper:      c = toupper (c, cl); break;
-          case case_conv::lower_once: mode = case_conv::none; // Fall through.
-          case case_conv::lower:      c = tolower (c, cl); break;
-          case case_conv::none:       break;
-          }
-          return c;
-        };
-
-        string_type r;
-
-        auto append_chr = [&r, &conv_chr] (C c)
-        {
-          r.push_back (conv_chr (c));
-        };
-
-        auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e)
-        {
-          // Optimize for the common case.
-          //
-          if (mode == case_conv::none)
-            r.append (b, e);
-          else
-          {
-            for (str_it i (b); i != e; ++i)
-              r.push_back (conv_chr (*i));
-          }
-        };
-
-        size_t n (fmt.size ());
-        for (size_t i (0); i < n; ++i)
-        {
-          C c (fmt[i]);
-
-          switch (c)
-          {
-          case '$':
-            {
-              // Check if this is a $-based escape sequence. Interpret it
-              // accordingly if that's the case, treat '$' as a regular
-              // character otherwise.
-              //
-              c = fmt[++i]; // '\0' if last.
-
-              switch (c)
-              {
-              case '$': append_chr (c); break;
-              case '&': append_str (m[0].first, m[0].second); break;
-              case '`':
-                {
-                  append_str (m.prefix ().first, m.prefix ().second);
-                  break;
-                }
-              case '\'':
-                {
-                  append_str (m.suffix ().first, m.suffix ().second);
-                  break;
-                }
-              default:
-                {
-                  // Check if this is a sub-expression 1-based index ($n or
-                  // $nn). Append the matching substring if that's the case.
-                  // Treat '$' as a regular character otherwise. Index greater
-                  // than the sub-expression count is silently ignored.
-                  //
-                  int si (digit (c));
-                  if (si >= 0)
-                  {
-                    int d;
-                    if ((d = digit (fmt[i + 1])) >= 0) // '\0' if last.
-                    {
-                      si = si * 10 + d;
-                      ++i;
-                    }
-                  }
-
-                  if (si > 0)
-                  {
-                    // m[0] refers to the matched substring. Note that we
-                    // ignore unmatched sub-expression references.
-                    //
-                    if (static_cast<size_t> (si) < m.size () && m[si].matched)
-                      append_str (m[si].first, m[si].second);
-                  }
-                  else
-                  {
-                    // Not a $-based escape sequence so treat '$' as a
-                    // regular character.
-                    //
-                    --i;
-                    append_chr ('$');
-                  }
-
-                  break;
-                }
-              }
-
-              break;
-            }
-          case '\\':
-            {
-              c = fmt[++i]; // '\0' if last.
-
-              switch (c)
-              {
-              case '\\': append_chr (c); break;
-
-              case 'u': mode = case_conv::upper_once; break;
-              case 'l': mode = case_conv::lower_once; break;
-              case 'U': mode = case_conv::upper;      break;
-              case 'L': mode = case_conv::lower;      break;
-              case 'E': mode = case_conv::none;       break;
-              default:
-                {
-                  // Check if this is a sub-expression 1-based index. Append
-                  // the matching substring if that's the case, Skip '\\'
-                  // otherwise. Index greater than the sub-expression count is
-                  // silently ignored.
-                  //
-                  int si (digit (c));
-                  if (si > 0)
-                  {
-                    // m[0] refers to the matched substring. Note that we
-                    // ignore unmatched sub-expression references.
-                    //
-                    if (static_cast<size_t> (si) < m.size () && m[si].matched)
-                      append_str (m[si].first, m[si].second);
-                  }
-                  else
-                    --i;
-
-                  break;
-                }
-              }
-
-              break;
-            }
-          default:
-            {
-              // Append a regular character.
-              //
-              append_chr (c);
-              break;
-            }
-          }
-        }
-
+        string_type r (regex_replace_match_results (m, fmt));
         append (r.begin (), r.end ());
       }
     }
diff --git a/tests/regex/driver.cxx b/tests/regex/driver.cxx
index 0f6a385..fb41ba2 100644
--- a/tests/regex/driver.cxx
+++ b/tests/regex/driver.cxx
@@ -28,10 +28,10 @@ import butl.utility; // operator<<(ostream, exception)
 using namespace std;
 using namespace butl;
 
-// Usage: argv[0] [-ffo] [-fnc] <string> <regex> <format>
+// Usage: argv[0] [-ffo] [-fnc] [-m] <string> <regex> <format>
 //
 // Perform substitution of matched substrings with formatted replacement
-// strings using regex_replace_ex() function. If the string matches the regex
+// strings using regex_replace_*() functions. If the string matches the regex
 // then print the replacement to STDOUT and exit with zero code. Exit with
 // code one if it doesn't match, and with code two on failure (print error
 // description to STDERR).
@@ -42,6 +42,9 @@ using namespace butl;
 // -fnc
 //    Use format_no_copy replacement flag.
 //
+// -m
+//    Match the entire string, rather than its sub-strings.
+//
 int
 main (int argc, const char* argv[])
 try
@@ -49,6 +52,7 @@ try
   regex_constants::match_flag_type fl (regex_constants::match_default);
 
   int i (1);
+  bool match (false);
   for (; i != argc; ++i)
   {
     string op (argv[i]);
@@ -57,6 +61,8 @@ try
       fl |= regex_constants::format_first_only;
     else if (op == "-fnc")
       fl |= regex_constants::format_no_copy;
+    else if (op == "-m")
+      match = true;
     else
       break;
   }
@@ -67,7 +73,9 @@ try
   regex  re  (argv[i++]);
   string fmt (argv[i]);
 
-  auto r (regex_replace_ex (s, re, fmt, fl));
+  auto r (match
+          ? regex_replace_match (s, re, fmt)
+          : regex_replace_search (s, re, fmt, fl));
 
   if (r.second)
     cout << r.first << endl;
diff --git a/tests/regex/testscript b/tests/regex/testscript
index 4b03e45..d431756 100644
--- a/tests/regex/testscript
+++ b/tests/regex/testscript
@@ -2,7 +2,7 @@
 # copyright : Copyright (c) 2014-2018 Code Synthesis Ltd
 # license   : MIT; see accompanying LICENSE file
 
-: match
+: replace-search
 :
 {
   $*      abcbd b x >axcxd : all
@@ -58,3 +58,12 @@
     $* xay a '\lVZ' >xvZy
   }
 }
+
+: replace-match
+:
+{
+  test.options += -m
+
+  $* abc  'a(b)c' 'x\1y'  >xby : match
+  $* abcd 'a(b)c' 'x\1yd' == 1 : no-match
+}
author	Karen Arutyunov <karen@codesynthesis.com>	2018-06-19 15:30:22 +0300
committer	Karen Arutyunov <karen@codesynthesis.com>	2018-06-19 15:30:22 +0300
commit	06e915be138b0638e30083f84cecda0eb1bfc895 (patch)
tree	50f7eca40de25033116c6f6f75524ae5801dcc78
parent	338d8065f1b681da841fa0d79cc9265776ff1e1e (diff)