Add regex utility functions

author: Karen Arutyunov <karen@codesynthesis.com> 2017-06-21 13:03:56 +0300
committer: Karen Arutyunov <karen@codesynthesis.com> 2017-06-21 13:03:56 +0300
commit: 5d50c0499b30650deafc291a3872a386d08a3200 (patch)
tree: 4e0936d2d51fa7d92b3fb7c4b05bfc099bbf0a58
parent: 273ce32a9a9c89410d4ab396c1bbdfb9a5024fa8 (diff)
4 files changed, 321 insertions, 0 deletions
diff --git a/libbutl/buildfile b/libbutl/buildfile
index d5f492b..42fc421 100644
--- a/libbutl/buildfile
+++ b/libbutl/buildfile
@@ -26,6 +26,7 @@ lib{butl}:                                  \
    {hxx            }{ process-details     } \
    {hxx            }{ process-io          } \
    {        txx cxx}{ process-run         } \
+   {hxx     txx cxx}{ regex               } \
    {hxx ixx     cxx}{ sendmail            } \
    {hxx         cxx}{ sha256              } \
    {hxx            }{ small-vector        } \
diff --git a/libbutl/regex.cxx b/libbutl/regex.cxx
new file mode 100644
index 0000000..4e2e26f
--- /dev/null
+++ b/libbutl/regex.cxx
@@ -0,0 +1,45 @@
+// file      : libbutl/regex.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#include <libbutl/regex.hxx>
+
+#if defined(_MSC_VER) && _MSC_VER <= 1910
+#  include <cstring> // strstr()
+#endif
+
+#include <ostream>
+#include <sstream>
+#include <stdexcept> // runtime_error
+
+#include <libbutl/utility.hxx> // operator<<(ostream, exception)
+
+namespace std
+{
+  // Print ": <description>" but only if the description is meaningful.
+  // Currently libstdc++ just returns the name of the exception (bug #67361),
+  // so we check that the description contains at least one space character.
+  // While VC's description is meaningful, it has an undesired prefix that
+  // resembles the following: 'regex_error(error_badrepeat): '. So we skip it.
+  //
+  ostream&
+  operator<< (ostream& o, const regex_error& e)
+  {
+    const char* d (e.what ());
+
+#if defined(_MSC_VER) && _MSC_VER <= 1910
+    const char* rd (strstr (d, "): "));
+    if (rd != nullptr)
+      d = rd + 3; // Continue right after the "): " separator.
+#endif
+
+    ostringstream os;
+    os << runtime_error (d); // Sanitize the description.
+
+    string s (os.str ());
+    if (s.find (' ') != string::npos) // No spaces: print nothing at all.
+      o << ": " << s;
+
+    return o;
+  }
+}
diff --git a/libbutl/regex.hxx b/libbutl/regex.hxx
new file mode 100644
index 0000000..4a93106
--- /dev/null
+++ b/libbutl/regex.hxx
@@ -0,0 +1,57 @@
+// file      : libbutl/regex.hxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef LIBBUTL_REGEX_HXX
+#define LIBBUTL_REGEX_HXX
+
+#include <regex>
+#include <iosfwd>
+#include <string>  // basic_string
+#include <utility> // pair
+
+#include <libbutl/export.hxx>
+
+namespace butl
+{
+  // Like std::regex_replace() but extends the standard ECMA-262
+  // substitution escape sequences with a subset of Perl sequences:
+  //
+  // \\, \u, \l, \U, \L, \E, \1, ..., \9
+  //
+  // Return the resulting string as well as whether the search succeeded
+  // (that is, whether at least one match was found).
+  //
+  // Notes and limitations:
+  //
+  // - The only valid regex_constants flags are match_default and
+  //   format_first_only (format_no_copy can easily be supported).
+  //
+  // - If backslash doesn't start any of the listed sequences then it is
+  //   silently dropped and the following character is copied as is.
+  //
+  // - The character case conversion is performed according to the global
+  //   C++ locale (which, unless changed, is the same as the C locale and
+  //   both default to the POSIX locale aka "C").
+  //
+  template <typename C>
+  std::pair<std::basic_string<C>, bool>
+  regex_replace_ex (const std::basic_string<C>&,
+                    const std::basic_regex<C>&,
+                    const std::basic_string<C>& fmt,
+                    std::regex_constants::match_flag_type =
+                      std::regex_constants::match_default);
+}
+
+namespace std
+{
+  // Print ": " followed by the regex error description but only if the
+  // description is meaningful (this is why the colon is printed here).
+  //
+  LIBBUTL_EXPORT ostream&
+  operator<< (ostream&, const regex_error&);
+}
+
+#include <libbutl/regex.txx>
+
+#endif // LIBBUTL_REGEX_HXX
diff --git a/libbutl/regex.txx b/libbutl/regex.txx
new file mode 100644
index 0000000..cb8cfe0
--- /dev/null
+++ b/libbutl/regex.txx
@@ -0,0 +1,218 @@
+// file      : libbutl/regex.txx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#include <locale>
+#include <cstddef> // size_t
+
+namespace butl
+{
+  template <typename C>
+  std::pair<std::basic_string<C>, bool>
+  regex_replace_ex (const std::basic_string<C>& s,
+                    const std::basic_regex<C>& re,
+                    const std::basic_string<C>& fmt,
+                    std::regex_constants::match_flag_type flags)
+  {
+    using namespace std;
+
+    using string_type = basic_string<C>;
+    using str_it      = typename string_type::const_iterator;
+    using regex_it    = regex_iterator<str_it>;
+
+    bool first_only ((flags & std::regex_constants::format_first_only) ==
+                     std::regex_constants::format_first_only);
+
+    locale cl; // Copy of the global C++ locale.
+    string_type r;
+
+    // Beginning of the last unmatched substring.
+    //
+    str_it ub (s.begin ());
+
+    regex_it b (s.begin (), s.end (), re, flags);
+    regex_it e;
+    bool match (b != e);
+
+    for (regex_it i (b); i != e; ++i)
+    {
+      const match_results<str_it>& m (*i);
+
+      // Copy the preceding unmatched substring, save the beginning of the
+      // one that follows.
+      //
+      r.append (ub, m.prefix ().second);
+      ub = m.suffix ().first;
+
+      if (first_only && i != b)
+        r.append (m[0].first, m[0].second); // Append matched substring.
+      else
+      {
+        // The standard implementation calls m.format() here. We perform our
+        // own formatting.
+        //
+        // Note that we are using char type literals with the assumption that
+        // being ASCII characters they will be properly "widened" to the
+        // corresponding literals of the C template parameter type.
+        //
+        auto digit = [] (C c) -> int
+        {
+          return c >= '0' && c <= '9' ? c - '0' : -1;
+        };
+
+        enum class case_conv {none, upper, lower, upper_once, lower_once}
+        mode (case_conv::none);
+
+        auto conv_chr = [&mode, &cl] (C c) -> C
+        {
+          switch (mode)
+          {
+          case case_conv::upper_once: mode = case_conv::none; // Fall through.
+          case case_conv::upper:      c = toupper (c, cl); break;
+          case case_conv::lower_once: mode = case_conv::none; // Fall through.
+          case case_conv::lower:      c = tolower (c, cl); break;
+          case case_conv::none:       break;
+          }
+          return c;
+        };
+
+        auto append_chr = [&r, &conv_chr] (C c)
+        {
+          r.push_back (conv_chr (c));
+        };
+
+        auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e)
+        {
+          // Optimize for the common case.
+          //
+          if (mode == case_conv::none)
+            r.append (b, e);
+          else
+          {
+            for (str_it i (b); i != e; ++i)
+              r.push_back (conv_chr (*i));
+          }
+        };
+
+        size_t n (fmt.size ());
+        for (size_t i (0); i < n; ++i)
+        {
+          C c (fmt[i]);
+
+          switch (c)
+          {
+          case '$':
+            {
+              // Check if this is a $-based escape sequence. Interpret it
+              // accordingly if that's the case, treat '$' as a regular
+              // character otherwise.
+              //
+              c = fmt[++i]; // '\0' if last.
+
+              switch (c)
+              {
+              case '$': append_chr (c); break;
+              case '&': append_str (m[0].first, m[0].second); break;
+              case '`':
+                {
+                  append_str (m.prefix ().first, m.prefix ().second);
+                  break;
+                }
+              case '\'':
+                {
+                  append_str (m.suffix ().first, m.suffix ().second);
+                  break;
+                }
+              default:
+                {
+                  // Check if this is a sub-expression 1-based index ($n or
+                  // $nn). Append the matching substring if that's the case.
+                  // Treat '$' as a regular character otherwise. Index greater
+                  // than the sub-expression count is silently ignored.
+                  //
+                  int si (digit (c));
+                  if (si >= 0)
+                  {
+                    int d;
+                    if ((d = digit (fmt[i + 1])) >= 0) // '\0' if last.
+                    {
+                      si = si * 10 + d;
+                      ++i;
+                    }
+                  }
+                  // NOTE(review): "$00" eats both digits but backs up by one.
+                  if (si > 0)
+                  {
+                    // m[0] refers to the matched substring.
+                    //
+                    if (static_cast<size_t> (si) < m.size ())
+                      append_str (m[si].first, m[si].second);
+                  }
+                  else
+                  {
+                    // Not a $-based escape sequence so treat '$' as a
+                    // regular character.
+                    //
+                    --i;
+                    append_chr ('$');
+                  }
+
+                  break;
+                }
+              }
+
+              break;
+            }
+          case '\\':
+            {
+              c = fmt[++i]; // '\0' if last.
+
+              switch (c)
+              {
+              case '\\': append_chr (c); break;
+
+              case 'u': mode = case_conv::upper_once; break;
+              case 'l': mode = case_conv::lower_once; break;
+              case 'U': mode = case_conv::upper;      break;
+              case 'L': mode = case_conv::lower;      break;
+              case 'E': mode = case_conv::none;       break;
+              default:
+                {
+                  // Check if this is a sub-expression 1-based index. Append
+                  // the matching substring if that's the case. Skip '\\'
+                  // otherwise. Index greater than the sub-expression count is
+                  // silently ignored.
+                  //
+                  int si (digit (c));
+                  if (si > 0)
+                  {
+                    // m[0] refers to the matched substring.
+                    //
+                    if (static_cast<size_t> (si) < m.size ())
+                      append_str (m[si].first, m[si].second);
+                  }
+                  else
+                    --i;
+
+                  break;
+                }
+              }
+
+              break;
+            }
+          default:
+            {
+              // Append a regular character.
+              //
+              append_chr (c);
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    r.append (ub, s.end ()); // Append the rightmost non-matched substring.
+    return make_pair (move (r), match);
+  }
+}
author	Karen Arutyunov <karen@codesynthesis.com>	2017-06-21 13:03:56 +0300
committer	Karen Arutyunov <karen@codesynthesis.com>	2017-06-21 13:03:56 +0300
commit	5d50c0499b30650deafc291a3872a386d08a3200 (patch)
tree	4e0936d2d51fa7d92b3fb7c4b05bfc099bbf0a58
parent	273ce32a9a9c89410d4ab396c1bbdfb9a5024fa8 (diff)