aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Karen Arutyunov <karen@codesynthesis.com> 2021-03-23 18:50:55 +0300
committer Karen Arutyunov <karen@codesynthesis.com> 2021-03-26 18:13:32 +0300
commit 95c579df686f115c0fd3697f2723fa73476c4584 (patch)
tree 5d76adbcf75692d278b4085c6e996ab58a3e4e25
parent 5ecdb9a3b5cb85418f69126226b2636caed2e4da (diff)
Add regex_replace_parse() overloads
-rw-r--r-- libbutl/builtin.cxx 75
-rw-r--r-- libbutl/regex.ixx 17
-rw-r--r-- libbutl/regex.mxx 36
-rw-r--r-- libbutl/regex.txx 67
-rw-r--r-- tests/builtin/sed.testscript 6
-rw-r--r-- tests/regex/driver.cxx 22
-rw-r--r-- tests/regex/testscript 67
7 files changed, 211 insertions, 79 deletions
diff --git a/libbutl/builtin.cxx b/libbutl/builtin.cxx
index 79ff968..a6bb94b 100644
--- a/libbutl/builtin.cxx
+++ b/libbutl/builtin.cxx
@@ -1632,15 +1632,6 @@ namespace butl
string replacement;
bool global;
bool print;
-
- subst (const string& re, bool ic, string rp, bool gl, bool pr)
- //
- // Note that ECMAScript is implied if no grammar flag is specified.
- //
- : regex (re, ic ? regex::icase : regex::ECMAScript),
- replacement (move (rp)),
- global (gl),
- print (pr) {}
};
small_vector<subst, 1> substs;
@@ -1663,57 +1654,59 @@ namespace butl
if (delim == '\\' || delim == '\n')
fail () << "invalid delimiter for 's' command in '" << v << "'";
- size_t p (v.find (delim, 2));
- if (p == string::npos)
- fail () << "unterminated 's' command regex in '" << v << "'";
-
- string regex (v, 2, p - 2);
-
- // Empty regex matches nothing, so not of much use.
- //
- if (regex.empty ())
- fail () << "empty regex in 's' command in '" << v << "'";
-
- size_t b (p + 1);
- p = v.find (delim, b);
- if (p == string::npos)
- fail () << "unterminated 's' command replacement in '" << v << "'";
-
- string replacement (v, b, p - b);
-
- // Parse the substitute command flags.
+ // Parse the substitute command regex (as string), replacement, and
+ // flags.
//
+ pair<string, string> rf;
bool icase (false);
bool global (false);
bool print (false);
- char c;
- for (++p; (c = v[p]) != '\0'; ++p)
+ try
{
- switch (c)
+ size_t e;
+ rf = regex_replace_parse (v.c_str () + 1, v.size () - 1, e);
+
+ char c;
+ for (size_t i (e + 1); (c = v[i]) != '\0'; ++i)
{
- case 'i': icase = true; break;
- case 'g': global = true; break;
- case 'p': print = true; break;
- default:
+ switch (c)
{
- fail () << "invalid 's' command flag '" << c << "' in '" << v
- << "'";
+ case 'i': icase = true; break;
+ case 'g': global = true; break;
+ case 'p': print = true; break;
+ default:
+ {
+ fail () << "invalid 's' command flag '" << c << "' in '" << v
+ << "'";
+ }
}
}
}
+ catch (const invalid_argument& e)
+ {
+ fail () << "invalid 's' command '" << v << "': " << e;
+ }
+ // Parse the regex and add the substitution to the list.
+ //
try
{
- substs.emplace_back (regex, icase,
- move (replacement),
- global, print);
+ // Note that ECMAScript is implied if no grammar flag is specified.
+ //
+ regex re (rf.first, icase ? regex::icase : regex::ECMAScript);
+
+ substs.push_back ({move (re),
+ move (rf.second),
+ global,
+ print});
}
catch (const regex_error& e)
{
// Print regex_error description if meaningful (no space).
//
- fail () << "invalid regex '" << regex << "' in '" << v << "'" << e;
+ fail () << "invalid regex '" << rf.first << "' in '" << v << "'"
+ << e;
}
}
diff --git a/libbutl/regex.ixx b/libbutl/regex.ixx
index dec15d1..805acd1 100644
--- a/libbutl/regex.ixx
+++ b/libbutl/regex.ixx
@@ -21,4 +21,21 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
return make_pair (move (r), match);
}
+
+ // Convenience overload: parse the whole string by forwarding its character
+ // data and size to the (pointer, size, flags) overload.
+ //
+ template <typename C>
+ inline std::pair<std::basic_regex<C>, std::basic_string<C>>
+ regex_replace_parse (const std::basic_string<C>& s,
+                      std::regex_constants::syntax_option_type f)
+ {
+   const C* d (s.c_str ());
+   const std::size_t n (s.size ());
+
+   return regex_replace_parse (d, n, f);
+ }
+
+ // Convenience overload: parse a NUL-terminated string, calculating its
+ // length with the string's character traits and forwarding to the
+ // (pointer, size, flags) overload.
+ //
+ template <typename C>
+ inline std::pair<std::basic_regex<C>, std::basic_string<C>>
+ regex_replace_parse (const C* s,
+                      std::regex_constants::syntax_option_type f)
+ {
+   using traits = typename std::basic_string<C>::traits_type;
+
+   return regex_replace_parse (s, traits::length (s), f);
+ }
}
diff --git a/libbutl/regex.mxx b/libbutl/regex.mxx
index 84b024f..b5490b1 100644
--- a/libbutl/regex.mxx
+++ b/libbutl/regex.mxx
@@ -14,8 +14,9 @@
#include <utility> // pair
#include <locale>
-#include <cstddef> // size_t
-#include <utility> // move(), make_pair()
+#include <cstddef> // size_t
+#include <utility> // move(), make_pair()
+#include <stdexcept> // invalid_argument
#endif
#if defined(__clang__)
@@ -93,6 +94,37 @@ LIBBUTL_MODEXPORT namespace butl
regex_replace_match (const std::basic_string<C>&,
const std::basic_regex<C>&,
const std::basic_string<C>& fmt);
+
+ // Parse the '/<regex>/<format>/' replacement string into the regex/format
+ // pair. Other characters can be used as the delimiter instead of '/'. Throw
+ // std::invalid_argument or std::regex_error on parsing error.
+ //
+ // Note: escaping of the delimiter character is not (yet) supported.
+ //
+ template <typename C>
+ std::pair<std::basic_regex<C>, std::basic_string<C>>
+ regex_replace_parse (const std::basic_string<C>&,
+ std::regex_constants::syntax_option_type =
+ std::regex_constants::ECMAScript);
+
+ template <typename C>
+ std::pair<std::basic_regex<C>, std::basic_string<C>>
+ regex_replace_parse (const C*,
+ std::regex_constants::syntax_option_type =
+ std::regex_constants::ECMAScript);
+
+ template <typename C>
+ std::pair<std::basic_regex<C>, std::basic_string<C>>
+ regex_replace_parse (const C*, size_t,
+ std::regex_constants::syntax_option_type =
+ std::regex_constants::ECMAScript);
+
+ // As above but return the regex as a string and, instead of failing if there
+ // is text after the last delimiter, return the position of that text in end.
+ //
+ template <typename C>
+ std::pair<std::basic_string<C>, std::basic_string<C>>
+ regex_replace_parse (const C*, size_t, size_t& end);
}
LIBBUTL_MODEXPORT namespace std
diff --git a/libbutl/regex.txx b/libbutl/regex.txx
index b785708..aa845be 100644
--- a/libbutl/regex.txx
+++ b/libbutl/regex.txx
@@ -278,4 +278,71 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
return match;
}
+
+ // Parse the n-character '<delim><regex><delim><format><delim>' string s
+ // into the compiled regex (using the syntax flags f) and the format string.
+ //
+ // Throw std::invalid_argument if the string is malformed or anything
+ // follows the trailing delimiter, and std::regex_error if the regex itself
+ // is invalid.
+ //
+ template <typename C>
+ std::pair<std::basic_regex<C>, std::basic_string<C>>
+ regex_replace_parse (const C* s, size_t n,
+                      std::regex_constants::syntax_option_type f)
+ {
+   using namespace std;
+
+   // Split the string into the regex and format parts, obtaining the position
+   // right past the trailing delimiter.
+   //
+   size_t end;
+   pair<basic_string<C>, basic_string<C>> parts (
+     regex_replace_parse (s, n, end));
+
+   // The trailing delimiter must be the last character of the string.
+   //
+   if (end != n)
+     throw invalid_argument ("junk after trailing delimiter");
+
+   basic_regex<C> re (parts.first, f);
+   return make_pair (move (re), move (parts.second));
+ }
+
+ // Parse the '<delim><regex><delim><format><delim>' prefix of the n-character
+ // string s and return the regex (as a string) and the format. Set e to the
+ // position, relative to the beginning of s, right past the trailing
+ // delimiter. Any text from that position on is left to the caller.
+ //
+ // The first character of the string is used as the delimiter. Throw
+ // std::invalid_argument if the string is empty, if either of the two
+ // subsequent delimiters is missing, or if the regex is empty.
+ //
+ template <typename C>
+ std::pair<std::basic_string<C>, std::basic_string<C>>
+ regex_replace_parse (const C* s, size_t n, size_t& e)
+ {
+   using namespace std;
+
+   using string_type = basic_string<C>;
+
+   if (n == 0)
+     throw invalid_argument ("no leading delimiter");
+
+   const C* b (s); // Save the beginning of the string.
+
+   // Note: keep the delimiter in the string's own character type. Storing it
+   // as char would truncate a wide delimiter that is outside of the char
+   // range and make us search for the wrong character.
+   //
+   C delim (s[0]);
+
+   // Position to the regex first character and find the regex-terminating
+   // delimiter.
+   //
+   --n;
+   ++s;
+
+   const C* p (string_type::traits_type::find (s, n, delim));
+
+   if (p == nullptr)
+     throw invalid_argument ("no delimiter after regex");
+
+   // An empty regex is not of much use, so reject it.
+   //
+   if (p == s)
+     throw invalid_argument ("empty regex");
+
+   // Save the regex.
+   //
+   string_type re (s, p - s);
+
+   // Position to the format first character and find the trailing delimiter.
+   //
+   n -= p - s + 1;
+   s = p + 1;
+
+   p = string_type::traits_type::find (s, n, delim);
+
+   if (p == nullptr)
+     throw invalid_argument ("no delimiter after replacement");
+
+   // Return the position right past the trailing delimiter.
+   //
+   e = p - b + 1;
+   return make_pair (move (re), string_type (s, p - s));
+ }
}
diff --git a/tests/builtin/sed.testscript b/tests/builtin/sed.testscript
index 7fbc9b2..2ed3088 100644
--- a/tests/builtin/sed.testscript
+++ b/tests/builtin/sed.testscript
@@ -166,13 +166,13 @@ test.options += -c
: unterminated
:
$* -e 's/foo' 2>>EOE != 0
- sed: unterminated 's' command regex in 's/foo'
+ sed: invalid 's' command 's/foo': no delimiter after regex
EOE
: empty
:
$* -e 's///' 2>>EOE != 0
- sed: empty regex in 's' command in 's///'
+ sed: invalid 's' command 's///': empty regex
EOE
: invalid
@@ -188,7 +188,7 @@ test.options += -c
: unterminated-replacement
:
$* -e 's/foo/bar' 2>>EOE != 0
- sed: unterminated 's' command replacement in 's/foo/bar'
+ sed: invalid 's' command 's/foo/bar': no delimiter after replacement
EOE
: invalid-flags
diff --git a/tests/regex/driver.cxx b/tests/regex/driver.cxx
index f78a100..cb59cd8 100644
--- a/tests/regex/driver.cxx
+++ b/tests/regex/driver.cxx
@@ -4,8 +4,11 @@
#include <cassert>
#ifndef __cpp_lib_modules_ts
+#include <regex>
#include <string>
+#include <utility> // pair
#include <iostream>
+#include <stdexcept> // invalid_argument
#include <exception>
#endif
@@ -27,7 +30,7 @@ import butl.utility; // operator<<(ostream, exception)
using namespace std;
using namespace butl;
-// Usage: argv[0] [-ffo] [-fnc] [-m] <string> <regex> <format>
+// Usage: argv[0] [-ffo] [-fnc] [-m] <string> "/<regex>/<format>/"
//
// Perform substitution of matched substrings with formatted replacement
// strings using regex_replace_*() functions. If the string matches the regex
@@ -66,11 +69,13 @@ try
break;
}
- assert (i + 3 == argc);
+ assert (i + 2 == argc);
- string s (argv[i++]);
- regex re (argv[i++]);
- string fmt (argv[i]);
+ string s (argv[i++]);
+ pair<regex, string> rf (regex_replace_parse (argv[i]));
+
+ const regex& re (rf.first);
+ const string& fmt (rf.second);
auto r (match
? regex_replace_match (s, re, fmt)
@@ -86,8 +91,13 @@ catch (const regex_error& e)
cerr << "invalid regex" << e << endl; // Print sanitized.
return 2;
}
-catch (const exception& e)
+catch (const invalid_argument& e)
{
cerr << e << endl;
return 2;
}
+catch (const exception&)
+{
+ assert (false);
+ return 2;
+}
diff --git a/tests/regex/testscript b/tests/regex/testscript
index fbee1d6..93ad4b6 100644
--- a/tests/regex/testscript
+++ b/tests/regex/testscript
@@ -4,38 +4,38 @@
: replace-search
:
{
- $* abcbd b x >axcxd : all
- $* -ffo abcbd b x >axcbd : first-only
- $* -fnc abcbd b x >xx : no-copy
+ $* abcbd /b/x/ >axcxd : all
+ $* -ffo abcbd /b/x/ >axcbd : first-only
+ $* -fnc abcbd /b/x/ >xx : no-copy
: ecma-escape
:
{
- $* xay a '$b' >'x$by' : none
- $* xay a '$' >'x$y' : none-term
- $* xay a '$$' >'x$y' : self
- $* xay a 'b$&c' >'xbacy' : match
- $* xay a 'b$`c' >'xbxcy' : match-precede
- $* xay a "b\\\$'c" >'xbycy' : match-follow
+ $* xay '/a/$b/' >'x$by' : none
+ $* xay '/a/$/' >'x$y' : none-term
+ $* xay '/a/$$/' >'x$y' : self
+ $* xay '/a/b$&c/' >'xbacy' : match
+ $* xay '/a/b$`c/' >'xbxcy' : match-precede
+ $* xay "/a/b\\\$'c/" >'xbycy' : match-follow
: capture
:
{
- $* abcdefghij '(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)' '$1$10' >aj : matched
- $* a '(a)|(b)' '$1$2$3' >a : unmatched
+ $* abcdefghij '/(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)/$1$10/' >aj : matched
+ $* a '/(a)|(b)/$1$2$3/' >a : unmatched
}
}
: perl-escape
:
{
- $* xay a '\b' >'xby' : none
- $* xay a '\' >'xy' : none-term
- $* xay a '\\' >'x\y' : self
+ $* xay '/a/\b/' >'xby' : none
+ $* xay '/a/\/' >'xy' : none-term
+ $* xay '/a/\\/' >'x\y' : self
: newline
:
- $* xay a '\n' >>EOO
+ $* xay '/a/\n/' >>EOO
x
y
EOO
@@ -43,25 +43,25 @@
: capture
:
{
- $* abcdefghij '(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)' '\1\10' >aa0 : matched
- $* a '(a)|(b)' '\1\2\3' >a : unmatched
+ $* abcdefghij '/(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)/\1\10/' >aa0 : matched
+ $* a '/(a)|(b)/\1\2\3/' >a : unmatched
}
: upper
:
{
- $* xay a '\U' >xy : none
- $* xay a '\Uvz' >xVZy : repl
- $* xay a '\Uv\Ez' >xVzy : end
- $* aa a 'v\Uz' >vZvZ : locality
- $* xay '(a)' '\U\1' >xAy : capt
- $* x-y '(a?)-' '\U\1z' >xZy : capt-empty
- $* xay a '\uvz' >xVzy : once
+ $* xay '/a/\U/' >xy : none
+ $* xay '/a/\Uvz/' >xVZy : repl
+ $* xay '/a/\Uv\Ez/' >xVzy : end
+ $* aa '/a/v\Uz/' >vZvZ : locality
+ $* xay '/(a)/\U\1/' >xAy : capt
+ $* x-y '/(a?)-/\U\1z/' >xZy : capt-empty
+ $* xay '/a/\uvz/' >xVzy : once
}
: lower
:
- $* xay a '\lVZ' >xvZy
+ $* xay '/a/\lVZ/' >xvZy
}
}
@@ -70,6 +70,19 @@
{
test.options += -m
- $* abc 'a(b)c' 'x\1y' >xby : match
- $* abcd 'a(b)c' 'x\1yd' == 1 : no-match
+ $* abc '/a(b)c/x\1y/' >xby : match
+ $* abcd '/a(b)c/x\1yd/' == 1 : no-match
+}
+
+: invalid-regex-fmt
+:
+{
+ test.arguments += '' # Note: we will fail before the matching.
+
+ $* '' 2> 'no leading delimiter' != 0 : no-leading-delim
+ $* '/a' 2> 'no delimiter after regex' != 0 : no-mid-delim
+ $* '//' 2> 'empty regex' != 0 : no-regex
+ $* '/a[b/c/' 2>~'/invalid regex.*/' != 0 : regex
+ $* '/a/b' 2> 'no delimiter after replacement' != 0 : no-trailing-delim
+ $* '/a/b/s' 2> 'junk after trailing delimiter' != 0 : junk
}