diff options
author | Karen Arutyunov <karen@codesynthesis.com> | 2021-03-23 18:50:55 +0300 |
---|---|---|
committer | Karen Arutyunov <karen@codesynthesis.com> | 2021-03-26 18:13:32 +0300 |
commit | 95c579df686f115c0fd3697f2723fa73476c4584 (patch) | |
tree | 5d76adbcf75692d278b4085c6e996ab58a3e4e25 | |
parent | 5ecdb9a3b5cb85418f69126226b2636caed2e4da (diff) |
Add regex_replace_parse() overloads
-rw-r--r-- | libbutl/builtin.cxx | 75 | ||||
-rw-r--r-- | libbutl/regex.ixx | 17 | ||||
-rw-r--r-- | libbutl/regex.mxx | 36 | ||||
-rw-r--r-- | libbutl/regex.txx | 67 | ||||
-rw-r--r-- | tests/builtin/sed.testscript | 6 | ||||
-rw-r--r-- | tests/regex/driver.cxx | 22 | ||||
-rw-r--r-- | tests/regex/testscript | 67 |
7 files changed, 211 insertions, 79 deletions
diff --git a/libbutl/builtin.cxx b/libbutl/builtin.cxx index 79ff968..a6bb94b 100644 --- a/libbutl/builtin.cxx +++ b/libbutl/builtin.cxx @@ -1632,15 +1632,6 @@ namespace butl string replacement; bool global; bool print; - - subst (const string& re, bool ic, string rp, bool gl, bool pr) - // - // Note that ECMAScript is implied if no grammar flag is specified. - // - : regex (re, ic ? regex::icase : regex::ECMAScript), - replacement (move (rp)), - global (gl), - print (pr) {} }; small_vector<subst, 1> substs; @@ -1663,57 +1654,59 @@ namespace butl if (delim == '\\' || delim == '\n') fail () << "invalid delimiter for 's' command in '" << v << "'"; - size_t p (v.find (delim, 2)); - if (p == string::npos) - fail () << "unterminated 's' command regex in '" << v << "'"; - - string regex (v, 2, p - 2); - - // Empty regex matches nothing, so not of much use. - // - if (regex.empty ()) - fail () << "empty regex in 's' command in '" << v << "'"; - - size_t b (p + 1); - p = v.find (delim, b); - if (p == string::npos) - fail () << "unterminated 's' command replacement in '" << v << "'"; - - string replacement (v, b, p - b); - - // Parse the substitute command flags. + // Parse the substitute command regex (as string), replacement, and + // flags. 
// + pair<string, string> rf; bool icase (false); bool global (false); bool print (false); - char c; - for (++p; (c = v[p]) != '\0'; ++p) + try { - switch (c) + size_t e; + rf = regex_replace_parse (v.c_str () + 1, v.size () - 1, e); + + char c; + for (size_t i (e + 1); (c = v[i]) != '\0'; ++i) { - case 'i': icase = true; break; - case 'g': global = true; break; - case 'p': print = true; break; - default: + switch (c) { - fail () << "invalid 's' command flag '" << c << "' in '" << v - << "'"; + case 'i': icase = true; break; + case 'g': global = true; break; + case 'p': print = true; break; + default: + { + fail () << "invalid 's' command flag '" << c << "' in '" << v + << "'"; + } } } } + catch (const invalid_argument& e) + { + fail () << "invalid 's' command '" << v << "': " << e; + } + // Parse the regex and add the substitution to the list. + // try { - substs.emplace_back (regex, icase, - move (replacement), - global, print); + // Note that ECMAScript is implied if no grammar flag is specified. + // + regex re (rf.first, icase ? regex::icase : regex::ECMAScript); + + substs.push_back ({move (re), + move (rf.second), + global, + print}); } catch (const regex_error& e) { // Print regex_error description if meaningful (no space). // - fail () << "invalid regex '" << regex << "' in '" << v << "'" << e; + fail () << "invalid regex '" << rf.first << "' in '" << v << "'" + << e; } } diff --git a/libbutl/regex.ixx b/libbutl/regex.ixx index dec15d1..805acd1 100644 --- a/libbutl/regex.ixx +++ b/libbutl/regex.ixx @@ -21,4 +21,21 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. 
return make_pair (move (r), match); } + + template <typename C> + inline std::pair<std::basic_regex<C>, std::basic_string<C>> + regex_replace_parse (const std::basic_string<C>& s, + std::regex_constants::syntax_option_type f) + { + return regex_replace_parse (s.c_str (), s.size (), f); + } + + template <typename C> + inline std::pair<std::basic_regex<C>, std::basic_string<C>> + regex_replace_parse (const C* s, + std::regex_constants::syntax_option_type f) + { + return regex_replace_parse ( + s, std::basic_string<C>::traits_type::length (s), f); + } } diff --git a/libbutl/regex.mxx b/libbutl/regex.mxx index 84b024f..b5490b1 100644 --- a/libbutl/regex.mxx +++ b/libbutl/regex.mxx @@ -14,8 +14,9 @@ #include <utility> // pair #include <locale> -#include <cstddef> // size_t -#include <utility> // move(), make_pair() +#include <cstddef> // size_t +#include <utility> // move(), make_pair() +#include <stdexcept> // invalid_argument #endif #if defined(__clang__) @@ -93,6 +94,37 @@ LIBBUTL_MODEXPORT namespace butl regex_replace_match (const std::basic_string<C>&, const std::basic_regex<C>&, const std::basic_string<C>& fmt); + + // Parse the '/<regex>/<format>/' replacement string into the regex/format + // pair. Another character can be used as the delimiter instead of '/'. Throw + // std::invalid_argument or std::regex_error on parsing error. + // + // Note: escaping of the delimiter character is not (yet) supported. 
+ // + template <typename C> + std::pair<std::basic_regex<C>, std::basic_string<C>> + regex_replace_parse (const std::basic_string<C>&, + std::regex_constants::syntax_option_type = + std::regex_constants::ECMAScript); + + template <typename C> + std::pair<std::basic_regex<C>, std::basic_string<C>> + regex_replace_parse (const C*, + std::regex_constants::syntax_option_type = + std::regex_constants::ECMAScript); + + template <typename C> + std::pair<std::basic_regex<C>, std::basic_string<C>> + regex_replace_parse (const C*, size_t, + std::regex_constants::syntax_option_type = + std::regex_constants::ECMAScript); + + // As above but return the regex as a string and do not fail if there is + // text after the last delimiter, instead returning its position. + // + template <typename C> + std::pair<std::basic_string<C>, std::basic_string<C>> + regex_replace_parse (const C*, size_t, size_t& end); } LIBBUTL_MODEXPORT namespace std diff --git a/libbutl/regex.txx b/libbutl/regex.txx index b785708..aa845be 100644 --- a/libbutl/regex.txx +++ b/libbutl/regex.txx @@ -278,4 +278,71 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. return match; } + + template <typename C> + std::pair<std::basic_regex<C>, std::basic_string<C>> + regex_replace_parse (const C* s, size_t n, + std::regex_constants::syntax_option_type f) + { + using namespace std; + + using string_type = basic_string<C>; + + size_t e; + pair<string_type, string_type> r (regex_replace_parse (s, n, e)); + + if (e != n) + throw invalid_argument ("junk after trailing delimiter"); + + return make_pair (basic_regex<C> (r.first, f), move (r.second)); + } + + template <typename C> + std::pair<std::basic_string<C>, std::basic_string<C>> + regex_replace_parse (const C* s, size_t n, size_t& e) + { + using namespace std; + + using string_type = basic_string<C>; + + if (n == 0) + throw invalid_argument ("no leading delimiter"); + + const C* b (s); // Save the beginning of the string. 
+ + char delim (s[0]); + + // Position to the regex first character and find the regex-terminating + // delimiter. + // + --n; + ++s; + + const C* p (string_type::traits_type::find (s, n, delim)); + + if (p == nullptr) + throw invalid_argument ("no delimiter after regex"); + + // Empty regex matches nothing, so not of much use. + // + if (p == s) + throw invalid_argument ("empty regex"); + + // Save the regex. + // + string_type re (s, p - s); + + // Position to the format first character and find the trailing delimiter. + // + n -= p - s + 1; + s = p + 1; + + p = string_type::traits_type::find (s, n, delim); + + if (p == nullptr) + throw invalid_argument ("no delimiter after replacement"); + + e = p - b + 1; + return make_pair (move (re), string_type (s, p - s)); + } } diff --git a/tests/builtin/sed.testscript b/tests/builtin/sed.testscript index 7fbc9b2..2ed3088 100644 --- a/tests/builtin/sed.testscript +++ b/tests/builtin/sed.testscript @@ -166,13 +166,13 @@ test.options += -c : unterminated : $* -e 's/foo' 2>>EOE != 0 - sed: unterminated 's' command regex in 's/foo' + sed: invalid 's' command 's/foo': no delimiter after regex EOE : empty : $* -e 's///' 2>>EOE != 0 - sed: empty regex in 's' command in 's///' + sed: invalid 's' command 's///': empty regex EOE : invalid @@ -188,7 +188,7 @@ test.options += -c : unterminated-replacement : $* -e 's/foo/bar' 2>>EOE != 0 - sed: unterminated 's' command replacement in 's/foo/bar' + sed: invalid 's' command 's/foo/bar': no delimiter after replacement EOE : invalid-flags diff --git a/tests/regex/driver.cxx b/tests/regex/driver.cxx index f78a100..cb59cd8 100644 --- a/tests/regex/driver.cxx +++ b/tests/regex/driver.cxx @@ -4,8 +4,11 @@ #include <cassert> #ifndef __cpp_lib_modules_ts +#include <regex> #include <string> +#include <utility> // pair #include <iostream> +#include <stdexcept> // invalid_argument #include <exception> #endif @@ -27,7 +30,7 @@ import butl.utility; // operator<<(ostream, exception) using namespace 
std; using namespace butl; -// Usage: argv[0] [-ffo] [-fnc] [-m] <string> <regex> <format> +// Usage: argv[0] [-ffo] [-fnc] [-m] <string> "/<regex>/<format>/" // // Perform substitution of matched substrings with formatted replacement // strings using regex_replace_*() functions. If the string matches the regex @@ -66,11 +69,13 @@ try break; } - assert (i + 3 == argc); + assert (i + 2 == argc); - string s (argv[i++]); - regex re (argv[i++]); - string fmt (argv[i]); + string s (argv[i++]); + pair<regex, string> rf (regex_replace_parse (argv[i])); + + const regex& re (rf.first); + const string& fmt (rf.second); auto r (match ? regex_replace_match (s, re, fmt) @@ -86,8 +91,13 @@ catch (const regex_error& e) cerr << "invalid regex" << e << endl; // Print sanitized. return 2; } -catch (const exception& e) +catch (const invalid_argument& e) { cerr << e << endl; return 2; } +catch (const exception&) +{ + assert (false); + return 2; +} diff --git a/tests/regex/testscript b/tests/regex/testscript index fbee1d6..93ad4b6 100644 --- a/tests/regex/testscript +++ b/tests/regex/testscript @@ -4,38 +4,38 @@ : replace-search : { - $* abcbd b x >axcxd : all - $* -ffo abcbd b x >axcbd : first-only - $* -fnc abcbd b x >xx : no-copy + $* abcbd /b/x/ >axcxd : all + $* -ffo abcbd /b/x/ >axcbd : first-only + $* -fnc abcbd /b/x/ >xx : no-copy : ecma-escape : { - $* xay a '$b' >'x$by' : none - $* xay a '$' >'x$y' : none-term - $* xay a '$$' >'x$y' : self - $* xay a 'b$&c' >'xbacy' : match - $* xay a 'b$`c' >'xbxcy' : match-precede - $* xay a "b\\\$'c" >'xbycy' : match-follow + $* xay '/a/$b/' >'x$by' : none + $* xay '/a/$/' >'x$y' : none-term + $* xay '/a/$$/' >'x$y' : self + $* xay '/a/b$&c/' >'xbacy' : match + $* xay '/a/b$`c/' >'xbxcy' : match-precede + $* xay "/a/b\\\$'c/" >'xbycy' : match-follow : capture : { - $* abcdefghij '(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)' '$1$10' >aj : matched - $* a '(a)|(b)' '$1$2$3' >a : unmatched + $* abcdefghij '/(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)/$1$10/' >aj : 
matched + $* a '/(a)|(b)/$1$2$3/' >a : unmatched } } : perl-escape : { - $* xay a '\b' >'xby' : none - $* xay a '\' >'xy' : none-term - $* xay a '\\' >'x\y' : self + $* xay '/a/\b/' >'xby' : none + $* xay '/a/\/' >'xy' : none-term + $* xay '/a/\\/' >'x\y' : self : newline : - $* xay a '\n' >>EOO + $* xay '/a/\n/' >>EOO x y EOO @@ -43,25 +43,25 @@ : capture : { - $* abcdefghij '(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)' '\1\10' >aa0 : matched - $* a '(a)|(b)' '\1\2\3' >a : unmatched + $* abcdefghij '/(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)/\1\10/' >aa0 : matched + $* a '/(a)|(b)/\1\2\3/' >a : unmatched } : upper : { - $* xay a '\U' >xy : none - $* xay a '\Uvz' >xVZy : repl - $* xay a '\Uv\Ez' >xVzy : end - $* aa a 'v\Uz' >vZvZ : locality - $* xay '(a)' '\U\1' >xAy : capt - $* x-y '(a?)-' '\U\1z' >xZy : capt-empty - $* xay a '\uvz' >xVzy : once + $* xay '/a/\U/' >xy : none + $* xay '/a/\Uvz/' >xVZy : repl + $* xay '/a/\Uv\Ez/' >xVzy : end + $* aa '/a/v\Uz/' >vZvZ : locality + $* xay '/(a)/\U\1/' >xAy : capt + $* x-y '/(a?)-/\U\1z/' >xZy : capt-empty + $* xay '/a/\uvz/' >xVzy : once } : lower : - $* xay a '\lVZ' >xvZy + $* xay '/a/\lVZ/' >xvZy } } @@ -70,6 +70,19 @@ { test.options += -m - $* abc 'a(b)c' 'x\1y' >xby : match - $* abcd 'a(b)c' 'x\1yd' == 1 : no-match + $* abc '/a(b)c/x\1y/' >xby : match + $* abcd '/a(b)c/x\1yd/' == 1 : no-match +} + +: invalid-regex-fmt +: +{ + test.arguments += '' # Note: we will fail before the matching. + + $* '' 2> 'no leading delimiter' != 0 : no-leading-delim + $* '/a' 2> 'no delimiter after regex' != 0 : no-mid-delim + $* '//' 2> 'empty regex' != 0 : no-regex + $* '/a[b/c/' 2>~'/invalid regex.*/' != 0 : regex + $* '/a/b' 2> 'no delimiter after replacement' != 0 : no-trailing-delim + $* '/a/b/s' 2> 'junk after trailing delimiter' != 0 : junk } |