From 32df5529f791760161a027a1a7408bc92976a3cd Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Wed, 30 Aug 2017 13:18:50 +0300 Subject: Add $regex.split(), $regex.merge() and $regex.apply() functions --- build2/functions-regex.cxx | 379 +++++++++++++++++++++++++++++++++++++--- tests/function/regex/testscript | 96 ++++++++++ 2 files changed, 452 insertions(+), 23 deletions(-) diff --git a/build2/functions-regex.cxx b/build2/functions-regex.cxx index bdecc14..3185b85 100644 --- a/build2/functions-regex.cxx +++ b/build2/functions-regex.cxx @@ -97,7 +97,7 @@ namespace build2 } } - return value (r); + return value (move (r)); } // Determine if there is a match between the regular expression and some @@ -164,20 +164,12 @@ namespace build2 } } - return value (r); + return value (move (r)); } - // Replace matched parts in a value of an arbitrary type, using the format - // string. See replace() overloads (below) for details. - // - static names - replace (value&& v, - const string& re, - const string& fmt, - optional&& flags) + static pair + parse_replacement_flags (optional&& flags, bool first_only = true) { - // Parse flags. - // regex::flag_type rf (regex::ECMAScript); regex_constants::match_flag_type mf (regex_constants::match_default); @@ -189,7 +181,7 @@ namespace build2 if (s == "icase") rf |= regex::icase; - else if (s == "format_first_only") + else if (first_only && s == "format_first_only") mf |= regex_constants::format_first_only; else if (s == "format_no_copy") mf |= regex_constants::format_no_copy; @@ -198,18 +190,29 @@ namespace build2 } } - // Parse regex. - // - regex rge (parse_regex (re, rf)); + return make_pair (rf, mf); + } + + // Replace matched parts in a value of an arbitrary type, using the format + // string. See replace() overloads (below) for details. + // + static names + replace (value&& v, + const string& re, + const string& fmt, + optional&& flags) + { + auto fl (parse_replacement_flags (move (flags))); + regex rge (parse_regex (re, fl.first)); - // Replace. - // names r; try { - string s (to_string (move (v))); - r.emplace_back (regex_replace_ex (s, rge, fmt, mf).first); + r.emplace_back (regex_replace_ex (to_string (move (v)), + rge, + fmt, + fl.second).first); } catch (const regex_error& e) { @@ -219,6 +222,120 @@ namespace build2 return r; } + // Split a value of an arbitrary type into a list of unmatched value parts + // and replacements of the matched parts. See split() overloads (below) for + // details. + // + static names + split (value&& v, + const string& re, + const string& fmt, + optional&& flags) + { + auto fl (parse_replacement_flags (move (flags), false)); + regex rge (parse_regex (re, fl.first)); + + names r; + + try + { + regex_replace_ex (to_string (move (v)), rge, fmt, + [&r] (string::const_iterator b, + string::const_iterator e) + { + if (b != e) + r.emplace_back (string (b, e)); + }, + fl.second); + } + catch (const regex_error& e) + { + fail << "unable to split" << e; + } + + return r; + } + + // Replace matched parts of list elements using the format string. See + // apply() overloads (below) for details. + // + static names + apply (names&& s, + const string& re, + const string& fmt, + optional&& flags) + { + auto fl (parse_replacement_flags (move (flags))); + regex rge (parse_regex (re, fl.first)); + + names r; + + try + { + for (auto& v: s) + { + string s (regex_replace_ex (convert (move (v)), + rge, + fmt, + fl.second).first); + + if (!s.empty ()) + r.emplace_back (move (s)); + } + } + catch (const regex_error& e) + { + fail << "unable to apply" << e; + } + + return r; + } + + // Replace matched parts of list elements using the format string and + // concatenate the transformed elements. See merge() overloads (below) for + // details. + // + static names + merge (names&& s, + const string& re, + const string& fmt, + optional&& delim, + optional&& flags) + { + auto fl (parse_replacement_flags (move (flags))); + regex rge (parse_regex (re, fl.first)); + + string rs; + + try + { + for (auto& v: s) + { + string s (regex_replace_ex (convert (move (v)), + rge, + fmt, + fl.second).first); + + if (!s.empty ()) + { + if (!rs.empty () && delim) + rs.append (*delim); + + rs.append (s); + } + + } + } + catch (const regex_error& e) + { + fail << "unable to merge" << e; + } + + names r; + r.emplace_back (move (rs)); + return r; + } + void regex_functions () { @@ -286,7 +403,7 @@ namespace build2 // is always untyped, regardless of the argument type. // // Substitution escape sequences are extended with a subset of Perl - // sequences (see regex_replace_ex() for details). + // sequences (see butl::regex_replace_ex() for details). // // The following flags are supported: // @@ -294,10 +411,10 @@ namespace build2 // // format_first_only - only replace the first match // - // format_no_copy - do not copy unmatched value parts to the result + // format_no_copy - do not copy unmatched value parts into the result // // If both format_first_only and format_no_copy flags are specified then - // all the result will contain is the replacement of the first match. + // the result will only contain the replacement of the first match. // f[".replace"] = [](value s, string re, string fmt, optional flags) { @@ -327,5 +444,221 @@ namespace build2 convert (move (fmt)), move (flags)); }; + + // split + // + // Split a value of an arbitrary type into a list of unmatched value parts + // and replacements of the matched parts, omitting empty ones. Convert the + // value to string prior to matching. + // + // Substitution escape sequences are extended with a subset of Perl + // sequences (see butl::regex_replace_ex() for details). + // + // The following flags are supported: + // + // icase - match ignoring case + // + // format_no_copy - do not copy unmatched value parts into the result + // + f[".split"] = [](value s, string re, string fmt, optional flags) + { + return split (move (s), re, fmt, move (flags)); + }; + + f[".split"] = [](value s, string re, names fmt, optional flags) + { + return split (move (s), + re, + convert (move (fmt)), + move (flags)); + }; + + f[".split"] = [](value s, names re, string fmt, optional flags) + { + return split (move (s), + convert (move (re)), + fmt, + move (flags)); + }; + + f[".split"] = [](value s, names re, names fmt, optional flags) + { + return split (move (s), + convert (move (re)), + convert (move (fmt)), + move (flags)); + }; + + // merge + // + // Replace matched parts in a list of elements using the regex format + // string. Convert the elements to string prior to matching. The result + // value is untyped and contains concatenation of transformed non-empty + // elements optionally separated with a delimiter. + // + // Substitution escape sequences are extended with a subset of Perl + // sequences (see butl::regex_replace_ex() for details). + // + // The following flags are supported: + // + // icase - match ignoring case + // + // format_first_only - only replace the first match + // + // format_no_copy - do not copy unmatched value parts into the result + // + // If both format_first_only and format_no_copy flags are specified then + // the result will be a concatenation of only the first match + // replacements. + // + f[".merge"] = [](names s, + string re, + string fmt, + optional delim, + optional flags) + { + return merge (move (s), re, fmt, move (delim), move (flags)); + }; + + f[".merge"] = [](names s, + string re, + names fmt, + optional delim, + optional flags) + { + return merge (move (s), + re, + convert (move (fmt)), + move (delim), + move (flags)); + }; + + f[".merge"] = [](names s, + names re, + string fmt, + optional delim, + optional flags) + { + return merge (move (s), + convert (move (re)), + fmt, + move (delim), + move (flags)); + }; + + f[".merge"] = [](names s, + names re, + names fmt, + optional delim, + optional flags) + { + return merge (move (s), + convert (move (re)), + convert (move (fmt)), + move (delim), + move (flags)); + }; + + f[".merge"] = [](names s, + string re, + string fmt, + names delim, + optional flags) + { + return merge (move (s), + re, + fmt, + convert (move (delim)), + move (flags)); + }; + + f[".merge"] = [](names s, + string re, + names fmt, + names delim, + optional flags) + { + return merge (move (s), + re, + convert (move (fmt)), + convert (move (delim)), + move (flags)); + }; + + f[".merge"] = [](names s, + names re, + string fmt, + names delim, + optional flags) + { + return merge (move (s), + convert (move (re)), + fmt, + convert (move (delim)), + move (flags)); + }; + + f[".merge"] = [](names s, + names re, + names fmt, + names delim, + optional flags) + { + return merge (move (s), + convert (move (re)), + convert (move (fmt)), + convert (move (delim)), + move (flags)); + }; + + // apply + // + // Replace matched parts of each element in a list using the regex format + // string. Convert the elements to string prior to matching. Return a list + // of transformed elements, omitting the empty ones. + // + // Substitution escape sequences are extended with a subset of Perl + // sequences (see butl::regex_replace_ex() for details). + // + // The following flags are supported: + // + // icase - match ignoring case + // + // format_first_only - only replace the first match + // + // format_no_copy - do not copy unmatched value parts into the result + // + // If both format_first_only and format_no_copy flags are specified then + // the result elements will only contain the replacement of the first + // match. + // + f[".apply"] = [](names s, string re, string fmt, optional flags) + { + return apply (move (s), re, fmt, move (flags)); + }; + + f[".apply"] = [](names s, string re, names fmt, optional flags) + { + return apply (move (s), + re, + convert (move (fmt)), + move (flags)); + }; + + f[".apply"] = [](names s, names re, string fmt, optional flags) + { + return apply (move (s), + convert (move (re)), + fmt, + move (flags)); + }; + + f[".apply"] = [](names s, names re, names fmt, optional flags) + { + return apply (move (s), + convert (move (re)), + convert (move (fmt)), + move (flags)); + }; } } diff --git a/tests/function/regex/testscript b/tests/function/regex/testscript index deeefa5..6cce873 100644 --- a/tests/function/regex/testscript +++ b/tests/function/regex/testscript @@ -253,3 +253,99 @@ } } } + +: split +: +{ + : all-parts + : + : Note that 3 parts a printed here ('|abc|', ' ' and '|def|'), separated by + : the space character. + : + $* <'|abc| |def|' + print $regex.split('abc def', '(\S+)', '|\1|') + EOI + + : no-copy + : + : Note that 2 parts a printed here ('|abc|' and '|def|'), separated by the + : space character. + : + $* <'|abc| |def|' + print $regex.split('abc def', '(\S+)', '|\1|', format_no_copy) + EOI + + : unmatched + : + : Note that only unmatched part is printed here (' '). Empty replacements are + : omitted. + : + $* <' ' + print $regex.split('abc def', '(\S+)', '') + EOI + + : include-options + : + { + : quoted + : + $* <'|-Ic:/dir 1| |-IC:/dir2| |-IC:/dir3| |-IC:/dir4| ||' + opts = '"-Ic:/dir 1" "-IC:/dir2" "-IC:/dir3" "-IC:/dir4" ""' + print $regex.split($opts, ' *"([^"]*)" *', '|\1|') + EOI + + : quoted-unquoted + : + : Note that one of the two captures (\1\2) is always empty as they are + : alternative ones. + : + $* <'|-Ic:/dir 1| |-IC:/dir2| |-IC:/dir3| |-IC:/dir4| ||' + opts = '"-Ic:/dir 1" -IC:/dir2 "-IC:/dir3" "-IC:/dir4" ""' + print $regex.split($opts, '"([^"]*)"|([^" ]+)', '|\1\2|', format_no_copy) + EOI + } +} + +: apply +: +{ + : all-parts + : + $* <'xbc cbx' + print $regex.apply(abc cba, 'a', 'x') + EOI + + : omit-empty + : + $* <'bc cb' + print $regex.apply(abc a cba, 'a', '') + EOI +} + +: merge +: +{ + : all-parts + : + $* <'xbccbx' + print $regex.merge(abc cba, 'a', 'x') + EOI + + : omit-empty + : + $* <'bccb' + print $regex.merge(abc a cba, 'a', '') + EOI + + : delim + : + $* <'xbc-cbx' + print $regex.merge(abc cba, 'a', 'x', '-') + EOI + + : string-delim + : + $* <'xbc-cbx' + print $regex.merge(abc cba, 'a', 'x', [string] '-') + EOI +} -- cgit v1.1