From 0befab300849be7ac0f77bc4228f8de50a108191 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Fri, 25 Oct 2024 18:35:43 +0200 Subject: Make regex_replace_search() not to match empty substrings in non-empty strings --- libbutl/regex.hxx | 2 +- libbutl/regex.txx | 19 +++++++++++++++++++ tests/regex/testscript | 23 +++++++++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/libbutl/regex.hxx b/libbutl/regex.hxx index 9b31075..69009c3 100644 --- a/libbutl/regex.hxx +++ b/libbutl/regex.hxx @@ -52,7 +52,7 @@ namespace butl const std::basic_string<C>& fmt, F&& append, std::regex_constants::match_flag_type = - std::regex_constants::match_default); + std::regex_constants::match_default); // As above but concatenate non-matched substrings and matched substring // replacements into a string returning it as well as whether the search diff --git a/libbutl/regex.txx b/libbutl/regex.txx index 214d949..ec9f7af 100644 --- a/libbutl/regex.txx +++ b/libbutl/regex.txx @@ -217,6 +217,25 @@ namespace butl bool first_only ((flags & regex_constants::format_first_only) != 0); bool no_copy ((flags & regex_constants::format_no_copy) != 0); + // Note that by default the std::regex_search(), std::regex_replace(), and + // std::regex_iterator() functions match empty substrings in non-empty + // strings for all the major implementations. For example: + // + // - regex_search("bb", "a*") call returns true. + // + // - regex_replace("bb", "a*", "x") call returns "xbxbx". + // + // - regex_replace("a", ".*", "x") call returns "xx". + // + // - Iterating using the regex_iterator("a", ".*") object ends up with + // two matches: "a" and "". + // + // Since such behavior feels counter-intuitive, we suppress it using the + // match_not_null flag, except for the empty string. + // + if (!s.empty ()) + flags |= regex_constants::match_not_null; + // Beginning of the last unmatched substring. 
// str_it ub (s.begin ()); diff --git a/tests/regex/testscript b/tests/regex/testscript index 93ad4b6..137469d 100644 --- a/tests/regex/testscript +++ b/tests/regex/testscript @@ -63,6 +63,21 @@ : $* xay '/a/\lVZ/' >xvZy } + + : empty-substring + : + : Note that the regex search-based replacement with the match_not_null flag + : is broken for older versions of libstdc++ and libc++ (may ignore + : match_not_null for the former and may hang for some string/pattern for the + : latter). + : + if (($cxx.id != 'gcc' || $cxx.version.major >= 7) && \ + ($cxx.id != 'clang' || $cxx.version.major >= 6)) + { + $* '' '/.*/x/' >'x' : empty + $* a '/a*/x/' >'x' : match + $* aa '/b*/x/' == 1 : no-match + } } : replace-match @@ -72,6 +87,14 @@ $* abc '/a(b)c/x\1y/' >xby : match $* abcd '/a(b)c/x\1yd/' == 1 : no-match + + : empty-substring + : + { + $* '' '/.*/x/' >'x' : empty + $* a '/a*/x/' >'x' : match + $* ab '/a(c*)(b)/\1\2/' >'b' : match-mid + } } : invalid-regex-fmt -- cgit v1.1