From 6dc01f74431a40dac698417d2947f0d8ddbd8c8c Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Mon, 28 Oct 2024 12:52:11 +0200 Subject: Make $regex.*search() functions not to match empty substrings in non empty strings --- doc/manual.cli | 5 ++- doc/testscript.cli | 8 ++-- libbuild2/functions-regex.cxx | 42 +++++++++++++++++++-- tests/function/regex/testscript | 81 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 127 insertions(+), 9 deletions(-) diff --git a/doc/manual.cli b/doc/manual.cli index e16b0fa..8beb4ea 100644 --- a/doc/manual.cli +++ b/doc/manual.cli @@ -6128,8 +6128,9 @@ source ; The \c{$regex.*()} function family contains function that provide comprehensive regular expression matching and substitution facilities. The -supported regular expression flavor is ECMAScript (more specifically, -ECMA-262-based C++11 regular expressions). +supported regular expression flavor is ECMAScript, more precisely, +ECMA-262-based C++11 regular expressions. Note that the \c{match_not_null} +flag is in effect unless the string being matched is empty. In the \c{$regex.*()} functions the substitution escape sequences in the format string (the \ci{fmt} argument) are extended with a subset of the Perl diff --git a/doc/testscript.cli b/doc/testscript.cli index 18dac41..7f7522d 100644 --- a/doc/testscript.cli +++ b/doc/testscript.cli @@ -2372,8 +2372,8 @@ continuations. \h#syntax-regex|Output Regex| Instead of literal text the expected result in output here-strings and -here-documents can be specified as ECMAScript regular expressions (more -specifically, ECMA-262-based C++11 regular expressions). To signal the use of +here-documents can be specified as ECMAScript regular expressions, more +precisely, ECMA-262-based C++11 regular expressions. To signal the use of regular expressions the redirect must end with the \c{~} modifier, for example: @@ -3082,7 +3082,9 @@ illegal. Note that this builtin implementation deviates substantially from POSIX \c{sed} (as described next). Most significantly, the regular expression flavor -is ECMAScript (more specifically, ECMA-262-based C++11 regular expressions). +is ECMAScript, more precisely, ECMA-262-based C++11 regular expressions. Note +that the \c{match_not_null} flag is in effect unless the line being matched +is empty. \dl| diff --git a/libbuild2/functions-regex.cxx b/libbuild2/functions-regex.cxx index c46f6f5..de34d63 100644 --- a/libbuild2/functions-regex.cxx +++ b/libbuild2/functions-regex.cxx @@ -138,12 +138,24 @@ namespace build2 // string s (to_string (move (v))); + // Match flags. + // + // Note that by default std::regex_search() matches the empty substrings + // in non-empty strings for all the major implementations. We suppress + // such a counter-intuitive behavior with the match_not_null flag (see the + // butl::regex_replace_search() function implementation for details). + // + regex_constants::match_flag_type mf (regex_constants::match_default); + + if (!s.empty ()) + mf |= regex_constants::match_not_null; + if (!match && !subs) - return value (regex_search (s, rge)); // Return boolean value. + return value (regex_search (s, rge, mf)); // Return boolean value. match_results m; - if (regex_search (s, m, rge)) + if (regex_search (s, m, rge, mf)) { assert (!m.empty ()); @@ -483,7 +495,19 @@ namespace build2 for (auto& n: ns) { - if (regex_search (convert (move (n)), rge)) + string s (convert (move (n))); + + // Match flags. + // + // Suppress matching of empty substrings in non-empty strings (see above + // for details). + // + regex_constants::match_flag_type mf (regex_constants::match_default); + + if (!s.empty ()) + mf |= regex_constants::match_not_null; + + if (regex_search (s, rge, mf)) return true; } @@ -516,7 +540,17 @@ namespace build2 bool s (n.simple ()); string v (convert (s ? move (n) : name (n))); - if (regex_search (v, rge) == matching) + // Match flags. + // + // Suppress matching of empty substrings in non-empty strings (see above + // for details). + // + regex_constants::match_flag_type mf (regex_constants::match_default); + + if (!v.empty ()) + mf |= regex_constants::match_not_null; + + if (regex_search (v, rge, mf) == matching) r.emplace_back (s ? name (move (v)) : move (n)); } diff --git a/tests/function/regex/testscript b/tests/function/regex/testscript index 538bdab..7fbcc8e 100644 --- a/tests/function/regex/testscript +++ b/tests/function/regex/testscript @@ -366,6 +366,33 @@ EOI } } + + : empty-substring + : + : Note that regex_search() ignores the match_not_null flag for older + : versions of libstdc++ and libc++. + : + if (($cxx.id != 'gcc' || $cxx.version.major >= 7) && \ + ($cxx.id != 'clang' || $cxx.version.major >= 6)) + { + : empty + : + $* <'true' + print $regex.search('', '.*') + EOI + + : match + : + $* <'true' + print $regex.search('a', 'a*') + EOI + + : no-match + : + $* <'false' + print $regex.search('aa', 'b*') + EOI + } } : split @@ -576,6 +603,33 @@ print $regex.find_search(Foo.cxx, 'f', icase) EOI } + + : empty-substring + : + : Note that regex_search() ignores the match_not_null flag for older + : versions of libstdc++ and libc++. + : + if (($cxx.id != 'gcc' || $cxx.version.major >= 7) && \ + ($cxx.id != 'clang' || $cxx.version.major >= 6)) + { + : empty + : + $* <'true' + print $regex.find_search('', '.*') + EOI + + : match + : + $* <'true' + print $regex.find_search('a', 'a*') + EOI + + : no-match + : + $* <'false' + print $regex.find_search('aa', 'b*') + EOI + } } : filter-search @@ -607,6 +661,33 @@ $* <'' print $regex.filter_search(-g, '-O') EOI + + : empty-substring + : + : Note that regex_search() ignores the match_not_null flag for older + : versions of libstdc++ and libc++. + : + if (($cxx.id != 'gcc' || $cxx.version.major >= 7) && \ + ($cxx.id != 'clang' || $cxx.version.major >= 6)) + { + : empty + : + $* <'{}' + print $regex.filter_search('', '.*') + EOI + + : match + : + $* <'a' + print $regex.filter_search('a', 'a*') + EOI + + : no-match + : + $* <'' + print $regex.filter_search('aa', 'b*') + EOI + } } : filter-out -- cgit v1.1