From 9a3a8d1915c8a3666984d6603606af856dfd8c41 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Mon, 26 Jun 2017 22:23:43 +0300 Subject: Add support for regex function family --- build2/buildfile | 1 + build2/function.cxx | 2 + build2/functions-regex.cxx | 331 ++++++++++++++++++++++++++++++++ build2/variable.hxx | 2 +- tests/function/regex/buildfile | 5 + tests/function/regex/testscript | 255 ++++++++++++++++++++++++ unit-tests/cc/lexer/buildfile | 9 +- unit-tests/cc/parser/buildfile | 2 +- unit-tests/function/buildfile | 6 +- unit-tests/lexer/buildfile | 6 +- unit-tests/scheduler/buildfile | 6 +- unit-tests/test/script/lexer/buildfile | 10 +- unit-tests/test/script/parser/buildfile | 12 +- 13 files changed, 623 insertions(+), 24 deletions(-) create mode 100644 build2/functions-regex.cxx create mode 100644 tests/function/regex/buildfile create mode 100644 tests/function/regex/testscript diff --git a/build2/buildfile b/build2/buildfile index 8525eb8..d2aa7c1 100644 --- a/build2/buildfile +++ b/build2/buildfile @@ -19,6 +19,7 @@ exe{b}: \ { cxx}{ functions-filesystem } \ { cxx}{ functions-path } \ { cxx}{ functions-process-path } \ + { cxx}{ functions-regex } \ { cxx}{ functions-string } \ { cxx}{ functions-target-triplet } \ {hxx cxx}{ lexer } \ diff --git a/build2/function.cxx b/build2/function.cxx index 399d679..f7de749 100644 --- a/build2/function.cxx +++ b/build2/function.cxx @@ -304,6 +304,7 @@ namespace build2 void filesystem_functions (); // functions-filesystem.cxx void path_functions (); // functions-path.cxx void process_path_functions (); // functions-process-path.cxx + void regex_functions (); // functions-regex.cxx void string_functions (); // functions-string.cxx void target_triplet_functions (); // functions-target-triplet.cxx @@ -315,6 +316,7 @@ namespace build2 filesystem_functions (); path_functions (); process_path_functions (); + regex_functions (); string_functions (); target_triplet_functions (); } diff --git a/build2/functions-regex.cxx 
b/build2/functions-regex.cxx new file mode 100644 index 0000000..bdecc14 --- /dev/null +++ b/build2/functions-regex.cxx @@ -0,0 +1,331 @@ +// file : build2/functions-regex.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include + +#include + +#include +#include + +using namespace std; +using namespace butl; + +namespace build2 +{ + // Convert value of an arbitrary type to string. + // + static inline string + to_string (value&& v) + { + // Optimize for the string value type. + // + if (v.type != &value_traits::value_type) + untypify (v); + + return convert (move (v)); + } + + // Parse a regular expression. Throw invalid_argument if it is not valid. + // + static regex + parse_regex (const string& s, regex::flag_type f) + { + try + { + return regex (s, f); + } + catch (const regex_error& e) + { + // Print regex_error description if meaningful (no space). + // + ostringstream os; + os << "invalid regex '" << s << "'" << e; + throw invalid_argument (os.str ()); + } + } + + // Match value of an arbitrary type against the regular expression. See + // match() overloads (below) for details. + // + static value + match (value&& v, const string& re, optional&& flags) + { + // Parse flags. + // + regex::flag_type rf (regex::ECMAScript); + bool subs (false); + + if (flags) + { + for (auto& f: *flags) + { + string s (convert (move (f))); + + if (s == "icase") + rf |= regex::icase; + else if (s == "return_subs") + subs = true; + else + throw invalid_argument ("invalid flag '" + s + "'"); + } + } + + // Parse regex. + // + regex rge (parse_regex (re, rf)); + + // Match. + // + string s (to_string (move (v))); + + if (!subs) + return value (regex_match (s, rge)); // Return boolean value. 
+ + names r; + match_results m; + + if (regex_match (s, m, rge)) + { + assert (!m.empty ()); + + for (size_t i (1); i != m.size (); ++i) + { + if (m[i].matched) + r.emplace_back (m.str (i)); + } + } + + return value (r); + } + + // Determine if there is a match between the regular expression and some + // part of a value of an arbitrary type. See search() overloads (below) + // for details. + // + static value + search (value&& v, const string& re, optional&& flags) + { + // Parse flags. + // + regex::flag_type rf (regex::ECMAScript); + bool match (false); + bool subs (false); + + if (flags) + { + for (auto& f: *flags) + { + string s (convert (move (f))); + + if (s == "icase") + rf |= regex::icase; + else if (s == "return_match") + match = true; + else if (s == "return_subs") + subs = true; + else + throw invalid_argument ("invalid flag '" + s + "'"); + } + } + + // Parse regex. + // + regex rge (parse_regex (re, rf)); + + // Search. + // + string s (to_string (move (v))); + + if (!match && !subs) + return value (regex_search (s, rge)); // Return boolean value. + + names r; + match_results m; + + if (regex_search (s, m, rge)) + { + assert (!m.empty ()); + + if (match) + { + assert (m[0].matched); + r.emplace_back (m.str (0)); + } + + if (subs) + { + for (size_t i (1); i != m.size (); ++i) + { + if (m[i].matched) + r.emplace_back (m.str (i)); + } + } + } + + return value (r); + } + + // Replace matched parts in a value of an arbitrary type, using the format + // string. See replace() overloads (below) for details. + // + static names + replace (value&& v, + const string& re, + const string& fmt, + optional&& flags) + { + // Parse flags. 
+ // + regex::flag_type rf (regex::ECMAScript); + regex_constants::match_flag_type mf (regex_constants::match_default); + + if (flags) + { + for (auto& f: *flags) + { + string s (convert (move (f))); + + if (s == "icase") + rf |= regex::icase; + else if (s == "format_first_only") + mf |= regex_constants::format_first_only; + else if (s == "format_no_copy") + mf |= regex_constants::format_no_copy; + else + throw invalid_argument ("invalid flag '" + s + "'"); + } + } + + // Parse regex. + // + regex rge (parse_regex (re, rf)); + + // Replace. + // + names r; + + try + { + string s (to_string (move (v))); + r.emplace_back (regex_replace_ex (s, rge, fmt, mf).first); + } + catch (const regex_error& e) + { + fail << "unable to replace" << e; + } + + return r; + } + + void + regex_functions () + { + function_family f ("regex"); + + // match + // + // Match a value of an arbitrary type against the regular expression. + // Convert the value to string prior to matching. Return the boolean value + // unless return_subs flag is specified (see below), in which case return + // names (empty if no match). + // + // The following flags are supported: + // + // icase - match ignoring case + // + // return_subs - return names (rather than boolean), that contain + // sub-strings that match the marked sub-expressions + // + f[".match"] = [](value s, string re, optional flags) + { + return match (move (s), re, move (flags)); + }; + + f[".match"] = [](value s, names re, optional flags) + { + return match (move (s), convert (move (re)), move (flags)); + }; + + // search + // + // Determine if there is a match between the regular expression and some + // part of a value of an arbitrary type. Convert the value to string prior + // to searching. Return the boolean value unless return_match or + // return_subs flag is specified (see below) in which case return names + // (empty if no match). 
+ // + // The following flags are supported: + // + // icase - match ignoring case + // + // return_match - return names (rather than boolean), that contain a + // sub-string that matches the whole regular expression + // + // return_subs - return names (rather than boolean), that contain + // sub-strings that match the marked sub-expressions + // + // If both return_match and return_subs flags are specified then the + // sub-string that matches the whole regular expression comes first. + // + f[".search"] = [](value s, string re, optional flags) + { + return search (move (s), re, move (flags)); + }; + + f[".search"] = [](value s, names re, optional flags) + { + return search (move (s), convert (move (re)), move (flags)); + }; + + // replace + // + // Replace matched parts in a value of an arbitrary type, using the format + // string. Convert the value to string prior to matching. The result value + // is always untyped, regardless of the argument type. + // + // Substitution escape sequences are extended with a subset of Perl + // sequences (see regex_replace_ex() for details). + // + // The following flags are supported: + // + // icase - match ignoring case + // + // format_first_only - only replace the first match + // + // format_no_copy - do not copy unmatched value parts to the result + // + // If both format_first_only and format_no_copy flags are specified then + // all the result will contain is the replacement of the first match. 
+ // + f[".replace"] = [](value s, string re, string fmt, optional flags) + { + return replace (move (s), re, fmt, move (flags)); + }; + + f[".replace"] = [](value s, string re, names fmt, optional flags) + { + return replace (move (s), + re, + convert (move (fmt)), + move (flags)); + }; + + f[".replace"] = [](value s, names re, string fmt, optional flags) + { + return replace (move (s), + convert (move (re)), + fmt, + move (flags)); + }; + + f[".replace"] = [](value s, names re, names fmt, optional flags) + { + return replace (move (s), + convert (move (re)), + convert (move (fmt)), + move (flags)); + }; + } +} diff --git a/build2/variable.hxx b/build2/variable.hxx index 19a6c69..f0218fe 100644 --- a/build2/variable.hxx +++ b/build2/variable.hxx @@ -263,7 +263,7 @@ namespace build2 bool operator>= (const value&, const value&); // Value cast. The first three expect the value to be not NULL. The cast - // from lookup expects the value to aslo be defined. + // from lookup expects the value to also be defined. // // Note that a cast to names expects the value to be untyped while a cast // to vector -- typed. 
diff --git a/tests/function/regex/buildfile b/tests/function/regex/buildfile
new file mode 100644
index 0000000..fbac223
--- /dev/null
+++ b/tests/function/regex/buildfile
@@ -0,0 +1,5 @@
+# file      : tests/function/regex/buildfile
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
+
+./: test{testscript} $b
diff --git a/tests/function/regex/testscript b/tests/function/regex/testscript
new file mode 100644
index 0000000..deeefa5
--- /dev/null
+++ b/tests/function/regex/testscript
@@ -0,0 +1,255 @@
+# file      : tests/function/regex/testscript
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
+
+.include ../../common.test
+
+: replace
+:
+{
+  : arg-types
+  :
+  {
+    : untyped-string-string
+    :
+    $* <<EOI >'foo.o'
+    print $regex.replace('foo.cxx', [string] '(^[^.]*).*', [string] '\1.o')
+    EOI
+
+    : string-untyped-string
+    :
+    $* <<EOI >'foo.o'
+    print $regex.replace([string] 'foo.cxx', '(^[^.]*).*', [string] '\1.o')
+    EOI
+
+    : bool-string-untyped
+    :
+    $* <<EOI >'true.o'
+    print $regex.replace('a' == "a", [string] '(^[^.]*).*', '\1.o')
+    EOI
+
+    : uint64-untyped-string
+    :
+    $* <<EOI >'1.o'
+    print $regex.replace([uint64] 01, '(^[^.]*).*', [string] '\1.o')
+    EOI
+
+    : path-untyped-untyped
+    :
+    $* <<EOI >'foo.o'
+    print $regex.replace([path] 'foo.cxx', '(^[^.]*).*', '\1.o')
+    EOI
+
+    : multiple-names
+    :
+    $* <<EOI 2>'error: invalid argument: invalid string value: multiple names' != 0
+    print $regex.replace(foo.cxx bar.cxx, '([^.]*)', '\1.o')
+    EOI
+
+    : null
+    :
+    $* <<EOI 2>'error: invalid argument: null value' != 0
+    print $regex.replace([null], '([^.]*)', '\1.o')
+    EOI
+
+    : null-regex
+    :
+    $* <<EOI 2>'error: invalid argument: null value' != 0
+    print $regex.replace(foo.cxx, [null], '\1.o')
+    EOI
+  }
+
+  : no-subs
+  :
+  $* <<EOI >'xbcxbc'
+  print $regex.replace('abcabc', 'a', 'x')
+  EOI
+
+  : no-match
+  :
+  $* <<EOI >'abcabc'
+  print $regex.replace('abcabc', 'd', 'x')
+  EOI
+
+  : flags
+  :
+  {
+    : icase
+    :
+    $* <<EOI >'Foo.o'
+    print $regex.replace("Foo.cxx", '(f[^.]*).*', '\1.o', icase)
+    EOI
+
+    : format_first_only
+    :
+    $* <<EOI >'foo.o'
+    print $regex.replace('foo.cxx', '([^.]*).*', '\1.o', format_first_only)
+    EOI
+
+    : format_no_copy
+    :
+    {
+      : all-matches
+      :
+      $* <<EOI >'xx'
+      print $regex.replace('abcabc', 'a', 'x', format_no_copy)
+      EOI
+
+      : first-only
+      :
+      $* <<EOI >'x'
+      print $regex.replace('abcabc', 'a', 'x', format_no_copy format_first_only)
+      EOI
+    }
+
+    : unknown
+    :
+    $* <<EOI 2>"error: invalid argument: invalid flag 'unknown'" != 0
+    print $regex.replace("foo.cxx", '(f[^.]*).*', '\1.o', unknown)
+    EOI
+  }
+
+  : invalid-regex
+  :
+  $* <'print $regex.replace(a, "[", b)' 2>>~/EOE/ != 0
+  /error: invalid argument: invalid regex '\['.*/
+  EOE
+}
+
+: match
+:
+{
+  : arg-types
+  :
+  {
+    : untyped-string
+    :
+    $* <<EOI >'true'
+    print $regex.match('foo.cxx', [string] '(^[^.]*).*')
+    EOI
+
+    : untyped-untyped
+    :
+    $* <<EOI >'true'
+    print $regex.match('foo.cxx', '(^[^.]*).*')
+    EOI
+  }
+
+  : flags
+  :
+  {
+    : none
+    :
+    $* <<EOI >'false'
+    print $regex.match("Foo.cxx", '(f[^.]*).*')
+    EOI
+
+    : icase
+    :
+    $* <<EOI >'true'
+    print $regex.match("Foo.cxx", '(f[^.]*).*', icase)
+    EOI
+
+    : return_subs
+    :
+    {
+      : success
+      :
+      $* <<EOI >'foo bar'
+      print $regex.match("foo bar", '([^\s]*)\s+([^\s]*)', return_subs)
+      EOI
+
+      : no-subexpr
+      :
+      $* <<EOI >''
+      print $regex.match("foo bar", '(?:[^\s]*)\s+(?:[^\s]*)', return_subs)
+      EOI
+
+      : failure
+      :
+      $* <<EOI >''
+      print $regex.match(" bar", '([^\s]+)\s+([^\s]+)', return_subs)
+      EOI
+    }
+  }
+}
+
+: search
+:
+{
+  : arg-types
+  :
+  {
+    : untyped-string
+    :
+    $* <<EOI >'true'
+    print $regex.search('.foo.cxx', [string] '([^.]*)')
+    EOI
+
+    : untyped-untyped
+    :
+    $* <<EOI >'true'
+    print $regex.search('.foo.cxx', '([^.]*)')
+    EOI
+  }
+
+  : flags
+  :
+  {
+    : none
+    :
+    $* <<EOI >'false'
+    print $regex.search("Foo.cxx", '(f[^.]*).*')
+    EOI
+
+    : icase
+    :
+    $* <<EOI >'true'
+    print $regex.search("Foo.cxx", '(f[^.]*).*', icase)
+    EOI
+
+    : return_subs
+    :
+    {
+      : success
+      :
+      $* <<EOI >'foo bar'
+      print $regex.search(" foo bar baz", '([^\s]+)\s+([^\s]+)', return_subs)
+      EOI
+
+      : no-subexpr
+      :
+      $* <<EOI >''
+      print $regex.search("foo bar ba", '(?:[^\s]+)\s+(?:[^\s]+)', return_subs)
+      EOI
+
+      : failure
+      :
+      $* <<EOI >''
+      print $regex.search(" bar", '([^\s]+)\s+([^\s]+)', return_subs)
+      EOI
+    }
+
+    : return_match
+    :
+    {
+      : success
+      :
+      $* <<EOI >'foo bar'
+      print $regex.search(" foo bar baz", '([^\s]+)\s+([^\s]+)', return_match)
+      EOI
+
+      : subs
+      :
+      $* <<EOI >'foo bar foo bar'
+      print $regex.search(" foo bar baz", '([^\s]+)\s+([^\s]+)', return_match return_subs)
+      EOI
+
+      : failure
+      :
+      $* <<EOI >''
+      print $regex.search(" bar", '([^\s]+)\s+([^\s]+)', return_match)
+      EOI
+    }
+  }
+}
diff --git a/unit-tests/cc/lexer/buildfile b/unit-tests/cc/lexer/buildfile
index 3152e77..53adc33 100644
--- a/unit-tests/cc/lexer/buildfile
+++ b/unit-tests/cc/lexer/buildfile
@@ -5,11 +5,12 @@
 #@@ Temporary until we get utility library support.
 #
 import libs = libbutl%lib{butl}
-src = cc/lexer token lexer diagnostics utility variable name b-options \
+src = cc/lexer token lexer diagnostics utility variable name b-options \
 types-parsers context scope parser target operation rule prerequisite file \
-module function functions-builtin functions-filesystem functions-path \
-functions-process-path functions-string functions-target-triplet algorithm \
-search dump filesystem scheduler config/{utility init operation module} spec
+module function functions-builtin functions-filesystem functions-path \
+functions-process-path functions-regex functions-string \
+functions-target-triplet algorithm search dump filesystem scheduler \
+config/{utility init operation module} spec
 
 exe{driver}: cxx{driver} ../../../build2/cxx{$src} ../../../build2/liba{b} \
 $libs test{*}
diff --git a/unit-tests/cc/parser/buildfile b/unit-tests/cc/parser/buildfile
index bb1ad53..59ef4fa 100644
--- a/unit-tests/cc/parser/buildfile
+++ b/unit-tests/cc/parser/buildfile
@@ -8,7 +8,7 @@ import libs = libbutl%lib{butl}
 src = cc/{lexer parser} token lexer diagnostics
utility variable name \ b-options types-parsers context scope parser target operation rule \ prerequisite file module function functions-builtin functions-filesystem \ -functions-path functions-process-path functions-string \ +functions-path functions-process-path functions-regex functions-string \ functions-target-triplet algorithm search dump filesystem scheduler \ config/{utility init operation module} spec diff --git a/unit-tests/function/buildfile b/unit-tests/function/buildfile index c0cd26f..ad29ac7 100644 --- a/unit-tests/function/buildfile +++ b/unit-tests/function/buildfile @@ -5,11 +5,11 @@ #@@ Temporary until we get utility library support. # import libs = libbutl%lib{butl} -src = token lexer diagnostics utility variable name b-options types-parsers \ +src = token lexer diagnostics utility variable name b-options types-parsers \ context scope parser target operation rule prerequisite file module function \ functions-builtin functions-filesystem functions-path functions-process-path \ -functions-string functions-target-triplet algorithm search dump filesystem \ -scheduler config/{utility init operation module} spec +functions-regex functions-string functions-target-triplet algorithm search \ +dump filesystem scheduler config/{utility init operation module} spec exe{driver}: cxx{driver} ../../build2/cxx{$src} ../../build2/liba{b} $libs test{call syntax} diff --git a/unit-tests/lexer/buildfile b/unit-tests/lexer/buildfile index 3d4e3ed..0ab50de 100644 --- a/unit-tests/lexer/buildfile +++ b/unit-tests/lexer/buildfile @@ -5,11 +5,11 @@ #@@ Temporary until we get utility library support. 
# import libs = libbutl%lib{butl} -src = token lexer diagnostics utility variable name b-options types-parsers \ +src = token lexer diagnostics utility variable name b-options types-parsers \ context scope parser target operation rule prerequisite file module function \ functions-builtin functions-filesystem functions-path functions-process-path \ -functions-string functions-target-triplet algorithm search dump filesystem \ -scheduler config/{utility init operation module} spec +functions-regex functions-string functions-target-triplet algorithm search \ +dump filesystem scheduler config/{utility init operation module} spec exe{driver}: cxx{driver} ../../build2/cxx{$src} ../../build2/liba{b} $libs \ test{*} diff --git a/unit-tests/scheduler/buildfile b/unit-tests/scheduler/buildfile index 22361b5..b3b97ce 100644 --- a/unit-tests/scheduler/buildfile +++ b/unit-tests/scheduler/buildfile @@ -5,11 +5,11 @@ #@@ Temporary until we get utility library support. # import libs = libbutl%lib{butl} -src = token lexer diagnostics utility variable name b-options types-parsers \ +src = token lexer diagnostics utility variable name b-options types-parsers \ context scope parser target operation rule prerequisite file module function \ functions-builtin functions-filesystem functions-path functions-process-path \ -functions-string functions-target-triplet algorithm search dump filesystem \ -scheduler config/{utility init operation module} spec +functions-regex functions-string functions-target-triplet algorithm search \ +dump filesystem scheduler config/{utility init operation module} spec exe{driver}: cxx{driver} ../../build2/cxx{$src} ../../build2/liba{b} $libs diff --git a/unit-tests/test/script/lexer/buildfile b/unit-tests/test/script/lexer/buildfile index 2070089..a21df7a 100644 --- a/unit-tests/test/script/lexer/buildfile +++ b/unit-tests/test/script/lexer/buildfile @@ -5,13 +5,15 @@ #@@ Temporary until we get utility library support. 
# import libs = libbutl%lib{butl} -src = token lexer diagnostics utility variable name b-options types-parsers \ +src = token lexer diagnostics utility variable name b-options types-parsers \ context scope parser target operation rule prerequisite file module function \ functions-builtin functions-filesystem functions-path functions-process-path \ -functions-string functions-target-triplet algorithm search dump filesystem \ -scheduler config/{utility init operation module} test/script/{token lexer} spec +functions-regex functions-string functions-target-triplet algorithm search \ +dump filesystem scheduler config/{utility init operation module} \ +test/script/{token lexer} spec -exe{driver}: cxx{driver} ../../../../build2/cxx{$src} ../../../../build2/liba{b} $libs \ +exe{driver}: cxx{driver} ../../../../build2/cxx{$src} \ +../../../../build2/liba{b} $libs \ test{command-line first-token second-token command-expansion variable-line \ description-line variable} diff --git a/unit-tests/test/script/parser/buildfile b/unit-tests/test/script/parser/buildfile index 634120c..184fd82 100644 --- a/unit-tests/test/script/parser/buildfile +++ b/unit-tests/test/script/parser/buildfile @@ -8,12 +8,14 @@ import libs = libbutl%lib{butl} src = token lexer parser diagnostics utility variable name context target \ scope prerequisite file module operation rule b-options algorithm search \ filesystem function functions-builtin functions-filesystem functions-path \ -functions-process-path functions-string functions-target-triplet \ -config/{utility init operation module} dump types-parsers \ -test/{target script/{token lexer parser regex script}} scheduler spec +functions-process-path functions-regex functions-string \ +functions-target-triplet config/{utility init operation module} dump \ +types-parsers test/{target script/{token lexer parser regex script}} \ +scheduler spec -exe{driver}: cxx{driver} ../../../../build2/cxx{$src} ../../../../build2/liba{b} $libs \ -test{cleanup 
command-if command-re-parse description directive exit \ +exe{driver}: cxx{driver} ../../../../build2/cxx{$src} \ +../../../../build2/liba{b} $libs \ +test{cleanup command-if command-re-parse description directive exit \ expansion here-document here-string include pipe-expr pre-parse \ redirect regex scope scope-if setup-teardown} -- cgit v1.1