// file : libbuild2/functions-regex.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file #include <sstream> #include <libbutl/regex.hxx> #include <libbuild2/function.hxx> #include <libbuild2/variable.hxx> using namespace std; using namespace butl; namespace build2 { // Convert value of an arbitrary type to string. // static inline string to_string (value&& v) { // Optimize for the string value type. // if (v.type != &value_traits<string>::value_type) untypify (v, true /* reduce */); return convert<string> (move (v)); } // Parse a regular expression. Throw invalid_argument if it is not valid. // // Note: also used in functions-process.cxx (thus not static). // regex parse_regex (const string& s, regex::flag_type f) { try { return regex (s, f); } catch (const regex_error& e) { // Print regex_error description if meaningful (no space). // ostringstream os; os << "invalid regex '" << s << "'" << e; throw invalid_argument (os.str ()); } } // Match value of an arbitrary type against the regular expression. See // match() overloads (below) for details. // static value match (value&& v, const string& re, optional<names>&& flags) { // Parse flags. // regex::flag_type rf (regex::ECMAScript); bool subs (false); if (flags) { for (name& f: *flags) { string s (convert<string> (move (f))); if (s == "icase") rf |= regex::icase; else if (s == "return_subs") subs = true; else throw invalid_argument ("invalid flag '" + s + '\''); } } // Parse regex. // regex rge (parse_regex (re, rf)); // Match. // string s (to_string (move (v))); if (!subs) return value (regex_match (s, rge)); // Return boolean value. match_results<string::const_iterator> m; if (regex_match (s, m, rge)) { assert (!m.empty ()); names r; for (size_t i (1); i != m.size (); ++i) r.emplace_back (m[i].matched ? m.str (i) : string ()); return value (move (r)); } else return value (); } // Determine if there is a match between the regular expression and some // part of a value of an arbitrary type. See search() overloads (below) // for details. // static value search (value&& v, const string& re, optional<names>&& flags) { // Parse flags. // regex::flag_type rf (regex::ECMAScript); bool match (false); bool subs (false); if (flags) { for (auto& f: *flags) { string s (convert<string> (move (f))); if (s == "icase") rf |= regex::icase; else if (s == "return_match") match = true; else if (s == "return_subs") subs = true; else throw invalid_argument ("invalid flag '" + s + '\''); } } // Parse regex. // regex rge (parse_regex (re, rf)); // Search. // string s (to_string (move (v))); if (!match && !subs) return value (regex_search (s, rge)); // Return boolean value. match_results<string::const_iterator> m; if (regex_search (s, m, rge)) { assert (!m.empty ()); names r; if (match) { assert (m[0].matched); r.emplace_back (m.str (0)); } if (subs) { for (size_t i (1); i != m.size (); ++i) r.emplace_back (m[i].matched ? m.str (i) : string ()); } return value (move (r)); } else return value (); } static pair<regex::flag_type, regex_constants::match_flag_type> parse_replacement_flags (optional<names>&& flags, bool first_only = true, bool* copy_empty = nullptr) { regex::flag_type rf (regex::ECMAScript); regex_constants::match_flag_type mf (regex_constants::match_default); if (flags) { for (auto& f: *flags) { string s (convert<string> (move (f))); if (s == "icase") rf |= regex::icase; else if (first_only && s == "format_first_only") mf |= regex_constants::format_first_only; else if (s == "format_no_copy") mf |= regex_constants::format_no_copy; else if (copy_empty != nullptr && s == "format_copy_empty") *copy_empty = true; else throw invalid_argument ("invalid flag '" + s + '\''); } } return make_pair (rf, mf); } // Replace matched parts in a value of an arbitrary type, using the format // string. See replace() overloads (below) for details. // static names replace (value&& v, const string& re, const string& fmt, optional<names>&& flags) { auto fl (parse_replacement_flags (move (flags))); regex rge (parse_regex (re, fl.first)); names r; try { r.emplace_back (regex_replace_search (to_string (move (v)), rge, fmt, fl.second).first); } catch (const regex_error& e) { // Print regex_error description if meaningful (no space). // fail << "unable to replace" << e; } return r; } // Replace matched parts in lines using the format string. See // replace_lines() overloads (below) for details. // static names replace_lines (value&& v, const string& re, const optional<string>& fmt, optional<names>&& flags) { string s (to_string (move (v))); // Extract the return_lines flag, if present, and parse the remaining // flags using parse_replacement_flags(). // bool rls (false); if (flags) { names& fs (*flags); const name rlf ("return_lines"); for (names::const_iterator i (fs.begin ()); i != fs.end (); ) { if (*i == rlf) { rls = true; i = fs.erase (i); } else ++i; } } auto fl (parse_replacement_flags (move (flags))); regex rge (parse_regex (re, fl.first)); names r; string ls; try { istringstream is (s); is.exceptions (istringstream::badbit); const string& efmt (fmt ? *fmt : ""); bool no_copy ((fl.second & regex_constants::format_no_copy) != 0); for (string l; !eof (getline (is, l)); ) { auto rr (regex_replace_search (l, rge, efmt, fl.second)); string& s (rr.first); // Skip the empty replacement for a matched line if the format is // absent and an unmatched line if the format_no_copy flag is // specified. // if (rr.second ? !fmt && s.empty () : no_copy) continue; if (!rls) r.emplace_back (to_name (move (s))); else { if (ls.empty ()) ls = move (s); else ls += s; // Append the trailing newline for the added line if EOS is not // reached, which indicates that the original line is terminated // with newline. // if (!is.eof ()) ls += '\n'; } } } catch (const regex_error& e) { // Print regex_error description if meaningful (no space). // fail << "unable to replace lines" << e; } catch (const io_error& e) { fail << "unable to read lines: " << e; } if (rls) r.push_back (move (ls)); return r; } // Split a value of an arbitrary type into a list of unmatched value parts // and replacements of the matched parts. See split() overloads (below) for // details. // static names split (value&& v, const string& re, const string& fmt, optional<names>&& flags) { bool copy_empty (false); auto fl (parse_replacement_flags (move (flags), false /* first_only */, ©_empty)); regex rge (parse_regex (re, fl.first)); names r; try { regex_replace_search (to_string (move (v)), rge, fmt, [copy_empty, &r] (string::const_iterator b, string::const_iterator e) { if (copy_empty || b != e) r.emplace_back (string (b, e)); }, fl.second); } catch (const regex_error& e) { // Print regex_error description if meaningful (no space). // fail << "unable to split" << e; } return r; } // Replace matched parts of list elements using the format string. See // apply() overloads (below) for details. // static names apply (names&& ns, const string& re, const string& fmt, optional<names>&& flags) { bool copy_empty (false); auto fl (parse_replacement_flags (move (flags), true /* first_only */, ©_empty)); regex rge (parse_regex (re, fl.first)); names r; try { for (auto& n: ns) { string s (regex_replace_search (convert<string> (move (n)), rge, fmt, fl.second).first); if (copy_empty || !s.empty ()) r.emplace_back (move (s)); } } catch (const regex_error& e) { // Print regex_error description if meaningful (no space). // fail << "unable to apply" << e; } return r; } static regex::flag_type parse_find_flags (optional<names>&& flags) { regex::flag_type r (regex::ECMAScript); if (flags) { for (auto& f: *flags) { string s (convert<string> (move (f))); if (s == "icase") r |= regex::icase; else throw invalid_argument ("invalid flag '" + s + '\''); } } return r; } // Return true if any of the list elements match the regular expression. // See find_match() overloads (below) for details. // static bool find_match (names&& ns, const string& re, optional<names>&& flags) { regex::flag_type fl (parse_find_flags (move (flags))); regex rge (parse_regex (re, fl)); for (auto& n: ns) { if (regex_match (convert<string> (move (n)), rge)) return true; } return false; } // Return a list of elements that match (matching is true) or don't match // (matching is false) the regular expression. See filter_match() and // filter_out_match() overloads (below) for details. // static names filter_match (names&& ns, const string& re, optional<names>&& flags, bool matching) { regex::flag_type fl (parse_find_flags (move (flags))); regex rge (parse_regex (re, fl)); names r; for (name& n: ns) { // Note that we need to preserve the element while converting it to // string since we may add it to the resulting list. But let's optimize // this for the simple value case by round-tripping it through the // string. // bool s (n.simple ()); string v (convert<string> (s ? move (n) : name (n))); if (regex_match (v, rge) == matching) r.emplace_back (s ? name (move (v)) : move (n)); } return r; } // Return true if a part of any of the list elements matches the regular // expression. See find_search() overloads (below) for details. // static bool find_search (names&& ns, const string& re, optional<names>&& flags) { regex::flag_type fl (parse_find_flags (move (flags))); regex rge (parse_regex (re, fl)); for (auto& n: ns) { if (regex_search (convert<string> (move (n)), rge)) return true; } return false; } // Return those elements of a list which have a match (matching is true) or // have no match (matching is false) between the regular expression and // some/any part of the element. See filter_search() and filter_out_search() // overloads (below) for details. // static names filter_search (names&& ns, const string& re, optional<names>&& flags, bool matching) { regex::flag_type fl (parse_find_flags (move (flags))); regex rge (parse_regex (re, fl)); names r; for (auto& n: ns) { // Note that we need to preserve the element while converting it to // string since we may add it to the resulting list. But let's optimize // this for the simple value case by round-tripping it through the // string. // bool s (n.simple ()); string v (convert<string> (s ? move (n) : name (n))); if (regex_search (v, rge) == matching) r.emplace_back (s ? name (move (v)) : move (n)); } return r; } // Replace matched parts of list elements using the format string and // concatenate the transformed elements. See merge() overloads (below) for // details. // static names merge (names&& ns, const string& re, const string& fmt, optional<string>&& delim, optional<names>&& flags) { bool copy_empty (false); auto fl (parse_replacement_flags (move (flags), true /* first_only */, ©_empty)); regex rge (parse_regex (re, fl.first)); string rs; try { bool first (true); for (auto& n: ns) { string s (regex_replace_search (convert<string> (move (n)), rge, fmt, fl.second).first); if (copy_empty || !s.empty ()) { if (delim) { if (first) first = false; else rs.append (*delim); } rs.append (s); } } } catch (const regex_error& e) { // Print regex_error description if meaningful (no space). // fail << "unable to merge" << e; } names r; r.emplace_back (move (rs)); return r; } void regex_functions (function_map& m) { function_family f (m, "regex"); // $regex.match(<val>, <pat> [, <flags>]) // // Match a value of an arbitrary type against the regular expression. // Convert the value to string prior to matching. Return the boolean value // unless `return_subs` flag is specified (see below), in which case // return names (or `null` if no match). // // The following flags are supported: // // icase - match ignoring case // // return_subs - return names (rather than boolean), that contain // sub-strings that match the marked sub-expressions // and null if no match // f[".match"] += [](value v, string re, optional<names> flags) { return match (move (v), re, move (flags)); }; f[".match"] += [](value v, names re, optional<names> flags) { return match (move (v), convert<string> (move (re)), move (flags)); }; // $regex.find_match(<vals>, <pat> [, <flags>]) // // Match list elements against the regular expression and return true if // the match is found. Convert the elements to strings prior to matching. // // The following flags are supported: // // icase - match ignoring case // f[".find_match"] += [](names ns, string re, optional<names> flags) { return find_match (move (ns), re, move (flags)); }; f[".find_match"] += [](names ns, names re, optional<names> flags) { return find_match (move (ns), convert<string> (move (re)), move (flags)); }; // $regex.filter_match(<vals>, <pat> [, <flags>]) // $regex.filter_out_match(<vals>, <pat> [, <flags>]) // // Return elements of a list that match (`filter`) or do not match // (`filter_out`) the regular expression. Convert the elements to strings // prior to matching. // // The following flags are supported: // // icase - match ignoring case // f[".filter_match"] += [](names ns, string re, optional<names> flags) { return filter_match (move (ns), re, move (flags), true /* matching */); }; f[".filter_match"] += [](names ns, names re, optional<names> flags) { return filter_match (move (ns), convert<string> (move (re)), move (flags), true /* matching */); }; f[".filter_out_match"] += [](names s, string re, optional<names> flags) { return filter_match (move (s), re, move (flags), false /* matching */); }; f[".filter_out_match"] += [](names ns, names re, optional<names> flags) { return filter_match (move (ns), convert<string> (move (re)), move (flags), false /* matching */); }; // $regex.search(<val>, <pat> [, <flags>]) // // Determine if there is a match between the regular expression and some // part of a value of an arbitrary type. Convert the value to string prior // to searching. Return the boolean value unless `return_match` or // `return_subs` flag is specified (see below) in which case return names // (`null` if no match). // // The following flags are supported: // // icase - match ignoring case // // return_match - return names (rather than boolean), that contain a // sub-string that matches the whole regular expression // and null if no match // // return_subs - return names (rather than boolean), that contain // sub-strings that match the marked sub-expressions // and null if no match // // If both `return_match` and `return_subs` flags are specified then the // sub-string that matches the whole regular expression comes first. // f[".search"] += [](value v, string re, optional<names> flags) { return search (move (v), re, move (flags)); }; f[".search"] += [](value v, names re, optional<names> flags) { return search (move (v), convert<string> (move (re)), move (flags)); }; // $regex.find_search(<vals>, <pat> [, <flags>]) // // Determine if there is a match between the regular expression and some // part of any of the list elements. Convert the elements to strings prior // to matching. // // The following flags are supported: // // icase - match ignoring case // f[".find_search"] += [](names ns, string re, optional<names> flags) { return find_search (move (ns), re, move (flags)); }; f[".find_search"] += [](names ns, names re, optional<names> flags) { return find_search (move (ns), convert<string> (move (re)), move (flags)); }; // $regex.filter_search(<vals>, <pat> [, <flags>]) // $regex.filter_out_search(<vals>, <pat> [, <flags>]) // // Return elements of a list for which there is a match (`filter`) or no // match (`filter_out`) between the regular expression and some part of // the element. Convert the elements to strings prior to matching. // // The following flags are supported: // // icase - match ignoring case // f[".filter_search"] += [](names ns, string re, optional<names> flags) { return filter_search (move (ns), re, move (flags), true /* matching */); }; f[".filter_search"] += [](names ns, names re, optional<names> flags) { return filter_search (move (ns), convert<string> (move (re)), move (flags), true /* matching */); }; f[".filter_out_search"] += [](names ns, string re, optional<names> flags) { return filter_search (move (ns), re, move (flags), false /* matching */); }; f[".filter_out_search"] += [](names ns, names re, optional<names> flags) { return filter_search (move (ns), convert<string> (move (re)), move (flags), false /* matching */); }; // $regex.replace(<val>, <pat>, <fmt> [, <flags>]) // // Replace matched parts in a value of an arbitrary type, using the format // string. Convert the value to string prior to matching. The result value // is always untyped, regardless of the argument type. // // The following flags are supported: // // icase - match ignoring case // // format_first_only - only replace the first match // // format_no_copy - do not copy unmatched value parts into the // result // // If both `format_first_only` and `format_no_copy` flags are specified // then the result will only contain the replacement of the first match. // f[".replace"] += [](value v, string re, string fmt, optional<names> flags) { return replace (move (v), re, fmt, move (flags)); }; f[".replace"] += [](value v, names re, names fmt, optional<names> flags) { return replace (move (v), convert<string> (move (re)), convert<string> (move (fmt)), move (flags)); }; // $regex.replace_lines(<val>, <pat>, <fmt> [, <flags>]) // // Convert the value to string, parse it into lines and for each line // apply the `$regex.replace()` function with the specified pattern, // format, and flags. If the format argument is `null`, omit the // "all-`null`" replacements for the matched lines from the result. Return // unmatched lines and line replacements as a `name` list unless // `return_lines` flag is specified (see below), in which case return a // single multi-line simple `name` value. // // The following flags are supported in addition to the `$regex.replace()` // function's flags: // // return_lines - return the simple name (rather than a name list) // containing the unmatched lines and line replacements // separated with newlines. // // Note that if `format_no_copy` is specified, unmatched lines are not // copied either. // f[".replace_lines"] += [](value v, string re, string fmt, optional<names> flags) { return replace_lines (move (v), re, move (fmt), move (flags)); }; f[".replace_lines"] += [](value v, names re, names* fmt, optional<names> flags) { return replace_lines ( move (v), convert<string> (move (re)), (fmt != nullptr ? optional<string> (convert<string> (move (*fmt))) : nullopt), move (flags)); }; // $regex.split(<val>, <pat>, <fmt> [, <flags>]) // // Split a value of an arbitrary type into a list of unmatched value parts // and replacements of the matched parts, omitting empty ones (unless the // `format_copy_empty` flag is specified). Convert the value to string // prior to matching. // // The following flags are supported: // // icase - match ignoring case // // format_no_copy - do not copy unmatched value parts into the // result // // format_copy_empty - copy empty elements into the result // f[".split"] += [](value v, string re, string fmt, optional<names> flags) { return split (move (v), re, fmt, move (flags)); }; f[".split"] += [](value v, names re, names fmt, optional<names> flags) { return split (move (v), convert<string> (move (re)), convert<string> (move (fmt)), move (flags)); }; // $regex.merge(<vals>, <pat>, <fmt> [, <delim> [, <flags>]]) // // Replace matched parts in a list of elements using the regex format // string. Convert the elements to strings prior to matching. The result // value is untyped and contains concatenation of transformed non-empty // elements (unless the `format_copy_empty` flag is specified) optionally // separated with a delimiter. // // The following flags are supported: // // icase - match ignoring case // // format_first_only - only replace the first match // // format_no_copy - do not copy unmatched value parts into the // result // // format_copy_empty - copy empty elements into the result // // If both `format_first_only` and `format_no_copy` flags are specified // then the result will be a concatenation of only the first match // replacements. // f[".merge"] += [](names ns, string re, string fmt, optional<string*> delim, optional<names> flags) { return merge (move (ns), re, fmt, delim && *delim != nullptr ? move (**delim) : optional<string> (), move (flags)); }; f[".merge"] += [](names ns, names re, names fmt, optional<names*> delim, optional<names> flags) { return merge (move (ns), convert<string> (move (re)), convert<string> (move (fmt)), delim && *delim != nullptr ? convert<string> (move (**delim)) : optional<string> (), move (flags)); }; // $regex.apply(<vals>, <pat>, <fmt> [, <flags>]) // // Replace matched parts of each element in a list using the regex format // string. Convert the elements to strings prior to matching. Return a // list of transformed elements, omitting the empty ones (unless the // `format_copy_empty` flag is specified). // // The following flags are supported: // // icase - match ignoring case // // format_first_only - only replace the first match // // format_no_copy - do not copy unmatched value parts into the // result // // format_copy_empty - copy empty elements into the result // // If both `format_first_only` and `format_no_copy` flags are specified // then the result elements will only contain the replacement of the first // match. // f[".apply"] += [](names ns, string re, string fmt, optional<names> flags) { return apply (move (ns), re, fmt, move (flags)); }; f[".apply"] += [](names ns, names re, names fmt, optional<names> flags) { return apply (move (ns), convert<string> (move (re)), convert<string> (move (fmt)), move (flags)); }; } }