From 6b3e75edf034ebcbd048a24c283c7bcf7b1da019 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Tue, 25 May 2021 11:19:04 +0200 Subject: Add support for regex-based target type/pattern specific variables This is in addition to the already supported path-based target type/pattern specific variables. For example: hxx{*}: x = y # path-based hxx{~/.*/}: x = y # regex-based --- libbuild2/dump.cxx | 10 +- libbuild2/lexer.cxx | 19 +- libbuild2/name.cxx | 81 +++- libbuild2/name.hxx | 38 +- libbuild2/name.ixx | 15 +- libbuild2/parser.cxx | 448 ++++++++++++++------- libbuild2/parser.hxx | 14 +- libbuild2/scope.cxx | 8 +- libbuild2/token.hxx | 3 +- libbuild2/types.hxx | 10 + libbuild2/variable.cxx | 95 ++++- libbuild2/variable.hxx | 64 ++- old-tests/variable/type-pattern/buildfile | 39 -- .../target-type-pattern-specific/testscript | 127 ++++++ 14 files changed, 710 insertions(+), 261 deletions(-) delete mode 100644 old-tests/variable/type-pattern/buildfile diff --git a/libbuild2/dump.cxx b/libbuild2/dump.cxx index bc44b24..23d430e 100644 --- a/libbuild2/dump.cxx +++ b/libbuild2/dump.cxx @@ -147,6 +147,9 @@ namespace build2 const variable_type_map& vtm, const scope& s) { + using pattern = variable_pattern_map::pattern; + using pattern_type = variable_pattern_map::pattern_type; + for (const auto& vt: vtm) { const target_type& t (vt.first); @@ -154,7 +157,7 @@ namespace build2 for (const auto& vp: vpm) { - const string p (vp.first); + const pattern& pat (vp.first); const variable_map& vars (vp.second); os << endl @@ -163,7 +166,10 @@ namespace build2 if (t != target::static_type) os << t.name << '{'; - os << p; + if (pat.type == pattern_type::regex_pattern) + os << '~'; + + os << pat.text; if (t != target::static_type) os << '}'; diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx index 0b6f96d..f445d4b 100644 --- a/libbuild2/lexer.cxx +++ b/libbuild2/lexer.cxx @@ -674,18 +674,15 @@ namespace build2 bool qcomp (false); bool qfirst (false); - auto append = [&lexeme, &m, 
&qcomp, &qfirst] (char c) + auto append = [&lexeme, &m, &qcomp, &qfirst] (char c, bool escaped = false) { - if (m == lexer_mode::double_quoted) - { - if (lexeme.empty ()) // First character. + if (lexeme.empty () && (escaped || m == lexer_mode::double_quoted)) qfirst = true; - } - else - { - if (qcomp) // An unquoted character after a quoted fragment. - qcomp = false; - } + + // An unquoted character after a quoted fragment. + // + if (m != lexer_mode::double_quoted && qcomp) + qcomp = false; lexeme += c; }; @@ -716,7 +713,7 @@ namespace build2 fail (p) << "unterminated escape sequence"; if (p != '\n') // Ignore if line continuation. - append (p); + append (p, true); continue; } diff --git a/libbuild2/name.cxx b/libbuild2/name.cxx index 6a33a63..1081b5c 100644 --- a/libbuild2/name.cxx +++ b/libbuild2/name.cxx @@ -82,8 +82,21 @@ namespace build2 ostream& to_stream (ostream& os, const name& n, bool quote, char pair, bool escape) { - auto write_string = [quote, pair, escape, &os](const string& v, bool pat) + using pattern_type = name::pattern_type; + + auto write_string = [&os, quote, pair, escape] ( + const string& v, + optional pat = nullopt, + bool curly = false) { + // Special characters, path pattern characters, and regex pattern + // characters. The latter only need to be quoted in the first position + // and if followed by a non-alphanumeric delimiter. If that's the only + // special character, then we handle it with escaping rather than + // quoting (see the parsing logic for rationale). Additionally, we + // escape leading `+` in the curly braces which is also recognized as a + // path pattern. + // char sc[] = { '{', '}', '[', ']', '$', '(', ')', // Token endings. ' ', '\t', '\n', '#', // Spaces. @@ -93,9 +106,24 @@ namespace build2 '\0'}; char pc[] = { - '*', '?', // Wildcard characters. + '*', '?', // Path wildcard characters. 
'\0'}; + auto rc = [] (const string& v) + { + return (v[0] == '~' || v[0] == '^') && v[1] != '\0' && !alnum (v[1]); + }; + + if (pat) + { + switch (*pat) + { + case pattern_type::path: break; + case pattern_type::regex_pattern: os << '~'; break; + case pattern_type::regex_substitution: os << '^'; break; + } + } + if (quote && v.find ('\'') != string::npos) { // Quote the string with the double quotes rather than with the single @@ -115,6 +143,11 @@ namespace build2 if (escape) os << '\\'; os << '"'; } + // + // Note that a regex pattern does not need to worry about special path + // pattern characters but not vice versa. See the parsing logic for + // details. + // else if (quote && (v.find_first_of (sc) != string::npos || (!pat && v.find_first_of (pc) != string::npos))) { @@ -126,17 +159,32 @@ namespace build2 if (escape) os << '\\'; os << '\''; } + // Note that currently we do not preserve a leading `+` as a pattern + // unless it has other wildcard characters (see the parsing code for + // details). So we escape it both if it's not a pattern or is a path + // pattern. + // + else if (quote && ((!pat || *pat == pattern_type::path) && + ((v[0] == '+' && curly) || rc (v)))) + { + if (escape) os << '\\'; + os << '\\' << v; + } else os << v; }; uint16_t dv (stream_verb (os).path); // Directory verbosity. - auto write_dir = [dv, quote, &os, &write_string] (const dir_path& d, - bool pat) + auto write_dir = [&os, quote, &write_string, dv] ( + const dir_path& d, + optional pat = nullopt, + bool curly = false) { if (quote) - write_string (dv < 1 ? diag_relative (d) : d.representation (), pat); + write_string (dv < 1 ? 
diag_relative (d) : d.representation (), + pat, + curly); else os << d; }; @@ -151,7 +199,7 @@ namespace build2 if (n.proj) { - write_string (n.proj->string (), false); + write_string (n.proj->string ()); os << '%'; } @@ -171,29 +219,34 @@ namespace build2 dir_path ()); if (!pd.empty ()) - write_dir (pd, false); + write_dir (pd); - if (t || (!d && !v)) + bool curly; + if ((curly = t || (!d && !v))) { if (t) - write_string (n.type, false); + write_string (n.type); os << '{'; } if (v) - write_string (n.value, n.pattern); + write_string (n.value, n.pattern, curly); else if (d) { + // A directory pattern cannot be regex. + // + assert (!n.pattern || *n.pattern == pattern_type::path); + if (rd.empty ()) - write_string (dir_path (".").representation (), false); + write_string (dir_path (".").representation (), nullopt, curly); else if (!pd.empty ()) - write_string (rd.leaf ().representation (), n.pattern); + write_string (rd.leaf ().representation (), n.pattern, curly); else - write_dir (rd, n.pattern); + write_dir (rd, n.pattern, curly); } - if (t || (!d && !v)) + if (curly) os << '}'; return os; diff --git a/libbuild2/name.hxx b/libbuild2/name.hxx index 5c76d07..216f207 100644 --- a/libbuild2/name.hxx +++ b/libbuild2/name.hxx @@ -33,16 +33,21 @@ namespace build2 // If pair is not '\0', then this name and the next in the list form a // pair. Can be used as a bool flag. // - // If pattern is true then this is a name pattern (e.g., file{*.txt}). + // If pattern is present then this is a name pattern (e.g., file{*.txt}, + // file{~'/(.+)\.txt/i'}, file{^'/\1/'}). A directory name cannot be a regex + // pattern (since we would need to store it in dir_path and a regex is not + // necessarily a valid path). 
// struct name { + enum class pattern_type: uint8_t {path, regex_pattern, regex_substitution}; + optional proj; dir_path dir; string type; string value; char pair = '\0'; - bool pattern = false; + optional pattern; name () {} // = default; Clang needs this to initialize const object. name (string v): value (move (v)) {} @@ -57,13 +62,16 @@ namespace build2 : proj (project_name (move (p))), dir (move (d)), type (move (t)), value (move (v)) {} - name (optional p, - dir_path d, - string t, - string v, - bool pat = false) + name (optional p, dir_path d, string t, string v) + : proj (move (p)), dir (move (d)), type (move (t)), value (move (v)) {} + + name (optional p, + dir_path d, + string t, + string v, + optional pt) : proj (move (p)), dir (move (d)), type (move (t)), value (move (v)), - pattern (pat) {} + pattern (pt) {} bool qualified () const {return proj.has_value ();} @@ -162,7 +170,8 @@ namespace build2 cs.append (n.type); cs.append (n.value); cs.append (n.pair); - cs.append (n.pattern); + if (n.pattern) + cs.append (static_cast (*n.pattern)); } // Store a string in a name in a reversible way. If the string ends with a @@ -173,14 +182,21 @@ namespace build2 to_name (string); // Serialize the name to the stream. If requested, the name components - // containing special characters are quoted. The special characters are: + // containing special characters are quoted and/or escaped. The special + // characters are: // // {}[]$() \t\n#\"'% // - // And additionally, if name is not a pattern: + // And additionally, unless name is a pattern: // // *? // + // As well as leading and if followed by a non-alphanumeric delimiter: + // + // ~^ + // + // As well as leading `+` if in the curly braces. + // // If the pair argument is not '\0', then it is added to the above special // characters set. If the quote character is present in the component then // it is double quoted rather than single quoted. 
In this case the following diff --git a/libbuild2/name.ixx b/libbuild2/name.ixx index 80a097e..a3ee94b 100644 --- a/libbuild2/name.ixx +++ b/libbuild2/name.ixx @@ -21,7 +21,20 @@ namespace build2 r = pair < x.pair ? -1 : (pair > x.pair ? 1 : 0); if (r == 0) - r = pattern == x.pattern ? 0 : (!pattern && x.pattern ? -1 : 1); + { + bool p (pattern); + bool xp (x.pattern); + + r = p == xp ? 0 : (p ? 1 : -1); + + if (r == 0 && p) + { + auto p (static_cast (*pattern)); + auto xp (static_cast (*x.pattern)); + + r = p < xp ? -1 : (p > xp ? 1 : 0); + } + } return r; } diff --git a/libbuild2/parser.cxx b/libbuild2/parser.cxx index a9646d5..120d6ab 100644 --- a/libbuild2/parser.cxx +++ b/libbuild2/parser.cxx @@ -638,7 +638,9 @@ namespace build2 // sensitive to the target context in which they are evaluated. The // function signature is: // - // void (token& t, type& tt, const target_type* type, string pat) + // void (token& t, type& tt, + // optional, const target_type* pat_tt, string pat, + // const location& pat_loc) // // Note that the target and its ad hoc members are inserted implied // but this flag can be cleared and default_target logic applied if @@ -695,42 +697,51 @@ namespace build2 // // foo*/dir{*/} -- foo*/*/dir{} // - if (n.value.empty () && !n.dir.empty ()) + // Note also that none of this applies to regex patterns (see + // the parsing code for details). + // + if (*n.pattern == pattern_type::path) { - // Note that we use string and not the representation: in a - // sense the trailing slash in the pattern is subsumed by the - // target type. - // - if (n.dir.simple ()) - n.value = move (n.dir).string (); - else + if (n.value.empty () && !n.dir.empty ()) { - n.value = n.dir.leaf ().string (); - n.dir.make_directory (); - } + // Note that we use string and not the representation: in a + // sense the trailing slash in the pattern is subsumed by + // the target type. 
+ // + if (n.dir.simple ()) + n.value = move (n.dir).string (); + else + { + n.value = n.dir.leaf ().string (); + n.dir.make_directory (); + } - // Treat directory as type dir{} similar to other places. - // - if (n.untyped ()) - n.type = "dir"; - } - else - { - // Move the directory part, if any, from value to dir. - // - try - { - n.canonicalize (); - } - catch (const invalid_path& e) - { - fail (nloc) << "invalid path '" << e.path << "'"; + // Treat directory as type dir{} similar to other places. + // + if (n.untyped ()) + n.type = "dir"; } - catch (const invalid_argument&) + else { - fail (nloc) << "invalid pattern '" << n.value << "'"; + // Move the directory part, if any, from value to dir. + // + try + { + n.canonicalize (); + } + catch (const invalid_path& e) + { + fail (nloc) << "invalid path '" << e.path << "'"; + } + catch (const invalid_argument&) + { + fail (nloc) << "invalid pattern '" << n.value << "'"; + } } } + else if (*n.pattern == pattern_type::regex_substitution) + fail (nloc) << "regex substitution " << n << " without " + << "regex pattern"; // If we have the directory, then it is the scope. // @@ -760,7 +771,7 @@ namespace build2 if (ti == nullptr) fail (nloc) << "unknown target type " << n.type; - f (t, tt, ti, move (n.value)); + f (t, tt, n.pattern, ti, move (n.value), nloc); } else { @@ -781,7 +792,7 @@ namespace build2 enter_adhoc_members (move (ans[i]), true /* implied */); } - f (t, tt, nullptr, string ()); + f (t, tt, nullopt, nullptr, string (), location ()); } if (++i != e) @@ -832,7 +843,8 @@ namespace build2 st = token (t), // Save start token (will be gone on replay). recipes = small_vector, 1> ()] (token& t, type& tt, - const target_type* type, string pat) mutable + optional pt, const target_type* ptt, string pat, + const location& ploc) mutable { token rt; // Recipe start token. @@ -842,7 +854,7 @@ namespace build2 { next (t, tt); // Newline. next (t, tt); // First token inside the variable block. 
- parse_variable_block (t, tt, type, move (pat)); + parse_variable_block (t, tt, pt, ptt, move (pat), ploc); if (tt != type::rcbrace) fail (t) << "expected '}' instead of " << t; @@ -858,7 +870,7 @@ namespace build2 else rt = st; - if (type != nullptr) + if (pt) fail (rt) << "recipe in target type/pattern"; parse_recipe (t, tt, rt, recipes); @@ -921,17 +933,19 @@ namespace build2 // Parse the assignment for each target. // - for_each ([this, &var, akind, &aloc] (token& t, type& tt, - const target_type* type, - string pat) - { - if (type == nullptr) - parse_variable (t, tt, var, akind); - else - parse_type_pattern_variable (t, tt, - *type, move (pat), - var, akind, aloc); - }); + for_each ( + [this, &var, akind, &aloc] ( + token& t, type& tt, + optional pt, const target_type* ptt, string pat, + const location& ploc) + { + if (pt) + parse_type_pattern_variable (t, tt, + *pt, *ptt, move (pat), ploc, + var, akind, aloc); + else + parse_variable (t, tt, var, akind); + }); next_after_newline (t, tt); } @@ -1110,7 +1124,8 @@ namespace build2 void parser:: parse_variable_block (token& t, type& tt, - const target_type* type, string pat) + optional pt, const target_type* ptt, + string pat, const location& ploc) { // Parse a target or prerequisite-specific variable block. If type is not // NULL, then this is a target type/pattern-specific block. @@ -1148,12 +1163,12 @@ namespace build2 << " visibility but is assigned on a target"; } - if (type == nullptr) - parse_variable (t, tt, var, tt); - else + if (pt) parse_type_pattern_variable (t, tt, - *type, pat, // Note: can't move. + *pt, *ptt, pat, ploc, // Note: can't move. 
var, tt, get_location (t)); + else + parse_variable (t, tt, var, tt); if (tt != type::newline) fail (t) << "expected newline instead of " << t; @@ -3835,24 +3850,34 @@ namespace build2 } void parser:: - parse_type_pattern_variable (token& t, token_type& tt, - const target_type& type, string pat, - const variable& var, token_type kind, - const location& loc) + parse_type_pattern_variable ( + token& t, token_type& tt, + pattern_type pt, const target_type& ptt, string pat, const location& ploc, + const variable& var, token_type kind, const location& loc) { // Parse target type/pattern-specific variable assignment. // - // See old-tests/variable/type-pattern. // Note: expanding the value in the current scope context. // value rhs (parse_variable_value (t, tt)); - // Leave the value untyped unless we are assigning. - // - pair, bool> p ( - scope_->target_vars[type][move (pat)].insert ( - var, kind == type::assign)); + pair, bool> p (rhs /* dummy */, false); + try + { + // Leave the value untyped unless we are assigning. + // + // Note that the pattern is preserved if insert fails with regex_error. + // + p = scope_->target_vars[ptt].insert (pt, move (pat)).insert ( + var, kind == type::assign); + } + catch (const regex_error& e) + { + // Print regex_error description if meaningful (no space). + // + fail (ploc) << "invalid regex pattern '" << pat << "'" << e; + } value& lhs (p.first); @@ -4676,7 +4701,7 @@ namespace build2 dir_path d, string t, string v, - bool pat, + optional pat, const location& loc) { // The directory/value must not be empty if we have a type. @@ -4792,8 +4817,9 @@ namespace build2 } name& r ( - append_name ( - ns, move (p), move (d), move (t), move (v), cn.pattern, loc)); + append_name (ns, + move (p), move (d), move (t), move (v), cn.pattern, + loc)); r.pair = cn.pair; } @@ -5492,6 +5518,7 @@ namespace build2 // Return '+' or '-' if a token can start an inclusion or exclusion // (pattern or group), '\0' otherwise. The result can be used as bool. 
+ // Note that token::qfirst covers both quoting and escaping. // auto pattern_prefix = [] (const token& t) -> char { @@ -5758,9 +5785,9 @@ namespace build2 // Find a separator (slash or %). // - string::size_type p (separators != nullptr - ? val.find_last_of (*separators) - : string::npos); + string::size_type pos (separators != nullptr + ? val.find_last_of (*separators) + : string::npos); // First take care of project. A project-qualified name is not very // common, so we can afford some copying for the sake of simplicity. @@ -5768,10 +5795,10 @@ namespace build2 optional p1; const optional* pp1 (&pp); - if (p != string::npos) + if (pos != string::npos) { - bool last (val[p] == '%'); - string::size_type q (last ? p : val.rfind ('%', p - 1)); + bool last (val[pos] == '%'); + string::size_type q (last ? pos : val.rfind ('%', pos - 1)); for (; q != string::npos; ) // Breakout loop. { @@ -5801,13 +5828,13 @@ namespace build2 // Now fix the rest of the name. // val.erase (0, q + 1); - p = last ? string::npos : p - (q + 1); + pos = last ? string::npos : pos - (q + 1); break; } } - string::size_type n (p != string::npos ? val.size () - 1 : 0); + size_t size (pos != string::npos ? val.size () - 1 : 0); // See if this is a type name, directory prefix, or both. That // is, it is followed by an un-separated '{'. 
@@ -5834,7 +5861,7 @@ namespace build2 } } - if (p != n && tp != nullptr && !pinc) + if (pos != size && tp != nullptr && !pinc) fail (loc) << "nested type name " << val; dir_path d1; @@ -5845,9 +5872,9 @@ namespace build2 try { - if (p == string::npos) // type + if (pos == string::npos) // type tp1 = &val; - else if (p == n) // directory + else if (pos == size) // directory { if (dp == nullptr) d1 = dir_path (val); @@ -5858,12 +5885,12 @@ namespace build2 } else // both { - t1.assign (val, p + 1, n - p); + t1.assign (val, pos + 1, size - pos); if (dp == nullptr) - d1 = dir_path (val, 0, p + 1); + d1 = dir_path (val, 0, pos + 1); else - d1 = *dp / dir_path (val, 0, p + 1); + d1 = *dp / dir_path (val, 0, pos + 1); dp1 = &d1; tp1 = &t1; @@ -5893,102 +5920,212 @@ namespace build2 continue; } - // See if this is a wildcard pattern. + // See if this is a pattern, path or regex. + // + // A path pattern either contains an unquoted wildcard character or, + // in the curly context, starts with unquoted/unescaped `+`. + // + // A regex pattern starts with unquoted/unescaped `~` followed by a + // non-alphanumeric delimiter and has the following form: + // + // ~//[] + // + // A regex substitution starts with unquoted/unescaped '^' followed by + // a non-alphanumeric delimiter and has the following form: + // + // ^//[] + // + // Any non-alphanumeric character other than `/` can be used as a + // delimiter but escaping of the delimiter character is not supported + // (one benefit of this is that we can store and print the pattern as + // is without worrying about escaping; the non-alphanumeric part is to + // allow values like ~host and ^cat). // - // It should either contain a wildcard character or, in a curly - // context, start with unquoted '+'. + // The following pattern flags are recognized: // - // Note that in the general case we need to convert it to a path prior - // to testing for being a pattern (think of b[a/r] that is not a - // pattern). 
If the conversion fails then this is not a path pattern. + // i -- match ignoring case + // e -- match including extension // - auto pattern = [&val, &loc, this] () + // Note that we cannot express certain path patterns that start with + // the regex introducer using quoting (for example, `~*`) since + // quoting prevents the whole from being recognized as a path + // pattern. However, we can achieve this with escaping (for example, + // \~*). This works automatically since we treat (at the lexer level) + // escaped first characters as quoted without treating the whole thing + // as quoted. Note that there is also the corresponding logic in + // to_stream(name). + // + // A pattern cannot be project-qualified. + // + optional pat; + + if (pmode != pattern_mode::ignore && !*pp1) { - // Let's optimize it a bit for the common cases. + // Note that in the general case we need to convert it to a path + // prior to testing for being a pattern (think of b[a/r] that is not + // a pattern). // - if (val.find_first_of ("*?[") == string::npos) - return false; + auto path_pattern = [&val, &loc, this] () + { + // Let's optimize it a bit for the common cases. + // + if (val.find_first_of ("*?[") == string::npos) + return false; - if (path::traits_type::find_separator (val) == string::npos) - return path_pattern (val); + if (path_traits::find_separator (val) == string::npos) + return build2::path_pattern (val); - try - { - return path_pattern (path (val)); - } - catch (const invalid_path& e) + try + { + return build2::path_pattern (path (val)); + } + catch (const invalid_path& e) + { + fail (loc) << "invalid path '" << e.path << "'" << endf; + } + }; + + auto regex_pattern = [&val] () { - fail (loc) << "invalid path '" << e.path << "'" << endf; - } - }; + return ((val[0] == '~' || val[0] == '^') && + val[1] != '\0' && !alnum (val[1])); + }; - bool pat (false); - if (pmode == pattern_mode::expand || pmode == pattern_mode::detect) - { - if (!*pp1 && // Cannot be project-qualified. 
- !quoted && // Cannot be quoted. - ((dp != nullptr && dp->absolute ()) || pbase_ != nullptr) && - (pattern () || (curly && val[0] == '+'))) + if (pmode != pattern_mode::preserve) { - // Resolve the target type if there is one. If we fail, then this - // is not a pattern. + // Note that if we have no base directory or cannot resolve the + // target type, then this effectively becomes the ignore mode. // - const target_type* ttp (tp != nullptr && scope_ != nullptr - ? scope_->find_target_type (*tp) - : nullptr); - - if (tp == nullptr || ttp != nullptr) + if (pbase_ != nullptr || (dp != nullptr && dp->absolute ())) { - if (pmode == pattern_mode::detect) + // Note that we have to check for regex patterns first since + // they may also be detected as path patterns. + // + if (!quoted_first && regex_pattern ()) { - // Strip the literal unquoted plus character for the first - // pattern in the group. + // Note: we may decide to support regex-based name generation + // some day (though a substitution won't make sense here). // - if (ppat) - { - assert (val[0] == '+'); + fail (loc) << "regex pattern-based name generation" << + info << "quote '" << val << "' (or escape first character) " + << "to treat it as literal name (or path pattern)"; + } + else if ((!quoted && path_pattern ()) || + (!quoted_first && curly && val[0] == '+')) + { + // Resolve the target type if there is one. + // + const target_type* ttp (tp != nullptr && scope_ != nullptr + ? scope_->find_target_type (*tp) + : nullptr); - val.erase (0, 1); - ppat = pinc = false; + if (tp == nullptr || ttp != nullptr) + { + if (pmode == pattern_mode::detect) + { + // Strip the literal unquoted plus character for the first + // pattern in the group. + // + if (ppat) + { + assert (val[0] == '+'); + val.erase (0, 1); + ppat = pinc = false; + } + + // Set the detect pattern mode to expand if the pattern is + // not followed by the inclusion/exclusion pattern/match. 
+ // Note that if it is '}' (i.e., the end of the group), + // then it is a single pattern and the expansion is what + // we want. + // + if (!pattern_prefix (peeked ())) + pmode = pattern_mode::expand; + } + + if (pmode == pattern_mode::expand) + { + count = expand_name_pattern (get_location (t), + names {name (move (val))}, + ns, + what, + pairn, + dp, tp, ttp); + continue; + } + + pattern_detected (ttp); + + // Fall through. } + } + } + } + else + { + // For the preserve mode we treat it as a pattern if it look like + // one syntactically. For now we also don't treat leading `+` in + // the curly context as an indication of a path pattern (since + // there isn't any good reason to; see also to_stream(name) for + // the corresponding serialization logic). + // + if (!quoted_first && regex_pattern ()) + { + const char* w; + if (val[0] == '~') + { + w = "regex pattern"; + pat = pattern_type::regex_pattern; + } + else + { + w = "regex substitution"; + pat = pattern_type::regex_substitution; + } - // Reset the detect pattern mode to expand if the pattern is - // not followed by the inclusion/exclusion pattern/match. Note - // that if it is '}' (i.e., the end of the group), then it is - // a single pattern and the expansion is what we want. - // - if (!pattern_prefix (peeked ())) - pmode = pattern_mode::expand; + size_t n (val.size ()); + + // Verify delimiters and find the position of the flags. + // + char d (val[1]); + size_t p (val.rfind (d)); + + if (p == 1) + { + fail (loc) << "no trailing delimiter '" << d << "' in " + << w << " '" << val << "'" << + info << "quote '" << val << "' (or escape first character) " + << "to treat it as literal name (or path pattern)"; } - if (pmode == pattern_mode::expand) + // Verify flags. 
+ // + for (size_t i (++p); i != n; ++i) { - count = expand_name_pattern (get_location (t), - names {name (move (val))}, - ns, - what, - pairn, - dp, tp, ttp); - continue; + char f (val[i]); + + if (*pat == pattern_type::regex_pattern) + { + if (f == 'i' || f == 'e') + continue; + } + + fail (loc) << "unknown flag '" << f << "' in " << w << " '" + << val << "'"; } - pattern_detected (ttp); + val.erase (0, 1); // Remove `~` or `^`. - // Fall through. + // Make sure we don't treat something like `~/.../` as a + // directory. + // + pos = string::npos; + size = 0; } + else if (!quoted && path_pattern ()) + pat = pattern_type::path; } } - else if (pmode == pattern_mode::preserve) - { - // For the preserve mode we treat it as a pattern if it look like - // one syntactically. For now we also don't treat leading `+` in the - // curly context as an indication of a pattern. - // - if (!*pp1 && // Cannot be project-qualified. - !quoted && // Cannot be quoted. - pattern ()) - pat = true; - } // If we are a second half of a pair, add another first half // unless this is the first instance. @@ -6006,7 +6143,9 @@ namespace build2 // in scope::find_target_type(). This would also mess up // reversibility to simple name. // - if (p == n) + // Note: a regex pattern cannot be a directory (see above). + // + if (pos == size) { // For reversibility to simple name, only treat it as a directory // if the string is an exact representation. @@ -6021,8 +6160,7 @@ namespace build2 append_name ( ns, *pp1, move (dir), (tp != nullptr ? *tp : string ()), string (), - pat, - loc); + pat, loc); continue; } @@ -6568,7 +6706,7 @@ namespace build2 (dp != nullptr ? *dp : dir_path ()), (tp != nullptr ? *tp : string ()), string (), - false /* pattern */, + nullopt, /* pattern */ get_location (t)); count = 1; } @@ -6589,7 +6727,7 @@ namespace build2 (dp != nullptr ? *dp : dir_path ()), (tp != nullptr ? 
*tp : string ()), string (), - false /* pattern */, + nullopt, /* pattern */ get_location (t)); count = 0; } @@ -6617,7 +6755,7 @@ namespace build2 (dp != nullptr ? *dp : dir_path ()), (tp != nullptr ? *tp : string ()), string (), - false /* pattern */, + nullopt, /* pattern */ get_location (t)); break; } @@ -6636,7 +6774,7 @@ namespace build2 (dp != nullptr ? *dp : dir_path ()), (tp != nullptr ? *tp : string ()), string (), - false /* pattern */, + nullopt, /* pattern */ get_location (t)); } diff --git a/libbuild2/parser.hxx b/libbuild2/parser.hxx index 889d339..007e508 100644 --- a/libbuild2/parser.hxx +++ b/libbuild2/parser.hxx @@ -103,6 +103,7 @@ namespace build2 // Recursive descent parser. // protected: + using pattern_type = name::pattern_type; // Pattern expansion mode. // @@ -129,8 +130,10 @@ namespace build2 void parse_variable_block (token&, token_type&, - const target_type* = nullptr, - string = string ()); + optional = {}, + const target_type* = nullptr, + string = {}, + const location& = {}); void parse_recipe (token&, token_type&, @@ -223,9 +226,10 @@ namespace build2 parse_variable (token&, token_type&, const variable&, token_type); void - parse_type_pattern_variable (token&, token_type&, - const target_type&, string, - const variable&, token_type, const location&); + parse_type_pattern_variable ( + token&, token_type&, + pattern_type, const target_type&, string, const location&, + const variable&, token_type, const location&); const variable& parse_variable_name (names&&, const location&); diff --git a/libbuild2/scope.cxx b/libbuild2/scope.cxx index 46e3dcd..f2700c4 100644 --- a/libbuild2/scope.cxx +++ b/libbuild2/scope.cxx @@ -49,7 +49,7 @@ namespace build2 const scope* s, const target_key* tk, const target_key* gk, - optional n) + string n) { const value& v (*l); assert ((v.extra == 1 || v.extra == 2) && v.type == nullptr); @@ -70,7 +70,7 @@ namespace build2 pair entry ( s->target_vars.cache.insert ( ctx, - make_tuple (&v, tk->type, n && 
!n->empty () ? move (*n) : *tk->name), + make_tuple (&v, tk->type, !n.empty () ? move (n) : *tk->name), stem, static_cast (v).version, var)); @@ -146,7 +146,7 @@ namespace build2 if (l.defined ()) { if (l->extra != 0) // Prepend/append? - pre_app (l, s, tk, gk, move (tn)); + pre_app (l, s, tk, gk, move (*tn)); return make_pair (move (l), d); } @@ -164,7 +164,7 @@ namespace build2 if (l.defined ()) { if (l->extra != 0) // Prepend/append? - pre_app (l, s, gk, nullptr, move (gn)); + pre_app (l, s, gk, nullptr, move (*gn)); return make_pair (move (l), d); } diff --git a/libbuild2/token.hxx b/libbuild2/token.hxx index faae466..fca888c 100644 --- a/libbuild2/token.hxx +++ b/libbuild2/token.hxx @@ -120,7 +120,8 @@ namespace build2 // Quoting can be complete, where the token starts and ends with the quote // characters and quoting is contiguous or partial where only some part(s) // of the token are quoted or quoting continues to the next token. We also - // keep track whether the first character of a token is quoted. + // keep track whether the first character of a token is quoted (we also + // treat escaped first character as quoted). // quote_type qtype; bool qcomp; diff --git a/libbuild2/types.hxx b/libbuild2/types.hxx index dd82ef1..8dfda6e 100644 --- a/libbuild2/types.hxx +++ b/libbuild2/types.hxx @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include // unique_ptr, shared_ptr @@ -44,6 +45,7 @@ #include #include +#include #include #include #include @@ -105,6 +107,14 @@ namespace build2 using std::endl; using std::streamsize; // C++'s ssize_t. + // Regex. + // + // Note that includes an ostream insertion operator for + // regex_error which prints cleaned up message, if any. + // + using std::regex; + using std::regex_error; + // Concurrency. 
// using std::atomic; diff --git a/libbuild2/variable.cxx b/libbuild2/variable.cxx index 3e3bf05..1855f3e 100644 --- a/libbuild2/variable.cxx +++ b/libbuild2/variable.cxx @@ -1796,6 +1796,51 @@ namespace build2 return m_.erase (var) != 0; } + // variable_pattern_map + // + variable_map& variable_pattern_map:: + insert (pattern_type type, string&& text) + { + auto r (map_.emplace (pattern {type, false, move (text), {}}, + variable_map (ctx, global_))); + + // Compile the regex. + // + if (r.second && type == pattern_type::regex_pattern) + { + // On exception restore the text argument (so that it's available for + // diagnostics) and remove the element from the map. + // + auto eg (make_exception_guard ( + [&text, &r, this] () + { + text = r.first->first.text; + map_.erase (r.first); + })); + + const string& t (r.first->first.text); + size_t n (t.size ()), p (t.rfind (t[0])); + + // Convert flags. + // + regex::flag_type f (regex::ECMAScript); + for (size_t i (p + 1); i != n; ++i) + { + switch (t[i]) + { + case 'i': f |= regex::icase; break; + case 'e': r.first->first.match_ext = true; break; + } + } + + // Skip leading delimiter as well as trailing delimiter and flags. + // + r.first->first.regex = regex (t.c_str () + 1, p - 1, f); + } + + return r.first->second; + } + // variable_type_map // lookup variable_type_map:: @@ -1805,6 +1850,8 @@ namespace build2 { // Compute and cache "effective" name that we will be matching. // + // See also the additional match_ext logic below. + // auto name = [&tk, &oname] () -> const string& { if (!oname) @@ -1856,24 +1903,40 @@ namespace build2 if (i == end ()) continue; - // Try to match the pattern, starting from the longest values - // so that the more "specific" patterns (i.e., those that cover - // fewer characters with the wildcard) take precedence. See - // tests/variable/type-pattern. + // Try to match the pattern, starting from the longest values. 
// const variable_pattern_map& m (i->second); - for (auto j (m.rbegin ()); j != m.rend (); ++j) { - const string& pat (j->first); + using pattern = variable_pattern_map::pattern; + using pattern_type = variable_pattern_map::pattern_type; - //@@ TODO: should we detect ambiguity? 'foo-*' '*-foo' and 'foo-foo'? - // Right now the last defined will be used. - // - if (pat != "*") + const pattern& pat (j->first); + + bool r, e (false); + if (pat.type == pattern_type::path) + { + r = pat.text == "*" || butl::path_match (name (), pat.text); + } + else { - if (!butl::path_match (name (), pat)) - continue; + const string& n (name ()); + + // Deal with match_ext: first see if the extension would be added by + // default. If not, then temporarily add it in oname and then clean + // it up if there is no match (to prevent another pattern from using + // it). While we may keep adding it if there are multiple patterns + // with such a flag, we will at least reuse the buffer in oname. + // + e = pat.match_ext && tk.ext && !tk.ext->empty () && oname->empty (); + if (e) + { + *oname = *tk.name; + *oname += '.'; + *oname += *tk.ext; + } + + r = regex_match (e ? *oname : n, *pat.regex); } // Ok, this pattern matches. But is there a variable? @@ -1882,8 +1945,9 @@ namespace build2 // to automatically type it. And if it is assignment, then typify it // ourselves. // - const variable_map& vm (j->second); + if (r) { + const variable_map& vm (j->second); auto p (vm.lookup (var, false)); if (const variable_map::value_data* v = p.first) { @@ -1895,12 +1959,15 @@ namespace build2 // Make sure the effective name is computed if this is // append/prepend (it is used as a cache key). 
// - if (v->extra != 0) + if (v->extra != 0 && !oname) name (); return lookup (*v, p.second, vm); } } + + if (e) + oname->clear (); } } diff --git a/libbuild2/variable.hxx b/libbuild2/variable.hxx index a272013..573f968 100644 --- a/libbuild2/variable.hxx +++ b/libbuild2/variable.hxx @@ -1740,17 +1740,69 @@ namespace build2 class variable_pattern_map { public: - using map_type = map; + using pattern_type = name::pattern_type; + + // We use the map to keep the patterns in the shortest-first order. This + // is used during match where we start from the longest values so that + // the more "specific" patterns (i.e., those that cover fewer characters + // with the wildcard) take precedence. + // + // Note that this is only an approximation (e.g., `*[0-9]` vs `[0-9]`) but + // it's sufficient in practice (e.g., `*` vs `*.txt`). We also have the + // ambiguity problem (e.g., `foo-foo` matching both `foo-*` and `*-foo`). + // + // And, of course, this doesn't apply across pattern types so we always + // treat regex patterns as more specific than path patterns. + // + // While it feels like this should be a union (with pattern_type as the + // discriminator), we need to keep the original regex text for dumping. + // So we just keep optional which is absent for path patterns (it's + // optional since a default-constructed regex has a pattern). BTW, the + // size of std::regex object ranges between 32 and 64 bytes, depending on + // the implementation. + // + struct pattern + { + pattern_type type; + mutable bool match_ext; // Match extension flag. + string text; + mutable optional regex; + }; + + struct pattern_compare + { + bool operator() (const pattern& x, const pattern& y) const + { + return x.type != y.type + ? x.type == pattern_type::path + : (x.text.size () != y.text.size () + ? 
x.text.size () < y.text.size () + : x.text < y.text); + } + }; + + using map_type = map; using const_iterator = map_type::const_iterator; using const_reverse_iterator = map_type::const_reverse_iterator; variable_pattern_map (context& c, bool global) : ctx (c), global_ (global) {} + // Note that here we assume the "outer" pattern format (delimiters, flags, + // etc) is valid. + // + // Note: may throw regex_error in which case text is preserved. + // + variable_map& + insert (pattern_type type, string&& text); + + // Convenience shortcut for path patterns. + // variable_map& - operator[] (const string& v) + operator[] (string text) { - return map_.emplace (v, variable_map (ctx, global_)).first->second; + return map_.emplace (pattern {pattern_type::path, false, move (text), {}}, + variable_map (ctx, global_)).first->second; } const_iterator begin () const {return map_.begin ();} @@ -1769,7 +1821,7 @@ namespace build2 { public: using map_type = map, - variable_pattern_map>; + variable_pattern_map>; using const_iterator = map_type::const_iterator; variable_type_map (context& c, bool global): ctx (c), global_ (global) {} @@ -1785,6 +1837,10 @@ namespace build2 const_iterator end () const {return map_.end ();} bool empty () const {return map_.empty ();} + // If found append/prepend then name is guaranteed to either contain the + // full name that was used for the match or be empty in which case the + // original target name was used. + // lookup find (const target_key&, const variable&, optional& name) const; diff --git a/old-tests/variable/type-pattern/buildfile b/old-tests/variable/type-pattern/buildfile deleted file mode 100644 index dd218ac..0000000 --- a/old-tests/variable/type-pattern/buildfile +++ /dev/null @@ -1,39 +0,0 @@ -#dir/foo{*}: x = y # directory -#foo{*.*}: x = y # multiple wildcards -#foo{*}: x = y # unknown target type -#file{*}: x += y # append - -# Use --verbose 6 to examine.
-# - -dir{*}: x = y - -x = z -dir{*-foo}: x = $x # 'z' - -x = G -file{*-foo}: x = x -file{xfoo}: x = $x # 'G' -file{-foo}: x = $x # 'x' -file{x-foo}: x = $x # 'x' -file{bar-*-foo}: x = X -file{bar-x}: x = $x # 'G' -file{bar--foo}: x = $x # 'X' -file{bar-x-foo}: x = $x # 'X' - -file{*-fox}: x = 1 -file{fox-*}: x = 2 -file{fox-fox}: x = $x # '2' -file{*-fox}: x = 3 -file{fox-x-fox}: x = $x # still '2'! - -*-foz: x = z # any target -file{x-foz}: x = $x # 'z' - -# These should all be the same. -# -*: x1 = X1 -{*}: x2 = X2 -*{*}: x3 = X3 - -./: diff --git a/tests/variable/target-type-pattern-specific/testscript b/tests/variable/target-type-pattern-specific/testscript index 1a3e98a..9962342 100644 --- a/tests/variable/target-type-pattern-specific/testscript +++ b/tests/variable/target-type-pattern-specific/testscript @@ -19,6 +19,65 @@ X y Y EOO +: old-tests-type-pattern +: +$* <>EOO +dir{*}: x = y + +x = z +dir{*-foo}: x = $x # 'z' +print $(bar-foo/: x) + +x = G +file{*-foo}: x = x +file{xfoo}: x = $x # 'G' +print $(file{xfoo}: x) +file{-foo}: x = $x # 'x' +print $(file{-foo}: x) +file{x-foo}: x = $x # 'x' +print $(file{x-foo}: x) +file{bar-*-foo}: x = X +file{bar-x}: x = $x # 'G' +print $(file{bar-x}: x) +file{bar--foo}: x = $x # 'X' +print $(file{bar--foo}: x) +file{bar-x-foo}: x = $x # 'X' +print $(file{bar-x-foo}: x) + +file{*-fox}: x = 1 +file{fox-*}: x = 2 +file{fox-fox}: x = $x # '2' +print $(file{fox-fox}: x) +file{*-fox}: x = 3 +file{fox-x-fox}: x = $x # still '2'! 
+print $(file{fox-fox}: x) + +*-foz: x = z # any target +file{x-foz}: x = $x # 'z' +print $(file{x-foz}: x) + +*: x1 = X1 +{*}: x2 = X2 +*{*}: x3 = X3 +print $(file{x}: x1) +print $(file{x}: x2) +print $(file{x}: x3) +EOI +z +G +x +x +G +X +X +2 +2 +z +X1 +X2 +X3 +EOO + : block : $* <>EOO @@ -55,3 +114,71 @@ EOI X y Y EOO + +: regex +: +{ + : flag-icase + : + $* <>EOO + file{~/'.+\.txt'/i}: x = 1 + + print $(file{foo.txt}: x) + print $(file{foo.TXT}: x) + EOI + 1 + 1 + EOO + + : flag-match-ext + : + $* <>EOO + define txt: file + + txt{*}: x = 0 + txt{~/'[^.]+'/}: x = 1 + txt{~/'.+\.tx'/e}: x = 2 + txt{~/'.+\.txt'/e}: x = 3 + + print $(txt{foo.x}: x) + print $(txt{foo.tx}: x) + print $(txt{foo.txt}: x) + print $(txt{foo.bar...}: x) + EOI + 1 + 2 + 3 + 0 + EOO + + : backref + : + $* <>EOO + x = 0 + file{~/'(.+)-\1'/}: x = 1 + + print $(file{foo-foo}: x) + print $(file{foo-bar}: x) + EOI + 1 + 0 + EOO + + : dir + : + $* <>EOO + foo/dir{~/b.+/}: x = 1 + + print $(foo/dir{bar}: x) + EOI + 1 + EOO + + : invalid + : + $* <>~/EOE/ != 0 + file{~/'(.+'/}: x = 1 + EOI + /:1:1: error: invalid regex pattern .+/ + EOE +} -- cgit v1.1