From 044e2e1c1460fb060f677a366144b98905522754 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Tue, 31 Jan 2017 22:08:38 +0300 Subject: Add sed builtin --- build2/b.cxx | 16 +- build2/buildfile | 1 + build2/regex | 57 ++++ build2/regex.cxx | 42 +++ build2/regex.txx | 215 +++++++++++++++ build2/test/script/builtin.cxx | 506 ++++++++++++++++++++++++++++-------- build2/test/script/regex | 5 +- build2/test/script/runner.cxx | 38 +-- tests/test/script/builtin/buildfile | 2 +- tests/test/script/builtin/sed.test | 312 ++++++++++++++++++++++ 10 files changed, 1045 insertions(+), 149 deletions(-) create mode 100644 build2/regex create mode 100644 build2/regex.cxx create mode 100644 build2/regex.txx create mode 100644 tests/test/script/builtin/sed.test diff --git a/build2/b.cxx b/build2/b.cxx index e576435..b06459b 100644 --- a/build2/b.cxx +++ b/build2/b.cxx @@ -2,7 +2,10 @@ // copyright : Copyright (c) 2014-2017 Code Synthesis Ltd // license : MIT; see accompanying LICENSE file -#include // strerror() +#ifndef _WIN32 +# include // signal() +#endif + #include // getenv() _putenv()(_WIN32) #include @@ -82,6 +85,17 @@ main (int argc, char* argv[]) { tracer trace ("main"); + // On POSIX ignore SIGPIPE which is signaled to a pipe-writing process if + // the pipe reading end is closed. Note that by default this signal + // terminates a process. Also note that there is no way to disable this + // behavior on a file descriptor basis or for the write() function call. + // +#ifndef _WIN32 + if (signal (SIGPIPE, SIG_IGN) == SIG_ERR) + fail << "unable to ignore broken pipe (SIGPIPE) signal: " + << system_error (errno, system_category ()); // Sanitize. +#endif + // Parse the command line. We want to be able to specify options, vars, // and buildspecs in any order (it is really handy to just add -v at the // end of the command line). 
diff --git a/build2/buildfile b/build2/buildfile index 84e2f82..1ee7063 100644 --- a/build2/buildfile +++ b/build2/buildfile @@ -26,6 +26,7 @@ exe{b}: \ {hxx cxx}{ operation } \ {hxx cxx}{ parser } \ {hxx cxx}{ prerequisite } \ + {hxx txx cxx}{ regex } \ {hxx cxx}{ rule } \ {hxx }{ rule-map } \ {hxx txx cxx}{ scheduler } \ diff --git a/build2/regex b/build2/regex new file mode 100644 index 0000000..dc6dc96 --- /dev/null +++ b/build2/regex @@ -0,0 +1,57 @@ +// file : build2/regex -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef BUILD2_REGEX +#define BUILD2_REGEX + +#include +#include +#include // basic_string + +#include +#include + +namespace build2 +{ + // Like std::regex_replace() but extends the standard ECMA-262 + // substitution escape sequences with a subset of Perl sequences: + // + // \\, \u, \l, \U, \L, \E, \1, ..., \9 + // + // Also return the resulting string as well as whether the search + // succeeded. + // + // Notes and limitations: + // + // - The only valid regex_constants flags are match_default, + // format_first_only (format_no_copy can easily be supported). + // + // - If backslash doesn't start any of the listed sequences then it is + // silently dropped and the following character is copied as is. + // + // - The character case conversion is performed according to the global + // C++ locale (which, unless changed, is the same as the C locale and + // both default to the POSIX locale aka "C"). + // + template + pair, bool> + regex_replace_ex (const std::basic_string&, + const std::basic_regex&, + const std::basic_string& fmt, + std::regex_constants::match_flag_type = + std::regex_constants::match_default); +} + +namespace std +{ + // Print regex error description but only if it is meaningful (this is also + // why we have to print leading colon). 
+ // + ostream& + operator<< (ostream&, const regex_error&); +} + +#include + +#endif // BUILD2_REGEX diff --git a/build2/regex.cxx b/build2/regex.cxx new file mode 100644 index 0000000..40347b5 --- /dev/null +++ b/build2/regex.cxx @@ -0,0 +1,42 @@ +// file : build2/regex.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include + +#if defined(_MSC_VER) && _MSC_VER <= 1910 +# include // strstr() +#endif + +#include +#include + +namespace std +{ + // Currently libstdc++ just returns the name of the exception (bug #67361). + // So we check that the description contains at least one space character. + // + // While VC's description is meaningful, it has an undesired prefix that + // resembles the following: 'regex_error(error_badrepeat): '. So we skip it. + // + ostream& + operator<< (ostream& o, const regex_error& e) + { + const char* d (e.what ()); + +#if defined(_MSC_VER) && _MSC_VER <= 1910 + const char* rd (strstr (d, "): ")); + if (rd != nullptr) + d = rd + 3; +#endif + + ostringstream os; + os << runtime_error (d); // Sanitize the description. 
+ + string s (os.str ()); + if (s.find (' ') != string::npos) + o << ": " << s; + + return o; + } +} diff --git a/build2/regex.txx b/build2/regex.txx new file mode 100644 index 0000000..1325de9 --- /dev/null +++ b/build2/regex.txx @@ -0,0 +1,215 @@ +// file : build2/regex.txx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +namespace build2 +{ + template + pair, bool> + regex_replace_ex (const std::basic_string& s, + const std::basic_regex& re, + const std::basic_string& fmt, + std::regex_constants::match_flag_type flags) + { + using namespace std; + + using string_type = basic_string; + using str_it = typename string_type::const_iterator; + using regex_it = regex_iterator; + + bool first_only ((flags & std::regex_constants::format_first_only) == + std::regex_constants::format_first_only); + + locale cl; // Copy of the global C++ locale. + string_type r; + + // Beginning of the last unmatched substring. + // + str_it ub (s.begin ()); + + regex_it b (s.begin (), s.end (), re, flags); + regex_it e; + bool match (b != e); + + for (regex_it i (b); i != e; ++i) + { + const match_results& m (*i); + + // Copy the preceding unmatched substring, save the beginning of the + // one that follows. + // + r.append (ub, m.prefix ().second); + ub = m.suffix ().first; + + if (first_only && i != b) + r.append (m[0].first, m[0].second); // Append matched substring. + else + { + // The standard implementation calls m.format() here. We perform our + // own formatting. + // + // Note that we are using char type literals with the assumption that + // being ASCII characters they will be properly "widened" to the + // corresponding literals of the C template parameter type. + // + auto digit = [] (C c) -> int + { + return c >= '0' && c <= '9' ? 
c - '0' : -1; + }; + + enum class case_conv {none, upper, lower, upper_once, lower_once} + mode (case_conv::none); + + auto conv_chr = [&mode, &cl] (C c) -> C + { + switch (mode) + { + case case_conv::upper_once: mode = case_conv::none; + case case_conv::upper: c = toupper (c, cl); break; + case case_conv::lower_once: mode = case_conv::none; + case case_conv::lower: c = tolower (c, cl); break; + case case_conv::none: break; + } + return c; + }; + + auto append_chr = [&r, &conv_chr] (C c) + { + r.push_back (conv_chr (c)); + }; + + auto append_str = [&r, &mode, &conv_chr] (str_it b, str_it e) + { + // Optimize for the common case. + // + if (mode == case_conv::none) + r.append (b, e); + else + { + for (str_it i (b); i != e; ++i) + r.push_back (conv_chr (*i)); + } + }; + + size_t n (fmt.size ()); + for (size_t i (0); i < n; ++i) + { + C c (fmt[i]); + + switch (c) + { + case '$': + { + // Check if this is a $-based escape sequence. Interpret it + // accordingly if that's the case, treat '$' as a regular + // character otherwise. + // + c = fmt[++i]; // '\0' if last. + + switch (c) + { + case '$': append_chr (c); break; + case '&': append_str (m[0].first, m[0].second); break; + case '`': + { + append_str (m.prefix ().first, m.prefix ().second); + break; + } + case '\'': + { + append_str (m.suffix ().first, m.suffix ().second); + break; + } + default: + { + // Check if this is a sub-expression 1-based index ($n or + // $nn). Append the matching substring if that's the case. + // Treat '$' as a regular character otherwise. Index greater + // than the sub-expression count is silently ignored. + // + int si (digit (c)); + if (si >= 0) + { + int d; + if ((d = digit (fmt[i + 1])) >= 0) // '\0' if last. + { + si = si * 10 + d; + ++i; + } + } + + if (si > 0) + { + // m[0] refers to the matched substring. + // + if (static_cast (si) < m.size ()) + append_str (m[si].first, m[si].second); + } + else + { + // Not a $-based escape sequence so treat '$' as a + // regular character. 
+ // + --i; + append_chr ('$'); + } + + break; + } + } + + break; + } + case '\\': + { + c = fmt[++i]; // '\0' if last. + + switch (c) + { + case '\\': append_chr (c); break; + + case 'u': mode = case_conv::upper_once; break; + case 'l': mode = case_conv::lower_once; break; + case 'U': mode = case_conv::upper; break; + case 'L': mode = case_conv::lower; break; + case 'E': mode = case_conv::none; break; + default: + { + // Check if this is a sub-expression 1-based index. Append + // the matching substring if that's the case. Skip '\\' + // otherwise. Index greater than the sub-expression count is + // silently ignored. + // + int si (digit (c)); + if (si > 0) + { + // m[0] refers to the matched substring. + // + if (static_cast (si) < m.size ()) + append_str (m[si].first, m[si].second); + } + else + --i; + + break; + } + } + + break; + } + default: + { + // Append a regular character. + // + append_chr (c); + break; + } + } + } + } + } + + r.append (ub, s.end ()); // Append the rightmost non-matched substring. + return make_pair (move (r), match); + } +} diff --git a/build2/test/script/builtin.cxx b/build2/test/script/builtin.cxx index 008ac32..3957adb 100644 --- a/build2/test/script/builtin.cxx +++ b/build2/test/script/builtin.cxx @@ -10,12 +10,17 @@ # include #endif +#include #include +#include +#include #include // use default operator<< implementation #include // fdopen_mode, fdstream_mode #include // mkdir_status +#include + #include // Strictly speaking a builtin which reads/writes from/to standard streams @@ -51,6 +56,74 @@ namespace build2 // struct failed {}; + // Accumulate an error message, print it atomically in dtor to the + // provided stream and throw failed afterwards if requested. Prefixes + // the message with the builtin name. + // + // Move constructible-only, not assignable (based on diag_record). 
+ // + class error_record + { + public: + template + friend const error_record& + operator<< (const error_record& r, const T& x) + { + r.ss_ << x; + return r; + } + + error_record (ostream& o, bool fail, const char* name) + : os_ (o), fail_ (fail), empty_ (false) + { + ss_ << name << ": "; + } + + // Older versions of libstdc++ don't have the ostringstream move + // support. Luckily, GCC doesn't seem to be actually needing move due + // to copy/move elision. + // +#ifdef __GLIBCXX__ + error_record (error_record&&); +#else + error_record (error_record&& r) + : os_ (r.os_), + ss_ (move (r.ss_)), + fail_ (r.fail_), + empty_ (r.empty_) + { + r.empty_ = true; + } +#endif + + ~error_record () noexcept (false) + { + if (!empty_) + { + // The output stream can be in a bad state (for example as a + // result of unsuccessful attempt to report a previous error), so + // we check it. + // + if (os_.good ()) + { + ss_.put ('\n'); + os_ << ss_.str (); + os_.flush (); + } + + if (fail_) + throw failed (); + } + } + + private: + ostream& os_; + mutable ostringstream ss_; + + bool fail_; + bool empty_; + }; + // Parse and normalize a path. Also, unless it is already absolute, make // the path absolute using the specified directory. Throw invalid_path // if the path is empty, and on parsing and normalization failures. 
@@ -103,6 +176,11 @@ namespace build2 uint8_t r (1); ofdstream cerr (move (err)); + auto error = [&cerr] (bool fail = true) + { + return error_record (cerr, fail, "cat"); + }; + try { ifdstream cin (move (in), fdstream_mode::binary); @@ -154,15 +232,15 @@ namespace build2 } catch (const io_error& e) { - cerr << "cat: unable to print "; + error_record d (error ()); + d << "unable to print "; if (p.empty ()) - cerr << "stdin"; + d << "stdin"; else - cerr << "'" << p << "'"; + d << "'" << p << "'"; - cerr << ": " << e << endl; - throw failed (); + d << ": " << e; } cin.close (); @@ -171,15 +249,13 @@ namespace build2 } catch (const invalid_path& e) { - cerr << "cat: invalid path '" << e.path << "'" << endl; + error (false) << "invalid path '" << e.path << "'"; } - // Can be thrown while closing cin, cout or writing to cerr (that's - // why need to check its state before writing). + // Can be thrown while creating/closing cin, cout or writing to cerr. // catch (const io_error& e) { - if (cerr.good ()) - cerr << "cat: " << e << endl; + error (false) << e; } catch (const failed&) { @@ -215,8 +291,7 @@ namespace build2 for (auto b (args.begin ()), i (b), e (args.end ()); i != e; ++i) cout << (i != b ? " " : "") << *i; - cout << endl; - + cout << '\n'; cout.close (); r = 0; } @@ -291,6 +366,11 @@ namespace build2 uint8_t r (1); ofdstream cerr (move (err)); + auto error = [&cerr] (bool fail = true) + { + return error_record (cerr, fail, "mkdir"); + }; + try { in.close (); @@ -317,10 +397,7 @@ namespace build2 // Create directories. 
// if (i == args.end ()) - { - cerr << "mkdir: missing directory" << endl; - throw failed (); - } + error () << "missing directory"; for (; i != args.end (); ++i) { @@ -337,9 +414,7 @@ namespace build2 } catch (const system_error& e) { - cerr << "mkdir: unable to create directory '" << p << "': " - << e << endl; - throw failed (); + error () << "unable to create directory '" << p << "': " << e; } } @@ -347,15 +422,13 @@ namespace build2 } catch (const invalid_path& e) { - cerr << "mkdir: invalid path '" << e.path << "'" << endl; + error (false) << "invalid path '" << e.path << "'"; } - // Can be thrown while closing in, out or writing to cerr (that's why - // need to check its state before writing). + // Can be thrown while closing in, out or writing to cerr. // catch (const io_error& e) { - if (cerr.good ()) - cerr << "mkdir: " << e << endl; + error (false) << e; } catch (const failed&) { @@ -403,6 +476,11 @@ namespace build2 uint8_t r (1); ofdstream cerr (move (err)); + auto error = [&cerr] (bool fail = true) + { + return error_record (cerr, fail, "rm"); + }; + try { in.close (); @@ -432,10 +510,7 @@ namespace build2 // Remove entries. 
// if (i == args.end () && !force) - { - cerr << "rm: missing file" << endl; - throw failed (); - } + error () << "missing file"; const dir_path& wd (sp.wd_path); const dir_path& rwd (sp.root->wd_path); @@ -445,11 +520,8 @@ namespace build2 path p (parse_path (*i, wd)); if (!p.sub (rwd) && !force) - { - cerr << "rm: '" << p << "' is out of working directory '" << rwd - << "'" << endl; - throw failed (); - } + error () << "'" << p << "' is out of working directory '" << rwd + << "'"; try { @@ -458,17 +530,11 @@ namespace build2 if (dir_exists (d)) { if (!dir) - { - cerr << "rm: '" << p << "' is a directory" << endl; - throw failed (); - } + error () << "'" << p << "' is a directory"; if (wd.sub (d)) - { - cerr << "rm: '" << p << "' contains test working directory '" - << wd << "'" << endl; - throw failed (); - } + error () << "'" << p << "' contains test working directory '" + << wd << "'"; // The call can result in rmdir_status::not_exist. That's not // very likelly but there is also nothing bad about it. @@ -480,8 +546,7 @@ namespace build2 } catch (const system_error& e) { - cerr << "rm: unable to remove '" << p << "': " << e << endl; - throw failed (); + error () << "unable to remove '" << p << "': " << e; } } @@ -489,15 +554,13 @@ namespace build2 } catch (const invalid_path& e) { - cerr << "rm: invalid path '" << e.path << "'" << endl; + error (false) << "invalid path '" << e.path << "'"; } - // Can be thrown while closing in, out or writing to cerr (that's why - // need to check its state before writing). + // Can be thrown while closing in, out or writing to cerr. // catch (const io_error& e) { - if (cerr.good ()) - cerr << "rm: " << e << endl; + error (false) << e; } catch (const failed&) { @@ -533,6 +596,11 @@ namespace build2 uint8_t r (1); ofdstream cerr (move (err)); + auto error = [&cerr] (bool fail = true) + { + return error_record (cerr, fail, "rmdir"); + }; + try { in.close (); @@ -559,10 +627,7 @@ namespace build2 // Remove directories. 
// if (i == args.end () && !force) - { - cerr << "rmdir: missing directory" << endl; - throw failed (); - } + error () << "missing directory"; const dir_path& wd (sp.wd_path); const dir_path& rwd (sp.root->wd_path); @@ -572,18 +637,12 @@ namespace build2 dir_path p (path_cast (parse_path (*i, wd))); if (wd.sub (p)) - { - cerr << "rmdir: '" << p << "' contains test working directory '" - << wd << "'" << endl; - throw failed (); - } + error () << "'" << p << "' contains test working directory '" + << wd << "'"; if (!p.sub (rwd) && !force) - { - cerr << "rmdir: '" << p << "' is out of working directory '" - << rwd << "'" << endl; - throw failed (); - } + error () << "'" << p << "' is out of working directory '" + << rwd << "'"; try { @@ -596,8 +655,7 @@ namespace build2 } catch (const system_error& e) { - cerr << "rmdir: unable to remove '" << p << "': " << e << endl; - throw failed (); + error () << "unable to remove '" << p << "': " << e; } } @@ -605,15 +663,259 @@ namespace build2 } catch (const invalid_path& e) { - cerr << "rmdir: invalid path '" << e.path << "'" << endl; + error (false) << "invalid path '" << e.path << "'"; + } + // Can be thrown while closing in, out or writing to cerr. + // + catch (const io_error& e) + { + error (false) << e; + } + catch (const failed&) + { + // Diagnostics has already been issued. + } + + cerr.close (); + return r; + } + catch (const std::exception&) + { + return 1; + } + + // sed [-n] -e