diff options
author | Boris Kolpackov <boris@codesynthesis.com> | 2020-04-28 08:48:53 +0200 |
---|---|---|
committer | Boris Kolpackov <boris@codesynthesis.com> | 2020-05-27 15:47:28 +0200 |
commit | b808c255b6a9ddba085bf5646e7d20ec344f2e2d (patch) | |
tree | 32730291f7e6de8ef0a227905520dd66fb4ec0f3 /libbuild2/script | |
parent | 3552356a87402727e663131994fa87f48b3cd4fb (diff) |
Initial support for ad hoc recipes (still work in progress)
Diffstat (limited to 'libbuild2/script')
-rw-r--r-- | libbuild2/script/builtin-options.cxx | 661 | ||||
-rw-r--r-- | libbuild2/script/builtin-options.hxx | 339 | ||||
-rw-r--r-- | libbuild2/script/builtin-options.ixx | 182 | ||||
-rw-r--r-- | libbuild2/script/builtin.cli | 21 | ||||
-rw-r--r-- | libbuild2/script/lexer+command-expansion.test.testscript | 321 | ||||
-rw-r--r-- | libbuild2/script/lexer.cxx | 431 | ||||
-rw-r--r-- | libbuild2/script/lexer.hxx | 139 | ||||
-rw-r--r-- | libbuild2/script/lexer.test.cxx | 76 | ||||
-rw-r--r-- | libbuild2/script/parser.cxx | 2015 | ||||
-rw-r--r-- | libbuild2/script/parser.hxx | 189 | ||||
-rw-r--r-- | libbuild2/script/regex.cxx | 436 | ||||
-rw-r--r-- | libbuild2/script/regex.hxx | 678 | ||||
-rw-r--r-- | libbuild2/script/regex.ixx | 31 | ||||
-rw-r--r-- | libbuild2/script/regex.test.cxx | 303 | ||||
-rw-r--r-- | libbuild2/script/run.cxx | 2020 | ||||
-rw-r--r-- | libbuild2/script/run.hxx | 75 | ||||
-rw-r--r-- | libbuild2/script/script.cxx | 659 | ||||
-rw-r--r-- | libbuild2/script/script.hxx | 471 | ||||
-rw-r--r-- | libbuild2/script/script.ixx | 56 | ||||
-rw-r--r-- | libbuild2/script/token.cxx | 53 | ||||
-rw-r--r-- | libbuild2/script/token.hxx | 66 |
21 files changed, 9222 insertions, 0 deletions
diff --git a/libbuild2/script/builtin-options.cxx b/libbuild2/script/builtin-options.cxx new file mode 100644 index 0000000..2002764 --- /dev/null +++ b/libbuild2/script/builtin-options.cxx @@ -0,0 +1,661 @@ +// -*- C++ -*- +// +// This file was generated by CLI, a command line interface +// compiler for C++. +// + +// Begin prologue. +// +// +// End prologue. + +#include <libbuild2/script/builtin-options.hxx> + +#include <map> +#include <set> +#include <string> +#include <vector> +#include <ostream> +#include <sstream> + +namespace build2 +{ + namespace script + { + namespace cli + { + // unknown_option + // + unknown_option:: + ~unknown_option () throw () + { + } + + void unknown_option:: + print (::std::ostream& os) const + { + os << "unknown option '" << option ().c_str () << "'"; + } + + const char* unknown_option:: + what () const throw () + { + return "unknown option"; + } + + // unknown_argument + // + unknown_argument:: + ~unknown_argument () throw () + { + } + + void unknown_argument:: + print (::std::ostream& os) const + { + os << "unknown argument '" << argument ().c_str () << "'"; + } + + const char* unknown_argument:: + what () const throw () + { + return "unknown argument"; + } + + // missing_value + // + missing_value:: + ~missing_value () throw () + { + } + + void missing_value:: + print (::std::ostream& os) const + { + os << "missing value for option '" << option ().c_str () << "'"; + } + + const char* missing_value:: + what () const throw () + { + return "missing option value"; + } + + // invalid_value + // + invalid_value:: + ~invalid_value () throw () + { + } + + void invalid_value:: + print (::std::ostream& os) const + { + os << "invalid value '" << value ().c_str () << "' for option '" + << option ().c_str () << "'"; + + if (!message ().empty ()) + os << ": " << message ().c_str (); + } + + const char* invalid_value:: + what () const throw () + { + return "invalid option value"; + } + + // eos_reached + // + void eos_reached:: + print 
(::std::ostream& os) const + { + os << what (); + } + + const char* eos_reached:: + what () const throw () + { + return "end of argument stream reached"; + } + + // scanner + // + scanner:: + ~scanner () + { + } + + // argv_scanner + // + bool argv_scanner:: + more () + { + return i_ < argc_; + } + + const char* argv_scanner:: + peek () + { + if (i_ < argc_) + return argv_[i_]; + else + throw eos_reached (); + } + + const char* argv_scanner:: + next () + { + if (i_ < argc_) + { + const char* r (argv_[i_]); + + if (erase_) + { + for (int i (i_ + 1); i < argc_; ++i) + argv_[i - 1] = argv_[i]; + + --argc_; + argv_[argc_] = 0; + } + else + ++i_; + + return r; + } + else + throw eos_reached (); + } + + void argv_scanner:: + skip () + { + if (i_ < argc_) + ++i_; + else + throw eos_reached (); + } + + // vector_scanner + // + bool vector_scanner:: + more () + { + return i_ < v_.size (); + } + + const char* vector_scanner:: + peek () + { + if (i_ < v_.size ()) + return v_[i_].c_str (); + else + throw eos_reached (); + } + + const char* vector_scanner:: + next () + { + if (i_ < v_.size ()) + return v_[i_++].c_str (); + else + throw eos_reached (); + } + + void vector_scanner:: + skip () + { + if (i_ < v_.size ()) + ++i_; + else + throw eos_reached (); + } + + template <typename X> + struct parser + { + static void + parse (X& x, bool& xs, scanner& s) + { + using namespace std; + + const char* o (s.next ()); + if (s.more ()) + { + string v (s.next ()); + istringstream is (v); + if (!(is >> x && is.peek () == istringstream::traits_type::eof ())) + throw invalid_value (o, v); + } + else + throw missing_value (o); + + xs = true; + } + }; + + template <> + struct parser<bool> + { + static void + parse (bool& x, scanner& s) + { + s.next (); + x = true; + } + }; + + template <> + struct parser<std::string> + { + static void + parse (std::string& x, bool& xs, scanner& s) + { + const char* o (s.next ()); + + if (s.more ()) + x = s.next (); + else + throw missing_value (o); + + xs = 
true; + } + }; + + template <typename X> + struct parser<std::vector<X> > + { + static void + parse (std::vector<X>& c, bool& xs, scanner& s) + { + X x; + bool dummy; + parser<X>::parse (x, dummy, s); + c.push_back (x); + xs = true; + } + }; + + template <typename X> + struct parser<std::set<X> > + { + static void + parse (std::set<X>& c, bool& xs, scanner& s) + { + X x; + bool dummy; + parser<X>::parse (x, dummy, s); + c.insert (x); + xs = true; + } + }; + + template <typename K, typename V> + struct parser<std::map<K, V> > + { + static void + parse (std::map<K, V>& m, bool& xs, scanner& s) + { + const char* o (s.next ()); + + if (s.more ()) + { + std::string ov (s.next ()); + std::string::size_type p = ov.find ('='); + + K k = K (); + V v = V (); + std::string kstr (ov, 0, p); + std::string vstr (ov, (p != std::string::npos ? p + 1 : ov.size ())); + + int ac (2); + char* av[] = + { + const_cast<char*> (o), 0 + }; + + bool dummy; + if (!kstr.empty ()) + { + av[1] = const_cast<char*> (kstr.c_str ()); + argv_scanner s (0, ac, av); + parser<K>::parse (k, dummy, s); + } + + if (!vstr.empty ()) + { + av[1] = const_cast<char*> (vstr.c_str ()); + argv_scanner s (0, ac, av); + parser<V>::parse (v, dummy, s); + } + + m[k] = v; + } + else + throw missing_value (o); + + xs = true; + } + }; + + template <typename X, typename T, T X::*M> + void + thunk (X& x, scanner& s) + { + parser<T>::parse (x.*M, s); + } + + template <typename X, typename T, T X::*M, bool X::*S> + void + thunk (X& x, scanner& s) + { + parser<T>::parse (x.*M, x.*S, s); + } + } + } +} + +#include <map> +#include <cstring> + +namespace build2 +{ + namespace script + { + // set_options + // + + set_options:: + set_options () + : exact_ (), + newline_ (), + whitespace_ () + { + } + + set_options:: + set_options (int& argc, + char** argv, + bool erase, + ::build2::script::cli::unknown_mode opt, + ::build2::script::cli::unknown_mode arg) + : exact_ (), + newline_ (), + whitespace_ () + { + 
::build2::script::cli::argv_scanner s (argc, argv, erase); + _parse (s, opt, arg); + } + + set_options:: + set_options (int start, + int& argc, + char** argv, + bool erase, + ::build2::script::cli::unknown_mode opt, + ::build2::script::cli::unknown_mode arg) + : exact_ (), + newline_ (), + whitespace_ () + { + ::build2::script::cli::argv_scanner s (start, argc, argv, erase); + _parse (s, opt, arg); + } + + set_options:: + set_options (int& argc, + char** argv, + int& end, + bool erase, + ::build2::script::cli::unknown_mode opt, + ::build2::script::cli::unknown_mode arg) + : exact_ (), + newline_ (), + whitespace_ () + { + ::build2::script::cli::argv_scanner s (argc, argv, erase); + _parse (s, opt, arg); + end = s.end (); + } + + set_options:: + set_options (int start, + int& argc, + char** argv, + int& end, + bool erase, + ::build2::script::cli::unknown_mode opt, + ::build2::script::cli::unknown_mode arg) + : exact_ (), + newline_ (), + whitespace_ () + { + ::build2::script::cli::argv_scanner s (start, argc, argv, erase); + _parse (s, opt, arg); + end = s.end (); + } + + set_options:: + set_options (::build2::script::cli::scanner& s, + ::build2::script::cli::unknown_mode opt, + ::build2::script::cli::unknown_mode arg) + : exact_ (), + newline_ (), + whitespace_ () + { + _parse (s, opt, arg); + } + + typedef + std::map<std::string, void (*) (set_options&, ::build2::script::cli::scanner&)> + _cli_set_options_map; + + static _cli_set_options_map _cli_set_options_map_; + + struct _cli_set_options_map_init + { + _cli_set_options_map_init () + { + _cli_set_options_map_["--exact"] = + &::build2::script::cli::thunk< set_options, bool, &set_options::exact_ >; + _cli_set_options_map_["-e"] = + &::build2::script::cli::thunk< set_options, bool, &set_options::exact_ >; + _cli_set_options_map_["--newline"] = + &::build2::script::cli::thunk< set_options, bool, &set_options::newline_ >; + _cli_set_options_map_["-n"] = + &::build2::script::cli::thunk< set_options, bool, 
&set_options::newline_ >; + _cli_set_options_map_["--whitespace"] = + &::build2::script::cli::thunk< set_options, bool, &set_options::whitespace_ >; + _cli_set_options_map_["-w"] = + &::build2::script::cli::thunk< set_options, bool, &set_options::whitespace_ >; + } + }; + + static _cli_set_options_map_init _cli_set_options_map_init_; + + bool set_options:: + _parse (const char* o, ::build2::script::cli::scanner& s) + { + _cli_set_options_map::const_iterator i (_cli_set_options_map_.find (o)); + + if (i != _cli_set_options_map_.end ()) + { + (*(i->second)) (*this, s); + return true; + } + + return false; + } + + bool set_options:: + _parse (::build2::script::cli::scanner& s, + ::build2::script::cli::unknown_mode opt_mode, + ::build2::script::cli::unknown_mode arg_mode) + { + // Can't skip combined flags (--no-combined-flags). + // + assert (opt_mode != ::build2::script::cli::unknown_mode::skip); + + bool r = false; + bool opt = true; + + while (s.more ()) + { + const char* o = s.peek (); + + if (std::strcmp (o, "--") == 0) + { + opt = false; + s.skip (); + r = true; + continue; + } + + if (opt) + { + if (_parse (o, s)) + { + r = true; + continue; + } + + if (std::strncmp (o, "-", 1) == 0 && o[1] != '\0') + { + // Handle combined option values. + // + std::string co; + if (const char* v = std::strchr (o, '=')) + { + co.assign (o, 0, v - o); + ++v; + + int ac (2); + char* av[] = + { + const_cast<char*> (co.c_str ()), + const_cast<char*> (v) + }; + + ::build2::script::cli::argv_scanner ns (0, ac, av); + + if (_parse (co.c_str (), ns)) + { + // Parsed the option but not its value? + // + if (ns.end () != 2) + throw ::build2::script::cli::invalid_value (co, v); + + s.next (); + r = true; + continue; + } + else + { + // Set the unknown option and fall through. + // + o = co.c_str (); + } + } + + // Handle combined flags. 
+ // + char cf[3]; + { + const char* p = o + 1; + for (; *p != '\0'; ++p) + { + if (!((*p >= 'a' && *p <= 'z') || + (*p >= 'A' && *p <= 'Z') || + (*p >= '0' && *p <= '9'))) + break; + } + + if (*p == '\0') + { + for (p = o + 1; *p != '\0'; ++p) + { + std::strcpy (cf, "-"); + cf[1] = *p; + cf[2] = '\0'; + + int ac (1); + char* av[] = + { + cf + }; + + ::build2::script::cli::argv_scanner ns (0, ac, av); + + if (!_parse (cf, ns)) + break; + } + + if (*p == '\0') + { + // All handled. + // + s.next (); + r = true; + continue; + } + else + { + // Set the unknown option and fall through. + // + o = cf; + } + } + } + + switch (opt_mode) + { + case ::build2::script::cli::unknown_mode::skip: + { + s.skip (); + r = true; + continue; + } + case ::build2::script::cli::unknown_mode::stop: + { + break; + } + case ::build2::script::cli::unknown_mode::fail: + { + throw ::build2::script::cli::unknown_option (o); + } + } + + break; + } + } + + switch (arg_mode) + { + case ::build2::script::cli::unknown_mode::skip: + { + s.skip (); + r = true; + continue; + } + case ::build2::script::cli::unknown_mode::stop: + { + break; + } + case ::build2::script::cli::unknown_mode::fail: + { + throw ::build2::script::cli::unknown_argument (o); + } + } + + break; + } + + return r; + } + } +} + +// Begin epilogue. +// +// +// End epilogue. + diff --git a/libbuild2/script/builtin-options.hxx b/libbuild2/script/builtin-options.hxx new file mode 100644 index 0000000..5a3f153 --- /dev/null +++ b/libbuild2/script/builtin-options.hxx @@ -0,0 +1,339 @@ +// -*- C++ -*- +// +// This file was generated by CLI, a command line interface +// compiler for C++. +// + +#ifndef LIBBUILD2_SCRIPT_BUILTIN_OPTIONS_HXX +#define LIBBUILD2_SCRIPT_BUILTIN_OPTIONS_HXX + +// Begin prologue. +// +// +// End prologue. 
+ +#include <vector> +#include <iosfwd> +#include <string> +#include <cstddef> +#include <exception> + +#ifndef CLI_POTENTIALLY_UNUSED +# if defined(_MSC_VER) || defined(__xlC__) +# define CLI_POTENTIALLY_UNUSED(x) (void*)&x +# else +# define CLI_POTENTIALLY_UNUSED(x) (void)x +# endif +#endif + +namespace build2 +{ + namespace script + { + namespace cli + { + class unknown_mode + { + public: + enum value + { + skip, + stop, + fail + }; + + unknown_mode (value); + + operator value () const + { + return v_; + } + + private: + value v_; + }; + + // Exceptions. + // + + class exception: public std::exception + { + public: + virtual void + print (::std::ostream&) const = 0; + }; + + ::std::ostream& + operator<< (::std::ostream&, const exception&); + + class unknown_option: public exception + { + public: + virtual + ~unknown_option () throw (); + + unknown_option (const std::string& option); + + const std::string& + option () const; + + virtual void + print (::std::ostream&) const; + + virtual const char* + what () const throw (); + + private: + std::string option_; + }; + + class unknown_argument: public exception + { + public: + virtual + ~unknown_argument () throw (); + + unknown_argument (const std::string& argument); + + const std::string& + argument () const; + + virtual void + print (::std::ostream&) const; + + virtual const char* + what () const throw (); + + private: + std::string argument_; + }; + + class missing_value: public exception + { + public: + virtual + ~missing_value () throw (); + + missing_value (const std::string& option); + + const std::string& + option () const; + + virtual void + print (::std::ostream&) const; + + virtual const char* + what () const throw (); + + private: + std::string option_; + }; + + class invalid_value: public exception + { + public: + virtual + ~invalid_value () throw (); + + invalid_value (const std::string& option, + const std::string& value, + const std::string& message = std::string ()); + + const std::string& + option 
() const; + + const std::string& + value () const; + + const std::string& + message () const; + + virtual void + print (::std::ostream&) const; + + virtual const char* + what () const throw (); + + private: + std::string option_; + std::string value_; + std::string message_; + }; + + class eos_reached: public exception + { + public: + virtual void + print (::std::ostream&) const; + + virtual const char* + what () const throw (); + }; + + // Command line argument scanner interface. + // + // The values returned by next() are guaranteed to be valid + // for the two previous arguments up until a call to a third + // peek() or next(). + // + class scanner + { + public: + virtual + ~scanner (); + + virtual bool + more () = 0; + + virtual const char* + peek () = 0; + + virtual const char* + next () = 0; + + virtual void + skip () = 0; + }; + + class argv_scanner: public scanner + { + public: + argv_scanner (int& argc, char** argv, bool erase = false); + argv_scanner (int start, int& argc, char** argv, bool erase = false); + + int + end () const; + + virtual bool + more (); + + virtual const char* + peek (); + + virtual const char* + next (); + + virtual void + skip (); + + private: + int i_; + int& argc_; + char** argv_; + bool erase_; + }; + + class vector_scanner: public scanner + { + public: + vector_scanner (const std::vector<std::string>&, std::size_t start = 0); + + std::size_t + end () const; + + void + reset (std::size_t start = 0); + + virtual bool + more (); + + virtual const char* + peek (); + + virtual const char* + next (); + + virtual void + skip (); + + private: + const std::vector<std::string>& v_; + std::size_t i_; + }; + + template <typename X> + struct parser; + } + } +} + +namespace build2 +{ + namespace script + { + class set_options + { + public: + set_options (); + + set_options (int& argc, + char** argv, + bool erase = false, + ::build2::script::cli::unknown_mode option = ::build2::script::cli::unknown_mode::fail, + 
::build2::script::cli::unknown_mode argument = ::build2::script::cli::unknown_mode::stop); + + set_options (int start, + int& argc, + char** argv, + bool erase = false, + ::build2::script::cli::unknown_mode option = ::build2::script::cli::unknown_mode::fail, + ::build2::script::cli::unknown_mode argument = ::build2::script::cli::unknown_mode::stop); + + set_options (int& argc, + char** argv, + int& end, + bool erase = false, + ::build2::script::cli::unknown_mode option = ::build2::script::cli::unknown_mode::fail, + ::build2::script::cli::unknown_mode argument = ::build2::script::cli::unknown_mode::stop); + + set_options (int start, + int& argc, + char** argv, + int& end, + bool erase = false, + ::build2::script::cli::unknown_mode option = ::build2::script::cli::unknown_mode::fail, + ::build2::script::cli::unknown_mode argument = ::build2::script::cli::unknown_mode::stop); + + set_options (::build2::script::cli::scanner&, + ::build2::script::cli::unknown_mode option = ::build2::script::cli::unknown_mode::fail, + ::build2::script::cli::unknown_mode argument = ::build2::script::cli::unknown_mode::stop); + + // Option accessors. + // + const bool& + exact () const; + + const bool& + newline () const; + + const bool& + whitespace () const; + + // Implementation details. + // + protected: + bool + _parse (const char*, ::build2::script::cli::scanner&); + + private: + bool + _parse (::build2::script::cli::scanner&, + ::build2::script::cli::unknown_mode option, + ::build2::script::cli::unknown_mode argument); + + public: + bool exact_; + bool newline_; + bool whitespace_; + }; + } +} + +#include <libbuild2/script/builtin-options.ixx> + +// Begin epilogue. +// +// +// End epilogue. 
+ +#endif // LIBBUILD2_SCRIPT_BUILTIN_OPTIONS_HXX diff --git a/libbuild2/script/builtin-options.ixx b/libbuild2/script/builtin-options.ixx new file mode 100644 index 0000000..dc59f98 --- /dev/null +++ b/libbuild2/script/builtin-options.ixx @@ -0,0 +1,182 @@ +// -*- C++ -*- +// +// This file was generated by CLI, a command line interface +// compiler for C++. +// + +// Begin prologue. +// +// +// End prologue. + +#include <cassert> + +namespace build2 +{ + namespace script + { + namespace cli + { + // unknown_mode + // + inline unknown_mode:: + unknown_mode (value v) + : v_ (v) + { + } + + // exception + // + inline ::std::ostream& + operator<< (::std::ostream& os, const exception& e) + { + e.print (os); + return os; + } + + // unknown_option + // + inline unknown_option:: + unknown_option (const std::string& option) + : option_ (option) + { + } + + inline const std::string& unknown_option:: + option () const + { + return option_; + } + + // unknown_argument + // + inline unknown_argument:: + unknown_argument (const std::string& argument) + : argument_ (argument) + { + } + + inline const std::string& unknown_argument:: + argument () const + { + return argument_; + } + + // missing_value + // + inline missing_value:: + missing_value (const std::string& option) + : option_ (option) + { + } + + inline const std::string& missing_value:: + option () const + { + return option_; + } + + // invalid_value + // + inline invalid_value:: + invalid_value (const std::string& option, + const std::string& value, + const std::string& message) + : option_ (option), + value_ (value), + message_ (message) + { + } + + inline const std::string& invalid_value:: + option () const + { + return option_; + } + + inline const std::string& invalid_value:: + value () const + { + return value_; + } + + inline const std::string& invalid_value:: + message () const + { + return message_; + } + + // argv_scanner + // + inline argv_scanner:: + argv_scanner (int& argc, char** argv, bool erase) + : i_ 
(1), argc_ (argc), argv_ (argv), erase_ (erase) + { + } + + inline argv_scanner:: + argv_scanner (int start, int& argc, char** argv, bool erase) + : i_ (start), argc_ (argc), argv_ (argv), erase_ (erase) + { + } + + inline int argv_scanner:: + end () const + { + return i_; + } + + // vector_scanner + // + inline vector_scanner:: + vector_scanner (const std::vector<std::string>& v, std::size_t i) + : v_ (v), i_ (i) + { + } + + inline std::size_t vector_scanner:: + end () const + { + return i_; + } + + inline void vector_scanner:: + reset (std::size_t i) + { + i_ = i; + } + } + } +} + +namespace build2 +{ + namespace script + { + // set_options + // + + inline const bool& set_options:: + exact () const + { + return this->exact_; + } + + inline const bool& set_options:: + newline () const + { + return this->newline_; + } + + inline const bool& set_options:: + whitespace () const + { + return this->whitespace_; + } + } +} + +// Begin epilogue. +// +// +// End epilogue. diff --git a/libbuild2/script/builtin.cli b/libbuild2/script/builtin.cli new file mode 100644 index 0000000..68db23e --- /dev/null +++ b/libbuild2/script/builtin.cli @@ -0,0 +1,21 @@ +// file : libbuild2/script/builtin.cli +// license : MIT; see accompanying LICENSE file + +// Note that options in this file are undocumented because we generate neither +// the usage printing code nor man pages. Instead, they are documented in the +// Testscript Language Manual's builtin descriptions. +// +namespace build2 +{ + namespace script + { + // Pseudo-builtin options. 
+ // + class set_options + { + bool --exact|-e; + bool --newline|-n; + bool --whitespace|-w; + }; + } +} diff --git a/libbuild2/script/lexer+command-expansion.test.testscript b/libbuild2/script/lexer+command-expansion.test.testscript new file mode 100644 index 0000000..f4d69d2 --- /dev/null +++ b/libbuild2/script/lexer+command-expansion.test.testscript @@ -0,0 +1,321 @@ +# file : libbuild2/script/lexer+command-expansion.test.testscript +# license : MIT; see accompanying LICENSE file + +test.arguments = command-expansion + +: pass-redirect +: +{ + : in + : + $* <:"0<|" >>EOO + '0' + <| + EOO + + : arg-in + : + $* <:"0 <|" >>EOO + '0 ' + <| + EOO + + : out + : + $* <:"1>|" >>EOO + '1' + >| + EOO + + : arg-out + : + $* <:"1 >|" >>EOO + '1 ' + >| + EOO +} + +: null-redirect +: +{ + : in + : + $* <:"0<-" >>EOO + '0' + <- + EOO + + : arg-in + : + $* <:"0 <-" >>EOO + '0 ' + <- + EOO + + : out + : + $* <:"1>-" >>EOO + '1' + >- + EOO + + : arg-out + : + $* <:"1 >-" >>EOO + '1 ' + >- + EOO +} + +: trace-redirect +: +{ + : out + : + $* <:"1>!" >>EOO + '1' + >! + EOO + + : arg-out + : + $* <:"1 >!" >>EOO + '1 ' + >! + EOO +} + +: merge-redirect +: +{ + : out + : + $* <:"1>&2" >>EOO + '1' + >& + '2' + EOO + + : arg-out + : + $* <:"1 >&2" >>EOO + '1 ' + >& + '2' + EOO +} + +: str-redirect +: +{ + : in + : + { + : newline + : + $* <:"0<<<=a b" >>EOO + '0' + <<<= + 'a b' + EOO + + : no-newline + : + $* <:"0<<<=:a b" >>EOO + '0' + <<<=: + 'a b' + EOO + } + + : in-alias + : + { + : newline + : + $* <:"0<<<a b" >>EOO + '0' + <<< + 'a b' + EOO + + : no-newline + : + $* <:"0<<<:a b" >>EOO + '0' + <<<: + 'a b' + EOO + } + + : out + : + { + : newline + : + $* <:"1>>>?a b" >>EOO + '1' + >>>? 
+ 'a b' + EOO + + : no-newline + : + $* <:"1>>>?:a b" >>EOO + '1' + >>>?: + 'a b' + EOO + } +} + +: doc-redirect +: +{ + : in + : + { + : newline + : + $* <:"0<<=E O I" >>EOO + '0' + <<= + 'E O I' + EOO + + : no-newline + : + $* <:"0<<=:E O I" >>EOO + '0' + <<=: + 'E O I' + EOO + } + + : in-alias + : + { + : newline + : + $* <:"0<<E O I" >>EOO + '0' + << + 'E O I' + EOO + + : no-newline + : + $* <:"0<<:E O I" >>EOO + '0' + <<: + 'E O I' + EOO + } + + : out + : + { + : newline + : + $* <:"1>>?E O O" >>EOO + '1' + >>? + 'E O O' + EOO + + : no-newline + : + $* <:"1>>?:E O O" >>EOO + '1' + >>?: + 'E O O' + EOO + } +} + +: file-redirect +: +{ + : in + : + $* <:"0<=a b" >>EOO + '0' + <= + 'a b' + EOO + + : in-alias + : + $* <:"0<a b" >>EOO + '0' + < + 'a b' + EOO + + : out + : + $* <:"1>=a b" >>EOO + '1' + >= + 'a b' + EOO + + : out-alias + : + $* <:"1>a b" >>EOO + '1' + > + 'a b' + EOO + + : out-app + : + $* <:"1>+a b" >>EOO + '1' + >+ + 'a b' + EOO + + : out-app-alias + : + $* <:"1>>a b" >>EOO + '1' + >> + 'a b' + EOO +} + +: no-out-alias +: +$* <:"1>>>a b" >>EOO +'1' +>> +> +'a b' +EOO + + +: cleanup +: +{ + : always + : + $* <:"&file" >>EOO + & + 'file' + EOO + + : maybe + : + $* <:"&?file" >>EOO + &? + 'file' + EOO + + : never + : + $* <:"&!file" >>EOO + &! 
+ 'file' + EOO +} diff --git a/libbuild2/script/lexer.cxx b/libbuild2/script/lexer.cxx new file mode 100644 index 0000000..d78e999 --- /dev/null +++ b/libbuild2/script/lexer.cxx @@ -0,0 +1,431 @@ +// file : libbuild2/script/lexer.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#include <libbuild2/script/lexer.hxx> + +#include <cstring> // strchr() + +using namespace std; + +namespace build2 +{ + namespace script + { + using type = token_type; + + void lexer:: + mode (base_mode m, char ps, optional<const char*> esc, uintptr_t data) + { + bool a (false); // attributes + + const char* s1 (nullptr); + const char* s2 (nullptr); + + bool s (true); // space + bool n (true); // newline + bool q (true); // quotes + + if (!esc) + { + assert (!state_.empty ()); + esc = state_.top ().escapes; + } + + switch (m) + { + case lexer_mode::command_expansion: + { + // Note that whitespaces are not word separators in this mode. + // + s1 = "|&<>"; + s2 = " "; + s = false; + break; + } + case lexer_mode::here_line_single: + { + // This one is like a single-quoted string except it treats + // newlines as a separator. We also treat quotes as literals. + // + // Note that it might be tempting to enable line continuation + // escapes. However, we will then have to also enable escaping of + // the backslash, which makes it a lot less tempting. + // + s1 = "\n"; + s2 = " "; + esc = ""; // Disable escape sequences. + s = false; + q = false; + break; + } + case lexer_mode::here_line_double: + { + // This one is like a double-quoted string except it treats + // newlines as a separator. We also treat quotes as literals. + // + s1 = "$(\n"; + s2 = " "; + s = false; + q = false; + break; + } + default: + { + // Make sure pair separators are only enabled where we expect + // them. + // + // @@ Should we disable pair separators in the eval mode? 
+ // + assert (ps == '\0' || + m == lexer_mode::eval || + m == lexer_mode::attribute_value); + + base_lexer::mode (m, ps, esc); + return; + } + } + + assert (ps == '\0'); + state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2}); + } + + token lexer:: + next () + { + token r; + + switch (state_.top ().mode) + { + case lexer_mode::command_expansion: + case lexer_mode::here_line_single: + case lexer_mode::here_line_double: + r = next_line (); + break; + default: + r = base_lexer::next (); + break; + } + + if (r.qtype != quote_type::unquoted) + ++quoted_; + + return r; + } + + token lexer:: + next_line () + { + bool sep (skip_spaces ().first); + + xchar c (get ()); + uint64_t ln (c.line), cn (c.column); + + const state& st (state_.top ()); + lexer_mode m (st.mode); + + auto make_token = [&sep, &m, ln, cn] (type t) + { + bool q (m == lexer_mode::here_line_double); + + return token (t, string (), sep, + (q ? quote_type::double_ : quote_type::unquoted), q, + ln, cn, + token_printer); + }; + + if (eos (c)) + return make_token (type::eos); + + // NOTE: remember to update mode() if adding new special characters. + + if (m != lexer_mode::command_expansion) + { + switch (c) + { + case '\n': + { + sep = true; // Treat newline as always separated. + return make_token (type::newline); + } + } + } + + if (m != lexer_mode::here_line_single) + { + switch (c) + { + // Variable expansion, function call, and evaluation context. + // + case '$': return make_token (type::dollar); + case '(': return make_token (type::lparen); + } + } + + // Command operators. + // + if (m == lexer_mode::command_expansion) + { + if (optional<token> t = next_cmd_op (c, sep)) + return move (*t); + } + + // Otherwise it is a word. 
+ // + unget (c); + return word (st, sep); + } + + optional<token> lexer:: + next_cmd_op (const xchar& c, bool sep) + { + auto make_token = [&sep, &c] (type t, string v = string ()) + { + return token (t, move (v), sep, + quote_type::unquoted, false, + c.line, c.column, + token_printer); + }; + + auto make_token_with_modifiers = + [&make_token, this] (type t, + const char* mods, // To recognize. + const char* stop = nullptr) // To stop after. + { + string v; + if (mods != nullptr) + { + for (xchar p (peek ()); + (strchr (mods, p) != nullptr && // Modifier. + strchr (v.c_str (), p) == nullptr); // Not already seen. + p = peek ()) + { + get (); + v += p; + + if (stop != nullptr && strchr (stop, p) != nullptr) + break; + } + } + + return make_token (t, move (v)); + }; + + switch (c) + { + // |, || + // + case '|': + { + if (peek () == '|') + { + get (); + return make_token (type::log_or); + } + else + return make_token (type::pipe); + } + // &, && + // + case '&': + { + xchar p (peek ()); + + if (p == '&') + { + get (); + return make_token (type::log_and); + } + + // These modifiers are mutually exclusive so stop after seeing + // either one. + // + return make_token_with_modifiers (type::clean, "!?", "!?"); + } + // < + // + case '<': + { + optional<type> r; + xchar p (peek ()); + + if (p == '|' || p == '-' || p == '=' || p == '<') // <| <- <= << + { + xchar c (get ()); + + switch (p) + { + case '|': return make_token (type::in_pass); // <| + case '-': return make_token (type::in_null); // <- + case '=': return make_token (type::in_file); // <= + case '<': // << + { + p = peek (); + + if (p == '=' || p == '<') // <<= <<< + { + xchar c (get ()); + + switch (p) + { + case '=': + { + r = type::in_doc; // <<= + break; + } + case '<': + { + p = peek (); + + if (p == '=') + { + get (); + r = type::in_str; // <<<= + } + + if (!r && redirect_aliases.lll) + r = type::in_lll; // <<< + + // We can still end up with the << or < redirect alias, + // if any of them is present.
+ // + if (!r) + unget (c); + } + + break; + } + } + + if (!r && redirect_aliases.ll) + r = type::in_ll; // << + + // We can still end up with the < redirect alias, if it is + // present. + // + if (!r) + unget (c); + + break; + } + } + } + + if (!r && redirect_aliases.l) + r = type::in_l; // < + + if (!r) + return nullopt; + + // Handle modifiers. + // + const char* mods (nullptr); + + switch (redirect_aliases.resolve (*r)) + { + case type::in_str: + case type::in_doc: mods = ":/"; break; + } + + token t (make_token_with_modifiers (*r, mods)); + + return t; + } + // > + // + case '>': + { + optional<type> r; + xchar p (peek ()); + + if (p == '|' || p == '-' || p == '!' || p == '&' || // >| >- >! >& + p == '=' || p == '+' || p == '?' || p == '>') // >= >+ >? >> + { + xchar c (get ()); + + switch (p) + { + case '|': return make_token (type::out_pass); // >| + case '-': return make_token (type::out_null); // >- + case '!': return make_token (type::out_trace); // >! + case '&': return make_token (type::out_merge); // >& + case '=': return make_token (type::out_file_ovr); // >= + case '+': return make_token (type::out_file_app); // >+ + case '?': return make_token (type::out_file_cmp); // >? + case '>': // >> + { + p = peek (); + + if (p == '?' || p == '>') // >>? >>> + { + xchar c (get ()); + + switch (p) + { + case '?': + { + r = type::out_doc; // >>? + break; + } + case '>': + { + p = peek (); + + if (p == '?') + { + get (); + r = type::out_str; // >>>? + } + + if (!r && redirect_aliases.ggg) + r = type::out_ggg; // >>> + + // We can still end up with the >> or > redirect alias, + // if any of them is present. + // + if (!r) + unget (c); + } + + break; + } + } + + if (!r && redirect_aliases.gg) + r = type::out_gg; // >> + + // We can still end up with the > redirect alias, if it is + // present. + // + if (!r) + unget (c); + + break; + } + } + } + + if (!r && redirect_aliases.g) + r = type::out_g; // > + + if (!r) + return nullopt; + + // Handle modifiers.
+ // + const char* mods (nullptr); + const char* stop (nullptr); + + switch (redirect_aliases.resolve (*r)) + { + case type::out_str: + case type::out_doc: mods = ":/~"; stop = "~"; break; + } + + return make_token_with_modifiers (*r, mods, stop); + } + } + + return nullopt; + } + } +} diff --git a/libbuild2/script/lexer.hxx b/libbuild2/script/lexer.hxx new file mode 100644 index 0000000..dbfdfcc --- /dev/null +++ b/libbuild2/script/lexer.hxx @@ -0,0 +1,139 @@ +// file : libbuild2/script/lexer.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef LIBBUILD2_SCRIPT_LEXER_HXX +#define LIBBUILD2_SCRIPT_LEXER_HXX + +#include <libbuild2/types.hxx> +#include <libbuild2/utility.hxx> + +#include <libbuild2/lexer.hxx> + +#include <libbuild2/script/token.hxx> + +namespace build2 +{ + namespace script + { + struct lexer_mode: build2::lexer_mode + { + using base_type = build2::lexer_mode; + + enum + { + command_expansion = base_type::value_next, + here_line_single, + here_line_double, + + value_next + }; + + lexer_mode () = default; + lexer_mode (value_type v): base_type (v) {} + lexer_mode (base_type v): base_type (v) {} + }; + + // Actual redirects (as tokens) for the <, <<, <<<, and >, >>, >>> + // aliases. + // + struct redirect_aliases + { + optional<token_type> l; // < + optional<token_type> ll; // << + optional<token_type> lll; // <<< + optional<token_type> g; // > + optional<token_type> gg; // >> + optional<token_type> ggg; // >>> + + // If the token type is a redirect alias then return the token type it + // resolves to and the passed token type otherwise. Note that it's the + // caller's responsibility to make sure that the corresponding alias is + // present (normally by not recognizing absent aliases as tokens). 
+ // + token_type + resolve (token_type t) const noexcept + { + switch (t) + { + case token_type::in_l: assert (l); return *l; + case token_type::in_ll: assert (ll); return *ll; + case token_type::in_lll: assert (lll); return *lll; + case token_type::out_g: assert (g); return *g; + case token_type::out_gg: assert (gg); return *gg; + case token_type::out_ggg: assert (ggg); return *ggg; + } + + return t; + } + }; + + class lexer: public build2::lexer + { + public: + using base_lexer = build2::lexer; + using base_mode = build2::lexer_mode; + + using redirect_aliases_type = script::redirect_aliases; + + // Note that none of the name, redirect aliases, and escape arguments + // are copied. + // + lexer (istream& is, + const path_name& name, + lexer_mode m, + const redirect_aliases_type& ra, + const char* escapes = nullptr) + : base_lexer (is, name, 1 /* line */, + nullptr /* escapes */, + false /* set_mode */), + redirect_aliases (ra) + { + mode (m, '\0', escapes); + } + + virtual void + mode (base_mode, + char = '\0', + optional<const char*> = nullopt, + uintptr_t = 0) override; + + // Number of quoted (double or single) tokens since last reset. + // + size_t + quoted () const {return quoted_;} + + void + reset_quoted (size_t q) {quoted_ = q;} + + virtual token + next () override; + + public: + const redirect_aliases_type& redirect_aliases; + + protected: + lexer (istream& is, const path_name& name, uint64_t line, + const char* escapes, + bool set_mode, + const redirect_aliases_type& ra) + : base_lexer (is, name, line, escapes, set_mode), + redirect_aliases (ra) {} + + // Return the next token if it is a command operator (|, ||, &&, + // redirect, or cleanup) and nullopt otherwise. + // + optional<token> + next_cmd_op (const xchar&, // The token first character (last got char). + bool sep); // The token is separated. 
+ + private: + token + next_line (); + + protected: + size_t quoted_; + }; + } +} + +#endif // LIBBUILD2_SCRIPT_LEXER_HXX diff --git a/libbuild2/script/lexer.test.cxx b/libbuild2/script/lexer.test.cxx new file mode 100644 index 0000000..b8de241 --- /dev/null +++ b/libbuild2/script/lexer.test.cxx @@ -0,0 +1,76 @@ +// file : libbuild2/script/lexer.test.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#include <cassert> +#include <iostream> + +#include <libbuild2/types.hxx> +#include <libbuild2/utility.hxx> + +#include <libbuild2/script/token.hxx> +#include <libbuild2/script/lexer.hxx> + +using namespace std; + +namespace build2 +{ + namespace script + { + // Usage: argv[0] <lexer-mode> + // + int + main (int argc, char* argv[]) + { + lexer_mode m; + { + assert (argc == 2); + string s (argv[1]); + + if (s == "command-expansion") m = lexer_mode::command_expansion; + else if (s == "here-line-single") m = lexer_mode::here_line_single; + else if (s == "here-line-double") m = lexer_mode::here_line_double; + else assert (false); + } + + try + { + cin.exceptions (istream::failbit | istream::badbit); + + path_name in ("<stdin>"); + + using type = token_type; + + redirect_aliases ra {type (type::in_file), + type (type::in_doc), + type (type::in_str), + type (type::out_file_ovr), + type (type::out_file_app), + nullopt}; + + lexer l (cin, in, m, ra); + + // No use printing eos since we will either get it or loop forever. + // + for (token t (l.next ()); t.type != token_type::eos; t = l.next ()) + { + // Print each token on a separate line without quoting operators. 
+ // + t.printer (cout, t, print_mode::normal); + cout << endl; + } + } + catch (const failed&) + { + return 1; + } + + return 0; + } + } +} + +int +main (int argc, char* argv[]) +{ + return build2::script::main (argc, argv); +} diff --git a/libbuild2/script/parser.cxx b/libbuild2/script/parser.cxx new file mode 100644 index 0000000..aa60111 --- /dev/null +++ b/libbuild2/script/parser.cxx @@ -0,0 +1,2015 @@ +// file : libbuild2/script/parser.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#include <libbuild2/script/parser.hxx> + +#include <libbuild2/variable.hxx> +#include <libbuild2/script/run.hxx> // exit +#include <libbuild2/script/lexer.hxx> + +using namespace std; + +namespace build2 +{ + namespace script + { + using type = token_type; + + value parser:: + parse_variable_line (token& t, type& tt) + { + // enter: assignment + // leave: newline or unknown token + + next_with_attributes (t, tt); + + // Parse value attributes if any. Note that it's ok not to have + // anything after the attributes (e.g., foo=[null]). + // + attributes_push (t, tt, true); + + // @@ PAT: Should we expand patterns? Note that it will only be + // simple ones since we have disabled {}. Also, what would be the + // pattern base directory? + // + return tt != type::newline && start_names (tt) + ? parse_value (t, tt, + pattern_mode::ignore, + "variable value", + nullptr) + : value (names ()); + } + + // Parse the regular expression representation (non-empty string value + // framed with introducer characters and optionally followed by flag + // characters from the {di} set, for example '/foo/id') into + // components. Also return end-of-parsing position if requested, + // otherwise treat any unparsed characters left as an error. + // + struct regex_parts + { + string value; + char intro; + string flags; // Combination of characters from {di} set. + + // Create a special empty object. 
+ // + regex_parts (): intro ('\0') {} + + regex_parts (string v, char i, string f) + : value (move (v)), intro (i), flags (move (f)) {} + }; + + static regex_parts + parse_regex (const string& s, + const location& l, + const char* what, + size_t* end = nullptr) + { + if (s.empty ()) + fail (l) << "no introducer character in " << what; + + size_t p (s.find (s[0], 1)); // Find terminating introducer. + + if (p == string::npos) + fail (l) << "no closing introducer character in " << what; + + size_t rn (p - 1); // Regex length. + if (rn == 0) + fail (l) << what << " is empty"; + + // Find end-of-flags position. + // + size_t fp (++p); // Save flags starting position. + for (char c; (c = s[p]) == 'd' || c == 'i'; ++p) ; + + // If string end is not reached then report invalid flags, unless + // end-of-parsing position is requested (which means regex is just a + // prefix). + // + if (s[p] != '\0' && end == nullptr) + fail (l) << "junk at the end of " << what; + + if (end != nullptr) + *end = p; + + return regex_parts (string (s, 1, rn), s[0], string (s, fp, p - fp)); + } + + pair<command_expr, parser::here_docs> parser:: + parse_command_expr (token& t, type& tt, + const redirect_aliases& ra) + { + // enter: first token of the command line + // leave: <newline> or unknown token + + command_expr expr; + + // OR-ed to an implied false for the first term. + // + expr.push_back ({expr_operator::log_or, command_pipe ()}); + + command c; // Command being assembled. + + // Make sure the command makes sense. + // + auto check_command = [&c, this] (const location& l, bool last) + { + if (c.out && c.out->type == redirect_type::merge && + c.err && c.err->type == redirect_type::merge) + fail (l) << "stdout and stderr redirected to each other"; + + if (!last && c.out) + fail (l) << "stdout is both redirected and piped"; + }; + + // Check that the introducer character differs from '/' if the + // portable path modifier is specified. 
Must be called before + // parse_regex() (see below) to make sure its diagnostics is + // meaningful. + // + // Note that the portable path modifier assumes '/' to be a valid + // regex character and so makes it indistinguishable from the + // terminating introducer. + // + auto check_regex_mod = [this] (const string& mod, + const string& re, + const location& l, + const char* what) + { + // Handles empty regex properly. + // + if (mod.find ('/') != string::npos && re[0] == '/') + fail (l) << "portable path modifier and '/' introducer in " + << what; + }; + + // Pending positions where the next word should go. + // + enum class pending + { + none, + program, + in_string, + in_document, + in_file, + out_merge, + out_string, + out_str_regex, + out_document, + out_doc_regex, + out_file, + err_merge, + err_string, + err_str_regex, + err_document, + err_doc_regex, + err_file, + clean + }; + pending p (pending::program); + string mod; // Modifiers for pending in_* and out_* positions. + here_docs hd; // Expected here-documents. + + // Add the next word to either one of the pending positions or to + // program arguments by default. + // + auto add_word = [&c, &p, &mod, &check_regex_mod, this] ( + string&& w, const location& l) + { + auto add_merge = [&l, this] (optional<redirect>& r, + const string& w, + int fd) + { + assert (r); // Must already be present. + + try + { + size_t n; + if (stoi (w, &n) == fd && n == w.size ()) + { + r->fd = fd; + return; + } + } + catch (const exception&) {} // Fall through. + + fail (l) << (fd == 1 ? "stderr" : "stdout") << " merge redirect " + << "file descriptor must be " << fd; + }; + + auto add_here_str = [] (optional<redirect>& r, string&& w) + { + assert (r); // Must already be present. + + if (r->modifiers ().find (':') == string::npos) + w += '\n'; + r->str = move (w); + }; + + auto add_here_str_regex = [&l, &check_regex_mod] ( + optional<redirect>& r, int fd, string&& w) + { + assert (r); // Must already be present. 
+ + const char* what (nullptr); + switch (fd) + { + case 1: what = "stdout regex redirect"; break; + case 2: what = "stderr regex redirect"; break; + } + + check_regex_mod (r->modifiers (), w, l, what); + + regex_parts rp (parse_regex (w, l, what)); + + regex_lines& re (r->regex); + re.intro = rp.intro; + + re.lines.emplace_back ( + l.line, l.column, move (rp.value), move (rp.flags)); + + // Add final blank line unless suppressed. + // + // Note that the position is synthetic, but that's ok as we don't + // expect any diagnostics to refer this line. + // + if (r->modifiers ().find (':') == string::npos) + re.lines.emplace_back (l.line, l.column, string (), false); + }; + + auto parse_path = [&l, this] (string&& w, const char* what) -> path + { + try + { + path p (move (w)); + + if (!p.empty ()) + { + p.normalize (); + return p; + } + + fail (l) << "empty " << what << endf; + } + catch (const invalid_path& e) + { + fail (l) << "invalid " << what << " '" << e.path << "'" << endf; + } + }; + + auto add_file = [&parse_path] (optional<redirect>& r, + int fd, + string&& w) + { + assert (r); // Must already be present. 
+ + const char* what (nullptr); + switch (fd) + { + case 0: what = "stdin redirect path"; break; + case 1: what = "stdout redirect path"; break; + case 2: what = "stderr redirect path"; break; + } + + r->file.path = parse_path (move (w), what); + }; + + switch (p) + { + case pending::none: c.arguments.push_back (move (w)); break; + case pending::program: + c.program = parse_path (move (w), "program path"); + break; + + case pending::out_merge: add_merge (c.out, w, 2); break; + case pending::err_merge: add_merge (c.err, w, 1); break; + + case pending::in_string: add_here_str (c.in, move (w)); break; + case pending::out_string: add_here_str (c.out, move (w)); break; + case pending::err_string: add_here_str (c.err, move (w)); break; + + case pending::out_str_regex: + { + add_here_str_regex (c.out, 1, move (w)); + break; + } + case pending::err_str_regex: + { + add_here_str_regex (c.err, 2, move (w)); + break; + } + + // These are handled specially below. + // + case pending::in_document: + case pending::out_document: + case pending::err_document: + case pending::out_doc_regex: + case pending::err_doc_regex: assert (false); break; + + case pending::in_file: add_file (c.in, 0, move (w)); break; + case pending::out_file: add_file (c.out, 1, move (w)); break; + case pending::err_file: add_file (c.err, 2, move (w)); break; + + case pending::clean: + { + cleanup_type t; + switch (mod[0]) // Ok, if empty + { + case '!': t = cleanup_type::never; break; + case '?': t = cleanup_type::maybe; break; + default: t = cleanup_type::always; break; + } + + c.cleanups.push_back ( + {t, parse_path (move (w), "cleanup path")}); + break; + } + } + + p = pending::none; + mod.clear (); + }; + + // Make sure we don't have any pending positions to fill. 
+ // + auto check_pending = [&p, this] (const location& l) + { + const char* what (nullptr); + + switch (p) + { + case pending::none: break; + case pending::program: what = "program"; break; + case pending::in_string: what = "stdin here-string"; break; + case pending::in_document: what = "stdin here-document end"; break; + case pending::in_file: what = "stdin file"; break; + case pending::out_merge: what = "stdout file descriptor"; break; + case pending::out_string: what = "stdout here-string"; break; + case pending::out_document: what = "stdout here-document end"; break; + case pending::out_file: what = "stdout file"; break; + case pending::err_merge: what = "stderr file descriptor"; break; + case pending::err_string: what = "stderr here-string"; break; + case pending::err_document: what = "stderr here-document end"; break; + case pending::err_file: what = "stderr file"; break; + case pending::clean: what = "cleanup path"; break; + + case pending::out_str_regex: + { + what = "stdout here-string regex"; + break; + } + case pending::err_str_regex: + { + what = "stderr here-string regex"; + break; + } + case pending::out_doc_regex: + { + what = "stdout here-document regex end"; + break; + } + case pending::err_doc_regex: + { + what = "stderr here-document regex end"; + break; + } + } + + if (what != nullptr) + fail (l) << "missing " << what; + }; + + // Parse the redirect operator. + // + // If the token type is the redirect alias then tt must contain the type + // the alias resolves to and the token type otherwise. Note that this + // argument defines the redirect semantics. Also note that the token is + // saved into the redirect to keep the modifiers and the original + // representation. + // + auto parse_redirect = [&c, &expr, &p, &mod, &hd, this] + (token&& t, type tt, const location& l) + { + // The redirect alias token type must be resolved. 
+ // + assert (tt != type::in_l && + tt != type::in_ll && + tt != type::in_lll && + tt != type::out_g && + tt != type::out_gg && + tt != type::out_ggg); + + // Our semantics is the last redirect seen takes effect. + // + assert (p == pending::none && mod.empty ()); + + // See if we have the file descriptor. + // + unsigned long fd (3); + if (!t.separated) + { + if (c.arguments.empty ()) + fail (l) << "missing redirect file descriptor"; + + const string& s (c.arguments.back ()); + + try + { + size_t n; + fd = stoul (s, &n); + + if (n != s.size () || fd > 2) + throw invalid_argument (string ()); + } + catch (const exception&) + { + fail (l) << "invalid redirect file descriptor '" << s << "'"; + } + + c.arguments.pop_back (); + } + + // Validate/set default file descriptor. + // + switch (tt) + { + case type::in_pass: + case type::in_null: + case type::in_str: + case type::in_doc: + case type::in_file: + { + if ((fd = fd == 3 ? 0 : fd) != 0) + fail (l) << "invalid in redirect file descriptor " << fd; + + if (!expr.back ().pipe.empty ()) + fail (l) << "stdin is both piped and redirected"; + + break; + } + case type::out_pass: + case type::out_null: + case type::out_trace: + case type::out_merge: + case type::out_str: + case type::out_doc: + case type::out_file_cmp: + case type::out_file_ovr: + case type::out_file_app: + { + if ((fd = fd == 3 ? 1 : fd) == 0) + fail (l) << "invalid out redirect file descriptor " << fd; + + break; + } + } + + // Don't move as we will save the token into the redirect object. + // + mod = t.value; + + // Handle the none redirect (no data allowed) in the switch construct + // if/when the respective syntax is invented. 
+ // + redirect_type rt (redirect_type::none); + switch (tt) + { + case type::in_pass: + case type::out_pass: rt = redirect_type::pass; break; + + case type::in_null: + case type::out_null: rt = redirect_type::null; break; + + case type::out_trace: rt = redirect_type::trace; break; + + case type::out_merge: rt = redirect_type::merge; break; + + case type::in_str: + case type::out_str: + { + bool re (mod.find ('~') != string::npos); + assert (tt == type::out_str || !re); + + rt = re + ? redirect_type::here_str_regex + : redirect_type::here_str_literal; + + break; + } + + case type::in_doc: + case type::out_doc: + { + bool re (mod.find ('~') != string::npos); + assert (tt == type::out_doc || !re); + + rt = re + ? redirect_type::here_doc_regex + : redirect_type::here_doc_literal; + + break; + } + + case type::in_file: + case type::out_file_cmp: + case type::out_file_ovr: + case type::out_file_app: rt = redirect_type::file; break; + } + + optional<redirect>& r (fd == 0 ? c.in : + fd == 1 ? c.out : + c.err); + + optional<redirect_type> overriden; + + if (r) + overriden = r->type; + + r = redirect (rt); + + // Don't move as still may be used for pending here-document end + // marker processing. + // + r->token = move (t); + + switch (rt) + { + case redirect_type::none: + // Remove the assertion if/when the none redirect syntax is + // invented. + // + assert (false); + // Fall through. 
+ case redirect_type::pass: + case redirect_type::null: + case redirect_type::trace: + break; + case redirect_type::merge: + switch (fd) + { + case 0: assert (false); break; + case 1: p = pending::out_merge; break; + case 2: p = pending::err_merge; break; + } + break; + case redirect_type::here_str_literal: + switch (fd) + { + case 0: p = pending::in_string; break; + case 1: p = pending::out_string; break; + case 2: p = pending::err_string; break; + } + break; + case redirect_type::here_str_regex: + switch (fd) + { + case 0: assert (false); break; + case 1: p = pending::out_str_regex; break; + case 2: p = pending::err_str_regex; break; + } + break; + case redirect_type::here_doc_literal: + switch (fd) + { + case 0: p = pending::in_document; break; + case 1: p = pending::out_document; break; + case 2: p = pending::err_document; break; + } + break; + case redirect_type::here_doc_regex: + switch (fd) + { + case 0: assert (false); break; + case 1: p = pending::out_doc_regex; break; + case 2: p = pending::err_doc_regex; break; + } + break; + case redirect_type::file: + switch (fd) + { + case 0: p = pending::in_file; break; + case 1: p = pending::out_file; break; + case 2: p = pending::err_file; break; + } + + // Also sets for stdin, but this is harmless. + // + r->file.mode = tt == type::out_file_ovr ? redirect_fmode::overwrite : + tt == type::out_file_app ? redirect_fmode::append : + redirect_fmode::compare; + + break; + + case redirect_type::here_doc_ref: assert (false); break; + } + + // If we are overriding a here-document, then remove the reference + // to this command redirect from the corresponding here_doc object. 
+ // + if (!pre_parse_ && + overriden && + (*overriden == redirect_type::here_doc_literal || + *overriden == redirect_type::here_doc_regex)) + { + size_t e (expr.size () - 1); + size_t p (expr.back ().pipe.size ()); + int f (static_cast<int> (fd)); + + for (here_doc& d: hd) + { + small_vector<here_redirect, 2>& rs (d.redirects); + + auto i (find_if (rs.begin (), rs.end (), + [e, p, f] (const here_redirect& r) + { + return r.expr == e && + r.pipe == p && + r.fd == f; + })); + + if (i != rs.end ()) + { + rs.erase (i); + break; + } + } + } + }; + + // Set pending cleanup type. + // + auto parse_clean = [&p, &mod] (token& t) + { + p = pending::clean; + mod = move (t.value); + }; + + const location ll (get_location (t)); // Line location. + + // Keep parsing chunks of the command line until we see one of the + // "terminators" (newline, exit status comparison, etc). + // + location l (ll); + names ns; // Reuse to reduce allocations. + + for (bool done (false); !done; l = get_location (t)) + { + tt = ra.resolve (tt); + + switch (tt) + { + case type::newline: + { + done = true; + break; + } + + case type::equal: + case type::not_equal: + { + if (!pre_parse_) + check_pending (l); + + c.exit = parse_command_exit (t, tt); + + // Only a limited set of things can appear after the exit status + // so we check this here. + // + switch (tt) + { + case type::newline: + + case type::pipe: + case type::log_or: + case type::log_and: + break; + + default: + { + // Bail out if this is one of the unknown/unexpected tokens. 
+ // + done = true; + break; + } + } + + break; + } + + case type::pipe: + case type::log_or: + case type::log_and: + + case type::in_pass: + case type::out_pass: + + case type::in_null: + case type::out_null: + + case type::out_trace: + + case type::out_merge: + + case type::in_str: + case type::in_doc: + case type::out_str: + case type::out_doc: + + case type::in_file: + case type::out_file_cmp: + case type::out_file_ovr: + case type::out_file_app: + + case type::clean: + { + if (pre_parse_) + { + // The only things we need to handle here are the here-document + // and here-document regex end markers since we need to know + // how many of them to pre-parse after the command. + // + switch (tt) + { + case type::in_doc: + case type::out_doc: + mod = move (t.value); + + bool re (mod.find ('~') != string::npos); + const char* what (re + ? "here-document regex end marker" + : "here-document end marker"); + + // We require the end marker to be a literal, unquoted word. + // In particular, we don't allow quoted because of cases + // like foo"$bar" (where we will see word 'foo'). + // + next (t, tt); + + // We require the end marker to be an unquoted or completely + // quoted word. The complete quoting becomes important for + // cases like foo"$bar" (where we will see word 'foo'). + // + // For good measure we could have also required it to be + // separated from the following token, but our grammar + // allows one to write >>EOO;. The problematic sequence + // would be >>FOO$bar -- on reparse it will be expanded + // as a single word. + // + if (tt != type::word || t.value.empty ()) + fail (t) << "expected " << what; + + peek (); + const token& p (peeked ()); + if (!p.separated) + { + switch (p.type) + { + case type::dollar: + case type::lparen: + fail (p) << what << " must be literal"; + } + } + + quote_type qt (t.qtype); + switch (qt) + { + case quote_type::unquoted: + qt = quote_type::single; // Treat as single-quoted. 
+ break; + case quote_type::single: + case quote_type::double_: + if (t.qcomp) + break; + // Fall through. + case quote_type::mixed: + fail (t) << "partially-quoted " << what; + } + + regex_parts r; + string end (move (t.value)); + + if (re) + { + check_regex_mod (mod, end, l, what); + + r = parse_regex (end, l, what); + end = move (r.value); // The "cleared" end marker. + } + + bool literal (qt == quote_type::single); + bool shared (false); + + for (const auto& d: hd) + { + if (d.end == end) + { + auto check = [&t, &end, &re, this] (bool c, + const char* what) + { + if (!c) + fail (t) << "different " << what + << " for shared here-document " + << (re ? "regex '" : "'") << end << "'"; + }; + + check (d.modifiers == mod, "modifiers"); + check (d.literal == literal, "quoting"); + + if (re) + { + check (d.regex == r.intro, "introducers"); + check (d.regex_flags == r.flags, "global flags"); + } + + shared = true; + break; + } + } + + if (!shared) + hd.push_back ( + here_doc { + {}, + move (end), + literal, + move (mod), + r.intro, move (r.flags)}); + + break; + } + + next (t, tt); + break; + } + + // If this is one of the operators/separators, check that we + // don't have any pending locations to be filled. + // + check_pending (l); + + // Note: there is another one in the inner loop below. + // + switch (tt) + { + case type::pipe: + case type::log_or: + case type::log_and: + { + // Check that the previous command makes sense. + // + check_command (l, tt != type::pipe); + expr.back ().pipe.push_back (move (c)); + + c = command (); + p = pending::program; + + if (tt != type::pipe) + { + expr_operator o (tt == type::log_or + ? 
expr_operator::log_or + : expr_operator::log_and); + expr.push_back ({o, command_pipe ()}); + } + + break; + } + + case type::in_pass: + case type::out_pass: + + case type::in_null: + case type::out_null: + + case type::out_trace: + + case type::out_merge: + + case type::in_str: + case type::in_doc: + case type::out_str: + case type::out_doc: + + case type::in_file: + case type::out_file_cmp: + case type::out_file_ovr: + case type::out_file_app: + { + parse_redirect (move (t), tt, l); + break; + } + + case type::clean: + { + parse_clean (t); + break; + } + + default: assert (false); break; + } + + next (t, tt); + break; + } + default: + { + // Bail out if this is one of the unknown tokens. + // + if (!start_names (tt)) + { + done = true; + break; + } + + // Here-document end markers are literal (we verified that above + // during pre-parsing) and we need to know whether they were + // quoted. So handle this case specially. + // + { + int fd; + switch (p) + { + case pending::in_document: fd = 0; break; + case pending::out_document: + case pending::out_doc_regex: fd = 1; break; + case pending::err_document: + case pending::err_doc_regex: fd = 2; break; + default: fd = -1; break; + } + + if (fd != -1) + { + if (tt != type::word || t.value.empty ()) + fail (t) << "expected here-document end marker"; + + here_redirect rd { + expr.size () - 1, expr.back ().pipe.size (), fd}; + + string end (move (t.value)); + + regex_parts r; + + if (p == pending::out_doc_regex || + p == pending::err_doc_regex) + { + // We can't fail here as we already parsed all the end + // markers during pre-parsing stage, and so no need in the + // description. + // + r = parse_regex (end, l, ""); + end = move (r.value); // The "cleared" end marker. + } + + bool shared (false); + for (auto& d: hd) + { + // No need to check that redirects that share here-document + // have the same modifiers, etc. That has been done during + // pre-parsing. 
+ // + if (d.end == end) + { + d.redirects.emplace_back (rd); + shared = true; + break; + } + } + + if (!shared) + hd.push_back ( + here_doc { + {rd}, + move (end), + (t.qtype == quote_type::unquoted || + t.qtype == quote_type::single), + move (mod), + r.intro, move (r.flags)}); + + p = pending::none; + mod.clear (); + + next (t, tt); + break; + } + } + + // Parse the next chunk as simple names to get expansion, etc. + // Note that we do it in the chunking mode to detect whether + // anything in each chunk is quoted. + // + // @@ PAT: should we support pattern expansion? This is even + // fuzzier than the variable case above. Though this is the + // shell semantics. Think what happens when we do rm *.txt? + // + reset_quoted (t); + parse_names (t, tt, + ns, + pattern_mode::ignore, + true, + "command line", + nullptr); + + if (pre_parse_) // Nothing else to do if we are pre-parsing. + break; + + // Process what we got. Determine whether anything inside was + // quoted (note that the current token is "next" and is not part + // of this). + // + bool q ((quoted () - + (t.qtype != quote_type::unquoted ? 1 : 0)) != 0); + + for (name& n: ns) + { + string s; + + try + { + s = value_traits<string>::convert (move (n), nullptr); + } + catch (const invalid_argument&) + { + diag_record dr (fail (l)); + dr << "invalid string value "; + to_stream (dr.os, n, true); // Quote. + } + + // If it is a quoted chunk, then we add the word as is. + // Otherwise we re-lex it. But if the word doesn't contain any + // interesting characters (operators plus quotes/escapes), + // then no need to re-lex. + // + // NOTE: update quoting (script.cxx:to_stream_q()) if adding + // any new characters. 
+ // + if (q || s.find_first_of ("|&<>\'\"\\") == string::npos) + add_word (move (s), l); + else + { + // If the chunk re-parsing results in error, our diagnostics + // will look like this: + // + // <string>:1:4: error: stdout merge redirect file descriptor must be 2 + // script:2:5: info: while parsing string '1>&a' + // + auto df = make_diag_frame ( + [this, s, &l](const diag_record& dr) + { + dr << info (l) << "while parsing string '" << s << "'"; + }); + + // When re-lexing we do "effective escaping" and only for + // ['"\] (quotes plus the backslash itself). In particular, + // there is no way to escape redirects, operators, etc. The + // idea is to prefer quoting except for passing literal + // quotes, for example: + // + // args = \"&foo\" + // cmd $args # cmd &foo + // + // args = 'x=\"foo bar\"' + // cmd $args # cmd x="foo bar" + // + istringstream is (s); + path_name in ("<string>"); + lexer lex (is, in, + lexer_mode::command_expansion, + ra, + "\'\"\\"); + + // Treat the first "sub-token" as always separated from what + // we saw earlier. + // + // Note that this is not "our" token so we cannot do + // fail(t). Rather we should do fail(l). + // + token t (lex.next ()); + location l (build2::get_location (t, in)); + t.separated = true; + + string w; + bool f (t.type == type::eos); // If the whole thing is empty. + + for (; t.type != type::eos; t = lex.next ()) + { + type tt (ra.resolve (t.type)); + l = build2::get_location (t, in); + + // Re-lexing double-quotes will recognize $, ( inside as + // tokens so we have to reverse them back. Since we don't + // treat spaces as separators we can be sure we will get + // it right. + // + switch (tt) + { + case type::dollar: w += '$'; continue; + case type::lparen: w += '('; continue; + } + + // Retire the current word. We need to distinguish between + // empty and non-existent (e.g., > vs >""). 
+ // + if (!w.empty () || f) + { + add_word (move (w), l); + f = false; + } + + if (tt == type::word) + { + w = move (t.value); + f = true; + continue; + } + + // If this is one of the operators/separators, check that + // we don't have any pending locations to be filled. + // + check_pending (l); + + // Note: there is another one in the outer loop above. + // + switch (tt) + { + case type::pipe: + case type::log_or: + case type::log_and: + { + // Check that the previous command makes sense. + // + check_command (l, tt != type::pipe); + expr.back ().pipe.push_back (move (c)); + + c = command (); + p = pending::program; + + if (tt != type::pipe) + { + expr_operator o (tt == type::log_or + ? expr_operator::log_or + : expr_operator::log_and); + expr.push_back ({o, command_pipe ()}); + } + + break; + } + + case type::in_pass: + case type::out_pass: + + case type::in_null: + case type::out_null: + + case type::out_trace: + + case type::out_merge: + + case type::in_str: + case type::out_str: + + case type::in_file: + case type::out_file_cmp: + case type::out_file_ovr: + case type::out_file_app: + { + parse_redirect (move (t), tt, l); + break; + } + + case type::clean: + { + parse_clean (t); + break; + } + + case type::in_doc: + case type::out_doc: + { + fail (l) << "here-document redirect in expansion"; + break; + } + } + } + + // Don't forget the last word. + // + if (!w.empty () || f) + add_word (move (w), l); + } + } + + ns.clear (); + break; + } + } + } + + if (!pre_parse_) + { + // Verify we don't have anything pending to be filled and the + // command makes sense. + // + check_pending (l); + check_command (l, true); + + expr.back ().pipe.push_back (move (c)); + } + + return make_pair (move (expr), move (hd)); + } + + command_exit parser:: + parse_command_exit (token& t, type& tt) + { + // enter: equal/not_equal + // leave: token after exit status (one parse_names() chunk) + + exit_comparison comp (tt == type::equal + ? 
exit_comparison::eq + : exit_comparison::ne); + + // The next chunk should be the exit status. + // + next (t, tt); + location l (get_location (t)); + names ns (parse_names (t, tt, + pattern_mode::ignore, + true, + "exit status", + nullptr)); + unsigned long es (256); + + if (!pre_parse_) + { + try + { + if (ns.size () == 1 && ns[0].simple () && !ns[0].empty ()) + es = stoul (ns[0].value); + } + catch (const exception&) {} // Fall through. + + if (es > 255) + { + diag_record dr; + + dr << fail (l) << "expected exit status instead of "; + to_stream (dr.os, ns, true); // Quote. + + dr << info << "exit status is an unsigned integer less than 256"; + } + } + + return command_exit {comp, static_cast<uint8_t> (es)}; + } + + void parser:: + parse_here_documents (token& t, type& tt, + pair<command_expr, here_docs>& p) + { + // enter: newline + // leave: newline + + // Parse here-document fragments in the order they were mentioned on + // the command line. + // + for (here_doc& h: p.second) + { + // Switch to the here-line mode which is like single/double-quoted + // string but recognized the newline as a separator. + // + mode (h.literal + ? lexer_mode::here_line_single + : lexer_mode::here_line_double); + next (t, tt); + + parsed_doc v ( + parse_here_document (t, tt, h.end, h.modifiers, h.regex)); + + // If all the here-document redirects are overridden, then we just + // drop the fragment. + // + if (!pre_parse_ && !h.redirects.empty ()) + { + auto i (h.redirects.cbegin ()); + + command& c (p.first[i->expr].pipe[i->pipe]); + + optional<redirect>& r (i->fd == 0 ? c.in : + i->fd == 1 ? c.out : + c.err); + + assert (r); // Must be present since it is referred. 
+ + if (v.re) + { + assert (r->type == redirect_type::here_doc_regex); + + r->regex = move (v.regex); + r->regex.flags = move (h.regex_flags); + } + else + { + assert (r->type == redirect_type::here_doc_literal); + + r->str = move (v.str); + } + + r->end = move (h.end); + r->end_line = v.end_line; + r->end_column = v.end_column; + + // Note that our references cannot be invalidated because the + // command_expr/command-pipe vectors already contain all their + // elements. + // + for (++i; i != h.redirects.cend (); ++i) + { + command& c (p.first[i->expr].pipe[i->pipe]); + + optional<redirect>& ir (i->fd == 0 ? c.in : + i->fd == 1 ? c.out : + c.err); + + // Must be present since it is referenced by here-doc. + // + assert (ir); + + // Note: preserve the original representation. + // + ir = redirect (redirect_type::here_doc_ref, *r, move (ir->token)); + } + } + + expire_mode (); + } + } + + parser::parsed_doc parser:: + parse_here_document (token& t, type& tt, + const string& em, + const string& mod, + char re) + { + // enter: first token on first line + // leave: newline (after end marker) + + // String literal. Note that when decide if to terminate the previously + // added line with a newline, we need to distinguish a yet empty result + // and the one that has a single blank line added. + // + optional<string> rs; + + regex_lines rre; + + // Here-documents can be indented. The leading whitespaces of the end + // marker line (called strip prefix) determine the indentation. Every + // other line in the here-document should start with this prefix which + // is automatically stripped. The only exception is a blank line. + // + // The fact that the strip prefix is only known at the end, after + // seeing all the lines, is rather inconvenient. 
As a result, the way + // we implement this is a bit hackish (though there is also something + // elegant about it): at the end of the pre-parse stage we are going + // re-examine the sequence of tokens that comprise this here-document + // and "fix up" the first token of each line by stripping the prefix. + // + string sp; + + // Remember the position of the first token in this here-document. + // + size_t ri (pre_parse_ ? replay_data_.size () - 1 : 0); + + // We will use the location of the first token on the line for the + // regex diagnostics. At the end of the loop it will point to the + // beginning of the end marker. + // + location l; + + while (tt != type::eos) + { + l = get_location (t); + + // Check if this is the end marker. For starters, it should be a + // single, unquoted word followed by a newline. + // + if (tt == type::word && + t.qtype == quote_type::unquoted && + peek () == type::newline) + { + const string& v (t.value); + + size_t vn (v.size ()); + size_t en (em.size ()); + + // Then check that it ends with the end marker. + // + if (vn >= en && v.compare (vn - en, en, em) == 0) + { + // Now check that the prefix only contains whitespaces. + // + size_t n (vn - en); + + if (v.find_first_not_of (" \t") >= n) + { + assert (pre_parse_ || n == 0); // Should have been stripped. + + if (n != 0) + sp.assign (v, 0, n); // Save the strip prefix. + + next (t, tt); // Get the newline. + break; + } + } + } + + // Expand the line (can be blank). + // + // @@ PAT: one could argue that if we do it in variables, then we + // should do it here as well. Though feels bizarre. + // + names ns (tt != type::newline + ? parse_names (t, tt, + pattern_mode::ignore, + false, + "here-document line", + nullptr) + : names ()); + + if (!pre_parse_) + { + // What shall we do if the expansion results in multiple names? + // For, example if the line contains just the variable expansion + // and it is of type strings. 
Adding all the elements space- + // separated seems like the natural thing to do. + // + string s; + for (auto b (ns.begin ()), i (b); i != ns.end (); ++i) + { + string n; + + try + { + n = value_traits<string>::convert (move (*i), nullptr); + } + catch (const invalid_argument&) + { + fail (l) << "invalid string value '" << *i << "'"; + } + + if (i == b) + s = move (n); + else + { + s += ' '; + s += n; + } + } + + if (!re) + { + // Add newline after previous line. + // + if (rs) + { + *rs += '\n'; + *rs += s; + } + else + rs = move (s); + } + else + { + // Due to expansion we can end up with multiple lines. If empty + // then will add a blank textual literal. + // + for (size_t p (0); p != string::npos; ) + { + string ln; + size_t np (s.find ('\n', p)); + + if (np != string::npos) + { + ln = string (s, p, np - p); + p = np + 1; + } + else + { + ln = string (s, p); + p = np; + } + + if (ln[0] != re) // Line doesn't start with regex introducer. + { + // This is a line-char literal (covers blank lines as well). + // + // Append textual literal. + // + rre.lines.emplace_back (l.line, l.column, move (ln), false); + } + else // Line starts with the regex introducer. + { + // This is a char-regex, or a sequence of line-regex syntax + // characters or both (in this specific order). So we will + // add regex (with optional special characters) or special + // literal. + // + size_t p (ln.find (re, 1)); + if (p == string::npos) + { + // No regex, just a sequence of syntax characters. + // + string spec (ln, 1); + if (spec.empty ()) + fail (l) << "no syntax line characters"; + + // Append special literal. + // + rre.lines.emplace_back ( + l.line, l.column, move (spec), true); + } + else + { + // Regex (probably with syntax characters). + // + regex_parts re; + + // Empty regex is a special case representing a blank line. + // + if (p == 1) + // Position to optional special characters of an empty + // regex. 
+ // + ++p; + else + // Can't fail as all the pre-conditions verified + // (non-empty with both introducers in place), so no + // description required. + // + re = parse_regex (ln, l, "", &p); + + // Append regex with optional special characters. + // + rre.lines.emplace_back (l.line, l.column, + move (re.value), move (re.flags), + string (ln, p)); + } + } + } + } + } + + // We should expand the whole line at once so this would normally be + // a newline but can also be an end-of-stream. + // + if (tt == type::newline) + next (t, tt); + else + assert (tt == type::eos); + } + + if (tt == type::eos) + fail (t) << "missing here-document end marker '" << em << "'"; + + if (pre_parse_) + { + // Strip the indentation prefix if there is one. + // + assert (replay_ == replay::save); + + if (!sp.empty ()) + { + size_t sn (sp.size ()); + + for (; ri != replay_data_.size (); ++ri) + { + token& rt (replay_data_[ri].token); + + if (rt.type == type::newline) // Blank + continue; + + if (rt.type != type::word || rt.value.compare (0, sn, sp) != 0) + fail (rt) << "unindented here-document line"; + + // If the word is equal to the strip prefix then we have to drop + // the token. Note that simply making it an empty word won't + // have the same semantics. For instance, it would trigger + // concatenated expansion. + // + if (rt.value.size () == sn) + replay_data_.erase (replay_data_.begin () + ri); + else + { + rt.value.erase (0, sn); + rt.column += sn; + ++ri; + } + + // Skip until next newline. + // + for (; replay_data_[ri].token.type != type::newline; ++ri) ; + } + } + } + else + { + // Add final newline unless suppressed. + // + if (mod.find (':') == string::npos) + { + if (re) + // Note that the position is synthetic, but that's ok as we don't + // expect any diagnostics to refer this line. + // + rre.lines.emplace_back (l.line, l.column, string (), false); + else if (rs) + *rs += '\n'; + else + rs = "\n"; + } + + // Finalize regex lines. 
+ // + if (re) + { + // Empty regex matches nothing, so not of much use. + // + if (rre.lines.empty ()) + fail (l) << "empty here-document regex"; + + rre.intro = re; + } + } + + return re + ? parsed_doc (move (rre), l.line, l.column) + : parsed_doc (rs ? move (*rs) : string (), l.line, l.column); + } + + size_t parser:: + quoted () const + { + size_t r (0); + + if (replay_ != replay::play) + r = lexer_->quoted (); + else + { + // Examine tokens we have replayed since last reset. + // + for (size_t i (replay_quoted_); i != replay_i_; ++i) + if (replay_data_[i].token.qtype != quote_type::unquoted) + ++r; + } + + return r; + } + + void parser:: + reset_quoted (token& cur) + { + if (replay_ != replay::play) + lexer_->reset_quoted (cur.qtype != quote_type::unquoted ? 1 : 0); + else + { + replay_quoted_ = replay_i_ - 1; + + // Must be the same token. + // + assert (replay_data_[replay_quoted_].token.qtype == cur.qtype); + } + } + + void parser:: + set_lexer (lexer* l) + { + lexer_ = l; + build2::parser::lexer_ = l; + } + + static redirect_aliases no_redirect_aliases; + + void parser:: + apply_value_attributes (const variable* var, + value& lhs, + value&& rhs, + const string& attributes, + token_type kind, + const path_name& name) + { + path_ = &name; + + istringstream is (attributes); + + // Note that the redirect alias information is not used in the + // attributes lexer mode. + // + lexer l (is, name, lexer_mode::attributes, no_redirect_aliases); + + set_lexer (&l); + + token t; + type tt; + + next_with_attributes (t, tt); // Enable `[` recognition. 
+ + if (tt != type::lsbrace && tt != type::eos) + fail (t) << "expected '[' instead of " << t; + + attributes_push (t, tt, true); + + if (tt != type::eos) + fail (t) << "trailing junk after ']'"; + + build2::parser::apply_value_attributes (var, lhs, move (rhs), kind); + } + + line_type parser:: + pre_parse_line_start (token& t, token_type& tt, lexer_mode stm) + { + replay_save (); // Start saving tokens from the current one. + next (t, tt); + + // Decide whether this is a variable assignment or a command. + // + // It is an assignment if the first token is an unquoted name and + // the next token is an assign/append/prepend operator. Assignment + // to a computed variable name must use the set builtin. + // + // Note also that special commands take precedence over variable + // assignments. + // + line_type r (line_type::cmd); // Default. + + if (tt == type::word && t.qtype == quote_type::unquoted) + { + const string& n (t.value); + + if (n == "if") r = line_type::cmd_if; + else if (n == "if!") r = line_type::cmd_ifn; + else if (n == "elif") r = line_type::cmd_elif; + else if (n == "elif!") r = line_type::cmd_elifn; + else if (n == "else") r = line_type::cmd_else; + else if (n == "end") r = line_type::cmd_end; + else + { + // Switch the recognition of leading variable assignments for + // the next token. This is safe to do because we know we + // cannot be in the quoted mode (since the current token is + // not quoted). + // + type p (peek (stm)); + + if (p == type::assign || p == type::prepend || p == type::append) + { + r = line_type::var; + + // Note that the missing command program is detected later, by + // parse_command_expr(). 
+ // + if (n.empty ()) + fail (t) << "missing variable name"; + } + } + } + + return r; + } + + bool parser:: + exec_lines (lines::const_iterator i, lines::const_iterator e, + const function<exec_set_function>& exec_set, + const function<exec_cmd_function>& exec_cmd, + const function<exec_if_function>& exec_if, + size_t& li, + variable_pool* var_pool) + { + try + { + token t; + type tt; + for (; i != e; ++i) + { + const line& ln (*i); + line_type lt (ln.type); + + assert (path_ == nullptr); + + // Copy the tokens and start playing. + // + replay_data (replay_tokens (ln.tokens)); + + // We don't really need to change the mode since we already know + // the line type. + // + next (t, tt); + const location ll (get_location (t)); + + switch (lt) + { + case line_type::var: + { + // Enter the variable into the pool if this is not done during + // the script parsing. Note that in this case the pool is + // expected to be provided. + // + const variable* var (ln.var); + + if (var == nullptr) + { + assert (var_pool != nullptr); + + var = &var_pool->insert (t.value); + } + + exec_set (*var, t, tt, ll); + + replay_stop (); + break; + } + case line_type::cmd: + { + bool single (false); + + if (li == 1) + { + lines::const_iterator j (i); + for (++j; j != e && j->type == line_type::var; ++j) ; + + if (j == e) // We have no another command. + single = true; + } + + exec_cmd (t, tt, li++, single, ll); + + replay_stop (); + break; + } + case line_type::cmd_if: + case line_type::cmd_ifn: + case line_type::cmd_elif: + case line_type::cmd_elifn: + case line_type::cmd_else: + { + next (t, tt); // Skip to start of command. + + bool take; + if (lt != line_type::cmd_else) + { + take = exec_if (t, tt, li++, ll); + + if (lt == line_type::cmd_ifn || lt == line_type::cmd_elifn) + take = !take; + } + else + { + assert (tt == type::newline); + take = true; + } + + replay_stop (); + + // If end is true, then find the 'end' line. Otherwise, find + // the next if-else line. 
If skip is true then increment the + // command line index. + // + auto next = [e, &li] (lines::const_iterator j, + bool end, + bool skip) -> lines::const_iterator + { + // We need to be aware of nested if-else chains. + // + size_t n (0); + + for (++j; j != e; ++j) + { + line_type lt (j->type); + + if (lt == line_type::cmd_if || lt == line_type::cmd_ifn) + ++n; + + // If we are nested then we just wait until we get back + // to the surface. + // + if (n == 0) + { + switch (lt) + { + case line_type::cmd_elif: + case line_type::cmd_elifn: + case line_type::cmd_else: + if (end) break; + // Fall through. + case line_type::cmd_end: return j; + default: break; + } + } + + if (lt == line_type::cmd_end) + --n; + + if (skip) + { + // Note that we don't count else and end as commands. + // + switch (lt) + { + case line_type::cmd: + case line_type::cmd_if: + case line_type::cmd_ifn: + case line_type::cmd_elif: + case line_type::cmd_elifn: ++li; break; + default: break; + } + } + } + + assert (false); // Missing end. + return e; + }; + + // If we are taking this branch then we need to parse all the + // lines until the next if-else line and then skip all the + // lines until the end (unless next is already end). + // + // Otherwise, we need to skip all the lines until the next + // if-else line and then continue parsing. + // + if (take) + { + // Next if-else. + // + lines::const_iterator j (next (i, false, false)); + if (!exec_lines (i + 1, j, + exec_set, exec_cmd, exec_if, + li, + var_pool)) + return false; + + i = j->type == line_type::cmd_end ? j : next (j, true, true); + } + else + { + i = next (i, false, true); + if (i->type != line_type::cmd_end) + --i; // Continue with this line (e.g., elif or else). + } + + break; + } + case line_type::cmd_end: + { + assert (false); + } + } + } + + return true; + } + catch (const exit& e) + { + // Bail out if the script is exited with the failure status. Otherwise + // exit the lines execution normally. 
+ // + if (!e.status) + throw failed (); + + replay_stop (); + return false; + } + } + + // parser::parsed_doc + // + parser::parsed_doc:: + parsed_doc (string s, uint64_t l, uint64_t c) + : str (move (s)), re (false), end_line (l), end_column (c) + { + } + + parser::parsed_doc:: + parsed_doc (regex_lines&& r, uint64_t l, uint64_t c) + : regex (move (r)), re (true), end_line (l), end_column (c) + { + } + + parser::parsed_doc:: + parsed_doc (parsed_doc&& d) + : re (d.re), end_line (d.end_line), end_column (d.end_column) + { + if (re) + new (&regex) regex_lines (move (d.regex)); + else + new (&str) string (move (d.str)); + } + + parser::parsed_doc:: + ~parsed_doc () + { + if (re) + regex.~regex_lines (); + else + str.~string (); + } + } +} diff --git a/libbuild2/script/parser.hxx b/libbuild2/script/parser.hxx new file mode 100644 index 0000000..a63ecde --- /dev/null +++ b/libbuild2/script/parser.hxx @@ -0,0 +1,189 @@ +// file : libbuild2/script/parser.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef LIBBUILD2_SCRIPT_PARSER_HXX +#define LIBBUILD2_SCRIPT_PARSER_HXX + +#include <libbuild2/types.hxx> +#include <libbuild2/forward.hxx> +#include <libbuild2/utility.hxx> + +#include <libbuild2/parser.hxx> +#include <libbuild2/diagnostics.hxx> + +#include <libbuild2/script/token.hxx> +#include <libbuild2/script/lexer.hxx> // redirect_aliases +#include <libbuild2/script/script.hxx> + +namespace build2 +{ + namespace script + { + class lexer; + struct lexer_mode; + + class parser: protected build2::parser + { + public: + parser (context& c): build2::parser (c) {} + + // Helpers. + // + // Parse attribute string and perform attribute-guided assignment. + // Issue diagnostics and throw failed in case of an error. + // + void + apply_value_attributes (const variable*, // Optional. + value& lhs, + value&& rhs, + const string& attributes, + token_type assign_kind, + const path_name&); // For diagnostics. 
+ + using build2::parser::apply_value_attributes; + + // Commonly used parsing functions. Issue diagnostics and throw failed + // in case of an error. + // + // Usually (but not always) parse functions receive the token/type + // from which it should start consuming and in return the token/type + // should contain the first token that has not been consumed. + // + // Functions that are called parse_*() rather than pre_parse_*() can be + // used for both stages. + // + protected: + value + parse_variable_line (token&, token_type&); + + // Ordered sequence of here-document redirects that we can expect to + // see after the command line. + // + struct here_redirect + { + size_t expr; // Index in command_expr. + size_t pipe; // Index in command_pipe. + int fd; // Redirect fd (0 - in, 1 - out, 2 - err). + }; + + struct here_doc + { + // Redirects that share here_doc. Most of the time we will have no + // more than 2 (2 - for the roundtrip cases). Doesn't refer overridden + // redirects and thus can be empty. + // + small_vector<here_redirect, 2> redirects; + + string end; + bool literal; // Literal (single-quote). + string modifiers; + + // Regex introducer ('\0' if not a regex, so can be used as bool). + // + char regex; + + // Regex global flags. Meaningful if regex != '\0'. + // + string regex_flags; + }; + using here_docs = vector<here_doc>; + + pair<command_expr, here_docs> + parse_command_expr (token&, token_type&, const redirect_aliases&); + + command_exit + parse_command_exit (token&, token_type&); + + void + parse_here_documents (token&, token_type&, + pair<command_expr, here_docs>&); + + struct parsed_doc + { + union + { + string str; // Here-document literal. + regex_lines regex; // Here-document regex. + }; + + bool re; // True if regex. + uint64_t end_line; // Here-document end marker location. 
+ uint64_t end_column; + + parsed_doc (string, uint64_t line, uint64_t column); + parsed_doc (regex_lines&&, uint64_t line, uint64_t column); + parsed_doc (parsed_doc&&); // Note: move constructible-only type. + ~parsed_doc (); + }; + + parsed_doc + parse_here_document (token&, token_type&, + const string&, + const string& mode, + char re_intro); // '\0' if not a regex. + + // Start pre-parsing a script line returning its type, detected based on + // the first two tokens. Use the specified lexer mode to peek the second + // token. + // + line_type + pre_parse_line_start (token&, token_type&, lexer_mode); + + // Execute. + // + protected: + // Return false if the execution of the script should be terminated with + // the success status (e.g., as a result of encountering the exit + // builtin). For unsuccessful termination the failed exception is thrown. + // + using exec_set_function = void (const variable&, + token&, token_type&, + const location&); + + using exec_cmd_function = void (token&, token_type&, + size_t li, + bool single, + const location&); + + using exec_if_function = bool (token&, token_type&, + size_t li, + const location&); + + // If a parser implementation doesn't pre-enter variables into a pool + // during the pre-parsing phase, then they are entered during the + // execution phase and so the variable pool must be provided. Note that + // in this case the variable pool insertions are not MT-safe. + // + bool + exec_lines (lines::const_iterator b, lines::const_iterator e, + const function<exec_set_function>&, + const function<exec_cmd_function>&, + const function<exec_if_function>&, + size_t& li, + variable_pool* = nullptr); + + // Set lexer pointers for both the current and the base classes. + // + protected: + void + set_lexer (lexer*); + + // Number of quoted tokens since last reset. Note that this includes + // the peeked token, if any. 
+ // + protected: + size_t + quoted () const; + + void + reset_quoted (token& current); + + size_t replay_quoted_; + + protected: + lexer* lexer_ = nullptr; + }; + } +} + +#endif // LIBBUILD2_SCRIPT_PARSER_HXX diff --git a/libbuild2/script/regex.cxx b/libbuild2/script/regex.cxx new file mode 100644 index 0000000..3f796b6 --- /dev/null +++ b/libbuild2/script/regex.cxx @@ -0,0 +1,436 @@ +// file : libbuild2/script/regex.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#include <locale> + +#include <libbuild2/script/regex.hxx> + +using namespace std; + +namespace build2 +{ + namespace script + { + namespace regex + { + static_assert (alignof (char_string) % 4 == 0, + "unexpected char_string alignment"); + + static_assert (alignof (char_regex) % 4 == 0, + "unexpected char_regex alignment"); + + static_assert (sizeof (uintptr_t) > sizeof (int16_t), + "unexpected uintptr_t size"); + + const line_char line_char::nul (0); + const line_char line_char::eof (-1); + + // line_char + // + // We package the special character into uintptr_t with the following + // steps: + // + // - narrow down int value to int16_t (preserves all the valid values) + // + // - convert to uint16_t (bitwise representation stays the same, but no + // need to bother with signed value widening, leftmost bits loss on + // left shift, etc) + // + // - convert to uintptr_t (storage type) + // + // - shift left by two bits (the operation is fully reversible as + // uintptr_t is wider than uint16_t) + // + line_char:: + line_char (int c) + : data_ ( + (static_cast <uintptr_t> ( + static_cast<uint16_t> ( + static_cast<int16_t> (c))) << 2) | + static_cast <uintptr_t> (line_type::special)) + { + // @@ How can we allow anything for basic_regex but only subset + // for our own code? + // + const char ex[] = "pn\n\r"; + + assert (c == 0 || // Null character. + + // EOF. Note that is also passed by msvcrt as _Meta_eos + // enum value. + // + c == -1 || + + // libstdc++ line/paragraph separators. 
+ // + c == u'\u2028' || c == u'\u2029' || + + (c > 0 && c <= 255 && ( + // Supported regex special characters. + // + syntax (c) || + + // libstdc++ look-ahead tokens, newline chars. + // + string::traits_type::find (ex, 4, c) != nullptr))); + } + + line_char:: + line_char (const char_string& s, line_pool& p) + : line_char (&(*p.strings.emplace (s).first)) + { + } + + line_char:: + line_char (char_string&& s, line_pool& p) + : line_char (&(*p.strings.emplace (move (s)).first)) + { + } + + line_char:: + line_char (char_regex r, line_pool& p) + // Note: in C++17 can write as p.regexes.emplace_front(move (r)) + // + : line_char (&(*p.regexes.emplace (p.regexes.begin (), move (r)))) + { + } + + bool + line_char::syntax (char c) + { + return string::traits_type::find ( + "()|.*+?{}\\0123456789,=!", 23, c) != nullptr; + } + + bool + operator== (const line_char& l, const line_char& r) + { + line_type lt (l.type ()); + line_type rt (r.type ()); + + if (lt == rt) + { + bool res (true); + + switch (lt) + { + case line_type::special: res = l.special () == r.special (); break; + case line_type::regex: assert (false); break; + + // Note that we use pointers (rather than values) comparison + // assuming that the strings must belong to the same pool. + // + case line_type::literal: res = l.literal () == r.literal (); break; + } + + return res; + } + + // Match literal with regex. 
+ // + if (lt == line_type::literal && rt == line_type::regex) + return regex_match (*l.literal (), *r.regex ()); + else if (rt == line_type::literal && lt == line_type::regex) + return regex_match (*r.literal (), *l.regex ()); + + return false; + } + + bool + operator< (const line_char& l, const line_char& r) + { + if (l == r) + return false; + + line_type lt (l.type ()); + line_type rt (r.type ()); + + if (lt != rt) + return lt < rt; + + bool res (false); + + switch (lt) + { + case line_type::special: res = l.special () < r.special (); break; + case line_type::literal: res = *l.literal () < *r.literal (); break; + case line_type::regex: assert (false); break; + } + + return res; + } + + // line_char_locale + // + + // An exemplar locale with the std::ctype<line_char> facet. It is used + // for the subsequent line char locale objects creation (see below) + // which normally ends up with a shallow copy of a reference-counted + // object. + // + // Note that creating the line char locales from the exemplar is not + // merely an optimization: there is a data race in the libstdc++ (at + // least as of GCC 9.1) implementation of the locale(const locale&, + // Facet*) constructor (bug #91057). + // + // Also note that we install the facet in init() rather than during + // the object creation to avoid a race with the std::locale-related + // global variables initialization. + // + static locale line_char_locale_exemplar; + + void + init () + { + line_char_locale_exemplar = + locale (locale (), + new std::ctype<line_char> ()); // Hidden by ctype bitmask. + } + + line_char_locale:: + line_char_locale () + : locale (line_char_locale_exemplar) + { + // Make sure init() has been called. + // + // Note: has_facet() is hidden by a private function in libc++. + // + assert (std::has_facet<std::ctype<line_char>> (*this)); + } + + // char_regex + // + // Transform regex according to the extended flags {idot}. 
If regex is + // malformed then keep transforming, so the resulting string is + // malformed the same way. We expect the error to be reported by the + // char_regex ctor. + // + static string + transform (const string& s, char_flags f) + { + assert ((f & char_flags::idot) != char_flags::none); + + string r; + bool escape (false); + bool cclass (false); + + for (char c: s) + { + // Inverse escaping for a dot which is out of the char class + // brackets. + // + bool inverse (c == '.' && !cclass); + + // Handle the escape case. Note that we delay adding the backslash + // since we may have to inverse things. + // + if (escape) + { + if (!inverse) + r += '\\'; + + r += c; + escape = false; + + continue; + } + else if (c == '\\') + { + escape = true; + continue; + } + + // Keep track of being inside the char class brackets, escape if + // inversion. Note that we never inverse square brackets. + // + if (c == '[' && !cclass) + cclass = true; + else if (c == ']' && cclass) + cclass = false; + else if (inverse) + r += '\\'; + + r += c; + } + + if (escape) // Regex is malformed but that's not our problem. + r += '\\'; + + return r; + } + + static char_regex::flag_type + to_std_flags (char_flags f) + { + // Note that ECMAScript flag is implied in the absence of a grammar + // flag. + // + return (f & char_flags::icase) != char_flags::none + ? char_regex::icase + : char_regex::flag_type (); + } + + char_regex:: + char_regex (const char_string& s, char_flags f) + : base_type ((f & char_flags::idot) != char_flags::none + ? 
transform (s, f) + : s, + to_std_flags (f)) + { + } + } + } +} + +namespace std +{ + using namespace build2::script::regex; + + // char_traits<line_char> + // + line_char* char_traits<line_char>:: + assign (char_type* s, size_t n, char_type c) + { + for (size_t i (0); i != n; ++i) + s[i] = c; + return s; + } + + line_char* char_traits<line_char>:: + move (char_type* d, const char_type* s, size_t n) + { + if (n > 0 && d != s) + { + // If d < s then it can't be in [s, s + n) range and so using copy() is + // safe. Otherwise d + n is out of (s, s + n] range and so using + // copy_backward() is safe. + // + if (d < s) + std::copy (s, s + n, d); // Hidden by char_traits<line_char>::copy(). + else + copy_backward (s, s + n, d + n); + } + + return d; + } + + line_char* char_traits<line_char>:: + copy (char_type* d, const char_type* s, size_t n) + { + std::copy (s, s + n, d); // Hidden by char_traits<line_char>::copy(). + return d; + } + + int char_traits<line_char>:: + compare (const char_type* s1, const char_type* s2, size_t n) + { + for (size_t i (0); i != n; ++i) + { + if (s1[i] < s2[i]) + return -1; + else if (s2[i] < s1[i]) + return 1; + } + + return 0; + } + + size_t char_traits<line_char>:: + length (const char_type* s) + { + size_t i (0); + while (s[i] != char_type::nul) + ++i; + + return i; + } + + const line_char* char_traits<line_char>:: + find (const char_type* s, size_t n, const char_type& c) + { + for (size_t i (0); i != n; ++i) + { + if (s[i] == c) + return s + i; + } + + return nullptr; + } + + // ctype<line_char> + // + locale::id ctype<line_char>::id; + + const line_char* ctype<line_char>:: + is (const char_type* b, const char_type* e, mask* m) const + { + while (b != e) + { + const char_type& c (*b++); + + *m++ = c.type () == line_type::special && c.special () >= 0 && + build2::digit (static_cast<char> (c.special ())) + ? 
digit + : 0; + } + + return e; + } + + const line_char* ctype<line_char>:: + scan_is (mask m, const char_type* b, const char_type* e) const + { + for (; b != e; ++b) + { + if (is (m, *b)) + return b; + } + + return e; + } + + const line_char* ctype<line_char>:: + scan_not (mask m, const char_type* b, const char_type* e) const + { + for (; b != e; ++b) + { + if (!is (m, *b)) + return b; + } + + return e; + } + + const char* ctype<line_char>:: + widen (const char* b, const char* e, char_type* c) const + { + while (b != e) + *c++ = widen (*b++); + + return e; + } + + const line_char* ctype<line_char>:: + narrow (const char_type* b, const char_type* e, char def, char* c) const + { + while (b != e) + *c++ = narrow (*b++, def); + + return e; + } + + // regex_traits<line_char> + // + int regex_traits<line_char>:: + value (char_type c, int radix) const + { + assert (radix == 8 || radix == 10 || radix == 16); + + if (c.type () != line_type::special) + return -1; + + const char digits[] = "0123456789ABCDEF"; + const char* d (string::traits_type::find (digits, radix, c.special ())); + return d != nullptr ? static_cast<int> (d - digits) : -1; + } +} diff --git a/libbuild2/script/regex.hxx b/libbuild2/script/regex.hxx new file mode 100644 index 0000000..30d3363 --- /dev/null +++ b/libbuild2/script/regex.hxx @@ -0,0 +1,678 @@ +// file : libbuild2/script/regex.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef LIBBUILD2_SCRIPT_REGEX_HXX +#define LIBBUILD2_SCRIPT_REGEX_HXX + +#include <list> +#include <regex> +#include <locale> +#include <string> // basic_string +#include <type_traits> // make_unsigned, enable_if, is_* +#include <unordered_set> + +#include <libbuild2/types.hxx> +#include <libbuild2/utility.hxx> + +namespace build2 +{ + namespace script + { + namespace regex + { + using char_string = std::basic_string<char>; + + enum class char_flags: uint16_t + { + icase = 0x1, // Case-insensitive match. + idot = 0x2, // Invert '.' escaping. 
+ + none = 0 + }; + + // Restricts valid standard flags to just {icase}, extends with custom + // flags {idot}. + // + class char_regex: public std::basic_regex<char> + { + public: + using base_type = std::basic_regex<char>; + + char_regex (const char_string&, char_flags = char_flags::none); + }; + + // Newlines are line separators and are not part of the line: + // + // line<newline>line<newline> + // + // Specifically, this means that a customary trailing newline creates a + // trailing blank line. + // + // All characters can inter-compare (though there cannot be regex + // characters in the output, only in line_regex). + // + // Note that we assume that line_regex and the input to regex_match() + // use the same pool. + // + struct line_pool + { + // Note that we assume the pool can be moved without invalidating + // pointers to any already pooled entities. + // + std::unordered_set<char_string> strings; + std::list<char_regex> regexes; + }; + + enum class line_type + { + special, + literal, + regex + }; + + struct line_char + { + // Steal last two bits from the pointer to store the type. + // + private: + std::uintptr_t data_; + + public: + line_type + type () const {return static_cast<line_type> (data_ & 0x3);} + + int + special () const + { + // Stored as (shifted) int16_t. Perform steps reversed to those + // that are described in the comment for the corresponding ctor. + // Note that the intermediate cast to uint16_t is required to + // portably preserve the -1 special character. + // + return static_cast<int16_t> (static_cast<uint16_t> (data_ >> 2)); + } + + const char_string* + literal () const + { + // Note that 2 rightmost bits are used for packaging line_char + // type. Read the comment for the corresponding ctor for details. + // + return reinterpret_cast<const char_string*> ( + data_ & ~std::uintptr_t (0x3)); + } + + const char_regex* + regex () const + { + // Note that 2 rightmost bits are used for packaging line_char + // type. 
Read the comment for the corresponding ctor for details. + // + return reinterpret_cast<const char_regex*> ( + data_ & ~std::uintptr_t (0x3)); + } + + static const line_char nul; + static const line_char eof; + + // Note: creates an uninitialized value. + // + line_char () = default; + + // Create a special character. The argument value must be one of the + // following ones: + // + // 0 (nul character) + // -1 (EOF) + // [()|.*+?{}\0123456789,=!] (excluding []) + // + // Note that the constructor is implicit to allow basic_regex to + // implicitly construct line_chars from special char literals (in + // particular libstdc++ appends them to an internal line_string). + // + // Also note that we extend the valid characters set (see above) with + // 'p', 'n' (used by libstdc++ for positive/negative look-ahead + // tokens representation), and '\n', '\r', u'\u2028', u'\u2029' (used + // by libstdc++ for newline/newparagraph matching). + // + line_char (int); + + // Create a literal character. + // + // Don't copy string if already pooled. + // + explicit + line_char (const char_string&, line_pool&); + + explicit + line_char (char_string&&, line_pool&); + + explicit + line_char (const char_string* s) // Assume already pooled. + // + // Steal two bits from the pointer to package line_char type. + // Assume (and statically assert) that char_string address is a + // multiple of four. + // + : data_ (reinterpret_cast <std::uintptr_t> (s) | + static_cast <std::uintptr_t> (line_type::literal)) {} + + // Create a regex character. + // + explicit + line_char (char_regex, line_pool&); + + explicit + line_char (const char_regex* r) // Assume already pooled. + // + // Steal two bits from the pointer to package line_char type. + // Assume (and statically assert) that char_regex address is a + // multiple of four. 
+ // + : data_ (reinterpret_cast <std::uintptr_t> (r) | + static_cast <std::uintptr_t> (line_type::regex)) {} + + // Provide basic_regex with the ability to use line_char in a context + // where a char value is expected (e.g., as a function argument). + // + // libstdc++ seems to cast special line_chars only (and such a + // conversion is meaningful). + // + // msvcrt casts line_chars of arbitrary types instead. The only + // reasonable strategy is to return a value that differs from any + // other that can be encountered in a regex expression and so will + // unlikely be misinterpreted. + // + operator char () const + { + return type () == line_type::special ? special () : '\a'; // BELL. + } + + // Return true if the character is a syntax (special) one. + // + static bool + syntax (char); + + // Provide basic_regex (such as from msvcrt) with the ability to + // explicitly cast line_chars to implementation-specific numeric + // types (enums, msvcrt's _Uelem, etc). + // + template <typename T> + explicit + operator T () const + { + assert (type () == line_type::special); + return static_cast<T> (special ()); + } + }; + + // Perform "deep" characters comparison (for example match literal + // character with a regex character), rather than just compare them + // literally. At least one argument must be of a type other than regex + // as there is no operator==() defined to compare regexes. Characters + // of the literal type must share the same pool (strings are compared + // by pointers not by values). + // + bool + operator== (const line_char&, const line_char&); + + // Return false if arguments are equal (operator==() returns true). + // Otherwise if types are different return the value implying that + // special < literal < regex. If types are special or literal return + // the result of the respective characters or strings comparison. At + // least one argument must be of a type other than regex as there is no + // operator<() defined to compare regexes. 
+ // + // While not very natural operation for the class we have, we have to + // provide some meaningful semantics for such a comparison as it is + // required by the char_traits<line_char> specialization. While we + // could provide it right in that specialization, let's keep it here + // for basic_regex implementations that potentially can compare + // line_chars as they compare them with expressions of other types (see + // below). + // + bool + operator< (const line_char&, const line_char&); + + inline bool + operator!= (const line_char& l, const line_char& r) + { + return !(l == r); + } + + inline bool + operator<= (const line_char& l, const line_char& r) + { + return l < r || l == r; + } + + // Provide basic_regex (such as from msvcrt) with the ability to + // compare line_char to a value of an integral or + // implementation-specific enum type. In the absence of the following + // template operators, such comparisons would be ambiguous for + // integral types (given that there are implicit conversions + // int->line_char and line_char->char) and impossible for enums. + // + // Note that these == and < operators can succeed only for a line_char + // of the special type. For other types they always return false. That + // in particular leads to the following case: + // + // (lc != c) != (lc < c || c < lc). + // + // Note that we can not assert line_char is of the special type as + // basic_regex (such as from libc++) may need the ability to check if + // arbitrary line_char belongs to some special characters range (like + // ['0', '9']). 
+ // + template <typename T> + struct line_char_cmp + : public std::enable_if<std::is_integral<T>::value || + (std::is_enum<T>::value && + !std::is_same<T, char_flags>::value)> {}; + + template <typename T, typename = typename line_char_cmp<T>::type> + bool + operator== (const line_char& l, const T& r) + { + return l.type () == line_type::special && + static_cast<T> (l.special ()) == r; + } + + template <typename T, typename = typename line_char_cmp<T>::type> + bool + operator== (const T& l, const line_char& r) + { + return r.type () == line_type::special && + static_cast<T> (r.special ()) == l; + } + + template <typename T, typename = typename line_char_cmp<T>::type> + bool + operator!= (const line_char& l, const T& r) + { + return !(l == r); + } + + template <typename T, typename = typename line_char_cmp<T>::type> + bool + operator!= (const T& l, const line_char& r) + { + return !(l == r); + } + + template <typename T, typename = typename line_char_cmp<T>::type> + bool + operator< (const line_char& l, const T& r) + { + return l.type () == line_type::special && + static_cast<T> (l.special ()) < r; + } + + template <typename T, typename = typename line_char_cmp<T>::type> + bool + operator< (const T& l, const line_char& r) + { + return r.type () == line_type::special && + l < static_cast<T> (r.special ()); + } + + template <typename T, typename = typename line_char_cmp<T>::type> + inline bool + operator<= (const line_char& l, const T& r) + { + return l < r || l == r; + } + + template <typename T, typename = typename line_char_cmp<T>::type> + inline bool + operator<= (const T& l, const line_char& r) + { + return l < r || l == r; + } + + using line_string = std::basic_string<line_char>; + + // Locale that has ctype<line_char> facet installed. Used in the + // regex_traits<line_char> specialization (see below). + // + class line_char_locale: public std::locale + { + public: + // Create a copy of the global C++ locale. 
+ // + line_char_locale (); + }; + + // Initialize the script regex global state. Should be called once + // prior to creating objects of types from this namespace. Note: not + // thread-safe. + // + void + init (); + } + } +} + +// Standard template specializations for line_char that are required for the +// basic_regex<line_char> instantiation. +// +namespace std +{ + template <> + class char_traits<build2::script::regex::line_char> + { + public: + using char_type = build2::script::regex::line_char; + using int_type = char_type; + using off_type = char_traits<char>::off_type; + using pos_type = char_traits<char>::pos_type; + using state_type = char_traits<char>::state_type; + + static void + assign (char_type& c1, const char_type& c2) {c1 = c2;} + + static char_type* + assign (char_type*, size_t, char_type); + + // Note that eq() and lt() are not constexpr (as required by C++11) + // because == and < operators for char_type are not constexpr. + // + static bool + eq (const char_type& l, const char_type& r) {return l == r;} + + static bool + lt (const char_type& l, const char_type& r) {return l < r;} + + static char_type* + move (char_type*, const char_type*, size_t); + + static char_type* + copy (char_type*, const char_type*, size_t); + + static int + compare (const char_type*, const char_type*, size_t); + + static size_t + length (const char_type*); + + static const char_type* + find (const char_type*, size_t, const char_type&); + + static constexpr char_type + to_char_type (const int_type& c) {return c;} + + static constexpr int_type + to_int_type (const char_type& c) {return int_type (c);} + + // Note that the following functions are not constexpr (as required by + // C++11) because their return expressions are not constexpr. + // + static bool + eq_int_type (const int_type& l, const int_type& r) {return l == r;} + + static int_type eof () {return char_type::eof;} + + static int_type + not_eof (const int_type& c) + { + return c != char_type::eof ? 
c : char_type::nul; + } + }; + + // ctype<> must be derived from both ctype_base and locale::facet (the latter + // supports ref-counting used by the std::locale implementation internally). + // + // msvcrt for some reason also derives ctype_base from locale::facet which + // produces "already a base-class" warning and effectively breaks the + // reference counting. So we derive from ctype_base only in this case. + // + template <> + class ctype<build2::script::regex::line_char>: public ctype_base +#if !defined(_MSC_VER) || _MSC_VER >= 2000 + , public locale::facet +#endif + { + // Used by the implementation only. + // + using line_type = build2::script::regex::line_type; + + public: + using char_type = build2::script::regex::line_char; + + static locale::id id; + +#if !defined(_MSC_VER) || _MSC_VER >= 2000 + explicit + ctype (size_t refs = 0): locale::facet (refs) {} +#else + explicit + ctype (size_t refs = 0): ctype_base (refs) {} +#endif + + // While unnecessary, let's keep for completeness. + // + virtual + ~ctype () override = default; + + // The C++ standard requires the following functions to call their virtual + // (protected) do_*() counterparts that provide the real implementations. + // The only purpose for this indirection is to provide a user with the + // ability to customize existing (standard) ctype facets. As we do not + // provide such an ability, for simplicity we will omit the do_*() + // functions and provide the implementations directly. This should be safe + // as nobody except us could call those protected functions. + // + bool + is (mask m, char_type c) const + { + return m == + (c.type () == line_type::special && c.special () >= 0 && + build2::digit (static_cast<char> (c.special ())) + ? 
digit + : 0); + } + + const char_type* + is (const char_type*, const char_type*, mask*) const; + + const char_type* + scan_is (mask, const char_type*, const char_type*) const; + + const char_type* + scan_not (mask, const char_type*, const char_type*) const; + + char_type + toupper (char_type c) const {return c;} + + const char_type* + toupper (char_type*, const char_type* e) const {return e;} + + char_type + tolower (char_type c) const {return c;} + + const char_type* + tolower (char_type*, const char_type* e) const {return e;} + + char_type + widen (char c) const {return char_type (c);} + + const char* + widen (const char*, const char*, char_type*) const; + + char + narrow (char_type c, char def) const + { + return c.type () == line_type::special ? c.special () : def; + } + + const char_type* + narrow (const char_type*, const char_type*, char, char*) const; + }; + + // Note: the current application locale must be POSIX. Otherwise the + // behavior is undefined. + // + template <> + class regex_traits<build2::script::regex::line_char> + { + public: + using char_type = build2::script::regex::line_char; + using string_type = build2::script::regex::line_string; + using locale_type = build2::script::regex::line_char_locale; + using char_class_type = regex_traits<char>::char_class_type; + + // Workaround for msvcrt bugs. For some reason it assumes such members + // to be present in a regex_traits specialization. + // +#if defined(_MSC_VER) && _MSC_VER < 2000 + static const ctype_base::mask _Ch_upper = ctype_base::upper; + static const ctype_base::mask _Ch_alpha = ctype_base::alpha; + + // Unsigned numeric type. msvcrt normally casts characters to this type + // for comparing with some numeric values or for calculating an index in + // some bit array. Luckily that all relates to the character class + // handling that we don't support. + // + using _Uelem = unsigned int; +#endif + + regex_traits () = default; // Unnecessary but let's keep for completeness. 
+ + static size_t + length (const char_type* p) {return string_type::traits_type::length (p);} + + char_type + translate (char_type c) const {return c;} + + // Case-insensitive matching is not supported by line_regex. So there is no + // reason for the function to be called. + // + char_type + translate_nocase (char_type c) const {assert (false); return c;} + + // Return a sort-key - the exact copy of [b, e). + // + template <typename I> + string_type + transform (I b, I e) const {return string_type (b, e);} + + // Return a case-insensitive sort-key. Case-insensitive matching is not + // supported by line_regex. So there is no reason for the function to be + // called. + // + template <typename I> + string_type + transform_primary (I b, I e) const + { + assert (false); + return string_type (b, e); + } + + // POSIX regex grammar and collating elements (e.g., [.tilde.]) in + // particular are not supported. So there is no reason for the function to + // be called. + // + template <typename I> + string_type + lookup_collatename (I, I) const {assert (false); return string_type ();} + + // Character classes (e.g., [:lower:]) are not supported. So there is no + // reason for the function to be called. + // + template <typename I> + char_class_type + lookup_classname (I, I, bool = false) const + { + assert (false); + return char_class_type (); + } + + // Return false as we don't support character classes (e.g., [:lower:]). + // + bool + isctype (char_type, char_class_type) const {return false;} + + int + value (char_type, int) const; + + // Return the locale passed as an argument as we do not expect anything + // other than POSIX locale, that we also assume to be imbued by default. + // + locale_type + imbue (locale_type l) {return l;} + + locale_type + getloc () const {return locale_type ();} + }; + + // We assume line_char to be an unsigned type and express that with the + // following specialization used by basic_regex implementations. 
+ // + // libstdc++ defines unsigned CharT type (regex_traits template parameter) + // to use as an index in some internal cache regardless if the cache is used + // for this specialization (and the cache is used only if CharT is char). + // + template <> + struct make_unsigned<build2::script::regex::line_char> + { + using type = build2::script::regex::line_char; + }; + + // When used with libc++ the linker complains that it can't find + // __match_any_but_newline<line_char>::__exec() function. The problem is + // that the function is only specialized for char and wchar_t + // (LLVM bug #31409). As line_char has no notion of the newline character we + // specialize the class template to behave as the __match_any<line_char> + // instantiation does (that luckily has all the functions in place). + // +#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION <= 9000 + template <> + class __match_any_but_newline<build2::script::regex::line_char> + : public __match_any<build2::script::regex::line_char> + { + public: + using base = __match_any<build2::script::regex::line_char>; + using base::base; + }; +#endif +} + +namespace build2 +{ + namespace script + { + namespace regex + { + class line_regex: public std::basic_regex<line_char> + { + public: + using base_type = std::basic_regex<line_char>; + + using base_type::base_type; + + line_regex () = default; + + // Move string regex together with the pool used to create it. + // + line_regex (line_string&& s, line_pool&& p) + // No move-string ctor for base_type, so emulate it. + // + : base_type (s), pool (move (p)) {s.clear ();} + + // Move constructible/assignable-only type. 
+ // + line_regex (line_regex&&) = default; + line_regex (const line_regex&) = delete; + line_regex& operator= (line_regex&&) = default; + line_regex& operator= (const line_regex&) = delete; + + public: + line_pool pool; + }; + } + } +} + +#include <libbuild2/script/regex.ixx> + +#endif // LIBBUILD2_SCRIPT_REGEX_HXX diff --git a/libbuild2/script/regex.ixx b/libbuild2/script/regex.ixx new file mode 100644 index 0000000..e72b578 --- /dev/null +++ b/libbuild2/script/regex.ixx @@ -0,0 +1,31 @@ +// file : libbuild2/script/regex.ixx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +namespace build2 +{ + namespace script + { + namespace regex + { + inline char_flags + operator&= (char_flags& x, char_flags y) + { + return x = static_cast<char_flags> ( + static_cast<uint16_t> (x) & static_cast<uint16_t> (y)); + } + + inline char_flags + operator|= (char_flags& x, char_flags y) + { + return x = static_cast<char_flags> ( + static_cast<uint16_t> (x) | static_cast<uint16_t> (y)); + } + + inline char_flags + operator& (char_flags x, char_flags y) {return x &= y;} + + inline char_flags + operator| (char_flags x, char_flags y) {return x |= y;} + } + } +} diff --git a/libbuild2/script/regex.test.cxx b/libbuild2/script/regex.test.cxx new file mode 100644 index 0000000..36d47e1 --- /dev/null +++ b/libbuild2/script/regex.test.cxx @@ -0,0 +1,303 @@ +// file : libbuild2/script/regex.test.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#include <regex> +#include <type_traits> // is_* + +#include <libbuild2/script/regex.hxx> + +using namespace std; +using namespace build2::script::regex; + +int +main () +{ + using lc = line_char; + using ls = line_string; + using lr = line_regex; + using cf = char_flags; + using cr = char_regex; + + init (); // Initializes the script regex global state. + + // Test line_char. 
+ // + { + static_assert (is_trivial<lc>::value && + is_standard_layout<lc>::value && + !is_array<lc>::value, + "line_char must be char-like"); + + // Zero-initialized line_char should be the null-char as required by + // char_traits<>::length() specification. + // + assert (lc () == lc::nul); + + line_pool p; + + assert (lc::eof == -1); + assert (lc::nul == 0); + + enum meta {mn = 'n', mp = 'p'}; + + // Special roundtrip. + // + assert (lc ('0').special () == '0'); + assert (lc (0).special () == 0); + assert (lc (-1).special () == -1); + assert (lc ('p').special () == 'p'); + assert (lc (u'\u2028').special () == u'\u2028'); + + // Special comparison. + // + assert (lc ('0') == lc ('0')); + assert (lc ('0') == '0'); + assert (lc ('n') == mn); + assert (mn == static_cast<meta> (lc ('n'))); + + assert (lc ('0') != lc ('1')); + assert (lc ('0') != '1'); + assert (lc ('n') != mp); + assert (lc ('0') != lc ("0", p)); + assert (lc ('0') != lc (cr ("0"), p)); + + assert (lc ('0') < lc ('1')); + assert (lc ('0') < '1'); + assert (lc ('1') < lc ("0", p)); + assert (lc ('n') < mp); + + assert (lc ('0') <= '1'); + assert (lc ('0') <= lc ('1')); + assert (lc ('n') <= mn); + assert (lc ('1') <= lc ("0", p)); + + // Literal roundtrip. + // + assert (*lc ("abc", p).literal () == "abc"); + + // Literal comparison. + // + assert (lc ("a", p) == lc ("a", p)); + assert (lc ("a", p).literal () == lc ("a", p).literal ()); + assert (char (lc ("a", p)) == '\a'); + + assert (lc ("a", p) != lc ("b", p)); + assert (!(lc ("a", p) != lc (cr ("a"), p))); + assert (lc ("a", p) != lc (cr ("b"), p)); + + assert (lc ("a", p) < lc ("b", p)); + assert (!(lc ("a", p) < lc (cr ("a"), p))); + + assert (lc ("a", p) <= lc ("b", p)); + assert (lc ("a", p) <= lc (cr ("a"), p)); + assert (lc ("a", p) < lc (cr ("c"), p)); + + // Regex roundtrip. + // + assert (regex_match ("abc", *lc (cr ("abc"), p).regex ())); + + // Regex flags. 
+ // + // icase + // + assert (regex_match ("ABC", cr ("abc", cf::icase))); + + // idot + // + assert (!regex_match ("a", cr ("[.]", cf::idot))); + assert (!regex_match ("a", cr ("[\\.]", cf::idot))); + + assert (regex_match ("a", cr ("."))); + assert (!regex_match ("a", cr (".", cf::idot))); + assert (regex_match ("a", cr ("\\.", cf::idot))); + assert (!regex_match ("a", cr ("\\."))); + + // regex::transform() + // + // The function is static and we can't test it directly. So we will test + // it indirectly via regex matches. + // + // @@ Would be nice to somehow address the inability to test internals (not + // exposed via headers). As a part of utility library support? + // + assert (regex_match (".a[.", cr (".\\.\\[[.]", cf::idot))); + assert (regex_match (".a[.", cr (".\\.\\[[\\.]", cf::idot))); + assert (!regex_match ("ba[.", cr (".\\.\\[[.]", cf::idot))); + assert (!regex_match (".a[b", cr (".\\.\\[[.]", cf::idot))); + assert (!regex_match (".a[b", cr (".\\.\\[[\\.]", cf::idot))); + + // Regex comparison. + // + assert (lc ("a", p) == lc (cr ("a|b"), p)); + assert (lc (cr ("a|b"), p) == lc ("a", p)); + } + + // Test char_traits<line_char>. + // + { + using ct = char_traits<lc>; + using vc = vector<lc>; + + lc c; + ct::assign (c, '0'); + assert (c == ct::char_type ('0')); + + assert (ct::to_char_type (c) == c); + assert (ct::to_int_type (c) == c); + + assert (ct::eq_int_type (c, c)); + assert (!ct::eq_int_type (c, lc::eof)); + + assert (ct::eof () == lc::eof); + + assert (ct::not_eof (c) == c); + assert (ct::not_eof (lc::eof) != lc::eof); + + ct::assign (&c, 1, '1'); + assert (c == ct::int_type ('1')); + + assert (ct::eq (lc ('0'), lc ('0'))); + assert (ct::lt (lc ('0'), lc ('1'))); + + vc v1 ({'0', '1', '2'}); + vc v2 (3, lc::nul); + + assert (ct::find (v1.data (), 3, '1') == v1.data () + 1); + + ct::copy (v2.data (), v1.data (), 3); + assert (v2 == v1); + + v2.push_back (lc::nul); + assert (ct::length (v2.data ()) == 3); + + // Overlapping ranges. 
+ // + ct::move (v1.data () + 1, v1.data (), 2); + assert (v1 == vc ({'0', '0', '1'})); + + v1 = vc ({'0', '1', '2'}); + ct::move (v1.data (), v1.data () + 1, 2); + assert (v1 == vc ({'1', '2', '2'})); + } + + // Test line_char_locale and ctype<line_char> (only non-trivial functions). + // + { + using ct = ctype<lc>; + + line_char_locale l; + + // It is better not to create a facet on stack as it is + // reference-countable. + // + const ct& t (use_facet<ct> (l)); + line_pool p; + + assert (t.is (ct::digit, '0')); + assert (!t.is (ct::digit, '?')); + assert (!t.is (ct::digit, lc ("0", p))); + + const lc chars[] = { '0', '?' }; + ct::mask m[2]; + + const lc* b (chars); + const lc* e (chars + 2); + + // Cast flag value to mask type and compare to mask. + // + auto fl = [] (ct::mask m, ct::mask f) {return m == f;}; + + t.is (b, e, m); + assert (fl (m[0], ct::digit) && fl (m[1], 0)); + + assert (t.scan_is (ct::digit, b, e) == b); + assert (t.scan_is (0, b, e) == b + 1); + + assert (t.scan_not (ct::digit, b, e) == b + 1); + assert (t.scan_not (0, b, e) == b); + + { + char nr[] = "0?"; + lc wd[2]; + t.widen (nr, nr + 2, wd); + assert (wd[0] == b[0] && wd[1] == b[1]); + } + + { + lc wd[] = {'0', lc ("a", p)}; + char nr[2]; + t.narrow (wd, wd + 2, '-', nr); + assert (nr[0] == '0' && nr[1] == '-'); + } + } + + // Test regex_traits<line_char>. Functions other than value() are trivial. + // + { + regex_traits<lc> t; + + const int radix[] = {8, 10}; // Radix 16 is not supported by line_char. + const char digits[] = "0123456789ABCDEF"; + + for (size_t r (0); r < 2; ++r) + { + for (int i (0); i < radix[r]; ++i) + assert (t.value (digits[i], radix[r]) == i); + } + } + + // Test line_regex construction. 
+ // + { + line_pool p; + lr r1 ({lc ("foo", p), lc (cr ("ba(r|z)"), p)}, move (p)); + + lr r2 (move (r1)); + assert (regex_match (ls ({lc ("foo", r2.pool), lc ("bar", r2.pool)}), r2)); + assert (!regex_match (ls ({lc ("foo", r2.pool), lc ("ba", r2.pool)}), r2)); + } + + // Test line_regex match. + // + { + line_pool p; + + const lc foo ("foo", p); + const lc bar ("bar", p); + const lc baz ("baz", p); + const lc blank ("", p); + + assert (regex_match (ls ({foo, bar}), lr ({foo, bar}))); + assert (!regex_match (ls ({foo, baz}), lr ({foo, bar}))); + + assert (regex_match (ls ({bar, foo}), + lr ({'(', foo, '|', bar, ')', '+'}))); + + assert (regex_match (ls ({foo, foo, bar}), + lr ({'(', foo, ')', '\\', '1', bar}))); + + assert (regex_match (ls ({foo}), lr ({lc (cr ("fo+"), p)}))); + assert (regex_match (ls ({foo}), lr ({lc (cr (".*"), p)}))); + assert (regex_match (ls ({blank}), lr ({lc (cr (".*"), p)}))); + + assert (regex_match (ls ({blank, blank, foo}), + lr ({blank, '*', foo, blank, '*'}))); + + assert (regex_match (ls ({blank, blank, foo}), lr ({'.', '*'}))); + + assert (regex_match (ls ({blank, blank}), + lr ({blank, '*', foo, '?', blank, '*'}))); + + assert (regex_match (ls ({foo}), lr ({foo, '{', '1', '}'}))); + assert (regex_match (ls ({foo, foo}), lr ({foo, '{', '1', ',', '}'}))); + + assert (regex_match (ls ({foo, foo}), + lr ({foo, '{', '1', ',', '2', '}'}))); + + assert (!regex_match (ls ({foo, foo}), + lr ({foo, '{', '3', ',', '4', '}'}))); + + assert (regex_match (ls ({foo}), lr ({'(', '?', '=', foo, ')', foo}))); + assert (regex_match (ls ({foo}), lr ({'(', '?', '!', bar, ')', foo}))); + } +} diff --git a/libbuild2/script/run.cxx b/libbuild2/script/run.cxx new file mode 100644 index 0000000..38436b9 --- /dev/null +++ b/libbuild2/script/run.cxx @@ -0,0 +1,2020 @@ +// file : libbuild2/script/run.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#include <libbuild2/script/run.hxx> + +#include <ios> // streamsize + +#include 
<libbutl/regex.mxx> +#include <libbutl/builtin.mxx> +#include <libbutl/fdstream.mxx> // fdopen_mode, fddup() +#include <libbutl/filesystem.mxx> // path_search() +#include <libbutl/path-pattern.mxx> + +#include <libbuild2/filesystem.hxx> +#include <libbuild2/diagnostics.hxx> + +#include <libbuild2/script/regex.hxx> +#include <libbuild2/script/builtin-options.hxx> + +using namespace std; +using namespace butl; + +namespace build2 +{ + namespace script + { + string + diag_path (const path& d) + { + string r ("'"); + + r += stream_verb_map ().path < 1 + ? diag_relative (d) + : d.representation (); + + r += '\''; + return r; + } + + string + diag_path (const dir_name_view& dn) + { + string r; + if (dn.name != nullptr && *dn.name) + { + r += **dn.name; + r += ' '; + } + + assert (dn.path != nullptr); + + r += diag_path (*dn.path); + return r; + } + + // Return the environment temporary directory, creating it if it doesn't + // exist. + // + static inline const dir_path& + temp_dir (environment& env) + { + if (env.temp_dir.empty ()) + env.create_temp_dir (); + + return env.temp_dir; + } + + // Normalize a path. Also make the relative path absolute using the + // specified directory unless it is already absolute. + // + static path + normalize (path p, const dir_path& d, const location& l) + { + path r (p.absolute () ? move (p) : d / move (p)); + + try + { + r.normalize (); + } + catch (const invalid_path& e) + { + fail (l) << "invalid file path " << e.path; + } + + return r; + } + + // Check if a path is not empty, the referenced file exists and is not + // empty. + // + static bool + non_empty (const path& p, const location& ll) + { + if (p.empty () || !exists (p)) + return false; + + try + { + ifdstream is (p); + return is.peek () != ifdstream::traits_type::eof (); + } + catch (const io_error& e) + { + // While there can be no fault of the script command being currently + // executed let's add the location anyway to ease the + // troubleshooting. 
And let's stick to that principle down the road. + // + fail (ll) << "unable to read " << p << ": " << e << endf; + } + } + + // If the file exists, not empty and not larger than 4KB print it to the + // diag record. The file content goes from the new line and is not + // indented. + // + static void + print_file (diag_record& d, const path& p, const location& ll) + { + if (exists (p)) + { + try + { + ifdstream is (p, ifdstream::badbit); + + if (is.peek () != ifdstream::traits_type::eof ()) + { + char buf[4096 + 1]; // Extra byte is for terminating '\0'. + + // Note that the string is always '\0'-terminated with a maximum + // sizeof (buf) - 1 bytes read. + // + is.getline (buf, sizeof (buf), '\0'); + + // Print if the file fits 4KB-size buffer. Note that if it + // doesn't the failbit is set. + // + if (is.eof ()) + { + // Suppress the trailing newline character as the diag record + // adds its own one when flushed. + // + streamsize n (is.gcount ()); + assert (n > 0); + + // Note that if the file contains '\0' it will also be counted + // by gcount(). But even in the worst case we will stay in the + // buffer boundaries (and so not crash). + // + if (buf[n - 1] == '\n') + buf[n - 1] = '\0'; + + d << '\n' << buf; + } + } + } + catch (const io_error& e) + { + fail (ll) << "unable to read " << p << ": " << e; + } + } + } + + // Save a string to the file. Fail if exception is thrown by underlying + // operations. + // + static void + save (const path& p, const string& s, const location& ll) + { + try + { + ofdstream os (p); + os << s; + os.close (); + } + catch (const io_error& e) + { + fail (ll) << "unable to write to " << p << ": " << e; + } + } + + // Transform string according to here-* redirect modifiers from the {/} + // set. + // + static string + transform (const string& s, + bool regex, + const string& modifiers, + environment& env) + { + if (modifiers.find ('/') == string::npos) + return s; + + // For targets other than Windows leave the string intact. 
+ // + if (env.host.class_ != "windows") + return s; + + // Convert forward slashes to Windows path separators (escape for + // regex). + // + string r; + for (size_t p (0);;) + { + size_t sp (s.find ('/', p)); + + if (sp != string::npos) + { + r.append (s, p, sp - p); + r.append (regex ? "\\\\" : "\\"); + p = sp + 1; + } + else + { + r.append (s, p, sp); + break; + } + } + + return r; + } + + // Return true if the script temporary directory is not created yet (and + // so cannot contain any path), a path is not under the temporary + // directory or this directory will not be removed on failure. + // + static inline bool + avail_on_failure (const path& p, const environment& env) + { + return env.temp_dir.empty () || + env.temp_dir_keep || + !p.sub (env.temp_dir); + } + + // Check if the script command output matches the expected result + // (redirect value). Noop for redirect types other than none, here_*. + // + static bool + check_output (const path& pr, + const path& op, + const path& ip, + const redirect& rd, + const location& ll, + environment& env, + bool diag, + const char* what) + { + auto input_info = [&ip, &ll, &env] (diag_record& d) + { + if (non_empty (ip, ll) && avail_on_failure (ip, env)) + d << info << "stdin: " << ip; + }; + + auto output_info = [&what, &ll, &env] (diag_record& d, + const path& p, + const char* prefix = "", + const char* suffix = "") + { + if (non_empty (p, ll)) + { + if (avail_on_failure (p, env)) + d << info << prefix << what << suffix << ": " << p; + } + else + d << info << prefix << what << suffix << " is empty"; + }; + + if (rd.type == redirect_type::none) + { + // Check that there is no output produced. + // + assert (!op.empty ()); + + if (!non_empty (op, ll)) + return true; + + if (diag) + { + diag_record d (error (ll)); + d << pr << " unexpectedly writes to " << what; + + if (avail_on_failure (op, env)) + d << info << what << ": " << op; + + input_info (d); + + // Print cached output. 
+ // + print_file (d, op, ll); + } + + // Fall through (to return false). + // + } + else if (rd.type == redirect_type::here_str_literal || + rd.type == redirect_type::here_doc_literal || + (rd.type == redirect_type::file && + rd.file.mode == redirect_fmode::compare)) + { + // The expected output is provided as a file or as a string. Save the + // string to a file in the later case. + // + assert (!op.empty ()); + + path eop; + + if (rd.type == redirect_type::file) + eop = normalize (rd.file.path, *env.work_dir.path, ll); + else + { + eop = path (op + ".orig"); + + save (eop, + transform (rd.str, false /* regex */, rd.modifiers (), env), + ll); + + env.clean_special (eop); + } + + // Use the diff utility for comparison. + // + path dp ("diff"); + process_path pp (run_search (dp, true)); + + cstrings args {pp.recall_string ()}; + + // If both files being compared won't be available on failure, then + // instruct diff not to print the file paths. It seems that the only + // way to achieve this is to abandon the output unified format in the + // favor of the minimal output, which normally is still informative + // enough for the troubleshooting (contains the difference line + // numbers, etc). + // + if (avail_on_failure (eop, env) || avail_on_failure (op, env)) + args.push_back ("-u"); + + // Ignore Windows newline fluff if that's what we are running on. + // + if (env.host.class_ == "windows") + args.push_back ("--strip-trailing-cr"); + + args.push_back (eop.string ().c_str ()); + args.push_back (op.string ().c_str ()); + args.push_back (nullptr); + + if (verb >= 2) + print_process (args); + + try + { + // Save diff's stdout to a file for troubleshooting and for the + // optional (if not too large) printing (at the end of + // diagnostics). 
+ // + path ep (op + ".diff"); + auto_fd efd; + + try + { + efd = fdopen (ep, fdopen_mode::out | fdopen_mode::create); + env.clean_special (ep); + } + catch (const io_error& e) + { + fail (ll) << "unable to write to " << ep << ": " << e; + } + + // Diff utility prints the differences to stdout. But for the + // user it is a part of the script failure diagnostics so let's + // redirect stdout to stderr. + // + process p (pp, args.data (), 0, 2, efd.get ()); + efd.reset (); + + if (p.wait ()) + return true; + + assert (p.exit); + const process_exit& pe (*p.exit); + + // Note that both POSIX and GNU diff report error by exiting with + // the code > 1. + // + if (!pe.normal () || pe.code () > 1) + { + diag_record d (fail (ll)); + print_process (d, args); + d << " " << pe; + + print_file (d, ep, ll); + } + + // Output doesn't match the expected result. + // + if (diag) + { + diag_record d (error (ll)); + d << pr << " " << what << " doesn't match expected"; + + output_info (d, op); + output_info (d, eop, "expected "); + output_info (d, ep, "", " diff"); + input_info (d); + + print_file (d, ep, ll); + } + + // Fall through (to return false). + // + } + catch (const process_error& e) + { + error (ll) << "unable to execute " << pp << ": " << e; + + if (e.child) + exit (1); + + throw failed (); + } + } + else if (rd.type == redirect_type::here_str_regex || + rd.type == redirect_type::here_doc_regex) + { + // The overall plan is: + // + // 1. Create regex line string. While creating it's line characters + // transform regex lines according to the redirect modifiers. + // + // 2. Create line regex using the line string. If creation fails + // then save the (transformed) regex redirect to a file for + // troubleshooting. + // + // 3. Parse the output into the literal line string. + // + // 4. Match the output line string with the line regex. + // + // 5. If match fails save the (transformed) regex redirect to a file + // for troubleshooting. 
+ // + using namespace regex; + + assert (!op.empty ()); + + // Create regex line string. + // + line_pool pool; + line_string rls; + const regex_lines rl (rd.regex); + + // Parse regex flags. + // + // When add support for new flags don't forget to update + // parse_regex(). + // + auto parse_flags = [] (const string& f) -> char_flags + { + char_flags r (char_flags::none); + + for (char c: f) + { + switch (c) + { + case 'd': r |= char_flags::idot; break; + case 'i': r |= char_flags::icase; break; + default: assert (false); // Error so should have been checked. + } + } + + return r; + }; + + // Return original regex line with the transformation applied. + // + auto line = [&rl, &rd, &env] (const regex_line& l) -> string + { + string r; + if (l.regex) // Regex (possibly empty), + { + r += rl.intro; + r += transform (l.value, true /* regex */, rd.modifiers (), env); + r += rl.intro; + r += l.flags; + } + else if (!l.special.empty ()) // Special literal. + r += rl.intro; + else // Textual literal. + r += transform (l.value, false /* regex */, rd.modifiers (), env); + + r += l.special; + return r; + }; + + // Return regex line location. + // + // Note that we rely on the fact that the command and regex lines + // are always belong to the same file. + // + auto loc = [&ll] (uint64_t line, uint64_t column) -> location + { + location r (ll); + r.line = line; + r.column = column; + return r; + }; + + // Save the regex to file for troubleshooting, return the file path + // it have been saved to. + // + // Note that we save the regex on line regex creation failure or if + // the program output doesn't match. + // + auto save_regex = [&op, &rl, &rd, &ll, &line] () -> path + { + path rp (op + ".regex"); + + // Encode here-document regex global flags if present as a file + // name suffix. 
For example if icase and idot flags are specified + // the name will look like: + // + // stdout.regex-di + // + if (rd.type == redirect_type::here_doc_regex && !rl.flags.empty ()) + rp += '-' + rl.flags; + + // Note that if would be more efficient to directly write chunks + // to file rather than to compose a string first. Hower we don't + // bother (about performance) for the sake of the code as we + // already failed. + // + string s; + for (auto b (rl.lines.cbegin ()), i (b), e (rl.lines.cend ()); + i != e; ++i) + { + if (i != b) s += '\n'; + s += line (*i); + } + + save (rp, s, ll); + return rp; + }; + + // Finally create regex line string. + // + // Note that diagnostics doesn't refer to the program path as it is + // irrelevant to failures at this stage. + // + char_flags gf (parse_flags (rl.flags)); // Regex global flags. + + for (const auto& l: rl.lines) + { + if (l.regex) // Regex (with optional special characters). + { + line_char c; + + // Empty regex is a special case repesenting the blank line. + // + if (l.value.empty ()) + c = line_char ("", pool); + else + { + try + { + string s (transform (l.value, + true /* regex */, + rd.modifiers (), + env)); + + c = line_char ( + char_regex (s, gf | parse_flags (l.flags)), pool); + } + catch (const regex_error& e) + { + // Print regex_error description if meaningful. + // + diag_record d (fail (loc (l.line, l.column))); + + if (rd.type == redirect_type::here_str_regex) + d << "invalid " << what << " regex redirect" << e << + info << "regex: '" << line (l) << "'"; + else + d << "invalid char-regex in " << what << " regex redirect" + << e << + info << "regex line: '" << line (l) << "'"; + + d << endf; + } + } + + rls += c; // Append blank literal or regex line char. + } + else if (!l.special.empty ()) // Special literal. + { + // Literal can not be followed by special characters in the same + // line. + // + assert (l.value.empty ()); + } + else // Textual literal. + { + // Append literal line char. 
+ // + rls += line_char (transform (l.value, + false /* regex */, + rd.modifiers (), + env), + pool); + } + + for (char c: l.special) + { + if (line_char::syntax (c)) + rls += line_char (c); // Append special line char. + else + fail (loc (l.line, l.column)) + << "invalid syntax character '" << c << "' in " << what + << " regex redirect" << + info << "regex line: '" << line (l) << "'"; + } + } + + // Create line regex. + // + line_regex regex; + + try + { + regex = line_regex (move (rls), move (pool)); + } + catch (const regex_error& e) + { + // Note that line regex creation can not fail for here-string + // redirect as it doesn't have syntax line chars. That in + // particular means that end_line and end_column are meaningful. + // + assert (rd.type == redirect_type::here_doc_regex); + + diag_record d (fail (loc (rd.end_line, rd.end_column))); + + // Print regex_error description if meaningful. + // + d << "invalid " << what << " regex redirect" << e; + + // It would be a waste to save the regex into the file just to + // remove it. + // + if (env.temp_dir_keep) + output_info (d, save_regex (), "", " regex"); + } + + // Parse the output into the literal line string. + // + line_string ls; + + try + { + // Do not throw when eofbit is set (end of stream reached), and + // when failbit is set (getline() failed to extract any character). + // + // Note that newlines are treated as line-chars separators. That + // in particular means that the trailing newline produces a blank + // line-char (empty literal). Empty output produces the zero-length + // line-string. + // + // Also note that we strip the trailing CR characters (otherwise + // can mismatch when, for example, cross-testing). + // + ifdstream is (op, ifdstream::badbit); + is.peek (); // Sets eofbit for an empty stream. 
+ + while (!is.eof ()) + { + string s; + getline (is, s); + + // It is safer to strip CRs in cycle, as msvcrt unexplainably + // adds too much trailing junk to the system_error descriptions, + // and so it can appear in programs output. For example: + // + // ...: Invalid data.\r\r\n + // + // Note that our custom operator<<(ostream&, const exception&) + // removes this junk. + // + while (!s.empty () && s.back () == '\r') + s.pop_back (); + + ls += line_char (move (s), regex.pool); + } + } + catch (const io_error& e) + { + fail (ll) << "unable to read " << op << ": " << e; + } + + // Match the output with the regex. + // + if (regex_match (ls, regex)) // Doesn't throw. + return true; + + // Output doesn't match the regex. + // + // Unless the temporary directory is removed on failure, we save the + // regex to file for troubleshooting regardless of whether we print + // the diagnostics or not. We, however, register it for cleanup in the + // later case (the expression may still succeed, we can be evaluating + // the if condition, etc). + // + optional<path> rp; + if (env.temp_dir_keep) + rp = save_regex (); + + if (diag) + { + diag_record d (error (ll)); + d << pr << " " << what << " doesn't match regex"; + + output_info (d, op); + + if (rp) + output_info (d, *rp, "", " regex"); + + input_info (d); + + // Print cached output. + // + print_file (d, op, ll); + } + else if (rp) + env.clean_special (*rp); + + // Fall through (to return false). + // + } + else // Noop. + return true; + + return false; + } + + // The exit pseudo-builtin: exit the script successfully, or print the + // diagnostics and exit the script unsuccessfully. Always throw exit + // exception. + // + // exit [<diagnostics>] + // + [[noreturn]] static void + exit_builtin (const strings& args, const location& ll) + { + auto i (args.begin ()); + auto e (args.end ()); + + // Process arguments. + // + // If no argument is specified, then exit successfully. 
Otherwise, + // print the diagnostics and exit unsuccessfully. + // + if (i == e) + throw exit (true); + + const string& s (*i++); + + if (i != e) + fail (ll) << "unexpected argument '" << *i << "'"; + + error (ll) << s; + throw exit (false); + } + + // The set pseudo-builtin: set variable from the stdin input. + // + // set [-e|--exact] [(-n|--newline)|(-w|--whitespace)] [<attr>] <var> + // + static void + set_builtin (environment& env, + const strings& args, + auto_fd in, + const location& ll) + { + try + { + // Do not throw when eofbit is set (end of stream reached), and + // when failbit is set (read operation failed to extract any + // character). + // + ifdstream cin (move (in), ifdstream::badbit); + + // Parse arguments. + // + cli::vector_scanner scan (args); + set_options ops (scan); + + if (ops.whitespace () && ops.newline ()) + fail (ll) << "both -n|--newline and -w|--whitespace specified"; + + if (!scan.more ()) + fail (ll) << "missing variable name"; + + string a (scan.next ()); // Either attributes or variable name. + const string* ats (!scan.more () ? nullptr : &a); + string vname (!scan.more () ? move (a) : scan.next ()); + + if (scan.more ()) + fail (ll) << "unexpected argument '" << scan.next () << "'"; + + if (ats != nullptr && ats->empty ()) + fail (ll) << "empty variable attributes"; + + if (vname.empty ()) + fail (ll) << "empty variable name"; + + // Read the input. + // + cin.peek (); // Sets eofbit for an empty stream. + + names ns; + while (!cin.eof ()) + { + // Read next element that depends on the whitespace mode being + // enabled or not. For the later case it also make sense to strip + // the trailing CRs that can appear while, for example, + // cross-testing Windows target or as a part of msvcrt junk + // production (see above). 
+ // + string s; + if (ops.whitespace ()) + cin >> s; + else + { + getline (cin, s); + + while (!s.empty () && s.back () == '\r') + s.pop_back (); + } + + // If failbit is set then we read nothing into the string as eof is + // reached. That in particular means that the stream has trailing + // whitespaces (possibly including newlines) if the whitespace mode + // is enabled, or the trailing newline otherwise. If so then + // we append the "blank" to the variable value in the exact mode + // prior to bailing out. + // + if (cin.fail ()) + { + if (ops.exact ()) + { + if (ops.whitespace () || ops.newline ()) + ns.emplace_back (move (s)); // Reuse empty string. + else if (ns.empty ()) + ns.emplace_back ("\n"); + else + ns[0].value += '\n'; + } + + break; + } + + if (ops.whitespace () || ops.newline () || ns.empty ()) + ns.emplace_back (move (s)); + else + { + ns[0].value += '\n'; + ns[0].value += s; + } + } + + cin.close (); + + env.set_variable (move (vname), + move (ns), + ats != nullptr ? *ats : empty_string, + ll); + } + catch (const io_error& e) + { + fail (ll) << "set: " << e; + } + catch (const cli::exception& e) + { + fail (ll) << "set: " << e; + } + } + + // Sorted array of builtins that support filesystem entries cleanup. + // + static const char* cleanup_builtins[] = { + "cp", "ln", "mkdir", "mv", "touch"}; + + static inline bool + cleanup_builtin (const string& name) + { + return binary_search ( + cleanup_builtins, + cleanup_builtins + + sizeof (cleanup_builtins) / sizeof (*cleanup_builtins), + name); + } + + static bool + run_pipe (environment& env, + command_pipe::const_iterator bc, + command_pipe::const_iterator ec, + auto_fd ifd, + size_t ci, size_t li, const location& ll, + bool diag) + { + if (bc == ec) // End of the pipeline. 
+ return true; + + // The overall plan is to run the first command in the pipe, reading + // its input from the file descriptor passed (or, for the first + // command, according to stdin redirect specification) and redirecting + // its output to the right-hand part of the pipe recursively. Fail if + // the right-hand part fails. Otherwise check the process exit code, + // match stderr (and stdout for the last command in the pipe) according + // to redirect specification(s) and fail if any of the above fails. + // + const command& c (*bc); + + // Register the command explicit cleanups. Verify that the path being + // cleaned up is a sub-path of the script working directory. Fail if + // this is not the case. + // + for (const auto& cl: c.cleanups) + { + const path& p (cl.path); + path np (normalize (p, *env.work_dir.path, ll)); + + const string& ls (np.leaf ().string ()); + bool wc (ls == "*" || ls == "**" || ls == "***"); + const path& cp (wc ? np.directory () : np); + const dir_path* sd (env.sandbox_dir.path); + + if (sd != nullptr && !cp.sub (*sd)) + fail (ll) << (wc ? "wildcard" : + p.to_directory () ? "directory" : + "file") + << " cleanup " << p << " is out of " + << diag_path (env.sandbox_dir); + + env.clean ({cl.type, move (np)}, false); + } + + bool eq (c.exit.comparison == exit_comparison::eq); + + // If stdin file descriptor is not open then this is the first pipeline + // command. + // + bool first (ifd.get () == -1); + + command_pipe::const_iterator nc (bc + 1); + bool last (nc == ec); + + const string& program (c.program.string ()); + + const redirect& in ((c.in ? *c.in : env.in).effective ()); + + const redirect* out (!last + ? nullptr // stdout is piped. + : &(c.out ? *c.out : env.out).effective ()); + + const redirect& err ((c.err ? 
*c.err : env.err).effective ()); + + auto process_args = [&c] () -> cstrings + { + cstrings args {c.program.string ().c_str ()}; + + for (const auto& a: c.arguments) + args.push_back (a.c_str ()); + + args.push_back (nullptr); + return args; + }; + + // Prior to opening file descriptors for command input/output + // redirects let's check if the command is the exit builtin. Being a + // builtin syntactically it differs from the regular ones in a number + // of ways. It doesn't communicate with standard streams, so + // redirecting them is meaningless. It may appear only as a single + // command in a pipeline. It doesn't return any value and stops the + // script execution, so checking its exit status is meaningless as + // well. That all means we can short-circuit here calling the builtin + // and bailing out right after that. Checking that the user didn't + // specify any redirects or exit code check sounds like a right thing + // to do. + // + if (program == "exit") + { + // In case the builtin is erroneously pipelined from the other + // command, we will close stdin gracefully (reading out the stream + // content), to make sure that the command doesn't print any + // unwanted diagnostics about IO operation failure. + // + // Note that dtor will ignore any errors (which is what we want). + // + ifdstream is (move (ifd), fdstream_mode::skip); + + if (!first || !last) + fail (ll) << "exit builtin must be the only pipe command"; + + if (c.in) + fail (ll) << "exit builtin stdin cannot be redirected"; + + if (c.out) + fail (ll) << "exit builtin stdout cannot be redirected"; + + if (c.err) + fail (ll) << "exit builtin stderr cannot be redirected"; + + // We can't make sure that there is no exit code check. Let's, at + // least, check that non-zero code is not expected. 
+ // + if (eq != (c.exit.code == 0)) + fail (ll) << "exit builtin exit code cannot be non-zero"; + + if (verb >= 2) + print_process (process_args ()); + + exit_builtin (c.arguments, ll); // Throws exit exception. + } + + // Create a unique path for a command standard stream cache file. + // + auto std_path = [&env, &ci, &li, &ll] (const char* n) -> path + { + using std::to_string; + + path p (n); + + // 0 if belongs to a single-line script, otherwise is the command line + // number (start from one) in the script. + // + if (li > 0) + p += "-" + to_string (li); + + // 0 if belongs to a single-command expression, otherwise is the + // command number (start from one) in the expression. + // + // Note that the name like stdin-N can relate to N-th command of a + // single-line script or to N-th single-command line of multi-line + // script. These cases are mutually exclusive and so are unambiguous. + // + if (ci > 0) + p += "-" + to_string (ci); + + return normalize (move (p), temp_dir (env), ll); + }; + + // If this is the first pipeline command, then open stdin descriptor + // according to the redirect specified. + // + path isp; + + if (!first) + assert (!c.in); // No redirect expected. + else + { + // Open a file for passing to the command stdin. + // + auto open_stdin = [&isp, &ifd, &ll] () + { + assert (!isp.empty ()); + + try + { + ifd = fdopen (isp, fdopen_mode::in); + } + catch (const io_error& e) + { + fail (ll) << "unable to read " << isp << ": " << e; + } + }; + + switch (in.type) + { + case redirect_type::pass: + { + try + { + ifd = fddup (0); + } + catch (const io_error& e) + { + fail (ll) << "unable to duplicate stdin: " << e; + } + + break; + } + case redirect_type::none: + // Somehow need to make sure that the child process doesn't read + // from stdin. That is tricky to do in a portable way. 
Here we + // suppose that the program which (erroneously) tries to read some + // data from stdin being redirected to /dev/null fails not being + // able to read the expected data, and so the command doesn't pass + // through. + // + // @@ Obviously doesn't cover the case when the process reads + // whatever available. + // @@ Another approach could be not to redirect stdin and let the + // process to hang which can be interpreted as a command failure. + // @@ Both ways are quite ugly. Is there some better way to do + // this? + // + // Fall through. + // + case redirect_type::null: + { + ifd = open_null (); + break; + } + case redirect_type::file: + { + isp = normalize (in.file.path, *env.work_dir.path, ll); + + open_stdin (); + break; + } + case redirect_type::here_str_literal: + case redirect_type::here_doc_literal: + { + // We could write to the command stdin directly but instead will + // cache the data for potential troubleshooting. + // + isp = std_path ("stdin"); + + save (isp, + transform (in.str, false /* regex */, in.modifiers (), env), + ll); + + env.clean_special (isp); + + open_stdin (); + break; + } + case redirect_type::trace: + case redirect_type::merge: + case redirect_type::here_str_regex: + case redirect_type::here_doc_regex: + case redirect_type::here_doc_ref: assert (false); break; + } + } + + assert (ifd.get () != -1); + + // Prior to opening file descriptors for command outputs redirects + // let's check if the command is the set builtin. Being a builtin + // syntactically it differs from the regular ones in a number of ways. + // It either succeeds or terminates abnormally, so redirecting stderr + // is meaningless. It also never produces any output and may appear + // only as a terminal command in a pipeline. That means we can + // short-circuit here calling the builtin and returning right after + // that. Checking that the user didn't specify any meaningless + // redirects or exit code check sounds as a right thing to do. 
+ // + if (program == "set") + { + if (!last) + fail (ll) << "set builtin must be the last pipe command"; + + if (c.out) + fail (ll) << "set builtin stdout cannot be redirected"; + + if (c.err) + fail (ll) << "set builtin stderr cannot be redirected"; + + if (eq != (c.exit.code == 0)) + fail (ll) << "set builtin exit code cannot be non-zero"; + + if (verb >= 2) + print_process (process_args ()); + + set_builtin (env, c.arguments, move (ifd), ll); + return true; + } + + // Open a file for command output redirect if requested explicitly + // (file overwrite/append redirects) or for the purpose of the output + // validation (none, here_*, file comparison redirects), register the + // file for cleanup, return the file descriptor. Interpret trace + // redirect according to the verbosity level (as null if below 2, as + // pass otherwise). Return nullfd, standard stream descriptor duplicate + // or null-device descriptor for merge, pass or null redirects + // respectively (not opening any file). + // + auto open = [&env, &ll, &std_path] (const redirect& r, + int dfd, + path& p) -> auto_fd + { + assert (dfd == 1 || dfd == 2); + const char* what (dfd == 1 ? "stdout" : "stderr"); + + fdopen_mode m (fdopen_mode::out | fdopen_mode::create); + + redirect_type rt (r.type != redirect_type::trace + ? r.type + : verb < 2 + ? redirect_type::null + : redirect_type::pass); + switch (rt) + { + case redirect_type::pass: + { + try + { + return fddup (dfd); + } + catch (const io_error& e) + { + fail (ll) << "unable to duplicate " << what << ": " << e; + } + } + + case redirect_type::null: return open_null (); + + // Duplicate the paired file descriptor later. + // + case redirect_type::merge: return nullfd; + + case redirect_type::file: + { + // For the cmp mode the user-provided path refers a content to + // match against, rather than a content to be produced (as for + // overwrite and append modes). And so for cmp mode we redirect + // the process output to a temporary file. 
+ // + p = r.file.mode == redirect_fmode::compare + ? std_path (what) + : normalize (r.file.path, *env.work_dir.path, ll); + + m |= r.file.mode == redirect_fmode::append + ? fdopen_mode::at_end + : fdopen_mode::truncate; + + break; + } + + case redirect_type::none: + case redirect_type::here_str_literal: + case redirect_type::here_doc_literal: + case redirect_type::here_str_regex: + case redirect_type::here_doc_regex: + { + p = std_path (what); + m |= fdopen_mode::truncate; + break; + } + + case redirect_type::trace: + case redirect_type::here_doc_ref: assert (false); break; + } + + auto_fd fd; + + try + { + fd = fdopen (p, m); + + if ((m & fdopen_mode::at_end) != fdopen_mode::at_end) + { + if (rt == redirect_type::file) + env.clean ({cleanup_type::always, p}, true); + else + env.clean_special (p); + } + } + catch (const io_error& e) + { + fail (ll) << "unable to write to " << p << ": " << e; + } + + return fd; + }; + + path osp; + fdpipe ofd; + + // If this is the last command in the pipeline than redirect the + // command process stdout to a file. Otherwise create a pipe and + // redirect the stdout to the write-end of the pipe. The read-end will + // be passed as stdin for the next command in the pipeline. + // + // @@ Shouldn't we allow the here-* and file output redirects for a + // command with pipelined output? Say if such redirect is present + // then the process output is redirected to a file first (as it is + // when no output pipelined), and only after the process exit code + // and the output are validated the next command in the pipeline is + // executed taking the file as an input. This could be usefull for + // script failures investigation and, for example, for validation + // "tightening". + // + if (last) + ofd.out = open (*out, 1, osp); + else + { + assert (!c.out); // No redirect expected. + ofd = open_pipe (); + } + + path esp; + auto_fd efd (open (err, 2, esp)); + + // Merge standard streams. 
+ // + bool mo (out != nullptr && out->type == redirect_type::merge); + if (mo || err.type == redirect_type::merge) + { + auto_fd& self (mo ? ofd.out : efd); + auto_fd& other (mo ? efd : ofd.out); + + try + { + assert (self.get () == -1 && other.get () != -1); + self = fddup (other.get ()); + } + catch (const io_error& e) + { + fail (ll) << "unable to duplicate " << (mo ? "stderr" : "stdout") + << ": " << e; + } + } + + // All descriptors should be open to the date. + // + assert (ofd.out.get () != -1 && efd.get () != -1); + + optional<process_exit> exit; + builtin_function* bf (builtins.find (program)); + + bool success; + + if (bf != nullptr) + { + // Execute the builtin. + // + if (verb >= 2) + print_process (process_args ()); + + // Some of the script builtins (cp, mkdir, etc) extend libbutl + // builtins (via callbacks) registering/moving cleanups for the + // filesystem entries they create/move, unless explicitly requested + // not to do so via the --no-cleanup option. + // + // Let's "wrap up" the cleanup-related flags into the single object + // to rely on "small function object" optimization. + // + struct cleanup + { + // Whether the cleanups are enabled for the builtin. Can be set to + // false by the parse_option callback if --no-cleanup is + // encountered. + // + bool enabled = true; + + // Whether to register cleanup for a filesystem entry being + // created/updated depending on its existence. Calculated by the + // create pre-hook and used by the subsequent post-hook. + // + bool add; + + // Whether to move existing cleanups for the filesystem entry + // being moved, rather than to erase them. Calculated by the move + // pre-hook and used by the subsequent post-hook. + // + bool move; + }; + + // nullopt if the builtin doesn't support cleanups. 
+ // + optional<cleanup> cln; + + if (cleanup_builtin (program)) + cln = cleanup (); + + builtin_callbacks bcs { + + // create + // + // Unless cleanups are suppressed, test that the filesystem entry + // doesn't exist (pre-hook) and, if that's the case, register the + // cleanup for the newly created filesystem entry (post-hook). + // + [&env, &cln] (const path& p, bool pre) + { + // Cleanups must be supported by a filesystem entry-creating + // builtin. + // + assert (cln); + + if (cln->enabled) + { + if (pre) + cln->add = !butl::entry_exists (p); + else if (cln->add) + env.clean ({cleanup_type::always, p}, true /* implicit */); + } + }, + + // move + // + // Validate the source and destination paths (pre-hook) and, + // unless suppressed, adjust the cleanups that are sub-paths of + // the source path (post-hook). + // + [&env, &cln] (const path& from, const path& to, bool force, bool pre) + { + // Cleanups must be supported by a filesystem entry-moving + // builtin. + // + assert (cln); + + if (pre) + { + const dir_path& wd (*env.work_dir.path); + const dir_path* sd (env.sandbox_dir.path); + + auto fail = [] (const string& d) {throw runtime_error (d);}; + + if (sd != nullptr && !from.sub (*sd) && !force) + fail (diag_path (from) + " is out of " + + diag_path (env.sandbox_dir)); + + auto check_wd = [&wd, &env, fail] (const path& p) + { + if (wd.sub (path_cast<dir_path> (p))) + fail (diag_path (p) + " contains " + + diag_path (env.work_dir)); + }; + + check_wd (from); + check_wd (to); + + // Unless cleanups are disabled, "move" the matching cleanups + // if the destination path doesn't exist and it is a sub-path + // of the working directory and just remove them otherwise. + // + if (cln->enabled) + cln->move = !butl::entry_exists (to) && + (sd == nullptr || to.sub (*sd)); + } + else if (cln->enabled) + { + // Move or remove the matching cleanups (see above). + // + // Note that it's not enough to just change the cleanup paths. 
+ // We also need to make sure that these cleanups happen before + // the destination directory (or any of its parents) cleanup, + // that is potentially registered. To achieve that we can just + // relocate these cleanup entries to the end of the list, + // preserving their mutual order. Remember that cleanups in + // the list are executed in the reversed order. + // + cleanups cs; + + // Remove the source path sub-path cleanups from the list, + // adjusting/caching them if required (see above). + // + for (auto i (env.cleanups.begin ()); i != env.cleanups.end (); ) + { + script::cleanup& c (*i); + path& p (c.path); + + if (p.sub (from)) + { + if (cln->move) + { + // Note that we need to preserve the cleanup path + // trailing separator which indicates the removal + // method. Also note that leaf(), in particular, does + // that. + // + p = p != from + ? to / p.leaf (path_cast<dir_path> (from)) + : p.to_directory () + ? path_cast<dir_path> (to) + : to; + + cs.push_back (move (c)); + } + + i = env.cleanups.erase (i); + } + else + ++i; + } + + // Re-insert the adjusted cleanups at the end of the list. + // + env.cleanups.insert (env.cleanups.end (), + make_move_iterator (cs.begin ()), + make_move_iterator (cs.end ())); + + } + }, + + // remove + // + // Validate the filesystem entry path (pre-hook). + // + [&env] (const path& p, bool force, bool pre) + { + if (pre) + { + const dir_path& wd (*env.work_dir.path); + const dir_path* sd (env.sandbox_dir.path); + + auto fail = [] (const string& d) {throw runtime_error (d);}; + + if (sd != nullptr && !p.sub (*sd) && !force) + fail (diag_path (p) + " is out of " + + diag_path (env.sandbox_dir)); + + if (wd.sub (path_cast<dir_path> (p))) + fail (diag_path (p) + " contains " + + diag_path (env.work_dir)); + } + }, + + // parse_option + // + [&cln] (const strings& args, size_t i) + { + // Parse --no-cleanup, if it is supported by the builtin. 
+ // + if (cln && args[i] == "--no-cleanup") + { + cln->enabled = false; + return 1; + } + + return 0; + }, + + // sleep + // + // Deactivate the thread before going to sleep. + // + [&env] (const duration& d) + { + // If/when required we could probably support the precise sleep + // mode (e.g., via an option). + // + env.context.sched.sleep (d); + } + }; + + try + { + uint8_t r; // Storage. + builtin b (bf (r, + c.arguments, + move (ifd), move (ofd.out), move (efd), + *env.work_dir.path, + bcs)); + + success = run_pipe (env, + nc, + ec, + move (ofd.in), + ci + 1, li, ll, diag); + + exit = process_exit (b.wait ()); + } + catch (const system_error& e) + { + fail (ll) << "unable to execute " << c.program << " builtin: " + << e << endf; + } + } + else + { + // Execute the process. + // + cstrings args (process_args ()); + + // Resolve the relative not simple program path against the script's + // working directory. The simple one will be left for the process + // path search machinery. Also strip the potential leading `^`, + // indicating that this is an external program rather than a + // builtin. + // + path p; + + try + { + p = path (args[0]); + + if (p.relative ()) + { + auto program = [&p, &args] (path pp) + { + p = move (pp); + args[0] = p.string ().c_str (); + }; + + if (p.simple ()) + { + const string& s (p.string ()); + + // Don't end up with an empty path. + // + if (s.size () > 1 && s[0] == '^') + program (path (s, 1, s.size () - 1)); + } + else + program (*env.work_dir.path / p); + } + } + catch (const invalid_path& e) + { + fail (ll) << "invalid program path " << e.path; + } + + try + { + process_path pp (process::path_search (args[0])); + + // Note: the builtin-escaping character '^' is not printed. 
+ // + if (verb >= 2) + print_process (args); + + process pr ( + pp, + args.data (), + {ifd.get (), -1}, process::pipe (ofd), {-1, efd.get ()}, + env.work_dir.path->string ().c_str ()); + + ifd.reset (); + ofd.out.reset (); + efd.reset (); + + success = run_pipe (env, + nc, + ec, + move (ofd.in), + ci + 1, li, ll, diag); + + pr.wait (); + + exit = move (pr.exit); + } + catch (const process_error& e) + { + error (ll) << "unable to execute " << args[0] << ": " << e; + + if (e.child) + std::exit (1); + + throw failed (); + } + } + + assert (exit); + + // If the right-hand side pipeline failed then the whole pipeline fails, + // and no further checks are required. + // + if (!success) + return false; + + const path& pr (c.program); + + // If there is no valid exit code available by whatever reason then we + // print the proper diagnostics, dump stderr (if cached and not too + // large) and fail the whole script. Otherwise if the exit code is not + // correct then we print diagnostics if requested and fail the + // pipeline. + // + bool valid (exit->normal ()); + + // On Windows the exit code can be out of the valid codes range being + // defined as uint16_t. + // +#ifdef _WIN32 + if (valid) + valid = exit->code () < 256; +#endif + + success = valid && eq == (exit->code () == c.exit.code); + + if (!valid || (!success && diag)) + { + // In the presence of a valid exit code we print the diagnostics and + // return false rather than throw. + // + diag_record d (valid ? error (ll) : fail (ll)); + + if (!exit->normal ()) + d << pr << " " << *exit; + else + { + uint16_t ec (exit->code ()); // Make sure is printed as integer. + + if (!valid) + d << pr << " exit code " << ec << " out of 0-255 range"; + else if (!success) + { + if (diag) + d << pr << " exit code " << ec << (eq ?
" != " : " == ") + << static_cast<uint16_t> (c.exit.code); + } + else + assert (false); + } + + if (non_empty (esp, ll) && avail_on_failure (esp, env)) + d << info << "stderr: " << esp; + + if (non_empty (osp, ll) && avail_on_failure (osp, env)) + d << info << "stdout: " << osp; + + if (non_empty (isp, ll) && avail_on_failure (isp, env)) + d << info << "stdin: " << isp; + + // Print cached stderr. + // + print_file (d, esp, ll); + } + + // If exit code is correct then check if the standard outputs match the + // expectations. Note that stdout is only redirected to file for the + // last command in the pipeline. + // + // The thinking behind matching stderr first is that if it mismatches, + // then the program probably misbehaves (executes wrong functionality, + // etc) in which case its stdout doesn't really matter. + // + if (success) + success = + check_output (pr, esp, isp, err, ll, env, diag, "stderr") && + (!last || + check_output (pr, osp, isp, *out, ll, env, diag, "stdout")); + + return success; + } + + static bool + run_expr (environment& env, + const command_expr& expr, + size_t li, const location& ll, + bool diag) + { + // Commands are numbered sequentially throughout the expression + // starting with 1. Number 0 means the command is a single one. + // + size_t ci (expr.size () == 1 && expr.back ().pipe.size () == 1 + ? 0 + : 1); + + // If there is no ORs to the right of a pipe then the pipe failure is + // fatal for the whole expression. In particular, the pipe must print + // the diagnostics on failure (if generally allowed). So we find the + // pipe that "switches on" the diagnostics potential printing. + // + command_expr::const_iterator trailing_ands; // Undefined if diag is + // disallowed. 
+ if (diag) + { + auto i (expr.crbegin ()); + for (; i != expr.crend () && i->op == expr_operator::log_and; ++i) ; + trailing_ands = i.base (); + } + + bool r (false); + bool print (false); + + for (auto b (expr.cbegin ()), i (b), e (expr.cend ()); i != e; ++i) + { + if (diag && i + 1 == trailing_ands) + print = true; + + const command_pipe& p (i->pipe); + bool or_op (i->op == expr_operator::log_or); + + // Short-circuit if the pipe result must be OR-ed with true or AND-ed + // with false. + // + if (!((or_op && r) || (!or_op && !r))) + r = run_pipe ( + env, p.begin (), p.end (), auto_fd (), ci, li, ll, print); + + ci += p.size (); + } + + return r; + } + + void + run (environment& env, + const command_expr& expr, + size_t li, const location& ll) + { + // Note that we don't print the expression at any verbosity level + // assuming that the caller does this, potentially providing some + // additional information (command type, etc). + // + if (!run_expr (env, expr, li, ll, true /* diag */)) + throw failed (); // Assume diagnostics is already printed. + } + + bool + run_if (environment& env, + const command_expr& expr, + size_t li, const location& ll) + { + // Note that we don't print the expression here (see above). + // + return run_expr (env, expr, li, ll, false /* diag */); + } + + void + clean (environment& env, const location& ll) + { + context& ctx (env.context); + const dir_path& wdir (*env.work_dir.path); + + // Note that we operate with normalized paths here. + // + // Remove special files. The order is not important as we don't + // expect directories here. + // + for (const path& p: env.special_cleanups) + { + // Remove the file if exists. Fail otherwise. + // + if (rmfile (ctx, p, 3) == rmfile_status::not_exist) + fail (ll) << "registered for cleanup special file " << p + << " does not exist"; + } + + // Remove files and directories in the order opposite to the order of + // cleanup registration. 
+ // + for (const auto& c: reverse_iterate (env.cleanups)) + { + cleanup_type t (c.type); + + // Skip whether the path exists or not. + // + if (t == cleanup_type::never) + continue; + + const path& cp (c.path); + + // Wildcard with the last component being '***' (without trailing + // separator) matches all files and sub-directories recursively as + // well as the start directory itself. So we will recursively + // remove the directories that match the parent (for the original + // path) directory wildcard. + // + bool recursive (cp.leaf ().representation () == "***"); + const path& p (!recursive ? cp : cp.directory ()); + + // Remove files or directories using wildcard. + // + if (path_pattern (p)) + { + bool removed (false); + + auto rm = [&cp, recursive, &removed, &ll, &ctx, &wdir] + (path&& pe, const string&, bool interm) + { + if (!interm) + { + // While removing the entry we can get not_exist due to + // race conditions, but that's ok if somebody did our job. + // Note that we still set the removed flag to true in this + // case. + // + removed = true; // Will be meaningless on failure. + + if (pe.to_directory ()) + { + dir_path d (path_cast<dir_path> (pe)); + + if (!recursive) + { + rmdir_status r (rmdir (ctx, d, 3)); + + if (r != rmdir_status::not_empty) + return true; + + diag_record dr (fail (ll)); + dr << "registered for cleanup directory " << d + << " is not empty"; + + print_dir (dr, d, ll); + dr << info << "wildcard: '" << cp << "'"; + } + else + { + // Don't remove the working directory (it will be removed + // by the dedicated cleanup). + // + // Cast to uint16_t to avoid ambiguity with + // libbutl::rmdir_r(). + // + rmdir_status r (rmdir_r (ctx, d, d != wdir, 3)); + + if (r != rmdir_status::not_empty) + return true; + + // The directory is unlikely to be current but let's keep + // for completeness.
+ // + fail (ll) << "registered for cleanup wildcard " << cp + << " matches the current directory"; + } + } + else + rmfile (ctx, pe, 3); + } + + return true; + }; + + // Note that here we rely on the fact that recursive iterating + // goes depth-first (which make sense for the cleanup). + // + try + { + // Doesn't follow symlinks. + // + path_search (p, + rm, + dir_path () /* start */, + path_match_flags::none); + } + catch (const system_error& e) + { + fail (ll) << "unable to cleanup wildcard " << cp << ": " << e; + } + + // Removal of no filesystem entries is not an error for 'maybe' + // cleanup type. + // + if (removed || t == cleanup_type::maybe) + continue; + + fail (ll) << "registered for cleanup wildcard " << cp + << " doesn't match any " + << (recursive + ? "path" + : p.to_directory () + ? "directory" + : "file"); + } + + // Remove the directory if exists and empty. Fail otherwise. + // Removal of non-existing directory is not an error for 'maybe' + // cleanup type. + // + if (p.to_directory ()) + { + dir_path d (path_cast<dir_path> (p)); + bool wd (d == wdir); + + // Don't remove the working directory for the recursive cleanup + // (it will be removed by the dedicated one). + // + // Note that the root working directory contains the + // .buildignore file (see above). + // + // @@ If 'd' is a file then will fail with a diagnostics having + // no location info. Probably need to add an optional location + // parameter to rmdir() function. The same problem exists for + // a file cleanup when try to rmfile() directory instead of + // file. + // + rmdir_status r (recursive + ? rmdir_r (ctx, d, !wd, static_cast <uint16_t> (3)) + : rmdir (ctx, d, 3)); + + if (r == rmdir_status::success || + (r == rmdir_status::not_exist && t == cleanup_type::maybe)) + continue; + + diag_record dr (fail (ll)); + dr << "registered for cleanup directory " << d + << (r == rmdir_status::not_exist ? " does not exist" : + !recursive ? 
" is not empty" + : " is current"); + + if (r == rmdir_status::not_empty) + print_dir (dr, d, ll); + } + + // Remove the file if exists. Fail otherwise. Removal of + // non-existing file is not an error for 'maybe' cleanup type. + // + if (rmfile (ctx, p, 3) == rmfile_status::not_exist && + t == cleanup_type::always) + fail (ll) << "registered for cleanup file " << p + << " does not exist"; + } + } + + void + print_dir (diag_record& dr, const dir_path& p, const location& ll) + { + try + { + size_t n (0); + for (const dir_entry& de: dir_iterator (p, + false /* ignore_dangling */)) + { + if (n++ < 10) + dr << '\n' << (de.ltype () == entry_type::directory + ? path_cast<dir_path> (de.path ()) + : de.path ()); + } + + if (n > 10) + dr << "\nand " << n - 10 << " more file(s)"; + } + catch (const system_error& e) + { + fail (ll) << "unable to iterate over " << p << ": " << e; + } + } + } +} diff --git a/libbuild2/script/run.hxx b/libbuild2/script/run.hxx new file mode 100644 index 0000000..477dd88 --- /dev/null +++ b/libbuild2/script/run.hxx @@ -0,0 +1,75 @@ +// file : libbuild2/script/run.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef LIBBUILD2_SCRIPT_RUN_HXX +#define LIBBUILD2_SCRIPT_RUN_HXX + +#include <libbuild2/types.hxx> +#include <libbuild2/utility.hxx> + +#include <libbuild2/script/script.hxx> + +namespace build2 +{ + namespace script + { + // An exception that can be thrown by an expression running function to + // exit the script (for example, as a result of executing the exit builtin + // by the below run*() functions). The status indicates whether the + // execution should be considered to have succeeded or failed. + // + struct exit + { + bool status; + + explicit + exit (bool s): status (s) {} + }; + + // Helpers. + // + + // Command expression running functions. + // + // Index is the 1-base index of this command line in the command list. + // If it is 0 then it means there is only one command. 
This information + // can be used, for example, to derive file names. + // + // Location is the start position of this command line in the script. It + // can be used in diagnostics. + // + void + run (environment&, const command_expr&, size_t index, const location&); + + bool + run_if (environment&, const command_expr&, size_t, const location&); + + // Perform the registered special file cleanups in the direct order and + // then the regular cleanups in the reverse order. + // + void + clean (environment&, const location&); + + // Print first 10 directory sub-entries to the diag record. The directory + // must exist. Is normally used while issuing diagnostics on non-empty + // directory removal failure. + // + void + print_dir (diag_record&, const dir_path&, const location&); + + // Return the quoted path representation with the preserved trailing + // directory separator. The path is relative if the verbosity level is + // less than 3. + // + string + diag_path (const path&); + + // Same as above, but prepends the path with a name, if present. The path + // must be not NULL. 
+ // + string + diag_path (const dir_name_view&); + } +} + +#endif // LIBBUILD2_SCRIPT_RUN_HXX diff --git a/libbuild2/script/script.cxx b/libbuild2/script/script.cxx new file mode 100644 index 0000000..c85bfd3 --- /dev/null +++ b/libbuild2/script/script.cxx @@ -0,0 +1,659 @@ +// file : libbuild2/script/script.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#include <libbuild2/script/script.hxx> + +#include <sstream> +#include <cstring> // strchr() + +using namespace std; + +namespace build2 +{ + namespace script + { + ostream& + operator<< (ostream& o, line_type lt) + { + const char* s (nullptr); + + switch (lt) + { + case line_type::var: s = "variable"; break; + case line_type::cmd: s = "command"; break; + case line_type::cmd_if: s = "'if'"; break; + case line_type::cmd_ifn: s = "'if!'"; break; + case line_type::cmd_elif: s = "'elif'"; break; + case line_type::cmd_elifn: s = "'elif!'"; break; + case line_type::cmd_else: s = "'else'"; break; + case line_type::cmd_end: s = "'end'"; break; + } + + return o << s; + } + + void + dump (ostream& os, const string& ind, const lines& ls) + { + // For each line print its tokens literal representation trying to + // reproduce the quoting. Consider mixed quoting as double quoting + // since the information is lost. + // + // Also additionally indent the if-branch lines. + // + string if_ind; + + for (const line& l: ls) + { + // Before printing indentation, decrease it if the else or end line is + // reached. + // + switch (l.type) + { + case line_type::cmd_elif: + case line_type::cmd_elifn: + case line_type::cmd_else: + case line_type::cmd_end: + { + size_t n (if_ind.size ()); + assert (n >= 2); + if_ind.resize (n - 2); + break; + } + default: break; + } + + // Print indentations. + // + os << ind << if_ind; + + // After printing indentation, increase it for if/else branch. 
+ // + switch (l.type) + { + case line_type::cmd_if: + case line_type::cmd_ifn: + case line_type::cmd_elif: + case line_type::cmd_elifn: + case line_type::cmd_else: if_ind += "  "; break; + default: break; + } + + // '"' or '\'' if we are inside the quoted token sequence and '\0' + // otherwise. Thus, can be used as bool. + // + char qseq ('\0'); + + for (const replay_token& rt: l.tokens) + { + const token& t (rt.token); + + // '"' or '\'' if the token is quoted and '\0' otherwise. Thus, + // can be used as bool. + // + char qtok ('\0'); + + switch (t.qtype) + { + case quote_type::unquoted: qtok = '\0'; break; + case quote_type::single: qtok = '\''; break; + case quote_type::mixed: + case quote_type::double_: qtok = '"'; break; + } + + // If being inside a quoted token sequence we have reached a token + // quoted differently or the newline, then we probably made a + // mistake misinterpreting some previous partially quoted token, for + // example f"oo" as "foo. If that's the case, all we can do is to + // end the sequence adding the trailing quote. + // + // Note that a token inside the quoted sequence may well be + // unquoted, so for example "$foo" is lexed as: + // + // token quoting complete notes + // '' " no + // $ " yes + // 'foo' Unquoted since lexed in variable mode. + // '' " no + // \n + // + if (qseq && + ((qtok && qtok != qseq) || t.type == token_type::newline)) + { + os << qseq; + qseq = '\0'; + } + + // Left and right token quotes (can be used as bool). + // + char lq ('\0'); + char rq ('\0'); + + // If the token is quoted, then determine if/which quotes should be + // present on its sides and track the quoted token sequence. + // + if (qtok) + { + if (t.qcomp) // Complete token quoting. + { + // If we are inside a quoted token sequence then do nothing. + // Otherwise just quote the current token not starting a + // sequence. + // + if (!qseq) + { + lq = qtok; + rq = qtok; + } + } + else // Partial token quoting.
+ { + // Note that we can not always reproduce the original tokens + // representation for partial quoting. For example, the two + // following tokens are lexed into the identical token objects: + // + // "foo + // f"oo" + // + // We will always assume that the partially quoted token either + // starts or ends the quoted token sequence. Sometimes this ends + // up unexpectedly, but seems there is not much we can do: + // + // f"oo" "ba"r -> "foo bar" + // + if (!qseq) // Start quoted sequence. + { + lq = qtok; + qseq = qtok; + } + else // End quoted sequence. + { + rq = qtok; + qseq = '\0'; + } + } + } + + // Print the space character prior to the separated token, unless + // it is a first line token or the newline. + // + if (t.separated && + t.type != token_type::newline && + &rt != &l.tokens[0]) + os << ' '; + + if (lq) os << lq; // Print the left quote, if required. + + // Escape the special characters, unless the token is not a word or + // is single-quoted. Note that the special character set depends on + // whether the word is double-quoted or unquoted. + // + if (t.type == token_type::word && qtok != '\'') + { + for (char c: t.value) + { + if (strchr (qtok ? "\\\"" : "|&<>=\\\"", c) != nullptr) + os << '\\'; + + os << c; + } + } + else + t.printer (os, t, print_mode::raw); + + if (rq) os << rq; // Print the right quote, if required. + } + } + } + + // Quote if empty or contains spaces or any of the special characters. + // Note that we use single quotes since double quotes still allow + // expansion. + // + // @@ What if it contains single quotes?
+ // + static void + to_stream_q (ostream& o, const string& s) + { + if (s.empty () || s.find_first_of (" |&<>=\\\"") != string::npos) + o << '\'' << s << '\''; + else + o << s; + }; + + void + to_stream (ostream& o, const command& c, command_to_stream m) + { + auto print_path = [&o] (const path& p) + { + using build2::operator<<; + + ostringstream s; + stream_verb (s, stream_verb (o)); + s << p; + + to_stream_q (o, s.str ()); + }; + + auto print_redirect = [&o, print_path] (const redirect& r, int fd) + { + const redirect& er (r.effective ()); + + // Print the none redirect (no data allowed) if/when the respective + // syntax is invented. + // + if (er.type == redirect_type::none) + return; + + o << ' '; + + // Print the redirect file descriptor. + // + if (fd == 2) + o << fd; + + // Print the redirect original representation and the modifiers, if + // present. + // + r.token.printer (o, r.token, print_mode::raw); + + // Print the rest of the redirect (file path, etc). + // + switch (er.type) + { + case redirect_type::none: assert (false); break; + case redirect_type::here_doc_ref: assert (false); break; + + case redirect_type::pass: + case redirect_type::null: + case redirect_type::trace: break; + case redirect_type::merge: o << er.fd; break; + + case redirect_type::file: + { + print_path (er.file.path); + break; + } + + case redirect_type::here_str_literal: + case redirect_type::here_doc_literal: + { + if (er.type == redirect_type::here_doc_literal) + o << er.end; + else + { + const string& v (er.str); + to_stream_q (o, + er.modifiers ().find (':') == string::npos + ? string (v, 0, v.size () - 1) // Strip newline. + : v); + } + + break; + } + + case redirect_type::here_str_regex: + case redirect_type::here_doc_regex: + { + const regex_lines& re (er.regex); + + if (er.type == redirect_type::here_doc_regex) + o << re.intro + er.end + re.intro + re.flags; + else + { + assert (!re.lines.empty ()); // Regex can't be empty. 
+ + regex_line l (re.lines[0]); + to_stream_q (o, re.intro + l.value + re.intro + l.flags); + } + + break; + } + } + }; + + auto print_doc = [&o] (const redirect& r) + { + o << endl; + + if (r.type == redirect_type::here_doc_literal) + o << r.str; + else + { + assert (r.type == redirect_type::here_doc_regex); + + const regex_lines& rl (r.regex); + + for (auto b (rl.lines.cbegin ()), i (b), e (rl.lines.cend ()); + i != e; ++i) + { + if (i != b) + o << endl; + + const regex_line& l (*i); + + if (l.regex) // Regex (possibly empty), + o << rl.intro << l.value << rl.intro << l.flags; + else if (!l.special.empty ()) // Special literal. + o << rl.intro; + else // Textual literal. + o << l.value; + + o << l.special; + } + } + + o << (r.modifiers ().find (':') == string::npos ? "" : "\n") << r.end; + }; + + if ((m & command_to_stream::header) == command_to_stream::header) + { + // Program. + // + to_stream_q (o, c.program.string ()); + + // Arguments. + // + for (const string& a: c.arguments) + { + o << ' '; + to_stream_q (o, a); + } + + // Redirects. + // + if (c.in) + print_redirect (*c.in, 0); + + if (c.out) + print_redirect (*c.out, 1); + + if (c.err) + print_redirect (*c.err, 2); + + for (const auto& p: c.cleanups) + { + o << " &"; + + if (p.type != cleanup_type::always) + o << (p.type == cleanup_type::maybe ? '?' : '!'); + + print_path (p.path); + } + + if (c.exit.comparison != exit_comparison::eq || c.exit.code != 0) + { + switch (c.exit.comparison) + { + case exit_comparison::eq: o << " == "; break; + case exit_comparison::ne: o << " != "; break; + } + + o << static_cast<uint16_t> (c.exit.code); + } + } + + if ((m & command_to_stream::here_doc) == command_to_stream::here_doc) + { + // Here-documents. 
+ // + if (c.in && + (c.in->type == redirect_type::here_doc_literal || + c.in->type == redirect_type::here_doc_regex)) + print_doc (*c.in); + + if (c.out && + (c.out->type == redirect_type::here_doc_literal || + c.out->type == redirect_type::here_doc_regex)) + print_doc (*c.out); + + if (c.err && + (c.err->type == redirect_type::here_doc_literal || + c.err->type == redirect_type::here_doc_regex)) + print_doc (*c.err); + } + } + + void + to_stream (ostream& o, const command_pipe& p, command_to_stream m) + { + if ((m & command_to_stream::header) == command_to_stream::header) + { + for (auto b (p.begin ()), i (b); i != p.end (); ++i) + { + if (i != b) + o << " | "; + + to_stream (o, *i, command_to_stream::header); + } + } + + if ((m & command_to_stream::here_doc) == command_to_stream::here_doc) + { + for (const command& c: p) + to_stream (o, c, command_to_stream::here_doc); + } + } + + void + to_stream (ostream& o, const command_expr& e, command_to_stream m) + { + if ((m & command_to_stream::header) == command_to_stream::header) + { + for (auto b (e.begin ()), i (b); i != e.end (); ++i) + { + if (i != b) + { + switch (i->op) + { + case expr_operator::log_or: o << " || "; break; + case expr_operator::log_and: o << " && "; break; + } + } + + to_stream (o, i->pipe, command_to_stream::header); + } + } + + if ((m & command_to_stream::here_doc) == command_to_stream::here_doc) + { + for (const expr_term& t: e) + to_stream (o, t.pipe, command_to_stream::here_doc); + } + } + + // redirect + // + redirect:: + redirect (redirect_type t) + : type (t) + { + switch (type) + { + case redirect_type::none: + case redirect_type::pass: + case redirect_type::null: + case redirect_type::trace: + case redirect_type::merge: break; + + case redirect_type::here_str_literal: + case redirect_type::here_doc_literal: new (&str) string (); break; + + case redirect_type::here_str_regex: + case redirect_type::here_doc_regex: + { + new (&regex) regex_lines (); + break; + } + + case redirect_type::file:
new (&file) file_type (); break; + + case redirect_type::here_doc_ref: assert (false); break; + } + } + + redirect:: + redirect (redirect&& r) noexcept + : type (r.type), + token (move (r.token)), + end (move (r.end)), + end_line (r.end_line), + end_column (r.end_column) + { + switch (type) + { + case redirect_type::none: + case redirect_type::pass: + case redirect_type::null: + case redirect_type::trace: break; + + case redirect_type::merge: fd = r.fd; break; + + case redirect_type::here_str_literal: + case redirect_type::here_doc_literal: + { + new (&str) string (move (r.str)); + break; + } + case redirect_type::here_str_regex: + case redirect_type::here_doc_regex: + { + new (&regex) regex_lines (move (r.regex)); + break; + } + case redirect_type::file: + { + new (&file) file_type (move (r.file)); + break; + } + case redirect_type::here_doc_ref: + { + new (&ref) reference_wrapper<const redirect> (r.ref); + break; + } + } + } + + redirect& redirect:: + operator= (redirect&& r) noexcept + { + if (this != &r) + { + this->~redirect (); + new (this) redirect (move (r)); // Assume noexcept move-constructor.
+ } + return *this; + } + + redirect:: + ~redirect () + { + switch (type) + { + case redirect_type::none: + case redirect_type::pass: + case redirect_type::null: + case redirect_type::trace: + case redirect_type::merge: break; + + case redirect_type::here_str_literal: + case redirect_type::here_doc_literal: str.~string (); break; + + case redirect_type::here_str_regex: + case redirect_type::here_doc_regex: regex.~regex_lines (); break; + + case redirect_type::file: file.~file_type (); break; + + case redirect_type::here_doc_ref: + { + ref.~reference_wrapper<const redirect> (); + break; + } + } + } + + redirect:: + redirect (const redirect& r) + : type (r.type), + token (r.token), + end (r.end), + end_line (r.end_line), + end_column (r.end_column) + { + switch (type) + { + case redirect_type::none: + case redirect_type::pass: + case redirect_type::null: + case redirect_type::trace: break; + + case redirect_type::merge: fd = r.fd; break; + + case redirect_type::here_str_literal: + case redirect_type::here_doc_literal: + { + new (&str) string (r.str); + break; + } + case redirect_type::here_str_regex: + case redirect_type::here_doc_regex: + { + new (&regex) regex_lines (r.regex); + break; + } + case redirect_type::file: + { + new (&file) file_type (r.file); + break; + } + case redirect_type::here_doc_ref: + { + new (&ref) reference_wrapper<const redirect> (r.ref); + break; + } + } + } + + redirect& redirect:: + operator= (const redirect& r) + { + if (this != &r) + *this = redirect (r); // Reduce to move-assignment. + return *this; + } + + // environment + // + void environment:: + clean (script::cleanup c, bool implicit) + { + using script::cleanup; + + assert (!implicit || c.type == cleanup_type::always); + + const path& p (c.path); + + if (sandbox_dir.path != nullptr && !p.sub (*sandbox_dir.path)) + { + if (implicit) + return; + else + assert (false); // Error so should have been checked.
+ } + + auto pr = [&p] (const cleanup& v) -> bool {return v.path == p;}; + auto i (find_if (cleanups.begin (), cleanups.end (), pr)); + + if (i == cleanups.end ()) + cleanups.emplace_back (move (c)); + else if (!implicit) + i->type = c.type; + } + + void environment:: + clean_special (path p) + { + special_cleanups.emplace_back (move (p)); + } + } +} diff --git a/libbuild2/script/script.hxx b/libbuild2/script/script.hxx new file mode 100644 index 0000000..f4998b7 --- /dev/null +++ b/libbuild2/script/script.hxx @@ -0,0 +1,471 @@ +// file : libbuild2/script/script.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef LIBBUILD2_SCRIPT_SCRIPT_HXX +#define LIBBUILD2_SCRIPT_SCRIPT_HXX + +#include <libbuild2/types.hxx> +#include <libbuild2/forward.hxx> +#include <libbuild2/utility.hxx> + +#include <libbuild2/token.hxx> +#include <libbuild2/variable.hxx> + +namespace build2 +{ + namespace script + { + // Pre-parsed representation. + // + + enum class line_type + { + var, + cmd, + cmd_if, + cmd_ifn, + cmd_elif, + cmd_elifn, + cmd_else, + cmd_end + }; + + ostream& + operator<< (ostream&, line_type); + + struct line + { + line_type type; + replay_tokens tokens; + + union + { + const variable* var; // Pre-entered for line_type::var. + }; + }; + + // Most of the time we will have just one line (a command). + // + using lines = small_vector<line, 1>; + + // Print the script lines, trying to reproduce their original (non- + // expanded) representation. + // + // Note that the exact spacing and partial quoting may not be restored due + // to the information loss. + // + void + dump (ostream&, const string& ind, const lines&); + + // Parse object model. + // + + // redirect + // + enum class redirect_type + { + // No data is allowed to be read or written. + // + // Note that redirect of this type cannot be currently specified on the + // script command line and can only be set via the environment object + // as a default redirect (see below). 
+ // + none, + + pass, + null, + trace, + merge, + here_str_literal, + here_str_regex, + here_doc_literal, + here_doc_regex, + here_doc_ref, // Reference to here_doc literal or regex. + file, + }; + + // Pre-parsed (but not instantiated) regex lines. The idea here is that + // we should be able to re-create their (more or less) exact text + // representation for diagnostics but also instantiate without any + // re-parsing. + // + struct regex_line + { + // If regex is true, then value is the regex expression. Otherwise, it + // is a literal. Note that special characters can be present in both + // cases. For example, //+ is a regex, while /+ is a literal, both + // with '+' as a special character. Flags are only valid for regex. + // Literals falls apart into textual (has no special characters) and + // special (has just special characters instead) ones. For example + // foo is a textual literal, while /.+ is a special one. Note that + // literal must not have value and special both non-empty. + // + bool regex; + + string value; + string flags; + string special; + + uint64_t line; + uint64_t column; + + // Create regex with optional special characters. + // + regex_line (uint64_t l, uint64_t c, + string v, string f, string s = string ()) + : regex (true), + value (move (v)), + flags (move (f)), + special (move (s)), + line (l), + column (c) {} + + // Create a literal, either text or special. + // + regex_line (uint64_t l, uint64_t c, string v, bool s) + : regex (false), + value (s ? string () : move (v)), + special (s ? move (v) : string ()), + line (l), + column (c) {} + }; + + struct regex_lines + { + char intro; // Introducer character. + string flags; // Global flags (here-document). + + small_vector<regex_line, 8> lines; + }; + + // Output file redirect mode. 
+ // + enum class redirect_fmode + { + compare, + overwrite, + append + }; + + struct redirect + { + redirect_type type; + + struct file_type + { + using path_type = build2::path; + path_type path; + redirect_fmode mode; // Meaningless for input redirect. + }; + + union + { + int fd; // Merge-to descriptor. + string str; // Note: with trailing newline, if requested. + regex_lines regex; // Note: with trailing blank, if requested. + file_type file; + reference_wrapper<const redirect> ref; // Note: no chains. + }; + + // Modifiers and the original representation (potentially an alias). + // + build2::token token; + + string end; // Here-document end marker (no regex intro/flags). + uint64_t end_line; // Here-document end marker location. + uint64_t end_column; + + // Create redirect of a type other than reference. + // + explicit + redirect (redirect_type); + + // Create redirect of the reference type. + // + redirect (redirect_type t, const redirect& r, build2::token tk) + : type (redirect_type::here_doc_ref), + ref (r), + token (move (tk)) + { + // There is no support (and need) for reference chains. + // + assert (t == redirect_type::here_doc_ref && + r.type != redirect_type::here_doc_ref); + } + + // Create redirect of the merge type. + // + // Note that it's the caller's responsibility to make sure that the file + // descriptor is valid for this redirect (2 for stdout, etc). + // + redirect (redirect_type t, int f) + : type (redirect_type::merge), fd (f) + { + assert (t == redirect_type::merge && (f == 1 || f == 2)); + } + + redirect (redirect&&) noexcept; + redirect& operator= (redirect&&) noexcept; + + // @@ Defining optional movable-only redirects in the command class make + // the older C++ compilers (GCC 4.9, Clang 4, VC 15) fail to compile the + // command vector manipulating code. Thus, we make the redirect class + // copyable to workaround the issue. 
+ // + redirect (const redirect&); + redirect& operator= (const redirect&); + + ~redirect (); + + const redirect& + effective () const noexcept + { + return type == redirect_type::here_doc_ref ? ref.get () : *this; + } + + const string& + modifiers () const noexcept + { + return token.value; + } + }; + + // cleanup + // + enum class cleanup_type + { + always, // &foo - cleanup, fail if does not exist. + maybe, // &?foo - cleanup, ignore if does not exist. + never // &!foo - don't cleanup, ignore if doesn't exist. + }; + + // File or directory to be automatically cleaned up at the end of the + // script execution. If the path ends with a trailing slash, then it is + // assumed to be a directory, otherwise -- a file. A directory that is + // about to be cleaned up must be empty. + // + // The last component in the path may contain a wildcard that has the + // following semantics: + // + // dir/* - remove all immediate files + // dir/*/ - remove all immediate sub-directories (must be empty) + // dir/** - remove all files recursively + // dir/**/ - remove all sub-directories recursively (must be empty) + // dir/*** - remove directory dir with all files and sub-directories + // recursively + // + struct cleanup + { + cleanup_type type; + build2::path path; + }; + using cleanups = vector<cleanup>; + + // command_exit + // + enum class exit_comparison {eq, ne}; + + struct command_exit + { + // C/C++ don't apply constraints on program exit code other than it + // being of type int. + // + // POSIX specifies that only the least significant 8 bits shall be + // available from wait() and waitpid(); the full value shall be + // available from waitid() (read more at _Exit, _exit Open Group + // spec). + // + // While the Linux man page for waitid() doesn't mention any + // deviations from the standard, the FreeBSD implementation (as of + // version 11.0) only returns 8 bits like the other wait*() calls. + // + // Windows supports 32-bit exit codes. 
+ // + // Note that in shells some exit values can have special meaning so + // using them can be a source of confusion. For bash, values in the + // [126, 255] range are such special ones (see Appendix E, "Exit + // Codes With Special Meanings" in the Advanced Bash-Scripting Guide). + // + exit_comparison comparison; + uint8_t code; + }; + + // command + // + struct command + { + path program; + strings arguments; + + optional<redirect> in; + optional<redirect> out; + optional<redirect> err; + + script::cleanups cleanups; + + command_exit exit {exit_comparison::eq, 0}; + }; + + enum class command_to_stream: uint16_t + { + header = 0x01, + here_doc = 0x02, // Note: printed on a new line. + all = header | here_doc + }; + + void + to_stream (ostream&, const command&, command_to_stream); + + ostream& + operator<< (ostream&, const command&); + + // command_pipe + // + using command_pipe = vector<command>; + + void + to_stream (ostream&, const command_pipe&, command_to_stream); + + ostream& + operator<< (ostream&, const command_pipe&); + + // command_expr + // + enum class expr_operator {log_or, log_and}; + + struct expr_term + { + expr_operator op; // OR-ed to an implied false for the first term. + command_pipe pipe; + }; + + using command_expr = vector<expr_term>; + + void + to_stream (ostream&, const command_expr&, command_to_stream); + + ostream& + operator<< (ostream&, const command_expr&); + + // Script execution environment. + // + class environment + { + public: + build2::context& context; + + // The platform script programs run on. + // + const target_triplet& host; + + // The work directory is used as the builtin/process CWD and to complete + // relative paths. Any attempt to remove or move this directory (or its + // parent directory) using the rm or mv builtins will fail. Must be an + // absolute path. 
+ // + const dir_name_view work_dir; + + // If the sandbox directory is not NULL, then any attempt to remove or + // move a filesystem entry outside this directory using an explicit + // cleanup or the rm/mv builtins will fail, unless the --force option is + // specified for the builtin. Must be an absolute path. + // + const dir_name_view sandbox_dir; + + // The temporary directory is used by the script running machinery to + // create special files. Must be an absolute path, unless empty. Can be + // empty until the create_temp_dir() function call, which can be used + // for creating this directory on demand. + // + const dir_path& temp_dir; + + // If true, the temporary directory will not be removed on the script + // failure. In particular, this allows the script running machinery to + // refer to the special files in diagnostics. + // + const bool temp_dir_keep; + + // Default process stream redirects. + // + // If a stream redirect is not specified on the script command line, + // then the respective redirect data member will be used as the default. + // + const redirect in; + const redirect out; + const redirect err; + + environment (build2::context& ctx, + const target_triplet& h, + const dir_name_view& wd, + const dir_name_view& sd, + const dir_path& td, bool tk, + redirect&& i = redirect (redirect_type::pass), + redirect&& o = redirect (redirect_type::pass), + redirect&& e = redirect (redirect_type::pass)) + : context (ctx), host (h), + work_dir (wd), sandbox_dir (sd), temp_dir (td), temp_dir_keep (tk), + in (move (i)), out (move (o)), err (move (e)) + { + } + + // Create environment without the sandbox. 
+ // + environment (build2::context& ctx, + const target_triplet& h, + const dir_name_view& wd, + const dir_path& td, bool tk, + redirect&& i = redirect (redirect_type::pass), + redirect&& o = redirect (redirect_type::pass), + redirect&& e = redirect (redirect_type::pass)) + : environment (ctx, h, + wd, dir_name_view (), td, tk, + move (i), move (o), move (e)) + { + } + + // Cleanup. + // + public: + script::cleanups cleanups; + paths special_cleanups; + + // Register a cleanup. If the cleanup is explicit, then override the + // cleanup type if this path is already registered. Ignore implicit + // registration of a path outside the root directory (see below). + // + void + clean (cleanup, bool implicit); + + // Register cleanup of a special file. Such files are created to + // maintain the script running machinery and must be removed first so as + // not to interfere with the user-defined wildcard cleanups if the working + // and temporary directories are the same. + // + void + clean_special (path); + + public: + // Set variable value with optional (non-empty) attributes. + // + virtual void + set_variable (string&& name, + names&&, + const string& attrs, + const location&) = 0; + + // Create the temporary directory and set the temp_dir reference target + // to its path. Must only be called if temp_dir is empty. 
+ // + virtual void + create_temp_dir () = 0; + + public: + virtual + ~environment () = default; + }; + } +} + +#include <libbuild2/script/script.ixx> + +#endif // LIBBUILD2_SCRIPT_SCRIPT_HXX diff --git a/libbuild2/script/script.ixx b/libbuild2/script/script.ixx new file mode 100644 index 0000000..56043b2 --- /dev/null +++ b/libbuild2/script/script.ixx @@ -0,0 +1,56 @@ +// file : libbuild2/script/script.ixx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +namespace build2 +{ + namespace script + { + inline command_to_stream + operator&= (command_to_stream& x, command_to_stream y) + { + return x = static_cast<command_to_stream> ( + static_cast<uint16_t> (x) & static_cast<uint16_t> (y)); + } + + inline command_to_stream + operator|= (command_to_stream& x, command_to_stream y) + { + return x = static_cast<command_to_stream> ( + static_cast<uint16_t> (x) | static_cast<uint16_t> (y)); + } + + inline command_to_stream + operator& (command_to_stream x, command_to_stream y) {return x &= y;} + + inline command_to_stream + operator| (command_to_stream x, command_to_stream y) {return x |= y;} + + + // command + // + inline ostream& + operator<< (ostream& o, const command& c) + { + to_stream (o, c, command_to_stream::all); + return o; + } + + // command_pipe + // + inline ostream& + operator<< (ostream& o, const command_pipe& p) + { + to_stream (o, p, command_to_stream::all); + return o; + } + + // command_expr + // + inline ostream& + operator<< (ostream& o, const command_expr& e) + { + to_stream (o, e, command_to_stream::all); + return o; + } + } +} diff --git a/libbuild2/script/token.cxx b/libbuild2/script/token.cxx new file mode 100644 index 0000000..1c612a5 --- /dev/null +++ b/libbuild2/script/token.cxx @@ -0,0 +1,53 @@ +// file : libbuild2/script/token.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#include <libbuild2/script/token.hxx> + +using namespace std; + +namespace build2 +{ + namespace script + { + void + token_printer 
(ostream& os, const token& t, print_mode m) + { + const string& v (t.value); + + // Only quote non-name tokens for diagnostics. + // + const char* q (m == print_mode::diagnostics ? "'" : ""); + + switch (t.type) + { + case token_type::clean: os << q << '&' << v << q; break; + case token_type::pipe: os << q << '|' << q; break; + + case token_type::in_pass: os << q << "<|" << q; break; + case token_type::in_null: os << q << "<-" << q; break; + case token_type::in_file: os << q << "<=" << q; break; + case token_type::in_doc: os << q << "<<=" << v << q; break; + case token_type::in_str: os << q << "<<<=" << v << q; break; + + case token_type::out_pass: os << q << ">|" << q; break; + case token_type::out_null: os << q << ">-" << q; break; + case token_type::out_trace: os << q << ">!" << q; break; + case token_type::out_merge: os << q << ">&" << q; break; + case token_type::out_file_ovr: os << q << ">=" << q; break; + case token_type::out_file_app: os << q << ">+" << q; break; + case token_type::out_file_cmp: os << q << ">?" << q; break; + case token_type::out_doc: os << q << ">>?" << v << q; break; + case token_type::out_str: os << q << ">>>?" 
<< v << q; break; + + case token_type::in_l: os << q << '<' << v << q; break; + case token_type::in_ll: os << q << "<<" << v << q; break; + case token_type::in_lll: os << q << "<<<" << v << q; break; + case token_type::out_g: os << q << '>' << v << q; break; + case token_type::out_gg: os << q << ">>" << v << q; break; + case token_type::out_ggg: os << q << ">>>" << v << q; break; + + default: build2::token_printer (os, t, m); + } + } + } +} diff --git a/libbuild2/script/token.hxx b/libbuild2/script/token.hxx new file mode 100644 index 0000000..0186bd9 --- /dev/null +++ b/libbuild2/script/token.hxx @@ -0,0 +1,66 @@ +// file : libbuild2/script/token.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef LIBBUILD2_SCRIPT_TOKEN_HXX +#define LIBBUILD2_SCRIPT_TOKEN_HXX + +#include <libbuild2/types.hxx> +#include <libbuild2/utility.hxx> + +#include <libbuild2/token.hxx> + +namespace build2 +{ + namespace script + { + struct token_type: build2::token_type + { + using base_type = build2::token_type; + + enum + { + // NOTE: remember to update token_printer()! + + pipe = base_type::value_next, // | + clean, // &{?!} (modifiers in value) + + in_pass, // <| + in_null, // <- + in_file, // <= + in_doc, // <<={:/} (modifiers in value) + in_str, // <<<={:/} (modifiers in value) + + out_pass, // >| + out_null, // >- + out_trace, // >! + out_merge, // >& + out_file_ovr, // >= + out_file_app, // >+ + out_file_cmp, // >? + out_doc, // >>?{:/~} (modifiers in value) + out_str, // >>>?{:/~} (modifiers in value) + + // The modifiers are in the token value, if the redirect the alias + // resolves to supports the modifiers. + // + in_l, // < + in_ll, // << + in_lll, // <<< + out_g, // > + out_gg, // >> + out_ggg, // >>> + + value_next + }; + + token_type () = default; + token_type (value_type v): base_type (v) {} + token_type (base_type v): base_type (v) {} + }; + + void + token_printer (ostream&, const token&, print_mode); + } +} + +#endif // LIBBUILD2_SCRIPT_TOKEN_HXX |