// file : libbuild2/lexer.hxx -*- C++ -*- // copyright : Copyright (c) 2014-2019 Code Synthesis Ltd // license : MIT; see accompanying LICENSE file #ifndef LIBBUILD2_LEXER_HXX #define LIBBUILD2_LEXER_HXX #include <stack> #include <libbutl/char-scanner.mxx> #include <libbuild2/types.hxx> #include <libbuild2/utility.hxx> #include <libbuild2/token.hxx> #include <libbuild2/diagnostics.hxx> #include <libbuild2/export.hxx> namespace build2 { // Context-dependent lexing mode. Quoted modes are internal and should not // be set explicitly. In the value mode we don't treat certain characters // (e.g., `+`, `=`) as special so that we can use them in the variable // values, e.g., `foo = g++`. In contrast, in the variable mode, we restrict // certain character (e.g., `/`) from appearing in the name. The values mode // is like value but recogizes `,` as special (used in contexts where we // need to list multiple values). The attributes/attribute_value modes are // like values where each value is potentially a variable assignment; they // don't treat `{` and `}` as special (so we cannot have name groups in // attributes) as well as recognizes `=` and `]`. The eval mode is used in // the evaluation context. // // A number of modes are "derived" from the value/values mode by recognizing // a few extra characters: // // switch_expressions values plus `:` // case_patterns values plus `|` and `:` // // Note that the normal, value/values and derived, as well as eval modes // split words separated by the pair character (to disable pairs one can // pass `\0` as a pair character). // // The alternative modes must be set manually. The value/values and derived // modes automatically expires after the end of the line. The attribute mode // expires after the closing `]`. The variable mode expires after the word // token. And the eval mode expires after the closing `)`. // // Note that normally it is only safe to switch mode when the current token // is not quoted (or, more generally, when you are not in the double-quoted // mode) unless the mode treats the double-quote as a separator (e.g., // variable name mode). Failed that your mode (which now will be the top of // the mode stack) will prevent proper recognition of the closing quote. // // Finally, attributes recognition (the `[` token) cuts across most of the // modes and is handled with a flag. In the normal mode it is automatically // set at the beginning and after each newline. In all other modes it must // be explicitly set at points where attributes are recognized. In all the // cases it is automatically reset after lexing the next token (whether `[` // or not). // // Extendable/inheritable enum-like class. // struct lexer_mode: lexer_mode_base { using base_type = lexer_mode_base; enum { normal = base_type::value_next, variable, value, values, case_patterns, switch_expressions, attributes, attribute_value, eval, single_quoted, double_quoted, buildspec, value_next }; lexer_mode () = default; lexer_mode (value_type v): base_type (v) {} lexer_mode (base_type v): base_type (v) {} }; class LIBBUILD2_SYMEXPORT lexer: public butl::char_scanner { public: // If escape is not NULL then only escape sequences with characters from // this string are considered "effective escapes" with all others passed // through as is. Note that neither the name nor escape arguments are // copied. // lexer (istream& is, const path_name& name, uint64_t line = 1, // Start line in the stream. const char* escapes = nullptr) : lexer (is, name, line, escapes, true /* set_mode */) {} const path_name& name () const {return name_;} // Note: sets mode for the next token. The second argument can be used to // specify the pair separator character (if the mode supports pairs). If // escapes is not specified, then inherit the current mode's (though a // mode can also override it). // virtual void mode (lexer_mode, char pair_separator = '\0', optional<const char*> escapes = nullopt); // Enable attributes recognition for the next token. // void enable_attributes () {state_.top ().attributes = true;} // Expire the current mode early. // void expire_mode () {state_.pop ();} lexer_mode mode () const {return state_.top ().mode;} char pair_separator () const {return state_.top ().sep_pair;} // Scanner. Note that it is ok to call next() again after getting eos. // // If you extend the lexer and add a custom lexer mode, then you must // override next() and handle the custom mode there. // virtual token next (); // Peek at the first character of the next token. Return the character // or '\0' if the next token will be eos. Also return an indicator of // whether the next token will be separated. // pair<char, bool> peek_char (); protected: struct state { lexer_mode mode; bool attributes; char sep_pair; bool sep_space; // Are whitespaces separators (see skip_spaces())? bool sep_newline; // Is newline special (see skip_spaces())? bool quotes; // Recognize quoted fragments. const char* escapes; // Effective escape sequences to recognize. // Word separator characters. For two-character sequence put the first // one in sep_first and the second one in the corresponding position of // sep_second. If it's a single-character sequence, then put space in // sep_second. If there are multiple sequences that start with the same // character, then repeat the first character in sep_first. // const char* sep_first; const char* sep_second; }; token next_eval (); token next_quoted (); // Lex a word assuming current is the top state (which may already have // been "expired" from the top). // virtual token word (state current, bool separated); // Return true if we have seen any spaces. Skipped empty lines // don't count. In other words, we are only interested in spaces // that are on the same line as the following non-space character. // bool skip_spaces (); // Diagnostics. // protected: fail_mark fail; // Lexer state. // protected: lexer (istream& is, const path_name& name, uint64_t line, const char* escapes, bool set_mode) : char_scanner (is, true /* crlf */, line), fail ("error", &name), name_ (name), sep_ (false) { if (set_mode) mode (lexer_mode::normal, '@', escapes); } const path_name& name_; std::stack<state> state_; bool sep_; // True if we skipped spaces in peek(). }; } // Diagnostics plumbing. // namespace butl // ADL { inline build2::location get_location (const butl::char_scanner::xchar& c, const void* data) { using namespace build2; assert (data != nullptr); // E.g., must be &lexer::name_. return location (*static_cast<const path_name*> (data), c.line, c.column); } } #endif // LIBBUILD2_LEXER_HXX