diff options
Diffstat (limited to 'libbuild2/lexer.hxx')
-rw-r--r-- | libbuild2/lexer.hxx | 70 |
1 files changed, 53 insertions, 17 deletions
diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx index 148666e..e913829 100644 --- a/libbuild2/lexer.hxx +++ b/libbuild2/lexer.hxx @@ -26,14 +26,15 @@ namespace build2 // mode we don't treat certain characters (e.g., `+`, `=`) as special so // that we can use them in the variable values, e.g., `foo = g++`. In // contrast, in the variable mode, we restrict certain character (e.g., `/`) - // from appearing in the name. The values mode is like value but recogizes - // `,` as special (used in contexts where we need to list multiple - // values). The attributes/attribute_value modes are like values where each - // value is potentially a variable assignment; they don't treat `{` and `}` - // as special (so we cannot have name groups in attributes) as well as - // recognizes `=` and `]`. The subscript mode is like value but doesn't - // treat `{` and `}` as special and recognizes `]`. The eval mode is used in - // the evaluation context. + // from appearing in the name. Additionally, in the variable mode we + // recognize leading `\` as the beginning of the escape sequence ($\n). The + // values mode is like value but recognizes `,` as special (used in contexts + // where we need to list multiple values). The attributes/attribute_value + // modes are like values where each value is potentially a variable + // assignment; they don't treat `{` and `}` as special (so we cannot have + // name groups in attributes) as well as recognizes `=` and `]`. The + // subscript mode is like value but doesn't treat `{` and `}` as special and + // recognizes `]`. The eval mode is used in the evaluation context. // // A number of modes are "derived" from the value/values mode by recognizing // a few extra characters: @@ -133,10 +134,23 @@ namespace build2 const path_name& name () const {return name_;} - // Note: sets mode for the next token. The second argument can be used to - // specify the pair separator character (if the mode supports pairs). 
If - // escapes is not specified, then inherit the current mode's (though a - // mode can also override it). + // Set the lexer mode for the next token or delay this until the end of a + // double-quoted token sequence is encountered. The second argument can be + // used to specify the pair separator character (if the mode supports + // pairs). If escapes is not specified, then inherit the current mode's + // (though a mode can also override it). + // + // Note that there is a common parsing pattern of sensing the language + // construct kind we are about to parse by reading its first token, + // switching to an appropriate lexing mode, and then parsing the rest. The + // problem here is that the first token may start the double-quoted token + // sequence, turning the lexer into the double-quoted mode. In this case + // switching the lexer mode right away would not be a good idea. Thus, + // this function delays the mode switch until the end of the double-quoted + // sequence is encountered. Note, however, that such a delay only works + // properly if the function is called right after the first quoted token + // is read (because any subsequent tokens may end up being parsed in a + // nested mode such as variable or eval; see mode_impl() for details). // virtual void mode (lexer_mode, @@ -153,10 +167,12 @@ namespace build2 state_.top ().lsbrace_unsep = unsep; } - // Expire the current mode early. + // Expire the current mode early or delay this until the end of a + // double-quoted token sequence is encountered (see mode() for details on + // the delay condition and reasoning). // void - expire_mode () {state_.pop ();} + expire_mode (); lexer_mode mode () const {return state_.top ().mode;} @@ -175,7 +191,7 @@ namespace build2 virtual token next (); - // Peek at the first two characters of the next token(s). Return the + // Peek at the first one/two characters of the next token(s). Return the // characters or '\0' if either would be eos. 
Also return an indicator of // whether the next token would be separated. Note: cannot be used to peek // at the first character of a line. @@ -184,6 +200,9 @@ namespace build2 // mode in which these characters will actually be parsed use the same // whitespace separation (the sep_space and sep_newline values). // + pair<char, bool> + peek_char (); + pair<pair<char, char>, bool> peek_chars (); @@ -244,7 +263,7 @@ namespace build2 // been "expired" from the top). // virtual token - word (state current, bool separated); + word (const state& current, bool separated); // Return true in first if we have seen any spaces. Skipped empty lines // don't count. In other words, we are only interested in spaces that are @@ -255,6 +274,20 @@ namespace build2 pair<bool, bool> skip_spaces (); + // Set state for the next token or delay until the end of a double-quoted + // token sequence is encountered (see mode() for details on the delay + // condition and reasoning). + // + void + mode_impl (state&&); + + state& + current_state () + { + assert (!state_.empty ()); + return state_.top (); + } + // Diagnostics. // protected: @@ -283,11 +316,14 @@ namespace build2 } const path_name& name_; - std::stack<state> state_; bool sep_; // True if we skipped spaces in peek(). private: + // Use current_state(), mode_impl(), and expire_mode(). + // + std::stack<state> state_; + using base = char_scanner<butl::utf8_validator, 2>; // Buffer for a get()/peek() potential error. |