1 files changed, 53 insertions, 17 deletions
diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx
index 148666e..e913829 100644
--- a/libbuild2/lexer.hxx
+++ b/libbuild2/lexer.hxx
@@ -26,14 +26,15 @@ namespace build2
   // mode we don't treat certain characters (e.g., `+`, `=`) as special so
   // that we can use them in the variable values, e.g., `foo = g++`. In
   // contrast, in the variable mode, we restrict certain character (e.g., `/`)
-  // from appearing in the name. The values mode is like value but recogizes
-  // `,` as special (used in contexts where we need to list multiple
-  // values). The attributes/attribute_value modes are like values where each
-  // value is potentially a variable assignment; they don't treat `{` and `}`
-  // as special (so we cannot have name groups in attributes) as well as
-  // recognizes `=` and `]`. The subscript mode is like value but doesn't
-  // treat `{` and `}` as special and recognizes `]`. The eval mode is used in
-  // the evaluation context.
+  // from appearing in the name. Additionally, in the variable mode we
+  // recognize leading `\` as the beginning of the escape sequence ($\n). The
+  // values mode is like value but recognizes `,` as special (used in contexts
+  // where we need to list multiple values). The attributes/attribute_value
+  // modes are like values where each value is potentially a variable
+  // assignment; they don't treat `{` and `}` as special (so we cannot have
+  // name groups in attributes) as well as recognize `=` and `]`. The
+  // subscript mode is like value but doesn't treat `{` and `}` as special and
+  // recognizes `]`. The eval mode is used in the evaluation context.
   //
   // A number of modes are "derived" from the value/values mode by recognizing
   // a few extra characters:
@@ -133,10 +134,23 @@ namespace build2
     const path_name&
     name () const {return name_;}
 
-    // Note: sets mode for the next token. The second argument can be used to
-    // specify the pair separator character (if the mode supports pairs). If
-    // escapes is not specified, then inherit the current mode's (though a
-    // mode can also override it).
+    // Set the lexer mode for the next token or delay this until the end of a
+    // double-quoted token sequence is encountered. The second argument can be
+    // used to specify the pair separator character (if the mode supports
+    // pairs). If escapes is not specified, then inherit the current mode's
+    // (though a mode can also override it).
+    //
+    // Note that there is a common parsing pattern of sensing the language
+    // construct kind we are about to parse by reading its first token,
+    // switching to an appropriate lexing mode, and then parsing the rest. The
+    // problem here is that the first token may start the double-quoted token
+    // sequence, turning the lexer into the double-quoted mode. In this case
+    // switching the lexer mode right away would not be a good idea. Thus,
+    // this function delays the mode switch until the end of the double-quoted
+    // sequence is encountered. Note, however, that such a delay only works
+    // properly if the function is called right after the first quoted token
+    // is read (because any subsequent tokens may end up being parsed in a
+    // nested mode such as variable or eval; see mode_impl() for details).
     //
     virtual void
     mode (lexer_mode,
@@ -153,10 +167,12 @@ namespace build2
       state_.top ().lsbrace_unsep = unsep;
     }
 
-    // Expire the current mode early.
+    // Expire the current mode early or delay this until the end of a
+    // double-quoted token sequence is encountered (see mode() for details on
+    // the delay condition and reasoning).
     //
     void
-    expire_mode () {state_.pop ();}
+    expire_mode ();
 
     lexer_mode
     mode () const {return state_.top ().mode;}
@@ -175,7 +191,7 @@ namespace build2
     virtual token
     next ();
 
-    // Peek at the first two characters of the next token(s). Return the
+    // Peek at the first one/two characters of the next token(s). Return the
     // characters or '\0' if either would be eos. Also return an indicator of
     // whether the next token would be separated. Note: cannot be used to peek
     // at the first character of a line.
@@ -184,6 +200,9 @@ namespace build2
     // mode in which these characters will actually be parsed use the same
     // whitespace separation (the sep_space and sep_newline values).
     //
+    pair<char, bool>
+    peek_char ();
+
     pair<pair<char, char>, bool>
     peek_chars ();
 
@@ -244,7 +263,7 @@ namespace build2
     // been "expired" from the top).
     //
     virtual token
-    word (state current, bool separated);
+    word (const state& current, bool separated);
 
     // Return true in first if we have seen any spaces. Skipped empty lines
     // don't count. In other words, we are only interested in spaces that are
@@ -255,6 +274,20 @@ namespace build2
     pair<bool, bool>
     skip_spaces ();
 
+    // Set state for the next token or delay until the end of a double-quoted
+    // token sequence is encountered (see mode() for details on the delay
+    // condition and reasoning).
+    //
+    void
+    mode_impl (state&&);
+
+    state&
+    current_state () // Return the top of the state stack (state_).
+    {
+      assert (!state_.empty ()); // top() on an empty stack is undefined.
+      return state_.top ();
+    }
+
     // Diagnostics.
     //
   protected:
@@ -283,11 +316,14 @@ namespace build2
     }
 
     const path_name& name_;
-    std::stack<state> state_;
 
     bool sep_; // True if we skipped spaces in peek().
 
   private:
+    // Use current_state(), mode_impl(), and expire_mode().
+    //
+    std::stack<state> state_;
+
     using base = char_scanner<butl::utf8_validator, 2>;
 
     // Buffer for a get()/peek() potential error.