From 1270101f4267ecd187bb604190d004daaae341b7 Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Fri, 4 Nov 2016 08:47:26 +0200
Subject: Various testscript lexer/parser fixes

---
 build2/lexer     | 41 ++++++++++++++++++++++++-----------------
 build2/lexer.cxx | 30 +++++++++++++++++++-----------
 build2/parser    |  2 ++
 3 files changed, 45 insertions(+), 28 deletions(-)

(limited to 'build2')
diff --git a/build2/lexer b/build2/lexer
index c5c3857..f7f7b82 100644
--- a/build2/lexer
+++ b/build2/lexer
@@ -31,6 +31,12 @@ namespace build2
   // automatically reset after the end of the line. The variable mode is reset
   // after the word token. And the eval mode is reset after the closing ')'.
   //
+  // Note that normally it is only safe to switch mode when the current token
+  // is not quoted (or, more generally, when you are not in the double-quoted
+  // mode) unless the mode treats the double-quote as a separator (e.g.,
+  // variable name mode). Failing that, your mode (which will now be the top
+  // of the mode stack) will prevent proper recognition of the closing quote.
+  //
 
   // Extendable/inheritable enum-like class.
   //
@@ -102,6 +108,23 @@ namespace build2
     peek_char ();
 
   protected:
+    struct state // Per-mode lexing state; kept in the state_ stack.
+    {
+      lexer_mode mode; // Lexer mode this state describes.
+
+      char sep_pair;  // Character lexed as a pair separator token.
+      bool sep_space; // Are whitespaces separators (see skip_spaces())?
+
+      // Word separator characters. For a two-character sequence, put the
+      // first character in sep_first and the second in the corresponding
+      // position of sep_second. For a single-character sequence, put a space
+      // in sep_second. If multiple sequences start with the same character,
+      // repeat that character in sep_first.
+      //
+      const char* sep_first;
+      const char* sep_second;
+    };
+
     // If you extend the lexer and add a custom lexer mode, then you must
     // override next_impl() and handle the custom mode there.
     //
@@ -115,7 +138,7 @@ namespace build2
     next_quoted ();
 
     virtual token
-    word (bool separated);
+    word (state, bool separated);
 
     // Return true if we have seen any spaces. Skipped empty lines
     // don't count. In other words, we are only interested in spaces
@@ -161,22 +184,6 @@ namespace build2
     const char* escapes_;
     void (*processor_) (token&, const lexer&);
 
-    struct state
-    {
-      lexer_mode mode;
-
-      char sep_pair;
-      bool sep_space; // Are whitespaces separators (see skip_spaces())?
-
-      // Word separator characters. For two-character sequence put the first
-      // one in sep_first and the second one in the corresponding position of
-      // sep_second. If it's a single-character sequence, then put space in
-      // sep_second. If there are multiple sequences that start with the same
-      // character, then repeat the first character in sep_first.
-      //
-      const char* sep_first;
-      const char* sep_second;
-    };
     std::stack<state> state_;
 
     bool sep_; // True if we skipped spaces in peek().
diff --git a/build2/lexer.cxx b/build2/lexer.cxx
index b188396..c84b102 100644
--- a/build2/lexer.cxx
+++ b/build2/lexer.cxx
@@ -78,7 +78,8 @@ namespace build2
   token lexer::
   next_impl ()
   {
-    lexer_mode m (state_.top ().mode);
+    const state& st (state_.top ());
+    lexer_mode m (st.mode);
 
     // For some modes we have dedicated imlementations of next().
     //
@@ -108,7 +109,7 @@ namespace build2
     // Handle pair separator.
     //
     if ((m == lexer_mode::normal || m == lexer_mode::value) &&
-        c == state_.top ().sep_pair)
+        c == st.sep_pair)
       return make_token (type::pair_separator);
 
     switch (c)
@@ -168,7 +169,7 @@ namespace build2
     // Otherwise it is a word.
     //
     unget (c);
-    return word (sep);
+    return word (st, sep);
   }
 
   token lexer::
@@ -180,6 +181,8 @@ namespace build2
     if (eos (c))
       fail (c) << "unterminated evaluation context";
 
+    const state& st (state_.top ());
+
     uint64_t ln (c.line), cn (c.column);
 
     auto make_token = [sep, ln, cn] (type t)
@@ -193,7 +196,7 @@ namespace build2
 
     // Handle pair separator.
     //
-    if (c == state_.top ().sep_pair)
+    if (c == st.sep_pair)
       return make_token (type::pair_separator);
 
     // Note: we don't treat [ and ] as special here. Maybe can use them for
@@ -242,7 +245,7 @@ namespace build2
     // Otherwise it is a word.
     //
     unget (c);
-    return word (sep);
+    return word (st, sep);
   }
 
   token lexer::
@@ -264,13 +267,13 @@ namespace build2
     // Otherwise it is a word.
     //
     unget (c);
-    return word (false);
+    return word (state_.top (), false);
   }
 
   token lexer::
-  word (bool sep)
+  word (state st, bool sep)
   {
-    lexer_mode m (state_.top ().mode);
+    lexer_mode m (st.mode);
 
     xchar c (peek ());
     assert (!eos (c));
@@ -337,7 +340,9 @@ namespace build2
           {
             get ();
             state_.pop ();
-            m = state_.top ().mode;
+
+            st = state_.top ();
+            m = st.mode;
             continue;
           }
         }
@@ -366,7 +371,6 @@ namespace build2
       {
         // First check if it's a pair separator.
         //
-        const state& st (state_.top ());
         if (c == st.sep_pair)
           done = true;
         else
@@ -421,7 +425,11 @@ namespace build2
           case '\"':
             {
               get ();
-              mode ((m = lexer_mode::double_quoted));
+
+              mode (lexer_mode::double_quoted);
+              st = state_.top ();
+              m = st.mode;
+
               quoted = true;
               continue;
             }
diff --git a/build2/parser b/build2/parser
index 89f42b8..f038b97 100644
--- a/build2/parser
+++ b/build2/parser
@@ -445,6 +445,8 @@ namespace build2
     const fail_mark<failed> fail;
 
   protected:
+    bool pre_parse_ = false;
+
     bool boot_;
 
     const path* path_; // Current path.
-- 
cgit v1.1