1 files changed, 186 insertions, 52 deletions
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx
index 9b7d01e..04c15be 100644
--- a/libbuild2/lexer.cxx
+++ b/libbuild2/lexer.cxx
@@ -42,6 +42,22 @@ namespace build2
     return make_pair (make_pair (r[0], r[1]), sep_);
   }
 
+  // Skip spaces and peek at (without extracting) the next character. Return
+  // it, or '\0' at the end of stream, together with the sep_ flag.
+  //
+  pair<char, bool> lexer::
+  peek_char ()
+  {
+    auto ss (skip_spaces ());
+    assert (!ss.second);
+    sep_ = ss.first;
+
+    xchar pc (peek ());
+    char ch ('\0');
+    if (!eos (pc)) ch = pc;
+    return make_pair (ch, sep_);
+  }
+
   void lexer::
   mode (lexer_mode m, char ps, optional<const char*> esc, uintptr_t data)
   {
@@ -144,13 +160,15 @@ namespace build2
         break;
       }
     case lexer_mode::foreign:
-      assert (data > 1);
-      // Fall through.
+      {
+        assert (ps == '\0' && data > 1);
+        s = false;
+        break;
+      }
     case lexer_mode::single_quoted:
     case lexer_mode::double_quoted:
       {
-        assert (ps == '\0');
-        s = false;
+        assert (false); // Can only be set manually in word().
         break;
       }
     case lexer_mode::variable:
@@ -162,8 +180,49 @@ namespace build2
     default: assert (false); // Unhandled custom mode.
     }
 
-    state_.push (
-      state {m, data, nullopt, lsb, false, ps, s, n, q, *esc, s1, s2});
+    mode_impl (state {m, data, nullopt, lsb, false, ps, s, n, q, *esc, s1, s2});
+  }
+
+  void lexer::
+  mode_impl (state&& s)
+  {
+    // Outside of double-quoted, or for eval/variable, just push on top.
+    //
+    if (state_.empty ()                                 ||
+        state_.top ().mode != lexer_mode::double_quoted ||
+        s.mode == lexer_mode::eval                      ||
+        s.mode == lexer_mode::variable)
+    {
+      state_.push (move (s));
+      return;
+    }
+
+    // Otherwise delay the switch: slip the new state beneath the quoted one.
+    //
+    state quoted (move (state_.top ()));
+    state_.top () = move (s);
+    state_.push (move (quoted));
+  }
+
+  void lexer::
+  expire_mode ()
+  {
+    // In double-quoted the state to expire is the one beneath the quoted
+    // state, so there must be at least two.
+    //
+    assert (!state_.empty () &&
+            (state_.top ().mode != lexer_mode::double_quoted ||
+             state_.size () > 1));
+
+    if (state_.top ().mode != lexer_mode::double_quoted)
+    {
+      state_.pop (); // Not quoted: simply drop the current state.
+      return;
+    }
+
+    state quoted (move (state_.top ())); // Set the quoted state aside.
+    state_.pop ();                       // Drop its slot.
+    state_.top () = move (quoted);       // Overwrite the state beneath.
   }
 
   token lexer::
@@ -202,9 +261,10 @@ namespace build2
 
     auto make_token = [&sep, ln, cn] (type t, string v = string ())
     {
-      return token (t, move (v),
-                    sep, quote_type::unquoted, false,
-                    ln, cn, token_printer);
+      return token (t, move (v), sep,
+                    quote_type::unquoted, false, false,
+                    ln, cn,
+                    token_printer);
     };
 
     // Handle `[` (do it first to make sure the flag is cleared regardless of
@@ -446,9 +506,10 @@ namespace build2
 
     auto make_token = [sep, ln, cn] (type t, string v = string ())
     {
-      return token (t, move (v),
-                    sep, quote_type::unquoted, false,
-                    ln, cn, token_printer);
+      return token (t, move (v), sep,
+                    quote_type::unquoted, false, false,
+                    ln, cn,
+                    token_printer);
     };
 
     // Handle `[` (do it first to make sure the flag is cleared regardless of
@@ -620,15 +681,14 @@ namespace build2
 
           if (c == '\n' || c == '#' || eos (c))
           {
-            st.hold = token (type::multi_rcbrace,
-                             string (count, '}'),
-                             false, quote_type::unquoted, false,
+            st.hold = token (type::multi_rcbrace, string (count, '}'), false,
+                             quote_type::unquoted, false, false,
                              bln, bcn,
                              token_printer);
 
             lexeme.resize (chop);
-            return token (move (lexeme),
-                          false, quote_type::unquoted, false,
+            return token (move (lexeme), false,
+                          quote_type::unquoted, false, false,
                           ln, cn);
           }
 
@@ -653,9 +713,9 @@ namespace build2
   }
 
   token lexer::
-  word (state st, bool sep)
+  word (const state& rst, bool sep)
   {
-    lexer_mode m (st.mode);
+    lexer_mode m (rst.mode);
 
     xchar c (peek ());
     assert (!eos (c));
@@ -671,33 +731,81 @@ namespace build2
     // quote character.
     //
     bool qcomp (false);
+    bool qfirst (false);
 
-    auto append = [&lexeme, &m, &qcomp] (char c)
+    auto append = [&lexeme, &m, &qcomp, &qfirst] (char c, bool escaped = false)
     {
-      lexeme += c;
+      if (lexeme.empty () && (escaped || m == lexer_mode::double_quoted))
+          qfirst = true;
 
       // An unquoted character after a quoted fragment.
       //
-      if (qcomp && m != lexer_mode::double_quoted)
+      if (m != lexer_mode::double_quoted && qcomp)
         qcomp = false;
+
+      lexeme += c;
     };
 
-    for (; !eos (c); c = peek ())
+    const state* st (&rst);
+    for (bool first (true); !eos (c); first = false, c = peek ())
     {
       // First handle escape sequences.
       //
       if (c == '\\')
       {
-        // In the variable mode we treat the beginning of the escape sequence
-        // as a separator (think \"$foo\").
+        // In the variable mode we treat immediate `\` as the escape sequence
+        // literal and any following as a separator (think \"$foo\").
         //
         if (m == lexer_mode::variable)
-          break;
+        {
+          if (!first)
+            break;
+
+          get ();
+          c = get ();
+
+          if (eos (c))
+            fail (c) << "unterminated escape sequence";
+
+          // For now we only support all the simple C/C++ escape sequences
+          // plus \0 (which in C/C++ is an octal escape sequence).
+          //
+          // In the future we may decide to support more elaborate sequences
+          // such as \xNN, \uNNNN, etc.
+          //
+          // Note: we return it in the literal form instead of translating for
+          // easier printing.
+          //
+          switch (c)
+          {
+          case '\'':
+          case '"':
+          case '?':
+          case '\\':
+          case '0':
+          case 'a':
+          case 'b':
+          case 'f':
+          case 'n':
+          case 'r':
+          case 't':
+          case 'v': lexeme = c; break;
+          default:
+            fail (c) << "unknown escape sequence \\" << c;
+          }
+
+          state_.pop ();
+          return token (type::escape,
+                        move (lexeme),
+                        sep,
+                        qtype, qcomp, qfirst,
+                        ln, cn);
+        }
 
         get ();
         xchar p (peek ());
 
-        const char* esc (st.escapes);
+        const char* esc (st->escapes);
 
         if (esc == nullptr ||
             (*esc != '\0' && !eos (p) && strchr (esc, p) != nullptr))
@@ -708,12 +816,12 @@ namespace build2
             fail (p) << "unterminated escape sequence";
 
           if (p != '\n') // Ignore if line continuation.
-            append (p);
+            append (p, true);
 
           continue;
         }
         else
-          unget (c); // Treat as a normal character.
+          unget (c); // Fall through to treat as a normal character.
       }
 
       bool done (false);
@@ -742,8 +850,8 @@ namespace build2
             get ();
             state_.pop ();
 
-            st = state_.top ();
-            m = st.mode;
+            st = &state_.top ();
+            m = st->mode;
             continue;
           }
         }
@@ -752,19 +860,17 @@ namespace build2
       //
       else if (m == lexer_mode::variable)
       {
-        bool first (lexeme.empty ());
-
         // Handle special variable names, if any.
         //
-        if (first        &&
-            st.data != 0 &&
-            strchr (reinterpret_cast<const char*> (st.data), c) != nullptr)
+        if (first         &&
+            st->data != 0 &&
+            strchr (reinterpret_cast<const char*> (st->data), c) != nullptr)
         {
           get ();
           lexeme += c;
           done = true;
         }
-        else if (c != '_' && !(first ? alpha (c) : alnum (c)))
+        else if (c != '_' && !(lexeme.empty () ? alpha (c) : alnum (c)))
         {
           if (c != '.')
             done = true;
@@ -784,17 +890,17 @@ namespace build2
       {
         // First check if it's a pair separator.
         //
-        if (c == st.sep_pair)
+        if (c == st->sep_pair)
           done = true;
         else
         {
           // Then see if this character or character sequence is a separator.
           //
-          for (const char* p (strchr (st.sep_first, c));
+          for (const char* p (strchr (st->sep_first, c));
                p != nullptr;
                p = done ? nullptr : strchr (p + 1, c))
           {
-            char s (st.sep_second[p - st.sep_first]);
+            char s (st->sep_second[p - st->sep_first]);
 
             // See if it has a second.
             //
@@ -812,8 +918,21 @@ namespace build2
         // Handle single and double quotes if enabled for this mode and unless
         // they were considered separators.
         //
-        if (st.quotes && !done)
+        if (st->quotes && !done)
         {
+          auto quoted_mode = [this] (lexer_mode m)
+          {
+            // In the double-quoted mode we only do effective escaping of the
+            // special `$("\` characters, line continuations, plus `)` for
+            // symmetry. Nothing can be escaped in single-quoted.
+            //
+            const char* esc (m == lexer_mode::double_quoted ? "$()\"\\\n" : "");
+
+            state_.push (state {
+              m, 0, nullopt, false, false, '\0', false, true, true,
+              esc, nullptr, nullptr});
+          };
+
           switch (c)
           {
           case '\'':
@@ -821,7 +940,7 @@ namespace build2
               // Enter the single-quoted mode in case the derived lexer needs
               // to notice this.
               //
-              mode (lexer_mode::single_quoted);
+              quoted_mode (lexer_mode::single_quoted);
 
               switch (qtype)
               {
@@ -840,6 +959,12 @@ namespace build2
                 break;
               }
 
+              // Note that we will treat plus in ''+ as quoted. This is
+              // probably the better option considering the "$empty"+ case.
+              //
+              if (lexeme.empty ())
+                qfirst = true;
+
               get ();
               for (c = get (); !eos (c) && c != '\''; c = get ())
                 lexeme += c;
@@ -854,9 +979,10 @@ namespace build2
             {
               get ();
 
-              mode (lexer_mode::double_quoted);
-              st = state_.top ();
-              m = st.mode;
+              quoted_mode (lexer_mode::double_quoted);
+
+              st = &state_.top ();
+              m = st->mode;
 
               switch (qtype)
               {
@@ -875,6 +1001,11 @@ namespace build2
                 break;
               }
 
+              // The same reasoning as above.
+              //
+              if (lexeme.empty ())
+                qfirst = true;
+
               continue;
             }
           }
@@ -905,7 +1036,7 @@ namespace build2
     if (m == lexer_mode::variable)
       state_.pop ();
 
-    return token (move (lexeme), sep, qtype, qcomp, ln, cn);
+    return token (move (lexeme), sep, qtype, qcomp, qfirst, ln, cn);
   }
 
   pair<bool, bool> lexer::
@@ -973,7 +1104,7 @@ namespace build2
             if ((c = peek ()) == '\\')
             {
               get ();
-              if ((c = peek ()) == '\n')
+              if ((c = peek ()) == '\n' || eos (c))
                 return true;
             }
 
@@ -984,15 +1115,16 @@ namespace build2
           {
             // Scan until we see the closing one.
             //
-            for (; !eos (c); c = peek ())
+            for (;;)
             {
-              get ();
               if (c == '#' && ml ())
                 break;
-            }
 
-            if (eos (c))
-              fail (c) << "unterminated multi-line comment";
+              if (eos (c = peek ()))
+                fail (c) << "unterminated multi-line comment";
+
+              get ();
+            }
           }
           else
           {
@@ -1006,6 +1138,8 @@ namespace build2
         }
       case '\\':
         {
+          // See if this is line continuation.
+          //
           get ();
 
           if (peek () == '\n')