Reimplement double quote lexing to avoid "implied quote" trick

author: Boris Kolpackov <boris@codesynthesis.com> 2015-09-09 10:20:52 +0200
committer: Boris Kolpackov <boris@codesynthesis.com> 2015-09-09 10:20:52 +0200
commit: e3b6dc455ab5c98606e38983bd19426ae346f469 (patch)
tree: 62f145eac81c7c6f955ca9e63df17aa07c392c11
parent: ccca13f8eadef31f2df873cb505ebca98501c45a (diff)
6 files changed, 105 insertions, 105 deletions
diff --git a/build/lexer b/build/lexer
index 0740f14..37c7807 100644
--- a/build/lexer
+++ b/build/lexer
@@ -69,13 +69,10 @@ namespace build
 
   private:
     token
-    name (bool separated);
-
-    void
-    single_quote (std::string&);
+    next_quoted ();
 
-    bool
-    double_quote (std::string&);
+    token
+    name (bool separated);
 
     // Return true we have seen any spaces. Skipped empty lines don't
     // count. In other words, we are only interested in spaces that
diff --git a/build/lexer.cxx b/build/lexer.cxx
index 9c76377..6da18eb 100644
--- a/build/lexer.cxx
+++ b/build/lexer.cxx
@@ -13,29 +13,12 @@ namespace build
   {
     lexer_mode m (mode_.top ());
 
-    // If we are in the quoted mode, then this means we have seen a
-    // variable expansion ($) and had to "break" the quoted sequence
-    // into multiple "concatenated" tokens. So what we have now is
-    // the "tail" of that quoted sequence which we need to continue
-    // scanning. To make this work auto-magically (well, almost) we
-    // are going to use a little trick: we will "pretend" that the
-    // next character is the opening quote. After all, a sequence
-    // like "$foo bar" is semantically equivalent to "$foo"" bar".
+    // For some modes we have dedicated implementations of next().
     //
-    if (m == lexer_mode::quoted)
+    switch (m)
     {
-      xchar c (peek ());
-
-      // Detect the beginning of the "break". After that, we rely
-      // on the caller switching to the variable mode.
-      //
-      if (c != '$')
-      {
-        mode_.pop ();  // As if we saw closing quote.
-        c.value = '"'; // Keep line/column information.
-        unget (c);
-        return name (false);
-      }
+    case lexer_mode::quoted: return next_quoted ();
+    default: break;
     }
 
     bool sep (skip_spaces ());
@@ -120,6 +103,23 @@ namespace build
   }
 
   token lexer::
+  next_quoted ()
+  {
+    xchar c (peek ());
+
+    if (eos (c))
+      fail (c) << "unterminated double-quoted sequence";
+
+    uint64_t ln (c.line), cn (c.column);
+
+    switch (c)
+    {
+    case '$': get (); return token (token_type::dollar, false, ln, cn);
+    default:          return name (false);
+    }
+  }
+
+  token lexer::
   name (bool sep)
   {
     xchar c (peek ());
@@ -140,9 +140,11 @@ namespace build
         break;
 
       // The following characters are not treated as special in the
-      // value or pairs mode.
+      // value/pairs and quoted modes.
       //
-      if (m != lexer_mode::value && m != lexer_mode::pairs)
+      if (m != lexer_mode::value &&
+          m != lexer_mode::pairs &&
+          m != lexer_mode::quoted)
       {
         switch (c)
         {
@@ -178,61 +180,104 @@ namespace build
           break;
       }
 
+      // If we are quoted, these are ordinary characters.
+      //
+      if (m != lexer_mode::quoted)
+      {
+        switch (c)
+        {
+        case ' ':
+        case '\t':
+        case '\n':
+        case '#':
+        case '{':
+        case '}':
+        case '(':
+        case ')':
+          {
+            done = true;
+            break;
+          }
+        case '\\':
+          {
+            get ();
+            lexeme += escape ();
+            continue;
+          }
+        case '\'':
+          {
+            // If we are in the variable mode, then treat quote as just
+            // another separator.
+            //
+            if (m == lexer_mode::variable)
+            {
+              done = true;
+              break;
+            }
+            else
+            {
+              get ();
+
+              for (c = get (); !eos (c) && c != '\''; c = get ())
+                lexeme += c;
+
+              if (eos (c))
+                fail (c) << "unterminated single-quoted sequence";
+
+              continue;
+            }
+          }
+        }
+
+        if (done)
+          break;
+      }
+
       switch (c)
       {
-      case ' ':
-      case '\t':
-      case '\n':
-      case '#':
-      case '{':
-      case '}':
       case '$':
-      case '(':
-      case ')':
         {
           done = true;
           break;
         }
-      case '\\':
-        {
-          get ();
-          lexeme += escape ();
-          break;
-        }
-      case '\'':
       case '\"':
         {
-          // If we are in the variable mode, then treat quotes as just
+          // If we are in the variable mode, then treat quote as just
           // another separator.
           //
           if (m == lexer_mode::variable)
+          {
             done = true;
+            break;
+          }
           else
           {
             get ();
 
-            if (c == '\'')
-              single_quote (lexeme);
+            if (m == lexer_mode::quoted)
+              mode_.pop ();
             else
-            {
               mode_.push (lexer_mode::quoted);
-              done = double_quote (lexeme);
-            }
+
+            m = mode_.top ();
+            continue;
           }
-          break;
         }
       default:
         {
           get ();
           lexeme += c;
-          break;
+          continue;
         }
       }
 
-      if (done)
-        break;
+      assert (done);
+      break;
     }
 
+    if (m == lexer_mode::quoted && eos (c))
+      fail (c) << "unterminated double-quoted sequence";
+
     // Expire variable mode at the end of the name.
     //
     if (m == lexer_mode::variable)
@@ -241,56 +286,6 @@ namespace build
     return token (lexeme, sep, ln, cn);
   }
 
-  // Assuming the previous character is the opening single quote, scan
-  // the stream until the closing quote or eos, accumulating characters
-  // in between in lexeme. Fail if eos is reached before the closing
-  // quote.
-  //
-  void lexer::
-  single_quote (string& lexeme)
-  {
-    xchar c (get ());
-
-    for (; !eos (c) && c != '\''; c = get ())
-      lexeme += c;
-
-    if (eos (c))
-      fail (c) << "unterminated single-quoted sequence";
-  }
-
-  // Assuming the previous character is the opening double quote, scan
-  // the stream until the closing quote, $, or eos, accumulating
-  // characters in between in lexeme. Return false if we stopped
-  // because of the closing quote (which means the normal name
-  // scanning can continue) and true if we stopped at $ (meaning this
-  // name is done and what follows is another token). Fail if eos is
-  // reached before the closing quote.
-  //
-  bool lexer::
-  double_quote (string& lexeme)
-  {
-    xchar c (peek ());
-
-    for (; !eos (c); c = peek ())
-    {
-      if (c == '$')
-        return true;
-
-      get ();
-
-      if (c == '"')
-      {
-        mode_.pop (); // Expire quoted mode.
-        return false;
-      }
-
-      lexeme += c;
-    }
-
-    fail (c) << "unterminated double-quoted sequence";
-    return false; // Never reached.
-  }
-
   bool lexer::
   skip_spaces ()
   {
diff --git a/tests/lexer/driver.cxx b/tests/lexer/driver.cxx
index e3543da..a3819f5 100644
--- a/tests/lexer/driver.cxx
+++ b/tests/lexer/driver.cxx
@@ -99,6 +99,7 @@ main ()
   assert (lex ("f\"oo$ba\"r") == tokens ({"foo", "$", "bar", ""}));
 
   assert (lex ("\"foo bar") == tokens ({"<lexer error>"}));
+  assert (lex ("\"foo $") == tokens ({"foo ", "$", "<lexer error>"}));
   assert (lex ("\"foo $bar") == tokens ({"foo ", "$", "<lexer error>"}));
 
   // Combinations.
diff --git a/tests/quote/buildfile b/tests/quote/buildfile
index 931bc36..6dd22b4 100644
--- a/tests/quote/buildfile
+++ b/tests/quote/buildfile
@@ -15,4 +15,8 @@ print "[ $bar ]"
 print "[ $foo $bar ]"
 print "[ $foo/$bar ]"
 
+print $foo'bar'
+print $foo"$bar"
+print "$foo"bar
+
 ./:
diff --git a/tests/quote/test.out b/tests/quote/test.out
index 802f28f..216b1c8 100644
--- a/tests/quote/test.out
+++ b/tests/quote/test.out
@@ -9,3 +9,6 @@ fo o bar
 [  bar  ]
 [ fo o  bar  ]
 [ fo o/ bar  ]
+fo obar
+fo o bar 
+fo obar
diff --git a/tests/quote/test.sh b/tests/quote/test.sh
index 145ea6b..b898b3c 100755
--- a/tests/quote/test.sh
+++ b/tests/quote/test.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
 
-valgrind -q b -q | diff test.out -
+valgrind -q b -q | diff -u test.out -
author	Boris Kolpackov <boris@codesynthesis.com>	2015-09-09 10:20:52 +0200
committer	Boris Kolpackov <boris@codesynthesis.com>	2015-09-09 10:20:52 +0200
commit	e3b6dc455ab5c98606e38983bd19426ae346f469 (patch)
tree	62f145eac81c7c6f955ca9e63df17aa07c392c11
parent	ccca13f8eadef31f2df873cb505ebca98501c45a (diff)