Distinguish token quoting type and completeness

author: Boris Kolpackov <boris@codesynthesis.com> 2016-11-25 11:18:34 +0200
committer: Boris Kolpackov <boris@codesynthesis.com> 2016-11-25 11:18:34 +0200
commit: 28f8338ded34f160e0083da9be4679bc778be7ca (patch)
tree: 7bd01311683d835f946c73d7d8220f552bae718f /build2
parent: f32bb0aceb00cfa4bd04eea72f8fa2fe02b738b3 (diff)
5 files changed, 109 insertions, 28 deletions
diff --git a/build2/lexer.cxx b/build2/lexer.cxx
index cf8a789..b73c291 100644
--- a/build2/lexer.cxx
+++ b/build2/lexer.cxx
@@ -295,7 +295,24 @@ namespace build2
     uint64_t ln (c.line), cn (c.column);
 
     string lexeme;
-    bool quoted (m == lexer_mode::double_quoted);
+    quote_type qtype (m == lexer_mode::double_quoted
+                      ? quote_type::double_
+                      : quote_type::unquoted);
+
+    // If we are already in the quoted mode then we didn't start with the
+    // quote character.
+    //
+    bool qcomp (false);
+
+    auto append = [&lexeme, &m, &qcomp] (char c)
+    {
+      lexeme += c;
+
+      // An unquoted character after a quoted fragment.
+      //
+      if (qcomp && m != lexer_mode::double_quoted)
+        qcomp = false;
+    };
 
     for (; !eos (c); c = peek ())
     {
@@ -321,7 +338,7 @@ namespace build2
             fail (p) << "unterminated escape sequence";
 
           if (p != '\n') // Ignore if line continuation.
-            lexeme += p;
+            append (p);
 
           continue;
         }
@@ -424,6 +441,22 @@ namespace build2
               //
               mode (lexer_mode::single_quoted);
 
+              switch (qtype)
+              {
+              case quote_type::unquoted:
+                qtype = quote_type::single;
+                qcomp = lexeme.empty ();
+                break;
+              case quote_type::single:
+                qcomp = false; // Non-contiguous.
+                break;
+              case quote_type::double_:
+                qtype = quote_type::mixed;
+              case quote_type::mixed:
+                qcomp = false;
+                break;
+              }
+
               get ();
               for (c = get (); !eos (c) && c != '\''; c = get ())
                 lexeme += c;
@@ -432,8 +465,6 @@ namespace build2
                 fail (c) << "unterminated single-quoted sequence";
 
               state_.pop ();
-
-              quoted = true;
               continue;
             }
           case '\"':
@@ -444,7 +475,22 @@ namespace build2
               st = state_.top ();
               m = st.mode;
 
-              quoted = true;
+              switch (qtype)
+              {
+              case quote_type::unquoted:
+                qtype = quote_type::double_;
+                qcomp = lexeme.empty ();
+                break;
+              case quote_type::double_:
+                qcomp = false; // Non-contiguous.
+                break;
+              case quote_type::single:
+                qtype = quote_type::mixed;
+              case quote_type::mixed:
+                qcomp = false;
+                break;
+              }
+
               continue;
             }
           }
@@ -455,19 +501,27 @@ namespace build2
         break;
 
       get ();
-      lexeme += c;
+      append (c);
     }
 
-    if (eos (c) && m == lexer_mode::double_quoted)
-      fail (c) << "unterminated double-quoted sequence";
+    if (m == lexer_mode::double_quoted)
+    {
+      if (eos (c))
+        fail (c) << "unterminated double-quoted sequence";
+
+      // If we are still in the quoted mode then we didn't end with the quote
+      // character.
+      //
+      if (qcomp)
+        qcomp = false;
+    }
 
     // Expire variable mode at the end of the word.
     //
     if (m == lexer_mode::variable)
       state_.pop ();
 
-    return token (move (lexeme), sep, quoted, ln, cn);
-
+    return token (move (lexeme), sep, qtype, qcomp, ln, cn);
   }
 
   bool lexer::
diff --git a/build2/parser.cxx b/build2/parser.cxx
index 5f9850d..c2737cb 100644
--- a/build2/parser.cxx
+++ b/build2/parser.cxx
@@ -2154,7 +2154,10 @@ namespace build2
             tt != type::lparen) || peeked ().separated))
       {
         tt = type::word;
-        t = token (move (concat_str), true, false, t.line, t.column);
+        t = token (move (concat_str),
+                   true,
+                   quote_type::unquoted, false,
+                   t.line, t.column);
         concat = false;
       }
       else if (!first)
@@ -2792,7 +2795,7 @@ namespace build2
     //
     // See tests/keyword.
     //
-    if (!t.quoted)
+    if (t.qtype == quote_type::unquoted)
     {
       // We cannot peek at the whole token here since it might have to be
       // lexed in a different mode. So peek at its first character.
diff --git a/build2/test/script/lexer.cxx b/build2/test/script/lexer.cxx
index 5e6c66a..19e7498 100644
--- a/build2/test/script/lexer.cxx
+++ b/build2/test/script/lexer.cxx
@@ -137,7 +137,7 @@ namespace build2
           break;
         }
 
-        if (r.quoted)
+        if (r.qtype != quote_type::unquoted)
           ++quoted_;
 
         return r;
@@ -448,7 +448,10 @@ namespace build2
           lexeme += c;
         }
 
-        return token (move (lexeme), false, false, ln, cn);
+        return token (move (lexeme),
+                      false,
+                      quote_type::unquoted, false,
+                      ln, cn);
       }
 
       token lexer::
@@ -480,7 +483,10 @@ namespace build2
         }
 
         state_.pop (); // Expire the variable mode.
-        return token (move (lexeme), sep, false, ln, cn);
+        return token (move (lexeme),
+                      sep,
+                      quote_type::unquoted, false,
+                      ln, cn);
       }
     }
   }
diff --git a/build2/test/script/parser.cxx b/build2/test/script/parser.cxx
index 9afef75..a116873 100644
--- a/build2/test/script/parser.cxx
+++ b/build2/test/script/parser.cxx
@@ -321,7 +321,7 @@ namespace build2
             //
             lt = line_type::cmd; // Default.
 
-            if (tt == type::word && !t.quoted)
+            if (tt == type::word && t.qtype == quote_type::unquoted)
             {
               const string& n (t.value);
 
@@ -353,7 +353,7 @@ namespace build2
             //
             lt = line_type::cmd; // Default.
 
-            if (tt == type::word && !t.quoted)
+            if (tt == type::word && t.qtype == quote_type::unquoted)
             {
               const string& n (t.value);
 
@@ -719,7 +719,7 @@ namespace build2
           const token& p (peeked ());
           const location ll (get_location (p));
 
-          if (pt == type::word && !p.quoted)
+          if (pt == type::word && p.qtype == quote_type::unquoted)
           {
             if      (p.value == "elif")  lt = line_type::cmd_elif;
             else if (p.value == "elif!") lt = line_type::cmd_elifn;
@@ -1652,7 +1652,7 @@ namespace build2
                   //
                   next (t, tt);
 
-                  if (tt != type::word || t.quoted)
+                  if (tt != type::word || t.qtype != quote_type::unquoted)
                     fail (l) << "expected here-document end marker";
 
                   hd.push_back (here_doc {0, 0, 0, move (t.value), nn});
@@ -1751,7 +1751,8 @@ namespace build2
               // quoted (note that the current token is "next" and is not part
               // of this).
               //
-              bool q ((quoted () - (t.quoted ? 1 : 0)) != 0);
+              bool q ((quoted () -
+                       (t.qtype != quote_type::unquoted ? 1 : 0)) != 0);
 
               for (name& n: ns)
               {
@@ -2074,7 +2075,9 @@ namespace build2
           // Check if this is the end marker. For starters, it should be a
           // single, unquoted word followed by a newline.
           //
-          if (tt == type::word && !t.quoted && peek () == type::newline)
+          if (tt == type::word &&
+              t.qtype == quote_type::unquoted &&
+              peek () == type::newline)
           {
             const string& v (t.value);
 
@@ -2652,7 +2655,7 @@ namespace build2
           // Examine tokens we have replayed since last reset.
           //
           for (size_t i (replay_quoted_); i != replay_i_; ++i)
-            if (replay_data_[i].token.quoted)
+            if (replay_data_[i].token.qtype != quote_type::unquoted)
               ++r;
         }
 
@@ -2663,14 +2666,14 @@ namespace build2
       reset_quoted (token& cur)
       {
         if (replay_ != replay::play)
-          lexer_->reset_quoted (cur.quoted ? 1 : 0);
+          lexer_->reset_quoted (cur.qtype != quote_type::unquoted ? 1 : 0);
         else
         {
           replay_quoted_ = replay_i_ - 1;
 
           // Must be the same token.
           //
-          assert (replay_data_[replay_quoted_].token.quoted == cur.quoted);
+          assert (replay_data_[replay_quoted_].token.qtype == cur.qtype);
         }
       }
 
diff --git a/build2/token b/build2/token
index b3ebf5b..df25d4c 100644
--- a/build2/token
+++ b/build2/token
@@ -56,6 +56,11 @@ namespace build2
     value_type v_;
   };
 
+  // Token can be unquoted, single-quoted ('') or double-quoted (""). It can
+  // also be mixed.
+  //
+  enum class quote_type {unquoted, single, double_, mixed};
+
   class token;
 
   void
@@ -68,7 +73,13 @@ namespace build2
 
     token_type type;
     bool separated; // Whitespace-separated from the previous token.
-    bool quoted;    // Word (or some part of it) was quoted.
+
+    // Quoting can be complete, where the token starts and ends with the quote
+    // characters and quoting is contiguous, or partial, where only some
+    // part(s) of the token are quoted or quoting continues to the next token.
+    //
+    quote_type qtype;
+    bool qcomp;
 
     string value;   // Only valid for word.
 
@@ -82,12 +93,16 @@ namespace build2
         : token (token_type::eos, false, 0, 0, token_printer) {}
 
     token (token_type t, bool s, uint64_t l, uint64_t c, printer_type* p)
-        : type (t), separated (s), quoted (false),
+        : type (t), separated (s), qtype (quote_type::unquoted), qcomp (false),
           line (l), column (c),
           printer (p) {}
 
-    token (string v, bool s, bool q, uint64_t l, uint64_t c)
-        : type (token_type::word), separated (s), quoted (q), value (move (v)),
+    token (string v, bool s,
+           quote_type qt, bool qc,
+           uint64_t l, uint64_t c)
+        : type (token_type::word), separated (s),
+          qtype (qt), qcomp (qc),
+          value (move (v)),
           line (l), column (c),
           printer (&token_printer) {}
   };
author	Boris Kolpackov <boris@codesynthesis.com>	2016-11-25 11:18:34 +0200
committer	Boris Kolpackov <boris@codesynthesis.com>	2016-11-25 11:18:34 +0200
commit	28f8338ded34f160e0083da9be4679bc778be7ca (patch)
tree	7bd01311683d835f946c73d7d8220f552bae718f /build2
parent	f32bb0aceb00cfa4bd04eea72f8fa2fe02b738b3 (diff)