From bb02e152dc036879ab0b2d1d8aa2cb19084b8e16 Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Tue, 25 May 2021 13:42:41 +0200
Subject: Recognize quoting of first character in token

Use this to relax the pattern inclusion/exclusion syntax to only require
unquoted +/-.
---
 libbuild2/lexer+quoting.test.testscript | 30 ++++++++++++++----
 libbuild2/lexer.cxx                     | 55 ++++++++++++++++++++++-----------
 libbuild2/lexer.test.cxx                |  6 +++-
 libbuild2/parser.cxx                    | 48 +++++++++++++++++-----------
 libbuild2/script/lexer.cxx              |  4 +--
 libbuild2/test/script/lexer.cxx         |  5 ++-
 libbuild2/token.hxx                     | 25 ++++++++++-----
 7 files changed, 118 insertions(+), 55 deletions(-)
diff --git a/libbuild2/lexer+quoting.test.testscript b/libbuild2/lexer+quoting.test.testscript
index 0143c90..ddfb0d0 100644
--- a/libbuild2/lexer+quoting.test.testscript
+++ b/libbuild2/lexer+quoting.test.testscript
@@ -56,9 +56,9 @@ EOO
     : Token start already quoted
     :
     $* <'"$foo"' >>EOO
-    '' [D/P]
+    '' [D/P/F]
     $ [D/C]
-    'foo' [D/P]
+    'foo' [D/P/F]
     <newline>
     EOO
 
@@ -66,7 +66,7 @@ EOO
     : Token end still quoted
     :
     $* <'"foo$"' >>EOO
-    'foo' [D/P]
+    'foo' [D/P/F]
     $ [D/C]
     '' [D/P]
     <newline>
@@ -87,7 +87,7 @@ EOO
     : Token continous with unquoted character
     :
     $* <'"fo"o' >>EOO
-    'foo' [D/P]
+    'foo' [D/P/F]
     <newline>
     EOO
 
@@ -95,7 +95,7 @@ EOO
     : Token continous with unquoted escaped character
     :
     $* <'"fo"\"' >>EOO
-    'fo"' [D/P]
+    'fo"' [D/P/F]
     <newline>
     EOO
   }
@@ -104,6 +104,24 @@ EOO
 : mixed
 :
 $* <"\"fo\"'o'" >>EOO
-'foo' [M/P]
+'foo' [M/P/F]
 <newline>
 EOO
+
+: first
+:
+{
+  : empty-single
+  :
+  $* <"''+foo" >>EOO
+  '+foo' [S/P/F]
+  <newline>
+  EOO
+
+  : empty-double
+  :
+  $* <'""+foo' >>EOO
+  '+foo' [D/P/F]
+  <newline>
+  EOO
+}
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx
index 9b7d01e..0b6f96d 100644
--- a/libbuild2/lexer.cxx
+++ b/libbuild2/lexer.cxx
@@ -202,9 +202,10 @@ namespace build2
 
     auto make_token = [&sep, ln, cn] (type t, string v = string ())
     {
-      return token (t, move (v),
-                    sep, quote_type::unquoted, false,
-                    ln, cn, token_printer);
+      return token (t, move (v), sep,
+                    quote_type::unquoted, false, false,
+                    ln, cn,
+                    token_printer);
     };
 
     // Handle `[` (do it first to make sure the flag is cleared regardless of
@@ -446,9 +447,10 @@ namespace build2
 
     auto make_token = [sep, ln, cn] (type t, string v = string ())
     {
-      return token (t, move (v),
-                    sep, quote_type::unquoted, false,
-                    ln, cn, token_printer);
+      return token (t, move (v), sep,
+                    quote_type::unquoted, false, false,
+                    ln, cn,
+                    token_printer);
     };
 
     // Handle `[` (do it first to make sure the flag is cleared regardless of
@@ -620,15 +622,14 @@ namespace build2
 
           if (c == '\n' || c == '#' || eos (c))
           {
-            st.hold = token (type::multi_rcbrace,
-                             string (count, '}'),
-                             false, quote_type::unquoted, false,
+            st.hold = token (type::multi_rcbrace, string (count, '}'), false,
+                             quote_type::unquoted, false, false,
                              bln, bcn,
                              token_printer);
 
             lexeme.resize (chop);
-            return token (move (lexeme),
-                          false, quote_type::unquoted, false,
+            return token (move (lexeme), false,
+                          quote_type::unquoted, false, false,
                           ln, cn);
           }
 
@@ -671,15 +672,22 @@ namespace build2
     // quote character.
     //
     bool qcomp (false);
+    bool qfirst (false);
 
-    auto append = [&lexeme, &m, &qcomp] (char c)
+    auto append = [&lexeme, &m, &qcomp, &qfirst] (char c)
     {
-      lexeme += c;
+      if (m == lexer_mode::double_quoted)
+      {
+        if (lexeme.empty ()) // First character.
+          qfirst = true;
+      }
+      else
+      {
+        if (qcomp) // An unquoted character after a quoted fragment.
+          qcomp = false;
+      }
 
-      // An unquoted character after a quoted fragment.
-      //
-      if (qcomp && m != lexer_mode::double_quoted)
-        qcomp = false;
+      lexeme += c;
     };
 
     for (; !eos (c); c = peek ())
@@ -840,6 +848,12 @@ namespace build2
                 break;
               }
 
+              // Note that we will treat plus in ''+ as quoted. This is
+              // probably the better option considering the "$empty"+ case.
+              //
+              if (lexeme.empty ())
+                qfirst = true;
+
               get ();
               for (c = get (); !eos (c) && c != '\''; c = get ())
                 lexeme += c;
@@ -875,6 +889,11 @@ namespace build2
                 break;
               }
 
+              // The same reasoning as above.
+              //
+              if (lexeme.empty ())
+                qfirst = true;
+
               continue;
             }
           }
@@ -905,7 +924,7 @@ namespace build2
     if (m == lexer_mode::variable)
       state_.pop ();
 
-    return token (move (lexeme), sep, qtype, qcomp, ln, cn);
+    return token (move (lexeme), sep, qtype, qcomp, qfirst, ln, cn);
   }
 
   pair<bool, bool> lexer::
diff --git a/libbuild2/lexer.test.cxx b/libbuild2/lexer.test.cxx
index 24f0528..6d48885 100644
--- a/libbuild2/lexer.test.cxx
+++ b/libbuild2/lexer.test.cxx
@@ -84,7 +84,11 @@ namespace build2
           }
 
           if (q != '\0')
-            cout << " [" << q << (t.qcomp ? "/C" : "/P") << ']';
+            cout << " ["
+                 << q
+                 << (t.qcomp ? "/C" : "/P")
+                 << (!t.qcomp && t.qfirst ? "/F" : "")
+                 << ']';
         }
 
         cout << endl;
diff --git a/libbuild2/parser.cxx b/libbuild2/parser.cxx
index f152b17..a9646d5 100644
--- a/libbuild2/parser.cxx
+++ b/libbuild2/parser.cxx
@@ -5394,6 +5394,7 @@ namespace build2
     //
     bool concat (false);
     bool concat_quoted (false);
+    bool concat_quoted_first (false);
     name concat_data;
 
     auto concat_typed = [&vnull, &vtype, &concat, &concat_data, this]
@@ -5492,21 +5493,13 @@ namespace build2
     // Return '+' or '-' if a token can start an inclusion or exclusion
     // (pattern or group), '\0' otherwise. The result can be used as bool.
     //
-    // @@ Note that we only need to make sure that the leading '+' or '-'
-    //    characters are unquoted. We could consider some partially quoted
-    //    tokens as starting inclusion or exclusion as well, for example
-    //    +'foo*'. However, currently we can not determine which part of a
-    //    token is quoted, and so can't distinguish the above token from
-    //    '+'foo*. This is why we end up with a criteria that is stricter than
-    //    is really required.
-    //
     auto pattern_prefix = [] (const token& t) -> char
     {
       char c;
-      return t.type == type::word && ((c = t.value[0]) == '+' || c == '-') &&
-             t.qtype == quote_type::unquoted
-             ? c
-             : '\0';
+      return (t.type == type::word && !t.qfirst &&
+              ((c = t.value[0]) == '+' || c == '-')
+              ? c
+              : '\0');
     };
 
     // A name sequence potentially starts with a pattern if it starts with a
@@ -5586,9 +5579,11 @@ namespace build2
         assert (!pre_parse_);
 
         bool quoted (concat_quoted);
+        bool quoted_first (concat_quoted_first);
 
         concat = false;
         concat_quoted = false;
+        concat_quoted_first = false;
 
         // If this is a result of typed concatenation, then don't inject. For
         // one we don't want any of the "interpretations" performed in the
@@ -5671,7 +5666,7 @@ namespace build2
         t = token (move (concat_data.value),
                    true,
                    quoted ? quote_type::mixed : quote_type::unquoted,
-                   false,
+                   false, quoted_first,
                    t.line, t.column);
       }
       else if (!first)
@@ -5713,6 +5708,7 @@ namespace build2
         string val (move (t.value));
         const location loc (get_location (t));
         bool quoted (t.qtype != quote_type::unquoted);
+        bool quoted_first (t.qfirst);
 
         // Should we accumulate? If the buffer is not empty, then we continue
         // accumulating (the case where we are separated should have been
@@ -5723,6 +5719,8 @@ namespace build2
         if (concat        || // Continue.
             !last_concat ()) // Start.
         {
+          bool e (val.empty ());
+
           // If LHS is typed then do typed concatenation.
           //
           if (concat && vtype != nullptr)
@@ -5743,8 +5741,17 @@ namespace build2
               v += val;
           }
 
-          concat = true;
+          // Consider something like this: ""$foo where foo='+foo'. Should we
+          // treat the plus as a first (unquoted) character? Feels like we
+          // should not. The way we achieve this is a bit hackish: we make it
+          // look like a quoted first character. Note that there is a second
+          // half of this in the expansion case which deals with $empty+foo.
+          //
+          if (!concat) // First.
+            concat_quoted_first = quoted_first || e;
+
           concat_quoted = quoted || concat_quoted;
+          concat = true;
 
           continue;
         }
@@ -6451,7 +6458,7 @@ namespace build2
           //
           else if (!result->null && !result->empty ())
           {
-            // This can only an untyped value.
+            // This can only be an untyped value.
             //
             // @@ Could move if result == &result_data.
             //
@@ -6487,8 +6494,13 @@ namespace build2
               concat_data.value += n.value;
           }
 
-          concat = true;
+          // The same little hack as in the word case ($empty+foo).
+          //
+          if (!concat) // First.
+            concat_quoted_first = true;
+
           concat_quoted = quoted || concat_quoted;
+          concat = true;
         }
         else
         {
@@ -6703,8 +6715,8 @@ namespace build2
       //
       // print +foo
       //
-      // So wepeek at one more character since what we expect next ('=') can't
-      // be whitespace-separated.
+      // So we peek at one more character since what we expect next ('=')
+      // can't be whitespace-separated.
       //
       return c0 == '\n' || c0 == '\0' || c0 == '(' ||
         (p.second                 &&
diff --git a/libbuild2/script/lexer.cxx b/libbuild2/script/lexer.cxx
index a18c1df..7577149 100644
--- a/libbuild2/script/lexer.cxx
+++ b/libbuild2/script/lexer.cxx
@@ -127,7 +127,7 @@ namespace build2
         bool q (m == lexer_mode::here_line_double);
 
         return token (t, string (), sep,
-                      (q ? quote_type::double_ : quote_type::unquoted), q,
+                      (q ? quote_type::double_ : quote_type::unquoted), q, q,
                       ln, cn,
                       token_printer);
       };
@@ -180,7 +180,7 @@ namespace build2
       auto make_token = [&sep, &c] (type t, string v = string ())
       {
         return token (t, move (v), sep,
-                      quote_type::unquoted, false,
+                      quote_type::unquoted, false, false,
                       c.line, c.column,
                       token_printer);
       };
diff --git a/libbuild2/test/script/lexer.cxx b/libbuild2/test/script/lexer.cxx
index c23dea4..f9c8ac6 100644
--- a/libbuild2/test/script/lexer.cxx
+++ b/libbuild2/test/script/lexer.cxx
@@ -324,9 +324,8 @@ namespace build2
           lexeme += c;
         }
 
-        return token (move (lexeme),
-                      false,
-                      quote_type::unquoted, false,
+        return token (move (lexeme), false,
+                      quote_type::unquoted, false, false,
                       ln, cn);
       }
 
diff --git a/libbuild2/token.hxx b/libbuild2/token.hxx
index 030ab48..faae466 100644
--- a/libbuild2/token.hxx
+++ b/libbuild2/token.hxx
@@ -119,10 +119,12 @@ namespace build2
 
     // Quoting can be complete, where the token starts and ends with the quote
     // characters and quoting is contiguous or partial where only some part(s)
-    // of the token are quoted or quoting continues to the next token.
+    // of the token are quoted or quoting continues to the next token. We also
+    // keep track of whether the first character of a token is quoted.
     //
     quote_type qtype;
     bool qcomp;
+    bool qfirst;
 
     // Normally only used for word, but can also be used to store "modifiers"
     // or some such for other tokens.
@@ -139,26 +141,35 @@ namespace build2
         : token (token_type::eos, false, 0, 0, token_printer) {}
 
     token (token_type t, bool s, uint64_t l, uint64_t c, printer_type* p)
-        : token (t, string (), s, quote_type::unquoted, false, l, c, p) {}
+        : token (t, string (), s,
+                 quote_type::unquoted, false, false,
+                 l, c,
+                 p) {}
 
     token (token_type t, bool s,
            quote_type qt,
            uint64_t l, uint64_t c,
            printer_type* p)
-        : token (t, string (), s, qt, qt != quote_type::unquoted, l, c, p) {}
+        : token (t, string (), s,
+                 qt, qt != quote_type::unquoted, qt != quote_type::unquoted,
+                 l, c,
+                 p) {}
 
     token (string v, bool s,
-           quote_type qt, bool qc,
+           quote_type qt, bool qc, bool qf,
            uint64_t l, uint64_t c)
-        : token (token_type::word, move (v), s, qt, qc, l, c, &token_printer){}
+        : token (token_type::word, move (v), s,
+                 qt, qc, qf,
+                 l, c,
+                 &token_printer) {}
 
     token (token_type t,
            string v, bool s,
-           quote_type qt, bool qc,
+           quote_type qt, bool qc, bool qf,
            uint64_t l, uint64_t c,
            printer_type* p)
         : type (t), separated (s),
-          qtype (qt), qcomp (qc),
+          qtype (qt), qcomp (qc), qfirst (qf),
           value (move (v)),
           line (l), column (c),
           printer (p) {}
-- 
cgit v1.1