From ba628f6f90e7412245dcebdecd9cfa7e4bbf989c Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Mon, 25 May 2020 12:12:13 +0200
Subject: Add support for value subscript after expansions

Value subscript is only recognized in evaluation contexts (due to ambiguity
with wildcard patterns; consider: $x[123].txt) and should be unseparated from
the previous token. For example:

x = ($y[1])
x = (($f ? $y : $z)[1])
x = ($identity($x)[$z])
---
 libbuild2/build/script/lexer.cxx     |  15 ++--
 libbuild2/lexer.cxx                  |  59 +++++++++-----
 libbuild2/lexer.hxx                  |  38 +++++----
 libbuild2/parser.cxx                 | 145 ++++++++++++++++++++++++++++-------
 libbuild2/parser.hxx                 |   9 ++-
 libbuild2/script/lexer.cxx           |   5 +-
 libbuild2/test/script/lexer.cxx      |  15 ++--
 libbuild2/variable.cxx               |   8 +-
 tests/expansion/concat.testscript    |   2 +-
 tests/expansion/subscript.testscript |  97 +++++++++++++++++++++++
 10 files changed, 308 insertions(+), 85 deletions(-)
 create mode 100644 tests/expansion/subscript.testscript
diff --git a/libbuild2/build/script/lexer.cxx b/libbuild2/build/script/lexer.cxx
index 7b8bdd4..a58f794 100644
--- a/libbuild2/build/script/lexer.cxx
+++ b/libbuild2/build/script/lexer.cxx
@@ -27,8 +27,6 @@ namespace build2
             optional<const char*> esc,
             uintptr_t data)
       {
-        bool a (false); // attributes
-
         const char* s1 (nullptr);
         const char* s2 (nullptr);
 
@@ -88,7 +86,8 @@ namespace build2
         }
 
         assert (ps == '\0');
-        state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2});
+        state_.push (
+          state {m, data, nullopt, false, false, ps, s, n, q, *esc, s1, s2});
       }
 
       token lexer::
@@ -129,16 +128,16 @@ namespace build2
           return token (t, sep, ln, cn, token_printer);
         };
 
-        // Handle attributes (do it first to make sure the flag is cleared
-        // regardless of what we return).
+        // Handle `[` (do it first to make sure the flag is cleared regardless
+        // of what we return).
         //
-        if (st.attributes)
+        if (st.lsbrace)
         {
           assert (m == lexer_mode::variable_line);
 
-          state_.top ().attributes = false;
+          state_.top ().lsbrace = false; // Note: st is a copy.
 
-          if (c == '[')
+          if (c == '[' && (!st.lsbrace_unsep || !sep))
             return make_token (type::lsbrace);
         }
 
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx
index 6d3504c..7149d45 100644
--- a/libbuild2/lexer.cxx
+++ b/libbuild2/lexer.cxx
@@ -39,7 +39,7 @@ namespace build2
   void lexer::
   mode (lexer_mode m, char ps, optional<const char*> esc, uintptr_t data)
   {
-    bool a (false); // attributes
+    bool lsb (false); // Enable `[` recognition.
 
     const char* s1 (nullptr);
     const char* s2 (nullptr);
@@ -62,9 +62,9 @@ namespace build2
         // Note: `%` is only recognized at the beginning of the line so it
         // should not be included here.
         //
-        a  = true;
         s1 = ":<>=+? $(){}#\t\n";
         s2 = "    ==         ";
+        lsb  = true;
         break;
       }
     case lexer_mode::value:
@@ -103,6 +103,12 @@ namespace build2
         s2 = "         ";
         break;
       }
+    case lexer_mode::subscript:
+      {
+        s1 = " $()]#\t\n";
+        s2 = "        ";
+        break;
+      }
     case lexer_mode::eval:
       {
         s1 = ":<>=!&|?, $(){}#\t\n";
@@ -147,7 +153,8 @@ namespace build2
     default: assert (false); // Unhandled custom mode.
     }
 
-    state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2});
+    state_.push (
+      state {m, data, nullopt, lsb, false, ps, s, n, q, *esc, s1, s2});
   }
 
   token lexer::
@@ -168,6 +175,7 @@ namespace build2
     case lexer_mode::case_patterns:
     case lexer_mode::attributes:
     case lexer_mode::attribute_value:
+    case lexer_mode::subscript:
     case lexer_mode::variable:
     case lexer_mode::buildspec:     break;
     case lexer_mode::eval:          return next_eval ();
@@ -190,14 +198,14 @@ namespace build2
                     ln, cn, token_printer);
     };
 
-    // Handle attributes (do it first to make sure the flag is cleared
-    // regardless of what we return).
+    // Handle `[` (do it first to make sure the flag is cleared regardless of
+    // what we return).
     //
-    if (st.attributes)
+    if (st.lsbrace)
     {
-      st.attributes = false;
+      st.lsbrace = false;
 
-      if (c == '[')
+      if (c == '[' && (!st.lsbrace_unsep || !sep))
         return make_token (type::lsbrace);
     }
 
@@ -226,11 +234,15 @@ namespace build2
             m == lexer_mode::case_patterns)
           state_.pop ();
 
-        // Re-enable attributes in the normal mode (should never be needed in
-        // cmdvar).
+        // Re-enable `[` recognition (attributes) in the normal mode (should
+        // never be needed in cmdvar).
         //
-        if (state_.top ().mode == lexer_mode::normal)
-          state_.top ().attributes = true;
+        state& st (state_.top ());
+        if (st.mode == lexer_mode::normal)
+        {
+          st.lsbrace = true;
+          st.lsbrace_unsep = false;
+        }
 
         sep = true; // Treat newline as always separated.
         return make_token (type::newline);
@@ -274,9 +286,12 @@ namespace build2
       }
     }
 
-    // The following characters are special in all modes except attributes.
+    // The following characters are special in all modes except attributes
+    // and subscript.
     //
-    if (m != lexer_mode::attributes && m != lexer_mode::attribute_value)
+    if (m != lexer_mode::attributes      &&
+        m != lexer_mode::attribute_value &&
+        m != lexer_mode::subscript)
     {
       switch (c)
       {
@@ -295,13 +310,15 @@ namespace build2
       }
     }
 
-    if (m == lexer_mode::attributes || m == lexer_mode::attribute_value)
+    if (m == lexer_mode::attributes      ||
+        m == lexer_mode::attribute_value ||
+        m == lexer_mode::subscript)
     {
       switch (c)
       {
       case ']':
         {
-          state_.pop (); // Expire the attributes mode after closing `]`.
+          state_.pop (); // Expire the mode after closing `]`.
           return make_token (type::rsbrace);
         }
       }
@@ -425,14 +442,14 @@ namespace build2
                     ln, cn, token_printer);
     };
 
-    // Handle attributes (do it first to make sure the flag is cleared
-    // regardless of what we return).
+    // Handle `[` (do it first to make sure the flag is cleared regardless of
+    // what we return).
     //
-    if (st.attributes)
+    if (st.lsbrace)
     {
-      st.attributes = false;
+      st.lsbrace = false;
 
-      if (c == '[')
+      if (c == '[' && (!st.lsbrace_unsep || !sep))
         return make_token (type::lsbrace);
     }
 
diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx
index 749668e..d5f1c99 100644
--- a/libbuild2/lexer.hxx
+++ b/libbuild2/lexer.hxx
@@ -31,7 +31,9 @@ namespace build2
   // values). The attributes/attribute_value modes are like values where each
   // value is potentially a variable assignment; they don't treat `{` and `}`
   // as special (so we cannot have name groups in attributes) as well as
-  // recognizes `=` and `]`. The eval mode is used in the evaluation context.
+  // recognizes `=` and `]`. The subscript mode is like value but doesn't
+  // treat `{` and `}` as special and recognizes `]`. The eval mode is used in
+  // the evaluation context.
   //
   // A number of modes are "derived" from the value/values mode by recognizing
   // a few extra characters:
@@ -55,10 +57,10 @@ namespace build2
   // mode data.
   //
   // The alternative modes must be set manually. The value/values and derived
-  // modes automatically expires after the end of the line. The attribute mode
-  // expires after the closing `]`. The variable mode expires after the word
-  // token. The eval mode expires after the closing `)`. And the foreign mode
-  // expires after the closing braces.
+  // modes automatically expire after the end of the line. The attribute and
+  // subscript modes expire after the closing `]`. The variable mode expires
+  // after the word token. The eval mode expires after the closing `)`. And
+  // the foreign mode expires after the closing braces.
   //
   // Note that normally it is only safe to switch mode when the current token
   // is not quoted (or, more generally, when you are not in the double-quoted
@@ -66,13 +68,13 @@ namespace build2
   // variable name mode). Failed that your mode (which now will be the top of
   // the mode stack) will prevent proper recognition of the closing quote.
   //
-  // Finally, attributes recognition (the `[` token) cuts across most of the
-  // modes and is handled with a flag. In the normal mode it is automatically
-  // set at the beginning and after each newline. In all other modes it must
-  // be explicitly set at points where attributes are recognized. In all the
-  // cases it is automatically reset after lexing the next token (whether `[`
-  // or not).
-  //
+  // The `[` token is used for attributes (where it cuts across most of the
+  // modes) as well as for value subscript (where it is only recognized after
+  // expansions). It is handled with a flag. In the normal mode it is
+  // automatically set at the beginning and after each newline. In all other
+  // modes it must be explicitly set at points where attribute/subscript is
+  // recognized. In all the cases it is automatically reset after lexing the
+  // next token (whether `[` or not).
 
   // Extendable/inheritable enum-like class.
   //
@@ -91,6 +93,7 @@ namespace build2
       switch_expressions,
       attributes,
       attribute_value,
+      subscript,
       eval,
       single_quoted,
       double_quoted,
@@ -134,10 +137,14 @@ namespace build2
           optional<const char*> escapes = nullopt,
           uintptr_t data = 0);
 
-    // Enable attributes recognition for the next token.
+    // Enable `[` recognition for the next token.
     //
     void
-    enable_attributes () {state_.top ().attributes = true;}
+    enable_lsbrace (bool unsep = false)
+    {
+      state_.top ().lsbrace = true;
+      state_.top ().lsbrace_unsep = unsep;
+    }
 
     // Expire the current mode early.
     //
@@ -177,7 +184,8 @@ namespace build2
       uintptr_t       data;
       optional<token> hold;
 
-      bool       attributes;
+      bool lsbrace;       // Recognize `[`.
+      bool lsbrace_unsep; // Recognize it only if unseparated.
 
       char sep_pair;
       bool sep_space;    // Are whitespaces separators (see skip_spaces())?
diff --git a/libbuild2/parser.cxx b/libbuild2/parser.cxx
index 94f597d..c359ce0 100644
--- a/libbuild2/parser.cxx
+++ b/libbuild2/parser.cxx
@@ -4204,8 +4204,9 @@ namespace build2
                               tt == type::dollar ||
                               tt == type::lparen ||
                               tt == type::lcbrace))
-      fail (t) << "whitespace required after attributes" <<
-        info << "use the '\\[' escape sequence if this is a wildcard pattern";
+      fail (t)   << "whitespace required after attributes" <<
+        info (l) << "use the '\\[' escape sequence if this is a wildcard "
+                 << "pattern";
 
     return make_pair (has, l);
   }
@@ -5520,7 +5521,7 @@ namespace build2
         continue;
       }
 
-      // Variable expansion, function call, or eval context.
+      // Expansions: variable expansion, function call, or eval context.
       //
       if (tt == type::dollar || tt == type::lparen)
       {
@@ -5533,6 +5534,11 @@ namespace build2
         const char* what; // Variable, function, or evaluation context.
         bool quoted (t.qtype != quote_type::unquoted);
 
+        // We only recognize value subscripts inside eval contexts due to the
+        // ambiguity with wildcard patterns (consider: $x[123].txt).
+        //
+        bool sub (mode () == lexer_mode::eval);
+
         if (tt == type::dollar)
         {
           // Switch to the variable name mode. We want to use this mode for
@@ -5625,9 +5631,10 @@ namespace build2
           if (!pre_parse_ && name.empty ())
             fail (loc) << "empty variable/function name";
 
-          // Figure out whether this is a variable expansion or a function
-          // call.
+          // Figure out whether this is a variable expansion with potential
+          // subscript or a function call.
           //
+          if (sub) enable_subscript ();
           tt = peek ();
 
           // Note that we require function call opening paren to be
@@ -5645,15 +5652,17 @@ namespace build2
             // context in which to call the function? Hm, interesting...
             //
             values args (parse_eval (t, tt, pmode));
-            tt = peek ();
 
-            if (pre_parse_)
-              continue; // As if empty result.
+            if (sub) enable_subscript ();
+            tt = peek ();
 
             // Note that we "move" args to call().
             //
-            result_data = ctx.functions.call (scope_, name, args, loc);
-            what = "function call";
+            if (!pre_parse_)
+            {
+              result_data = ctx.functions.call (scope_, name, args, loc);
+              what = "function call";
+            }
           }
           else
           {
@@ -5661,42 +5670,124 @@ namespace build2
             //
             lookup l (lookup_variable (move (qual), move (name), loc));
 
-            if (pre_parse_)
-              continue; // As if empty value.
-
-            if (l.defined ())
-              result = l.value; // Otherwise leave as NULL result_data.
+            if (!pre_parse_)
+            {
+              if (l.defined ())
+                result = l.value; // Otherwise leave as NULL result_data.
 
-            what = "variable expansion";
+              what = "variable expansion";
+            }
           }
         }
         else
         {
-          // Context evaluation.
+          // Evaluation context.
           //
           loc = get_location (t);
           mode (lexer_mode::eval, '@');
           next_with_attributes (t, tt);
 
           values vs (parse_eval (t, tt, pmode));
+
+          if (sub) enable_subscript ();
           tt = peek ();
 
-          if (pre_parse_)
-            continue; // As if empty result.
+          if (!pre_parse_)
+          {
+            switch (vs.size ())
+            {
+            case 0:  result_data = value (names ()); break;
+            case 1:  result_data = move (vs[0]); break;
+            default: fail (loc) << "expected single value";
+            }
 
-          switch (vs.size ())
+            what = "context evaluation";
+          }
+        }
+
+        // Handle value subscript.
+        //
+        if (tt == type::lsbrace)
+        {
+          location bl (get_location (t));
+          next (t, tt); // `[`
+          mode (lexer_mode::subscript, '\0' /* pair */);
+          next (t, tt);
+
+          location l (get_location (t));
+          value v (
+            tt != type::rsbrace
+            ? parse_value (t, tt, pattern_mode::ignore, "value subscript")
+            : value (names ()));
+
+          if (tt != type::rsbrace)
           {
-          case 0:  result_data = value (names ()); break;
-          case 1:  result_data = move (vs[0]); break;
-          default: fail (loc) << "expected single value";
+            // Note: wildcard pattern should have `]` as well so no escaping
+            // suggestion.
+            //
+            fail (t) << "expected ']' instead of " << t;
           }
 
-          what = "context evaluation";
+          if (!pre_parse_)
+          {
+            uint64_t j;
+            try
+            {
+              j = convert<uint64_t> (move (v));
+            }
+            catch (const invalid_argument& e)
+            {
+              fail (l)    << "invalid value subscript: " << e <<
+                info (bl) << "use the '\\[' escape sequence if this is a "
+                          << "wildcard pattern";
+            }
+
+            // Similar to expanding an undefined variable, we return NULL if
+            // the index is out of bounds.
+            //
+            // Note that result may or may not point to result_data.
+            //
+            if (result->type == nullptr)
+            {
+              const names& ns (result->as<names> ());
+
+              // Pair-aware subscript.
+              //
+              names r;
+              for (auto i (ns.begin ()); i != ns.end (); ++i, --j)
+              {
+                if (j == 0)
+                {
+                  r.push_back (*i);
+                  if (i->pair)
+                    r.push_back (*++i);
+                  break;
+                }
+
+                if (i->pair)
+                  ++i;
+              }
+
+              result_data = r.empty () ? value () : value (move (r));
+            }
+            else
+            {
+              // @@ TODO: we would want to return a value with element type.
+              //
+              //result_data = ...
+              fail (l)    << "typed value subscript not yet supported" <<
+                info (bl) << "use the '\\[' escape sequence if this is a "
+                          << "wildcard pattern";
+            }
+
+            result = &result_data;
+          }
+
+          tt = peek ();
         }
 
-        // We never end up here during pre-parsing.
-        //
-        assert (!pre_parse_);
+        if (pre_parse_)
+          continue; // As if empty result.
 
         // Should we accumulate? If the buffer is not empty, then we continue
         // accumulating (the case where we are separated should have been
diff --git a/libbuild2/parser.hxx b/libbuild2/parser.hxx
index bc01e08..2f67c31 100644
--- a/libbuild2/parser.hxx
+++ b/libbuild2/parser.hxx
@@ -619,7 +619,14 @@ namespace build2
     enable_attributes ()
     {
       if (replay_ != replay::play)
-        lexer_->enable_attributes ();
+        lexer_->enable_lsbrace ();
+    }
+
+    void
+    enable_subscript ()
+    {
+      if (replay_ != replay::play)
+        lexer_->enable_lsbrace (true /* unseparated */);
     }
 
     void
diff --git a/libbuild2/script/lexer.cxx b/libbuild2/script/lexer.cxx
index d78e999..ce409c1 100644
--- a/libbuild2/script/lexer.cxx
+++ b/libbuild2/script/lexer.cxx
@@ -16,8 +16,6 @@ namespace build2
     void lexer::
     mode (base_mode m, char ps, optional<const char*> esc, uintptr_t data)
     {
-      bool a (false); // attributes
-
       const char* s1 (nullptr);
       const char* s2 (nullptr);
 
@@ -86,7 +84,8 @@ namespace build2
       }
 
       assert (ps == '\0');
-      state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2});
+      state_.push (
+        state {m, data, nullopt, false, false, ps, s, n, q, *esc, s1, s2});
     }
 
     token lexer::
diff --git a/libbuild2/test/script/lexer.cxx b/libbuild2/test/script/lexer.cxx
index a94109b..e895d4a 100644
--- a/libbuild2/test/script/lexer.cxx
+++ b/libbuild2/test/script/lexer.cxx
@@ -26,8 +26,6 @@ namespace build2
       void lexer::
       mode (base_mode m, char ps, optional<const char*> esc, uintptr_t data)
       {
-        bool a (false); // attributes
-
         const char* s1 (nullptr);
         const char* s2 (nullptr);
 
@@ -109,7 +107,8 @@ namespace build2
         }
 
         assert (ps == '\0');
-        state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2});
+        state_.push (
+          state {m, data, nullopt, false, false, ps, s, n, q, *esc, s1, s2});
       }
 
       token lexer::
@@ -153,16 +152,16 @@ namespace build2
           return token (t, sep, ln, cn, token_printer);
         };
 
-        // Handle attributes (do it first to make sure the flag is cleared
-        // regardless of what we return).
+        // Handle `[` (do it first to make sure the flag is cleared regardless
+        // of what we return).
         //
-        if (st.attributes)
+        if (st.lsbrace)
         {
           assert (m == lexer_mode::variable_line);
 
-          state_.top ().attributes = false;
+          state_.top ().lsbrace = false; // Note: st is a copy.
 
-          if (c == '[')
+          if (c == '[' && (!st.lsbrace_unsep || !sep))
             return make_token (type::lsbrace);
         }
 
diff --git a/libbuild2/variable.cxx b/libbuild2/variable.cxx
index d16fcb4..206eb54 100644
--- a/libbuild2/variable.cxx
+++ b/libbuild2/variable.cxx
@@ -491,7 +491,13 @@ namespace build2
       {
         // May throw invalid_argument or out_of_range.
         //
-        return stoull (n.value);
+        size_t i;
+        uint64_t r (stoull (n.value, &i));
+
+        if (i == n.value.size ())
+          return r;
+
+        // Fall through.
       }
       catch (const std::exception&)
       {
diff --git a/tests/expansion/concat.testscript b/tests/expansion/concat.testscript
index 181a738..bec48ce 100644
--- a/tests/expansion/concat.testscript
+++ b/tests/expansion/concat.testscript
@@ -1,4 +1,4 @@
-# file      : tests/expansion/type.testscript
+# file      : tests/expansion/concat.testscript
 # license   : MIT; see accompanying LICENSE file
 
 # Test concatenated expansion.
diff --git a/tests/expansion/subscript.testscript b/tests/expansion/subscript.testscript
new file mode 100644
index 0000000..0c06394
--- /dev/null
+++ b/tests/expansion/subscript.testscript
@@ -0,0 +1,97 @@
+# file      : tests/expansion/subscript.testscript
+# license   : MIT; see accompanying LICENSE file
+
+# Test subscript expansion.
+
+.include ../common.testscript
+
+: basics
+:
+$* <<EOI >>EOO
+x = zero one two three
+y = zero@one two@three
+i = 2
+
+print ($x[1])
+print ($x[4])
+print (($x)[1])
+print (($x)[4])
+print ($identity($x)[1])
+print ($identity($x)[4])
+
+print
+
+print ($y[1])
+print ($y[4])
+print (($y)[1])
+print (($y)[4])
+print ($identity($y)[1])
+print ($identity($y)[4])
+
+print
+
+print ($x[$i])
+
+EOI
+one
+[null]
+one
+[null]
+one
+[null]
+
+two@three
+[null]
+two@three
+[null]
+two@three
+[null]
+
+two
+EOO
+
+: unseparated
+:
+$* <<EOI >>EOO
+x = zero one
+print ($x [1])
+EOI
+zero one
+EOO
+
+: escape
+:
+$* <<EOI >>EOO
+x = zero
+print ($x\[abc])
+EOI
+EOO
+
+: preparse
+:
+$* <<EOI >>EOO
+x = zero one two three
+print (true ? $x[1] : $x[])
+EOI
+one
+EOO
+
+: missing-rsbrace
+:
+$* <'print ($x[1)' 2>>EOE != 0
+<stdin>:1:12: error: expected ']' instead of ')'
+EOE
+
+: invalid-subscript
+:
+$* <'print ($x[1a])' 2>>EOE != 0
+<stdin>:1:11: error: invalid value subscript: invalid uint64 value: '1a'
+  <stdin>:1:9: info: use the '\[' escape sequence if this is a wildcard pattern
+EOE
+
+: empty-subscript
+:
+$* <'print ($x[])' 2>>EOE != 0
+<stdin>:1:11: error: invalid value subscript: invalid uint64 value: empty
+  <stdin>:1:9: info: use the '\[' escape sequence if this is a wildcard pattern
+EOE
-- 
cgit v1.1