From fce9782a330e8f701a8df0b5200e5b78e97ec4b5 Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Wed, 6 May 2020 06:58:34 +0200
Subject: Handle multi-curly-brace tokens in lexer

---
 libbuild2/lexer+foreign.test.testscript |  96 +++++++++++++++++++++++++
 libbuild2/lexer+normal.test.testscript  |  18 +++++
 libbuild2/lexer.cxx                     | 124 +++++++++++++++++++++++++++++++-
 libbuild2/lexer.hxx                     |  25 +++++--
 libbuild2/lexer.test.cxx                |  14 +++-
 libbuild2/test/script/lexer.cxx         |   2 +-
 libbuild2/token.cxx                     |  69 +++++++++---------
 libbuild2/token.hxx                     |   3 +
 8 files changed, 306 insertions(+), 45 deletions(-)
 create mode 100644 libbuild2/lexer+foreign.test.testscript
diff --git a/libbuild2/lexer+foreign.test.testscript b/libbuild2/lexer+foreign.test.testscript
new file mode 100644
index 0000000..94c83c1
--- /dev/null
+++ b/libbuild2/lexer+foreign.test.testscript
@@ -0,0 +1,96 @@
+# file      : libbuild2/lexer+foreign.test.testscript
+# license   : MIT; see accompanying LICENSE file
+
+test.arguments = foreign=2
+
+: basics
+:
+$* <<EOI >>EOO
+echo foo
+}}
+EOI
+'echo foo
+'
+}}
+<newline>
+EOO
+
+: empty
+:
+$* <<EOI >>EOO
+}}
+EOI
+''
+}}
+<newline>
+EOO
+
+: braces
+:
+$* <<EOI >>EOO
+}
+}}}
+{{}}
+}} }
+}}
+EOI
+'}
+}}}
+{{}}
+}} }
+'
+}}
+<newline>
+EOO
+
+: whitespaces
+:
+$* <'	 }} 	' >>EOO # Note: there are TABs.
+''
+}}
+<newline>
+EOO
+
+: comment
+:
+$* <'}} # comment' >>EOO
+''
+}}
+<newline>
+EOO
+
+: eos
+:
+$* <:'}}' >>EOO
+''
+}}
+EOO
+
+: missing
+: Note that we get eos right away (i.e., there is no word token).
+:
+$* <<EOI
+}
+}}}
+{{}}
+}} }
+}
+EOI
+
+: three
+:
+{
+  test.arguments = foreign=3
+
+  : basic
+  :
+  $* <<EOI >>EOO
+    echo foo
+  }}}
+  EOI
+  '  echo foo
+  '
+  }}}
+  <newline>
+  EOO
+}
diff --git a/libbuild2/lexer+normal.test.testscript b/libbuild2/lexer+normal.test.testscript
index e66b81e..e2780a2 100644
--- a/libbuild2/lexer+normal.test.testscript
+++ b/libbuild2/lexer+normal.test.testscript
@@ -70,3 +70,21 @@ EOO
   'x%'
   EOO
 }
+
+: multi-lcbrace
+: Leading multi-curly-brace recognition.
+:
+{
+  : two
+  :
+  $* <:'{{' >>EOO
+  {{
+  EOO
+
+  : three
+  :
+  $* <:'{{{c++' >>EOO
+  {{{
+  'c++'
+  EOO
+}
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx
index e970437..6d3504c 100644
--- a/libbuild2/lexer.cxx
+++ b/libbuild2/lexer.cxx
@@ -128,10 +128,16 @@ namespace build2
         n = false;
         break;
       }
+    case lexer_mode::foreign:
+      assert (data > 1);
+      // Fall through.
     case lexer_mode::single_quoted:
     case lexer_mode::double_quoted:
-      s = false;
-      // Fall through.
+      {
+        assert (ps == '\0');
+        s = false;
+        break;
+      }
     case lexer_mode::variable:
       {
         // These are handled in an ad hoc way in word().
@@ -141,7 +147,7 @@ namespace build2
     default: assert (false); // Unhandled custom mode.
     }
 
-    state_.push (state {m, data, a, ps, s, n, q, *esc, s1, s2});
+    state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2});
   }
 
   token lexer::
@@ -166,6 +172,7 @@ namespace build2
     case lexer_mode::buildspec:     break;
     case lexer_mode::eval:          return next_eval ();
     case lexer_mode::double_quoted: return next_quoted ();
+    case lexer_mode::foreign:       return next_foreign ();
     default:                        assert (false); // Unhandled custom mode.
     }
 
@@ -241,11 +248,29 @@ namespace build2
       }
     }
 
+    // Line-leading tokens in the normal mode.
+    //
+    // Note: must come before any other (e.g., `{`) tests below.
+    //
     if (m == lexer_mode::normal && first)
     {
       switch (c)
       {
       case '%': return make_token (type::percent);
+      case '{':
+        {
+          string v;
+          while (peek () == '{')
+            v += get ();
+
+          if (!v.empty ())
+          {
+            v += '{';
+            return make_token (type::multi_lcbrace, move (v));
+          }
+
+          break;
+        }
       }
     }
 
@@ -507,6 +532,99 @@ namespace build2
   }
 
   token lexer::
+  next_foreign ()
+  {
+    state& st (state_.top ());
+
+    if (st.hold) // Return the closing braces saved by the previous call.
+    {
+      token r (move (*st.hold));
+      state_.pop (); // Expire foreign mode.
+      return r;
+    }
+
+    auto count (state_.top ().data); // Number of closing braces to expect.
+
+    xchar c (get ()); // First character of first line after `{{...`.
+    uint64_t ln (c.line), cn (c.column); // Position of the word token.
+
+    string lexeme; // Foreign text accumulated so far.
+    for (bool first (true); !eos (c); c = get ())
+    {
+      // If this is the first character of a line, recognize closing braces.
+      //
+      if (first)
+      {
+        first = false;
+
+        // If this turns out not to be the closing braces, we need to add any
+        // characters we have extracted to lexeme. Instead of saving these
+        // characters in a temporary we speculatively add them to the lexeme
+        // but then chop them off if this turned out to be the closing braces.
+        //
+        size_t chop (lexeme.size ());
+
+        // Skip leading whitespaces, if any.
+        //
+        for (; c == ' ' || c == '\t'; c = get ())
+          lexeme += c;
+
+        uint64_t bln (c.line), bcn (c.column); // Position of first `}`.
+
+        // Count braces.
+        //
+        auto i (count);
+        for (; c == '}'; c = get ())
+        {
+          lexeme += c;
+
+          if (--i == 0)
+            break;
+        }
+
+        if (i == 0) // Got enough braces.
+        {
+          // Make sure there are only whitespaces/comments after. Note that
+          // now we must start peeking since newline is not "ours".
+          //
+          for (c = peek (); c == ' ' || c == '\t'; c = peek ())
+            lexeme += get ();
+
+          if (c == '\n' || c == '#' || eos (c))
+          {
+            st.hold = token (type::multi_rcbrace,
+                             string (count, '}'),
+                             false, quote_type::unquoted, false,
+                             bln, bcn,
+                             token_printer);
+
+            lexeme.resize (chop); // Chop the closing line.
+            return token (move (lexeme),
+                          false, quote_type::unquoted, false,
+                          ln, cn);
+          }
+
+          get (); // And fall through (not eos).
+        }
+        else
+        {
+          if (eos (c))
+            break; // No closing braces: return eos (the lexeme is dropped).
+
+          // Fall through.
+        }
+      }
+
+      if (c == '\n')
+        first = true; // The next character starts a new line.
+
+      lexeme += c;
+    }
+
+    return token (type::eos, false, c.line, c.column, token_printer);
+  }
+
+  token lexer::
   word (state st, bool sep)
   {
     lexer_mode m (st.mode);
diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx
index 6dc5027..8dd58c8 100644
--- a/libbuild2/lexer.hxx
+++ b/libbuild2/lexer.hxx
@@ -43,13 +43,22 @@ namespace build2
   // split words separated by the pair character (to disable pairs one can
   // pass `\0` as a pair character).
   //
-  // The normal mode recognizes `%` at the beginning of the line as special.
-  // The cmdvar mode is like normal but does not treat `%` as special.
+  // The normal mode recognizes `%` and `{{...` at the beginning of the line
+  // as special. The cmdvar mode is like normal but does not treat these
+  // character sequences as special.
+  //
+  // Finally, the foreign mode reads everything until encountering a line that
+  // contains nothing (besides whitespaces and a trailing comment) other than
+  // the closing multi-curly-brace (`}}...`), returning the contents as the
+  // word token followed by multi_rcbrace. If eos is reached first, only eos
+  // is returned. In a way it is similar to the single-quote mode. The number
+  // of closing braces to expect is passed as mode data.
   //
   // The alternative modes must be set manually. The value/values and derived
   // modes automatically expires after the end of the line. The attribute mode
   // expires after the closing `]`. The variable mode expires after the word
-  // token. And the eval mode expires after the closing `)`.
+  // token. The eval mode expires after the closing `)`. And the foreign mode
+  // expires after the closing braces.
   //
   // Note that normally it is only safe to switch mode when the current token
   // is not quoted (or, more generally, when you are not in the double-quoted
@@ -85,6 +94,7 @@ namespace build2
       eval,
       single_quoted,
       double_quoted,
+      foreign,
       buildspec,
 
       value_next
@@ -163,8 +173,10 @@ namespace build2
   protected:
     struct state
     {
-      lexer_mode mode;
-      uintptr_t  data;
+      lexer_mode      mode;
+      uintptr_t       data;
+      optional<token> hold;
+
       bool       attributes;
 
       char sep_pair;
@@ -190,6 +202,9 @@ namespace build2
     token
     next_quoted ();
 
+    token
+    next_foreign ();
+
     // Lex a word assuming current is the top state (which may already have
     // been "expired" from the top).
     //
diff --git a/libbuild2/lexer.test.cxx b/libbuild2/lexer.test.cxx
index 5e39e43..3458f56 100644
--- a/libbuild2/lexer.test.cxx
+++ b/libbuild2/lexer.test.cxx
@@ -1,6 +1,7 @@
 // file      : libbuild2/lexer.test.cxx -*- C++ -*-
 // license   : MIT; see accompanying LICENSE file
 
+#include <cstdlib> // strtoul()
 #include <cassert>
 #include <iostream>
 
@@ -14,13 +15,15 @@ using namespace std;
 
 namespace build2
 {
-  // Usage: argv[0] [-q] [<lexer-mode>]
+  // Usage: argv[0] [-q] [<lexer-mode>[=<data>]]
   //
   int
   main (int argc, char* argv[])
   {
     bool quote (false);
+
     lexer_mode m (lexer_mode::normal);
+    uintptr_t d (0);
 
     for (int i (1); i != argc; ++i)
     {
@@ -36,7 +39,12 @@ namespace build2
         else if (a == "attributes") m = lexer_mode::attributes;
         else if (a == "eval")       m = lexer_mode::eval;
         else if (a == "buildspec")  m = lexer_mode::buildspec;
-        else                       assert (false);
+        else if (a.compare (0, 8, "foreign=") == 0)
+        {
+          m = lexer_mode::foreign;
+          d = strtoul (a.c_str () + 8, nullptr, 10);
+        }
+        else                        assert (false);
         break;
       }
     }
@@ -51,7 +59,7 @@ namespace build2
       lexer l (cin, in);
 
       if (m != lexer_mode::normal)
-        l.mode (m);
+        l.mode (m, '\0', nullopt, d);
 
       // No use printing eos since we will either get it or loop forever.
       //
diff --git a/libbuild2/test/script/lexer.cxx b/libbuild2/test/script/lexer.cxx
index 1eeb0be..32c1cf4 100644
--- a/libbuild2/test/script/lexer.cxx
+++ b/libbuild2/test/script/lexer.cxx
@@ -138,7 +138,7 @@ namespace build2
         }
 
         assert (ps == '\0');
-        state_.push (state {m, data, a, ps, s, n, q, *esc, s1, s2});
+        state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2});
       }
 
       token lexer::
diff --git a/libbuild2/token.cxx b/libbuild2/token.cxx
index 11b080e..cfdc6bd 100644
--- a/libbuild2/token.cxx
+++ b/libbuild2/token.cxx
@@ -21,39 +21,42 @@ namespace build2
     case token_type::pair_separator: os << "<pair separator " << t.value[0] << ">"; break;
     case token_type::word:           os << '\'' << t.value << '\''; break;
 
-    case token_type::colon:          os << q << ':'  << q; break;
-    case token_type::dollar:         os << q << '$'  << q; break;
-    case token_type::question:       os << q << '?'  << q; break;
-    case token_type::percent:        os << q << '%'  << q; break;
-    case token_type::comma:          os << q << ','  << q; break;
-
-    case token_type::lparen:         os << q << '('  << q; break;
-    case token_type::rparen:         os << q << ')'  << q; break;
-
-    case token_type::lcbrace:        os << q << '{'  << q; break;
-    case token_type::rcbrace:        os << q << '}'  << q; break;
-
-    case token_type::lsbrace:        os << q << '['  << q; break;
-    case token_type::rsbrace:        os << q << ']'  << q; break;
-
-    case token_type::labrace:        os << q << '<'  << q; break;
-    case token_type::rabrace:        os << q << '>'  << q; break;
-
-    case token_type::assign:         os << q << '='  << q; break;
-    case token_type::prepend:        os << q << "=+" << q; break;
-    case token_type::append:         os << q << "+=" << q; break;
-    case token_type::default_assign: os << q << "?=" << q; break;
-
-    case token_type::equal:          os << q << "==" << q; break;
-    case token_type::not_equal:      os << q << "!=" << q; break;
-    case token_type::less:           os << q << '<'  << q; break;
-    case token_type::greater:        os << q << '>'  << q; break;
-    case token_type::less_equal:     os << q << "<=" << q; break;
-    case token_type::greater_equal:  os << q << ">=" << q; break;
-
-    case token_type::log_or:         os << q << "||" << q; break;
-    case token_type::log_and:        os << q << "&&" << q; break;
-    case token_type::log_not:        os << q << '!'  << q; break;
+    case token_type::colon:          os << q << ':'     << q; break;
+    case token_type::dollar:         os << q << '$'     << q; break;
+    case token_type::question:       os << q << '?'     << q; break;
+    case token_type::percent:        os << q << '%'     << q; break;
+    case token_type::comma:          os << q << ','     << q; break;
+
+    case token_type::lparen:         os << q << '('     << q; break;
+    case token_type::rparen:         os << q << ')'     << q; break;
+
+    case token_type::lcbrace:        os << q << '{'     << q; break;
+    case token_type::rcbrace:        os << q << '}'     << q; break;
+
+    case token_type::multi_lcbrace:  os << q << t.value << q; break;
+    case token_type::multi_rcbrace:  os << q << t.value << q; break;
+
+    case token_type::lsbrace:        os << q << '['     << q; break;
+    case token_type::rsbrace:        os << q << ']'     << q; break;
+
+    case token_type::labrace:        os << q << '<'     << q; break;
+    case token_type::rabrace:        os << q << '>'     << q; break;
+
+    case token_type::assign:         os << q << '='     << q; break;
+    case token_type::prepend:        os << q << "=+"    << q; break;
+    case token_type::append:         os << q << "+="    << q; break;
+    case token_type::default_assign: os << q << "?="    << q; break;
+
+    case token_type::equal:          os << q << "=="    << q; break;
+    case token_type::not_equal:      os << q << "!="    << q; break;
+    case token_type::less:           os << q << '<'     << q; break;
+    case token_type::greater:        os << q << '>'     << q; break;
+    case token_type::less_equal:     os << q << "<="    << q; break;
+    case token_type::greater_equal:  os << q << ">="    << q; break;
+
+    case token_type::log_or:         os << q << "||"    << q; break;
+    case token_type::log_and:        os << q << "&&"    << q; break;
+    case token_type::log_not:        os << q << '!'     << q; break;
 
     default: assert (false); // Unhandled extended token.
     }
diff --git a/libbuild2/token.hxx b/libbuild2/token.hxx
index 8dad4ba..e11b880 100644
--- a/libbuild2/token.hxx
+++ b/libbuild2/token.hxx
@@ -45,6 +45,9 @@ namespace build2
       lcbrace,         // {
       rcbrace,         // }
 
+      multi_lcbrace,   // {{... (value contains the braces)
+      multi_rcbrace,   // }}... (value contains the braces)
+
       lsbrace,         // [
       rsbrace,         // ]
 
-- 
cgit v1.1