Add recognition for line-leading `%` as token

author: Boris Kolpackov <boris@codesynthesis.com> 2020-05-04 07:27:47 +0200
committer: Boris Kolpackov <boris@codesynthesis.com> 2020-05-27 08:35:29 +0200
commit: a54abb2f4e5e66877619097bfd281261f99c5103 (patch)
tree: b10a24fd2c5d1d1dd2602f25cc7228b287200cbb
parent: e63b427c51e37135e50dec9435659d661872fe95 (diff)
8 files changed, 104 insertions, 28 deletions
diff --git a/libbuild2/context.cxx b/libbuild2/context.cxx
index 0be0046..fe046ae 100644
--- a/libbuild2/context.cxx
+++ b/libbuild2/context.cxx
@@ -340,8 +340,10 @@ namespace build2
       // And so the first token should be a word which can be either a
       // variable name (potentially with the directory qualification) or just
       // the directory, in which case it should be followed by another word
-      // (unqualified variable name).
+      // (unqualified variable name). To avoid treating any of the visibility
+      // modifiers as special we use the cmdvar mode.
       //
+      l.mode (lexer_mode::cmdvar);
       token t (l.next ());
 
       optional<dir_path> dir;
diff --git a/libbuild2/lexer+normal.test.testscript b/libbuild2/lexer+normal.test.testscript
index c9448c3..e66b81e 100644
--- a/libbuild2/lexer+normal.test.testscript
+++ b/libbuild2/lexer+normal.test.testscript
@@ -34,3 +34,39 @@ $* <:'x?=y' >>EOO
 ?=
 'y'
 EOO
+
+: percent
+: Leading percent sign recognition.
+:
+{
+  : first
+  :
+  $* <:'%%' >>EOO
+  %
+  '%'
+  EOO
+
+  : space
+  :
+  $* <:'  %%' >>EOO
+   %
+  '%'
+  EOO
+
+  : newline
+  :
+  $* <<EOI >>EOO
+
+    %%
+  EOI
+   %
+  '%'
+  <newline>
+  EOO
+
+  : non-token
+  :
+  $* <:'x%' >>EOO
+  'x%'
+  EOO
+}
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx
index c0cadd3..1e400e3 100644
--- a/libbuild2/lexer.cxx
+++ b/libbuild2/lexer.cxx
@@ -14,7 +14,10 @@ namespace build2
   pair<pair<char, char>, bool> lexer::
   peek_chars ()
   {
-    sep_ = skip_spaces ();
+    auto p (skip_spaces ());
+    assert (!p.second);
+    sep_ = p.first;
+
     char r[2] = {'\0', '\0'};
 
     xchar c0 (peek ());
@@ -54,7 +57,11 @@ namespace build2
     switch (m)
     {
     case lexer_mode::normal:
+    case lexer_mode::cmdvar:
       {
+        // Note: `%` is only recognized at the beginning of the line so it
+        // should not be included here.
+        //
         a  = true;
         s1 = ":<>=+? $(){}#\t\n";
         s2 = "    ==         ";
@@ -148,6 +155,7 @@ namespace build2
     switch (m)
     {
     case lexer_mode::normal:
+    case lexer_mode::cmdvar:
     case lexer_mode::value:
     case lexer_mode::values:
     case lexer_mode::switch_expressions:
@@ -161,7 +169,9 @@ namespace build2
     default:                        assert (false); // Unhandled custom mode.
     }
 
-    bool sep (skip_spaces ());
+    pair<bool, bool> skip (skip_spaces ());
+    bool sep (skip.first);    // Separated from a previous character.
+    bool first (skip.second); // First non-whitespace character of a line.
 
     xchar c (get ());
     uint64_t ln (c.line), cn (c.column);
@@ -209,7 +219,8 @@ namespace build2
             m == lexer_mode::case_patterns)
           state_.pop ();
 
-        // Re-enable attributes in the normal mode.
+        // Re-enable attributes in the normal mode (should never be needed in
+        // cmdvar).
         //
         if (state_.top ().mode == lexer_mode::normal)
           state_.top ().attributes = true;
@@ -230,6 +241,14 @@ namespace build2
       }
     }
 
+    if (m == lexer_mode::normal && first)
+    {
+      switch (c)
+      {
+      case '%': return make_token (type::percent);
+      }
+    }
+
     // The following characters are special in all modes except attributes.
     //
     if (m != lexer_mode::attributes && m != lexer_mode::attribute_value)
@@ -267,6 +286,7 @@ namespace build2
     // switch_expressions modes.
     //
     if (m == lexer_mode::normal             ||
+        m == lexer_mode::cmdvar             ||
         m == lexer_mode::switch_expressions ||
         m == lexer_mode::case_patterns)
     {
@@ -278,7 +298,8 @@ namespace build2
 
     // The following characters are special in the normal mode.
     //
-    if (m == lexer_mode::normal)
+    if (m == lexer_mode::normal ||
+        m == lexer_mode::cmdvar)
     {
       switch (c)
       {
@@ -315,7 +336,8 @@ namespace build2
 
     // The following characters are special in the normal mode.
     //
-    if (m == lexer_mode::normal)
+    if (m == lexer_mode::normal ||
+        m == lexer_mode::cmdvar)
     {
       switch (c)
       {
@@ -361,7 +383,7 @@ namespace build2
     // This mode is quite a bit like the value mode when it comes to special
     // characters, except that we have some of our own.
 
-    bool sep (skip_spaces ());
+    bool sep (skip_spaces ().first);
     xchar c (get ());
 
     if (eos (c))
@@ -728,7 +750,7 @@ namespace build2
     return token (move (lexeme), sep, qtype, qcomp, ln, cn);
   }
 
-  bool lexer::
+  pair<bool, bool> lexer::
   skip_spaces ()
   {
     bool r (sep_);
@@ -739,7 +761,7 @@ namespace build2
     // In some special modes we don't skip spaces.
     //
     if (!s.sep_space)
-      return r;
+      return make_pair (r, false);
 
     xchar c (peek ());
     bool start (c.column == 1);
@@ -758,6 +780,8 @@ namespace build2
         {
           // In some modes we treat newlines as ordinary spaces.
           //
+          // Note that in this case we don't adjust start.
+          //
           if (!s.sep_newline)
           {
             r = true;
@@ -772,7 +796,7 @@ namespace build2
             break;
           }
 
-          return r;
+          return make_pair (r, start);
         }
       case '#':
         {
@@ -833,12 +857,12 @@ namespace build2
         }
         // Fall through.
       default:
-        return r; // Not a space.
+        return make_pair (r, start); // Not a space.
       }
 
       get ();
     }
 
-    return r;
+    return make_pair (r, start);
   }
 }
diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx
index 02112cb..c7e96fb 100644
--- a/libbuild2/lexer.hxx
+++ b/libbuild2/lexer.hxx
@@ -20,17 +20,18 @@
 
 namespace build2
 {
-  // Context-dependent lexing mode. Quoted modes are internal and should not
-  // be set explicitly. In the value mode we don't treat certain characters
-  // (e.g., `+`, `=`) as special so that we can use them in the variable
-  // values, e.g., `foo = g++`. In contrast, in the variable mode, we restrict
-  // certain character (e.g., `/`) from appearing in the name. The values mode
-  // is like value but recogizes `,` as special (used in contexts where we
-  // need to list multiple values). The attributes/attribute_value modes are
-  // like values where each value is potentially a variable assignment; they
-  // don't treat `{` and `}` as special (so we cannot have name groups in
-  // attributes) as well as recognizes `=` and `]`. The eval mode is used in
-  // the evaluation context.
+  // Context-dependent lexing mode.
+  //
+  // Quoted modes are internal and should not be set explicitly. In the value
+  // mode we don't treat certain characters (e.g., `+`, `=`) as special so
+  // that we can use them in the variable values, e.g., `foo = g++`. In
+  // contrast, in the variable mode, we restrict certain characters (e.g., `/`)
+  // from appearing in the name. The values mode is like value but recognizes
+  // `,` as special (used in contexts where we need to list multiple
+  // values). The attributes/attribute_value modes are like values where each
+  // value is potentially a variable assignment; they don't treat `{` and `}`
+  // as special (so we cannot have name groups in attributes) as well as
+  // recognize `=` and `]`. The eval mode is used in the evaluation context.
   //
   // A number of modes are "derived" from the value/values mode by recognizing
   // a few extra characters:
@@ -42,6 +43,9 @@ namespace build2
   // split words separated by the pair character (to disable pairs one can
   // pass `\0` as a pair character).
   //
+  // The normal mode recognizes `%` at the beginning of the line as special.
+  // The cmdvar mode is like normal but does not treat `%` as special.
+  //
   // The alternative modes must be set manually. The value/values and derived
   // modes automatically expires after the end of the line. The attribute mode
   // expires after the closing `]`. The variable mode expires after the word
@@ -70,6 +74,7 @@ namespace build2
     enum
     {
       normal = base_type::value_next,
+      cmdvar,
       variable,
       value,
       values,
@@ -189,11 +194,13 @@ namespace build2
     virtual token
     word (state current, bool separated);
 
-    // Return true if we have seen any spaces. Skipped empty lines
-    // don't count. In other words, we are only interested in spaces
-    // that are on the same line as the following non-space character.
+    // Return true in first if we have seen any spaces. Skipped empty lines
+    // don't count. In other words, we are only interested in spaces that are
+    // on the same line as the following non-space character. Return true in
+    // second if we have started skipping spaces from column 1 (note that
+    // if this mode does not skip spaces, then second will always be false).
     //
-    bool
+    pair<bool, bool>
     skip_spaces ();
 
     // Diagnostics.
diff --git a/libbuild2/parser.cxx b/libbuild2/parser.cxx
index e87ca95..000670b 100644
--- a/libbuild2/parser.cxx
+++ b/libbuild2/parser.cxx
@@ -335,6 +335,11 @@ namespace build2
 
     while (tt != type::eos && !(one && parsed))
     {
+      // Issue better diagnostics for stray `%`.
+      //
+      if (tt == type::percent)
+        fail (t) << "recipe without target";
+
       // Extract attributes if any.
       //
       assert (attributes_.empty ());
diff --git a/libbuild2/test/script/lexer.cxx b/libbuild2/test/script/lexer.cxx
index 26d77b5..4683bc7 100644
--- a/libbuild2/test/script/lexer.cxx
+++ b/libbuild2/test/script/lexer.cxx
@@ -174,7 +174,7 @@ namespace build2
       token lexer::
       next_line ()
       {
-        bool sep (skip_spaces ());
+        bool sep (skip_spaces ().first);
 
         xchar c (get ());
         uint64_t ln (c.line), cn (c.column);
diff --git a/libbuild2/token.cxx b/libbuild2/token.cxx
index 4975a02..11b080e 100644
--- a/libbuild2/token.cxx
+++ b/libbuild2/token.cxx
@@ -24,6 +24,7 @@ namespace build2
     case token_type::colon:          os << q << ':'  << q; break;
     case token_type::dollar:         os << q << '$'  << q; break;
     case token_type::question:       os << q << '?'  << q; break;
+    case token_type::percent:        os << q << '%'  << q; break;
     case token_type::comma:          os << q << ','  << q; break;
 
     case token_type::lparen:         os << q << '('  << q; break;
diff --git a/libbuild2/token.hxx b/libbuild2/token.hxx
index e48c088..8dad4ba 100644
--- a/libbuild2/token.hxx
+++ b/libbuild2/token.hxx
@@ -36,6 +36,7 @@ namespace build2
       colon,           // :
       dollar,          // $
       question,        // ?
+      percent,         // %
       comma,           // ,
 
       lparen,          // (
author	Boris Kolpackov <boris@codesynthesis.com>	2020-05-04 07:27:47 +0200
committer	Boris Kolpackov <boris@codesynthesis.com>	2020-05-27 08:35:29 +0200
commit	a54abb2f4e5e66877619097bfd281261f99c5103 (patch)
tree	b10a24fd2c5d1d1dd2602f25cc7228b287200cbb
parent	e63b427c51e37135e50dec9435659d661872fe95 (diff)