From 818dd4a4e743bc8c93d1be67685b1f2e5db6dcf5 Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Tue, 17 Nov 2020 11:23:36 +0200
Subject: Implement modules pseudo-directive parsing (p1703, p1857)

---
 libbuild2/cc/lexer+first.test.testscript   | 25 ++++++++
 libbuild2/cc/lexer.cxx                     | 34 ++++++++---
 libbuild2/cc/lexer.hxx                     |  8 ++-
 libbuild2/cc/lexer.test.cxx                | 14 ++++-
 libbuild2/cc/parser+module.test.testscript | 23 ++++---
 libbuild2/cc/parser.cxx                    | 96 ++++++++++++++++++++----------
 libbuild2/cc/parser.hxx                    |  2 +-
 7 files changed, 148 insertions(+), 54 deletions(-)
 create mode 100644 libbuild2/cc/lexer+first.test.testscript
diff --git a/libbuild2/cc/lexer+first.test.testscript b/libbuild2/cc/lexer+first.test.testscript
new file mode 100644
index 0000000..5c55030
--- /dev/null
+++ b/libbuild2/cc/lexer+first.test.testscript
@@ -0,0 +1,25 @@
+# file      : libbuild2/cc/lexer+first.test.testscript
+# license   : MIT; see accompanying LICENSE file
+
+# Test the first-token-of-a-logical-line logic.
+#
+
+: basics
+:
+$* -f <<EOI >>EOO
+; .
+ ; .
+; // Hello
+;
+; /* Hello
+World */ .
+EOI
+';' t
+'.' f
+';' t
+'.' f
+';' t
+';' t
+';' t
+'.' f
+EOO
diff --git a/libbuild2/cc/lexer.cxx b/libbuild2/cc/lexer.cxx
index d2be3d8..123a41e 100644
--- a/libbuild2/cc/lexer.cxx
+++ b/libbuild2/cc/lexer.cxx
@@ -138,10 +138,13 @@ namespace build2
     using type = token_type;
 
     void lexer::
-    next (token& t, xchar c, bool ignore_pp)
+    next (token& t, pair<xchar, bool> cf, bool ignore_pp)
     {
-      for (;; c = skip_spaces ())
+      for (;; cf = skip_spaces ())
       {
+        xchar c (cf.first);
+
+        t.first = cf.second;
         t.file = &log_file_;
         t.line = log_line_ ? *log_line_ : c.line;
         t.column = c.column;
@@ -197,7 +200,7 @@ namespace build2
               {
                 // Note that we keep using the passed token for buffers.
                 //
-                c = skip_spaces (false); // Stop at newline.
+                c = skip_spaces (false).first; // Stop at newline.
 
                 if (eos (c) || c == '\n')
                   break;
@@ -215,7 +218,7 @@ namespace build2
                   //
                   if (!(c >= '0' && c <= '9'))
                   {
-                    next (t, c, false);
+                    next (t, make_pair (c, false), false);
 
                     if (t.type == type::identifier)
                     {
@@ -230,7 +233,7 @@ namespace build2
                     if (t.type != type::identifier || t.value != "line")
                       continue;
 
-                    c = skip_spaces (false);
+                    c = skip_spaces (false).first;
 
                     if (!(c >= '0' && c <= '9'))
                       fail (c) << "line number expected after #line directive";
@@ -242,7 +245,7 @@ namespace build2
                   continue; // Parse the tail, if any.
                 }
 
-                next (t, c, false);
+                next (t, make_pair (c, false), false);
               }
               break;
             }
@@ -823,7 +826,7 @@ namespace build2
 
       // See if we have the file.
       //
-      c = skip_spaces (false);
+      c = skip_spaces (false).first;
 
       if (c == '\"')
       {
@@ -1007,16 +1010,24 @@ namespace build2
     }
 
     auto lexer::
-    skip_spaces (bool nl) -> xchar
+    skip_spaces (bool nl) -> pair<xchar, bool>
     {
       xchar c (get ());
 
+      // Besides the first character, we also need to take into account any
+      // newlines that we are skipping. For example, the first character may
+      // be a space at the end of the line which we will skip along with the
+      // following newline.
+      //
+      bool first (c.column == 1);
+
       for (; !eos (c); c = get ())
       {
         switch (c)
         {
         case '\n':
           if (!nl) break;
+          first = true;
           // Fall through.
         case ' ':
         case '\t':
@@ -1072,11 +1083,16 @@ namespace build2
               if (!nl)
                 break;
 
+              first = true;
               continue;
             }
 
             // C comment.
             //
+            // Note that for the first-token logic we consider a C comment to
+            // be entirely part of the same logical line even if there are
+            // newlines inside.
+            //
             if (p == '*')
             {
               get (p);
@@ -1132,7 +1148,7 @@ namespace build2
         break;
       }
 
-      return c;
+      return make_pair (c, first);
     }
 
     ostream&
diff --git a/libbuild2/cc/lexer.hxx b/libbuild2/cc/lexer.hxx
index d3fe807..b4e1045 100644
--- a/libbuild2/cc/lexer.hxx
+++ b/libbuild2/cc/lexer.hxx
@@ -25,7 +25,8 @@ namespace build2
     // as #line, #pragma, but not #include (which is diagnosed). Currently,
     // all preprocessor directives except #line are ignored and no values are
     // saved from literals. The #line directive (and its shorthand notation)
-    // is recognized to provide the logical token location.
+    // is recognized to provide the logical token location. Note that the
+    // modules-related pseudo-directives are not recognized or handled.
     //
     // While at it we also calculate the checksum of the input ignoring
     // comments, whitespaces, etc. This is used to detect changes that do not
@@ -58,6 +59,7 @@ namespace build2
     struct token
     {
       token_type type = token_type::eos;
+      bool       first = false;          // First token of a logical line.
       string     value;
 
       // Logical position.
@@ -121,7 +123,7 @@ namespace build2
 
     private:
       void
-      next (token&, xchar, bool);
+      next (token&, pair<xchar, bool /* first */>, bool);
 
       void
       number_literal (token&, xchar);
@@ -141,7 +143,7 @@ namespace build2
       void
       line_directive (token&, xchar);
 
-      xchar
+      pair<xchar, bool /* first */>
       skip_spaces (bool newline = true);
 
       // The char_scanner adaptation for newline escape sequence processing.
diff --git a/libbuild2/cc/lexer.test.cxx b/libbuild2/cc/lexer.test.cxx
index 852d8b2..284d592 100644
--- a/libbuild2/cc/lexer.test.cxx
+++ b/libbuild2/cc/lexer.test.cxx
@@ -16,12 +16,19 @@ namespace build2
 {
   namespace cc
   {
-    // Usage: argv[0] [-l] [<file>]
+    // Usage: argv[0] [-l] [-f] [<file>]
+    //
+    // -l
+    //   Print location.
+    //
+    // -f
+    //   Print first flag.
     //
     int
     main (int argc, char* argv[])
     {
       bool loc (false);
+      bool first (false);
       path file;
 
       for (int i (1); i != argc; ++i)
@@ -30,6 +37,8 @@ namespace build2
 
         if (a == "-l")
           loc = true;
+        else if (a == "-f")
+          first = true;
         else
         {
           file = path (argv[i]);
@@ -61,6 +70,9 @@ namespace build2
         {
           cout << t;
 
+          if (first)
+            cout << ' ' << (t.first ? 't' : 'f');
+
           if (loc)
             cout << ' ' << *t.file << ':' << t.line << ':' << t.column;
 
diff --git a/libbuild2/cc/parser+module.test.testscript b/libbuild2/cc/parser+module.test.testscript
index b92f80b..e4ec139 100644
--- a/libbuild2/cc/parser+module.test.testscript
+++ b/libbuild2/cc/parser+module.test.testscript
@@ -45,6 +45,17 @@ EOI
 export import foo;
 EOO
 
+: non-import
+:
+$* <<EOI
+import
+foo;
+export import(*a);
+import::inner xi = {};
+::import <a>;
+class import<int>;
+EOI
+
 : non-module
 :
 $* <<EOI
@@ -52,6 +63,10 @@ $* <<EOI
 #pragma export module foo;
 #pragma module foo;
 export namespace bar {int fox ();}
+module
+foo;
+foo::module();
+module::inner yi = {};
 EOI
 
 : attribute
@@ -105,14 +120,6 @@ EOI
 <stdin>:6:1: warning: extraneous '}'
 EOE
 
-: import-missing-name
-:
-$* <<EOI 2>>EOE != 0
-import ;
-EOI
-<stdin>:1:8: error: module or header name expected instead of ';'
-EOE
-
 : module-missing-name
 :
 $* <<EOI 2>>EOE != 0
diff --git a/libbuild2/cc/parser.cxx b/libbuild2/cc/parser.cxx
index 55be8b7..fbf076c 100644
--- a/libbuild2/cc/parser.cxx
+++ b/libbuild2/cc/parser.cxx
@@ -43,8 +43,8 @@ namespace build2
       token t;
       for (bool n (true); (n ? l_->next (t) : t.type) != type::eos; )
       {
-        // Break to stop, continue to continue, set n to false if the
-        // next token already extracted.
+        // Break to stop, continue to continue, and set n to false if the
+        // next token is already extracted.
         //
         n = true;
 
@@ -71,37 +71,63 @@ namespace build2
             // [export]  import <module-name> [<attributes>] ;
             // [export]  import <header-name> [<attributes>] ;
             //
+            // A line is only recognized as a pseudo-directive if the leading
+            // module/export/import keyword is the first token of a logical
+            // line, certain tokens appear after module/import, and all the
+            // tokens are on the same line as the keyword; see p1857 for
+            // details.
+            //
             // Additionally, when include is translated to an import, it's
             // normally replaced with the special __import keyword since it
             // may appear in C context.
             //
-            const string& id (t.value);
-
-            if (bb == 0)
+            if (bb == 0 && t.first)
             {
-              if      (id == "import" || id == "__import")
+              const string& id (t.value); // Note: tracks t.
+
+              // Handle the export prefix which can appear for both module
+              // and import.
+              //
+              bool ex (false);
+              if (id == "export")
               {
-                parse_import (t, false);
+                if (l_->next (t) != type::identifier || t.first)
+                {
+                  n = false; // Could be module/import on next line.
+                  continue;
+                }
+
+                ex = true;
+                // Fall through.
               }
-              else if (id == "module")
+
+              if (id == "module")
               {
-                parse_module (t, false);
+                location_value l (get_location (t));
+                l_->next (t);
+
+                if ((t.type == type::semi     ||
+                     t.type == type::identifier) && !t.first)
+                  parse_module (t, ex, move (l));
+                else
+                  n = false;
               }
-              else if (id == "export")
+              else if (id == "import" || id == "__import")
               {
-                if (l_->next (t) == type::identifier)
-                {
-                  if      (id == "module") parse_module (t, true);
-                  else if (id == "import") parse_import (t, true);
-                  else n = false; // Something else (e.g., export namespace).
-                }
+                l_->next (t);
+
+                if ((t.type == type::less     ||
+                     t.type == type::string   ||
+                     t.type == type::identifier) && !t.first)
+                  parse_import (t, ex);
                 else
                   n = false;
               }
             }
             continue;
           }
-        default: continue;
+        default:
+          continue;
         }
 
         break;
@@ -120,6 +146,8 @@ namespace build2
       // if anything in between fails (probably by having it sitting in a
       // diag_frame). So let's keep it simple for now.
       //
+      // @@ We now do that for missing include, so could do here as well.
+      //
       if (bb != 0)
         warn (t) << (bb > 0 ? "missing '}'" : "extraneous '}'");
 
@@ -134,12 +162,12 @@ namespace build2
     void parser::
     parse_import (token& t, bool ex)
     {
-      // enter: import keyword
+      // enter: token after import keyword
       // leave: semi
 
       string un;
       unit_type ut;
-      switch (l_->next (t)) // Start of module/header name.
+      switch (t.type) // Start of module/header name.
       {
       case type::less:
       case type::string:
@@ -155,15 +183,19 @@ namespace build2
           break;
         }
       default:
-        fail (t) << "module or header name expected instead of " << t << endf;
+        assert (false);
       }
 
       // Should be {}-balanced.
       //
-      for (; t.type != type::eos && t.type != type::semi; l_->next (t)) ;
+      for (;
+           t.type != type::eos && t.type != type::semi && !t.first;
+           l_->next (t)) ;
 
       if (t.type != type::semi)
         fail (t) << "';' expected instead of " << t;
+      else if (t.first)
+        fail (t) << "';' must be on the same line";
 
       // For now we skip header units (see a comment on module type/info
       // string serialization in compile rule for details). Note that
@@ -191,21 +223,17 @@ namespace build2
     }
 
     void parser::
-    parse_module (token& t, bool ex)
+    parse_module (token& t, bool ex, location_value l)
     {
-      // enter: module keyword
+      // enter: token after module keyword (l is the module keyword location)
       // leave: semi
 
-      location_value l (get_location (t));
-
-      l_->next (t);
-
       // Handle the leading 'module;' marker (p0713).
       //
       // Note that we don't bother diagnosing invalid/duplicate markers
       // leaving that to the compiler.
       //
-      if (!ex && t.type == type::semi)
+      if (!ex && t.type == type::semi && !t.first)
       {
         module_marker_ = move (l);
         return;
@@ -217,10 +245,14 @@ namespace build2
 
       // Should be {}-balanced.
       //
-      for (; t.type != type::eos && t.type != type::semi; l_->next (t)) ;
+      for (;
+           t.type != type::eos && t.type != type::semi && !t.first;
+           l_->next (t)) ;
 
       if (t.type != type::semi)
         fail (t) << "';' expected instead of " << t;
+      else if (t.first)
+        fail (t) << "';' must be on the same line";
 
       if (!u_->module_info.name.empty ())
         fail (l) << "multiple module declarations";
@@ -241,12 +273,12 @@ namespace build2
       //
       for (;; l_->next (t))
       {
-        if (t.type != type::identifier)
+        if (t.type != type::identifier || t.first)
           fail (t) << "module name expected instead of " << t;
 
         n += t.value;
 
-        if (l_->next (t) != type::dot)
+        if (l_->next (t) != type::dot || t.first)
           break;
 
         n += '.';
@@ -271,7 +303,7 @@ namespace build2
       {
         while (l_->next (t) != type::greater)
         {
-          if (t.type == type::eos)
+          if (t.type == type::eos || t.first)
             fail (t) << "closing '>' expected after header name" << endf;
         }
       }
diff --git a/libbuild2/cc/parser.hxx b/libbuild2/cc/parser.hxx
index 7b33ef9..7c893b5 100644
--- a/libbuild2/cc/parser.hxx
+++ b/libbuild2/cc/parser.hxx
@@ -31,7 +31,7 @@ namespace build2
       parse_import (token&, bool);
 
       void
-      parse_module (token&, bool);
+      parse_module (token&, bool, location_value);
 
       string
       parse_module_name (token&);
-- 
cgit v1.1