From b30107085c003b364fc11ba84b5e7130d0926940 Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Sun, 28 May 2017 11:57:41 +0200
Subject: Diagnose #include directives in C/C++ lexer

---
 build2/cc/lexer.cxx | 48 +++++++++++++++++++++++++++++-------------------
 build2/cc/lexer.hxx |  8 ++++----
 2 files changed, 33 insertions(+), 23 deletions(-)

(limited to 'build2')
diff --git a/build2/cc/lexer.cxx b/build2/cc/lexer.cxx
index 40178bb..faf73c8 100644
--- a/build2/cc/lexer.cxx
+++ b/build2/cc/lexer.cxx
@@ -102,6 +102,8 @@ namespace build2
           return;
         }
 
+        const location l (&name_, c.line, c.column);
+
         switch (c)
         {
           // Preprocessor lines.
@@ -110,12 +112,12 @@ namespace build2
           {
             // It is tempting to simply scan until the newline ignoring
             // anything in between. However, these lines can start a
-            // multi-line C-style comment. So we have to tokenize it. Note
-            // that we assume there cannot be #include directives.
+            // multi-line C-style comment. So we have to tokenize them.
             //
-            // This may not work for things like #error that can contain
-            // pretty much anything. Also note that lines that start with #
-            // can contain # further down.
+            // Note that this may not work for things like #error that can
+            // contain pretty much anything. Also note that lines that start
+            // with '#' can contain '#' further down. In this case we need to
+            // be careful not to recurse (and consume multiple newlines).
             //
             // Finally, to support diagnostics properly we need to recognize
             // #line directives.
@@ -140,10 +142,22 @@ namespace build2
                   // #line <integer> [<string literal>] ...
                   // #     <integer> [<string literal>] ...
                   //
+                  // Also diagnose #include while at it.
+                  //
                   if (!(c >= '0' && c <= '9'))
                   {
                     next (t, c, false);
 
+                    if (t.type == type::identifier)
+                    {
+                      if (t.value == "include")
+                        fail (l) << "unexpected #include directive";
+                      else if (t.value != "line")
+                        continue;
+                    }
+                    else
+                      continue;
+
                     if (t.type != type::identifier || t.value != "line")
                       continue;
 
@@ -505,15 +519,14 @@ namespace build2
     void lexer::
     char_literal (token& t, xchar c)
     {
-      uint64_t ln (c.line);
-      uint64_t cn (c.column);
+      const location l (&name_, c.line, c.column);
 
       for (char p (c);;) // Previous character (see below).
       {
         c = get ();
 
         if (eos (c) || c == '\n')
-          fail (location (&name_, ln, cn)) << "unterminated character literal";
+          fail (l) << "unterminated character literal";
 
         if (c == '\'' && p != '\\')
           break;
@@ -535,15 +548,14 @@ namespace build2
     void lexer::
     string_literal (token& t, xchar c)
     {
-      uint64_t ln (c.line);
-      uint64_t cn (c.column);
+      const location l (&name_, c.line, c.column);
 
       for (char p (c);;) // Previous character (see below).
       {
         c = get ();
 
         if (eos (c) || c == '\n')
-          fail (location (&name_, ln, cn)) << "unterminated string literal";
+          fail (l) << "unterminated string literal";
 
         if (c == '\"' && p != '\\')
           break;
@@ -576,8 +588,7 @@ namespace build2
       // Note that the <raw_characters> are not processed in any way, not even
       // for line continuations.
       //
-      uint64_t ln (c.line);
-      uint64_t cn (c.column);
+      const location l (&name_, c.line, c.column);
 
       // As a first step, parse the delimiter (including the openning paren).
       //
@@ -588,7 +599,7 @@ namespace build2
         c = get ();
 
         if (eos (c) || c == '\"' || c == ')' || c == '\\' || c == ' ')
-          fail (location (&name_, ln, cn)) << "invalid raw string literal";
+          fail (l) << "invalid raw string literal";
 
         if (c == '(')
           break;
@@ -606,7 +617,7 @@ namespace build2
         c = get (false); // No newline escaping.
 
         if (eos (c)) // Note: newline is ok.
-          fail (location (&name_, ln, cn)) << "invalid raw string literal";
+          fail (l) << "invalid raw string literal";
 
         if (c != d[i] && i != 0) // Restart from the beginning.
           i = 0;
@@ -664,18 +675,17 @@ namespace build2
 
       if (c == '\"')
       {
+        const location l (&name_, c.line, c.column);
+
         string s (move (log_file_).string ()); // Move string rep out.
         s.clear ();
 
-        uint64_t ln (c.line);
-        uint64_t cn (c.column);
-
         for (char p ('\0'); p != '\"'; ) // Previous character.
         {
           c = get ();
 
           if (eos (c) || c == '\n')
-            fail (location (&name_, ln, cn)) << "unterminated string literal";
+            fail (l) << "unterminated string literal";
 
           // Handle escapes.
           //
diff --git a/build2/cc/lexer.hxx b/build2/cc/lexer.hxx
index 8767606..c74a0a9 100644
--- a/build2/cc/lexer.hxx
+++ b/build2/cc/lexer.hxx
@@ -22,10 +22,10 @@ namespace build2
     //
     // The input is a (partially-)preprocessed translation unit that may still
     // contain comments, line continuations, and preprocessor directives such
-    // as #line, #pragma, etc., but not #include's. Currently all preprocessor
-    // directives except #line are ignored and no values are saved from
-    // literals. The #line directive (and its shorthand notation) is
-    // recognized to provide the logical token location.
+    // as #line and #pragma, but not #include (which is diagnosed). Currently,
+    // all preprocessor directives except #line are ignored and no values are
+    // saved from literals. The #line directive (and its shorthand notation)
+    // is recognized to provide the logical token location.
     //
     enum class token_type
     {
-- 
cgit v1.1