Implement support for tokenization of preprocessed C/C++ source

author: Boris Kolpackov <boris@codesynthesis.com> 2017-05-24 13:24:31 +0200
committer: Boris Kolpackov <boris@codesynthesis.com> 2017-05-24 13:24:31 +0200
commit: 0cef93b4e2e9bf39b0ca542876f9ab1af6d0f01d (patch)
tree: 187b83b65f28cdf4f8a2b0feadf392b49554fbf3 /build2
parent: b3526a5c925169b3be00a5dd4d8c8222f3a475cd (diff)
3 files changed, 850 insertions, 0 deletions
diff --git a/build2/buildfile b/build2/buildfile
index 2d65001..69dfc94 100644
--- a/build2/buildfile
+++ b/build2/buildfile
@@ -54,6 +54,7 @@ exe{b}:                                                   \
          cc/{hxx         cxx}{ guess                    } \
          cc/{hxx         cxx}{ init                     } \
          cc/{hxx         cxx}{ install                  } \
+         cc/{hxx         cxx}{ lexer                    } \
          cc/{hxx         cxx}{ link                     } \
          cc/{hxx         cxx}{ module                   } \
          cc/{            cxx}{ msvc                     } \
diff --git a/build2/cc/lexer.cxx b/build2/cc/lexer.cxx
new file mode 100644
index 0000000..3eb5d5b
--- /dev/null
+++ b/build2/cc/lexer.cxx
@@ -0,0 +1,683 @@
+// file      : build2/cc/lexer.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#include <build2/cc/lexer.hxx>
+
+using namespace std;
+using namespace butl;
+
+// Diagnostics plumbing.
+//
+namespace butl // ADL
+{
+  inline build2::location
+  get_location (const butl::char_scanner::xchar& c, const void* data)
+  {
+    using namespace build2;
+
+    assert (data != nullptr); // E.g., must be &lexer::name_.
+    return location (static_cast<const path*> (data), c.line, c.column);
+  }
+}
+
+namespace build2
+{
+  namespace cc
+  {
+    inline auto lexer::
+    get (bool e) -> xchar
+    {
+      if (unget_)
+      {
+        unget_ = false;
+        return ungetc_;
+      }
+      else
+      {
+        xchar c (peek (e));
+        base::get (c);
+        return c;
+      }
+    }
+
+    auto lexer::
+    peek (bool e) -> xchar
+    {
+      if (unget_)
+        return ungetc_;
+
+      if (unpeek_)
+        return unpeekc_;
+
+      xchar c (base::peek ());
+
+      if (e && c == '\\')
+      {
+        base::get (c);
+        xchar p (base::peek ());
+
+        if (p == '\n')
+        {
+          base::get (p);
+          return peek (e); // Recurse.
+        }
+
+        // Save in the unpeek buffer so that it is returned on the subsequent
+        // calls to peek() (until get()).
+        //
+        unpeek_ = true;
+        unpeekc_ = c;
+      }
+
+      return c;
+    }
+
+    using type = token_type;
+
+    void lexer::
+    next (token& t, xchar c)
+    {
+      for (;; c = skip_spaces ())
+      {
+        t.line = c.line;
+        t.column = c.column;
+
+        if (eos (c))
+        {
+          t.type = type::eos;
+          return;
+        }
+
+        switch (c)
+        {
+          // Preprocessor lines.
+          //
+        case '#':
+          {
+            // It is tempting to simply scan until the newline ignoring
+            // anything in between. However, these lines can start a
+            // multi-line C-style comment. So we have to tokenize it. Note
+            // that we assume there cannot be #include directives.
+            //
+            // This may not work for things like #error that can contain
+            // pretty much anything.
+            //
+            for (;;)
+            {
+              c = skip_spaces (false); // Stop at newline.
+
+              if (eos (c) || c == '\n')
+                break;
+
+              next (t, c); // Keep using the passed token for buffers.
+            }
+            break;
+          }
+          // Single-letter punctuation.
+          //
+        case ';': t.type = type::semi;    return;
+        case '{': t.type = type::lcbrace; return;
+        case '}': t.type = type::rcbrace; return;
+          // Other single-letter punctuation.
+          //
+        case '(':
+        case ')':
+        case '[':
+        case ']':
+        case ',':
+        case '?':
+        case '~':
+        case '\\': t.type = type::punctuation; return;
+          // Potentially multi-letter punctuation.
+          //
+        case '.': // . .* .<N> ...
+          {
+            xchar p (peek ());
+
+            if (p == '*')
+            {
+              get (p);
+              t.type = type::punctuation;
+              return;
+            }
+            else if (p >= '0' && p <= '9')
+            {
+              number_literal (t, c);
+              return;
+            }
+            else if (p == '.')
+            {
+              get (p);
+              xchar q (peek ());
+              if (q == '.')
+              {
+                get (q);
+                t.type = type::punctuation;
+                return;
+              }
+              unget (p);
+              // Fall through.
+            }
+
+            t.type = type::dot;
+            return;
+          }
+        case '=': // = ==
+        case '!': // ! !=
+        case '*': // * *=
+        case '/': // / /=   (/* and // handled by skip_spaced() above)
+        case '%': // % %=
+        case '^': // ^ ^=
+          {
+            xchar p (peek ());
+
+            if (p == '=')
+              get (p);
+
+            t.type = type::punctuation;
+            return;
+          }
+        case '>': // > >= >> >>=
+        case '<': // < <= << <<=
+          {
+            xchar p (peek ());
+
+            if (p == c)
+            {
+              get (p);
+              if ((p = peek ()) == '=')
+                get (p);
+            }
+            else if (p == '=')
+              get (p);
+
+            t.type = type::punctuation;
+            return;
+          }
+        case '+': // + ++ +=
+        case '-': // - -- -= -> ->*
+          {
+            xchar p (peek ());
+
+            if (p == c)
+              get (p);
+            else if (p == '=')
+              get (p);
+            else if (c == '-' && p == '>')
+            {
+              get (p);
+              if ((p = peek ()) == '*')
+                get (p);
+            }
+
+            t.type = type::punctuation;
+            return;
+          }
+        case '&': // & && &=
+        case '|': // | || |=
+          {
+            xchar p (peek ());
+
+            if (p == c)
+              get (p);
+            else if (p == '=')
+              get (p);
+
+            t.type = type::punctuation;
+            return;
+          }
+        case ':': // : ::
+          {
+            xchar p (peek ());
+
+            if (p == ':')
+              get (p);
+
+            t.type = type::punctuation;
+            return;
+          }
+          // Number (and also .<N> above).
+          //
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9':
+          {
+            number_literal (t, c);
+            return;
+          }
+          // Char/string literal, identifier, or other (\, $, @, `).
+          //
+        default:
+          {
+            bool raw (false); // Raw string literal.
+
+            if (alpha (c) || c == '_')
+            {
+              string& id (t.value);
+              id.clear ();
+
+              for (id += c; (c = peek ()) == '_' || alnum (c); get (c))
+                id += c;
+
+              // If the following character is a quote, see if the identifier
+              // is one of the literal prefixes.
+              //
+              if (c == '\'' || c == '\"')
+              {
+                size_t n (id.size ()), i (0);
+                switch (id[0])
+                {
+                case 'u':
+                  {
+                    if (n > 1 && id[1] == '8')
+                      ++i;
+                    // Fall through.
+                  }
+                case 'L':
+                case 'U':
+                  {
+                    ++i;
+
+                    if (c == '\"' && n > i && id[i] == 'R')
+                    {
+                      ++i;
+                      raw = true;
+                    }
+                    break;
+                  }
+                case 'R':
+                  {
+                    if (c == '\"')
+                    {
+                      ++i;
+                      raw = true;
+                    }
+                    break;
+                  }
+                }
+
+                if (i == n) // All characters "consumed".
+                {
+                  get (c);
+                  id.clear ();
+                }
+              }
+
+              if (!id.empty ())
+              {
+                t.type = type::identifier;
+                return;
+              }
+            }
+
+            switch (c)
+            {
+            case '\'':
+              {
+                char_literal (t, c);
+                return;
+              }
+            case '\"':
+              {
+                if (raw)
+                  raw_string_literal (t, c);
+                else
+                  string_literal (t, c);
+                return;
+              }
+            default:
+              {
+                t.type = type::other;
+                return;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    void lexer::
+    number_literal (token& t, xchar c)
+    {
+      t.line = c.line;
+      t.column = c.column;
+
+      // A number (integer or floating point literal) can:
+      //
+      // 1. Start with a dot (which must be followed by a digit, e.g., .123).
+      //
+      // 2. Can have a radix prefix (0b101, 0123, 0X12AB).
+      //
+      // 3. Can have an exponent (1e10, 0x1.p-10, 1.).
+      //
+      // 4. Digits can be separated with ' (123'456, 0xff00'00ff).
+      //
+      // 5. End with a built-in or user defined literal (123f, 123UL, 123_X)
+      //
+      // Quoting from GCC's preprocessor documentation:
+      //
+      // "Formally preprocessing numbers begin with an optional period, a
+      // required decimal digit, and then continue with any sequence of
+      // letters, digits, underscores, periods, and exponents. Exponents are
+      // the two-character sequences 'e+', 'e-', 'E+', 'E-', 'p+', 'p-', 'P+',
+      // and 'P-'."
+      //
+      // So it looks like a "C++ number" is then any unseparated (with
+      // whitespace or punctuation) sequence of those plus '. The only mildly
+      // tricky part is then to recognize +/- as being part of the exponent.
+      //
+      while (!eos ((c = peek ())))
+      {
+        switch (c)
+        {
+          // All the whitespace, punctuation, and other characters that end
+          // the number.
+          //
+        case ' ':
+        case '\n':
+        case '\t':
+        case '\r':
+        case '\f':
+        case '\v':
+
+        case '#':
+        case ';':
+        case '{':
+        case '}':
+        case '(':
+        case ')':
+        case '[':
+        case ']':
+        case ',':
+        case '?':
+        case '~':
+        case '=':
+        case '!':
+        case '*':
+        case '/':
+        case '%':
+        case '^':
+        case '>':
+        case '<':
+        case '&':
+        case '|':
+        case ':':
+        case '+': // The exponent case is handled below.
+        case '-': // The exponent case is handled below.
+        case '"':
+        case '\\':
+
+        case '@':
+        case '$':
+        case '`':
+          break;
+
+          // Recognize +/- after the exponent.
+          //
+        case 'e':
+        case 'E':
+        case 'p':
+        case 'P':
+          {
+            get (c);
+            c = peek ();
+            if (c == '+' || c == '-')
+              get (c);
+            continue;
+          }
+
+        case '_':
+        case '.':
+        case '\'':
+        default: // Digits and letters.
+          {
+            get (c);
+            continue;
+          }
+        }
+
+        break;
+      }
+
+      t.type = type::number;
+    }
+
+    void lexer::
+    char_literal (token& t, xchar c)
+    {
+      t.line = c.line;
+      t.column = c.column;
+
+      char p (c); // Previous character (see below).
+
+      for (;;)
+      {
+        c = get ();
+
+        if (eos (c))
+          fail (location (&name_, t.line, t.column)) << "unterminated literal";
+
+        if (c == '\'' && p != '\\')
+          break;
+
+        // Keep track of \\-escapings so we don't confuse them with \', as in
+        // '\\'.
+        //
+        p = (c == '\\' && p == '\\') ? '\0' : static_cast<char> (c);
+      }
+
+      // See if we have a user-defined suffix (which is an identifier).
+      //
+      if ((c = peek ()) == '_' || alpha (c))
+        literal_suffix (c);
+
+      t.type = type::character;
+    }
+
+    void lexer::
+    string_literal (token& t, xchar c)
+    {
+      t.line = c.line;
+      t.column = c.column;
+
+      char p (c); // Previous character (see below).
+
+      for (;;)
+      {
+        c = get ();
+
+        if (eos (c))
+          fail (location (&name_, t.line, t.column)) << "unterminated literal";
+
+        if (c == '\"' && p != '\\')
+          break;
+
+        // Keep track of \\-escapings so we don't confuse them with \", as in
+        // "\\".
+        //
+        p = (c == '\\' && p == '\\') ? '\0' : static_cast<char> (c);
+      }
+
+      // See if we have a user-defined suffix (which is an identifier).
+      //
+      if ((c = peek ()) == '_' || alpha (c))
+        literal_suffix (c);
+
+      t.type = type::string;
+    }
+
+    void lexer::
+    raw_string_literal (token& t, xchar c)
+    {
+      t.line = c.line;
+      t.column = c.column;
+
+      // The overall form is:
+      //
+      // R"<delimiter>(<raw_characters>)<delimiter>"
+      //
+      // Where <delimiter> is a potentially-empty character sequence made of
+      // any source character but parentheses, backslash and spaces. It can be
+      // at most 16 characters long.
+      //
+      // Note that the <raw_characters> are not processed in any way, not even
+      // for line continuations.
+      //
+
+      // As a first step, parse the delimiter (including the openning paren).
+      //
+      string d (1, ')');
+
+      for (;;)
+      {
+        c = get ();
+
+        if (eos (c) || c == '\"' || c == ')' || c == '\\' || c == ' ')
+          fail (location (&name_, t.line, t.column)) << "invalid raw literal";
+
+        if (c == '(')
+          break;
+
+        d += c;
+      }
+
+      d += '"';
+
+      // Now parse the raw characters while trying to match the closing
+      // delimiter.
+      //
+      for (size_t i (0);;) // Position to match in d.
+      {
+        c = get (false); // No newline escaping.
+
+        if (eos (c))
+          fail (location (&name_, t.line, t.column)) << "invalid raw literal";
+
+        if (c != d[i] && i != 0) // Restart from the beginning.
+          i = 0;
+
+        if (c == d[i])
+        {
+          if (++i == d.size ())
+            break;
+        }
+      }
+
+      // See if we have a user-defined suffix (which is an identifier).
+      //
+      if ((c = peek ()) == '_' || alpha (c))
+        literal_suffix (c);
+
+      t.type = type::string;
+    }
+
+    void lexer::
+    literal_suffix (xchar c)
+    {
+      // Parse a user-defined literal suffix identifier.
+      //
+      for (get (c); (c = peek ()) == '_' || alnum (c); get (c)) ;
+    }
+
+    auto lexer::
+    skip_spaces (bool nl) -> xchar
+    {
+      xchar c (get ());
+
+      for (; !eos (c); c = get ())
+      {
+        switch (c)
+        {
+        case '\n':
+          {
+            if (!nl)
+              break;
+
+            // Fall through.
+          }
+        case ' ':
+        case '\t':
+        case '\r':
+        case '\f':
+        case '\v': continue;
+
+        case '/':
+          {
+            xchar p (peek ());
+
+            // C++ comment.
+            //
+            if (p == '/')
+            {
+              get (p);
+              do { c = get (); } while (!eos (c) && c != '\n');
+
+              if (!nl)
+                break;
+
+              continue;
+            }
+
+            // C comment.
+            //
+            if (p == '*')
+            {
+              get (p);
+
+              for (;;)
+              {
+                c = get ();
+
+                if (eos (c))
+                  fail (p) << "unterminated comment";
+
+                if (c == '*' && (c = peek ()) == '/')
+                {
+                  get (c);
+                  break;
+                }
+              }
+              continue;
+            }
+            break;
+          }
+        }
+        break;
+      }
+
+      return c;
+    }
+
+    ostream&
+    operator<< (ostream& o, const token& t)
+    {
+      switch (t.type)
+      {
+      case type::dot:         o << "'.'";                   break;
+      case type::semi:        o << "';'";                   break;
+      case type::lcbrace:     o << "'{'";                   break;
+      case type::rcbrace:     o << "'}'";                   break;
+      case type::punctuation: o << "<punctuation>";         break;
+
+      case type::identifier:  o << '\'' << t.value << '\''; break;
+
+      case type::number:      o << "<number literal>";      break;
+      case type::character:   o << "<char literal>";        break;
+      case type::string:      o << "<string literal>";      break;
+
+      case type::other:       o << "<other>";               break;
+      case type::eos:         o << "<end of file>";         break;
+      }
+
+      return o;
+    }
+  }
+}
diff --git a/build2/cc/lexer.hxx b/build2/cc/lexer.hxx
new file mode 100644
index 0000000..0735b45
--- /dev/null
+++ b/build2/cc/lexer.hxx
@@ -0,0 +1,166 @@
+// file      : build2/cc/lexer.hxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef BUILD2_CC_LEXER_HXX
+#define BUILD2_CC_LEXER_HXX
+
+#include <libbutl/char-scanner.hxx>
+
+#include <build2/types.hxx>
+#include <build2/utility.hxx>
+
+#include <build2/diagnostics.hxx>
+
+namespace build2
+{
+  namespace cc
+  {
+    // Preprocessor-level tokenization of C/C++ source. In other words, the
+    // sequence of tokens returned is similar to what a real C/C++ compiler
+    // would see from its preprocessor.
+    //
+    // The input is a (partially-)preprocessed translation unit that may still
+    // contain comments, line continuations, and preprocessor directives such
+    // as #line, #pragma, etc. Currently all preprocessor directives are
+    // discarded and no values are saved for literals.
+    //
+    enum class token_type
+    {
+      // NOTE: remember to update operator<<() if changing anything here!
+      //
+      eos,
+
+      dot,         // .
+      semi,        // ;
+      lcbrace,     // {
+      rcbrace,     // }
+      punctuation, // Other punctuation.
+
+      identifier,
+
+      number,      // Number literal.
+      character,   // Char   literal.
+      string,      // String literal.
+
+      other        // Other token.
+    };
+
+    struct token
+    {
+      token_type type;
+      string     value;
+
+      uint64_t line;
+      uint64_t column;
+
+    public:
+      token ()
+          : token (token_type::eos, 0, 0) {}
+
+      token (token_type t, uint64_t l, uint64_t c)
+          : token (t, string (), l, c) {}
+
+      token (token_type t, string v, uint64_t l, uint64_t c)
+          : type (t), value (move (v)), line (l), column (c) {}
+    };
+
+    // Output the token value in a format suitable for diagnostics.
+    //
+    ostream&
+    operator<< (ostream&, const token&);
+
+    class lexer: protected butl::char_scanner
+    {
+    public:
+      lexer (istream& is, const path& name)
+          : char_scanner (is, false), name_ (name), fail ("error", &name_) {}
+
+      const path&
+      name () const {return name_;}
+
+      // Note that it is ok to call next() again after getting eos.
+      //
+      token
+      next ()
+      {
+        token t;
+        next (t, skip_spaces ());
+        return t;
+      }
+
+      // As above but reuse the token to avoid a (potential) memory
+      // allocation. Typical usage:
+      //
+      // for (token t; l.next (t) != token_type::eos; )
+      //   ...
+      //
+      token_type
+      next (token& t)
+      {
+        next (t, skip_spaces ());
+        return t.type;
+      }
+
+    private:
+      void
+      next (token&, xchar);
+
+      void
+      number_literal (token&, xchar);
+
+      void
+      char_literal (token&, xchar);
+
+      void
+      string_literal (token&, xchar);
+
+      void
+      raw_string_literal (token&, xchar);
+
+      void
+      literal_suffix (xchar);
+
+      xchar
+      skip_spaces (bool newline = true);
+
+      // The char_scanner adaptation for newline escape sequence processing.
+      // Enabled by default and is only disabled in the raw string literals.
+      //
+    private:
+      using base = char_scanner;
+
+      xchar
+      get (bool escape = true);
+
+      void
+      get (const xchar& peeked) {base::get (peeked);}
+
+      xchar
+      peek (bool escape = true);
+
+    private:
+      const path name_;
+      fail_mark fail;
+    };
+
+    // Diagnostics plumbing. We assume that any diag stream for which we can
+    // use token as location has its aux data pointing to pointer to path.
+    //
+    inline location
+    get_location (const token& t, const path& p)
+    {
+      return location (&p, t.line, t.column);
+    }
+
+    inline location
+    get_location (const token& t, const void* data)
+    {
+      assert (data != nullptr); // E.g., must be &parser::path_.
+      const path* p (*static_cast<const path* const*> (data));
+      return get_location (t, *p);
+    }
+  }
+}
+
+#endif // BUILD2_CC_LEXER_HXX
author	Boris Kolpackov <boris@codesynthesis.com>	2017-05-24 13:24:31 +0200
committer	Boris Kolpackov <boris@codesynthesis.com>	2017-05-24 13:24:31 +0200
commit	0cef93b4e2e9bf39b0ca542876f9ab1af6d0f01d (patch)
tree	187b83b65f28cdf4f8a2b0feadf392b49554fbf3 /build2
parent	b3526a5c925169b3be00a5dd4d8c8222f3a475cd (diff)