From 0cef93b4e2e9bf39b0ca542876f9ab1af6d0f01d Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Wed, 24 May 2017 13:24:31 +0200
Subject: Implement support for tokenization of preprocessed C/C++ source

---
 build2/buildfile                            |   1 +
 build2/cc/lexer.cxx                         | 683 ++++++++++++++++++++++++++++
 build2/cc/lexer.hxx                         | 166 +++++++
 unit-tests/cc/lexer/buildfile               |  17 +
 unit-tests/cc/lexer/char-literal.test       |  67 +++
 unit-tests/cc/lexer/comment.test            |  88 ++++
 unit-tests/cc/lexer/driver.cxx              |  66 +++
 unit-tests/cc/lexer/line.test               |  67 +++
 unit-tests/cc/lexer/number.test             |  48 ++
 unit-tests/cc/lexer/preprocessor.test       |  38 ++
 unit-tests/cc/lexer/raw-string-literal.test |  90 ++++
 unit-tests/cc/lexer/string-literal.test     |  65 +++
 12 files changed, 1396 insertions(+)
 create mode 100644 build2/cc/lexer.cxx
 create mode 100644 build2/cc/lexer.hxx
 create mode 100644 unit-tests/cc/lexer/buildfile
 create mode 100644 unit-tests/cc/lexer/char-literal.test
 create mode 100644 unit-tests/cc/lexer/comment.test
 create mode 100644 unit-tests/cc/lexer/driver.cxx
 create mode 100644 unit-tests/cc/lexer/line.test
 create mode 100644 unit-tests/cc/lexer/number.test
 create mode 100644 unit-tests/cc/lexer/preprocessor.test
 create mode 100644 unit-tests/cc/lexer/raw-string-literal.test
 create mode 100644 unit-tests/cc/lexer/string-literal.test
diff --git a/build2/buildfile b/build2/buildfile
index 2d65001..69dfc94 100644
--- a/build2/buildfile
+++ b/build2/buildfile
@@ -54,6 +54,7 @@ exe{b}:                                                   \
          cc/{hxx         cxx}{ guess                    } \
          cc/{hxx         cxx}{ init                     } \
          cc/{hxx         cxx}{ install                  } \
+         cc/{hxx         cxx}{ lexer                    } \
          cc/{hxx         cxx}{ link                     } \
          cc/{hxx         cxx}{ module                   } \
          cc/{            cxx}{ msvc                     } \
diff --git a/build2/cc/lexer.cxx b/build2/cc/lexer.cxx
new file mode 100644
index 0000000..3eb5d5b
--- /dev/null
+++ b/build2/cc/lexer.cxx
@@ -0,0 +1,683 @@
+// file      : build2/cc/lexer.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#include <build2/cc/lexer.hxx>
+
+using namespace std;
+using namespace butl;
+
+// Diagnostics plumbing.
+//
+namespace butl // ADL: found via the butl::char_scanner::xchar argument.
+{
+  inline build2::location
+  get_location (const butl::char_scanner::xchar& c, const void* data)
+  {
+    using namespace build2;
+
+    assert (data != nullptr); // E.g., must be &lexer::name_.
+    return location (static_cast<const path*> (data), c.line, c.column);
+  }
+}
+
+namespace build2
+{
+  namespace cc
+  {
+    inline auto lexer::
+    get (bool e) -> xchar
+    {
+      if (unget_) // Hand back the previously ungot character, if any.
+      {
+        unget_ = false;
+        return ungetc_;
+      }
+      else
+      {
+        xchar c (peek (e)); // Skips line continuations if e is true.
+        base::get (c);
+        return c;
+      }
+    }
+
+    auto lexer::
+    peek (bool e) -> xchar
+    {
+      if (unget_)
+        return ungetc_;
+
+      if (unpeek_)
+        return unpeekc_;
+
+      xchar c (base::peek ());
+
+      if (e && c == '\\') // Possibly a line continuation.
+      {
+        base::get (c); // Extract the backslash to look at what follows.
+        xchar p (base::peek ());
+
+        if (p == '\n')
+        {
+          base::get (p); // Extract the newline as well.
+          return peek (e); // Recurse.
+        }
+
+        // Save in the unpeek buffer so that it is returned on the subsequent
+        // calls to peek() (until get()).
+        //
+        unpeek_ = true;
+        unpeekc_ = c;
+      }
+
+      return c;
+    }
+
+    using type = token_type;
+
+    void lexer::
+    next (token& t, xchar c)
+    {
+      for (;; c = skip_spaces ()) // Only iterates after a preprocessor line.
+      {
+        t.line = c.line;
+        t.column = c.column;
+
+        if (eos (c))
+        {
+          t.type = type::eos;
+          return;
+        }
+
+        switch (c)
+        {
+          // Preprocessor lines.
+          //
+        case '#':
+          {
+            // It is tempting to simply scan until the newline ignoring
+            // anything in between. However, these lines can start a
+            // multi-line C-style comment. So we have to tokenize it. Note
+            // that we assume there cannot be #include directives.
+            //
+            // This may not work for things like #error that can contain
+            // pretty much anything.
+            //
+            for (;;)
+            {
+              c = skip_spaces (false); // Stop at newline.
+
+              if (eos (c) || c == '\n')
+                break;
+
+              next (t, c); // Keep using the passed token for buffers.
+            }
+            break; // Directive discarded; scan the next token.
+          }
+          // Single-letter punctuation.
+          //
+        case ';': t.type = type::semi;    return;
+        case '{': t.type = type::lcbrace; return;
+        case '}': t.type = type::rcbrace; return;
+          // Other single-letter punctuation.
+          //
+        case '(':
+        case ')':
+        case '[':
+        case ']':
+        case ',':
+        case '?':
+        case '~':
+        case '\\': t.type = type::punctuation; return;
+          // Potentially multi-letter punctuation.
+          //
+        case '.': // . .* .<N> ...
+          {
+            xchar p (peek ());
+
+            if (p == '*')
+            {
+              get (p);
+              t.type = type::punctuation;
+              return;
+            }
+            else if (p >= '0' && p <= '9')
+            {
+              number_literal (t, c);
+              return;
+            }
+            else if (p == '.')
+            {
+              get (p);
+              xchar q (peek ());
+              if (q == '.')
+              {
+                get (q);
+                t.type = type::punctuation;
+                return;
+              }
+              unget (p); // Put the second dot back.
+              // Fall through.
+            }
+
+            t.type = type::dot;
+            return;
+          }
+        case '=': // = ==
+        case '!': // ! !=
+        case '*': // * *=
+        case '/': // / /=   (/* and // handled by skip_spaces() above)
+        case '%': // % %=
+        case '^': // ^ ^=
+          {
+            xchar p (peek ());
+
+            if (p == '=')
+              get (p);
+
+            t.type = type::punctuation;
+            return;
+          }
+        case '>': // > >= >> >>=
+        case '<': // < <= << <<=
+          {
+            xchar p (peek ());
+
+            if (p == c)
+            {
+              get (p);
+              if ((p = peek ()) == '=')
+                get (p);
+            }
+            else if (p == '=')
+              get (p);
+
+            t.type = type::punctuation;
+            return;
+          }
+        case '+': // + ++ +=
+        case '-': // - -- -= -> ->*
+          {
+            xchar p (peek ());
+
+            if (p == c)
+              get (p);
+            else if (p == '=')
+              get (p);
+            else if (c == '-' && p == '>')
+            {
+              get (p);
+              if ((p = peek ()) == '*')
+                get (p);
+            }
+
+            t.type = type::punctuation;
+            return;
+          }
+        case '&': // & && &=
+        case '|': // | || |=
+          {
+            xchar p (peek ());
+
+            if (p == c)
+              get (p);
+            else if (p == '=')
+              get (p);
+
+            t.type = type::punctuation;
+            return;
+          }
+        case ':': // : ::
+          {
+            xchar p (peek ());
+
+            if (p == ':')
+              get (p);
+
+            t.type = type::punctuation;
+            return;
+          }
+          // Number (and also .<N> above).
+          //
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9':
+          {
+            number_literal (t, c);
+            return;
+          }
+          // Char/string literal, identifier, or other ($, @, `).
+          //
+        default:
+          {
+            bool raw (false); // Raw string literal.
+
+            if (alpha (c) || c == '_')
+            {
+              string& id (t.value);
+              id.clear ();
+
+              for (id += c; (c = peek ()) == '_' || alnum (c); get (c))
+                id += c;
+
+              // If the following character is a quote, see if the identifier
+              // is one of the literal prefixes.
+              //
+              if (c == '\'' || c == '\"')
+              {
+                size_t n (id.size ()), i (0);
+                switch (id[0])
+                {
+                case 'u':
+                  {
+                    if (n > 1 && id[1] == '8')
+                      ++i;
+                    // Fall through.
+                  }
+                case 'L':
+                case 'U':
+                  {
+                    ++i;
+
+                    if (c == '\"' && n > i && id[i] == 'R')
+                    {
+                      ++i;
+                      raw = true;
+                    }
+                    break;
+                  }
+                case 'R':
+                  {
+                    if (c == '\"')
+                    {
+                      ++i;
+                      raw = true;
+                    }
+                    break;
+                  }
+                }
+
+                if (i == n) // All characters "consumed".
+                {
+                  get (c); // Extract the opening quote.
+                  id.clear ();
+                }
+              }
+
+              if (!id.empty ())
+              {
+                t.type = type::identifier;
+                return;
+              }
+            }
+
+            switch (c)
+            {
+            case '\'':
+              {
+                char_literal (t, c);
+                return;
+              }
+            case '\"':
+              {
+                if (raw)
+                  raw_string_literal (t, c);
+                else
+                  string_literal (t, c);
+                return;
+              }
+            default:
+              {
+                t.type = type::other;
+                return;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    void lexer::
+    number_literal (token& t, xchar c)
+    {
+      t.line = c.line;
+      t.column = c.column;
+
+      // A number (integer or floating point literal) can:
+      //
+      // 1. Start with a dot (which must be followed by a digit, e.g., .123).
+      //
+      // 2. Can have a radix prefix (0b101, 0123, 0X12AB).
+      //
+      // 3. Can have an exponent (1e10, 0x1.p-10, 1.).
+      //
+      // 4. Digits can be separated with ' (123'456, 0xff00'00ff).
+      //
+      // 5. End with a built-in or user-defined suffix (123f, 123UL, 123_X).
+      //
+      // Quoting from GCC's preprocessor documentation:
+      //
+      // "Formally preprocessing numbers begin with an optional period, a
+      // required decimal digit, and then continue with any sequence of
+      // letters, digits, underscores, periods, and exponents. Exponents are
+      // the two-character sequences 'e+', 'e-', 'E+', 'E-', 'p+', 'p-', 'P+',
+      // and 'P-'."
+      //
+      // So it looks like a "C++ number" is then any unseparated (with
+      // whitespace or punctuation) sequence of those plus '. The only mildly
+      // tricky part is then to recognize +/- as being part of the exponent.
+      //
+      while (!eos ((c = peek ())))
+      {
+        switch (c)
+        {
+          // All the whitespace, punctuation, and other characters that end
+          // the number.
+          //
+        case ' ':
+        case '\n':
+        case '\t':
+        case '\r':
+        case '\f':
+        case '\v':
+
+        case '#':
+        case ';':
+        case '{':
+        case '}':
+        case '(':
+        case ')':
+        case '[':
+        case ']':
+        case ',':
+        case '?':
+        case '~':
+        case '=':
+        case '!':
+        case '*':
+        case '/':
+        case '%':
+        case '^':
+        case '>':
+        case '<':
+        case '&':
+        case '|':
+        case ':':
+        case '+': // The exponent case is handled below.
+        case '-': // The exponent case is handled below.
+        case '"':
+        case '\\':
+
+        case '@':
+        case '$':
+        case '`':
+          break; // Leave the terminator unextracted.
+
+          // Recognize +/- after the exponent.
+          //
+        case 'e':
+        case 'E':
+        case 'p':
+        case 'P':
+          {
+            get (c);
+            c = peek ();
+            if (c == '+' || c == '-')
+              get (c);
+            continue;
+          }
+
+        case '_':
+        case '.':
+        case '\'':
+        default: // Digits and letters.
+          {
+            get (c);
+            continue;
+          }
+        }
+
+        break; // Only reached for a terminator: the number is complete.
+      }
+
+      t.type = type::number;
+    }
+
+    void lexer::
+    char_literal (token& t, xchar c)
+    {
+      t.line = c.line; // c is the opening quote (see next()).
+      t.column = c.column;
+
+      char p (c); // Previous character (see below).
+
+      for (;;)
+      {
+        c = get ();
+
+        if (eos (c))
+          fail (location (&name_, t.line, t.column)) << "unterminated literal";
+
+        if (c == '\'' && p != '\\')
+          break; // Unescaped closing quote.
+
+        // Keep track of \\-escapings so we don't confuse them with \', as in
+        // '\\'.
+        //
+        p = (c == '\\' && p == '\\') ? '\0' : static_cast<char> (c);
+      }
+
+      // See if we have a user-defined suffix (which is an identifier).
+      //
+      if ((c = peek ()) == '_' || alpha (c))
+        literal_suffix (c);
+
+      t.type = type::character;
+    }
+
+    void lexer::
+    string_literal (token& t, xchar c)
+    {
+      t.line = c.line; // c is the opening quote (see next()).
+      t.column = c.column;
+
+      char p (c); // Previous character (see below).
+
+      for (;;)
+      {
+        c = get ();
+
+        if (eos (c))
+          fail (location (&name_, t.line, t.column)) << "unterminated literal";
+
+        if (c == '\"' && p != '\\')
+          break; // Unescaped closing quote.
+
+        // Keep track of \\-escapings so we don't confuse them with \", as in
+        // "\\".
+        //
+        p = (c == '\\' && p == '\\') ? '\0' : static_cast<char> (c);
+      }
+
+      // See if we have a user-defined suffix (which is an identifier).
+      //
+      if ((c = peek ()) == '_' || alpha (c))
+        literal_suffix (c);
+
+      t.type = type::string;
+    }
+
+    void lexer::
+    raw_string_literal (token& t, xchar c)
+    {
+      t.line = c.line; // c is the opening quote (see next()).
+      t.column = c.column;
+
+      // The overall form is:
+      //
+      // R"<delimiter>(<raw_characters>)<delimiter>"
+      //
+      // Where <delimiter> is a potentially-empty character sequence made of
+      // any source character but parentheses, backslash and spaces. It can be
+      // at most 16 characters long (note: the length is not checked below).
+      //
+      // Note that the <raw_characters> are not processed in any way, not even
+      // for line continuations.
+      //
+
+      // As a first step, parse the delimiter (including the opening paren).
+      //
+      string d (1, ')'); // Closing sequence to match: )<delimiter>"
+
+      for (;;)
+      {
+        c = get ();
+
+        if (eos (c) || c == '\"' || c == ')' || c == '\\' || c == ' ')
+          fail (location (&name_, t.line, t.column)) << "invalid raw literal";
+
+        if (c == '(')
+          break;
+
+        d += c;
+      }
+
+      d += '"';
+
+      // Now parse the raw characters while trying to match the closing
+      // delimiter.
+      //
+      for (size_t i (0);;) // Position to match in d.
+      {
+        c = get (false); // No newline escaping.
+
+        if (eos (c))
+          fail (location (&name_, t.line, t.column)) << "invalid raw literal";
+
+        if (c != d[i] && i != 0) // Restart from the beginning.
+          i = 0;
+
+        if (c == d[i])
+        {
+          if (++i == d.size ())
+            break; // Matched the entire closing sequence.
+        }
+      }
+
+      // See if we have a user-defined suffix (which is an identifier).
+      //
+      if ((c = peek ()) == '_' || alpha (c))
+        literal_suffix (c);
+
+      t.type = type::string;
+    }
+
+    void lexer::
+    literal_suffix (xchar c)
+    {
+      // Consume a literal suffix identifier starting with the peeked c.
+      //
+      for (get (c); (c = peek ()) == '_' || alnum (c); get (c)) ;
+    }
+
+    auto lexer::
+    skip_spaces (bool nl) -> xchar
+    {
+      xchar c (get ());
+
+      for (; !eos (c); c = get ())
+      {
+        switch (c)
+        {
+        case '\n':
+          {
+            if (!nl)
+              break; // Newline is significant: return it.
+
+            // Fall through.
+          }
+        case ' ':
+        case '\t':
+        case '\r':
+        case '\f':
+        case '\v': continue;
+
+        case '/':
+          {
+            xchar p (peek ());
+
+            // C++ comment.
+            //
+            if (p == '/')
+            {
+              get (p);
+              do { c = get (); } while (!eos (c) && c != '\n');
+
+              if (!nl)
+                break; // Return the terminating newline (or eos).
+
+              continue;
+            }
+
+            // C comment.
+            //
+            if (p == '*')
+            {
+              get (p);
+
+              for (;;)
+              {
+                c = get ();
+
+                if (eos (c))
+                  fail (p) << "unterminated comment"; // At the opening '*'.
+
+                if (c == '*' && (c = peek ()) == '/')
+                {
+                  get (c);
+                  break;
+                }
+              }
+              continue;
+            }
+            break; // Just a slash: return it.
+          }
+        }
+        break; // Not whitespace or a comment: done.
+      }
+
+      return c;
+    }
+
+    ostream&
+    operator<< (ostream& o, const token& t)
+    {
+      switch (t.type) // Only identifiers print their value.
+      {
+      case type::dot:         o << "'.'";                   break;
+      case type::semi:        o << "';'";                   break;
+      case type::lcbrace:     o << "'{'";                   break;
+      case type::rcbrace:     o << "'}'";                   break;
+      case type::punctuation: o << "<punctuation>";         break;
+
+      case type::identifier:  o << '\'' << t.value << '\''; break;
+
+      case type::number:      o << "<number literal>";      break;
+      case type::character:   o << "<char literal>";        break;
+      case type::string:      o << "<string literal>";      break;
+
+      case type::other:       o << "<other>";               break;
+      case type::eos:         o << "<end of file>";         break;
+      }
+
+      return o;
+    }
+  }
+}
diff --git a/build2/cc/lexer.hxx b/build2/cc/lexer.hxx
new file mode 100644
index 0000000..0735b45
--- /dev/null
+++ b/build2/cc/lexer.hxx
@@ -0,0 +1,166 @@
+// file      : build2/cc/lexer.hxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef BUILD2_CC_LEXER_HXX
+#define BUILD2_CC_LEXER_HXX
+
+#include <libbutl/char-scanner.hxx>
+
+#include <build2/types.hxx>
+#include <build2/utility.hxx>
+
+#include <build2/diagnostics.hxx>
+
+namespace build2
+{
+  namespace cc
+  {
+    // Preprocessor-level tokenization of C/C++ source. In other words, the
+    // sequence of tokens returned is similar to what a real C/C++ compiler
+    // would see from its preprocessor.
+    //
+    // The input is a (partially-)preprocessed translation unit that may still
+    // contain comments, line continuations, and preprocessor directives such
+    // as #line, #pragma, etc. Currently all preprocessor directives are
+    // discarded and no values are saved for literals.
+    //
+    enum class token_type
+    {
+      // NOTE: remember to update operator<<() if changing anything here!
+      //
+      eos,         // End of input.
+
+      dot,         // .
+      semi,        // ;
+      lcbrace,     // {
+      rcbrace,     // }
+      punctuation, // Other punctuation.
+
+      identifier,  // Name is stored in token::value.
+
+      number,      // Number literal.
+      character,   // Char   literal.
+      string,      // String literal.
+
+      other        // Other token.
+    };
+
+    struct token
+    {
+      token_type type;
+      string     value; // Identifier name; not meaningful otherwise.
+
+      uint64_t line;
+      uint64_t column;
+
+    public:
+      token ()
+          : token (token_type::eos, 0, 0) {}
+
+      token (token_type t, uint64_t l, uint64_t c)
+          : token (t, string (), l, c) {}
+
+      token (token_type t, string v, uint64_t l, uint64_t c)
+          : type (t), value (move (v)), line (l), column (c) {}
+    };
+
+    // Output the token value in a format suitable for diagnostics.
+    //
+    ostream&
+    operator<< (ostream&, const token&);
+
+    class lexer: protected butl::char_scanner
+    {
+    public:
+      lexer (istream& is, const path& name)
+          : char_scanner (is, false), name_ (name), fail ("error", &name_) {}
+
+      const path&
+      name () const {return name_;}
+
+      // Note that it is ok to call next() again after getting eos.
+      //
+      token
+      next ()
+      {
+        token t;
+        next (t, skip_spaces ());
+        return t;
+      }
+
+      // As above but reuse the token to avoid a (potential) memory
+      // allocation. Typical usage:
+      //
+      // for (token t; l.next (t) != token_type::eos; )
+      //   ...
+      //
+      token_type
+      next (token& t)
+      {
+        next (t, skip_spaces ());
+        return t.type;
+      }
+
+    private:
+      void
+      next (token&, xchar); // xchar: first character of the token (or eos).
+
+      void
+      number_literal (token&, xchar);
+
+      void
+      char_literal (token&, xchar);
+
+      void
+      string_literal (token&, xchar);
+
+      void
+      raw_string_literal (token&, xchar);
+
+      void
+      literal_suffix (xchar); // xchar: peeked first character of the suffix.
+
+      xchar
+      skip_spaces (bool newline = true); // If false, stop at newline.
+
+      // The char_scanner adaptation for newline escape sequence processing.
+      // Enabled by default and is only disabled in the raw string literals.
+      //
+    private:
+      using base = char_scanner;
+
+      xchar
+      get (bool escape = true);
+
+      void
+      get (const xchar& peeked) {base::get (peeked);}
+
+      xchar
+      peek (bool escape = true);
+
+    private:
+      const path name_; // Own copy; fail is initialized with its address.
+      fail_mark fail;
+    };
+
+    // Diagnostics plumbing. We assume that any diag stream for which we can
+    // use token as location has its aux data pointing to a pointer to path.
+    //
+    inline location
+    get_location (const token& t, const path& p)
+    {
+      return location (&p, t.line, t.column);
+    }
+
+    inline location
+    get_location (const token& t, const void* data)
+    {
+      assert (data != nullptr); // E.g., must be &parser::path_.
+      const path* p (*static_cast<const path* const*> (data));
+      return get_location (t, *p);
+    }
+  }
+}
+
+#endif // BUILD2_CC_LEXER_HXX
diff --git a/unit-tests/cc/lexer/buildfile b/unit-tests/cc/lexer/buildfile
new file mode 100644
index 0000000..ff4e0b3
--- /dev/null
+++ b/unit-tests/cc/lexer/buildfile
@@ -0,0 +1,17 @@
+# file      : unit-tests/cc/lexer/buildfile
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
+
+#@@ Temporary until we get utility library support.
+#
+import libs = libbutl%lib{butl}
+src = cc/lexer token lexer diagnostics utility variable name b-options types-parsers \
+context scope parser target operation rule prerequisite file module function \
+functions-builtin functions-path functions-process-path functions-string \
+functions-target-triplet algorithm search dump filesystem scheduler \
+config/{utility init operation module} spec
+
+exe{driver}: cxx{driver} ../../../build2/cxx{$src} ../../../build2/liba{b} \
+$libs test{*}
+
+include ../../../build2/
diff --git a/unit-tests/cc/lexer/char-literal.test b/unit-tests/cc/lexer/char-literal.test
new file mode 100644
index 0000000..f256785
--- /dev/null
+++ b/unit-tests/cc/lexer/char-literal.test
@@ -0,0 +1,67 @@
+# file      : unit-tests/cc/lexer/char-literal.test
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
+
+# Test character literals.
+#
+
+: normal
+:
+$* <<EOI >>EOO
+'a'
+'aa'
+'"'
+EOI
+<char literal>
+<char literal>
+<char literal>
+EOO
+
+: prefix
+:
+$* <<EOI >>EOO
+L'a'
+U'a'
+u'a'
+u8'a'
+u8R'a'
+EOI
+<char literal>
+<char literal>
+<char literal>
+<char literal>
+'u8R'
+<char literal>
+EOO
+
+: suffix
+:
+$* <<EOI >>EOO
+'a'x
+'a'_X123
+EOI
+<char literal>
+<char literal>
+EOO
+
+: escape
+:
+$* <<EOI >>EOO
+'\''
+'\\'
+'\\\''
+'\n'
+U'\U0001f34c'
+EOI
+<char literal>
+<char literal>
+<char literal>
+<char literal>
+<char literal>
+EOO
+
+: unterminated
+:
+$* <"'a" 2>>EOE != 0
+stdin:1:1: error: unterminated literal
+EOE
diff --git a/unit-tests/cc/lexer/comment.test b/unit-tests/cc/lexer/comment.test
new file mode 100644
index 0000000..e90d8e0
--- /dev/null
+++ b/unit-tests/cc/lexer/comment.test
@@ -0,0 +1,88 @@
+# file      : unit-tests/cc/lexer/comment.test
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
+
+# Test C and C++ comments.
+#
+
+: c-comment
+:
+$* <<EOI
+/* 'one' */
+/* "two" // three
+*/
+/**
+four
+// five */
+/**
+six /*
+*/
+EOI
+
+: cxx-comment
+:
+$* <<EOI
+// 'one'
+// "two" // three
+// four /* five */
+EOI
+
+: commented-out
+:
+$* <<EOI >"';'"
+// /*
+;
+// */
+EOI
+
+: c-unterminated
+:
+$* <<EOI 2>>EOE != 0
+/*
+comment
+EOI
+stdin:1:2: error: unterminated comment
+EOE
+
+: cxx-unterminated
+:
+$* <<:EOI
+// comment
+EOI
+
+: in-char-literal
+:
+$* <<EOI >>EOO
+'//'
+'/*'*/
+EOI
+<char literal>
+<char literal>
+<punctuation>
+<punctuation>
+EOO
+
+: in-string-literal
+:
+$* <<EOI >>EOO
+"//foo"
+"/*"*/
+EOI
+<string literal>
+<string literal>
+<punctuation>
+<punctuation>
+EOO
+
+: in-raw-string-literal
+:
+$* <<EOI >>EOO
+R"X(
+// foo
+/* bar
+)X"*/
+EOI
+<string literal>
+<punctuation>
+<punctuation>
+EOO
diff --git a/unit-tests/cc/lexer/driver.cxx b/unit-tests/cc/lexer/driver.cxx
new file mode 100644
index 0000000..db3f516
--- /dev/null
+++ b/unit-tests/cc/lexer/driver.cxx
@@ -0,0 +1,66 @@
+// file      : unit-tests/cc/lexer/driver.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#include <cassert>
+#include <iostream>
+
+#include <build2/types.hxx>
+#include <build2/utility.hxx>
+
+#include <build2/cc/lexer.hxx>
+
+using namespace std;
+
+namespace build2
+{
+  namespace cc
+  {
+    // Usage: argv[0] [<file>]
+    //
+    int
+    main (int argc, char* argv[])
+    {
+      try
+      {
+        istream* is;
+        const char* in; // Input name that is passed to the lexer.
+
+        // Reading from file is several times faster.
+        //
+        ifdstream ifs;
+        if (argc > 1)
+        {
+          in = argv[1];
+          ifs.open (in);
+          is = &ifs;
+        }
+        else
+        {
+          in = "stdin";
+          cin.exceptions (istream::failbit | istream::badbit);
+          is = &cin;
+        }
+
+        lexer l (*is, path (in));
+
+        // No use printing eos since we will either get it or loop forever.
+        //
+        for (token t; l.next (t) != token_type::eos; )
+          cout << t << endl; // One token per line.
+      }
+      catch (const failed&)
+      {
+        return 1;
+      }
+
+      return 0;
+    }
+  }
+}
+
+int
+main (int argc, char* argv[])
+{
+  return build2::cc::main (argc, argv);
+}
diff --git a/unit-tests/cc/lexer/line.test b/unit-tests/cc/lexer/line.test
new file mode 100644
index 0000000..9eda9c3
--- /dev/null
+++ b/unit-tests/cc/lexer/line.test
@@ -0,0 +1,67 @@
+# file      : unit-tests/cc/lexer/line.test
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
+
+# Test line continuations.
+#
+
+: identifier
+:
+$* <<EOI >"'foo123'"
+fo\
+o\
+1\
+2\
+3
+EOI
+
+: punctuation
+:
+$* <<EOI >'<punctuation>'
+.\
+.\
+.
+EOI
+
+: c-comment
+:
+$* <<EOI
+/\
+*
+comment
+*\
+/\
+
+EOI
+
+: cxx-comment
+:
+$* <<EOI
+/\
+/ comment\
+more\
+more
+EOI
+
+: other
+:
+$* <<EOI >>EOO
+\abc
+EOI
+<punctuation>
+'abc'
+EOO
+
+: multiple
+:
+$* <<EOI >>EOO
+\\
+EOI
+<punctuation>
+EOO
+
+: unterminated
+:
+$* <<:EOI >'<punctuation>'
+\
+EOI
diff --git a/unit-tests/cc/lexer/number.test b/unit-tests/cc/lexer/number.test
new file mode 100644
index 0000000..1d9b9c5
--- /dev/null
+++ b/unit-tests/cc/lexer/number.test
@@ -0,0 +1,48 @@
+# file      : unit-tests/cc/lexer/number.test
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
+
+# Test numbers.
+#
+
+$* <'1'  >'<number literal>'
+$* <'.1' >'<number literal>'
+$* <'1.' >'<number literal>'
+
+$* <'0b101'  >'<number literal>'
+$* <'0123'   >'<number literal>'
+$* <'0X12AB' >'<number literal>'
+
+$* <'1e10'     >'<number literal>'
+$* <'1E+10'    >'<number literal>'
+$* <'0x1.p10'  >'<number literal>'
+$* <'0x1.P-10' >'<number literal>'
+
+$* <"123'456"     >'<number literal>'
+$* <"0xff00'00ff" >'<number literal>'
+
+$* <'123f'  >'<number literal>'
+$* <'123UL' >'<number literal>'
+$* <'123_X' >'<number literal>'
+
+: separate-punctuation
+:
+$* <'123;' >>EOO
+<number literal>
+';'
+EOO
+
+: separate-plus-minus
+:
+$* <'1.0_a+2.0' >>EOO
+<number literal>
+<punctuation>
+<number literal>
+EOO
+
+: separate-whitespace
+:
+$* <'123 abc' >>EOO
+<number literal>
+'abc'
+EOO
diff --git a/unit-tests/cc/lexer/preprocessor.test b/unit-tests/cc/lexer/preprocessor.test
new file mode 100644
index 0000000..2917649
--- /dev/null
+++ b/unit-tests/cc/lexer/preprocessor.test
@@ -0,0 +1,38 @@
+# file      : unit-tests/cc/lexer/preprocessor.test
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
+
+# Test preprocessor lines.
+#
+
+: normal
+:
+$* <<EOI
+#pragma message("abc")
+EOI
+
+: multiline
+:
+$* <<EOI
+#pragma message \
+( \
+"abc" \
+)
+EOI
+
+: comment
+:
+$* <<EOI
+#pragma foo /*
+bar
+baz
+*/
+#pragma foo // bar baz
+EOI
+
+: line
+:
+$* <<EOI
+# 1 "test.cxx" 2
+#line 8 "z:\\tmp\\test.hxx"
+EOI
diff --git a/unit-tests/cc/lexer/raw-string-literal.test b/unit-tests/cc/lexer/raw-string-literal.test
new file mode 100644
index 0000000..e8e8b6b
--- /dev/null
+++ b/unit-tests/cc/lexer/raw-string-literal.test
@@ -0,0 +1,90 @@
+# file      : unit-tests/cc/lexer/raw-string-literal.test
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
+
+# Test raw string literals.
+#
+
+: normal
+:
+$* <<EOI >>EOO
+R"()"
+R"(ab)"
+R"(a"b)"
+R"(a)b)"
+R"%(a%)b)%"
+R"X(a
+    b)X"
+R"X(a\
+    b)X"
+EOI
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+EOO
+
+: prefix
+:
+$* <<EOI >>EOO
+LR"(ab)"
+UR"(ab)"
+uR"(ab)"
+u8R"(ab)"
+EOI
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+EOO
+
+: suffix
+:
+$* <<EOI >>EOO
+R"(ab)"x
+R"(ab)"_X123
+EOI
+<string literal>
+<string literal>
+EOO
+
+: escape
+:
+$* <<EOI >>EOO
+R"(\)"
+EOI
+<string literal>
+EOO
+
+: invalid-no-paren
+:
+$* <'R"a"' 2>>EOE != 0
+stdin:1:2: error: invalid raw literal
+EOE
+
+: invalid-paren
+:
+$* <'R")()("' 2>>EOE != 0
+stdin:1:2: error: invalid raw literal
+EOE
+
+: invalid-unterminated-paren
+:
+$* <'R"(abc"' 2>>EOE != 0
+stdin:1:2: error: invalid raw literal
+EOE
+
+: invalid-unterminated-delimiter
+:
+$* <'R"X(abc)"' 2>>EOE != 0
+stdin:1:2: error: invalid raw literal
+EOE
+
+: invalid-unterminated-quote
+:
+$* <'R"X(abc)X' 2>>EOE != 0
+stdin:1:2: error: invalid raw literal
+EOE
diff --git a/unit-tests/cc/lexer/string-literal.test b/unit-tests/cc/lexer/string-literal.test
new file mode 100644
index 0000000..062d290
--- /dev/null
+++ b/unit-tests/cc/lexer/string-literal.test
@@ -0,0 +1,65 @@
+# file      : unit-tests/cc/lexer/string-literal.test
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
+
+# Test string literals (except raw).
+#
+
+: normal
+:
+$* <<EOI >>EOO
+"aa"
+"'"
+"a""b"
+EOI
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+EOO
+
+: prefix
+:
+$* <<EOI >>EOO
+L"ab"
+U"ab"
+u"ab"
+u8"ab"
+EOI
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+EOO
+
+: suffix
+:
+$* <<EOI >>EOO
+"ab"x
+"ab"_X123
+EOI
+<string literal>
+<string literal>
+EOO
+
+: escape
+:
+$* <<EOI >>EOO
+"\"\""
+"\\\\"
+"\\\"\\"
+"\n\t"
+U"a\U0001f34c"
+EOI
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+EOO
+
+: unterminated
+:
+$* <'"ab' 2>>EOE != 0
+stdin:1:1: error: unterminated literal
+EOE
-- 
cgit v1.1