aboutsummaryrefslogtreecommitdiff
path: root/build2
diff options
context:
space:
mode:
author Boris Kolpackov <boris@codesynthesis.com> 2017-05-24 13:24:31 +0200
committer Boris Kolpackov <boris@codesynthesis.com> 2017-05-24 13:24:31 +0200
commit 0cef93b4e2e9bf39b0ca542876f9ab1af6d0f01d (patch)
tree 187b83b65f28cdf4f8a2b0feadf392b49554fbf3 /build2
parent b3526a5c925169b3be00a5dd4d8c8222f3a475cd (diff)
Implement support for tokenization of preprocessed C/C++ source
Diffstat (limited to 'build2')
-rw-r--r-- build2/buildfile 1
-rw-r--r-- build2/cc/lexer.cxx 683
-rw-r--r-- build2/cc/lexer.hxx 166
3 files changed, 850 insertions, 0 deletions
diff --git a/build2/buildfile b/build2/buildfile
index 2d65001..69dfc94 100644
--- a/build2/buildfile
+++ b/build2/buildfile
@@ -54,6 +54,7 @@ exe{b}: \
cc/{hxx cxx}{ guess } \
cc/{hxx cxx}{ init } \
cc/{hxx cxx}{ install } \
+ cc/{hxx cxx}{ lexer } \
cc/{hxx cxx}{ link } \
cc/{hxx cxx}{ module } \
cc/{ cxx}{ msvc } \
diff --git a/build2/cc/lexer.cxx b/build2/cc/lexer.cxx
new file mode 100644
index 0000000..3eb5d5b
--- /dev/null
+++ b/build2/cc/lexer.cxx
@@ -0,0 +1,683 @@
+// file : build2/cc/lexer.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <build2/cc/lexer.hxx>
+
+using namespace std;
+using namespace butl;
+
+// Diagnostics plumbing.
+//
+namespace butl // ADL
+{
+  // Map the position of a scanned character to a build2 location. The aux
+  // data is taken to be a pointer to the path of the input (for example,
+  // &lexer::name_).
+  //
+  inline build2::location
+  get_location (const butl::char_scanner::xchar& c, const void* data)
+  {
+    using namespace build2;
+
+    assert (data != nullptr);
+
+    const path* file (static_cast<const path*> (data));
+    return location (file, c.line, c.column);
+  }
+}
+
+namespace build2
+{
+ namespace cc
+ {
+    // Extract and return the next character. If the unget buffer is set,
+    // then return (and clear) it. Otherwise peek at the next character
+    // (passing along the escape flag e) and extract it.
+    //
+    inline auto lexer::
+    get (bool e) -> xchar
+    {
+      if (!unget_)
+      {
+        xchar r (peek (e));
+        base::get (r);
+        return r;
+      }
+
+      unget_ = false;
+      return ungetc_;
+    }
+
+    // Return the next character without extracting it. If e is true, then
+    // backslash-newline sequences (escaped newlines) are skipped over.
+    //
+    auto lexer::
+    peek (bool e) -> xchar
+    {
+      // Keep going until we see something other than an escaped newline.
+      //
+      for (;;)
+      {
+        // The unget and unpeek buffers, if set, take precedence.
+        //
+        if (unget_)
+          return ungetc_;
+
+        if (unpeek_)
+          return unpeekc_;
+
+        xchar c (base::peek ());
+
+        if (!e || c != '\\')
+          return c;
+
+        // To see what follows the backslash we have to extract it first.
+        //
+        base::get (c);
+        xchar n (base::peek ());
+
+        if (n != '\n')
+        {
+          // Not an escaped newline. Save the backslash in the unpeek buffer
+          // so that it is returned on the subsequent calls to peek() (until
+          // get()).
+          //
+          unpeek_ = true;
+          unpeekc_ = c;
+          return c;
+        }
+
+        base::get (n); // Drop the newline and start over.
+      }
+    }
+
+ using type = token_type; // Shorthand for the token type enumerators.
+ // Extract the next token into t given its already-extracted first character c.
+ void lexer::
+ next (token& t, xchar c)
+ {
+ for (;; c = skip_spaces ()) // Only a preprocessor line makes another iteration.
+ {
+ t.line = c.line;
+ t.column = c.column;
+
+ if (eos (c))
+ {
+ t.type = type::eos;
+ return;
+ }
+
+ switch (c)
+ {
+ // Preprocessor lines.
+ //
+ case '#':
+ {
+ // It is tempting to simply scan until the newline ignoring
+ // anything in between. However, these lines can start a
+ // multi-line C-style comment. So we have to tokenize it. Note
+ // that we assume there cannot be #include directives.
+ //
+ // This may not work for things like #error that can contain
+ // pretty much anything.
+ //
+ for (;;)
+ {
+ c = skip_spaces (false); // Stop at newline.
+
+ if (eos (c) || c == '\n')
+ break;
+
+ next (t, c); // Keep using the passed token for buffers.
+ }
+ break; // The directive's tokens are discarded; go on to the next token.
+ }
+ // Single-letter punctuation.
+ //
+ case ';': t.type = type::semi; return;
+ case '{': t.type = type::lcbrace; return;
+ case '}': t.type = type::rcbrace; return;
+ // Other single-letter punctuation.
+ //
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case ',':
+ case '?':
+ case '~':
+ case '\\': t.type = type::punctuation; return;
+ // Potentially multi-letter punctuation.
+ //
+ case '.': // . .* .<N> ...
+ {
+ xchar p (peek ());
+
+ if (p == '*')
+ {
+ get (p);
+ t.type = type::punctuation;
+ return;
+ }
+ else if (p >= '0' && p <= '9')
+ {
+ number_literal (t, c);
+ return;
+ }
+ else if (p == '.')
+ {
+ get (p);
+ xchar q (peek ());
+ if (q == '.')
+ {
+ get (q);
+ t.type = type::punctuation;
+ return;
+ }
+ unget (p); // Not an ellipsis: put the second dot back.
+ // Fall through.
+ }
+
+ t.type = type::dot;
+ return;
+ }
+ case '=': // = ==
+ case '!': // ! !=
+ case '*': // * *=
+ case '/': // / /= (/* and // are handled by skip_spaces())
+ case '%': // % %=
+ case '^': // ^ ^=
+ {
+ xchar p (peek ());
+
+ if (p == '=')
+ get (p);
+
+ t.type = type::punctuation;
+ return;
+ }
+ case '>': // > >= >> >>=
+ case '<': // < <= << <<=
+ {
+ xchar p (peek ());
+
+ if (p == c)
+ {
+ get (p);
+ if ((p = peek ()) == '=')
+ get (p);
+ }
+ else if (p == '=')
+ get (p);
+
+ t.type = type::punctuation;
+ return;
+ }
+ case '+': // + ++ +=
+ case '-': // - -- -= -> ->*
+ {
+ xchar p (peek ());
+
+ if (p == c)
+ get (p);
+ else if (p == '=')
+ get (p);
+ else if (c == '-' && p == '>')
+ {
+ get (p);
+ if ((p = peek ()) == '*')
+ get (p);
+ }
+
+ t.type = type::punctuation;
+ return;
+ }
+ case '&': // & && &=
+ case '|': // | || |=
+ {
+ xchar p (peek ());
+
+ if (p == c)
+ get (p);
+ else if (p == '=')
+ get (p);
+
+ t.type = type::punctuation;
+ return;
+ }
+ case ':': // : ::
+ {
+ xchar p (peek ());
+
+ if (p == ':')
+ get (p);
+
+ t.type = type::punctuation;
+ return;
+ }
+ // Number (and also .<N> above).
+ //
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ {
+ number_literal (t, c);
+ return;
+ }
+ // Char/string literal, identifier, or other ($, @, `, etc).
+ //
+ default:
+ {
+ bool raw (false); // Raw string literal.
+
+ if (alpha (c) || c == '_')
+ {
+ string& id (t.value);
+ id.clear ();
+
+ for (id += c; (c = peek ()) == '_' || alnum (c); get (c))
+ id += c;
+
+ // If the following character is a quote, see if the identifier
+ // is one of the literal prefixes.
+ //
+ if (c == '\'' || c == '\"')
+ {
+ size_t n (id.size ()), i (0); // i is the number of prefix characters recognized.
+ switch (id[0])
+ {
+ case 'u':
+ {
+ if (n > 1 && id[1] == '8')
+ ++i;
+ // Fall through.
+ }
+ case 'L':
+ case 'U':
+ {
+ ++i;
+
+ if (c == '\"' && n > i && id[i] == 'R')
+ {
+ ++i;
+ raw = true;
+ }
+ break;
+ }
+ case 'R':
+ {
+ if (c == '\"')
+ {
+ ++i;
+ raw = true;
+ }
+ break;
+ }
+ }
+
+ if (i == n) // All characters "consumed".
+ {
+ get (c); // Extract the opening quote.
+ id.clear (); // A literal prefix, not an identifier.
+ }
+ }
+
+ if (!id.empty ())
+ {
+ t.type = type::identifier;
+ return;
+ }
+ }
+
+ switch (c)
+ {
+ case '\'':
+ {
+ char_literal (t, c);
+ return;
+ }
+ case '\"':
+ {
+ if (raw)
+ raw_string_literal (t, c);
+ else
+ string_literal (t, c);
+ return;
+ }
+ default:
+ {
+ t.type = type::other;
+ return;
+ }
+ }
+ }
+ }
+ }
+ }
+
+    // Scan the remainder of a number literal that starts with c and set the
+    // token type to number. The value of the literal is not saved.
+    //
+    void lexer::
+    number_literal (token& t, xchar c)
+    {
+      t.line = c.line;
+      t.column = c.column;
+
+      // A number (integer or floating point literal) can:
+      //
+      // 1. Start with a dot (which must be followed by a digit, e.g., .123).
+      //
+      // 2. Can have a radix prefix (0b101, 0123, 0X12AB).
+      //
+      // 3. Can have an exponent (1e10, 0x1.p-10, 1.).
+      //
+      // 4. Digits can be separated with ' (123'456, 0xff00'00ff).
+      //
+      // 5. End with a built-in or user defined literal (123f, 123UL, 123_X)
+      //
+      // Quoting from GCC's preprocessor documentation:
+      //
+      // "Formally preprocessing numbers begin with an optional period, a
+      // required decimal digit, and then continue with any sequence of
+      // letters, digits, underscores, periods, and exponents. Exponents are
+      // the two-character sequences 'e+', 'e-', 'E+', 'E-', 'p+', 'p-', 'P+',
+      // and 'P-'."
+      //
+      // So a "C++ number" is treated as any sequence of those (plus ') that
+      // is not interrupted by whitespace or punctuation. The only mildly
+      // tricky part is to recognize +/- as being part of the exponent.
+      //
+      for (bool more (true); more; )
+      {
+        c = peek ();
+
+        if (eos (c))
+          break;
+
+        switch (c)
+        {
+          // Whitespace, punctuation, and other characters that end the
+          // number. Note that +/- that follow an exponent character never
+          // get here since they are extracted below.
+          //
+        case ' ': case '\n': case '\t': case '\r': case '\f': case '\v':
+
+        case '#': case ';': case '{': case '}': case '(': case ')':
+        case '[': case ']': case ',': case '?': case '~': case '=':
+        case '!': case '*': case '/': case '%': case '^': case '>':
+        case '<': case '&': case '|': case ':': case '+': case '-':
+        case '"': case '\\':
+
+        case '@': case '$': case '`':
+          {
+            more = false;
+            break;
+          }
+
+          // An exponent character optionally followed by the sign.
+          //
+        case 'e': case 'E': case 'p': case 'P':
+          {
+            get (c);
+            c = peek ();
+
+            if (c == '+' || c == '-')
+              get (c);
+
+            break;
+          }
+
+          // Everything else is part of the number: digits, letters, as well
+          // as _, ., and '.
+          //
+        default:
+          {
+            get (c);
+            break;
+          }
+        }
+      }
+
+      t.type = type::number;
+    }
+
+    // Scan a character literal given its opening quote c and set the token
+    // type to character. Issue diagnostics and fail if the input ends before
+    // the closing quote is seen.
+    //
+    void lexer::
+    char_literal (token& t, xchar c)
+    {
+      t.line = c.line;
+      t.column = c.column;
+
+      // True if the previous character is a backslash that is not itself
+      // escaped. This is what lets us tell \' from the closing quote in
+      // '\\'.
+      //
+      bool esc (c == '\\');
+
+      for (;;)
+      {
+        c = get ();
+
+        if (eos (c))
+          fail (location (&name_, t.line, t.column)) << "unterminated literal";
+
+        if (c == '\'' && !esc)
+          break;
+
+        esc = (c == '\\' && !esc);
+      }
+
+      // A user-defined suffix, if present, is an identifier.
+      //
+      c = peek ();
+
+      if (c == '_' || alpha (c))
+        literal_suffix (c);
+
+      t.type = type::character;
+    }
+
+    // Scan a (non-raw) string literal given its opening quote c and set the
+    // token type to string. Issue diagnostics and fail if the input ends
+    // before the closing quote is seen.
+    //
+    void lexer::
+    string_literal (token& t, xchar c)
+    {
+      t.line = c.line;
+      t.column = c.column;
+
+      // True if the previous character is a backslash that is not itself
+      // escaped. This is what lets us tell \" from the closing quote in
+      // "\\".
+      //
+      bool esc (c == '\\');
+
+      for (;;)
+      {
+        c = get ();
+
+        if (eos (c))
+          fail (location (&name_, t.line, t.column)) << "unterminated literal";
+
+        if (c == '\"' && !esc)
+          break;
+
+        esc = (c == '\\' && !esc);
+      }
+
+      // A user-defined suffix, if present, is an identifier.
+      //
+      c = peek ();
+
+      if (c == '_' || alpha (c))
+        literal_suffix (c);
+
+      t.type = type::string;
+    }
+
+    // Scan a raw string literal given its opening quote c and set the token
+    // type to string. Issue diagnostics and fail if the delimiter is invalid
+    // or the input ends before the closing sequence is seen.
+    //
+    void lexer::
+    raw_string_literal (token& t, xchar c)
+    {
+      t.line = c.line;
+      t.column = c.column;
+
+      // The overall form is:
+      //
+      // R"<delimiter>(<raw_characters>)<delimiter>"
+      //
+      // Where <delimiter> is a potentially-empty character sequence made of
+      // any source character but parentheses, backslash and spaces. It can be
+      // at most 16 characters long (the length is not checked here).
+      //
+      // Note that the <raw_characters> are not processed in any way, not even
+      // for line continuations.
+      //
+
+      // First parse the delimiter (up to and including the opening paren)
+      // assembling the closing sequence, )<delimiter>", as we go.
+      //
+      string d (")");
+
+      for (c = get (); c != '('; c = get ())
+      {
+        if (eos (c) || c == '\"' || c == ')' || c == '\\' || c == ' ')
+          fail (location (&name_, t.line, t.column)) << "invalid raw literal";
+
+        d += c;
+      }
+
+      d += '"';
+
+      // Now parse the raw characters while trying to match the closing
+      // sequence with m being the number of its characters matched so far.
+      //
+      size_t m (0);
+
+      while (m != d.size ())
+      {
+        c = get (false); // No newline escaping.
+
+        if (eos (c))
+          fail (location (&name_, t.line, t.column)) << "invalid raw literal";
+
+        if (c == d[m])
+          ++m;
+        else
+          m = (c == d[0] ? 1 : 0); // Restart from the beginning.
+      }
+
+      // A user-defined suffix, if present, is an identifier.
+      //
+      c = peek ();
+
+      if (c == '_' || alpha (c))
+        literal_suffix (c);
+
+      t.type = type::string;
+    }
+
+    // Extract a user-defined literal suffix (an identifier) given its
+    // peeked first character c.
+    //
+    void lexer::
+    literal_suffix (xchar c)
+    {
+      get (c);
+
+      while ((c = peek ()) == '_' || alnum (c))
+        get (c);
+    }
+
+    // Skip whitespace and comments and return the (already extracted)
+    // character that follows or eos. If nl is false, then also stop at and
+    // return a newline, including the one that ends a C++ comment.
+    //
+    auto lexer::
+    skip_spaces (bool nl) -> xchar
+    {
+      for (xchar c (get ());; c = get ())
+      {
+        if (eos (c))
+          return c;
+
+        if (c == '\n')
+        {
+          if (!nl)
+            return c;
+
+          continue;
+        }
+
+        if (c == ' ' || c == '\t' || c == '\r' || c == '\f' || c == '\v')
+          continue;
+
+        if (c != '/')
+          return c;
+
+        xchar n (peek ());
+
+        // C++ comment.
+        //
+        if (n == '/')
+        {
+          get (n);
+
+          do
+            c = get ();
+          while (!eos (c) && c != '\n');
+
+          if (!nl)
+            return c;
+
+          continue;
+        }
+
+        // C comment.
+        //
+        if (n == '*')
+        {
+          get (n);
+
+          for (;;)
+          {
+            c = get ();
+
+            if (eos (c))
+              fail (n) << "unterminated comment";
+
+            if (c == '*' && (c = peek ()) == '/')
+            {
+              get (c);
+              break;
+            }
+          }
+
+          continue;
+        }
+
+        return c; // Just a slash.
+      }
+    }
+
+    // Print a token in a format suitable for diagnostics: dot, semicolon,
+    // curly braces, and identifiers are printed quoted while everything else
+    // is printed as a <...> description.
+    //
+    ostream&
+    operator<< (ostream& o, const token& t)
+    {
+      const char* s (nullptr);
+
+      switch (t.type)
+      {
+      case type::identifier: return o << '\'' << t.value << '\'';
+
+      case type::dot:         s = "'.'";              break;
+      case type::semi:        s = "';'";              break;
+      case type::lcbrace:     s = "'{'";              break;
+      case type::rcbrace:     s = "'}'";              break;
+      case type::punctuation: s = "<punctuation>";    break;
+
+      case type::number:      s = "<number literal>"; break;
+      case type::character:   s = "<char literal>";   break;
+      case type::string:      s = "<string literal>"; break;
+
+      case type::other:       s = "<other>";          break;
+      case type::eos:         s = "<end of file>";    break;
+      }
+
+      if (s != nullptr)
+        o << s;
+
+      return o;
+    }
+ }
+}
diff --git a/build2/cc/lexer.hxx b/build2/cc/lexer.hxx
new file mode 100644
index 0000000..0735b45
--- /dev/null
+++ b/build2/cc/lexer.hxx
@@ -0,0 +1,166 @@
+// file : build2/cc/lexer.hxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#ifndef BUILD2_CC_LEXER_HXX
+#define BUILD2_CC_LEXER_HXX
+
+#include <libbutl/char-scanner.hxx>
+
+#include <build2/types.hxx>
+#include <build2/utility.hxx>
+
+#include <build2/diagnostics.hxx>
+
+namespace build2
+{
+ namespace cc
+ {
+ // Preprocessor-level tokenization of C/C++ source. In other words, the
+ // sequence of tokens returned is similar to what a real C/C++ compiler
+ // would see from its preprocessor.
+ //
+ // The input is a (partially-)preprocessed translation unit that may still
+ // contain comments, line continuations, and preprocessor directives such
+ // as #line, #pragma, etc. Currently all preprocessor directives are
+ // discarded and no values are saved for literals.
+ //
+ enum class token_type
+ {
+ // NOTE: remember to update operator<<() if changing anything here!
+ //
+ eos, // End of input.
+
+ dot, // .
+ semi, // ;
+ lcbrace, // {
+ rcbrace, // }
+ punctuation, // Other punctuation.
+
+ identifier, // Identifier (its spelling is saved in token::value).
+
+ number, // Number literal.
+ character, // Char literal.
+ string, // String literal.
+
+ other // Other token ($, @, `, etc).
+ };
+
+    // A token together with the position (line and column) it starts at.
+    // The value holds the spelling of identifier tokens.
+    //
+    struct token
+    {
+      token_type type;
+      string value;
+
+      uint64_t line;
+      uint64_t column;
+
+      // By default create the eos token at position 0,0.
+      //
+      token ()
+          : type (token_type::eos), line (0), column (0) {}
+
+      token (token_type t, uint64_t l, uint64_t c)
+          : type (t), line (l), column (c) {}
+
+      token (token_type t, string v, uint64_t l, uint64_t c)
+          : type (t), value (move (v)), line (l), column (c) {}
+    };
+
+    // Print the token in a format suitable for diagnostics.
+    //
+    ostream&
+    operator<< (ostream&, const token&);
+
+    // The lexer itself. It reads characters from the input stream via
+    // butl::char_scanner and uses the input name (path) for diagnostics
+    // locations.
+    //
+    class lexer: protected butl::char_scanner
+    {
+    public:
+      lexer (istream& is, const path& name)
+          : char_scanner (is, false), name_ (name), fail ("error", &name_) {}
+
+      const path&
+      name () const {return name_;}
+
+      // Extract and return the next token. Note that it is ok to call
+      // next() again after getting eos.
+      //
+      token
+      next ()
+      {
+        token r;
+        next (r);
+        return r;
+      }
+
+      // As above but reuse the token to avoid a (potential) memory
+      // allocation. Return the type of the extracted token. Typical usage:
+      //
+      // for (token t; l.next (t) != token_type::eos; )
+      //   ...
+      //
+      token_type
+      next (token& t)
+      {
+        xchar c (skip_spaces ());
+        next (t, c);
+        return t.type;
+      }
+
+    private:
+      // Extract the token that starts with the specified character.
+      //
+      void
+      next (token&, xchar);
+
+      // Scanners for the individual literal kinds. The character passed is
+      // the first character of the number or the opening quote.
+      //
+      void
+      number_literal (token&, xchar);
+
+      void
+      char_literal (token&, xchar);
+
+      void
+      string_literal (token&, xchar);
+
+      void
+      raw_string_literal (token&, xchar);
+
+      void
+      literal_suffix (xchar);
+
+      // Skip whitespace and comments, optionally stopping at a newline.
+      //
+      xchar
+      skip_spaces (bool newline = true);
+
+      // The char_scanner adaptation for newline escape sequence processing
+      // (backslash followed by newline). It is enabled by default and is
+      // only disabled in the raw string literals.
+      //
+      using base = char_scanner;
+
+      xchar
+      get (bool escape = true);
+
+      void
+      get (const xchar& peeked) {base::get (peeked);}
+
+      xchar
+      peek (bool escape = true);
+
+      const path name_;
+      fail_mark fail;
+    };
+
+    // Diagnostics plumbing: the location of a token in the file p.
+    //
+    inline location
+    get_location (const token& t, const path& p)
+    {
+      return location (&p, t.line, t.column);
+    }
+
+    // As above but for use with diag streams. We assume that any diag stream
+    // for which a token can be used as the location has its aux data
+    // pointing to a pointer to path (for example, &parser::path_).
+    //
+    inline location
+    get_location (const token& t, const void* data)
+    {
+      assert (data != nullptr);
+      return get_location (t, **static_cast<const path* const*> (data));
+    }
+ }
+}
+
+#endif // BUILD2_CC_LEXER_HXX