aboutsummaryrefslogtreecommitdiff
path: root/libbuild2/cc/lexer.hxx
diff options
context:
space:
mode:
author Karen Arutyunov <karen@codesynthesis.com> 2019-08-24 17:41:30 +0300
committer Karen Arutyunov <karen@codesynthesis.com> 2019-08-28 15:01:48 +0300
commit 4bdf53837e010073de802070d4e6087410662d3e (patch)
tree 2820d3964877d1a7d498833da325aa3d3a699353 /libbuild2/cc/lexer.hxx
parent ea24f530048cbce0c5335ca3fd3632c8ce34315a (diff)
Move cc build system module to separate library
Diffstat (limited to 'libbuild2/cc/lexer.hxx')
-rw-r--r-- libbuild2/cc/lexer.hxx 190
1 files changed, 190 insertions, 0 deletions
diff --git a/libbuild2/cc/lexer.hxx b/libbuild2/cc/lexer.hxx
new file mode 100644
index 0000000..cb2b3a5
--- /dev/null
+++ b/libbuild2/cc/lexer.hxx
@@ -0,0 +1,190 @@
+// file : libbuild2/cc/lexer.hxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#ifndef LIBBUILD2_CC_LEXER_HXX
+#define LIBBUILD2_CC_LEXER_HXX
+
+#include <libbutl/sha256.mxx>
+#include <libbutl/char-scanner.mxx>
+
+#include <libbuild2/types.hxx>
+#include <libbuild2/utility.hxx>
+
+#include <libbuild2/diagnostics.hxx>
+
+namespace build2
+{
+ namespace cc
+ {
+ // Preprocessor-level tokenization of C/C++ source. In other words, the
+ // sequence of tokens returned is similar to what a real C/C++ compiler
+ // would see from its preprocessor.
+ //
+ // The input is a (partially-)preprocessed translation unit that may still
+ // contain comments, line continuations, and preprocessor directives such
+ // as #line, #pragma, but not #include (which is diagnosed). Currently,
+ // all preprocessor directives except #line are ignored and no values are
+ // saved from literals. The #line directive (and its shorthand notation)
+ // is recognized to provide the logical token location.
+ //
+ // While at it, we also calculate the checksum of the input ignoring
+ // comments, whitespace, etc. This is used to detect changes that do not
+ // alter the resulting token stream.
+ //
+ enum class token_type // Kinds of tokens distinguished by the lexer.
+ {
+ // NOTE: remember to update operator<<() if changing anything here!
+ //
+ eos, // End of stream; also the default type of a token.
+
+ dot, // .
+ semi, // ;
+ less, // <
+ greater, // >
+ lcbrace, // {
+ rcbrace, // }
+
+ punctuation, // Other punctuation.
+
+ identifier, // Identifier (the only type with token::position set).
+
+ number, // Number literal.
+ character, // Char literal.
+ string, // String literal.
+
+ other // Other token.
+ };
+
+ struct token // A single token as produced by lexer::next().
+ {
+ token_type type = token_type::eos; // A default-constructed token is eos.
+ string value; // Note: no values are saved from literals.
+
+ // Logical position, that is, with the #line directives taken into account.
+ //
+ path file;
+ uint64_t line = 0;
+ uint64_t column = 0;
+
+ // Physical position in the stream, currently only for identifiers.
+ //
+ uint64_t position = 0;
+ };
+
+ // Output the token in a format suitable for diagnostics (see token_type).
+ //
+ ostream&
+ operator<< (ostream&, const token&);
+
+ class lexer: protected butl::char_scanner // Tokenizer over an input stream.
+ {
+ public:
+ lexer (ifdstream& is, const path& name) // name: name of the input file.
+ : char_scanner (is, false),
+ name_ (name),
+ fail ("error", &name_), // Takes the address of name_ (declared first).
+ log_file_ (name) {} // The logical file starts out as the physical one.
+
+ const path&
+ name () const {return name_;} // The name passed to the constructor.
+
+ string
+ checksum () const {return cs_.string ();} // String form of the checksum.
+
+ // Return the next token. It is ok to call next() again after getting eos.
+ //
+ token
+ next ()
+ {
+ token t;
+ next (t, skip_spaces (), true); // Skip spaces, then lex from that char.
+ return t;
+ }
+
+ // As above but reuse the token to avoid a (potential) memory
+ // allocation and return only the token type. Typical usage:
+ //
+ // for (token t; l.next (t) != token_type::eos; )
+ // ...
+ //
+ token_type
+ next (token& t)
+ {
+ next (t, skip_spaces (), true);
+ return t.type;
+ }
+
+ private: // Implementation; only declared (not defined) in this header.
+ void
+ next (token&, xchar, bool); // Used by both public next() overloads.
+
+ void
+ number_literal (token&, xchar);
+
+ void
+ char_literal (token&, xchar);
+
+ void
+ string_literal (token&, xchar);
+
+ void
+ raw_string_literal (token&, xchar);
+
+ void
+ literal_suffix (xchar);
+
+ void
+ line_directive (token&, xchar);
+
+ xchar
+ skip_spaces (bool newline = true); // Result is passed on to next() above.
+
+ // The char_scanner adaptation for newline escape sequence processing. It
+ // is enabled by default (escape = true) and is only disabled in raw strings.
+ //
+ private:
+ using base = char_scanner;
+
+ xchar
+ peek (bool escape = true);
+
+ xchar
+ get (bool escape = true);
+
+ void
+ get (const xchar& peeked);
+
+ // Hashing versions (presumably of get() that also update cs_; TODO confirm).
+ //
+ xchar
+ geth (bool escape = true);
+
+ void
+ geth (const xchar& peeked);
+
+ private:
+ const path name_; // Must stay before fail, which is given its address.
+ const fail_mark fail;
+
+ // Logical file and line as set by the #line directives. Note that the
+ // lexer diagnostics still use the physical file/line.
+ //
+ path log_file_;
+ optional<uint64_t> log_line_;
+
+ string tmp_file_; // NOTE(review): purpose is not evident from this header.
+ sha256 cs_; // Checksum state; see checksum().
+ };
+
+ // Diagnostics plumbing.
+ //
+ inline location // Build a diagnostics location from the token's position.
+ get_location (const token& t, const void* = nullptr) // 2nd arg is unused.
+ {
+ return location (&t.file, t.line, t.column); // Passes the address of t.file.
+ }
+ }
+}
+
+#endif // LIBBUILD2_CC_LEXER_HXX