From 4bdf53837e010073de802070d4e6087410662d3e Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Sat, 24 Aug 2019 17:41:30 +0300 Subject: Move cc build system module to separate library --- libbuild2/cc/lexer.hxx | 190 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 190 insertions(+) create mode 100644 libbuild2/cc/lexer.hxx (limited to 'libbuild2/cc/lexer.hxx') diff --git a/libbuild2/cc/lexer.hxx b/libbuild2/cc/lexer.hxx new file mode 100644 index 0000000..cb2b3a5 --- /dev/null +++ b/libbuild2/cc/lexer.hxx @@ -0,0 +1,190 @@ +// file : libbuild2/cc/lexer.hxx -*- C++ -*- +// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef LIBBUILD2_CC_LEXER_HXX +#define LIBBUILD2_CC_LEXER_HXX + +#include +#include + +#include +#include + +#include + +namespace build2 +{ + namespace cc + { + // Preprocessor-level tokenization of C/C++ source. In other words, the + // sequence of tokens returned is similar to what a real C/C++ compiler + // would see from its preprocessor. + // + // The input is a (partially-)preprocessed translation unit that may still + // contain comments, line continuations, and preprocessor directives such + // as #line, #pragma, but not #include (which is diagnosed). Currently, + // all preprocessor directives except #line are ignored and no values are + // saved from literals. The #line directive (and its shorthand notation) + // is recognized to provide the logical token location. + // + // While at it we also calculate the checksum of the input ignoring + // comments, whitespaces, etc. This is used to detect changes that do not + // alter the resulting token stream. + // + enum class token_type + { + // NOTE: remember to update operator<<() if changing anything here! + // + eos, + + dot, // . + semi, // ; + less, // < + greater, // > + lcbrace, // { + rcbrace, // } + + punctuation, // Other punctuation. + + identifier, + + number, // Number literal. + character, // Char literal. + string, // String literal. + + other // Other token. + }; + + struct token + { + token_type type = token_type::eos; + string value; + + // Logical position. + // + path file; + uint64_t line = 0; + uint64_t column = 0; + + // Physical position in the stream, currently only for identifiers. + // + uint64_t position = 0; + }; + + // Output the token value in a format suitable for diagnostics. + // + ostream& + operator<< (ostream&, const token&); + + class lexer: protected butl::char_scanner + { + public: + lexer (ifdstream& is, const path& name) + : char_scanner (is, false), + name_ (name), + fail ("error", &name_), + log_file_ (name) {} + + const path& + name () const {return name_;} + + string + checksum () const {return cs_.string ();} + + // Note that it is ok to call next() again after getting eos. + // + token + next () + { + token t; + next (t, skip_spaces (), true); + return t; + } + + // As above but reuse the token to avoid a (potential) memory + // allocation. Typical usage: + // + // for (token t; l.next (t) != token_type::eos; ) + // ... + // + token_type + next (token& t) + { + next (t, skip_spaces (), true); + return t.type; + } + + private: + void + next (token&, xchar, bool); + + void + number_literal (token&, xchar); + + void + char_literal (token&, xchar); + + void + string_literal (token&, xchar); + + void + raw_string_literal (token&, xchar); + + void + literal_suffix (xchar); + + void + line_directive (token&, xchar); + + xchar + skip_spaces (bool newline = true); + + // The char_scanner adaptation for newline escape sequence processing. + // Enabled by default and is only disabled in the raw string literals. + // + private: + using base = char_scanner; + + xchar + peek (bool escape = true); + + xchar + get (bool escape = true); + + void + get (const xchar& peeked); + + // Hashing versions. + // + xchar + geth (bool escape = true); + + void + geth (const xchar& peeked); + + private: + const path name_; + const fail_mark fail; + + // Logical file and line as set by the #line directives. Note that the + // lexer diagnostics still uses the physical file/lines. + // + path log_file_; + optional log_line_; + + string tmp_file_; + sha256 cs_; + }; + + // Diagnostics plumbing. + // + inline location + get_location (const token& t, const void* = nullptr) + { + return location (&t.file, t.line, t.column); + } + } +} + +#endif // LIBBUILD2_CC_LEXER_HXX -- cgit v1.1