// file : libbuild2/cc/lexer.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file #include <libbuild2/cc/lexer.hxx> using namespace std; using namespace butl; // bit 0 - identifier character (_0-9A-Ba-b). // static const uint8_t char_flags[256] = //0 1 2 3 4 5 6 7 8 9 A B C D E F { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, // 3 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, // 5 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, // 7 // 128-255 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0 }; // Diagnostics plumbing. // namespace butl // ADL { inline build2::location get_location (const butl::char_scanner<>::xchar& c, const void* data) { using namespace build2; assert (data != nullptr); // E.g., must be &lexer::name_. return location (*static_cast<const path_name*> (data), c.line, c.column); } } namespace build2 { namespace cc { auto lexer:: peek (bool e) -> xchar { if (ungetn_ != 0) return ungetb_[ungetn_ - 1]; if (unpeek_) return unpeekc_; xchar c (base::peek ()); if (e && c == '\\') { get (c); xchar p (base::peek ()); // Handle Windows CRLF sequence. Similar to char_scanner, we treat a // single CR as if it was followed by LF and also collapse multiple // CRs. // while (p == '\r') { get (p); p = base::peek (); if (p == '\n') break; // Pretend '\n' was there and recurse. // if (p != '\r') return peek (e); } if (p == '\n') { get (p); return peek (e); // Recurse. } // Save in the unpeek buffer so that it is returned on the subsequent // calls to peek() (until get()). // unpeek_ = true; unpeekc_ = c; } return c; } inline auto lexer:: get (bool e) -> xchar { if (ungetn_ != 0) return ungetb_[--ungetn_]; else { xchar c (peek (e)); get (c); return c; } } inline void lexer:: get (const xchar& c) { // Increment the logical line similar to how base will increment the // physical (the column counts are the same). // if (log_line_ && c == '\n' && ungetn_ == 0) ++*log_line_; base::get (c); } inline auto lexer:: geth (bool e) -> xchar { xchar c (get (e)); cs_.append (c); return c; } inline void lexer:: geth (const xchar& c) { get (c); cs_.append (c); } using type = token_type; void lexer:: next (token& t, pair<xchar, bool> cf, bool ignore_pp) { for (;; cf = skip_spaces ()) { xchar c (cf.first); t.first = cf.second; t.file = &log_file_; t.line = log_line_ ? *log_line_ : c.line; t.column = c.column; if (eos (c)) { t.type = type::eos; return; } const location l (name_, c.line, c.column); // Hash the token's line. The reason is debug info. In fact, doing // this will make quite a few "noop" changes (like adding a newline // anywhere in the source) cause the checksum change. But there // doesn't seem to be any way around it: the case where we benefit // from the precise change detection the most (development) is also // where we will most likely have debug info enable. // // Note that in order not to make this completely useless we don't // hash the column. Even if it is part of the debug info, having it a // bit off shouldn't cause any significant mis-positioning. We also // don't hash the file path for each token instead only hashing it // when changed with the #line directive (as well as in the // constructor for the initial path). // cs_.append (t.line); cs_.append (c); switch (c) { // Preprocessor lines. // case '#': { // It is tempting to simply scan until the newline ignoring // anything in between. However, these lines can start a // multi-line C-style comment. So we have to tokenize them (and // hash the data for each token). // // Note that this may not work for things like #error that can // contain pretty much anything. Also note that lines that start // with '#' can contain '#' further down. In this case we need to // be careful not to recurse (and consume multiple newlines). Thus // the ignore_pp flag. // // Finally, to support diagnostics properly we need to recognize // #line directives. // if (ignore_pp) { for (bool first (true);;) { // Note that we keep using the passed token for buffers. // c = skip_spaces (false).first; // Stop at newline. if (eos (c) || c == '\n') break; if (first) { first = false; // Recognize #line and its shorthand version: // // #line <integer> [<string literal>] ... // # <integer> [<string literal>] ... // // Also diagnose #include while at it. // if (!(c >= '0' && c <= '9')) { next (t, make_pair (c, false), false); if (t.type == type::identifier) { if (t.value == "include") fail (l) << "unexpected #include directive"; else if (t.value != "line") continue; } else continue; if (t.type != type::identifier || t.value != "line") continue; c = skip_spaces (false).first; if (!(c >= '0' && c <= '9')) fail (c) << "line number expected after #line directive"; } // Ok, this is #line and next comes the line number. // line_directive (t, c); continue; // Parse the tail, if any. } next (t, make_pair (c, false), false); } break; } else { t.type = type::punctuation; return; } } // Single-letter punctuation. // case ';': t.type = type::semi; return; case '{': t.type = type::lcbrace; return; case '}': t.type = type::rcbrace; return; // Other single-letter punctuation. // case '(': case ')': case '[': case ']': case ',': case '?': case '~': case '\\': t.type = type::punctuation; return; // Potentially multi-letter punctuation. // case '.': // . .* .<N> ... { xchar p (peek ()); if (p == '*') { geth (p); t.type = type::punctuation; return; } else if (p >= '0' && p <= '9') { number_literal (t, c); return; } else if (p == '.') { get (p); xchar q (peek ()); if (q == '.') { cs_.append (p); geth (q); t.type = type::punctuation; return; } unget (p); // Fall through. } t.type = type::dot; return; } case '=': // = == case '!': // ! != case '*': // * *= case '/': // / /= (/* and // handled by skip_spaced() above) case '%': // % %= case '^': // ^ ^= { xchar p (peek ()); if (p == '=') geth (p); t.type = type::punctuation; return; } case '<': // < <= << <<= case '>': // > >= >> >>= { xchar p (peek ()); if (p == c) { geth (p); if ((p = peek ()) == '=') geth (p); t.type = type::punctuation; } else if (p == '=') { geth (p); t.type = type::punctuation; } else t.type = (c == '<' ? type::less : type::greater); return; } case '+': // + ++ += case '-': // - -- -= -> ->* { xchar p (peek ()); if (p == c || p == '=') geth (p); else if (c == '-' && p == '>') { geth (p); if ((p = peek ()) == '*') geth (p); } t.type = type::punctuation; return; } case '&': // & && &= case '|': // | || |= { xchar p (peek ()); if (p == c || p == '=') geth (p); t.type = type::punctuation; return; } case ':': // : :: { xchar p (peek ()); if (p == ':') { geth (p); t.type = type::scope; } else t.type = type::colon; return; } // Number (and also .<N> above). // case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { number_literal (t, c); return; } // Char/string literal, identifier, or other (\, $, @, `). // default: { bool raw (false); // Raw string literal. // Note: known not to be a digit (see above). // if (char_flags[static_cast<uint8_t> (c)] & 0x01) { // This smells a little: we know skip_spaces() did not peek at // the next character because this is not '/'. Which means the // position in the stream must be of this character + 1. // t.position = buf_->tellg () - 1; string& id (t.value); id = c; while (char_flags[static_cast<uint8_t> (c = peek ())] & 0x01) { geth (c); id += c; // Direct buffer scan. Note that we always follow up with the // normal peek() call which may load the next chunk, handle // line continuations, etc. In other words, the end of the // "raw" scan doesn't necessarily mean the end. // const char* b (gptr_); const char* p (b); for (const char* e (egptr_); p != e && char_flags[static_cast<uint8_t> (*p)] & 0x01; ++p) ; // Unrolling this loop doesn't make a difference. // // for (const char* e (egptr_ - 4); p < e; p += 4) // { // uint8_t c; // // c = static_cast<uint8_t> (p[0]); // if (!(char_flags[c] & 0x01)) break; // // c = static_cast<uint8_t> (p[1]); // if (!(char_flags[c] & 0x01)) {p += 1; break;} // // c = static_cast<uint8_t> (p[2]); // if (!(char_flags[c] & 0x01)) {p += 2; break;} // // c = static_cast<uint8_t> (p[3]); // if (!(char_flags[c] & 0x01)) {p += 3; break;} // } size_t n (p - b); id.append (b, n); cs_.append (b, n); gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n; } // If the following character is a quote, see if the identifier // is one of the literal prefixes. // if (c == '\'' || c == '\"') { size_t n (id.size ()), i (0); switch (id[0]) { case 'u': { if (n > 1 && id[1] == '8') ++i; } // Fall through. case 'L': case 'U': { ++i; if (c == '\"' && n > i && id[i] == 'R') { ++i; raw = true; } break; } case 'R': { if (c == '\"') { ++i; raw = true; } break; } } if (i == n) // All characters "consumed". { geth (c); id.clear (); } } if (!id.empty ()) { t.type = type::identifier; return; } } switch (c) { case '\'': { char_literal (t, c); return; } case '\"': { if (raw) raw_string_literal (t, c); else string_literal (t, c); return; } default: { t.type = type::other; return; } } } } } } void lexer:: number_literal (token& t, xchar c) { // note: c is hashed // A number (integer or floating point literal) can: // // 1. Start with a dot (which must be followed by a digit, e.g., .123). // // 2. Can have a radix prefix (0b101, 0123, 0X12AB). // // 3. Can have an exponent (1e10, 0x1.p-10, 1.). // // 4. Digits can be separated with ' (123'456, 0xff00'00ff). // // 5. End with a built-in or user defined literal (123f, 123UL, 123_X) // // Quoting from GCC's preprocessor documentation: // // "Formally preprocessing numbers begin with an optional period, a // required decimal digit, and then continue with any sequence of // letters, digits, underscores, periods, and exponents. Exponents are // the two-character sequences 'e+', 'e-', 'E+', 'E-', 'p+', 'p-', 'P+', // and 'P-'." // // So it looks like a "C++ number" is then any unseparated (with // whitespace or punctuation) sequence of those plus '. The only mildly // tricky part is then to recognize +/- as being part of the exponent. // while (!eos ((c = peek ()))) { switch (c) { // All the whitespace, punctuation, and other characters that end // the number. // case ' ': case '\n': case '\t': case '\r': case '\f': case '\v': case '#': case ';': case '{': case '}': case '(': case ')': case '[': case ']': case ',': case '?': case '~': case '=': case '!': case '*': case '/': case '%': case '^': case '>': case '<': case '&': case '|': case ':': case '+': // The exponent case is handled below. case '-': // The exponent case is handled below. case '"': case '\\': case '@': case '$': case '`': break; // Recognize +/- after the exponent. // case 'e': case 'E': case 'p': case 'P': { geth (c); c = peek (); if (c == '+' || c == '-') geth (c); continue; } case '_': case '.': case '\'': default: // Digits and letters. { geth (c); continue; } } break; } t.type = type::number; } void lexer:: char_literal (token& t, xchar c) { // note: c is hashed const location l (name_, c.line, c.column); for (char p (c);;) // Previous character (see below). { c = geth (); if (eos (c) || c == '\n') fail (l) << "unterminated character literal"; if (c == '\'' && p != '\\') break; // Keep track of \\-escapings so we don't confuse them with \', as in // '\\'. // p = (c == '\\' && p == '\\') ? '\0' : static_cast<char> (c); } // See if we have a user-defined suffix (which is an identifier). // if ((c = peek ()) == '_' || alpha (c)) literal_suffix (c); t.type = type::character; } void lexer:: string_literal (token& t, xchar c) { // note: c is hashed const location l (name_, c.line, c.column); for (char p (c);;) // Previous character (see below). { c = geth (); if (eos (c) || c == '\n') fail (l) << "unterminated string literal"; if (c == '\"' && p != '\\') break; // Keep track of \\-escapings so we don't confuse them with \", as in // "\\". // p = (c == '\\' && p == '\\') ? '\0' : static_cast<char> (c); // Direct buffer scan. // if (p != '\\') { const char* b (gptr_); const char* e (egptr_); const char* p (b); for (char c; p != e && (c = *p) != '\"' && c != '\\' && c != '\n' && c != '\r'; ++p) ; size_t n (p - b); cs_.append (b, n); gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n; } } // See if we have a user-defined suffix (which is an identifier). // if ((c = peek ()) == '_' || alpha (c)) literal_suffix (c); t.type = type::string; } void lexer:: raw_string_literal (token& t, xchar c) { // note: c is hashed // The overall form is: // // R"<delimiter>(<raw_characters>)<delimiter>" // // Where <delimiter> is a potentially-empty character sequence made of // any source character but parentheses, backslash and spaces. It can be // at most 16 characters long. // // Note that the <raw_characters> are not processed in any way, not even // for line continuations. // const location l (name_, c.line, c.column); // As a first step, parse the delimiter (including the openning paren). // string d (1, ')'); for (;;) { c = geth (); if (eos (c) || c == '\"' || c == ')' || c == '\\' || c == ' ') fail (l) << "invalid raw string literal"; if (c == '(') break; d += c; } d += '"'; // Now parse the raw characters while trying to match the closing // delimiter. // for (size_t i (0);;) // Position to match in d. { c = geth (false); // No newline escaping. if (eos (c)) // Note: newline is ok. fail (l) << "invalid raw string literal"; if (c != d[i] && i != 0) // Restart from the beginning. i = 0; if (c == d[i]) { if (++i == d.size ()) break; } } // See if we have a user-defined suffix (which is an identifier). // if ((c = peek ()) == '_' || alpha (c)) literal_suffix (c); t.type = type::string; } void lexer:: literal_suffix (xchar c) { // note: c is unhashed // Parse a user-defined literal suffix identifier. // for (geth (c); (c = peek ()) == '_' || alnum (c); geth (c)) ; } void lexer:: line_directive (token& t, xchar c) { // enter: first digit of the line number // leave: last character of the line number or file string // note: c is unhashed // If our number and string tokens contained the literal values, then we // could have used that. However, we ignore the value (along with escape // processing, etc), for performance. Let's keep it that way and instead // handle it ourselves. // // Note also that we are not hashing these at the character level // instead hashing the switch to a new file path below and leaving the // line number to the token line hashing. // { string& s (t.value); for (s = c; (c = peek ()) >= '0' && c <= '9'; get (c)) s += c; // The newline that ends the directive will increment the logical line // so subtract one to compensate. Note: can't be 0 and shouldn't throw // for valid lines. // log_line_ = stoull (s.c_str ()) - 1; } // See if we have the file. // c = skip_spaces (false).first; if (c == '\"') { const location l (name_, c.line, c.column); // It is common to have a large number of #line directives that don't // change the file (they seem to be used to track macro locations or // some such). So we are going to optimize for this by comparing the // current path to what's in #line. // string& s (tmp_file_); s.clear (); for (char p ('\0'); p != '\"'; ) // Previous character. { c = get (); if (eos (c) || c == '\n') fail (l) << "unterminated string literal"; // Handle escapes. // if (p == '\\') { p = '\0'; // Clear so we don't confuse \" and \\". // We only handle what can reasonably be expected in a file name. // switch (c) { case '\\': case '\'': case '\"': break; // Add as is. default: fail (c) << "unsupported escape sequence in #line directive"; } } else { p = c; switch (c) { case '\\': case '\"': continue; } } s += c; // Direct buffer scan. // if (p != '\\') { const char* b (gptr_); const char* e (egptr_); const char* p (b); for (char c; p != e && (c = *p) != '\"' && c != '\\' && c != '\n' && c != '\r'; ++p) ; size_t n (p - b); s.append (b, n); gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n; } } try { if (s.empty ()) throw invalid_path (""); // Handle special names (<stdin>, <built-in>, etc). // if (s.front () == '<' && s.back () == '>') { if (log_file_.name) { if (*log_file_.name == s) return; log_file_.name->swap (s); } else log_file_.name = move (s); log_file_.path.clear (); } else { if (log_file_.path.string () == s) return; string r (move (log_file_.path).string ()); // Move string rep out. r.swap (s); log_file_.path = path (move (r)); // Move back in. log_file_.name = nullopt; } } catch (const invalid_path&) { fail (l) << "invalid path in #line directive"; } // If the path is relative, then prefix it with the current working // directory. Failed that, we will end up with different checksums for // invocations from different directories. // // While this should work fine for normal cross-compilation, it's an // entirely different story for the emulated case (e.g., msvc-linux // where the preprocessed output contains absolute Windows paths). So // we try to sense if things look fishy and leave the path alone. // // Also detect special names like <built-in> and <command-line>. Plus // GCC sometimes adds what looks like working directory (has trailing // slash). So ignore that as well. // // We now switched to using absolute translation unit paths (because // of __FILE__/assert(); see compile.cxx for details). But we might // still need this logic when we try to calculate location-independent // hash for distributed compilation/caching. The idea is to only hash // the part starting from the project root which is immutable. Plus // we will need -ffile-prefix-map to deal with __FILE__. // if (!log_file_.path.to_directory ()) // Also covers special names. cs_.append (log_file_.name ? *log_file_.name : log_file_.path.string ()); #if 0 { using tr = path::traits_type; const string& f (log_file_.path.string ()); if (log_file_.name || f.find (':') != string::npos || log_file_.path.absolute ()) cs_.append (log_file_.name ? *log_file_.name : f); else { // This gets complicated and slow: the path may contain '..' and // '.' so strictly speaking we would need to normalize it. // Instead, we are going to handle leading '..'s ourselves (the // sane case) and ignore everything else (so if you have '..' or // '.' somewhere in the middle, then things might not work // optimally for you). // const string& d (work.string ()); // Iterate over leading '..' in f "popping" the corresponding // number of trailing components from d. // size_t fp (0); size_t dp (d.size () - 1); for (size_t p;; ) { // Note that in file we recognize any directory separator, not // just of this platform (see note about emulation above). // if (f.compare (fp, 2, "..") != 0 || (f[fp + 2] != '/' && f[fp + 2] != '\\') || // Could be '\0'. (p = tr::rfind_separator (d, dp)) == string::npos) break; fp += 3; dp = p - 1; } cs_.append (d.c_str (), dp + 1); cs_.append (tr::directory_separator); // Canonical in work. cs_.append (f.c_str () + fp); } } #endif } else unget (c); } auto lexer:: skip_spaces (bool nl) -> pair<xchar, bool> { xchar c (get ()); // Besides the first character, we also need to take into account any // newlines that we are skipping. For example, the first character may // be a space at the end of the line which we will skip along with the // following newline. // bool first (c.column == 1); for (; !eos (c); c = get ()) { switch (c) { case '\n': if (!nl) break; first = true; // Fall through. case ' ': case '\t': case '\r': case '\f': case '\v': { // Direct buffer scan. // const char* b (gptr_); const char* e (egptr_); const char* p (b); for (char c; p != e && ((c = *p) == ' ' || c == '\t'); ++p) ; size_t n (p - b); gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n; continue; } case '/': { xchar p (peek ()); // C++ comment. // if (p == '/') { get (p); for (;;) { c = get (); if (c == '\n' || eos (c)) break; // Direct buffer scan. // const char* b (gptr_); const char* e (egptr_); const char* p (b); for (char c; p != e && (c = *p) != '\n' && c != '\\'; ++p) ; size_t n (p - b); gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n; } if (!nl) break; first = true; continue; } // C comment. // // Note that for the first logic we consider a C comment to be // entirely part of the same logical line even if there are // newlines inside. // if (p == '*') { get (p); for (;;) { c = get (); if (eos (c)) fail (p) << "unterminated comment"; if (c == '*' && (c = peek ()) == '/') { get (c); break; } if (c != '*' && c != '\\') { // Direct buffer scan. // // Note that we should call get() prior to the direct buffer // scan (see butl::char_scanner for details). // get (c); const char* b (gptr_); const char* e (egptr_); const char* p (b); for (char c; p != e && (c = *p) != '*' && c != '\\'; ++p) { if (c == '\n') { if (log_line_) ++*log_line_; ++line; column = 1; } else ++column; } gptr_ = p; buf_->gbump (static_cast<int> (p - b)); } } continue; } break; } } break; } return make_pair (c, first); } ostream& operator<< (ostream& o, const token& t) { switch (t.type) { case type::dot: o << "'.'"; break; case type::semi: o << "';'"; break; case type::colon: o << "':'"; break; case type::scope: o << "'::'"; break; case type::less: o << "'<'"; break; case type::greater: o << "'>'"; break; case type::lcbrace: o << "'{'"; break; case type::rcbrace: o << "'}'"; break; case type::punctuation: o << "<punctuation>"; break; case type::identifier: o << '\'' << t.value << '\''; break; case type::number: o << "<number literal>"; break; case type::character: o << "<char literal>"; break; case type::string: o << "<string literal>"; break; case type::other: o << "<other>"; break; case type::eos: o << "<end of file>"; break; } return o; } } }