From 4bdf53837e010073de802070d4e6087410662d3e Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Sat, 24 Aug 2019 17:41:30 +0300 Subject: Move cc build system module to separate library --- build2/cc/lexer.cxx | 1129 --------------------------------------------------- 1 file changed, 1129 deletions(-) delete mode 100644 build2/cc/lexer.cxx (limited to 'build2/cc/lexer.cxx') diff --git a/build2/cc/lexer.cxx b/build2/cc/lexer.cxx deleted file mode 100644 index 7795192..0000000 --- a/build2/cc/lexer.cxx +++ /dev/null @@ -1,1129 +0,0 @@ -// file : build2/cc/lexer.cxx -*- C++ -*- -// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd -// license : MIT; see accompanying LICENSE file - -#include - -using namespace std; -using namespace butl; - -// bit 0 - identifier character (_0-9A-Za-z). -// -static const uint8_t char_flags[256] = -//0 1 2 3 4 5 6 7 8 9 A B C D E F -{ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, // 3 - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, // 5 - 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, // 7 - - // 128-255 - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0 -}; - -// Diagnostics plumbing. -// -namespace butl // ADL -{ - inline build2::location - get_location (const butl::char_scanner::xchar& c, const void* data) - { - using namespace build2; - - assert (data != nullptr); // E.g., must be &lexer::name_.
- return location (static_cast (data), c.line, c.column); - } -} - -namespace build2 -{ - namespace cc - { - auto lexer:: - peek (bool e) -> xchar - { - if (unget_) - return ungetc_; - - if (unpeek_) - return unpeekc_; - - xchar c (base::peek ()); - - if (e && c == '\\') - { - get (c); - xchar p (base::peek ()); - - // Handle Windows CRLF sequence. Similar to char_scanner, we treat a - // single CR as if it was followed by LF and also collapse multiple - // CRs. - // - while (p == '\r') - { - get (p); - p = base::peek (); - - if (p == '\n') - break; - - // Pretend '\n' was there and recurse. - // - if (p != '\r') - return peek (e); - } - - if (p == '\n') - { - get (p); - return peek (e); // Recurse. - } - - // Save in the unpeek buffer so that it is returned on the subsequent - // calls to peek() (until get()). - // - unpeek_ = true; - unpeekc_ = c; - } - - return c; - } - - inline auto lexer:: - get (bool e) -> xchar - { - if (unget_) - { - unget_ = false; - return ungetc_; - } - else - { - xchar c (peek (e)); - get (c); - return c; - } - } - - inline void lexer:: - get (const xchar& c) - { - // Increment the logical line similar to how base will increment the - // physical (the column counts are the same). - // - if (log_line_ && c == '\n' && !unget_) - ++*log_line_; - - base::get (c); - } - - inline auto lexer:: - geth (bool e) -> xchar - { - xchar c (get (e)); - cs_.append (c); - return c; - } - - inline void lexer:: - geth (const xchar& c) - { - get (c); - cs_.append (c); - } - - using type = token_type; - - void lexer:: - next (token& t, xchar c, bool ignore_pp) - { - for (;; c = skip_spaces ()) - { - t.file = log_file_; - t.line = log_line_ ? *log_line_ : c.line; - t.column = c.column; - - if (eos (c)) - { - t.type = type::eos; - return; - } - - const location l (&name_, c.line, c.column); - - // Hash the token's line. The reason is debug info.
In fact, doing - // this will make quite a few "noop" changes (like adding a newline - // anywhere in the source) cause the checksum to change. But there - // doesn't seem to be any way around it: the case where we benefit - // from the precise change detection the most (development) is also - // where we will most likely have debug info enabled. - // - // Note that in order not to make this completely useless we don't - // hash the column. Even if it is part of the debug info, having it a - // bit off shouldn't cause any significant mis-positioning. We also - // don't hash the file path for each token instead only hashing it - // when changed with the #line directive (as well as in the - // constructor for the initial path). - // - cs_.append (t.line); - cs_.append (c); - - switch (c) - { - // Preprocessor lines. - // - case '#': - { - // It is tempting to simply scan until the newline ignoring - // anything in between. However, these lines can start a - // multi-line C-style comment. So we have to tokenize them (and - // hash the data for each token). - // - // Note that this may not work for things like #error that can - // contain pretty much anything. Also note that lines that start - // with '#' can contain '#' further down. In this case we need to - // be careful not to recurse (and consume multiple newlines). Thus - // the ignore_pp flag. - // - // Finally, to support diagnostics properly we need to recognize - // #line directives. - // - if (ignore_pp) - { - for (bool first (true);;) - { - // Note that we keep using the passed token for buffers. - // - c = skip_spaces (false); // Stop at newline. - - if (eos (c) || c == '\n') - break; - - if (first) - { - first = false; - - // Recognize #line and its shorthand version: - // - // #line <integer> [<string literal>] ... - // # <integer> [<string literal>] ... - // - // Also diagnose #include while at it.
- // - if (!(c >= '0' && c <= '9')) - { - next (t, c, false); - - if (t.type == type::identifier) - { - if (t.value == "include") - fail (l) << "unexpected #include directive"; - else if (t.value != "line") - continue; - } - else - continue; - - if (t.type != type::identifier || t.value != "line") - continue; - - c = skip_spaces (false); - - if (!(c >= '0' && c <= '9')) - fail (c) << "line number expected after #line directive"; - } - - // Ok, this is #line and next comes the line number. - // - line_directive (t, c); - continue; // Parse the tail, if any. - } - - next (t, c, false); - } - break; - } - else - { - t.type = type::punctuation; - return; - } - } - // Single-letter punctuation. - // - case ';': t.type = type::semi; return; - case '{': t.type = type::lcbrace; return; - case '}': t.type = type::rcbrace; return; - // Other single-letter punctuation. - // - case '(': - case ')': - case '[': - case ']': - case ',': - case '?': - case '~': - case '\\': t.type = type::punctuation; return; - // Potentially multi-letter punctuation. - // - case '.': // . .* .<N> ... - { - xchar p (peek ()); - - if (p == '*') - { - geth (p); - t.type = type::punctuation; - return; - } - else if (p >= '0' && p <= '9') - { - number_literal (t, c); - return; - } - else if (p == '.') - { - get (p); - - xchar q (peek ()); - if (q == '.') - { - cs_.append (p); - - geth (q); - t.type = type::punctuation; - return; - } - unget (p); - // Fall through. - } - - t.type = type::dot; - return; - } - case '=': // = == - case '!': // !
!= - case '*': // * *= - case '/': // / /= (/* and // handled by skip_spaces() above) - case '%': // % %= - case '^': // ^ ^= - { - xchar p (peek ()); - - if (p == '=') - geth (p); - - t.type = type::punctuation; - return; - } - case '<': // < <= << <<= - case '>': // > >= >> >>= - { - xchar p (peek ()); - - if (p == c) - { - geth (p); - if ((p = peek ()) == '=') - geth (p); - t.type = type::punctuation; - } - else if (p == '=') - { - geth (p); - t.type = type::punctuation; - } - else - t.type = (c == '<' ? type::less : type::greater); - - return; - } - case '+': // + ++ += - case '-': // - -- -= -> ->* - { - xchar p (peek ()); - - if (p == c || p == '=') - geth (p); - else if (c == '-' && p == '>') - { - geth (p); - if ((p = peek ()) == '*') - geth (p); - } - - t.type = type::punctuation; - return; - } - case '&': // & && &= - case '|': // | || |= - { - xchar p (peek ()); - - if (p == c || p == '=') - geth (p); - - t.type = type::punctuation; - return; - } - case ':': // : :: - { - xchar p (peek ()); - - if (p == ':') - geth (p); - - t.type = type::punctuation; - return; - } - // Number (and also . above). - // - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - { - number_literal (t, c); - return; - } - // Char/string literal, identifier, or other (\, $, @, `). - // - default: - { - bool raw (false); // Raw string literal. - - // Note: known not to be a digit (see above). - // - if (char_flags[static_cast (c)] & 0x01) - { - // This smells a little: we know skip_spaces() did not peek at - // the next character because this is not '/'. Which means the - // position in the stream must be of this character + 1. - // - t.position = buf_->tellg () - 1; - - string& id (t.value); - id = c; - - while (char_flags[static_cast (c = peek ())] & 0x01) - { - geth (c); - id += c; - - // Direct buffer scan.
Note that we always follow up with the - // normal peek() call which may load the next chunk, handle - // line continuations, etc. In other words, the end of the - // "raw" scan doesn't necessarily mean the end. - // - const char* b (gptr_); - const char* p (b); - - for (const char* e (egptr_); - p != e && char_flags[static_cast (*p)] & 0x01; - ++p) ; - - // Unrolling this loop doesn't make a difference. - // - // for (const char* e (egptr_ - 4); p < e; p += 4) - // { - // uint8_t c; - // - // c = static_cast (p[0]); - // if (!(char_flags[c] & 0x01)) break; - // - // c = static_cast (p[1]); - // if (!(char_flags[c] & 0x01)) {p += 1; break;} - // - // c = static_cast (p[2]); - // if (!(char_flags[c] & 0x01)) {p += 2; break;} - // - // c = static_cast (p[3]); - // if (!(char_flags[c] & 0x01)) {p += 3; break;} - // } - - size_t n (p - b); - id.append (b, n); cs_.append (b, n); - gptr_ = p; buf_->gbump (static_cast (n)); column += n; - } - - // If the following character is a quote, see if the identifier - // is one of the literal prefixes. - // - if (c == '\'' || c == '\"') - { - size_t n (id.size ()), i (0); - switch (id[0]) - { - case 'u': - { - if (n > 1 && id[1] == '8') - ++i; - } - // Fall through. - case 'L': - case 'U': - { - ++i; - - if (c == '\"' && n > i && id[i] == 'R') - { - ++i; - raw = true; - } - break; - } - case 'R': - { - if (c == '\"') - { - ++i; - raw = true; - } - break; - } - } - - if (i == n) // All characters "consumed". - { - geth (c); - id.clear (); - } - } - - if (!id.empty ()) - { - t.type = type::identifier; - return; - } - } - - switch (c) - { - case '\'': - { - char_literal (t, c); - return; - } - case '\"': - { - if (raw) - raw_string_literal (t, c); - else - string_literal (t, c); - return; - } - default: - { - t.type = type::other; - return; - } - } - } - } - } - } - - void lexer:: - number_literal (token& t, xchar c) - { - // note: c is hashed - - // A number (integer or floating point literal) can: - // - // 1.
Start with a dot (which must be followed by a digit, e.g., .123). - // - // 2. Can have a radix prefix (0b101, 0123, 0X12AB). - // - // 3. Can have an exponent (1e10, 0x1.p-10, 1.). - // - // 4. Digits can be separated with ' (123'456, 0xff00'00ff). - // - // 5. End with a built-in or user-defined literal suffix (123f, 123UL, 123_X) - // - // Quoting from GCC's preprocessor documentation: - // - // "Formally preprocessing numbers begin with an optional period, a - // required decimal digit, and then continue with any sequence of - // letters, digits, underscores, periods, and exponents. Exponents are - // the two-character sequences 'e+', 'e-', 'E+', 'E-', 'p+', 'p-', 'P+', - // and 'P-'." - // - // So it looks like a "C++ number" is then any unseparated (with - // whitespace or punctuation) sequence of those plus '. The only mildly - // tricky part is then to recognize +/- as being part of the exponent. - // - while (!eos ((c = peek ()))) - { - switch (c) - { - // All the whitespace, punctuation, and other characters that end - // the number. - // - case ' ': - case '\n': - case '\t': - case '\r': - case '\f': - case '\v': - - case '#': - case ';': - case '{': - case '}': - case '(': - case ')': - case '[': - case ']': - case ',': - case '?': - case '~': - case '=': - case '!': - case '*': - case '/': - case '%': - case '^': - case '>': - case '<': - case '&': - case '|': - case ':': - case '+': // The exponent case is handled below. - case '-': // The exponent case is handled below. - case '"': - case '\\': - - case '@': - case '$': - case '`': - break; - - // Recognize +/- after the exponent. - // - case 'e': - case 'E': - case 'p': - case 'P': - { - geth (c); - c = peek (); - if (c == '+' || c == '-') - geth (c); - continue; - } - - case '_': - case '.': - case '\'': - default: // Digits and letters.
- { - geth (c); - continue; - } - } - - break; - } - - t.type = type::number; - } - - void lexer:: - char_literal (token& t, xchar c) - { - // note: c is hashed - - const location l (&name_, c.line, c.column); - - for (char p (c);;) // Previous character (see below). - { - c = geth (); - - if (eos (c) || c == '\n') - fail (l) << "unterminated character literal"; - - if (c == '\'' && p != '\\') - break; - - // Keep track of \\-escapings so we don't confuse them with \', as in - // '\\'. - // - p = (c == '\\' && p == '\\') ? '\0' : static_cast (c); - } - - // See if we have a user-defined suffix (which is an identifier). - // - if ((c = peek ()) == '_' || alpha (c)) - literal_suffix (c); - - t.type = type::character; - } - - void lexer:: - string_literal (token& t, xchar c) - { - // note: c is hashed - - const location l (&name_, c.line, c.column); - - for (char p (c);;) // Previous character (see below). - { - c = geth (); - - if (eos (c) || c == '\n') - fail (l) << "unterminated string literal"; - - if (c == '\"' && p != '\\') - break; - - // Keep track of \\-escapings so we don't confuse them with \", as in - // "\\". - // - p = (c == '\\' && p == '\\') ? '\0' : static_cast (c); - - // Direct buffer scan. - // - if (p != '\\') - { - const char* b (gptr_); - const char* e (egptr_); - const char* p (b); - - for (char c; - p != e && (c = *p) != '\"' && c != '\\' && c != '\n'; - ++p) ; - - size_t n (p - b); - cs_.append (b, n); - gptr_ = p; buf_->gbump (static_cast (n)); column += n; - } - } - - // See if we have a user-defined suffix (which is an identifier). - // - if ((c = peek ()) == '_' || alpha (c)) - literal_suffix (c); - - t.type = type::string; - } - - void lexer:: - raw_string_literal (token& t, xchar c) - { - // note: c is hashed - - // The overall form is: - // - // R"<delimiter>(<raw_characters>)<delimiter>" - // - // Where <delimiter> is a potentially-empty character sequence made of - // any source character but parentheses, backslash and spaces. It can be - // at most 16 characters long.
- // - // Note that the <raw_characters> are not processed in any way, not even - // for line continuations. - // - const location l (&name_, c.line, c.column); - - // As a first step, parse the delimiter (including the opening paren). - // - string d (1, ')'); - - for (;;) - { - c = geth (); - - if (eos (c) || c == '\"' || c == ')' || c == '\\' || c == ' ') - fail (l) << "invalid raw string literal"; - - if (c == '(') - break; - - d += c; - } - - d += '"'; - - // Now parse the raw characters while trying to match the closing - // delimiter. - // - for (size_t i (0);;) // Position to match in d. - { - c = geth (false); // No newline escaping. - - if (eos (c)) // Note: newline is ok. - fail (l) << "invalid raw string literal"; - - if (c != d[i] && i != 0) // Restart from the beginning. - i = 0; - - if (c == d[i]) - { - if (++i == d.size ()) - break; - } - } - - // See if we have a user-defined suffix (which is an identifier). - // - if ((c = peek ()) == '_' || alpha (c)) - literal_suffix (c); - - t.type = type::string; - } - - void lexer:: - literal_suffix (xchar c) - { - // note: c is unhashed - - // Parse a user-defined literal suffix identifier. - // - for (geth (c); (c = peek ()) == '_' || alnum (c); geth (c)) ; - } - - void lexer:: - line_directive (token& t, xchar c) - { - // enter: first digit of the line number - // leave: last character of the line number or file string - // note: c is unhashed - - // If our number and string tokens contained the literal values, then we - // could have used that. However, we ignore the value (along with escape - // processing, etc), for performance. Let's keep it that way and instead - // handle it ourselves. - // - // Note also that we are not hashing these at the character level - // instead hashing the switch to a new file path below and leaving the - // line number to the token line hashing.
- // - { - string& s (t.value); - - for (s = c; (c = peek ()) >= '0' && c <= '9'; get (c)) - s += c; - - // The newline that ends the directive will increment the logical line - // so subtract one to compensate. Note: can't be 0 and shouldn't throw - // for valid lines. - // - log_line_ = stoull (s.c_str ()) - 1; - } - - // See if we have the file. - // - c = skip_spaces (false); - - if (c == '\"') - { - const location l (&name_, c.line, c.column); - - // It is common to have a large number of #line directives that don't - // change the file (they seem to be used to track macro locations or - // some such). So we are going to optimize for this by comparing the - // current path to what's in #line. - // - string& s (tmp_file_); - s.clear (); - - for (char p ('\0'); p != '\"'; ) // Previous character. - { - c = get (); - - if (eos (c) || c == '\n') - fail (l) << "unterminated string literal"; - - // Handle escapes. - // - if (p == '\\') - { - p = '\0'; // Clear so we don't confuse \" and \\". - - // We only handle what can reasonably be expected in a file name. - // - switch (c) - { - case '\\': - case '\'': - case '\"': break; // Add as is. - default: - fail (c) << "unsupported escape sequence in #line directive"; - } - } - else - { - p = c; - - switch (c) - { - case '\\': - case '\"': continue; - } - } - - s += c; - - // Direct buffer scan. - // - if (p != '\\') - { - const char* b (gptr_); - const char* e (egptr_); - const char* p (b); - - for (char c; - p != e && (c = *p) != '\"' && c != '\\' && c != '\n'; - ++p) ; - - size_t n (p - b); - s.append (b, n); - gptr_ = p; buf_->gbump (static_cast (n)); column += n; - } - } - - if (log_file_.string () == s) - return; - - // Swap the two string buffers. - // - { - string r (move (log_file_).string ()); // Move string rep out. - r.swap (s); - log_file_ = path (move (r)); // Move back in. - } - - // If the path is relative, then prefix it with the current working - // directory.
Failed that, we will end up with different checksums for - // invocations from different directories. - // - // While this should work fine for normal cross-compilation, it's an - // entirely different story for the emulated case (e.g., msvc-linux - // where the preprocessed output contains absolute Windows paths). So - // we try to sense if things look fishy and leave the path alone. - // - // Also detect special names like <built-in> and <command-line>. Plus - // GCC sometimes adds what looks like working directory (has trailing - // slash). So ignore that as well. - // - // We now switched to using absolute translation unit paths (because - // of __FILE__/assert(); see compile.cxx for details). But we might - // still need this logic when we try to calculate location-independent - // hash for distributed compilation/caching. The idea is to only hash - // the part starting from the project root which is immutable. Plus - // we will need -ffile-prefix-map to deal with __FILE__. - // - if (!log_file_.to_directory ()) - cs_.append (log_file_.string ()); -#if 0 - { - using tr = path::traits; - const string& f (log_file_.string ()); - - if (f.find (':') != string::npos || - (f.front () == '<' && f.back () == '>') || - log_file_.absolute ()) - cs_.append (f); - else - { - // This gets complicated and slow: the path may contain '..' and - // '.' so strictly speaking we would need to normalize it. - // Instead, we are going to handle leading '..'s ourselves (the - // sane case) and ignore everything else (so if you have '..' or - // '.' somewhere in the middle, then things might not work - // optimally for you). - // - const string& d (work.string ()); - - // Iterate over leading '..' in f "popping" the corresponding - // number of trailing components from d. - // - size_t fp (0); - size_t dp (d.size () - 1); - - for (size_t p;; ) - { - // Note that in file we recognize any directory separator, not - // just of this platform (see note about emulation above).
- // - if (f.compare (fp, 2, "..") != 0 || - (f[fp + 2] != '/' && f[fp + 2] != '\\') || // Could be '\0'. - (p = tr::rfind_separator (d, dp)) == string::npos) - break; - - fp += 3; - dp = p - 1; - } - - cs_.append (d.c_str (), dp + 1); - cs_.append (tr::directory_separator); // Canonical in work. - cs_.append (f.c_str () + fp); - } - } -#endif - } - else - unget (c); - } - - auto lexer:: - skip_spaces (bool nl) -> xchar - { - xchar c (get ()); - - for (; !eos (c); c = get ()) - { - switch (c) - { - case '\n': - if (!nl) break; - // Fall through. - case ' ': - case '\t': - case '\r': - case '\f': - case '\v': - { - // Direct buffer scan. - // - const char* b (gptr_); - const char* e (egptr_); - const char* p (b); - - for (char c; - p != e && ((c = *p) == ' ' || c == '\t'); - ++p) ; - - size_t n (p - b); - gptr_ = p; buf_->gbump (static_cast (n)); column += n; - - continue; - } - case '/': - { - xchar p (peek ()); - - // C++ comment. - // - if (p == '/') - { - get (p); - - for (;;) - { - c = get (); - if (c == '\n' || eos (c)) - break; - - // Direct buffer scan. - // - const char* b (gptr_); - const char* e (egptr_); - const char* p (b); - - for (char c; - p != e && (c = *p) != '\n' && c != '\\'; - ++p) ; - - size_t n (p - b); - gptr_ = p; buf_->gbump (static_cast (n)); column += n; - } - - if (!nl) - break; - - continue; - } - - // C comment. - // - if (p == '*') - { - get (p); - - for (;;) - { - c = get (); - - if (eos (c)) - fail (p) << "unterminated comment"; - - if (c == '*' && (c = peek ()) == '/') - { - get (c); - break; - } - - // Direct buffer scan.
- // - const char* b (gptr_); - const char* e (egptr_); - const char* p (b); - - for (char c; - p != e && (c = *p) != '*' && c != '\\'; - ++p) - { - if (c == '\n') - { - if (log_line_) ++*log_line_; - ++line; - column = 1; - } - else - ++column; - } - - gptr_ = p; buf_->gbump (static_cast (p - b)); - } - continue; - } - break; - } - } - break; - } - - return c; - } - - ostream& - operator<< (ostream& o, const token& t) - { - switch (t.type) - { - case type::dot: o << "'.'"; break; - case type::semi: o << "';'"; break; - case type::less: o << "'<'"; break; - case type::greater: o << "'>'"; break; - case type::lcbrace: o << "'{'"; break; - case type::rcbrace: o << "'}'"; break; - case type::punctuation: o << ""; break; - - case type::identifier: o << '\'' << t.value << '\''; break; - - case type::number: o << ""; break; - case type::character: o << ""; break; - case type::string: o << ""; break; - - case type::other: o << ""; break; - case type::eos: o << ""; break; - } - - return o; - } - } -} -- cgit v1.1