From 875c6e81e5ed52df46740083451380c4597b560c Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Mon, 24 Jul 2017 14:08:26 +0200 Subject: Optimize C/C++ lexer --- build2/cc/lexer.cxx | 164 ++++++++++++++++++++++++++++++++++++++-- build2/cc/lexer.hxx | 5 +- build2/cc/parser.cxx | 2 +- build2/cc/parser.hxx | 2 +- unit-tests/cc/lexer/driver.cxx | 17 ++--- unit-tests/cc/parser/driver.cxx | 20 ++--- 6 files changed, 174 insertions(+), 36 deletions(-) diff --git a/build2/cc/lexer.cxx b/build2/cc/lexer.cxx index 8cabffd..2f7f1a3 100644 --- a/build2/cc/lexer.cxx +++ b/build2/cc/lexer.cxx @@ -7,6 +7,27 @@ using namespace std; using namespace butl; +// bit 0 - identifier character (_0-9A-Ba-b). +// +static const uint8_t char_flags[256] = +//0 1 2 3 4 5 6 7 8 9 A B C D E F +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, // 3 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, // 5 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, // 7 + + // 128-255 + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0 +}; + // Diagnostics plumbing. // namespace butl // ADL @@ -361,21 +382,60 @@ namespace build2 { bool raw (false); // Raw string literal. - if (alpha (c) || c == '_') + // Note: known not to be a digit (see above). + // + if (char_flags[static_cast (c)] & 0x01) { // This smells a little: we know skip_spaces() did not peek at // the next character because this is not '/'. Which means the // position in the stream must be of this character + 1. // - if (buf_ != nullptr) - t.position = buf_->tellg () - 1; + t.position = buf_->tellg () - 1; string& id (t.value); - id.clear (); + id = c; - for (id += c; (c = peek ()) == '_' || alnum (c); geth (c)) + while (char_flags[static_cast (c = peek ())] & 0x01) + { + geth (c); id += c; + // Direct buffer scan. Note that we always follow up with the + // normal peek() call which may load the next chunk, handle + // line continuations, etc. In other words, the end of the + // "raw" scan doesn't necessarily mean the end. + // + const char* b (gptr_); + const char* p (b); + + for (const char* e (egptr_); + p != e && char_flags[static_cast (*p)] & 0x01; + ++p) ; + + // Unrolling this loop doesn't make a difference. + // + // for (const char* e (egptr_ - 4); p < e; p += 4) + // { + // uint8_t c; + // + // c = static_cast (p[0]); + // if (!(char_flags[c] & 0x01)) break; + // + // c = static_cast (p[1]); + // if (!(char_flags[c] & 0x01)) {p += 1; break;} + // + // c = static_cast (p[2]); + // if (!(char_flags[c] & 0x01)) {p += 2; break;} + // + // c = static_cast (p[3]); + // if (!(char_flags[c] & 0x01)) {p += 3; break;} + // } + + size_t n (p - b); + id.append (b, n); cs_.append (b, n); + gptr_ = p; buf_->gbump (n); column += n; + } + // If the following character is a quote, see if the identifier // is one of the literal prefixes. // @@ -610,6 +670,23 @@ namespace build2 // "\\". // p = (c == '\\' && p == '\\') ? '\0' : static_cast (c); + + // Direct buffer scan. + // + if (p != '\\') + { + const char* b (gptr_); + const char* e (egptr_); + const char* p (b); + + for (char c; + p != e && (c = *p) != '\"' && c != '\\' && c != '\n'; + ++p) ; + + size_t n (p - b); + cs_.append (b, n); + gptr_ = p; buf_->gbump (n); column += n; + } } // See if we have a user-defined suffix (which is an identifier). @@ -771,6 +848,23 @@ namespace build2 } s += c; + + // Direct buffer scan. + // + if (p != '\\') + { + const char* b (gptr_); + const char* e (egptr_); + const char* p (b); + + for (char c; + p != e && (c = *p) != '\"' && c != '\\' && c != '\n'; + ++p) ; + + size_t n (p - b); + s.append (b, n); + gptr_ = p; buf_->gbump (n); column += n; + } } log_file_ = path (move (s)); // Move back in. @@ -858,8 +952,23 @@ namespace build2 case '\t': case '\r': case '\f': - case '\v': continue; + case '\v': + { + // Direct buffer scan. + // + const char* b (gptr_); + const char* e (egptr_); + const char* p (b); + for (char c; + p != e && ((c = *p) == ' ' || c == '\t'); + ++p) ; + + size_t n (p - b); + gptr_ = p; buf_->gbump (n); column += n; + + continue; + } case '/': { xchar p (peek ()); @@ -869,7 +978,26 @@ namespace build2 if (p == '/') { get (p); - do { c = get (); } while (!eos (c) && c != '\n'); + + for (;;) + { + c = get (); + if (c == '\n' || eos (c)) + break; + + // Direct buffer scan. + // + const char* b (gptr_); + const char* e (egptr_); + const char* p (b); + + for (char c; + p != e && (c = *p) != '\n' && c != '\\'; + ++p) ; + + size_t n (p - b); + gptr_ = p; buf_->gbump (n); column += n; + } if (!nl) break; @@ -895,6 +1023,28 @@ namespace build2 get (c); break; } + + // Direct buffer scan. + // + const char* b (gptr_); + const char* e (egptr_); + const char* p (b); + + for (char c; + p != e && (c = *p) != '*' && c != '\\'; + ++p) + { + if (c == '\n') + { + if (log_line_) ++*log_line_; + ++line; + column = 1; + } + else + ++column; + } + + gptr_ = p; buf_->gbump (p - b); } continue; } diff --git a/build2/cc/lexer.hxx b/build2/cc/lexer.hxx index 1869344..aa24f6a 100644 --- a/build2/cc/lexer.hxx +++ b/build2/cc/lexer.hxx @@ -64,8 +64,7 @@ namespace build2 uint64_t line = 0; uint64_t column = 0; - // Physical position in the stream, currently only for identifiers and - // only if the stream is ifdstream. + // Physical position in the stream, currently only for identifiers. // uint64_t position = 0; }; @@ -78,7 +77,7 @@ namespace build2 class lexer: protected butl::char_scanner { public: - lexer (istream& is, const path& name) + lexer (ifdstream& is, const path& name) : char_scanner (is, false), name_ (name), fail ("error", &name_), diff --git a/build2/cc/parser.cxx b/build2/cc/parser.cxx index a97a98b..fcb6265 100644 --- a/build2/cc/parser.cxx +++ b/build2/cc/parser.cxx @@ -16,7 +16,7 @@ namespace build2 using type = token_type; translation_unit parser:: - parse (istream& is, const path& name) + parse (ifdstream& is, const path& name) { lexer l (is, name); l_ = &l; diff --git a/build2/cc/parser.hxx b/build2/cc/parser.hxx index 9142001..bec67bd 100644 --- a/build2/cc/parser.hxx +++ b/build2/cc/parser.hxx @@ -25,7 +25,7 @@ namespace build2 { public: translation_unit - parse (istream&, const path& name); + parse (ifdstream&, const path& name); private: void diff --git a/unit-tests/cc/lexer/driver.cxx b/unit-tests/cc/lexer/driver.cxx index 5803a88..53910a6 100644 --- a/unit-tests/cc/lexer/driver.cxx +++ b/unit-tests/cc/lexer/driver.cxx @@ -11,6 +11,7 @@ #include using namespace std; +using namespace butl; namespace build2 { @@ -39,24 +40,16 @@ namespace build2 try { - istream* is; - - // Reading from file is several times faster. - // - ifdstream ifs; + ifdstream is; if (file != nullptr) - { - ifs.open (file); - is = &ifs; - } + is.open (file); else { file = "stdin"; - cin.exceptions (istream::failbit | istream::badbit); - is = &cin; + is.open (fddup (stdin_fd ())); } - lexer l (*is, path (file)); + lexer l (is, path (file)); // No use printing eos since we will either get it or loop forever. // diff --git a/unit-tests/cc/parser/driver.cxx b/unit-tests/cc/parser/driver.cxx index ec346d3..19d85e8 100644 --- a/unit-tests/cc/parser/driver.cxx +++ b/unit-tests/cc/parser/driver.cxx @@ -11,6 +11,7 @@ #include using namespace std; +using namespace butl; namespace build2 { @@ -23,27 +24,22 @@ namespace build2 { try { - istream* is; - const char* in; + const char* file; - // Reading from file is several times faster. - // - ifdstream ifs; + ifdstream is; if (argc > 1) { - in = argv[1]; - ifs.open (in); - is = &ifs; + file = argv[1]; + is.open (file); } else { - in = "stdin"; - cin.exceptions (istream::failbit | istream::badbit); - is = &cin; + file = "stdin"; + is.open (fddup (stdin_fd ())); } parser p; - translation_unit u (p.parse (*is, path (in))); + translation_unit u (p.parse (is, path (file))); for (const module_import& m: u.mod.imports) cout << (m.exported ? "export " : "") -- cgit v1.1