aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBoris Kolpackov <boris@codesynthesis.com>2017-07-24 14:08:26 +0200
committerBoris Kolpackov <boris@codesynthesis.com>2017-07-24 14:09:08 +0200
commit875c6e81e5ed52df46740083451380c4597b560c (patch)
treec3df90087a7797575a6343fb0fe6bc4444975cb5
parentc18a8d2d43f22ccf0b21461b1f0d3395d4e50c1f (diff)
Optimize C/C++ lexer
-rw-r--r--build2/cc/lexer.cxx164
-rw-r--r--build2/cc/lexer.hxx5
-rw-r--r--build2/cc/parser.cxx2
-rw-r--r--build2/cc/parser.hxx2
-rw-r--r--unit-tests/cc/lexer/driver.cxx17
-rw-r--r--unit-tests/cc/parser/driver.cxx20
6 files changed, 174 insertions, 36 deletions
diff --git a/build2/cc/lexer.cxx b/build2/cc/lexer.cxx
index 8cabffd..2f7f1a3 100644
--- a/build2/cc/lexer.cxx
+++ b/build2/cc/lexer.cxx
@@ -7,6 +7,27 @@
using namespace std;
using namespace butl;
+// bit 0 - identifier character (_0-9A-Ba-b).
+//
+static const uint8_t char_flags[256] =
+//0 1 2 3 4 5 6 7 8 9 A B C D E F
+{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, // 3
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, // 5
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, // 7
+
+ // 128-255
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0
+};
+
// Diagnostics plumbing.
//
namespace butl // ADL
@@ -361,21 +382,60 @@ namespace build2
{
bool raw (false); // Raw string literal.
- if (alpha (c) || c == '_')
+ // Note: known not to be a digit (see above).
+ //
+ if (char_flags[static_cast<uint8_t> (c)] & 0x01)
{
// This smells a little: we know skip_spaces() did not peek at
// the next character because this is not '/'. Which means the
// position in the stream must be of this character + 1.
//
- if (buf_ != nullptr)
- t.position = buf_->tellg () - 1;
+ t.position = buf_->tellg () - 1;
string& id (t.value);
- id.clear ();
+ id = c;
- for (id += c; (c = peek ()) == '_' || alnum (c); geth (c))
+ while (char_flags[static_cast<uint8_t> (c = peek ())] & 0x01)
+ {
+ geth (c);
id += c;
+ // Direct buffer scan. Note that we always follow up with the
+ // normal peek() call which may load the next chunk, handle
+ // line continuations, etc. In other words, the end of the
+ // "raw" scan doesn't necessarily mean the end.
+ //
+ const char* b (gptr_);
+ const char* p (b);
+
+ for (const char* e (egptr_);
+ p != e && char_flags[static_cast<uint8_t> (*p)] & 0x01;
+ ++p) ;
+
+ // Unrolling this loop doesn't make a difference.
+ //
+ // for (const char* e (egptr_ - 4); p < e; p += 4)
+ // {
+ // uint8_t c;
+ //
+ // c = static_cast<uint8_t> (p[0]);
+ // if (!(char_flags[c] & 0x01)) break;
+ //
+ // c = static_cast<uint8_t> (p[1]);
+ // if (!(char_flags[c] & 0x01)) {p += 1; break;}
+ //
+ // c = static_cast<uint8_t> (p[2]);
+ // if (!(char_flags[c] & 0x01)) {p += 2; break;}
+ //
+ // c = static_cast<uint8_t> (p[3]);
+ // if (!(char_flags[c] & 0x01)) {p += 3; break;}
+ // }
+
+ size_t n (p - b);
+ id.append (b, n); cs_.append (b, n);
+ gptr_ = p; buf_->gbump (n); column += n;
+ }
+
// If the following character is a quote, see if the identifier
// is one of the literal prefixes.
//
@@ -610,6 +670,23 @@ namespace build2
// "\\".
//
p = (c == '\\' && p == '\\') ? '\0' : static_cast<char> (c);
+
+ // Direct buffer scan.
+ //
+ if (p != '\\')
+ {
+ const char* b (gptr_);
+ const char* e (egptr_);
+ const char* p (b);
+
+ for (char c;
+ p != e && (c = *p) != '\"' && c != '\\' && c != '\n';
+ ++p) ;
+
+ size_t n (p - b);
+ cs_.append (b, n);
+ gptr_ = p; buf_->gbump (n); column += n;
+ }
}
// See if we have a user-defined suffix (which is an identifier).
@@ -771,6 +848,23 @@ namespace build2
}
s += c;
+
+ // Direct buffer scan.
+ //
+ if (p != '\\')
+ {
+ const char* b (gptr_);
+ const char* e (egptr_);
+ const char* p (b);
+
+ for (char c;
+ p != e && (c = *p) != '\"' && c != '\\' && c != '\n';
+ ++p) ;
+
+ size_t n (p - b);
+ s.append (b, n);
+ gptr_ = p; buf_->gbump (n); column += n;
+ }
}
log_file_ = path (move (s)); // Move back in.
@@ -858,8 +952,23 @@ namespace build2
case '\t':
case '\r':
case '\f':
- case '\v': continue;
+ case '\v':
+ {
+ // Direct buffer scan.
+ //
+ const char* b (gptr_);
+ const char* e (egptr_);
+ const char* p (b);
+ for (char c;
+ p != e && ((c = *p) == ' ' || c == '\t');
+ ++p) ;
+
+ size_t n (p - b);
+ gptr_ = p; buf_->gbump (n); column += n;
+
+ continue;
+ }
case '/':
{
xchar p (peek ());
@@ -869,7 +978,26 @@ namespace build2
if (p == '/')
{
get (p);
- do { c = get (); } while (!eos (c) && c != '\n');
+
+ for (;;)
+ {
+ c = get ();
+ if (c == '\n' || eos (c))
+ break;
+
+ // Direct buffer scan.
+ //
+ const char* b (gptr_);
+ const char* e (egptr_);
+ const char* p (b);
+
+ for (char c;
+ p != e && (c = *p) != '\n' && c != '\\';
+ ++p) ;
+
+ size_t n (p - b);
+ gptr_ = p; buf_->gbump (n); column += n;
+ }
if (!nl)
break;
@@ -895,6 +1023,28 @@ namespace build2
get (c);
break;
}
+
+ // Direct buffer scan.
+ //
+ const char* b (gptr_);
+ const char* e (egptr_);
+ const char* p (b);
+
+ for (char c;
+ p != e && (c = *p) != '*' && c != '\\';
+ ++p)
+ {
+ if (c == '\n')
+ {
+ if (log_line_) ++*log_line_;
+ ++line;
+ column = 1;
+ }
+ else
+ ++column;
+ }
+
+ gptr_ = p; buf_->gbump (p - b);
}
continue;
}
diff --git a/build2/cc/lexer.hxx b/build2/cc/lexer.hxx
index 1869344..aa24f6a 100644
--- a/build2/cc/lexer.hxx
+++ b/build2/cc/lexer.hxx
@@ -64,8 +64,7 @@ namespace build2
uint64_t line = 0;
uint64_t column = 0;
- // Physical position in the stream, currently only for identifiers and
- // only if the stream is ifdstream.
+ // Physical position in the stream, currently only for identifiers.
//
uint64_t position = 0;
};
@@ -78,7 +77,7 @@ namespace build2
class lexer: protected butl::char_scanner
{
public:
- lexer (istream& is, const path& name)
+ lexer (ifdstream& is, const path& name)
: char_scanner (is, false),
name_ (name),
fail ("error", &name_),
diff --git a/build2/cc/parser.cxx b/build2/cc/parser.cxx
index a97a98b..fcb6265 100644
--- a/build2/cc/parser.cxx
+++ b/build2/cc/parser.cxx
@@ -16,7 +16,7 @@ namespace build2
using type = token_type;
translation_unit parser::
- parse (istream& is, const path& name)
+ parse (ifdstream& is, const path& name)
{
lexer l (is, name);
l_ = &l;
diff --git a/build2/cc/parser.hxx b/build2/cc/parser.hxx
index 9142001..bec67bd 100644
--- a/build2/cc/parser.hxx
+++ b/build2/cc/parser.hxx
@@ -25,7 +25,7 @@ namespace build2
{
public:
translation_unit
- parse (istream&, const path& name);
+ parse (ifdstream&, const path& name);
private:
void
diff --git a/unit-tests/cc/lexer/driver.cxx b/unit-tests/cc/lexer/driver.cxx
index 5803a88..53910a6 100644
--- a/unit-tests/cc/lexer/driver.cxx
+++ b/unit-tests/cc/lexer/driver.cxx
@@ -11,6 +11,7 @@
#include <build2/cc/lexer.hxx>
using namespace std;
+using namespace butl;
namespace build2
{
@@ -39,24 +40,16 @@ namespace build2
try
{
- istream* is;
-
- // Reading from file is several times faster.
- //
- ifdstream ifs;
+ ifdstream is;
if (file != nullptr)
- {
- ifs.open (file);
- is = &ifs;
- }
+ is.open (file);
else
{
file = "stdin";
- cin.exceptions (istream::failbit | istream::badbit);
- is = &cin;
+ is.open (fddup (stdin_fd ()));
}
- lexer l (*is, path (file));
+ lexer l (is, path (file));
// No use printing eos since we will either get it or loop forever.
//
diff --git a/unit-tests/cc/parser/driver.cxx b/unit-tests/cc/parser/driver.cxx
index ec346d3..19d85e8 100644
--- a/unit-tests/cc/parser/driver.cxx
+++ b/unit-tests/cc/parser/driver.cxx
@@ -11,6 +11,7 @@
#include <build2/cc/parser.hxx>
using namespace std;
+using namespace butl;
namespace build2
{
@@ -23,27 +24,22 @@ namespace build2
{
try
{
- istream* is;
- const char* in;
+ const char* file;
- // Reading from file is several times faster.
- //
- ifdstream ifs;
+ ifdstream is;
if (argc > 1)
{
- in = argv[1];
- ifs.open (in);
- is = &ifs;
+ file = argv[1];
+ is.open (file);
}
else
{
- in = "stdin";
- cin.exceptions (istream::failbit | istream::badbit);
- is = &cin;
+ file = "stdin";
+ is.open (fddup (stdin_fd ()));
}
parser p;
- translation_unit u (p.parse (*is, path (in)));
+ translation_unit u (p.parse (is, path (file)));
for (const module_import& m: u.mod.imports)
cout << (m.exported ? "export " : "")