From 875c6e81e5ed52df46740083451380c4597b560c Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Mon, 24 Jul 2017 14:08:26 +0200
Subject: Optimize C/C++ lexer

---
 build2/cc/lexer.cxx             | 164 ++++++++++++++++++++++++++++++++++++++--
 build2/cc/lexer.hxx             |   5 +-
 build2/cc/parser.cxx            |   2 +-
 build2/cc/parser.hxx            |   2 +-
 unit-tests/cc/lexer/driver.cxx  |  17 ++---
 unit-tests/cc/parser/driver.cxx |  20 ++---
 6 files changed, 174 insertions(+), 36 deletions(-)

diff --git a/build2/cc/lexer.cxx b/build2/cc/lexer.cxx
index 8cabffd..2f7f1a3 100644
--- a/build2/cc/lexer.cxx
+++ b/build2/cc/lexer.cxx
@@ -7,6 +7,27 @@
 using namespace std;
 using namespace butl;
 
+// bit 0 - identifier character (_0-9A-Ba-b).
+//
+static const uint8_t char_flags[256] =
+//0    1    2    3    4    5    6    7      8    9    A    B    C    D    E    F
+{
+  0,   0,   0,   0,   0,   0,   0,   0,     0,   0,   0,   0,   0,   0,   0,   0, // 0
+  0,   0,   0,   0,   0,   0,   0,   0,     0,   0,   0,   0,   0,   0,   0,   0, // 1
+  0,   0,   0,   0,   0,   0,   0,   0,     0,   0,   0,   0,   0,   0,   0,   0, // 2
+  1,   1,   1,   1,   1,   1,   1,   1,     1,   1,   0,   0,   0,   0,   0,   0, // 3
+  0,   1,   1,   1,   1,   1,   1,   1,     1,   1,   1,   1,   1,   1,   1,   1, // 4
+  1,   1,   1,   1,   1,   1,   1,   1,     1,   1,   1,   0,   0,   0,   0,   1, // 5
+  0,   1,   1,   1,   1,   1,   1,   1,     1,   1,   1,   1,   1,   1,   1,   1, // 6
+  1,   1,   1,   1,   1,   1,   1,   1,     1,   1,   1,   0,   0,   0,   0,   0, // 7
+
+  // 128-255
+  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0
+};
+
 // Diagnostics plumbing.
 //
 namespace butl // ADL
@@ -361,21 +382,60 @@ namespace build2
           {
             bool raw (false); // Raw string literal.
 
-            if (alpha (c) || c == '_')
+            // Note: known not to be a digit (see above).
+            //
+            if (char_flags[static_cast<uint8_t> (c)] & 0x01)
             {
               // This smells a little: we know skip_spaces() did not peek at
               // the next character because this is not '/'. Which means the
               // position in the stream must be of this character + 1.
               //
-              if (buf_ != nullptr)
-                t.position = buf_->tellg () - 1;
+              t.position = buf_->tellg () - 1;
 
               string& id (t.value);
-              id.clear ();
+              id = c;
 
-              for (id += c; (c = peek ()) == '_' || alnum (c); geth (c))
+              while (char_flags[static_cast<uint8_t> (c = peek ())] & 0x01)
+              {
+                geth (c);
                 id += c;
 
+                // Direct buffer scan. Note that we always follow up with the
+                // normal peek() call which may load the next chunk, handle
+                // line continuations, etc. In other words, the end of the
+                // "raw" scan doesn't necessarily mean the end.
+                //
+                const char* b (gptr_);
+                const char* p (b);
+
+                for (const char* e (egptr_);
+                     p != e && char_flags[static_cast<uint8_t> (*p)] & 0x01;
+                     ++p) ;
+
+                // Unrolling this loop doesn't make a difference.
+                //
+                // for (const char* e (egptr_ - 4); p < e; p += 4)
+                // {
+                //   uint8_t c;
+                //
+                //  c = static_cast<uint8_t> (p[0]);
+                //  if (!(char_flags[c] & 0x01)) break;
+                //
+                //  c = static_cast<uint8_t> (p[1]);
+                //  if (!(char_flags[c] & 0x01)) {p += 1; break;}
+                //
+                //  c = static_cast<uint8_t> (p[2]);
+                //  if (!(char_flags[c] & 0x01)) {p += 2; break;}
+                //
+                //  c = static_cast<uint8_t> (p[3]);
+                //  if (!(char_flags[c] & 0x01)) {p += 3; break;}
+                // }
+
+                size_t n (p - b);
+                id.append (b, n); cs_.append (b, n);
+                gptr_ = p; buf_->gbump (n); column += n;
+              }
+
               // If the following character is a quote, see if the identifier
               // is one of the literal prefixes.
               //
@@ -610,6 +670,23 @@ namespace build2
         // "\\".
         //
         p = (c == '\\' && p == '\\') ? '\0' : static_cast<char> (c);
+
+        // Direct buffer scan.
+        //
+        if (p != '\\')
+        {
+          const char* b (gptr_);
+          const char* e (egptr_);
+          const char* p (b);
+
+          for (char c;
+               p != e && (c = *p) != '\"' && c != '\\' && c != '\n';
+               ++p) ;
+
+          size_t n (p - b);
+          cs_.append (b, n);
+          gptr_ = p; buf_->gbump (n); column += n;
+        }
       }
 
       // See if we have a user-defined suffix (which is an identifier).
@@ -771,6 +848,23 @@ namespace build2
           }
 
           s += c;
+
+          // Direct buffer scan.
+          //
+          if (p != '\\')
+          {
+            const char* b (gptr_);
+            const char* e (egptr_);
+            const char* p (b);
+
+            for (char c;
+                 p != e && (c = *p) != '\"' && c != '\\' && c != '\n';
+                 ++p) ;
+
+            size_t n (p - b);
+            s.append (b, n);
+            gptr_ = p; buf_->gbump (n); column += n;
+          }
         }
 
         log_file_ = path (move (s)); // Move back in.
@@ -858,8 +952,23 @@ namespace build2
         case '\t':
         case '\r':
         case '\f':
-        case '\v': continue;
+        case '\v':
+          {
+            // Direct buffer scan.
+            //
+            const char* b (gptr_);
+            const char* e (egptr_);
+            const char* p (b);
 
+            for (char c;
+                 p != e && ((c = *p) == ' ' || c == '\t');
+                 ++p) ;
+
+            size_t n (p - b);
+            gptr_ = p; buf_->gbump (n); column += n;
+
+            continue;
+          }
         case '/':
           {
             xchar p (peek ());
@@ -869,7 +978,26 @@ namespace build2
             if (p == '/')
             {
               get (p);
-              do { c = get (); } while (!eos (c) && c != '\n');
+
+              for (;;)
+              {
+                c = get ();
+                if (c == '\n' || eos (c))
+                  break;
+
+                // Direct buffer scan.
+                //
+                const char* b (gptr_);
+                const char* e (egptr_);
+                const char* p (b);
+
+                for (char c;
+                     p != e && (c = *p) != '\n' && c != '\\';
+                     ++p) ;
+
+                size_t n (p - b);
+                gptr_ = p; buf_->gbump (n); column += n;
+              }
 
               if (!nl)
                 break;
@@ -895,6 +1023,28 @@ namespace build2
                   get (c);
                   break;
                 }
+
+                // Direct buffer scan.
+                //
+                const char* b (gptr_);
+                const char* e (egptr_);
+                const char* p (b);
+
+                for (char c;
+                     p != e && (c = *p) != '*' && c != '\\';
+                     ++p)
+                {
+                  if (c == '\n')
+                  {
+                    if (log_line_) ++*log_line_;
+                    ++line;
+                    column = 1;
+                  }
+                  else
+                    ++column;
+                }
+
+                gptr_ = p; buf_->gbump (p - b);
               }
               continue;
             }
diff --git a/build2/cc/lexer.hxx b/build2/cc/lexer.hxx
index 1869344..aa24f6a 100644
--- a/build2/cc/lexer.hxx
+++ b/build2/cc/lexer.hxx
@@ -64,8 +64,7 @@ namespace build2
       uint64_t line   = 0;
       uint64_t column = 0;
 
-      // Physical position in the stream, currently only for identifiers and
-      // only if the stream is ifdstream.
+      // Physical position in the stream, currently only for identifiers.
       //
       uint64_t position = 0;
     };
@@ -78,7 +77,7 @@ namespace build2
     class lexer: protected butl::char_scanner
     {
     public:
-      lexer (istream& is, const path& name)
+      lexer (ifdstream& is, const path& name)
           : char_scanner (is, false),
             name_ (name),
             fail ("error", &name_),
diff --git a/build2/cc/parser.cxx b/build2/cc/parser.cxx
index a97a98b..fcb6265 100644
--- a/build2/cc/parser.cxx
+++ b/build2/cc/parser.cxx
@@ -16,7 +16,7 @@ namespace build2
     using type = token_type;
 
     translation_unit parser::
-    parse (istream& is, const path& name)
+    parse (ifdstream& is, const path& name)
     {
       lexer l (is, name);
       l_ = &l;
diff --git a/build2/cc/parser.hxx b/build2/cc/parser.hxx
index 9142001..bec67bd 100644
--- a/build2/cc/parser.hxx
+++ b/build2/cc/parser.hxx
@@ -25,7 +25,7 @@ namespace build2
     {
     public:
       translation_unit
-      parse (istream&, const path& name);
+      parse (ifdstream&, const path& name);
 
     private:
       void
diff --git a/unit-tests/cc/lexer/driver.cxx b/unit-tests/cc/lexer/driver.cxx
index 5803a88..53910a6 100644
--- a/unit-tests/cc/lexer/driver.cxx
+++ b/unit-tests/cc/lexer/driver.cxx
@@ -11,6 +11,7 @@
 #include <build2/cc/lexer.hxx>
 
 using namespace std;
+using namespace butl;
 
 namespace build2
 {
@@ -39,24 +40,16 @@ namespace build2
 
       try
       {
-        istream* is;
-
-        // Reading from file is several times faster.
-        //
-        ifdstream ifs;
+        ifdstream is;
         if (file != nullptr)
-        {
-          ifs.open (file);
-          is = &ifs;
-        }
+          is.open (file);
         else
         {
           file = "stdin";
-          cin.exceptions (istream::failbit | istream::badbit);
-          is = &cin;
+          is.open (fddup (stdin_fd ()));
         }
 
-        lexer l (*is, path (file));
+        lexer l (is, path (file));
 
         // No use printing eos since we will either get it or loop forever.
         //
diff --git a/unit-tests/cc/parser/driver.cxx b/unit-tests/cc/parser/driver.cxx
index ec346d3..19d85e8 100644
--- a/unit-tests/cc/parser/driver.cxx
+++ b/unit-tests/cc/parser/driver.cxx
@@ -11,6 +11,7 @@
 #include <build2/cc/parser.hxx>
 
 using namespace std;
+using namespace butl;
 
 namespace build2
 {
@@ -23,27 +24,22 @@ namespace build2
     {
       try
       {
-        istream* is;
-        const char* in;
+        const char* file;
 
-        // Reading from file is several times faster.
-        //
-        ifdstream ifs;
+        ifdstream is;
         if (argc > 1)
         {
-          in = argv[1];
-          ifs.open (in);
-          is = &ifs;
+          file = argv[1];
+          is.open (file);
         }
         else
         {
-          in = "stdin";
-          cin.exceptions (istream::failbit | istream::badbit);
-          is = &cin;
+          file = "stdin";
+          is.open (fddup (stdin_fd ()));
         }
 
         parser p;
-        translation_unit u (p.parse (*is, path (in)));
+        translation_unit u (p.parse (is, path (file)));
 
         for (const module_import& m: u.mod.imports)
           cout << (m.exported ? "export " : "")
-- 
cgit v1.1