From 2e19434e09b819105055ddc8e58f69db98ec8669 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Sat, 27 May 2017 15:24:25 +0200 Subject: Handle #line directives in C/C++ lexer This way the parser now reports logical rather than physical location in diagnostics. --- build2/cc/lexer.cxx | 180 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 149 insertions(+), 31 deletions(-) (limited to 'build2/cc/lexer.cxx') diff --git a/build2/cc/lexer.cxx b/build2/cc/lexer.cxx index 05c734c..40178bb 100644 --- a/build2/cc/lexer.cxx +++ b/build2/cc/lexer.cxx @@ -25,6 +25,18 @@ namespace build2 { namespace cc { + inline void lexer:: + get (const xchar& c) + { + // Increment the logical line similar to how base will increment the + // physical (the column counts are the same). + // + if (log_line_ && c == '\n' && !unget_ && !unpeek_) + ++*log_line_; + + base::get (c); + } + inline auto lexer:: get (bool e) -> xchar { @@ -36,7 +48,7 @@ namespace build2 else { xchar c (peek (e)); - base::get (c); + get (c); return c; } } @@ -54,12 +66,12 @@ namespace build2 if (e && c == '\\') { - base::get (c); + get (c); xchar p (base::peek ()); if (p == '\n') { - base::get (p); + get (p); return peek (e); // Recurse. } @@ -80,7 +92,8 @@ namespace build2 { for (;; c = skip_spaces ()) { - t.line = c.line; + t.file = log_file_; + t.line = log_line_ ? * log_line_ : c.line; t.column = c.column; if (eos (c)) @@ -101,19 +114,52 @@ namespace build2 // that we assume there cannot be #include directives. // // This may not work for things like #error that can contain - // pretty much anything. Also note that lines that start with - // # can contain # further down. + // pretty much anything. Also note that lines that start with # + // can contain # further down. + // + // Finally, to support diagnostics properly we need to recognize + // #line directives. // if (ignore_pp) { - for (;;) + for (bool first (true);;) { + // Note that we keep using the passed token for buffers. + // c = skip_spaces (false); // Stop at newline. if (eos (c) || c == '\n') break; - next (t, c, false); // Keep using the passed token for buffers. + if (first) + { + first = false; + + // Recognize #line and its shorthand version: + // + // #line [] ... + // # [] ... + // + if (!(c >= '0' && c <= '9')) + { + next (t, c, false); + + if (t.type != type::identifier || t.value != "line") + continue; + + c = skip_spaces (false); + + if (!(c >= '0' && c <= '9')) + fail (c) << "line number expected after #line directive"; + } + + // Ok, this is #line and next comes the line number. + // + line_directive (t, c); + continue; // Parse the tail, if any. + } + + next (t, c, false); } break; } @@ -356,9 +402,6 @@ namespace build2 void lexer:: number_literal (token& t, xchar c) { - t.line = c.line; - t.column = c.column; - // A number (integer or floating point literal) can: // // 1. Start with a dot (which must be followed by a digit, e.g., .123). @@ -462,17 +505,15 @@ namespace build2 void lexer:: char_literal (token& t, xchar c) { - t.line = c.line; - t.column = c.column; + uint64_t ln (c.line); + uint64_t cn (c.column); - char p (c); // Previous character (see below). - - for (;;) + for (char p (c);;) // Previous character (see below). { c = get (); - if (eos (c)) - fail (location (&name_, t.line, t.column)) << "unterminated literal"; + if (eos (c) || c == '\n') + fail (location (&name_, ln, cn)) << "unterminated character literal"; if (c == '\'' && p != '\\') break; @@ -494,17 +535,15 @@ namespace build2 void lexer:: string_literal (token& t, xchar c) { - t.line = c.line; - t.column = c.column; - - char p (c); // Previous character (see below). + uint64_t ln (c.line); + uint64_t cn (c.column); - for (;;) + for (char p (c);;) // Previous character (see below). { c = get (); - if (eos (c)) - fail (location (&name_, t.line, t.column)) << "unterminated literal"; + if (eos (c) || c == '\n') + fail (location (&name_, ln, cn)) << "unterminated string literal"; if (c == '\"' && p != '\\') break; @@ -526,9 +565,6 @@ namespace build2 void lexer:: raw_string_literal (token& t, xchar c) { - t.line = c.line; - t.column = c.column; - // The overall form is: // // R"()" @@ -540,6 +576,8 @@ namespace build2 // Note that the are not processed in any way, not even // for line continuations. // + uint64_t ln (c.line); + uint64_t cn (c.column); // As a first step, parse the delimiter (including the openning paren). // @@ -550,7 +588,7 @@ namespace build2 c = get (); if (eos (c) || c == '\"' || c == ')' || c == '\\' || c == ' ') - fail (location (&name_, t.line, t.column)) << "invalid raw literal"; + fail (location (&name_, ln, cn)) << "invalid raw string literal"; if (c == '(') break; @@ -567,8 +605,8 @@ namespace build2 { c = get (false); // No newline escaping. - if (eos (c)) - fail (location (&name_, t.line, t.column)) << "invalid raw literal"; + if (eos (c)) // Note: newline is ok. + fail (location (&name_, ln, cn)) << "invalid raw string literal"; if (c != d[i] && i != 0) // Restart from the beginning. i = 0; @@ -596,6 +634,86 @@ namespace build2 for (get (c); (c = peek ()) == '_' || alnum (c); get (c)) ; } + void lexer:: + line_directive (token& t, xchar c) + { + // enter: first digit of the line number + // leave: last character of the line number or file string + + // If our number and string tokens contained the literal values, then we + // could have used that. However, we ignore the value (along with escape + // processing, etc), for performance. Let's keep it that way and instead + // handle it ourselves. + // + { + string& s (t.value); + + for (s = c; (c = peek ()) >= '0' && c <= '9'; get (c)) + s += c; + + // The newline that ends the directive will increment the logical line + // so subtract one to compensate. Note: can't be 0 and shouldn't throw + // for valid lines. + // + log_line_ = stoull (s.c_str ()) - 1; + } + + // See if we have the file. + // + c = skip_spaces (false); + + if (c == '\"') + { + string s (move (log_file_).string ()); // Move string rep out. + s.clear (); + + uint64_t ln (c.line); + uint64_t cn (c.column); + + for (char p ('\0'); p != '\"'; ) // Previous character. + { + c = get (); + + if (eos (c) || c == '\n') + fail (location (&name_, ln, cn)) << "unterminated string literal"; + + // Handle escapes. + // + if (p == '\\') + { + p = '\0'; // Clear so we don't confuse \" and \\". + + // We only handle what can reasonably be expected in a file name. + // + switch (c) + { + case '\\': + case '\'': + case '\"': break; // Add as is. + default: + fail (c) << "unsupported escape sequence in #line directive"; + } + } + else + { + p = c; + + switch (c) + { + case '\\': + case '\"': continue; + } + } + + s += c; + } + + log_file_ = path (move (s)); // Move back in. + } + else + unget (c); + } + auto lexer:: skip_spaces (bool nl) -> xchar { -- cgit v1.1