From 2e19434e09b819105055ddc8e58f69db98ec8669 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Sat, 27 May 2017 15:24:25 +0200 Subject: Handle #line directives in C/C++ lexer This way the parser now reports logical rather than physical location in diagnostics. --- build2/cc/lexer.cxx | 180 +++++++++++++++++++++++----- build2/cc/lexer.hxx | 38 +++--- build2/cc/parser.cxx | 4 +- build2/cc/parser.hxx | 7 -- unit-tests/cc/lexer/char-literal.test | 2 +- unit-tests/cc/lexer/driver.cxx | 37 ++++-- unit-tests/cc/lexer/preprocessor.test | 20 +++- unit-tests/cc/lexer/raw-string-literal.test | 10 +- unit-tests/cc/lexer/string-literal.test | 2 +- 9 files changed, 227 insertions(+), 73 deletions(-) diff --git a/build2/cc/lexer.cxx b/build2/cc/lexer.cxx index 05c734c..40178bb 100644 --- a/build2/cc/lexer.cxx +++ b/build2/cc/lexer.cxx @@ -25,6 +25,18 @@ namespace build2 { namespace cc { + inline void lexer:: + get (const xchar& c) + { + // Increment the logical line similar to how base will increment the + // physical (the column counts are the same). + // + if (log_line_ && c == '\n' && !unget_ && !unpeek_) + ++*log_line_; + + base::get (c); + } + inline auto lexer:: get (bool e) -> xchar { @@ -36,7 +48,7 @@ namespace build2 else { xchar c (peek (e)); - base::get (c); + get (c); return c; } } @@ -54,12 +66,12 @@ namespace build2 if (e && c == '\\') { - base::get (c); + get (c); xchar p (base::peek ()); if (p == '\n') { - base::get (p); + get (p); return peek (e); // Recurse. } @@ -80,7 +92,8 @@ namespace build2 { for (;; c = skip_spaces ()) { - t.line = c.line; + t.file = log_file_; + t.line = log_line_ ? * log_line_ : c.line; t.column = c.column; if (eos (c)) @@ -101,19 +114,52 @@ namespace build2 // that we assume there cannot be #include directives. // // This may not work for things like #error that can contain - // pretty much anything. Also note that lines that start with - // # can contain # further down. + // pretty much anything. Also note that lines that start with # + // can contain # further down. + // + // Finally, to support diagnostics properly we need to recognize + // #line directives. // if (ignore_pp) { - for (;;) + for (bool first (true);;) { + // Note that we keep using the passed token for buffers. + // c = skip_spaces (false); // Stop at newline. if (eos (c) || c == '\n') break; - next (t, c, false); // Keep using the passed token for buffers. + if (first) + { + first = false; + + // Recognize #line and its shorthand version: + // + // #line [] ... + // # [] ... + // + if (!(c >= '0' && c <= '9')) + { + next (t, c, false); + + if (t.type != type::identifier || t.value != "line") + continue; + + c = skip_spaces (false); + + if (!(c >= '0' && c <= '9')) + fail (c) << "line number expected after #line directive"; + } + + // Ok, this is #line and next comes the line number. + // + line_directive (t, c); + continue; // Parse the tail, if any. + } + + next (t, c, false); } break; } @@ -356,9 +402,6 @@ namespace build2 void lexer:: number_literal (token& t, xchar c) { - t.line = c.line; - t.column = c.column; - // A number (integer or floating point literal) can: // // 1. Start with a dot (which must be followed by a digit, e.g., .123). @@ -462,17 +505,15 @@ namespace build2 void lexer:: char_literal (token& t, xchar c) { - t.line = c.line; - t.column = c.column; + uint64_t ln (c.line); + uint64_t cn (c.column); - char p (c); // Previous character (see below). - - for (;;) + for (char p (c);;) // Previous character (see below). { c = get (); - if (eos (c)) - fail (location (&name_, t.line, t.column)) << "unterminated literal"; + if (eos (c) || c == '\n') + fail (location (&name_, ln, cn)) << "unterminated character literal"; if (c == '\'' && p != '\\') break; @@ -494,17 +535,15 @@ namespace build2 void lexer:: string_literal (token& t, xchar c) { - t.line = c.line; - t.column = c.column; - - char p (c); // Previous character (see below). + uint64_t ln (c.line); + uint64_t cn (c.column); - for (;;) + for (char p (c);;) // Previous character (see below). { c = get (); - if (eos (c)) - fail (location (&name_, t.line, t.column)) << "unterminated literal"; + if (eos (c) || c == '\n') + fail (location (&name_, ln, cn)) << "unterminated string literal"; if (c == '\"' && p != '\\') break; @@ -526,9 +565,6 @@ namespace build2 void lexer:: raw_string_literal (token& t, xchar c) { - t.line = c.line; - t.column = c.column; - // The overall form is: // // R"()" @@ -540,6 +576,8 @@ namespace build2 // Note that the are not processed in any way, not even // for line continuations. // + uint64_t ln (c.line); + uint64_t cn (c.column); // As a first step, parse the delimiter (including the openning paren). // @@ -550,7 +588,7 @@ namespace build2 c = get (); if (eos (c) || c == '\"' || c == ')' || c == '\\' || c == ' ') - fail (location (&name_, t.line, t.column)) << "invalid raw literal"; + fail (location (&name_, ln, cn)) << "invalid raw string literal"; if (c == '(') break; @@ -567,8 +605,8 @@ namespace build2 { c = get (false); // No newline escaping. - if (eos (c)) - fail (location (&name_, t.line, t.column)) << "invalid raw literal"; + if (eos (c)) // Note: newline is ok. + fail (location (&name_, ln, cn)) << "invalid raw string literal"; if (c != d[i] && i != 0) // Restart from the beginning. i = 0; @@ -596,6 +634,86 @@ namespace build2 for (get (c); (c = peek ()) == '_' || alnum (c); get (c)) ; } + void lexer:: + line_directive (token& t, xchar c) + { + // enter: first digit of the line number + // leave: last character of the line number or file string + + // If our number and string tokens contained the literal values, then we + // could have used that. However, we ignore the value (along with escape + // processing, etc), for performance. Let's keep it that way and instead + // handle it ourselves. + // + { + string& s (t.value); + + for (s = c; (c = peek ()) >= '0' && c <= '9'; get (c)) + s += c; + + // The newline that ends the directive will increment the logical line + // so subtract one to compensate. Note: can't be 0 and shouldn't throw + // for valid lines. + // + log_line_ = stoull (s.c_str ()) - 1; + } + + // See if we have the file. + // + c = skip_spaces (false); + + if (c == '\"') + { + string s (move (log_file_).string ()); // Move string rep out. + s.clear (); + + uint64_t ln (c.line); + uint64_t cn (c.column); + + for (char p ('\0'); p != '\"'; ) // Previous character. + { + c = get (); + + if (eos (c) || c == '\n') + fail (location (&name_, ln, cn)) << "unterminated string literal"; + + // Handle escapes. + // + if (p == '\\') + { + p = '\0'; // Clear so we don't confuse \" and \\". + + // We only handle what can reasonably be expected in a file name. + // + switch (c) + { + case '\\': + case '\'': + case '\"': break; // Add as is. + default: + fail (c) << "unsupported escape sequence in #line directive"; + } + } + else + { + p = c; + + switch (c) + { + case '\\': + case '\"': continue; + } + } + + s += c; + } + + log_file_ = path (move (s)); // Move back in. + } + else + unget (c); + } + auto lexer:: skip_spaces (bool nl) -> xchar { diff --git a/build2/cc/lexer.hxx b/build2/cc/lexer.hxx index 7865a4e..8767606 100644 --- a/build2/cc/lexer.hxx +++ b/build2/cc/lexer.hxx @@ -22,8 +22,10 @@ namespace build2 // // The input is a (partially-)preprocessed translation unit that may still // contain comments, line continuations, and preprocessor directives such - // as #line, #pragma, etc. Currently all preprocessor directives are - // discarded and no values are saved for literals. + // as #line, #pragma, etc., but not #include's. Currently all preprocessor + // directives except #line are ignored and no values are saved from + // literals. The #line directive (and its shorthand notation) is + // recognized to provide the logical token location. // enum class token_type { @@ -51,6 +53,7 @@ namespace build2 token_type type; string value; + path file; uint64_t line; uint64_t column; @@ -74,7 +77,10 @@ namespace build2 { public: lexer (istream& is, const path& name) - : char_scanner (is, false), name_ (name), fail ("error", &name_) {} + : char_scanner (is, false), + name_ (name), + fail ("error", &name_), + log_file_ (name) {} const path& name () const {return name_;} @@ -121,6 +127,9 @@ namespace build2 void literal_suffix (xchar); + void + line_directive (token&, xchar); + xchar skip_spaces (bool newline = true); @@ -134,7 +143,7 @@ namespace build2 get (bool escape = true); void - get (const xchar& peeked) {base::get (peeked);} + get (const xchar& peeked); xchar peek (bool escape = true); @@ -142,23 +151,20 @@ namespace build2 private: const path name_; const fail_mark fail; + + // Logical file and line as set by the #line directives. Note that the + // lexer diagnostics still uses the physical file/lines. + // + path log_file_; + optional log_line_; }; - // Diagnostics plumbing. We assume that any diag stream for which we can - // use token as location has its aux data pointing to pointer to path. + // Diagnostics plumbing. // inline location - get_location (const token& t, const path& p) - { - return location (&p, t.line, t.column); - } - - inline location - get_location (const token& t, const void* data) + get_location (const token& t, const void*) { - assert (data != nullptr); // E.g., must be &parser::path_. - const path* p (*static_cast (data)); - return get_location (t, *p); + return location (&t.file, t.line, t.column); } } } diff --git a/build2/cc/parser.cxx b/build2/cc/parser.cxx index b21e99f..24de7ba 100644 --- a/build2/cc/parser.cxx +++ b/build2/cc/parser.cxx @@ -18,9 +18,7 @@ namespace build2 translation_unit parser:: parse (istream& is, const path& name) { - name_ = &name; - - lexer l (is, *name_); + lexer l (is, name); l_ = &l; translation_unit u; diff --git a/build2/cc/parser.hxx b/build2/cc/parser.hxx index d52ddc9..00be190 100644 --- a/build2/cc/parser.hxx +++ b/build2/cc/parser.hxx @@ -30,8 +30,6 @@ namespace build2 class parser { public: - parser (): fail ("error", &name_), warn ("warning", &name_) {} - translation_unit parse (istream&, const path& name); @@ -46,11 +44,6 @@ namespace build2 parse_module_name (token&); private: - const path* name_; - - const fail_mark fail; - const basic_mark warn; - lexer* l_; translation_unit* u_; }; diff --git a/unit-tests/cc/lexer/char-literal.test b/unit-tests/cc/lexer/char-literal.test index f256785..f2c6249 100644 --- a/unit-tests/cc/lexer/char-literal.test +++ b/unit-tests/cc/lexer/char-literal.test @@ -63,5 +63,5 @@ EOO : unterminated : $* <"'a" 2>>EOE != 0 -stdin:1:1: error: unterminated literal +stdin:1:1: error: unterminated character literal EOE diff --git a/unit-tests/cc/lexer/driver.cxx b/unit-tests/cc/lexer/driver.cxx index db3f516..5803a88 100644 --- a/unit-tests/cc/lexer/driver.cxx +++ b/unit-tests/cc/lexer/driver.cxx @@ -16,38 +16,59 @@ namespace build2 { namespace cc { - // Usage: argv[0] [] + // Usage: argv[0] [-l] [] // int main (int argc, char* argv[]) { + bool loc (false); + const char* file (nullptr); + + for (int i (1); i != argc; ++i) + { + string a (argv[i]); + + if (a == "-l") + loc = true; + else + { + file = argv[i]; + break; + } + } + try { istream* is; - const char* in; // Reading from file is several times faster. // ifdstream ifs; - if (argc > 1) + if (file != nullptr) { - in = argv[1]; - ifs.open (in); + ifs.open (file); is = &ifs; } else { - in = "stdin"; + file = "stdin"; cin.exceptions (istream::failbit | istream::badbit); is = &cin; } - lexer l (*is, path (in)); + lexer l (*is, path (file)); // No use printing eos since we will either get it or loop forever. // for (token t; l.next (t) != token_type::eos; ) - cout << t << endl; + { + cout << t; + + if (loc) + cout << ' ' << t.file << ':' << t.line << ':' << t.column; + + cout << endl; + } } catch (const failed&) { diff --git a/unit-tests/cc/lexer/preprocessor.test b/unit-tests/cc/lexer/preprocessor.test index e082062..a3fab9f 100644 --- a/unit-tests/cc/lexer/preprocessor.test +++ b/unit-tests/cc/lexer/preprocessor.test @@ -32,10 +32,28 @@ EOI : line : -$* <>EOO +; # 1 "test.cxx" 2 +; + ; +# 4 +; #line 8 "z:\\tmp\\test.hxx" +; +#line 10 +; +# 5 "test.cxx" +; EOI +';' stdin:1:1 +';' test.cxx:1:1 +';' test.cxx:2:3 +';' test.cxx:4:1 +';' z:\tmp\test.hxx:8:1 +';' z:\tmp\test.hxx:10:1 +';' test.cxx:5:1 +EOO : nested : diff --git a/unit-tests/cc/lexer/raw-string-literal.test b/unit-tests/cc/lexer/raw-string-literal.test index e8e8b6b..7d5b920 100644 --- a/unit-tests/cc/lexer/raw-string-literal.test +++ b/unit-tests/cc/lexer/raw-string-literal.test @@ -62,29 +62,29 @@ EOO : invalid-no-paren : $* <'R"a"' 2>>EOE != 0 -stdin:1:2: error: invalid raw literal +stdin:1:2: error: invalid raw string literal EOE : invalid-paren : $* <'R")()("' 2>>EOE != 0 -stdin:1:2: error: invalid raw literal +stdin:1:2: error: invalid raw string literal EOE : invalid-unterminated-paren : $* <'R"(abc"' 2>>EOE != 0 -stdin:1:2: error: invalid raw literal +stdin:1:2: error: invalid raw string literal EOE : invalid-unterminated-delimiter : $* <'R"X(abc)"' 2>>EOE != 0 -stdin:1:2: error: invalid raw literal +stdin:1:2: error: invalid raw string literal EOE : invalid-unterminated-quote : $* <'R"X(abc)X' 2>>EOE != 0 -stdin:1:2: error: invalid raw literal +stdin:1:2: error: invalid raw string literal EOE diff --git a/unit-tests/cc/lexer/string-literal.test b/unit-tests/cc/lexer/string-literal.test index 062d290..f726c76 100644 --- a/unit-tests/cc/lexer/string-literal.test +++ b/unit-tests/cc/lexer/string-literal.test @@ -61,5 +61,5 @@ EOO : unterminated : $* <'"ab' 2>>EOE != 0 -stdin:1:1: error: unterminated literal +stdin:1:1: error: unterminated string literal EOE -- cgit v1.1