From 28f8338ded34f160e0083da9be4679bc778be7ca Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Fri, 25 Nov 2016 11:18:34 +0200 Subject: Distinguish token quoting type and completeness --- build2/lexer.cxx | 74 ++++++++++++++++--- build2/parser.cxx | 7 +- build2/test/script/lexer.cxx | 12 +++- build2/test/script/parser.cxx | 21 +++--- build2/token | 23 ++++-- unit-tests/buildfile | 2 +- unit-tests/lexer/buildfile | 13 ++++ unit-tests/lexer/comment.test | 112 +++++++++++++++++++++++++++++ unit-tests/lexer/driver.cxx | 94 +++++++++++++++++++++++++ unit-tests/lexer/quoting.test | 95 +++++++++++++++++++++++++ unit-tests/test/script/lexer/comment.test | 113 ------------------------------ 11 files changed, 424 insertions(+), 142 deletions(-) create mode 100644 unit-tests/lexer/buildfile create mode 100644 unit-tests/lexer/comment.test create mode 100644 unit-tests/lexer/driver.cxx create mode 100644 unit-tests/lexer/quoting.test delete mode 100644 unit-tests/test/script/lexer/comment.test diff --git a/build2/lexer.cxx b/build2/lexer.cxx index cf8a789..b73c291 100644 --- a/build2/lexer.cxx +++ b/build2/lexer.cxx @@ -295,7 +295,24 @@ namespace build2 uint64_t ln (c.line), cn (c.column); string lexeme; - bool quoted (m == lexer_mode::double_quoted); + quote_type qtype (m == lexer_mode::double_quoted + ? quote_type::double_ + : quote_type::unquoted); + + // If we are already in the quoted mode then we didn't start with the + // quote character. + // + bool qcomp (false); + + auto append = [&lexeme, &m, &qcomp] (char c) + { + lexeme += c; + + // An unquoted character after a quoted fragment. + // + if (qcomp && m != lexer_mode::double_quoted) + qcomp = false; + }; for (; !eos (c); c = peek ()) { @@ -321,7 +338,7 @@ namespace build2 fail (p) << "unterminated escape sequence"; if (p != '\n') // Ignore if line continuation. 
- lexeme += p; + append (p); continue; } @@ -424,6 +441,22 @@ namespace build2 // mode (lexer_mode::single_quoted); + switch (qtype) + { + case quote_type::unquoted: + qtype = quote_type::single; + qcomp = lexeme.empty (); + break; + case quote_type::single: + qcomp = false; // Non-contiguous. + break; + case quote_type::double_: + qtype = quote_type::mixed; + case quote_type::mixed: + qcomp = false; + break; + } + get (); for (c = get (); !eos (c) && c != '\''; c = get ()) lexeme += c; @@ -432,8 +465,6 @@ namespace build2 fail (c) << "unterminated single-quoted sequence"; state_.pop (); - - quoted = true; continue; } case '\"': @@ -444,7 +475,22 @@ namespace build2 st = state_.top (); m = st.mode; - quoted = true; + switch (qtype) + { + case quote_type::unquoted: + qtype = quote_type::double_; + qcomp = lexeme.empty (); + break; + case quote_type::double_: + qcomp = false; // Non-contiguous. + break; + case quote_type::single: + qtype = quote_type::mixed; + case quote_type::mixed: + qcomp = false; + break; + } + continue; } } @@ -455,19 +501,27 @@ namespace build2 break; get (); - lexeme += c; + append (c); } - if (eos (c) && m == lexer_mode::double_quoted) - fail (c) << "unterminated double-quoted sequence"; + if (m == lexer_mode::double_quoted) + { + if (eos (c)) + fail (c) << "unterminated double-quoted sequence"; + + // If we are still in the quoted mode then we didn't end with the quote + // character. + // + if (qcomp) + qcomp = false; + } // Expire variable mode at the end of the word. 
// if (m == lexer_mode::variable) state_.pop (); - return token (move (lexeme), sep, quoted, ln, cn); - + return token (move (lexeme), sep, qtype, qcomp, ln, cn); } bool lexer:: diff --git a/build2/parser.cxx b/build2/parser.cxx index 5f9850d..c2737cb 100644 --- a/build2/parser.cxx +++ b/build2/parser.cxx @@ -2154,7 +2154,10 @@ namespace build2 tt != type::lparen) || peeked ().separated)) { tt = type::word; - t = token (move (concat_str), true, false, t.line, t.column); + t = token (move (concat_str), + true, + quote_type::unquoted, false, + t.line, t.column); concat = false; } else if (!first) @@ -2792,7 +2795,7 @@ namespace build2 // // See tests/keyword. // - if (!t.quoted) + if (t.qtype == quote_type::unquoted) { // We cannot peek at the whole token here since it might have to be // lexed in a different mode. So peek at its first character. diff --git a/build2/test/script/lexer.cxx b/build2/test/script/lexer.cxx index 5e6c66a..19e7498 100644 --- a/build2/test/script/lexer.cxx +++ b/build2/test/script/lexer.cxx @@ -137,7 +137,7 @@ namespace build2 break; } - if (r.quoted) + if (r.qtype != quote_type::unquoted) ++quoted_; return r; @@ -448,7 +448,10 @@ namespace build2 lexeme += c; } - return token (move (lexeme), false, false, ln, cn); + return token (move (lexeme), + false, + quote_type::unquoted, false, + ln, cn); } token lexer:: @@ -480,7 +483,10 @@ namespace build2 } state_.pop (); // Expire the variable mode. - return token (move (lexeme), sep, false, ln, cn); + return token (move (lexeme), + sep, + quote_type::unquoted, false, + ln, cn); } } } diff --git a/build2/test/script/parser.cxx b/build2/test/script/parser.cxx index 9afef75..a116873 100644 --- a/build2/test/script/parser.cxx +++ b/build2/test/script/parser.cxx @@ -321,7 +321,7 @@ namespace build2 // lt = line_type::cmd; // Default. 
- if (tt == type::word && !t.quoted) + if (tt == type::word && t.qtype == quote_type::unquoted) { const string& n (t.value); @@ -353,7 +353,7 @@ namespace build2 // lt = line_type::cmd; // Default. - if (tt == type::word && !t.quoted) + if (tt == type::word && t.qtype == quote_type::unquoted) { const string& n (t.value); @@ -719,7 +719,7 @@ namespace build2 const token& p (peeked ()); const location ll (get_location (p)); - if (pt == type::word && !p.quoted) + if (pt == type::word && p.qtype == quote_type::unquoted) { if (p.value == "elif") lt = line_type::cmd_elif; else if (p.value == "elif!") lt = line_type::cmd_elifn; @@ -1652,7 +1652,7 @@ namespace build2 // next (t, tt); - if (tt != type::word || t.quoted) + if (tt != type::word || t.qtype != quote_type::unquoted) fail (l) << "expected here-document end marker"; hd.push_back (here_doc {0, 0, 0, move (t.value), nn}); @@ -1751,7 +1751,8 @@ namespace build2 // quoted (note that the current token is "next" and is not part // of this). // - bool q ((quoted () - (t.quoted ? 1 : 0)) != 0); + bool q ((quoted () - + (t.qtype != quote_type::unquoted ? 1 : 0)) != 0); for (name& n: ns) { @@ -2074,7 +2075,9 @@ namespace build2 // Check if this is the end marker. For starters, it should be a // single, unquoted word followed by a newline. // - if (tt == type::word && !t.quoted && peek () == type::newline) + if (tt == type::word && + t.qtype == quote_type::unquoted && + peek () == type::newline) { const string& v (t.value); @@ -2652,7 +2655,7 @@ namespace build2 // Examine tokens we have replayed since last reset. // for (size_t i (replay_quoted_); i != replay_i_; ++i) - if (replay_data_[i].token.quoted) + if (replay_data_[i].token.qtype != quote_type::unquoted) ++r; } @@ -2663,14 +2666,14 @@ namespace build2 reset_quoted (token& cur) { if (replay_ != replay::play) - lexer_->reset_quoted (cur.quoted ? 1 : 0); + lexer_->reset_quoted (cur.qtype != quote_type::unquoted ? 
1 : 0); else { replay_quoted_ = replay_i_ - 1; // Must be the same token. // - assert (replay_data_[replay_quoted_].token.quoted == cur.quoted); + assert (replay_data_[replay_quoted_].token.qtype == cur.qtype); } } diff --git a/build2/token b/build2/token index b3ebf5b..df25d4c 100644 --- a/build2/token +++ b/build2/token @@ -56,6 +56,11 @@ namespace build2 value_type v_; }; + // Token can be unquoted, single-quoted ('') or double-quoted (""). It can + // also be mixed. + // + enum class quote_type {unquoted, single, double_, mixed}; + class token; void @@ -68,7 +73,13 @@ namespace build2 token_type type; bool separated; // Whitespace-separated from the previous token. - bool quoted; // Word (or some part of it) was quoted. + + // Quoting can be complete, where the token starts and ends with the quote + // characters and quoting is contiguous or partial where only some part(s) + // of the token are quoted or quoting continues to the next token. + // + quote_type qtype; + bool qcomp; string value; // Only valid for word. 
@@ -82,12 +93,16 @@ namespace build2 : token (token_type::eos, false, 0, 0, token_printer) {} token (token_type t, bool s, uint64_t l, uint64_t c, printer_type* p) - : type (t), separated (s), quoted (false), + : type (t), separated (s), qtype (quote_type::unquoted), line (l), column (c), printer (p) {} - token (string v, bool s, bool q, uint64_t l, uint64_t c) - : type (token_type::word), separated (s), quoted (q), value (move (v)), + token (string v, bool s, + quote_type qt, bool qc, + uint64_t l, uint64_t c) + : type (token_type::word), separated (s), + qtype (qt), qcomp (qc), + value (move (v)), line (l), column (c), printer (&token_printer) {} }; diff --git a/unit-tests/buildfile b/unit-tests/buildfile index 5d06ec7..f8cfb9d 100644 --- a/unit-tests/buildfile +++ b/unit-tests/buildfile @@ -2,6 +2,6 @@ # copyright : Copyright (c) 2014-2016 Code Synthesis Ltd # license : MIT; see accompanying LICENSE file -d = function/ test/script/ +d = function/ lexer/ test/script/ ./: $d include $d diff --git a/unit-tests/lexer/buildfile b/unit-tests/lexer/buildfile new file mode 100644 index 0000000..d9bd2df --- /dev/null +++ b/unit-tests/lexer/buildfile @@ -0,0 +1,13 @@ +# file : unit-tests/lexer/buildfile +# copyright : Copyright (c) 2014-2016 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +#@@ Temporary until we get utility library support. +# +import libs = libbutl%lib{butl} +src = token lexer diagnostics utility variable name b-options types-parsers + +exe{driver}: cxx{driver} ../../build2/cxx{$src} $libs \ +test{comment quoting} + +include ../../build2/ diff --git a/unit-tests/lexer/comment.test b/unit-tests/lexer/comment.test new file mode 100644 index 0000000..07d7ac5 --- /dev/null +++ b/unit-tests/lexer/comment.test @@ -0,0 +1,112 @@ +# file : unit-tests/lexer/comment.test +# copyright : Copyright (c) 2014-2016 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +# Single-line comments. 
+ +$* <>:EOO # single-only +# comment +EOI +EOO + +$* <>EOO # single-first +# comment +foo +EOI +'foo' + +EOO + +$* <>EOO # single-last +foo +# comment +EOI +'foo' + +EOO + +$* <>EOO # single-few +foo +# comment +# comment +EOI +'foo' + +EOO + +$* <>EOO # single-cont +foo +# comment\\ +bar +EOI +'foo' + +'bar' + +EOO + +$* <>EOO # single-same +foo # comment +bar # comment +EOI +'foo' + +'bar' + +EOO + +# Multi-line comments. +# + +$* <>:EOO # multi-only +#\\ +comment +comment +#\\ +EOI +EOO + +$* <>:EOO # multi-empty +#\\ +#\\ +EOI +EOO + +$* <>EOO # multi-start-same +foo #\\ +comment +comment +#\\ +EOI +'foo' + +EOO + +$* <>EOO # multi-end-same +#\\ +comment +comment +foo #\\ +bar +EOI +'bar' + +EOO + +$* <>EOO # multi-end-not +#\\ +comment +#\\ not an end +foo #\\ +bar +EOI +'bar' + +EOO + +$* <>EOE != 0 # multi-unterm +#\\ +comment +EOI +stdin:3:1: error: unterminated multi-line comment +EOE diff --git a/unit-tests/lexer/driver.cxx b/unit-tests/lexer/driver.cxx new file mode 100644 index 0000000..326ac8a --- /dev/null +++ b/unit-tests/lexer/driver.cxx @@ -0,0 +1,94 @@ +// file : unit-tests/lexer/driver.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include +#include + +#include +#include + +#include +#include + +using namespace std; + +namespace build2 +{ + // Usage: argv[0] [-q] [] + // + int + main (int argc, char* argv[]) + { + bool quote (false); + lexer_mode m (lexer_mode::normal); + + for (int i (1); i != argc; ++i) + { + string a (argv[i]); + + if (a == "-q") + quote = true; + else + { + if (a == "normal") m = lexer_mode::normal; + else if (a == "variable") m = lexer_mode::variable; + else if (a == "value") m = lexer_mode::value; + else if (a == "attribute") m = lexer_mode::attribute; + else if (a == "eval") m = lexer_mode::eval; + else assert (false); + break; + } + } + + try + { + cin.exceptions (istream::failbit | istream::badbit); + + // Most alternative modes auto-expire 
so we need something underneath. + // + lexer l (cin, path ("stdin")); + + if (m != lexer_mode::normal) + l.mode (m); + + // No use printing eos since we will either get it or loop forever. + // + for (token t (l.next ()); t.type != token_type::eos; t = l.next ()) + { + // Print each token on a separate line without quoting operators. + // + t.printer (cout, t, false); + + if (quote) + { + char q ('\0'); + switch (t.qtype) + { + case quote_type::single: q = 'S'; break; + case quote_type::double_: q = 'D'; break; + case quote_type::mixed: q = 'M'; break; + case quote_type::unquoted: break; + } + + if (q != '\0') + cout << " [" << q << (t.qcomp ? "/C" : "/P") << ']'; + } + + cout << endl; + } + } + catch (const failed&) + { + return 1; + } + + return 0; + } +} + +int +main (int argc, char* argv[]) +{ + return build2::main (argc, argv); +} diff --git a/unit-tests/lexer/quoting.test b/unit-tests/lexer/quoting.test new file mode 100644 index 0000000..76fd904 --- /dev/null +++ b/unit-tests/lexer/quoting.test @@ -0,0 +1,95 @@ +# file : unit-tests/lexer/quoting.test +# copyright : Copyright (c) 2014-2016 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +test.options += -q + +: unquoted +: +$* <'foo' >>EOO +'foo' + +EOO + +: single-comp +: +$* <":'foo':" >>EOO +: +'foo' [S/C] +: + +EOO + +: double-comp +: +$* <':"foo":' >>EOO +: +'foo' [D/C] +: + +EOO + +: single-empty-comp +: +$* <"''" >>EOO +'' [S/C] + +EOO + +: double-empty-comp +: +$* <'""' >>EOO +'' [D/C] + +EOO + +: part-start-quoted +: Token start already quoted +: +$* <'"$foo"' >>EOO +'' [D/P] +\$ +'foo' [D/P] + +EOO + +: part-end-quoted +: Token end still quoted +: +$* <'"foo$"' >>EOO +'foo' [D/P] +\$ +'' [D/P] + +EOO + +: part-start-unquoted +: Token starts with unquoted character +: +$* <'f"oo"' >>EOO +'foo' [D/P] + +EOO + +: part-unquoted +: Token continues with unquoted character +: +$* <'"fo"o' >>EOO +'foo' [D/P] + +EOO + +: part-unquoted-escape +: Token continues with unquoted escaped 
character +: +$* <'"fo"\"' >>EOO +'fo"' [D/P] + +EOO + +: mixed +: +$* <"\"fo\"'o'" >>EOO +'foo' [M/P] + +EOO diff --git a/unit-tests/test/script/lexer/comment.test b/unit-tests/test/script/lexer/comment.test deleted file mode 100644 index 0092ed9..0000000 --- a/unit-tests/test/script/lexer/comment.test +++ /dev/null @@ -1,113 +0,0 @@ -# @@ This one should be moved to build2/lexer since we use base lexer -# functionality as is. -# -test.arguments += script-line - -# Single-line comments. - -$* <>:EOO # single-only -# comment -EOI -EOO - -$* <>EOO # single-first -# comment -foo -EOI -'foo' - -EOO - -$* <>EOO # single-last -foo -# comment -EOI -'foo' - -EOO - -$* <>EOO # single-few -foo -# comment -# comment -EOI -'foo' - -EOO - -$* <>EOO # single-cont -foo -# comment\\ -bar -EOI -'foo' - -'bar' - -EOO - -$* <>EOO # single-same -foo # comment -bar # comment -EOI -'foo' - -'bar' - -EOO - -# Multi-line comments. -# - -$* <>:EOO # multi-only -#\\ -comment -comment -#\\ -EOI -EOO - -$* <>:EOO # multi-empty -#\\ -#\\ -EOI -EOO - -$* <>EOO # multi-start-same -foo #\\ -comment -comment -#\\ -EOI -'foo' - -EOO - -$* <>EOO # multi-end-same -#\\ -comment -comment -foo #\\ -bar -EOI -'bar' - -EOO - -$* <>EOO # multi-end-not -#\\ -comment -#\\ not an end -foo #\\ -bar -EOI -'bar' - -EOO - -$* <>EOE != 0 # multi-unterm -#\\ -comment -EOI -stdin:3:1: error: unterminated multi-line comment -EOE -- cgit v1.1