about summary refs log tree commit diff
diff options
context:
space:
mode:
author Boris Kolpackov <boris@codesynthesis.com> 2016-11-25 11:18:34 +0200
committer Boris Kolpackov <boris@codesynthesis.com> 2016-11-25 11:18:34 +0200
commit 28f8338ded34f160e0083da9be4679bc778be7ca (patch)
tree 7bd01311683d835f946c73d7d8220f552bae718f
parent f32bb0aceb00cfa4bd04eea72f8fa2fe02b738b3 (diff)
Distinguish token quoting type and completeness
-rw-r--r-- build2/lexer.cxx 74
-rw-r--r-- build2/parser.cxx 7
-rw-r--r-- build2/test/script/lexer.cxx 12
-rw-r--r-- build2/test/script/parser.cxx 21
-rw-r--r-- build2/token 23
-rw-r--r-- unit-tests/buildfile 2
-rw-r--r-- unit-tests/lexer/buildfile 13
-rw-r--r-- unit-tests/lexer/comment.test (renamed from unit-tests/test/script/lexer/comment.test) 7
-rw-r--r-- unit-tests/lexer/driver.cxx 94
-rw-r--r-- unit-tests/lexer/quoting.test 95
10 files changed, 315 insertions, 33 deletions
diff --git a/build2/lexer.cxx b/build2/lexer.cxx
index cf8a789..b73c291 100644
--- a/build2/lexer.cxx
+++ b/build2/lexer.cxx
@@ -295,7 +295,24 @@ namespace build2
uint64_t ln (c.line), cn (c.column);
string lexeme;
- bool quoted (m == lexer_mode::double_quoted);
+ quote_type qtype (m == lexer_mode::double_quoted
+ ? quote_type::double_
+ : quote_type::unquoted);
+
+ // If we are already in the quoted mode then we didn't start with the
+ // quote character.
+ //
+ bool qcomp (false);
+
+ auto append = [&lexeme, &m, &qcomp] (char c)
+ {
+ lexeme += c;
+
+ // An unquoted character after a quoted fragment.
+ //
+ if (qcomp && m != lexer_mode::double_quoted)
+ qcomp = false;
+ };
for (; !eos (c); c = peek ())
{
@@ -321,7 +338,7 @@ namespace build2
fail (p) << "unterminated escape sequence";
if (p != '\n') // Ignore if line continuation.
- lexeme += p;
+ append (p);
continue;
}
@@ -424,6 +441,22 @@ namespace build2
//
mode (lexer_mode::single_quoted);
+ switch (qtype)
+ {
+ case quote_type::unquoted:
+ qtype = quote_type::single;
+ qcomp = lexeme.empty ();
+ break;
+ case quote_type::single:
+ qcomp = false; // Non-contiguous.
+ break;
+ case quote_type::double_:
+ qtype = quote_type::mixed;
+ case quote_type::mixed:
+ qcomp = false;
+ break;
+ }
+
get ();
for (c = get (); !eos (c) && c != '\''; c = get ())
lexeme += c;
@@ -432,8 +465,6 @@ namespace build2
fail (c) << "unterminated single-quoted sequence";
state_.pop ();
-
- quoted = true;
continue;
}
case '\"':
@@ -444,7 +475,22 @@ namespace build2
st = state_.top ();
m = st.mode;
- quoted = true;
+ switch (qtype)
+ {
+ case quote_type::unquoted:
+ qtype = quote_type::double_;
+ qcomp = lexeme.empty ();
+ break;
+ case quote_type::double_:
+ qcomp = false; // Non-contiguous.
+ break;
+ case quote_type::single:
+ qtype = quote_type::mixed;
+ case quote_type::mixed:
+ qcomp = false;
+ break;
+ }
+
continue;
}
}
@@ -455,19 +501,27 @@ namespace build2
break;
get ();
- lexeme += c;
+ append (c);
}
- if (eos (c) && m == lexer_mode::double_quoted)
- fail (c) << "unterminated double-quoted sequence";
+ if (m == lexer_mode::double_quoted)
+ {
+ if (eos (c))
+ fail (c) << "unterminated double-quoted sequence";
+
+ // If we are still in the quoted mode then we didn't end with the quote
+ // character.
+ //
+ if (qcomp)
+ qcomp = false;
+ }
// Expire variable mode at the end of the word.
//
if (m == lexer_mode::variable)
state_.pop ();
- return token (move (lexeme), sep, quoted, ln, cn);
-
+ return token (move (lexeme), sep, qtype, qcomp, ln, cn);
}
bool lexer::
diff --git a/build2/parser.cxx b/build2/parser.cxx
index 5f9850d..c2737cb 100644
--- a/build2/parser.cxx
+++ b/build2/parser.cxx
@@ -2154,7 +2154,10 @@ namespace build2
tt != type::lparen) || peeked ().separated))
{
tt = type::word;
- t = token (move (concat_str), true, false, t.line, t.column);
+ t = token (move (concat_str),
+ true,
+ quote_type::unquoted, false,
+ t.line, t.column);
concat = false;
}
else if (!first)
@@ -2792,7 +2795,7 @@ namespace build2
//
// See tests/keyword.
//
- if (!t.quoted)
+ if (t.qtype == quote_type::unquoted)
{
// We cannot peek at the whole token here since it might have to be
// lexed in a different mode. So peek at its first character.
diff --git a/build2/test/script/lexer.cxx b/build2/test/script/lexer.cxx
index 5e6c66a..19e7498 100644
--- a/build2/test/script/lexer.cxx
+++ b/build2/test/script/lexer.cxx
@@ -137,7 +137,7 @@ namespace build2
break;
}
- if (r.quoted)
+ if (r.qtype != quote_type::unquoted)
++quoted_;
return r;
@@ -448,7 +448,10 @@ namespace build2
lexeme += c;
}
- return token (move (lexeme), false, false, ln, cn);
+ return token (move (lexeme),
+ false,
+ quote_type::unquoted, false,
+ ln, cn);
}
token lexer::
@@ -480,7 +483,10 @@ namespace build2
}
state_.pop (); // Expire the variable mode.
- return token (move (lexeme), sep, false, ln, cn);
+ return token (move (lexeme),
+ sep,
+ quote_type::unquoted, false,
+ ln, cn);
}
}
}
diff --git a/build2/test/script/parser.cxx b/build2/test/script/parser.cxx
index 9afef75..a116873 100644
--- a/build2/test/script/parser.cxx
+++ b/build2/test/script/parser.cxx
@@ -321,7 +321,7 @@ namespace build2
//
lt = line_type::cmd; // Default.
- if (tt == type::word && !t.quoted)
+ if (tt == type::word && t.qtype == quote_type::unquoted)
{
const string& n (t.value);
@@ -353,7 +353,7 @@ namespace build2
//
lt = line_type::cmd; // Default.
- if (tt == type::word && !t.quoted)
+ if (tt == type::word && t.qtype == quote_type::unquoted)
{
const string& n (t.value);
@@ -719,7 +719,7 @@ namespace build2
const token& p (peeked ());
const location ll (get_location (p));
- if (pt == type::word && !p.quoted)
+ if (pt == type::word && p.qtype == quote_type::unquoted)
{
if (p.value == "elif") lt = line_type::cmd_elif;
else if (p.value == "elif!") lt = line_type::cmd_elifn;
@@ -1652,7 +1652,7 @@ namespace build2
//
next (t, tt);
- if (tt != type::word || t.quoted)
+ if (tt != type::word || t.qtype != quote_type::unquoted)
fail (l) << "expected here-document end marker";
hd.push_back (here_doc {0, 0, 0, move (t.value), nn});
@@ -1751,7 +1751,8 @@ namespace build2
// quoted (note that the current token is "next" and is not part
// of this).
//
- bool q ((quoted () - (t.quoted ? 1 : 0)) != 0);
+ bool q ((quoted () -
+ (t.qtype != quote_type::unquoted ? 1 : 0)) != 0);
for (name& n: ns)
{
@@ -2074,7 +2075,9 @@ namespace build2
// Check if this is the end marker. For starters, it should be a
// single, unquoted word followed by a newline.
//
- if (tt == type::word && !t.quoted && peek () == type::newline)
+ if (tt == type::word &&
+ t.qtype == quote_type::unquoted &&
+ peek () == type::newline)
{
const string& v (t.value);
@@ -2652,7 +2655,7 @@ namespace build2
// Examine tokens we have replayed since last reset.
//
for (size_t i (replay_quoted_); i != replay_i_; ++i)
- if (replay_data_[i].token.quoted)
+ if (replay_data_[i].token.qtype != quote_type::unquoted)
++r;
}
@@ -2663,14 +2666,14 @@ namespace build2
reset_quoted (token& cur)
{
if (replay_ != replay::play)
- lexer_->reset_quoted (cur.quoted ? 1 : 0);
+ lexer_->reset_quoted (cur.qtype != quote_type::unquoted ? 1 : 0);
else
{
replay_quoted_ = replay_i_ - 1;
// Must be the same token.
//
- assert (replay_data_[replay_quoted_].token.quoted == cur.quoted);
+ assert (replay_data_[replay_quoted_].token.qtype == cur.qtype);
}
}
diff --git a/build2/token b/build2/token
index b3ebf5b..df25d4c 100644
--- a/build2/token
+++ b/build2/token
@@ -56,6 +56,11 @@ namespace build2
value_type v_;
};
+ // Token can be unquoted, single-quoted ('') or double-quoted (""). It can
+ // also be mixed (contain both single- and double-quoted fragments).
+ //
+ enum class quote_type {unquoted, single, double_, mixed};
+
class token;
void
@@ -68,7 +73,13 @@ namespace build2
token_type type;
bool separated; // Whitespace-separated from the previous token.
- bool quoted; // Word (or some part of it) was quoted.
+
+ // Quoting can be complete, where the token starts and ends with the quote
+ // characters and quoting is contiguous, or partial, where only some part(s)
+ // of the token are quoted or quoting continues to the next token.
+ //
+ quote_type qtype;
+ bool qcomp;
string value; // Only valid for word.
@@ -82,12 +93,16 @@ namespace build2
: token (token_type::eos, false, 0, 0, token_printer) {}
token (token_type t, bool s, uint64_t l, uint64_t c, printer_type* p)
- : type (t), separated (s), quoted (false),
+ : type (t), separated (s), qtype (quote_type::unquoted),
line (l), column (c),
printer (p) {}
- token (string v, bool s, bool q, uint64_t l, uint64_t c)
- : type (token_type::word), separated (s), quoted (q), value (move (v)),
+ token (string v, bool s,
+ quote_type qt, bool qc,
+ uint64_t l, uint64_t c)
+ : type (token_type::word), separated (s),
+ qtype (qt), qcomp (qc),
+ value (move (v)),
line (l), column (c),
printer (&token_printer) {}
};
diff --git a/unit-tests/buildfile b/unit-tests/buildfile
index 5d06ec7..f8cfb9d 100644
--- a/unit-tests/buildfile
+++ b/unit-tests/buildfile
@@ -2,6 +2,6 @@
# copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
# license : MIT; see accompanying LICENSE file
-d = function/ test/script/
+d = function/ lexer/ test/script/
./: $d
include $d
diff --git a/unit-tests/lexer/buildfile b/unit-tests/lexer/buildfile
new file mode 100644
index 0000000..d9bd2df
--- /dev/null
+++ b/unit-tests/lexer/buildfile
@@ -0,0 +1,13 @@
+# file : unit-tests/lexer/buildfile
+# copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+#@@ Temporary until we get utility library support.
+#
+import libs = libbutl%lib{butl}
+src = token lexer diagnostics utility variable name b-options types-parsers
+
+exe{driver}: cxx{driver} ../../build2/cxx{$src} $libs \
+test{comment quoting}
+
+include ../../build2/
diff --git a/unit-tests/test/script/lexer/comment.test b/unit-tests/lexer/comment.test
index 0092ed9..07d7ac5 100644
--- a/unit-tests/test/script/lexer/comment.test
+++ b/unit-tests/lexer/comment.test
@@ -1,7 +1,6 @@
-# @@ This one should be moved to build2/lexer since we use base lexer
-# functionality as is.
-#
-test.arguments += script-line
+# file : unit-tests/lexer/comment.test
+# copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
# Single-line comments.
diff --git a/unit-tests/lexer/driver.cxx b/unit-tests/lexer/driver.cxx
new file mode 100644
index 0000000..326ac8a
--- /dev/null
+++ b/unit-tests/lexer/driver.cxx
@@ -0,0 +1,94 @@
+// file : unit-tests/lexer/driver.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <cassert>
+#include <iostream>
+
+#include <build2/types>
+#include <build2/utility>
+
+#include <build2/token>
+#include <build2/lexer>
+
+using namespace std;
+
+namespace build2
+{
+ // Usage: argv[0] [-q] [<lexer-mode>]
+ // Lexes stdin and prints one token per line; -q also prints quoting info.
+ int
+ main (int argc, char* argv[])
+ {
+ bool quote (false); // Set by -q: annotate quoted tokens with type/completeness.
+ lexer_mode m (lexer_mode::normal); // Mode to lex in (normal unless specified).
+
+ for (int i (1); i != argc; ++i)
+ {
+ string a (argv[i]);
+
+ if (a == "-q")
+ quote = true;
+ else
+ {
+ if (a == "normal") m = lexer_mode::normal;
+ else if (a == "variable") m = lexer_mode::variable;
+ else if (a == "value") m = lexer_mode::value;
+ else if (a == "attribute") m = lexer_mode::attribute;
+ else if (a == "eval") m = lexer_mode::eval;
+ else assert (false); // Unrecognized lexer mode name.
+ break; // The mode is the last argument examined.
+ }
+ }
+
+ try
+ {
+ cin.exceptions (istream::failbit | istream::badbit);
+
+ // Most alternative modes auto-expire so we need something underneath.
+ // Thus the lexer is created first and the mode is then set on top.
+ lexer l (cin, path ("stdin"));
+
+ if (m != lexer_mode::normal)
+ l.mode (m);
+
+ // No use printing eos since we will either get it or loop forever.
+ //
+ for (token t (l.next ()); t.type != token_type::eos; t = l.next ())
+ {
+ // Print each token on a separate line without quoting operators.
+ //
+ t.printer (cout, t, false);
+
+ if (quote)
+ {
+ char q ('\0'); // Stays '\0' for unquoted tokens.
+ switch (t.qtype)
+ {
+ case quote_type::single: q = 'S'; break;
+ case quote_type::double_: q = 'D'; break;
+ case quote_type::mixed: q = 'M'; break;
+ case quote_type::unquoted: break; // Nothing is printed for unquoted.
+ }
+
+ if (q != '\0')
+ cout << " [" << q << (t.qcomp ? "/C" : "/P") << ']'; // C: complete, P: partial.
+ }
+
+ cout << endl;
+ }
+ }
+ catch (const failed&)
+ {
+ return 1; // Non-zero exit status if build2::failed was thrown.
+ }
+
+ return 0;
+ }
+}
+
+int
+main (int argc, char* argv[])
+{
+ return build2::main (argc, argv); // Forward to the driver in namespace build2.
+}
diff --git a/unit-tests/lexer/quoting.test b/unit-tests/lexer/quoting.test
new file mode 100644
index 0000000..76fd904
--- /dev/null
+++ b/unit-tests/lexer/quoting.test
@@ -0,0 +1,95 @@
+# file : unit-tests/lexer/quoting.test
+# copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+test.options += -q
+
+: unquoted
+:
+$* <'foo' >>EOO
+'foo'
+<newline>
+EOO
+
+: single-comp
+:
+$* <":'foo':" >>EOO
+:
+'foo' [S/C]
+:
+<newline>
+EOO
+
+: double-comp
+:
+$* <':"foo":' >>EOO
+:
+'foo' [D/C]
+:
+<newline>
+EOO
+
+: single-empty-comp
+:
+$* <"''" >>EOO
+'' [S/C]
+<newline>
+EOO
+
+: double-empty-comp
+:
+$* <'""' >>EOO
+'' [D/C]
+<newline>
+EOO
+
+: part-start-quoted
+: Token start already quoted
+:
+$* <'"$foo"' >>EOO
+'' [D/P]
+\$
+'foo' [D/P]
+<newline>
+EOO
+
+: part-end-quoted
+: Token end still quoted
+:
+$* <'"foo$"' >>EOO
+'foo' [D/P]
+\$
+'' [D/P]
+<newline>
+EOO
+
+: part-start-unquoted
+: Token starts with unquoted character
+:
+$* <'f"oo"' >>EOO
+'foo' [D/P]
+<newline>
+EOO
+
+: part-unquoted
+: Token continues with unquoted character
+:
+$* <'"fo"o' >>EOO
+'foo' [D/P]
+<newline>
+EOO
+
+: part-unquoted-escape
+: Token continues with unquoted escaped character
+:
+$* <'"fo"\"' >>EOO
+'fo"' [D/P]
+<newline>
+EOO
+
+: mixed
+:
+$* <"\"fo\"'o'" >>EOO
+'foo' [M/P]
+<newline>
+EOO