aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Boris Kolpackov <boris@codesynthesis.com> 2022-12-15 11:24:18 +0200
committer: Boris Kolpackov <boris@codesynthesis.com> 2022-12-15 11:24:18 +0200
commit: 3ca670b7b7c71ca67d70cac9dffb2ba6120b2e36 (patch)
tree: 1424ac78fe10f697c8a0b63d91bb49889e8cdc85
parent: 0aa7a94e1032a96a2a72cb6a82824f9fe970d412 (diff)
Improve escape sequence support
Specifically:

1. In the double-quoted strings we now only do effective escaping of the
   special `$("\` characters plus `)` for symmetry.

2. There is now support for "escape sequence expansion" in the form $\X
   where \X can be any of the C/C++ simple escape sequences (\n, \t, etc)
   plus \0 (which in C/C++ is an octal escape sequence). For example:

   info "foo$\n$\tbar$\n$\tbaz"

   Will print:

   buildfile:1:1: info: foo
   	bar
   	baz
-rw-r--r-- libbuild2/lexer.cxx 96
-rw-r--r-- libbuild2/lexer.hxx 19
-rw-r--r-- libbuild2/parser.cxx 292
-rw-r--r-- libbuild2/test/script/lexer.cxx 8
-rw-r--r-- libbuild2/test/script/lexer.hxx 2
-rw-r--r-- libbuild2/token.cxx 21
-rw-r--r-- libbuild2/token.hxx 8
-rw-r--r-- tests/expansion/escape.testscript 17
8 files changed, 291 insertions, 172 deletions
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx
index 9176422..d82c135 100644
--- a/libbuild2/lexer.cxx
+++ b/libbuild2/lexer.cxx
@@ -713,9 +713,9 @@ namespace build2
}
token lexer::
- word (state st, bool sep)
+ word (const state& rst, bool sep)
{
- lexer_mode m (st.mode);
+ lexer_mode m (rst.mode);
xchar c (peek ());
assert (!eos (c));
@@ -746,22 +746,66 @@ namespace build2
lexeme += c;
};
- for (; !eos (c); c = peek ())
+ const state* st (&rst);
+ for (bool first (true); !eos (c); first = false, c = peek ())
{
// First handle escape sequences.
//
if (c == '\\')
{
- // In the variable mode we treat the beginning of the escape sequence
- // as a separator (think \"$foo\").
+ // In the variable mode we treat immediate `\` as the escape sequence
+ // literal and any following as a separator (think \"$foo\").
//
if (m == lexer_mode::variable)
- break;
+ {
+ if (!first)
+ break;
+
+ get ();
+ c = get ();
+
+ if (eos (c))
+ fail (c) << "unterminated escape sequence";
+
+ // For now we only support all the simple C/C++ escape sequences
+ // plus \0 (which in C/C++ is an octal escape sequence).
+ //
+ // In the future we may decide to support more elaborate sequences
+ // such as \xNN, \uNNNN, etc.
+ //
+ // Note: we return it in the literal form instead of translating for
+ // easier printing.
+ //
+ switch (c)
+ {
+ case '\'':
+ case '"':
+ case '?':
+ case '\\':
+ case '0':
+ case 'a':
+ case 'b':
+ case 'f':
+ case 'n':
+ case 'r':
+ case 't':
+ case 'v': lexeme = c; break;
+ default:
+ fail (c) << "unknown escape sequence \\" << c;
+ }
+
+ state_.pop ();
+ return token (type::escape,
+ move (lexeme),
+ sep,
+ qtype, qcomp, qfirst,
+ ln, cn);
+ }
get ();
xchar p (peek ());
- const char* esc (st.escapes);
+ const char* esc (st->escapes);
if (esc == nullptr ||
(*esc != '\0' && !eos (p) && strchr (esc, p) != nullptr))
@@ -777,7 +821,7 @@ namespace build2
continue;
}
else
- unget (c); // Treat as a normal character.
+ unget (c); // Fall through to treat as a normal character.
}
bool done (false);
@@ -806,8 +850,8 @@ namespace build2
get ();
state_.pop ();
- st = state_.top ();
- m = st.mode;
+ st = &state_.top ();
+ m = st->mode;
continue;
}
}
@@ -816,19 +860,17 @@ namespace build2
//
else if (m == lexer_mode::variable)
{
- bool first (lexeme.empty ());
-
// Handle special variable names, if any.
//
- if (first &&
- st.data != 0 &&
- strchr (reinterpret_cast<const char*> (st.data), c) != nullptr)
+ if (first &&
+ st->data != 0 &&
+ strchr (reinterpret_cast<const char*> (st->data), c) != nullptr)
{
get ();
lexeme += c;
done = true;
}
- else if (c != '_' && !(first ? alpha (c) : alnum (c)))
+ else if (c != '_' && !(lexeme.empty () ? alpha (c) : alnum (c)))
{
if (c != '.')
done = true;
@@ -848,17 +890,17 @@ namespace build2
{
// First check if it's a pair separator.
//
- if (c == st.sep_pair)
+ if (c == st->sep_pair)
done = true;
else
{
// Then see if this character or character sequence is a separator.
//
- for (const char* p (strchr (st.sep_first, c));
+ for (const char* p (strchr (st->sep_first, c));
p != nullptr;
p = done ? nullptr : strchr (p + 1, c))
{
- char s (st.sep_second[p - st.sep_first]);
+ char s (st->sep_second[p - st->sep_first]);
// See if it has a second.
//
@@ -876,13 +918,19 @@ namespace build2
// Handle single and double quotes if enabled for this mode and unless
// they were considered separators.
//
- if (st.quotes && !done)
+ if (st->quotes && !done)
{
auto quoted_mode = [this] (lexer_mode m)
{
+ // In the double-quoted mode we only do effective escaping of the
+ // special `$("\` characters plus `)` for symmetry. Nothing can be
+ // escaped in single-quoted.
+ //
+ const char* esc (m == lexer_mode::double_quoted ? "$()\"\\" : "");
+
state_.push (state {
m, 0, nullopt, false, false, '\0', false, true, true,
- state_.top ().escapes, nullptr, nullptr});
+ esc, nullptr, nullptr});
};
switch (c)
@@ -933,8 +981,8 @@ namespace build2
quoted_mode (lexer_mode::double_quoted);
- st = state_.top ();
- m = st.mode;
+ st = &state_.top ();
+ m = st->mode;
switch (qtype)
{
@@ -1090,6 +1138,8 @@ namespace build2
}
case '\\':
{
+ // See if this is line continuation.
+ //
get ();
if (peek () == '\n')
diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx
index 4371206..e913829 100644
--- a/libbuild2/lexer.hxx
+++ b/libbuild2/lexer.hxx
@@ -26,14 +26,15 @@ namespace build2
// mode we don't treat certain characters (e.g., `+`, `=`) as special so
// that we can use them in the variable values, e.g., `foo = g++`. In
// contrast, in the variable mode, we restrict certain character (e.g., `/`)
- // from appearing in the name. The values mode is like value but recogizes
- // `,` as special (used in contexts where we need to list multiple
- // values). The attributes/attribute_value modes are like values where each
- // value is potentially a variable assignment; they don't treat `{` and `}`
- // as special (so we cannot have name groups in attributes) as well as
- // recognizes `=` and `]`. The subscript mode is like value but doesn't
- // treat `{` and `}` as special and recognizes `]`. The eval mode is used in
- // the evaluation context.
+ // from appearing in the name. Additionally, in the variable mode we
+ // recognize leading `\` as the beginning of the escape sequence ($\n). The
+ // values mode is like value but recognizes `,` as special (used in contexts
+ // where we need to list multiple values). The attributes/attribute_value
+ // modes are like values where each value is potentially a variable
+ // assignment; they don't treat `{` and `}` as special (so we cannot have
+ // name groups in attributes) as well as recognize `=` and `]`. The
+ // subscript mode is like value but doesn't treat `{` and `}` as special and
+ // recognizes `]`. The eval mode is used in the evaluation context.
//
// A number of modes are "derived" from the value/values mode by recognizing
// a few extra characters:
@@ -262,7 +263,7 @@ namespace build2
// been "expired" from the top).
//
virtual token
- word (state current, bool separated);
+ word (const state& current, bool separated);
// Return true in first if we have seen any spaces. Skipped empty lines
// don't count. In other words, we are only interested in spaces that are
diff --git a/libbuild2/parser.cxx b/libbuild2/parser.cxx
index b118cee..2507a02 100644
--- a/libbuild2/parser.cxx
+++ b/libbuild2/parser.cxx
@@ -7357,11 +7357,15 @@ namespace build2
// token is a paren or a word, we turn it on and switch to the eval
// mode if what we get next is a paren.
//
- // Also sniff out the special variables string from mode data for
- // the ad hoc $() handling below.
- //
mode (lexer_mode::variable);
+ // Sniff out the special variables string from mode data and use
+ // that to recognize special variables in the ad hoc $() handling
+ // below.
+ //
+ // Note: must be done before calling next() which may expire the
+ // mode.
+ //
auto special = [s = reinterpret_cast<const char*> (mode_data ())]
(const token& t) -> char
{
@@ -7400,164 +7404,202 @@ namespace build2
next (t, tt);
loc = get_location (t);
- names qual;
- string name;
-
- if (t.separated)
- ; // Leave the name empty to fail below.
- else if (tt == type::word)
+ if (tt == type::escape)
{
- name = move (t.value);
+ // For now we only support all the simple C/C++ escape sequences
+ // plus \0 (which in C/C++ is an octal escape sequence). See the
+ // lexer part for details.
+ //
+ // Note: cannot be subscripted.
+ //
+ if (!pre_parse_)
+ {
+ string s;
+ switch (char c = t.value[0])
+ {
+ case '\'':
+ case '"':
+ case '?':
+ case '\\': s = c; break;
+ case '0': s = '\0'; break;
+ case 'a': s = '\a'; break;
+ case 'b': s = '\b'; break;
+ case 'f': s = '\f'; break;
+ case 'n': s = '\n'; break;
+ case 'r': s = '\r'; break;
+ case 't': s = '\t'; break;
+ case 'v': s = '\v'; break;
+ default:
+ assert (false);
+ }
+
+ result_data = name (move (s));
+ what = "escape sequence expansion";
+ }
+
+ tt = peek ();
}
- else if (tt == type::lparen)
+ else
{
- expire_mode ();
- mode (lexer_mode::eval, '@');
- next_with_attributes (t, tt);
+ names qual;
+ string name;
- // Handle the $(x) case ad hoc. We do it this way in order to get
- // the variable name even during pre-parse. It should also be
- // faster.
- //
- char c;
- if ((tt == type::word
- ? path_traits::rfind_separator (t.value) == string::npos
- : (c = special (t))) &&
- peek () == type::rparen)
+ if (t.separated)
+ ; // Leave the name empty to fail below.
+ else if (tt == type::word)
{
- name = (tt == type::word ? move (t.value) : string (1, c));
- next (t, tt); // Get `)`.
+ name = move (t.value);
}
- else
+ else if (tt == type::lparen)
{
- using name_type = build2::name;
-
- values vs (parse_eval (t, tt, pmode));
+ expire_mode ();
+ mode (lexer_mode::eval, '@');
+ next_with_attributes (t, tt);
- if (!pre_parse_)
+ // Handle the $(x) case ad hoc. We do it this way in order to
+ // get the variable name even during pre-parse. It should also
+ // be faster.
+ //
+ char c;
+ if ((tt == type::word
+ ? path_traits::rfind_separator (t.value) == string::npos
+ : (c = special (t))) &&
+ peek () == type::rparen)
{
- if (vs.size () != 1)
- fail (loc) << "expected single variable/function name";
+ name = (tt == type::word ? move (t.value) : string (1, c));
+ next (t, tt); // Get `)`.
+ }
+ else
+ {
+ using name_type = build2::name;
- value& v (vs[0]);
+ values vs (parse_eval (t, tt, pmode));
- if (!v)
- fail (loc) << "null variable/function name";
+ if (!pre_parse_)
+ {
+ if (vs.size () != 1)
+ fail (loc) << "expected single variable/function name";
- names storage;
- vector_view<name_type> ns (
- reverse (v, storage, true /* reduce */)); // Movable.
- size_t n (ns.size ());
+ value& v (vs[0]);
- // We cannot handle scope-qualification in the eval context as
- // we do for target-qualification (see eval-qual) since then
- // we would be treating all paths as qualified variables. So
- // we have to do it here.
- //
- if (n >= 2 && ns[0].pair == ':') // $(foo: x)
- {
- // Note: name is first (see eval for details).
+ if (!v)
+ fail (loc) << "null variable/function name";
+
+ names storage;
+ vector_view<name_type> ns (
+ reverse (v, storage, true /* reduce */)); // Movable.
+ size_t n (ns.size ());
+
+ // We cannot handle scope-qualification in the eval context
+ // as we do for target-qualification (see eval-qual) since
+ // then we would be treating all paths as qualified
+ // variables. So we have to do it here.
//
- qual.push_back (move (ns[1]));
+ if (n >= 2 && ns[0].pair == ':') // $(foo: x)
+ {
+ // Note: name is first (see eval for details).
+ //
+ qual.push_back (move (ns[1]));
- if (qual.back ().empty ())
- fail (loc) << "empty variable/function qualification";
+ if (qual.back ().empty ())
+ fail (loc) << "empty variable/function qualification";
- if (n > 2)
- qual.push_back (move (ns[2]));
+ if (n > 2)
+ qual.push_back (move (ns[2]));
- // Move name to the last position (see below).
- //
- swap (ns[0], ns[n - 1]);
- }
- else if (n == 2 && ns[0].directory ()) // $(foo/ x)
- {
- qual.push_back (move (ns[0]));
- qual.back ().pair = '/';
- }
- else if (n > 1)
- fail (loc) << "expected variable/function name instead of '"
- << ns << "'";
+ // Move name to the last position (see below).
+ //
+ swap (ns[0], ns[n - 1]);
+ }
+ else if (n == 2 && ns[0].directory ()) // $(foo/ x)
+ {
+ qual.push_back (move (ns[0]));
+ qual.back ().pair = '/';
+ }
+ else if (n > 1)
+ fail (loc) << "expected variable/function name instead of '"
+ << ns << "'";
- // Note: checked for empty below.
- //
- if (!ns[n - 1].simple ())
- fail (loc) << "expected variable/function name instead of '"
- << ns[n - 1] << "'";
+ // Note: checked for empty below.
+ //
+ if (!ns[n - 1].simple ())
+ fail (loc) << "expected variable/function name instead of '"
+ << ns[n - 1] << "'";
- size_t p;
- if (n == 1 && // $(foo/x)
- (p = path_traits::rfind_separator (ns[0].value)) !=
+ size_t p;
+ if (n == 1 && // $(foo/x)
+ (p = path_traits::rfind_separator (ns[0].value)) !=
string::npos)
- {
- // Note that p cannot point to the last character since then
- // it would have been a directory, not a simple name.
- //
- string& s (ns[0].value);
+ {
+ // Note that p cannot point to the last character since
+ // then it would have been a directory, not a simple name.
+ //
+ string& s (ns[0].value);
- name = string (s, p + 1);
- s.resize (p + 1);
- qual.push_back (name_type (dir_path (move (s))));
- qual.back ().pair = '/';
+ name = string (s, p + 1);
+ s.resize (p + 1);
+ qual.push_back (name_type (dir_path (move (s))));
+ qual.back ().pair = '/';
+ }
+ else
+ name = move (ns[n - 1].value);
}
- else
- name = move (ns[n - 1].value);
}
}
- }
- else
- fail (t) << "expected variable/function name instead of " << t;
-
- if (!pre_parse_ && name.empty ())
- fail (loc) << "empty variable/function name";
-
- // Figure out whether this is a variable expansion with potential
- // subscript or a function call.
- //
- if (sub) enable_subscript ();
- tt = peek ();
+ else
+ fail (t) << "expected variable/function name instead of " << t;
- // Note that we require function call opening paren to be
- // unseparated; consider: $x ($x == 'foo' ? 'FOO' : 'BAR').
- //
- if (tt == type::lparen && !peeked ().separated)
- {
- // Function call.
- //
- next (t, tt); // Get '('.
- mode (lexer_mode::eval, '@');
- next_with_attributes (t, tt);
+ if (!pre_parse_ && name.empty ())
+ fail (loc) << "empty variable/function name";
- // @@ Should we use (target/scope) qualification (of name) as the
- // context in which to call the function? Hm, interesting...
+ // Figure out whether this is a variable expansion with potential
+ // subscript or a function call.
//
- values args (parse_eval (t, tt, pmode));
-
if (sub) enable_subscript ();
tt = peek ();
- // Note that we "move" args to call().
+ // Note that we require function call opening paren to be
+ // unseparated; consider: $x ($x == 'foo' ? 'FOO' : 'BAR').
//
- if (!pre_parse_)
+ if (tt == type::lparen && !peeked ().separated)
{
- result_data = ctx->functions.call (scope_, name, args, loc);
- what = "function call";
+ // Function call.
+ //
+ next (t, tt); // Get '('.
+ mode (lexer_mode::eval, '@');
+ next_with_attributes (t, tt);
+
+ // @@ Should we use (target/scope) qualification (of name) as
+ // the context in which to call the function? Hm, interesting...
+ //
+ values args (parse_eval (t, tt, pmode));
+
+ if (sub) enable_subscript ();
+ tt = peek ();
+
+ // Note that we "move" args to call().
+ //
+ if (!pre_parse_)
+ {
+ result_data = ctx->functions.call (scope_, name, args, loc);
+ what = "function call";
+ }
+ else
+ lookup_function (move (name), loc);
}
else
- lookup_function (move (name), loc);
- }
- else
- {
- // Variable expansion.
- //
- lookup l (lookup_variable (move (qual), move (name), loc));
-
- if (!pre_parse_)
{
- if (l.defined ())
- result = l.value; // Otherwise leave as NULL result_data.
+ // Variable expansion.
+ //
+ lookup l (lookup_variable (move (qual), move (name), loc));
- what = "variable expansion";
+ if (!pre_parse_)
+ {
+ if (l.defined ())
+ result = l.value; // Otherwise leave as NULL result_data.
+
+ what = "variable expansion";
+ }
}
}
}
diff --git a/libbuild2/test/script/lexer.cxx b/libbuild2/test/script/lexer.cxx
index b470d25..aec91fc 100644
--- a/libbuild2/test/script/lexer.cxx
+++ b/libbuild2/test/script/lexer.cxx
@@ -339,15 +339,17 @@ namespace build2
}
token lexer::
- word (state st, bool sep)
+ word (const state& st, bool sep)
{
- lexer_mode m (st.mode);
+ lexer_mode m (st.mode); // Save.
token r (base_lexer::word (st, sep));
if (m == lexer_mode::variable)
{
- if (r.value.size () == 1 && digit (r.value[0])) // $N
+ if (r.type == type::word &&
+ r.value.size () == 1 &&
+ digit (r.value[0])) // $N
{
xchar c (peek ());
diff --git a/libbuild2/test/script/lexer.hxx b/libbuild2/test/script/lexer.hxx
index 993a9db..39b950a 100644
--- a/libbuild2/test/script/lexer.hxx
+++ b/libbuild2/test/script/lexer.hxx
@@ -77,7 +77,7 @@ namespace build2
next_description ();
virtual token
- word (state, bool) override;
+ word (const state&, bool) override;
};
}
}
diff --git a/libbuild2/token.cxx b/libbuild2/token.cxx
index ab14388..cc102cc 100644
--- a/libbuild2/token.cxx
+++ b/libbuild2/token.cxx
@@ -29,21 +29,30 @@ namespace build2
os << (r ? "\n" : "<newline>");
break;
}
- case token_type::pair_separator:
+ case token_type::word:
{
if (r)
- os << t.value[0];
+ os << t.value;
else
- os << "<pair separator " << t.value[0] << ">";
+ os << '\'' << t.value << '\'';
break;
}
- case token_type::word:
+ case token_type::escape:
{
if (r)
- os << t.value;
+ os << '\\' << t.value;
else
- os << '\'' << t.value << '\'';
+ os << "<escape sequence \\" << t.value << ">";
+
+ break;
+ }
+ case token_type::pair_separator:
+ {
+ if (r)
+ os << t.value[0];
+ else
+ os << "<pair separator " << t.value[0] << ">";
break;
}
diff --git a/libbuild2/token.hxx b/libbuild2/token.hxx
index fca888c..f9ede65 100644
--- a/libbuild2/token.hxx
+++ b/libbuild2/token.hxx
@@ -30,6 +30,7 @@ namespace build2
eos,
newline,
word,
+ escape, // token::value is <...> in $\<...>
pair_separator, // token::value[0] is the pair separator char.
colon, // :
@@ -159,16 +160,13 @@ namespace build2
token (string v, bool s,
quote_type qt, bool qc, bool qf,
uint64_t l, uint64_t c)
- : token (token_type::word, move (v), s,
- qt, qc, qf,
- l, c,
- &token_printer) {}
+ : token (token_type::word, move (v), s, qt, qc, qf, l, c) {}
token (token_type t,
string v, bool s,
quote_type qt, bool qc, bool qf,
uint64_t l, uint64_t c,
- printer_type* p)
+ printer_type* p = &token_printer)
: type (t), separated (s),
qtype (qt), qcomp (qc), qfirst (qf),
value (move (v)),
diff --git a/tests/expansion/escape.testscript b/tests/expansion/escape.testscript
new file mode 100644
index 0000000..1140032
--- /dev/null
+++ b/tests/expansion/escape.testscript
@@ -0,0 +1,17 @@
+# file : tests/expansion/escape.testscript
+# license : MIT; see accompanying LICENSE file
+
+# Test escape sequence expansion.
+
+.include ../common.testscript
+
+: simple
+:
+$* <<EOI >>EOO
+print "foo$\nbar"
+print $size([string] "foo$\0bar")
+EOI
+foo
+bar
+7
+EOO