about summary refs log tree commit diff
path: root/libbuild2/lexer.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'libbuild2/lexer.cxx')
-rw-r--r-- libbuild2/lexer.cxx 238
1 files changed, 186 insertions, 52 deletions
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx
index 9b7d01e..04c15be 100644
--- a/libbuild2/lexer.cxx
+++ b/libbuild2/lexer.cxx
@@ -42,6 +42,22 @@ namespace build2
return make_pair (make_pair (r[0], r[1]), sep_);
}
+ pair<char, bool> lexer:: // Returns {next character or '\0', sep_}.
+ peek_char () // Looks at the next character via peek(); get() is never called here.
+ {
+ auto p (skip_spaces ()); // Only the two members of the result are used below.
+ assert (!p.second); // The second member is required to be false at this point.
+ sep_ = p.first; // Record the first member as the current separation flag.
+
+ char r ('\0'); // Stays '\0' if the stream turns out to be at its end.
+
+ xchar c (peek ());
+ if (!eos (c))
+ r = c;
+
+ return make_pair (r, sep_);
+ }
+
void lexer::
mode (lexer_mode m, char ps, optional<const char*> esc, uintptr_t data)
{
@@ -144,13 +160,15 @@ namespace build2
break;
}
case lexer_mode::foreign:
- assert (data > 1);
- // Fall through.
+ {
+ assert (ps == '\0' && data > 1);
+ s = false;
+ break;
+ }
case lexer_mode::single_quoted:
case lexer_mode::double_quoted:
{
- assert (ps == '\0');
- s = false;
+ assert (false); // Can only be set manually in word().
break;
}
case lexer_mode::variable:
@@ -162,8 +180,49 @@ namespace build2
default: assert (false); // Unhandled custom mode.
}
- state_.push (
- state {m, data, nullopt, lsb, false, ps, s, n, q, *esc, s1, s2});
+ mode_impl (state {m, data, nullopt, lsb, false, ps, s, n, q, *esc, s1, s2});
+ }
+
+ void lexer::
+ mode_impl (state&& s) // Install s on the state stack, on top or just beneath the top.
+ {
+ // If we are in the double-quoted mode then, unless the new mode is eval
+ // or variable, delay the state switch until the current mode is expired.
+ // Note that we delay by injecting the new state beneath the current
+ // state.
+ //
+ if (!state_.empty () &&
+ state_.top ().mode == lexer_mode::double_quoted &&
+ s.mode != lexer_mode::eval &&
+ s.mode != lexer_mode::variable)
+ {
+ state qs (move (state_.top ())); // Save quoted state.
+ state_.top () = move (s); // Overwrite quoted state with new state.
+ state_.push (move (qs)); // Restore quoted state.
+ }
+ else
+ state_.push (move (s)); // Otherwise the new state becomes the top right away.
+ }
+
+ void lexer::
+ expire_mode () // Remove one state from the stack, keeping a double-quoted top in place.
+ {
+ // If we are in the double-quoted mode, then delay the state expiration
+ // until the current mode is expired. Note that we delay by overwriting
+ // the being expired state with the current state.
+ //
+ assert (!state_.empty () &&
+ (state_.top ().mode != lexer_mode::double_quoted ||
+ state_.size () > 1)); // A quoted top must have another state beneath it.
+
+ if (state_.top ().mode == lexer_mode::double_quoted)
+ {
+ state qs (move (state_.top ())); // Save quoted state.
+ state_.pop (); // Pop quoted state.
+ state_.top () = move (qs); // Expire state, restoring quoted state.
+ }
+ else
+ state_.pop (); // Top is not double-quoted: simply drop it.
}
token lexer::
@@ -202,9 +261,10 @@ namespace build2
auto make_token = [&sep, ln, cn] (type t, string v = string ())
{
- return token (t, move (v),
- sep, quote_type::unquoted, false,
- ln, cn, token_printer);
+ return token (t, move (v), sep,
+ quote_type::unquoted, false, false,
+ ln, cn,
+ token_printer);
};
// Handle `[` (do it first to make sure the flag is cleared regardless of
@@ -446,9 +506,10 @@ namespace build2
auto make_token = [sep, ln, cn] (type t, string v = string ())
{
- return token (t, move (v),
- sep, quote_type::unquoted, false,
- ln, cn, token_printer);
+ return token (t, move (v), sep,
+ quote_type::unquoted, false, false,
+ ln, cn,
+ token_printer);
};
// Handle `[` (do it first to make sure the flag is cleared regardless of
@@ -620,15 +681,14 @@ namespace build2
if (c == '\n' || c == '#' || eos (c))
{
- st.hold = token (type::multi_rcbrace,
- string (count, '}'),
- false, quote_type::unquoted, false,
+ st.hold = token (type::multi_rcbrace, string (count, '}'), false,
+ quote_type::unquoted, false, false,
bln, bcn,
token_printer);
lexeme.resize (chop);
- return token (move (lexeme),
- false, quote_type::unquoted, false,
+ return token (move (lexeme), false,
+ quote_type::unquoted, false, false,
ln, cn);
}
@@ -653,9 +713,9 @@ namespace build2
}
token lexer::
- word (state st, bool sep)
+ word (const state& rst, bool sep)
{
- lexer_mode m (st.mode);
+ lexer_mode m (rst.mode);
xchar c (peek ());
assert (!eos (c));
@@ -671,33 +731,81 @@ namespace build2
// quote character.
//
bool qcomp (false);
+ bool qfirst (false);
- auto append = [&lexeme, &m, &qcomp] (char c)
+ auto append = [&lexeme, &m, &qcomp, &qfirst] (char c, bool escaped = false)
{
- lexeme += c;
+ if (lexeme.empty () && (escaped || m == lexer_mode::double_quoted))
+ qfirst = true;
// An unquoted character after a quoted fragment.
//
- if (qcomp && m != lexer_mode::double_quoted)
+ if (m != lexer_mode::double_quoted && qcomp)
qcomp = false;
+
+ lexeme += c;
};
- for (; !eos (c); c = peek ())
+ const state* st (&rst);
+ for (bool first (true); !eos (c); first = false, c = peek ())
{
// First handle escape sequences.
//
if (c == '\\')
{
- // In the variable mode we treat the beginning of the escape sequence
- // as a separator (think \"$foo\").
+ // In the variable mode we treat immediate `\` as the escape sequence
+ // literal and any following as a separator (think \"$foo\").
//
if (m == lexer_mode::variable)
- break;
+ {
+ if (!first)
+ break;
+
+ get ();
+ c = get ();
+
+ if (eos (c))
+ fail (c) << "unterminated escape sequence";
+
+ // For now we only support all the simple C/C++ escape sequences
+ // plus \0 (which in C/C++ is an octal escape sequence).
+ //
+ // In the future we may decide to support more elaborate sequences
+ // such as \xNN, \uNNNN, etc.
+ //
+ // Note: we return it in the literal form instead of translating for
+ // easier printing.
+ //
+ switch (c)
+ {
+ case '\'':
+ case '"':
+ case '?':
+ case '\\':
+ case '0':
+ case 'a':
+ case 'b':
+ case 'f':
+ case 'n':
+ case 'r':
+ case 't':
+ case 'v': lexeme = c; break;
+ default:
+ fail (c) << "unknown escape sequence \\" << c;
+ }
+
+ state_.pop ();
+ return token (type::escape,
+ move (lexeme),
+ sep,
+ qtype, qcomp, qfirst,
+ ln, cn);
+ }
get ();
xchar p (peek ());
- const char* esc (st.escapes);
+ const char* esc (st->escapes);
if (esc == nullptr ||
(*esc != '\0' && !eos (p) && strchr (esc, p) != nullptr))
@@ -708,12 +816,12 @@ namespace build2
fail (p) << "unterminated escape sequence";
if (p != '\n') // Ignore if line continuation.
- append (p);
+ append (p, true);
continue;
}
else
- unget (c); // Treat as a normal character.
+ unget (c); // Fall through to treat as a normal character.
}
bool done (false);
@@ -742,8 +850,8 @@ namespace build2
get ();
state_.pop ();
- st = state_.top ();
- m = st.mode;
+ st = &state_.top ();
+ m = st->mode;
continue;
}
}
@@ -752,19 +860,17 @@ namespace build2
//
else if (m == lexer_mode::variable)
{
- bool first (lexeme.empty ());
-
// Handle special variable names, if any.
//
- if (first &&
- st.data != 0 &&
- strchr (reinterpret_cast<const char*> (st.data), c) != nullptr)
+ if (first &&
+ st->data != 0 &&
+ strchr (reinterpret_cast<const char*> (st->data), c) != nullptr)
{
get ();
lexeme += c;
done = true;
}
- else if (c != '_' && !(first ? alpha (c) : alnum (c)))
+ else if (c != '_' && !(lexeme.empty () ? alpha (c) : alnum (c)))
{
if (c != '.')
done = true;
@@ -784,17 +890,17 @@ namespace build2
{
// First check if it's a pair separator.
//
- if (c == st.sep_pair)
+ if (c == st->sep_pair)
done = true;
else
{
// Then see if this character or character sequence is a separator.
//
- for (const char* p (strchr (st.sep_first, c));
+ for (const char* p (strchr (st->sep_first, c));
p != nullptr;
p = done ? nullptr : strchr (p + 1, c))
{
- char s (st.sep_second[p - st.sep_first]);
+ char s (st->sep_second[p - st->sep_first]);
// See if it has a second.
//
@@ -812,8 +918,21 @@ namespace build2
// Handle single and double quotes if enabled for this mode and unless
// they were considered separators.
//
- if (st.quotes && !done)
+ if (st->quotes && !done)
{
+ auto quoted_mode = [this] (lexer_mode m)
+ {
+ // In the double-quoted mode we only do effective escaping of the
+ // special `$("\` characters, line continuations, plus `)` for
+ // symmetry. Nothing can be escaped in single-quoted.
+ //
+ const char* esc (m == lexer_mode::double_quoted ? "$()\"\\\n" : "");
+
+ state_.push (state {
+ m, 0, nullopt, false, false, '\0', false, true, true,
+ esc, nullptr, nullptr});
+ };
+
switch (c)
{
case '\'':
@@ -821,7 +940,7 @@ namespace build2
// Enter the single-quoted mode in case the derived lexer needs
// to notice this.
//
- mode (lexer_mode::single_quoted);
+ quoted_mode (lexer_mode::single_quoted);
switch (qtype)
{
@@ -840,6 +959,12 @@ namespace build2
break;
}
+ // Note that we will treat plus in ''+ as quoted. This is
+ // probably the better option considering the "$empty"+ case
+ //
+ if (lexeme.empty ())
+ qfirst = true;
+
get ();
for (c = get (); !eos (c) && c != '\''; c = get ())
lexeme += c;
@@ -854,9 +979,10 @@ namespace build2
{
get ();
- mode (lexer_mode::double_quoted);
- st = state_.top ();
- m = st.mode;
+ quoted_mode (lexer_mode::double_quoted);
+
+ st = &state_.top ();
+ m = st->mode;
switch (qtype)
{
@@ -875,6 +1001,11 @@ namespace build2
break;
}
+ // The same reasoning as above.
+ //
+ if (lexeme.empty ())
+ qfirst = true;
+
continue;
}
}
@@ -905,7 +1036,7 @@ namespace build2
if (m == lexer_mode::variable)
state_.pop ();
- return token (move (lexeme), sep, qtype, qcomp, ln, cn);
+ return token (move (lexeme), sep, qtype, qcomp, qfirst, ln, cn);
}
pair<bool, bool> lexer::
@@ -973,7 +1104,7 @@ namespace build2
if ((c = peek ()) == '\\')
{
get ();
- if ((c = peek ()) == '\n')
+ if ((c = peek ()) == '\n' || eos (c))
return true;
}
@@ -984,15 +1115,16 @@ namespace build2
{
// Scan until we see the closing one.
//
- for (; !eos (c); c = peek ())
+ for (;;)
{
- get ();
if (c == '#' && ml ())
break;
- }
- if (eos (c))
- fail (c) << "unterminated multi-line comment";
+ if (eos (c = peek ()))
+ fail (c) << "unterminated multi-line comment";
+
+ get ();
+ }
}
else
{
@@ -1006,6 +1138,8 @@ namespace build2
}
case '\\':
{
+ // See if this is line continuation.
+ //
get ();
if (peek () == '\n')