diff options
Diffstat (limited to 'libbuild2/lexer.cxx')
-rw-r--r-- | libbuild2/lexer.cxx | 238 |
1 files changed, 186 insertions, 52 deletions
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx index 9b7d01e..04c15be 100644 --- a/libbuild2/lexer.cxx +++ b/libbuild2/lexer.cxx @@ -42,6 +42,22 @@ namespace build2 return make_pair (make_pair (r[0], r[1]), sep_); } + pair<char, bool> lexer:: + peek_char () + { + auto p (skip_spaces ()); + assert (!p.second); + sep_ = p.first; + + char r ('\0'); + + xchar c (peek ()); + if (!eos (c)) + r = c; + + return make_pair (r, sep_); + } + void lexer:: mode (lexer_mode m, char ps, optional<const char*> esc, uintptr_t data) { @@ -144,13 +160,15 @@ namespace build2 break; } case lexer_mode::foreign: - assert (data > 1); - // Fall through. + { + assert (ps == '\0' && data > 1); + s = false; + break; + } case lexer_mode::single_quoted: case lexer_mode::double_quoted: { - assert (ps == '\0'); - s = false; + assert (false); // Can only be set manually in word(). break; } case lexer_mode::variable: @@ -162,8 +180,49 @@ namespace build2 default: assert (false); // Unhandled custom mode. } - state_.push ( - state {m, data, nullopt, lsb, false, ps, s, n, q, *esc, s1, s2}); + mode_impl (state {m, data, nullopt, lsb, false, ps, s, n, q, *esc, s1, s2}); + } + + void lexer:: + mode_impl (state&& s) + { + // If we are in the double-quoted mode then, unless the new mode is eval + // or variable, delay the state switch until the current mode is expired. + // Note that we delay by injecting the new state beneath the current + // state. + // + if (!state_.empty () && + state_.top ().mode == lexer_mode::double_quoted && + s.mode != lexer_mode::eval && + s.mode != lexer_mode::variable) + { + state qs (move (state_.top ())); // Save quoted state. + state_.top () = move (s); // Overwrite quoted state with new state. + state_.push (move (qs)); // Restore quoted state. + } + else + state_.push (move (s)); + } + + void lexer:: + expire_mode () + { + // If we are in the double-quoted mode, then delay the state expiration + // until the current mode is expired. Note that we delay by overwriting + // the being expired state with the current state. + // + assert (!state_.empty () && + (state_.top ().mode != lexer_mode::double_quoted || + state_.size () > 1)); + + if (state_.top ().mode == lexer_mode::double_quoted) + { + state qs (move (state_.top ())); // Save quoted state. + state_.pop (); // Pop quoted state. + state_.top () = move (qs); // Expire state, restoring quoted state. + } + else + state_.pop (); } token lexer:: @@ -202,9 +261,10 @@ namespace build2 auto make_token = [&sep, ln, cn] (type t, string v = string ()) { - return token (t, move (v), - sep, quote_type::unquoted, false, - ln, cn, token_printer); + return token (t, move (v), sep, + quote_type::unquoted, false, false, + ln, cn, + token_printer); }; // Handle `[` (do it first to make sure the flag is cleared regardless of @@ -446,9 +506,10 @@ namespace build2 auto make_token = [sep, ln, cn] (type t, string v = string ()) { - return token (t, move (v), - sep, quote_type::unquoted, false, - ln, cn, token_printer); + return token (t, move (v), sep, + quote_type::unquoted, false, false, + ln, cn, + token_printer); }; // Handle `[` (do it first to make sure the flag is cleared regardless of @@ -620,15 +681,14 @@ namespace build2 if (c == '\n' || c == '#' || eos (c)) { - st.hold = token (type::multi_rcbrace, - string (count, '}'), - false, quote_type::unquoted, false, + st.hold = token (type::multi_rcbrace, string (count, '}'), false, + quote_type::unquoted, false, false, bln, bcn, token_printer); lexeme.resize (chop); - return token (move (lexeme), - false, quote_type::unquoted, false, + return token (move (lexeme), false, + quote_type::unquoted, false, false, ln, cn); } @@ -653,9 +713,9 @@ namespace build2 } token lexer:: - word (state st, bool sep) + word (const state& rst, bool sep) { - lexer_mode m (st.mode); + lexer_mode m (rst.mode); xchar c (peek ()); assert (!eos (c)); @@ -671,33 +731,81 @@ namespace build2 // quote character. // bool qcomp (false); + bool qfirst (false); - auto append = [&lexeme, &m, &qcomp] (char c) + auto append = [&lexeme, &m, &qcomp, &qfirst] (char c, bool escaped = false) { - lexeme += c; + if (lexeme.empty () && (escaped || m == lexer_mode::double_quoted)) + qfirst = true; // An unquoted character after a quoted fragment. // - if (qcomp && m != lexer_mode::double_quoted) + if (m != lexer_mode::double_quoted && qcomp) qcomp = false; + + lexeme += c; }; - for (; !eos (c); c = peek ()) + const state* st (&rst); + for (bool first (true); !eos (c); first = false, c = peek ()) { // First handle escape sequences. // if (c == '\\') { - // In the variable mode we treat the beginning of the escape sequence - // as a separator (think \"$foo\"). + // In the variable mode we treat immediate `\` as the escape sequence + // literal and any following as a separator (think \"$foo\"). // if (m == lexer_mode::variable) - break; + { + if (!first) + break; + + get (); + c = get (); + + if (eos (c)) + fail (c) << "unterminated escape sequence"; + + // For now we only support all the simple C/C++ escape sequences + // plus \0 (which in C/C++ is an octal escape sequence). + // + // In the future we may decide to support more elaborate sequences + // such as \xNN, \uNNNN, etc. + // + // Note: we return it in the literal form instead of translating for + // easier printing. + // + switch (c) + { + case '\'': + case '"': + case '?': + case '\\': + case '0': + case 'a': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + case 'v': lexeme = c; break; + default: + fail (c) << "unknown escape sequence \\" << c; + } + + state_.pop (); + return token (type::escape, + move (lexeme), + sep, + qtype, qcomp, qfirst, + ln, cn); + } get (); xchar p (peek ()); - const char* esc (st.escapes); + const char* esc (st->escapes); if (esc == nullptr || (*esc != '\0' && !eos (p) && strchr (esc, p) != nullptr)) @@ -708,12 +816,12 @@ namespace build2 fail (p) << "unterminated escape sequence"; if (p != '\n') // Ignore if line continuation. - append (p); + append (p, true); continue; } else - unget (c); // Treat as a normal character. + unget (c); // Fall through to treat as a normal character. } bool done (false); @@ -742,8 +850,8 @@ namespace build2 get (); state_.pop (); - st = state_.top (); - m = st.mode; + st = &state_.top (); + m = st->mode; continue; } } @@ -752,19 +860,17 @@ namespace build2 // else if (m == lexer_mode::variable) { - bool first (lexeme.empty ()); - // Handle special variable names, if any. // - if (first && - st.data != 0 && - strchr (reinterpret_cast<const char*> (st.data), c) != nullptr) + if (first && + st->data != 0 && + strchr (reinterpret_cast<const char*> (st->data), c) != nullptr) { get (); lexeme += c; done = true; } - else if (c != '_' && !(first ? alpha (c) : alnum (c))) + else if (c != '_' && !(lexeme.empty () ? alpha (c) : alnum (c))) { if (c != '.') done = true; @@ -784,17 +890,17 @@ namespace build2 { // First check if it's a pair separator. // - if (c == st.sep_pair) + if (c == st->sep_pair) done = true; else { // Then see if this character or character sequence is a separator. // - for (const char* p (strchr (st.sep_first, c)); + for (const char* p (strchr (st->sep_first, c)); p != nullptr; p = done ? nullptr : strchr (p + 1, c)) { - char s (st.sep_second[p - st.sep_first]); + char s (st->sep_second[p - st->sep_first]); // See if it has a second. // @@ -812,8 +918,21 @@ namespace build2 // Handle single and double quotes if enabled for this mode and unless // they were considered separators. // - if (st.quotes && !done) + if (st->quotes && !done) { + auto quoted_mode = [this] (lexer_mode m) + { + // In the double-quoted mode we only do effective escaping of the + // special `$("\` characters, line continuations, plus `)` for + // symmetry. Nothing can be escaped in single-quoted. + // + const char* esc (m == lexer_mode::double_quoted ? "$()\"\\\n" : ""); + + state_.push (state { + m, 0, nullopt, false, false, '\0', false, true, true, + esc, nullptr, nullptr}); + }; + switch (c) { case '\'': @@ -821,7 +940,7 @@ namespace build2 // Enter the single-quoted mode in case the derived lexer needs // to notice this. // - mode (lexer_mode::single_quoted); + quoted_mode (lexer_mode::single_quoted); switch (qtype) { @@ -840,6 +959,12 @@ namespace build2 break; } + // Note that we will treat plus in ''+ as quoted. This is + // probably the better option considering the "$empty"+ case + // + if (lexeme.empty ()) + qfirst = true; + get (); for (c = get (); !eos (c) && c != '\''; c = get ()) lexeme += c; @@ -854,9 +979,10 @@ namespace build2 { get (); - mode (lexer_mode::double_quoted); - st = state_.top (); - m = st.mode; + quoted_mode (lexer_mode::double_quoted); + + st = &state_.top (); + m = st->mode; switch (qtype) { @@ -875,6 +1001,11 @@ namespace build2 break; } + // The same reasoning as above. + // + if (lexeme.empty ()) + qfirst = true; + continue; } } @@ -905,7 +1036,7 @@ namespace build2 if (m == lexer_mode::variable) state_.pop (); - return token (move (lexeme), sep, qtype, qcomp, ln, cn); + return token (move (lexeme), sep, qtype, qcomp, qfirst, ln, cn); } pair<bool, bool> lexer:: @@ -973,7 +1104,7 @@ namespace build2 if ((c = peek ()) == '\\') { get (); - if ((c = peek ()) == '\n') + if ((c = peek ()) == '\n' || eos (c)) return true; } @@ -984,15 +1115,16 @@ namespace build2 { // Scan until we see the closing one. // - for (; !eos (c); c = peek ()) + for (;;) { - get (); if (c == '#' && ml ()) break; - } - if (eos (c)) - fail (c) << "unterminated multi-line comment"; + if (eos (c = peek ())) + fail (c) << "unterminated multi-line comment"; + + get (); + } } else { @@ -1006,6 +1138,8 @@ namespace build2 } case '\\': { + // See if this is line continuation. + // get (); if (peek () == '\n') |