aboutsummaryrefslogtreecommitdiff
path: root/libbuild2/test/script/lexer.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'libbuild2/test/script/lexer.cxx')
-rw-r--r--libbuild2/test/script/lexer.cxx551
1 files changed, 551 insertions, 0 deletions
diff --git a/libbuild2/test/script/lexer.cxx b/libbuild2/test/script/lexer.cxx
new file mode 100644
index 0000000..75c04c8
--- /dev/null
+++ b/libbuild2/test/script/lexer.cxx
@@ -0,0 +1,551 @@
+// file : libbuild2/test/script/lexer.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <libbuild2/test/script/lexer.hxx>
+
+#include <cstring> // strchr()
+
+using namespace std;
+
+namespace build2
+{
+ namespace test
+ {
+ namespace script
+ {
+ using type = token_type;
+
+ // Enter lexer mode m.
+ //
+ // For the test script modes handled here this selects the separator tables
+ // (s1, s2) and the state flags and pushes a new state onto state_. Any other
+ // mode is forwarded to base_lexer::mode() unchanged.
+ //
+ // If esc is absent, the escapes of the current top state are inherited (so
+ // the state stack must not be empty in that case). A non-NUL pair separator
+ // ps is only accepted for the eval and attribute modes, both of which are
+ // delegated to the base lexer.
+ //
+ void lexer::
+ mode (base_mode m, char ps, optional<const char*> esc)
+ {
+ // Separator tables.
+ //
+ // NOTE(review): s2 is assumed to be indexed by the base lexer with the
+ // position of the matched character in s1, with a space meaning "this is a
+ // single-character separator" -- confirm against base_lexer::word(). Under
+ // that assumption the two strings must always have the same length. Since
+ // the padding is all spaces this is easy to get wrong, so the invariant is
+ // asserted before the state is pushed.
+ //
+ const char* s1 (nullptr);
+ const char* s2 (nullptr);
+ bool s (true); // Whitespaces separate words (see command_expansion).
+ bool n (true); // Never changed here; presumably newline separation.
+ bool q (true); // Quotes are recognized (cleared for the here-line modes).
+
+ if (!esc)
+ {
+ assert (!state_.empty ());
+ esc = state_.top ().escapes;
+ }
+
+ switch (m)
+ {
+ case lexer_mode::command_line:
+ {
+ // 14 entries; "==" and "!=" are the two-character separators.
+ //
+ s1 = ":;=!|&<> $(#\t\n";
+ s2 = "  ==          ";
+ break;
+ }
+ case lexer_mode::first_token:
+ {
+ // First token on the script line. Like command_line but
+ // recognizes leading '.+-{}' as tokens as well as variable
+ // assignments as separators.
+ //
+ // Note that to recognize only leading '.+-{}' we shouldn't add
+ // them to the separator strings.
+ //
+ // 15 entries; '=' is a single-character separator while "+=" and
+ // "!=" are the two-character ones.
+ //
+ s1 = ":;=+!|&<> $(#\t\n";
+ s2 = "   ==          ";
+ break;
+ }
+ case lexer_mode::second_token:
+ {
+ // Second token on the script line. Like command_line but
+ // recognizes leading variable assignments.
+ //
+ // Note that to recognize only leading assignments we shouldn't
+ // add them to the separator strings (so this is identical to
+ // command_line).
+ //
+ s1 = ":;=!|&<> $(#\t\n";
+ s2 = "  ==          ";
+ break;
+ }
+ case lexer_mode::variable_line:
+ {
+ // Like value except we recognize ';' and don't recognize '{'.
+ // Note that we don't recognize ':' since having a trailing
+ // variable assignment is illegal.
+ //
+ // 9 entries, all single-character.
+ //
+ s1 = "; $([]#\t\n";
+ s2 = "         ";
+ break;
+ }
+
+ case lexer_mode::command_expansion:
+ {
+ // Note that whitespaces are not word separators in this mode.
+ //
+ // 4 entries, all single-character.
+ //
+ s1 = "|&<>";
+ s2 = "    ";
+ s = false;
+ break;
+ }
+ case lexer_mode::here_line_single:
+ {
+ // This one is like a single-quoted string except it treats
+ // newlines as a separator. We also treat quotes as literals.
+ //
+ // Note that it might be tempting to enable line continuation
+ // escapes. However, we will then have to also enable escaping of
+ // the backslash, which makes it a lot less tempting.
+ //
+ s1 = "\n";
+ s2 = " ";
+ esc = ""; // Disable escape sequences.
+ s = false;
+ q = false;
+ break;
+ }
+ case lexer_mode::here_line_double:
+ {
+ // This one is like a double-quoted string except it treats
+ // newlines as a separator. We also treat quotes as literals.
+ //
+ // 3 entries, all single-character.
+ //
+ s1 = "$(\n";
+ s2 = "   ";
+ s = false;
+ q = false;
+ break;
+ }
+ case lexer_mode::description_line:
+ {
+ // This one is like a single-quoted string and has an ad hoc
+ // implementation (so no separator tables are set).
+ //
+ break;
+ }
+ default:
+ {
+ // Make sure pair separators are only enabled where we expect
+ // them.
+ //
+ // @@ Should we disable pair separators in the eval mode?
+ //
+ assert (ps == '\0' ||
+ m == lexer_mode::eval ||
+ m == lexer_mode::attribute);
+
+ base_lexer::mode (m, ps, esc);
+ return;
+ }
+ }
+
+ // None of the modes handled above support pair separators.
+ //
+ assert (ps == '\0');
+
+ // The separator tables are either both absent or of equal length.
+ //
+ assert ((s1 == nullptr) == (s2 == nullptr));
+ assert (s1 == nullptr || strlen (s1) == strlen (s2));
+
+ state_.push (state {m, ps, s, n, q, *esc, s1, s2});
+ }
+
+ token lexer:: // Return the next token, dispatching on the mode at the top of state_.
+ next ()
+ {
+ token r;
+
+ switch (state_.top ().mode)
+ {
+ case lexer_mode::command_line:
+ case lexer_mode::first_token:
+ case lexer_mode::second_token:
+ case lexer_mode::variable_line:
+ case lexer_mode::command_expansion:
+ case lexer_mode::here_line_single:
+ case lexer_mode::here_line_double:
+ r = next_line (); // All of the modes listed above share next_line().
+ break;
+ case lexer_mode::description_line:
+ r = next_description (); // Description lines have their own scanner.
+ break;
+ default:
+ r = base_lexer::next (); // Any other mode is left to the base lexer.
+ break;
+ }
+
+ if (r.qtype != quote_type::unquoted) // Count every token that is not unquoted.
+ ++quoted_;
+
+ return r;
+ }
+
+ token lexer:: // Scan one token in any of the modes that next() routes here.
+ next_line ()
+ {
+ bool sep (skip_spaces ()); // Passed to every token made below as its separation flag.
+
+ xchar c (get ()); // First character; its position becomes the token position.
+ uint64_t ln (c.line), cn (c.column);
+
+ if (eos (c))
+ return token (type::eos, sep, ln, cn, token_printer);
+
+ state st (state_.top ()); // Make copy (see first/second_token).
+ lexer_mode m (st.mode);
+
+ auto make_token = [&sep, &m, ln, cn] (type t, string v = string ()) // Token of type t at (ln, cn) with optional value v.
+ {
+ bool q (m == lexer_mode::here_line_double); // Only this mode yields double-quoted tokens.
+
+ return token (t, move (v), sep,
+ (q ? quote_type::double_ : quote_type::unquoted), q,
+ ln, cn,
+ token_printer);
+ };
+
+ auto make_token_with_modifiers = // As make_token but first collects trailing modifier characters into the value.
+ [&make_token, this] (type t,
+ const char* mods, // To recognize.
+ const char* stop = nullptr) // To stop after.
+ {
+ string v;
+ if (mods != nullptr) // NULL mods means the token takes no modifiers (empty value).
+ {
+ for (xchar p (peek ());
+ (strchr (mods, p) != nullptr && // Modifier.
+ strchr (v.c_str (), p) == nullptr); // Not already seen.
+ p = peek ())
+ {
+ get ();
+ v += p;
+
+ if (stop != nullptr && strchr (stop, p) != nullptr) // This modifier ends the sequence.
+ break;
+ }
+ }
+
+ return make_token (t, move (v));
+ };
+
+ // Expire certain modes at the end of the token. Do it early in case
+ // we push any new mode (e.g., double quote).
+ //
+ if (m == lexer_mode::first_token || m == lexer_mode::second_token)
+ state_.pop ();
+
+ // NOTE: remember to update mode() if adding new special characters.
+
+ if (m != lexer_mode::command_expansion) // Newline is not a token in command_expansion.
+ {
+ switch (c)
+ {
+ case '\n':
+ {
+ // Expire variable value mode at the end of the line.
+ //
+ if (m == lexer_mode::variable_line)
+ state_.pop ();
+
+ sep = true; // Treat newline as always separated.
+ return make_token (type::newline);
+ }
+ }
+ }
+
+ if (m != lexer_mode::here_line_single) // '$' and '(' are not tokens in here_line_single.
+ {
+ switch (c)
+ {
+ // Variable expansion, function call, and evaluation context.
+ //
+ case '$': return make_token (type::dollar);
+ case '(': return make_token (type::lparen);
+ }
+ }
+
+
+ if (m == lexer_mode::variable_line)
+ {
+ switch (c)
+ {
+ // Attributes.
+ //
+ case '[': return make_token (type::lsbrace);
+ case ']': return make_token (type::rsbrace);
+ }
+ }
+
+ // Line separators.
+ //
+ if (m == lexer_mode::command_line ||
+ m == lexer_mode::first_token ||
+ m == lexer_mode::second_token ||
+ m == lexer_mode::variable_line)
+ {
+ switch (c)
+ {
+ case ';': return make_token (type::semi);
+ }
+ }
+
+ if (m == lexer_mode::command_line || // Unlike ';', ':' is not a token in variable_line.
+ m == lexer_mode::first_token ||
+ m == lexer_mode::second_token)
+ {
+ switch (c)
+ {
+ case ':': return make_token (type::colon);
+ }
+ }
+
+ // Command line operator/separators.
+ //
+ if (m == lexer_mode::command_line ||
+ m == lexer_mode::first_token ||
+ m == lexer_mode::second_token)
+ {
+ switch (c)
+ {
+ // Comparison (==, !=).
+ //
+ case '=':
+ case '!':
+ {
+ if (peek () == '=')
+ {
+ get ();
+ return make_token (c == '=' ? type::equal : type::not_equal);
+ }
+ } // Not followed by '=': no return here, continue with the checks below.
+ }
+ }
+
+ // Command operators/separators.
+ //
+ if (m == lexer_mode::command_line ||
+ m == lexer_mode::first_token ||
+ m == lexer_mode::second_token ||
+ m == lexer_mode::command_expansion)
+ {
+ switch (c)
+ {
+ // |, ||
+ //
+ case '|':
+ {
+ if (peek () == '|')
+ {
+ get ();
+ return make_token (type::log_or);
+ }
+ else
+ return make_token (type::pipe);
+ }
+ // &, &&
+ //
+ case '&':
+ {
+ xchar p (peek ());
+
+ if (p == '&')
+ {
+ get ();
+ return make_token (type::log_and);
+ }
+
+ // These modifiers are mutually exclusive so stop after seeing
+ // either one.
+ //
+ return make_token_with_modifiers (type::clean, "!?", "!?");
+ }
+ // <
+ //
+ case '<':
+ {
+ type r (type::in_str); // A plain '<' unless a recognized second character follows.
+ xchar p (peek ());
+
+ if (p == '|' || p == '-' || p == '<')
+ {
+ get ();
+
+ switch (p)
+ {
+ case '|': return make_token (type::in_pass);
+ case '-': return make_token (type::in_null);
+ case '<':
+ {
+ r = type::in_doc; // "<<", unless a third '<' follows.
+ p = peek ();
+
+ if (p == '<')
+ {
+ get ();
+ r = type::in_file; // "<<<"
+ }
+ break;
+ }
+ }
+ }
+
+ // Handle modifiers.
+ //
+ const char* mods (nullptr); // Stays NULL for in_file, which takes no modifiers.
+ switch (r)
+ {
+ case type::in_str:
+ case type::in_doc: mods = ":/"; break;
+ }
+
+ return make_token_with_modifiers (r, mods);
+ }
+ // >
+ //
+ case '>':
+ {
+ type r (type::out_str); // A plain '>' unless a recognized second character follows.
+ xchar p (peek ());
+
+ if (p == '|' || p == '-' || p == '!' || p == '&' ||
+ p == '=' || p == '+' || p == '>')
+ {
+ get ();
+
+ switch (p)
+ {
+ case '|': return make_token (type::out_pass);
+ case '-': return make_token (type::out_null);
+ case '!': return make_token (type::out_trace);
+ case '&': return make_token (type::out_merge);
+ case '=': return make_token (type::out_file_ovr);
+ case '+': return make_token (type::out_file_app);
+ case '>':
+ {
+ r = type::out_doc; // ">>", unless a third '>' follows.
+ p = peek ();
+
+ if (p == '>')
+ {
+ get ();
+ r = type::out_file_cmp; // ">>>"
+ }
+ break;
+ }
+ }
+ }
+
+ // Handle modifiers.
+ //
+ const char* mods (nullptr); // Stays NULL for out_file_cmp, which takes no modifiers.
+ const char* stop (nullptr);
+ switch (r)
+ {
+ case type::out_str:
+ case type::out_doc: mods = ":/~"; stop = "~"; break; // No modifiers are collected after '~'.
+ }
+
+ return make_token_with_modifiers (r, mods, stop);
+ }
+ }
+ }
+
+ // Dot, plus/minus, and left/right curly braces.
+ //
+ if (m == lexer_mode::first_token)
+ {
+ switch (c)
+ {
+ case '.': return make_token (type::dot);
+ case '+': return make_token (type::plus);
+ case '-': return make_token (type::minus);
+ case '{': return make_token (type::lcbrace);
+ case '}': return make_token (type::rcbrace);
+ }
+ }
+
+ // Variable assignment (=, +=, =+).
+ //
+ if (m == lexer_mode::second_token)
+ {
+ switch (c)
+ {
+ case '=': // Note: "==" has already been returned as a comparison above.
+ {
+ if (peek () == '+')
+ {
+ get ();
+ return make_token (type::prepend);
+ }
+ else
+ return make_token (type::assign);
+ }
+ case '+':
+ {
+ if (peek () == '=')
+ {
+ get ();
+ return make_token (type::append);
+ }
+ } // '+' without '=': leaves the switch and is lexed as a word below.
+ }
+ }
+
+ // Otherwise it is a word.
+ //
+ unget (c); // Put the character back so that word() sees it.
+ return word (st, sep);
+ }
+
+ token lexer:: // Scan a description line: its text as a single token, then the newline.
+ next_description ()
+ {
+ xchar c (peek ());
+
+ if (eos (c)) // The description must be terminated with a newline.
+ fail (c) << "expected newline at the end of description line";
+
+ uint64_t ln (c.line), cn (c.column);
+
+ if (c == '\n')
+ {
+ get ();
+ state_.pop (); // Expire the description mode.
+ return token (type::newline, true, ln, cn, token_printer);
+ }
+
+ string lexeme;
+
+ // For now no line continuations though we could support them.
+ //
+ for (; !eos (c) && c != '\n'; c = peek ()) // The newline itself is left for the next call.
+ {
+ get ();
+ lexeme += c;
+ }
+
+ return token (move (lexeme), // Everything up to the newline (or end of stream), verbatim.
+ false,
+ quote_type::unquoted, false,
+ ln, cn);
+ }
+
+ token lexer:: // Lex a word, handling the special one-character variable names in variable mode.
+ word (state st, bool sep)
+ {
+ lexer_mode m (st.mode);
+
+ // Customized implementation that handles special variable names ($*,
+ // $N, $~, $@).
+ //
+ if (m != lexer_mode::variable) // Other modes use the base implementation.
+ return base_lexer::word (st, sep);
+
+ xchar c (peek ());
+
+ if (c != '*' && c != '~' && c != '@' && !digit (c)) // Not a special name: base implementation.
+ return base_lexer::word (st, sep);
+
+ get ();
+
+ if (digit (c) && digit (peek ())) // Only a single digit is allowed in $N.
+ fail (c) << "multi-digit special variable name";
+
+ state_.pop (); // Expire the variable mode.
+ return token (string (1, c), // The special name is this one character.
+ sep,
+ quote_type::unquoted, false,
+ c.line, c.column);
+ }
+ }
+ }
+}