libbuild2/lexer.hxx


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352

// file      : libbuild2/lexer.hxx -*- C++ -*-
// license   : MIT; see accompanying LICENSE file

#ifndef LIBBUILD2_LEXER_HXX
#define LIBBUILD2_LEXER_HXX

#include <stack>

#include <libbutl/utf8.hxx>
#include <libbutl/unicode.hxx>
#include <libbutl/char-scanner.hxx>

#include <libbuild2/types.hxx>
#include <libbuild2/utility.hxx>

#include <libbuild2/token.hxx>
#include <libbuild2/diagnostics.hxx>

#include <libbuild2/export.hxx>

namespace build2
{
  // Context-dependent lexing mode.
  //
  // Quoted modes are internal and should not be set explicitly. In the value
  // mode we don't treat certain characters (e.g., `+`, `=`) as special so
  // that we can use them in the variable values, e.g., `foo = g++`. In
  // contrast, in the variable mode, we restrict certain character (e.g., `/`)
  // from appearing in the name. Additionally, in the variable mode we
  // recognize leading `\` as the beginning of the escape sequent ($\n). The
  // values mode is like value but recogizes `,` as special (used in contexts
  // where we need to list multiple values). The attributes/attribute_value
  // modes are like values where each value is potentially a variable
  // assignment; they don't treat `{` and `}` as special (so we cannot have
  // name groups in attributes) as well as recognizes `=` and `]`. The
  // subscript mode is like value but doesn't treat `{` and `}` as special and
  // recognizes `]`. The eval mode is used in the evaluation context.
  //
  // A number of modes are "derived" from the value/values mode by recognizing
  // a few extra characters:
  //
  //   switch_expressions  values plus `:`
  //   case_patterns       values plus `|` and `:`
  //
  // Note that the normal, value/values and derived, as well as eval modes
  // split words separated by the pair character (to disable pairs one can
  // pass `\0` as a pair character).
  //
  // The normal mode recognizes `%` and `{{...` at the beginning of the line
  // as special. The cmdvar mode is like normal but does not treat these
  // character sequences as special.
  //
  // Finally, the foreign mode reads everything until encountering a line that
  // contains nothing (besides whitespaces) other than the closing multi-
  // curly-brace (`}}...`) (or eos) returning the contents as the word token
  // followed by the multi_rcbrace (or eos). In a way it is similar to the
  // single-quote mode. The number of closing braces to expect is passed as
  // mode data.
  //
  // The mode data is also used by a few other modes. The buildspec mode uses
  // it as a boolean value to decide whether to recognize newlines as tokens.
  // In the variable mode the mode data may be a pointer to a C string that
  // contains a list of special one-character variable names to recognize
  // (e.g., $<, $~, etc). Note that the parser has a special kludge to also
  // recognize them as $(<), etc.
  //
  // The alternative modes must be set manually. The value/values and derived
  // modes automatically expires after the end of the line. The attribute and
  // subscript modes expires after the closing `]`. The variable mode expires
  // after the word token. The eval mode expires after the closing `)`. And
  // the foreign mode expires after the closing braces.
  //
  // Note that normally it is only safe to switch mode when the current token
  // is not quoted (or, more generally, when you are not in the double-quoted
  // mode) unless the mode treats the double-quote as a separator (e.g.,
  // variable name mode). Failed that your mode (which now will be the top of
  // the mode stack) will prevent proper recognition of the closing quote.
  //
  // The `[` token is used for attributes (where it cuts across most of the
  // modes) as well as for value subscript (where it is only recognized after
  // expansions). It is handled with a flag. In the normal mode it is
  // automatically set at the beginning and after each newline. In all other
  // modes it must be explicitly set at points where attribute/subscript is
  // recognized. In all the cases it is automatically reset after lexing the
  // next token (whether `[` or not).

  // Extendable/inheritable enum-like class.
  //
  struct lexer_mode: lexer_mode_base
  {
    using base_type = lexer_mode_base;

    enum
    {
      normal = base_type::value_next,
      cmdvar,
      variable,
      value,
      values,
      case_patterns,
      switch_expressions,
      attributes,
      attribute_value,
      subscript,
      eval,
      single_quoted,
      double_quoted,
      foreign,
      buildspec,

      value_next
    };

    lexer_mode () = default;
    lexer_mode (value_type v): base_type (v) {}
    lexer_mode (base_type v): base_type (v) {}
  };

  class LIBBUILD2_SYMEXPORT lexer:
    public butl::char_scanner<butl::utf8_validator, 2>
  {
  public:
    // If escape is not NULL then only escape sequences with characters from
    // this string are considered "effective escapes" with all others passed
    // through as is. Note that neither the name nor escape arguments are
    // copied.
    //
    lexer (istream& is,
           const path_name& name,
           uint64_t line = 1, // Start line in the stream.
           const char* escapes = nullptr)
      : lexer (is, name, line, escapes, true /* set_mode */) {}

    const path_name&
    name () const {return name_;}

    // Set the lexer mode for the next token or delay this until the end of a
    // double-quoted token sequence is encountered. The second argument can be
    // used to specify the pair separator character (if the mode supports
    // pairs). If escapes is not specified, then inherit the current mode's
    // (though a mode can also override it).
    //
    // Note that there is a common parsing pattern of sensing the language
    // construct kind we are about to parse by reading its first token,
    // switching to an appropriate lexing mode, and then parsing the rest. The
    // problem here is that the first token may start the double-quoted token
    // sequence, turning the lexer into the double-quoted mode. In this case
    // switching the lexer mode right away would not be a good idea. Thus,
    // this function delays the mode switch until the end of the double-quoted
    // sequence is encountered. Note, however, that such a delay only works
    // properly if the function is called right after the first quoted token
    // is read (because any subsequent tokens may end up being parsed in a
    // nested mode such as variable or eval; see mode_impl() for details).
    //
    virtual void
    mode (lexer_mode,
          char pair_separator = '\0',
          optional<const char*> escapes = nullopt,
          uintptr_t data = 0);

    // Enable `[` recognition for the next token.
    //
    void
    enable_lsbrace (bool unsep = false)
    {
      state_.top ().lsbrace = true;
      state_.top ().lsbrace_unsep = unsep;
    }

    // Expire the current mode early or delay this until the end of a
    // double-quoted token sequence is encountered (see mode() for details on
    // the delay condition and reasoning).
    //
    void
    expire_mode ();

    lexer_mode
    mode () const {return state_.top ().mode;}

    uintptr_t
    mode_data () const {return state_.top ().data;}

    char
    pair_separator () const {return state_.top ().sep_pair;}

    // Scanner. Note that it is ok to call next() again after getting eos.
    //
    // If you extend the lexer and add a custom lexer mode, then you must
    // override next() and handle the custom mode there.
    //
    virtual token
    next ();

    // Peek at the first one/two characters of the next token(s). Return the
    // characters or '\0' if either would be eos. Also return an indicator of
    // whether the next token would be separated. Note: cannot be used to peek
    // at the first character of a line.
    //
    // Note also that it assumes that the current mode and the potential new
    // mode in which these characters will actually be parsed use the same
    // whitespace separation (the sep_space and sep_newline values).
    //
    pair<char, bool>
    peek_char ();

    pair<pair<char, char>, bool>
    peek_chars ();

    // As base::get() but in case of an invalid character issue diagnostics
    // and throw failed.
    //
    xchar
    get ();

    // Get previously peeked character (faster).
    //
    void
    get (const xchar&);

    // As base::peek() but in case of an invalid character issue diagnostics
    // and throw failed.
    //
    xchar
    peek ();

  protected:
    struct state
    {
      lexer_mode      mode;
      uintptr_t       data;
      optional<token> hold;

      bool lsbrace;       // Recognize `[`.
      bool lsbrace_unsep; // Recognize it only if unseparated.

      char sep_pair;
      bool sep_space;    // Are whitespaces separators (see skip_spaces())?
      bool sep_newline;  // Is newline special (see skip_spaces())?
      bool quotes;       // Recognize quoted fragments.

      const char* escapes; // Effective escape sequences to recognize.

      // Word separator characters. For two-character sequence put the first
      // one in sep_first and the second one in the corresponding position of
      // sep_second. If it's a single-character sequence, then put space in
      // sep_second. If there are multiple sequences that start with the same
      // character, then repeat the first character in sep_first.
      //
      const char* sep_first;
      const char* sep_second;
    };

    token
    next_eval ();

    token
    next_quoted ();

    token
    next_foreign ();

    // Lex a word assuming current is the top state (which may already have
    // been "expired" from the top).
    //
    virtual token
    word (const state& current, bool separated);

    // Return true in first if we have seen any spaces. Skipped empty lines
    // don't count. In other words, we are only interested in spaces that are
    // on the same line as the following non-space character. Return true in
    // second if we have started skipping spaces from column 1 (note that
    // if this mode does not skip spaces, then second will always be false).
    //
    pair<bool, bool>
    skip_spaces ();

    // Set state for the next token or delay until the end of a double-quoted
    // token sequence is encountered (see mode() for details on the delay
    // condition and reasoning).
    //
    void
    mode_impl (state&&);

    state&
    current_state ()
    {
      assert (!state_.empty ());
      return state_.top ();
    }

    // Diagnostics.
    //
  protected:
    fail_mark fail;

    [[noreturn]] void
    fail_char (const xchar&);

    // Lexer state.
    //
  protected:
    lexer (istream& is, const path_name& name, uint64_t line,
           const char* escapes,
           bool set_mode)
      : char_scanner (is,
                      butl::utf8_validator (butl::codepoint_types::graphic,
                                            U"\n\r\t"),
                      true /* crlf */,
                      line),
        fail ("error", &name),
        name_ (name),
        sep_ (false)
    {
      if (set_mode)
        mode (lexer_mode::normal, '@', escapes);
    }

    const path_name& name_;

    bool sep_; // True if we skipped spaces in peek().

  private:
    // Use current_state(), mode_impl(), and expire_mode().
    //
    std::stack<state> state_;

    using base = char_scanner<butl::utf8_validator, 2>;

    // Buffer for a get()/peek() potential error.
    //
    string ebuf_;
  };
}

// Diagnostics plumbing.
//
namespace butl // ADL
{
  inline build2::location
  get_location (const butl::char_scanner<butl::utf8_validator, 2>::xchar& c,
                const void* data)
  {
    using namespace build2;

    assert (data != nullptr); // E.g., must be &lexer::name_.
    return location (*static_cast<const path_name*> (data), c.line, c.column);
  }
}

#include <libbuild2/lexer.ixx>

#endif // LIBBUILD2_LEXER_HXX