aboutsummaryrefslogtreecommitdiff
path: root/build2/lexer.hxx
blob: a12d26f55bbae8cdb86a9157798d572d6a735a86 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
// file      : build2/lexer.hxx -*- C++ -*-
// copyright : Copyright (c) 2014-2018 Code Synthesis Ltd
// license   : MIT; see accompanying LICENSE file

#ifndef BUILD2_LEXER_HXX
#define BUILD2_LEXER_HXX

#include <stack>

#include <libbutl/char-scanner.mxx>

#include <build2/types.hxx>
#include <build2/utility.hxx>

#include <build2/token.hxx>
#include <build2/diagnostics.hxx>

namespace build2
{
  // Context-dependent lexing mode. In the value mode we don't treat certain
  // characters (e.g., '+', '=') as special so that we can use them in the
  // variable values, e.g., 'foo = g++'. In contrast, in the variable mode, we
  // restrict certain characters (e.g., '/') from appearing in the name. The
  // attribute mode is like value except it doesn't treat '{' and '}' as
  // special (so we cannot have name groups in attributes). The eval mode is
  // used in the evaluation context. Quoted are internal modes and should not
  // be set explicitly.
  //
  // Note that the normal, value, and eval modes split words separated by the
  // pair character (to disable pairs one can pass '\0' as a pair character).
  //
  // The alternative modes must be set manually. The value mode automatically
  // expires after the end of the line. The attribute mode expires after the
  // closing ']'. The variable mode expires after the word token. And the eval
  // mode expires after the closing ')'.
  //
  // Note that normally it is only safe to switch mode when the current token
  // is not quoted (or, more generally, when you are not in the double-quoted
  // mode) unless the mode treats the double-quote as a separator (e.g.,
  // variable name mode). Failing that, your mode (which now will be the top of
  // the mode stack) will prevent proper recognition of the closing quote.
  //

  // Extendable/inheritable enum-like class.
  //
  // Enumerates the lexing modes understood by the lexer. The first enumerator
  // takes its value from lexer_mode_base::value_next and the rest follow
  // sequentially. Our own value_next is the value right after the last mode
  // declared here (presumably the starting point for modes added by classes
  // that extend this one further -- confirm against lexer_mode_base).
  //
  struct lexer_mode: lexer_mode_base
  {
    using base_type = lexer_mode_base;

    enum
    {
      normal = base_type::value_next, // First value comes from the base.
      variable,
      value,
      attribute,
      eval,
      single_quoted,                  // Internal, should not be set explicitly.
      double_quoted,                  // Internal, should not be set explicitly.
      buildspec,

      value_next                      // One past the last mode declared here.
    };

    // Note that the converting constructors are deliberately not explicit so
    // that a plain enumerator (e.g., lexer_mode::normal) can be passed where
    // lexer_mode is expected. The value_type name is not declared here
    // (presumably it comes from lexer_mode_base).
    //
    lexer_mode () = default;
    lexer_mode (value_type v): base_type (v) {}
    lexer_mode (base_type v): base_type (v) {}
  };

  // Lexer built on top of butl::char_scanner that returns the input as a
  // sequence of tokens (see next()). It maintains a stack of states (state_)
  // with the top entry describing the current mode.
  //
  class lexer: public butl::char_scanner
  {
  public:
    // If escapes is not NULL then only escape sequences with characters from
    // this string are considered "effective escapes" with all others passed
    // through as is. Note that the escape string is not copied.
    //
    // This constructor delegates to the protected one below with set_mode
    // being true, which means the initial mode is set to normal with '@' as
    // the pair separator.
    //
    lexer (istream& is,
           const path& name,
           uint64_t line = 1, // Start line in the stream.
           const char* escapes = nullptr)
        : lexer (is, name, line, escapes, true /* set_mode */) {}

    // Return the name that was passed to the constructor (stored as a copy).
    //
    const path&
    name () const {return name_;}

    // Note: sets mode for the next token. The second argument can be used to
    // specify the pair separator character (if the mode supports pairs). If
    // escapes not specified, then inherit the current mode's (though a mode
    // can also override it).
    //
    virtual void
    mode (lexer_mode,
          char pair_separator = '\0',
          optional<const char*> escapes = nullopt);

    // Expire the current mode early.
    //
    // This pops the top of the state stack. Note that no check is made here
    // that the stack is not empty.
    //
    void
    expire_mode () {state_.pop ();}

    // Return the current mode (the one at the top of the state stack).
    //
    lexer_mode
    mode () const {return state_.top ().mode;}

    // Return the pair separator character of the current mode.
    //
    char
    pair_separator () const {return state_.top ().sep_pair;}

    // Scanner. Note that it is ok to call next() again after getting eos.
    //
    // If you extend the lexer and add a custom lexer mode, then you must
    // override next() and handle the custom mode there.
    //
    virtual token
    next ();

    // Peek at the first character of the next token. Return the character
    // or '\0' if the next token will be eos. Also return an indicator of
    // whether the next token will be separated.
    //
    pair<char, bool>
    peek_char ();

  protected:
    // Lexing parameters of a mode. Entries of this type are kept on the
    // state_ stack.
    //
    struct state
    {
      lexer_mode mode;

      char sep_pair;    // Pair separator character (see pair_separator()).
      bool sep_space;   // Are whitespaces separators (see skip_spaces())?
      bool sep_newline; // Is newline special (see skip_spaces())?
      bool quotes;    // Recognize quoted fragments.

      const char* escapes; // Effective escape sequences to recognize.

      // Word separator characters. For two-character sequence put the first
      // one in sep_first and the second one in the corresponding position of
      // sep_second. If it's a single-character sequence, then put space in
      // sep_second. If there are multiple sequences that start with the same
      // character, then repeat the first character in sep_first.
      //
      const char* sep_first;
      const char* sep_second;
    };

    // Presumably the token scanners for the eval and quoted modes,
    // respectively (they are defined elsewhere; confirm against the
    // implementation).
    //
    token
    next_eval ();

    token
    next_quoted ();

    // Lex a word assuming current is the top state (which may already have
    // been "expired" from the top).
    //
    virtual token
    word (state current, bool separated);

    // Return true if we have seen any spaces. Skipped empty lines
    // don't count. In other words, we are only interested in spaces
    // that are on the same line as the following non-space character.
    //
    bool
    skip_spaces ();

    // Diagnostics.
    //
  protected:
    fail_mark fail;

    // Lexer state.
    //
  protected:
    // The actual constructor. If set_mode is true, then set the initial mode
    // to normal with '@' as the pair separator and the specified escapes.
    // Otherwise no mode is set here (presumably so that a derived lexer can
    // set its own initial mode in its constructor; note that the mode() call
    // below is made from the base constructor and so would not dispatch to
    // an override in a derived class).
    //
    // Note also that fail is passed the address of the name_ member as its
    // data.
    //
    lexer (istream& is,
           const path& name,
           uint64_t line,
           const char* escapes,
           bool set_mode)
        : char_scanner (is, true /* crlf */, line),
          fail ("error", &name_),
          name_ (name),
          sep_ (false)
    {
      if (set_mode)
        mode (lexer_mode::normal, '@', escapes);
    }

    const path name_;         // Copy of the name passed to the constructor.
    std::stack<state> state_; // Mode stack; the top is the current mode.

    bool sep_; // True if we skipped spaces in peek().
  };
}

// Diagnostics plumbing.
//
namespace butl // ADL
{
  // Translate a scanned character into a build2 location (file, line, and
  // column). The data argument carries the path naming the input and must
  // not be NULL.
  //
  inline build2::location
  get_location (const butl::char_scanner::xchar& c, const void* data)
  {
    using namespace build2;

    // The opaque data is expected to be a path, for example, the address of
    // the lexer's name_ member.
    //
    assert (data != nullptr);
    const path* file (static_cast<const path*> (data));

    return location (file, c.line, c.column);
  }
}

#endif // BUILD2_LEXER_HXX