aboutsummaryrefslogtreecommitdiff
path: root/build2/cc/lexer.hxx
blob: 2adf48cd3626b7ebe3903f620e86d66e7d5f117c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
// file      : build2/cc/lexer.hxx -*- C++ -*-
// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
// license   : MIT; see accompanying LICENSE file

#ifndef BUILD2_CC_LEXER_HXX
#define BUILD2_CC_LEXER_HXX

#include <libbutl/sha256.mxx>
#include <libbutl/char-scanner.mxx>

#include <build2/types.hxx>
#include <build2/utility.hxx>

#include <build2/diagnostics.hxx>

namespace build2
{
  namespace cc
  {
    // Preprocessor-level tokenization of C/C++ source. In other words, the
    // sequence of tokens returned is similar to what a real C/C++ compiler
    // would see from its preprocessor.
    //
    // The input is a (partially-)preprocessed translation unit that may still
    // contain comments, line continuations, and preprocessor directives such
    // as #line, #pragma, but not #include (which is diagnosed). Currently,
    // all preprocessor directives except #line are ignored and no values are
    // saved from literals. The #line directive (and its shorthand notation)
    // is recognized to provide the logical token location.
    //
    // While at it we also calculate the checksum of the input ignoring
    // comments, whitespaces, etc. This is used to detect changes that do not
    // alter the resulting token stream.
    //
    enum class token_type
    {
      // NOTE: remember to update operator<<() if changing anything here!
      //
      eos,         // End of input stream.

      // Punctuation that the consumer needs to recognize individually.
      //
      dot,         // .
      semi,        // ;
      lcbrace,     // {
      rcbrace,     // }
      punctuation, // Other punctuation.

      identifier,  // Identifier (keywords are not distinguished at this level).

      number,      // Number literal.
      character,   // Char   literal.
      string,      // String literal.

      other        // Other token.
    };

    // A single preprocessor-level token together with its location.
    //
    struct token
    {
      token_type type = token_type::eos;
      string     value;

      // Logical position as directed by #line directives (falls back to
      // the physical position when none were seen). This is what the
      // diagnostics plumbing below reports.
      //
      path     file;
      uint64_t line   = 0;
      uint64_t column = 0;

      // Physical position in the stream, currently only for identifiers.
      //
      uint64_t position = 0;
    };

    // Output the token value in a format suitable for diagnostics. Must be
    // kept in sync with token_type (see the NOTE on that enum).
    //
    ostream&
    operator<< (ostream&, const token&);

    // The lexer proper. Reads characters via butl::char_scanner and
    // produces the token stream described above, tracking both the
    // physical position (used by the lexer's own diagnostics) and the
    // logical position directed by #line (returned in each token). It
    // also feeds the "meaningful" input into the SHA256 checksum.
    //
    class lexer: protected butl::char_scanner
    {
    public:
      // The name is the physical input file path; it seeds the logical
      // file (until a #line directive overrides it) and is used as the
      // diagnostics location. NOTE(review): the second char_scanner
      // argument appears to disable one of its processing modes --
      // confirm its meaning against the libbutl char_scanner interface.
      //
      lexer (ifdstream& is, const path& name)
          : char_scanner (is, false),
            name_ (name),
            fail ("error", &name_),
            log_file_ (name) {}

      // Physical input file name as passed to the constructor (tokens
      // carry the logical name instead).
      //
      const path&
      name () const {return name_;}

      // Checksum of the input consumed so far, ignoring comments,
      // whitespace, etc. (see the introductory comment). Normally
      // queried after reaching eos.
      //
      string
      checksum () const {return cs_.string ();}

      // Note that it is ok to call next() again after getting eos.
      //
      token
      next ()
      {
        token t;
        next (t, skip_spaces (), true);
        return t;
      }

      // As above but reuse the token to avoid a (potential) memory
      // allocation. Typical usage:
      //
      // for (token t; l.next (t) != token_type::eos; )
      //   ...
      //
      token_type
      next (token& t)
      {
        next (t, skip_spaces (), true);
        return t.type;
      }

    private:
      // Scan the next token starting from the given character. The
      // literal/directive helpers below handle the corresponding
      // constructs once their first character has been seen (all are
      // defined in lexer.cxx).
      //
      void
      next (token&, xchar, bool);

      void
      number_literal (token&, xchar);

      void
      char_literal (token&, xchar);

      void
      string_literal (token&, xchar);

      void
      raw_string_literal (token&, xchar);

      void
      literal_suffix (xchar);

      void
      line_directive (token&, xchar);

      // Skip whitespace. NOTE(review): presumably newline controls
      // whether newlines are skipped as well -- confirm in lexer.cxx.
      //
      xchar
      skip_spaces (bool newline = true);

      // The char_scanner adaptation for newline escape sequence processing
      // (line continuations). Enabled by default and is only disabled in
      // the raw string literals.
      //
    private:
      using base = char_scanner;

      xchar
      peek (bool escape = true);

      xchar
      get (bool escape = true);

      void
      get (const xchar& peeked);

      // Hashing versions: as above but also feed the character into the
      // checksum (cs_).
      //
      xchar
      geth (bool escape = true);

      void
      geth (const xchar& peeked);

    private:
      const path name_;      // Physical input file name.
      const fail_mark fail;  // Diagnostics against the physical position.

      // Logical file and line as set by the #line directives. Note that the
      // lexer diagnostics still use the physical file/lines.
      //
      path               log_file_;
      optional<uint64_t> log_line_;

      // NOTE(review): tmp_file_ looks like a reusable buffer for parsing
      // #line file names -- confirm in lexer.cxx.
      //
      string tmp_file_;
      sha256 cs_;            // Running checksum of the meaningful input.
    };

    // Diagnostics plumbing: expose the (logical) position recorded in a
    // token so the diagnostics machinery can point at it.
    //
    inline location
    get_location (const token& t, const void*)
    {
      location l (&t.file, t.line, t.column);
      return l;
    }
  }
}

#endif // BUILD2_CC_LEXER_HXX