build/lexer


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141

// file      : build/lexer -*- C++ -*-
// copyright : Copyright (c) 2014-2015 Code Synthesis Tools CC
// license   : MIT; see accompanying LICENSE file

#ifndef BUILD_LEXER
#define BUILD_LEXER

#include <string>
#include <iosfwd>
#include <cstdint> // uint64_t
#include <exception>

#include <build/token>
#include <build/diagnostics>

namespace build
{
  // Context-dependent lexing mode:
  //
  // normal   - the mode a lexer starts in.
  // value    - certain characters (e.g., +, =) are not treated as special
  //            so that they can be used in variable values, e.g.,
  //            'foo = g++'.
  // variable - certain characters (e.g., /) are restricted from appearing
  //            in the name.
  // pairs    - just like value except that names separated by the pair
  //            character are split. This mode must be set manually.
  //
  enum class lexer_mode
  {
    normal,
    value,
    variable,
    pairs
  };

  // Lexer that reads characters from an input stream and returns them as
  // tokens via next(). The lexing mode (see lexer_mode) can be switched
  // between tokens with mode().
  //
  class lexer
  {
  public:
    // The stream is stored by reference and therefore must outlive the
    // lexer. The name is passed to the failure diagnostics mark
    // (presumably it identifies the input in error locations).
    //
    lexer (std::istream& is, const std::string& name): is_ (is), fail (name) {}

    // Note: sets mode for the next token. If mode is pairs, then
    // the second argument specifies the separator character.
    //
    // Note also that the separator is stored on every call, regardless
    // of the mode being set.
    //
    void
    mode (lexer_mode m, char pair_separator = '=')
    {
      next_mode_ = m;
      pair_separator_ = pair_separator;
    }

    // Return the current mode (not the one pending for the next token).
    //
    lexer_mode
    mode () const {return mode_;}

    // Scanner.
    //
    token
    next ();

  private:
    // A character value (as the traits' int_type, so that it can also
    // hold eof) together with a line and column.
    //
    class xchar
    {
    public:
      using traits_type = std::char_traits<char>;
      using int_type = traits_type::int_type;
      using char_type = traits_type::char_type;

      xchar (int_type v, std::uint64_t l, std::uint64_t c)
          : v_ (v), l_ (l), c_ (c) {}

      // Implicit conversion to the plain character. Not meaningful if
      // the value is eof (use value() or is_eos() to test for that).
      //
      operator char_type () const {return static_cast<char_type> (v_);}

      // The unconverted value (may be eof).
      //
      int_type
      value () const {return v_;}

      std::uint64_t line () const {return l_;}
      std::uint64_t column () const {return c_;}

    private:
      int_type v_;
      std::uint64_t l_;
      std::uint64_t c_;
    };

    token
    name (xchar, bool separated);

    // Return true if we have seen any spaces. Skipped empty lines don't
    // count. In other words, we are only interested in spaces that
    // are on the same line as the following non-space character.
    //
    bool
    skip_spaces ();

    xchar
    escape ();

    // Character interface.
    //
  private:
    xchar
    peek ();

    xchar
    get ();

    void
    unget (const xchar&);

    // Tests.
    //

    // Return true if the character is the end-of-stream marker (eof).
    //
    bool
    is_eos (const xchar& c) const
    {
      return c.value () == xchar::traits_type::eof ();
    }

    // Diagnostics.
    //
  private:
    // Failure mark that carries the name passed to the lexer constructor
    // and produces a location prologue from a character.
    //
    struct fail_mark_base: build::fail_mark_base<failed>
    {
      fail_mark_base (const std::string& n): name_ (n) {}

      location_prologue
      operator() (const xchar&) const;

      std::string name_;
    };
    using fail_mark = diag_mark<fail_mark_base>;

  private:
    std::istream& is_;
    fail_mark fail;

    // Line and column counters, both starting at 1 (presumably the
    // position of the next character to be read; see get()).
    //
    std::uint64_t l_ {1};
    std::uint64_t c_ {1};

    // Presumably a one-character put-back buffer used by unget().
    //
    bool unget_ {false};
    xchar buf_ {0, 0, 0};

    bool eos_ {false};
    lexer_mode mode_ {lexer_mode::normal};

    // Initialized to the same default as the mode() argument so that the
    // member never holds an indeterminate value.
    //
    char pair_separator_ {'='};
    lexer_mode next_mode_ {lexer_mode::normal}; // Switch to for next token.

    // Return to after current mode expires. Initialized so that a read
    // before the first assignment yields a defined value.
    //
    lexer_mode prev_mode_ {lexer_mode::normal};
  };
}

#endif // BUILD_LEXER