// file      : build2/lexer -*- C++ -*-
// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
// license   : MIT; see accompanying LICENSE file

#ifndef BUILD2_LEXER
#define BUILD2_LEXER

#include <stack>

#include <butl/char-scanner>

#include <build2/types>
#include <build2/utility>

#include <build2/token>
#include <build2/diagnostics>

namespace build2
{
  // Context-dependent lexing mode. In the value mode we don't treat certain
  // characters (e.g., '+', '=') as special so that we can use them in the
  // variable values, e.g., 'foo = g++'. In contrast, in the variable mode, we
  // restrict certain characters (e.g., '/') from appearing in the name. The
  // attribute mode is like value except it doesn't treat '{' and '}' as
  // special (so we cannot have name groups in attributes). The eval mode is
  // used in the evaluation context. Quoted are internal modes and should not
  // be set explicitly.
  //
  // Note that the normal, value, and eval modes split words separated by the
  // pair character (to disable pairs one can pass '\0' as a pair character).
  //
  // The alternative modes must be set manually. The value mode automatically
  // expires after the end of the line. The attribute mode expires after the
  // closing ']'. The variable mode expires after the word token. And the eval
  // mode expires after the closing ')'.
  //
  // Note that normally it is only safe to switch mode when the current token
  // is not quoted (or, more generally, when you are not in the double-quoted
  // mode) unless the mode treats the double-quote as a separator (e.g.,
  // variable name mode). Failing that, your mode (which now will be the top of
  // the mode stack) will prevent proper recognition of the closing quote.
  //

  // Extendable/inheritable enum-like class.
  //
  // The enumerators defined here continue the numbering of the base: the
  // first one (normal) is assigned base_type::value_next. In the same way,
  // the trailing value_next enumerator is not a mode itself but the first
  // value past the modes defined here.
  //
  struct lexer_mode: lexer_mode_base
  {
    using base_type = lexer_mode_base;

    enum
    {
      normal = base_type::value_next,
      variable,
      value,
      attribute,
      eval,
      single_quoted,
      double_quoted,

      value_next // Must stay last (one past the last mode above).
    };

    // The converting constructors are intentionally implicit so that an
    // enumerator (or a base mode) can be passed directly where lexer_mode
    // is expected, e.g., mode (lexer_mode::normal, ...). Note that
    // value_type is not declared here and comes from lexer_mode_base.
    //
    lexer_mode () = default;
    lexer_mode (value_type v): base_type (v) {}
    lexer_mode (base_type v): base_type (v) {}
  };

  // Lexer: turns the input stream into a sequence of tokens (see next()).
  // The character-level scanning facilities are inherited (protected) from
  // butl::char_scanner. The current lexing mode is the top entry of a stack
  // of states (state_).
  //
  class lexer: protected butl::char_scanner
  {
  public:
    // If escapes is not NULL then only escape sequences with characters from
    // this string are considered "effective escapes" with all others passed
    // through as is. Note that the escape string is not copied.
    //
    // The name is the path of the input being lexed; it is copied into name_
    // and the address of that copy is what is passed to fail as the
    // diagnostics data. This constructor delegates to the protected one with
    // the set-mode flag on and so always starts in the normal mode with '@'
    // as the pair separator.
    //
    // NOTE(review): processor is only stored by the constructor; presumably
    // it is applied to tokens by the implementation -- confirm there.
    //
    lexer (istream& is,
           const path& name,
           const char* escapes = nullptr,
           void (*processor) (token&, const lexer&) = nullptr)
        : lexer (is, name, escapes, processor, true) {}

    // Name (path) of the input as passed to the constructor.
    //
    const path&
    name () const {return name_;}

    // Note: sets mode for the next token. The second argument can be used to
    // specify the pair separator character (if the mode supports pairs). If
    // escapes are not specified, then inherit the current mode's (though a
    // mode can also override it).
    //
    virtual void
    mode (lexer_mode,
          char pair_separator = '\0',
          optional<const char*> escapes = nullopt);

    // Expire the current mode early.
    //
    // This pops the top of the state stack unconditionally, so there must be
    // a mode to expire.
    //
    void
    expire_mode () {state_.pop ();}

    // Current mode, that is, the mode of the state at the top of the stack.
    //
    lexer_mode
    mode () const {return state_.top ().mode;}

    // Pair separator character of the current (top) state.
    //
    char
    pair_separator () const {return state_.top ().sep_pair;}

    // Scanner. Note that it is ok to call next() again after getting eos.
    //
    token
    next ();

    // Peek at the first character of the next token. Return the character
    // or '\0' if the next token will be eos. Also return an indicator of
    // whether the next token will be separated.
    //
    pair<char, bool>
    peek_char ();

  protected:
    // Per-mode lexing parameters. One such entry is kept on the state stack
    // for each mode that is currently in effect.
    //
    struct state
    {
      lexer_mode mode;

      char sep_pair;
      bool sep_space; // Are whitespaces separators (see skip_spaces())?
      bool quotes;    // Recognize quoted fragments.

      const char* escapes; // Effective escape sequences to recognize.

      // Word separator characters. For two-character sequence put the first
      // one in sep_first and the second one in the corresponding position of
      // sep_second. If it's a single-character sequence, then put space in
      // sep_second. If there are multiple sequences that start with the same
      // character, then repeat the first character in sep_first.
      //
      const char* sep_first;
      const char* sep_second;
    };

    // If you extend the lexer and add a custom lexer mode, then you must
    // override next_impl() and handle the custom mode there.
    //
    virtual token
    next_impl ();

    // Mode-specific scanners for the eval and the quoted modes.
    //
    token
    next_eval ();

    token
    next_quoted ();

    // Lex a word assuming current is the top state (which may already have
    // been "expired" from the top).
    //
    // Note that the state is passed by value (a copy).
    //
    virtual token
    word (state current, bool separated);

    // Return true if we have seen any spaces. Skipped empty lines
    // don't count. In other words, we are only interested in spaces
    // that are on the same line as the following non-space character.
    //
    bool
    skip_spaces ();

    // Diagnostics.
    //
  protected:
    fail_mark fail;

    // Lexer state.
    //
  protected:
    // The actual constructor. If sm (set mode) is true, then enter the
    // normal mode with '@' as the pair separator and e as the escapes.
    //
    // Note that mode() is virtual and a call from a constructor does not
    // dispatch to an override in a derived class. Presumably this is why the
    // flag exists: a derived lexer can pass false and set the initial mode
    // itself -- confirm against the derived lexers.
    //
    lexer (istream& is,
           const path& n,
           const char* e,
           void (*p) (token&, const lexer&),
           bool sm)
        : char_scanner (is),
          // Members are initialized in the declaration order so fail comes
          // before name_ and what it gets here is the address of the not yet
          // constructed name_ (presumably only stored, not dereferenced, at
          // this point).
          //
          fail ("error", &name_),
          name_ (n),
          processor_ (p),
          sep_ (false)
    {
      if (sm)
        mode (lexer_mode::normal, '@', e);
    }

    const path name_; // Own copy of the input name (see name()).
    void (*processor_) (token&, const lexer&); // May be NULL.

    std::stack<state> state_; // Mode stack; top is the current mode.

    bool sep_; // True if we skipped spaces in peek().
  };
}

// Diagnostics plumbing.
//
namespace butl // ADL
{
  // Map a scanner character (its line and column) to a build2 location. The
  // opaque data pointer is interpreted as a pointer to the path of the input
  // and must not be NULL.
  //
  inline build2::location
  get_location (const butl::char_scanner::xchar& c, const void* data)
  {
    using namespace build2;

    // E.g., must be &lexer::name_.
    //
    assert (data != nullptr);

    const path* file (static_cast<const path*> (data));
    return location (file, c.line, c.column);
  }
}

#endif // BUILD2_LEXER