// file      : libbuild2/lexer.cxx -*- C++ -*-
// license   : MIT; see accompanying LICENSE file

#include <libbuild2/lexer.hxx>

#include <cstring> // strchr()

using namespace std;

namespace build2
{
  using type = token_type;

  // Issue a failure diagnostic for character c using the current contents of
  // ebuf_ as the message text. This function does not return: it is declared
  // [[noreturn]] and the diagnostic record is terminated with endf.
  //
  // NOTE(review): ebuf_ is presumably the error description left behind by
  // the underlying character scanner -- confirm against the header.
  //
  [[noreturn]] void lexer::
  fail_char (const xchar& c)
  {
    fail (c) << ebuf_ << endf;
  }

  // Look ahead at up to two characters that follow any skippable whitespace.
  //
  // The whitespace itself is consumed (via skip_spaces()) but the two
  // characters are not: the first one is extracted only temporarily so that
  // the second one can be peeked at and is then put back. A character that is
  // not available because of the end of stream is reported as '\0'.
  //
  // The second half of the result is the separation flag reported by
  // skip_spaces(). It is also saved back into sep_ since skip_spaces() resets
  // it and the next call would otherwise lose this information.
  //
  // It is asserted that skip_spaces() does not report the start of a line.
  //
  pair<pair<char, char>, bool> lexer::
  peek_chars ()
  {
    const pair<bool, bool> skipped (skip_spaces ());
    assert (!skipped.second);
    sep_ = skipped.first;

    char ch0 ('\0');
    char ch1 ('\0');

    xchar x0 (peek ());

    // Nothing to look at: both characters stay '\0'.
    //
    if (eos (x0))
      return make_pair (make_pair (ch0, ch1), sep_);

    // Temporarily step over the first character to get to the second.
    //
    get (x0);
    ch0 = x0;

    xchar x1 (peek ());
    if (!eos (x1))
      ch1 = x1;

    unget (x0);

    return make_pair (make_pair (ch0, ch1), sep_);
  }

  // Push a new lexer state for mode m onto the state stack.
  //
  // ps   - pair separator character for the new state (must be '\0' for the
  //        quoted, foreign, and variable modes).
  // esc  - effective escape sequences; if absent, they are inherited from
  //        the state currently at the top of the stack (which must exist).
  // data - auxiliary, mode-specific data (for example, it must be greater
  //        than 1 for the foreign mode and it controls newline recognition
  //        in the buildspec mode).
  //
  // For each mode we select two parallel strings of equal length: the first
  // lists the characters that separate words in this mode and the second
  // gives, at the same index, the character that must follow for the
  // separator to be recognized (a space means "no second character needed").
  // See the separator lookup in word() for how they are consumed.
  //
  void lexer::
  mode (lexer_mode m, char ps, optional<const char*> esc, uintptr_t data)
  {
    // The locals below are passed positionally to the state initializer at
    // the end of the function.
    //
    const char* sep_first  (nullptr);
    const char* sep_second (nullptr);

    bool lsbrace     (false); // Recognize `[`.
    bool sep_space   (true);  // Spaces.
    bool sep_newline (true);  // Newlines.
    bool quotes      (true);  // Quotes.

    // No escapes specified: inherit from the current state.
    //
    if (!esc)
    {
      assert (!state_.empty ());
      esc = state_.top ().escapes;
    }

    switch (m)
    {
    case lexer_mode::normal:
    case lexer_mode::cmdvar:
      // `%` is deliberately absent: it is only special as the first
      // character of a line and is handled separately.
      //
      sep_first  = ":<>=+? $(){}#\t\n";
      sep_second = "    ==         ";
      lsbrace = true;
      break;

    case lexer_mode::value:
      sep_first  = " $(){}#\t\n";
      sep_second = "         ";
      break;

    case lexer_mode::values:
      sep_first  = " $(){},#\t\n";
      sep_second = "          ";
      break;

    case lexer_mode::switch_expressions:
      sep_first  = " $(){},:#\t\n";
      sep_second = "           ";
      break;

    case lexer_mode::case_patterns:
      sep_first  = " $(){},|:#\t\n";
      sep_second = "            ";
      break;

    case lexer_mode::attributes:
      sep_first  = " $()=,]#\t\n";
      sep_second = "          ";
      break;

    case lexer_mode::attribute_value:
      sep_first  = " $(),]#\t\n";
      sep_second = "         ";
      break;

    case lexer_mode::subscript:
      sep_first  = " $()]#\t\n";
      sep_second = "        ";
      break;

    case lexer_mode::eval:
      // NOTE: when adding a new single-character token to the eval mode,
      //       the special() lambda in parse_names() must be updated as well.
      //
      sep_first  = ":<>=!&|?,` $(){}#\t\n";
      sep_second = "   = &             ";
      break;

    case lexer_mode::buildspec:
      // This is the value mode with two differences:
      //
      // 1. '(' is returned as a separated token as long as the state stack
      //    is at most 3 deep (the initial state plus two buildspec states;
      //    see parse_buildspec() for details).
      //
      // 2. Comma is recognized.
      //
      // The mode is shared between the command line buildspec and ad hoc
      // recipe actions, so whether newlines are recognized as tokens is
      // decided by the caller via the auxiliary data.
      //
      sep_first  = " $(){},\t\n";
      sep_second = "         ";
      sep_newline = (data != 0);
      break;

    case lexer_mode::foreign:
      assert (data > 1);
      // Fall through.
    case lexer_mode::single_quoted:
    case lexer_mode::double_quoted:
      // No space skipping inside quoted/verbatim text.
      //
      assert (ps == '\0');
      sep_space = false;
      break;

    case lexer_mode::variable:
      // Variable names are lexed in an ad hoc way in word(), no separator
      // tables are needed.
      //
      assert (ps == '\0');
      break;

    default: assert (false); // Unhandled custom mode.
    }

    state_.push (
      state {m,
             data,
             nullopt,
             lsbrace,
             false,
             ps,
             sep_space,
             sep_newline,
             quotes,
             *esc,
             sep_first,
             sep_second});
  }

  // Extract and return the next token according to the current mode.
  //
  // The eval, double_quoted, and foreign modes have dedicated implementations
  // that are dispatched to right away. The remaining built-in modes are
  // handled here: whitespace is skipped, the next character is matched
  // against the characters that are special in the mode, and anything that
  // is not special is handed over to word().
  //
  // NOTE: remember to update mode() and next_eval() if adding any new special
  // characters.
  //
  token lexer::
  next ()
  {
    state& st (state_.top ());
    const lexer_mode m (st.mode);

    // Dispatch to the dedicated implementation of next(), if any.
    //
    switch (m)
    {
    case lexer_mode::eval:          return next_eval ();
    case lexer_mode::double_quoted: return next_quoted ();
    case lexer_mode::foreign:       return next_foreign ();

    case lexer_mode::normal:
    case lexer_mode::cmdvar:
    case lexer_mode::value:
    case lexer_mode::values:
    case lexer_mode::switch_expressions:
    case lexer_mode::case_patterns:
    case lexer_mode::attributes:
    case lexer_mode::attribute_value:
    case lexer_mode::subscript:
    case lexer_mode::variable:
    case lexer_mode::buildspec:     break;

    default:                        assert (false); // Unhandled custom mode.
    }

    const pair<bool, bool> skipped (skip_spaces ());
    bool sep (skipped.first);          // Separated from previous character.
    const bool first (skipped.second); // First non-whitespace on the line.

    xchar c (get ());
    uint64_t ln (c.line), cn (c.column);

    // Note that sep is captured by reference: a couple of cases below force
    // it to true before making the token.
    //
    auto make_token = [&sep, ln, cn] (type t, string v = string ())
    {
      return token (t, move (v),
                    sep, quote_type::unquoted, false,
                    ln, cn, token_printer);
    };

    // `[` recognition is a one-shot flag: clear it before anything else so
    // that it is reset no matter which token we end up returning.
    //
    if (st.lsbrace)
    {
      st.lsbrace = false;

      if (c == '[' && !(st.lsbrace_unsep && sep))
        return make_token (type::lsbrace);
    }

    if (eos (c))
      return make_token (type::eos);

    // Pair separator.
    //
    if (c == st.sep_pair)
      return make_token (type::pair_separator, string (1, c));

    // Characters that are special in every mode handled by this function.
    //
    switch (c)
    {
    case '$': return make_token (type::dollar);
    case ')': return make_token (type::rparen);
    case '(':
      {
        // In the buildspec mode the left paren is always separated (but
        // only while the state stack is shallow enough).
        //
        if (m == lexer_mode::buildspec && state_.size () <= 3)
          sep = true;

        return make_token (type::lparen);
      }
    case '\n':
      {
        // The value-like modes expire at the end of the line.
        //
        const bool expire (m == lexer_mode::value              ||
                           m == lexer_mode::values             ||
                           m == lexer_mode::switch_expressions ||
                           m == lexer_mode::case_patterns);
        if (expire)
          state_.pop ();

        // Whatever is now at the top: if it is the normal mode, re-enable
        // `[` recognition (attributes) for the next line. This should never
        // be needed in cmdvar.
        //
        state& ts (state_.top ());
        if (ts.mode == lexer_mode::normal)
        {
          ts.lsbrace = true;
          ts.lsbrace_unsep = false;
        }

        sep = true; // A newline is always considered separated.
        return make_token (type::newline);
      }
    }

    // Tokens that are only recognized at the beginning of a line in the
    // normal mode. This must be done before the generic `{` handling below.
    //
    if (m == lexer_mode::normal && first)
    {
      if (c == '%')
        return make_token (type::percent);

      if (c == '{')
      {
        // Collect any additional opening braces that follow. If there are
        // none, then this is an ordinary `{` which is handled below.
        //
        string braces;
        while (peek () == '{')
          braces += get ();

        if (!braces.empty ())
        {
          braces += '{';
          return make_token (type::multi_lcbrace, move (braces));
        }
      }
    }

    // Mode groups used by the character dispatch below.
    //
    const bool norm (m == lexer_mode::normal ||
                     m == lexer_mode::cmdvar);

    const bool attr (m == lexer_mode::attributes ||
                     m == lexer_mode::attribute_value);

    const bool bracketed (attr || m == lexer_mode::subscript);

    const bool cases (m == lexer_mode::switch_expressions ||
                      m == lexer_mode::case_patterns);

    // The remaining special characters, each qualified by the modes in which
    // it is special. If the character is not special in this mode, we fall
    // out of the switch and treat it as the beginning of a word.
    //
    switch (c)
    {
    case '{':
      {
        if (!bracketed)
          return make_token (type::lcbrace);
        break;
      }
    case '}':
      {
        if (!bracketed)
          return make_token (type::rcbrace);
        break;
      }
    case ']':
      {
        if (bracketed)
        {
          state_.pop (); // The mode expires after the closing `]`.
          return make_token (type::rsbrace);
        }
        break;
      }
    case ':':
      {
        if (norm || cases)
          return make_token (type::colon);
        break;
      }
    case '=':
      {
        if (m == lexer_mode::attributes)
          return make_token (type::assign);

        if (norm)
        {
          // Either `=+` or plain `=`.
          //
          if (peek () == '+')
          {
            get ();
            return make_token (type::prepend);
          }

          return make_token (type::assign);
        }
        break;
      }
    case '+':
      {
        // Only special as part of `+=`.
        //
        if (norm && peek () == '=')
        {
          get ();
          return make_token (type::append);
        }
        break;
      }
    case '?':
      {
        // Only special as part of `?=`.
        //
        if (norm && peek () == '=')
        {
          get ();
          return make_token (type::default_assign);
        }
        break;
      }
    case '<':
      {
        if (norm)
          return make_token (type::labrace);
        break;
      }
    case '>':
      {
        if (norm)
          return make_token (type::rabrace);
        break;
      }
    case ',':
      {
        if (m == lexer_mode::buildspec ||
            m == lexer_mode::values    ||
            cases                      ||
            attr)
          return make_token (type::comma);
        break;
      }
    case '|':
      {
        if (m == lexer_mode::case_patterns)
          return make_token (type::bit_or);
        break;
      }
    }

    // Not a special character so it must be a word.
    //
    unget (c);
    return word (st, sep);
  }

  // Extract and return the next token in the eval mode.
  //
  // With regards to special characters this mode is a lot like the value
  // mode, except that it adds a number of its own (comparison and logical
  // operators, `?`, `,`, and the backtick). Reaching the end of stream or a
  // newline inside an evaluation context is an error. The closing `)` expires
  // the mode.
  //
  // NOTE: remember to update mode() if adding any new special characters.
  //
  token lexer::
  next_eval ()
  {
    bool sep (skip_spaces ().first);
    xchar c (get ());

    if (eos (c))
      fail (c) << "unterminated evaluation context";

    state& st (state_.top ());

    uint64_t ln (c.line), cn (c.column);

    auto make_token = [sep, ln, cn] (type t, string v = string ())
    {
      return token (t, move (v),
                    sep, quote_type::unquoted, false,
                    ln, cn, token_printer);
    };

    // If the next character is `second`, consume it and return true.
    // Otherwise leave the stream alone and return false.
    //
    auto followed_by = [this] (char second) -> bool
    {
      if (peek () == second)
      {
        get ();
        return true;
      }

      return false;
    };

    // `[` recognition is a one-shot flag: clear it before anything else so
    // that it is reset no matter which token we end up returning.
    //
    if (st.lsbrace)
    {
      st.lsbrace = false;

      if (c == '[' && !(st.lsbrace_unsep && sep))
        return make_token (type::lsbrace);
    }

    // Pair separator.
    //
    if (c == st.sep_pair)
      return make_token (type::pair_separator, string (1, c));

    switch (c)
    {
    case '\n': fail (c) << "newline in evaluation context" << endf;

      // Single-character tokens.
      //
    case ':': return make_token (type::colon);
    case '{': return make_token (type::lcbrace);
    case '}': return make_token (type::rcbrace);
    case '$': return make_token (type::dollar);
    case '?': return make_token (type::question);
    case ',': return make_token (type::comma);
    case '`': return make_token (type::backtick);
    case '(': return make_token (type::lparen);
    case ')':
      {
        state_.pop (); // The eval mode expires at the closing paren.
        return make_token (type::rparen);
      }

      // Characters that are a token on their own and also start a
      // two-character token.
      //
    case '|':
      return make_token (followed_by ('|') ? type::log_or : type::bit_or);
    case '<':
      return make_token (followed_by ('=') ? type::less_equal : type::less);
    case '>':
      return make_token (
        followed_by ('=') ? type::greater_equal : type::greater);
    case '!':
      return make_token (followed_by ('=') ? type::not_equal : type::log_not);

      // Characters that are only special as the first half of a
      // two-character token. If the second half is missing, fall out of the
      // switch and treat the character as part of a word.
      //
    case '&':
      {
        if (followed_by ('&'))
          return make_token (type::log_and);
        break;
      }
    case '=':
      {
        if (followed_by ('='))
          return make_token (type::equal);
        break;
      }
    }

    // Not a special character so it must be a word.
    //
    unget (c);
    return word (st, sep);
  }

  // Extract and return the next token inside a double-quoted sequence.
  //
  // Only `$` and `(` are returned as tokens of their own (marked as
  // double-quoted and not separated); everything else is passed on to word()
  // with the current state. Reaching the end of stream here means that the
  // closing quote is missing, which is an error.
  //
  token lexer::
  next_quoted ()
  {
    xchar c (get ());

    if (eos (c))
      fail (c) << "unterminated double-quoted sequence";

    uint64_t ln (c.line), cn (c.column);

    if (c == '$' || c == '(')
      return token (c == '$' ? type::dollar : type::lparen,
                    false, quote_type::double_,
                    ln, cn, token_printer);

    // Not a special character so it must be a word.
    //
    unget (c);
    return word (state_.top (), false);
  }

  // Extract and return the next token in the foreign mode, that is, the
  // verbatim text that follows an opening `{{...` up to the matching closing
  // braces.
  //
  // The number of closing braces to expect is taken from the state's
  // auxiliary data. The closing braces are only recognized at the beginning
  // of a line (optionally preceded by spaces/tabs) and only if followed by
  // nothing but spaces/tabs and then a newline, `#`, or the end of stream.
  //
  // On success the accumulated text (without the closing braces line) is
  // returned as a word token while the multi_rcbrace token is saved in the
  // state's hold member. The next call then returns the held token and
  // expires the foreign mode. If the end of stream is reached without seeing
  // the closing braces, the eos token is returned (and the accumulated text
  // is discarded).
  //
  token lexer::
  next_foreign ()
  {
    state& st (state_.top ());

    // Second call: return the closing braces token saved by the first call.
    //
    if (st.hold)
    {
      token r (move (*st.hold));
      state_.pop (); // Expire foreign mode.
      return r;
    }

    auto count (state_.top ().data); // Number of closing braces to expect.

    xchar c (get ()); // First character of first line after `{{...`.
    uint64_t ln (c.line), cn (c.column);

    string lexeme;
    for (bool first (true); !eos (c); c = get ())
    {
      // If this is the first character of a line, recognize closing braces.
      //
      if (first)
      {
        first = false;

        // If this turns out not to be the closing braces, we need to add any
        // characters we have extracted to lexeme. Instead of saving these
        // characters in a temporary we speculatively add them to the lexeme
        // but then chop them off if this turned out to be the closing braces.
        //
        size_t chop (lexeme.size ());

        // Skip leading whitespaces, if any.
        //
        for (; c == ' ' || c == '\t'; c = get ())
          lexeme += c;

        uint64_t bln (c.line), bcn (c.column); // Position of first `}`.

        // Count braces.
        //
        // Note that we stop right after extracting the last expected brace
        // (without extracting the character that follows it).
        //
        auto i (count);
        for (; c == '}'; c = get ())
        {
          lexeme += c;

          if (--i == 0)
            break;
        }

        if (i == 0) // Got enough braces.
        {
          // Make sure there are only whitespaces/comments after. Note that
          // now we must start peeking since newline is not "ours".
          //
          for (c = peek (); c == ' ' || c == '\t'; c = peek ())
            lexeme += get ();

          if (c == '\n' || c == '#' || eos (c))
          {
            // Save the closing braces token to be returned by the next call.
            //
            st.hold = token (type::multi_rcbrace,
                             string (count, '}'),
                             false, quote_type::unquoted, false,
                             bln, bcn,
                             token_printer);

            // Drop the speculatively added closing braces line and return
            // the text accumulated before it.
            //
            lexeme.resize (chop);
            return token (move (lexeme),
                          false, quote_type::unquoted, false,
                          ln, cn);
          }

          // Something else follows the braces so this is ordinary text.
          // Extract the peeked character (already in c) so that it is
          // handled as any other character below.
          //
          get (); // And fall through (not eos).
        }
        else
        {
          // Not enough braces. Here c is the first character that is not a
          // brace (already extracted) or eos.
          //
          if (eos (c))
            break;

          // Fall through.
        }
      }

      if (c == '\n')
        first = true;

      lexeme += c;
    }

    // Reached the end of stream without seeing the closing braces.
    //
    return token (type::eos, false, c.line, c.column, token_printer);
  }

  // Lex a word starting at the current position and return it as a token.
  //
  // st  - the state to lex the word in. It is passed by value since it is
  //       replaced with the new top of the state stack every time we enter
  //       or leave the double-quoted mode while accumulating the same word.
  // sep - the separation flag to store in the resulting token.
  //
  // The word ends at the end of stream or at the first character (or
  // character sequence) that is a separator in the current mode. Escape
  // sequences as well as single and double-quoted fragments are processed
  // along the way with the resulting token recording the quoting type and
  // whether the word is a single, completely quoted fragment (qcomp).
  //
  // The caller must make sure we are not at the end of stream (asserted).
  // If we finish in the variable mode, it is expired before returning. It is
  // an error to reach the end of stream while still in the double-quoted
  // mode.
  //
  token lexer::
  word (state st, bool sep)
  {
    lexer_mode m (st.mode);

    xchar c (peek ());
    assert (!eos (c));

    uint64_t ln (c.line), cn (c.column);

    string lexeme;
    quote_type qtype (m == lexer_mode::double_quoted
                      ? quote_type::double_
                      : quote_type::unquoted);

    // If we are already in the quoted mode then we didn't start with the
    // quote character.
    //
    bool qcomp (false);

    // Append a character that was not inside a single-quoted fragment. Note
    // that m is captured by reference so we see the current mode.
    //
    auto append = [&lexeme, &m, &qcomp] (char c)
    {
      lexeme += c;

      // An unquoted character after a quoted fragment.
      //
      if (qcomp && m != lexer_mode::double_quoted)
        qcomp = false;
    };

    for (; !eos (c); c = peek ())
    {
      // First handle escape sequences.
      //
      if (c == '\\')
      {
        // In the variable mode we treat the beginning of the escape sequence
        // as a separator (think \"$foo\").
        //
        if (m == lexer_mode::variable)
          break;

        get ();
        xchar p (peek ());

        // A NULL escapes string means every escape sequence is effective. An
        // empty one means none is. Otherwise only the listed characters can
        // be escaped.
        //
        const char* esc (st.escapes);

        if (esc == nullptr ||
            (*esc != '\0' && !eos (p) && strchr (esc, p) != nullptr))
        {
          get ();

          if (eos (p))
            fail (p) << "unterminated escape sequence";

          if (p != '\n') // Ignore if line continuation.
            append (p);

          continue;
        }
        else
          unget (c); // Treat as a normal character.
      }

      bool done (false);

      // Next take care of the double-quoted mode. This one is tricky since
      // we push/pop modes while accumulating the same lexeme for example:
      //
      // foo" bar "baz
      //
      if (m == lexer_mode::double_quoted)
      {
        switch (c)
        {
          // Only these two characters are special in the double-quoted mode.
          //
        case '$':
        case '(':
          {
            done = true;
            break;
          }
          // End quote.
          //
        case '\"':
          {
            get ();
            state_.pop ();

            // Continue in whatever mode we were in before the quote.
            //
            st = state_.top ();
            m = st.mode;
            continue;
          }
        }
      }
      // We also handle the variable mode in an ad hoc way.
      //
      else if (m == lexer_mode::variable)
      {
        bool first (lexeme.empty ());

        // Handle special variable names, if any.
        //
        // If non-zero, the auxiliary data is interpreted as a C-string of
        // characters each of which is a complete (single-character) name.
        //
        if (first        &&
            st.data != 0 &&
            strchr (reinterpret_cast<const char*> (st.data), c) != nullptr)
        {
          get ();
          lexeme += c;
          done = true;
        }
        else if (c != '_' && !(first ? alpha (c) : alnum (c)))
        {
          if (c != '.')
            done = true;
          else
          {
            // Normally '.' is part of the variable (namespace separator)
            // unless it is trailing (think $major.$minor).
            //
            get ();
            xchar p (peek ());
            done = eos (p) || !(alpha (p) ||  p == '_');
            unget (c);
          }
        }
      }
      else
      {
        // First check if it's a pair separator.
        //
        if (c == st.sep_pair)
          done = true;
        else
        {
          // Then see if this character or character sequence is a separator.
          //
          // The same character may be listed more than once (with different
          // second characters) so check every occurrence until one matches.
          //
          for (const char* p (strchr (st.sep_first, c));
               p != nullptr;
               p = done ? nullptr : strchr (p + 1, c))
          {
            char s (st.sep_second[p - st.sep_first]);

            // See if it has a second.
            //
            if (s != ' ')
            {
              get ();
              done = (peek () == s);
              unget (c);
            }
            else
              done = true;
          }
        }

        // Handle single and double quotes if enabled for this mode and unless
        // they were considered separators.
        //
        if (st.quotes && !done)
        {
          switch (c)
          {
          case '\'':
            {
              // Enter the single-quoted mode in case the derived lexer needs
              // to notice this.
              //
              mode (lexer_mode::single_quoted);

              switch (qtype)
              {
              case quote_type::unquoted:
                qtype = quote_type::single;
                qcomp = lexeme.empty ();
                break;
              case quote_type::single:
                qcomp = false; // Non-contiguous.
                break;
              case quote_type::double_:
                qtype = quote_type::mixed;
                // Fall through.
              case quote_type::mixed:
                qcomp = false;
                break;
              }

              // Skip the opening quote and append everything up to the
              // closing one verbatim (no escape processing).
              //
              get ();
              for (c = get (); !eos (c) && c != '\''; c = get ())
                lexeme += c;

              if (eos (c))
                fail (c) << "unterminated single-quoted sequence";

              state_.pop ();
              continue;
            }
          case '\"':
            {
              get ();

              // Enter the double-quoted mode and continue accumulating the
              // same lexeme in it.
              //
              mode (lexer_mode::double_quoted);
              st = state_.top ();
              m = st.mode;

              switch (qtype)
              {
              case quote_type::unquoted:
                qtype = quote_type::double_;
                qcomp = lexeme.empty ();
                break;
              case quote_type::double_:
                qcomp = false; // Non-contiguous.
                break;
              case quote_type::single:
                qtype = quote_type::mixed;
                // Fall through.
              case quote_type::mixed:
                qcomp = false;
                break;
              }

              continue;
            }
          }
        }
      }

      if (done)
        break;

      get ();
      append (c);
    }

    if (m == lexer_mode::double_quoted)
    {
      if (eos (c))
        fail (c) << "unterminated double-quoted sequence";

      // If we are still in the quoted mode then we didn't end with the quote
      // character.
      //
      if (qcomp)
        qcomp = false;
    }

    // Expire variable mode at the end of the word.
    //
    if (m == lexer_mode::variable)
      state_.pop ();

    return token (move (lexeme), sep, qtype, qcomp, ln, cn);
  }

  // Skip whitespaces, comments, and line continuations before the next
  // token.
  //
  // Return a pair of flags. The first is true if the next character is
  // separated from the previous one (that is, something was skipped or a
  // separation was left pending in sep_, which is reset by this call). The
  // second is true if the skipping started at the first column of a line.
  //
  // In modes that do not skip spaces nothing is consumed and the second flag
  // is always false. In modes that recognize newlines as tokens, a newline
  // that ends a non-empty line is not consumed (empty lines are skipped).
  //
  pair<bool, bool> lexer::
  skip_spaces ()
  {
    bool r (sep_);
    sep_ = false;

    const state& s (state_.top ());

    // In some special modes we don't skip spaces.
    //
    if (!s.sep_space)
      return make_pair (r, false);

    xchar c (peek ());
    bool start (c.column == 1);

    // Note that in this loop we peek at a character, decide what to do, and
    // only then extract it at the end of the iteration (unless the case below
    // returns or continues).
    //
    for (; !eos (c); c = peek ())
    {
      switch (c)
      {
      case ' ':
      case '\t':
        {
          r = true;
          break;
        }
      case '\n':
        {
          // In some modes we treat newlines as ordinary spaces.
          //
          // Note that in this case we don't adjust start.
          //
          if (!s.sep_newline)
          {
            r = true;
            break;
          }

          // Skip empty lines.
          //
          if (start)
          {
            r = false;
            break;
          }

          // Leave the newline for the caller to return as a token.
          //
          return make_pair (r, start);
        }
      case '#':
        {
          r = true;
          get ();

          // See if this is a multi-line comment in the form:
          //
          /*
             #\
             ...
             #\
          */
          // The lambda below expects `#` to have been extracted. It leaves
          // the last peeked character in c and, if it returns true, the
          // backslash has been extracted but the newline has not.
          //
          auto ml = [&c, this] () -> bool
          {
            if ((c = peek ()) == '\\')
            {
              get ();
              if ((c = peek ()) == '\n')
                return true;
            }

            return false;
          };

          if (ml ())
          {
            // Scan until we see the closing one.
            //
            for (; !eos (c); c = peek ())
            {
              get ();
              if (c == '#' && ml ())
                break;
            }

            if (eos (c))
              fail (c) << "unterminated multi-line comment";
          }
          else
          {
            // Read until newline or eos.
            //
            for (; !eos (c) && c != '\n'; c = peek ())
              get ();
          }

          // The character we have stopped at (newline) has not been extracted
          // so skip the get() call at the end of the loop.
          //
          continue;
        }
      case '\\':
        {
          get ();

          // A backslash followed by a newline is a line continuation: the
          // backslash is already extracted and the newline is extracted at
          // the end of the loop.
          //
          if (peek () == '\n')
            break; // Ignore.

          unget (c);
        }
        // Fall through.
      default:
        return make_pair (r, start); // Not a space.
      }

      get ();
    }

    return make_pair (r, start);
  }
}