// file      : tests/utf8/driver.cxx -*- C++ -*-
// license   : MIT; see accompanying LICENSE file

#include <string>

#include <libbutl/utf8.hxx>
#include <libbutl/utility.hxx>

#undef NDEBUG
#include <cassert>

using namespace std;
using namespace butl;

// Test driver for the UTF-8 utilities: utf8(), utf8_length(), to_utf8(), and
// utf8_validator::codepoint().
//
// Every expectation is expressed as an assert() (kept active by the
// #undef NDEBUG before the <cassert> inclusion), so the first failed check
// terminates the test.
//
int
main ()
{
  // utf8() tests.
  //

  // Return the error description that utf8() stores for the specified string,
  // asserting along the way that the string is rejected. The codepoint types
  // and the whitelist are passed through to utf8() as is.
  //
  auto utf8_error = [] (const string& s,
                        codepoint_types ts = codepoint_types::any,
                        const char32_t* wl = nullptr)
  {
    string error;
    assert (!utf8 (s, error, ts, wl));
    return error;
  };

  // Valid sequences.
  //
  // Empty.
  //
  assert (utf8 (""));

  // 1 code point.
  //
  assert (utf8 ("a"));                // 1 byte.
  assert (utf8 ("\xD0\xB0"));         // 2 bytes.
  assert (utf8 ("\xE4\xBA\x8C"));     // 3 bytes.
  assert (utf8 ("\xF0\x90\x8C\x82")); // 4 bytes.

  // Multiple code points (the four sequences above concatenated).
  //
  assert (utf8 ("a\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82"));

  // Ill-formed sequences.
  //
  // Long sequences.
  //
  assert (!utf8 ("\xF8")); // 5-byte sequence.
  assert (!utf8 ("\xFC")); // 6-byte sequence.

  // The same inputs (plus 0xFE) but now also checking the diagnostics text.
  //
  assert (utf8_error ("\xF8") == "5-byte length UTF-8 sequence");
  assert (utf8_error ("\xFC") == "6-byte length UTF-8 sequence");
  assert (utf8_error ("\xFE") == "invalid UTF-8 sequence first byte (0xFE)");

  // 2-byte sequences.
  //
  assert (!utf8 ("\xC1\x80")); // Invalid first byte.
  assert (!utf8 ("\xD0y"));    // Invalid second byte.

  assert (utf8_error ("\xC1\x80") ==
          "invalid UTF-8 sequence first byte (0xC1)");

  // Note that for a printable offending byte the expected message also
  // contains the character itself.
  //
  assert (utf8_error ("\xD0y") ==
          "invalid UTF-8 sequence second byte (0x79 'y')");

  // 3-byte sequences.
  //
  assert (!utf8 ("\xE2\x70\x80")); // Invalid second byte.
  assert (!utf8 ("\xE2\x80\x70")); // Invalid third byte.

  assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate.
  assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate.

  assert (utf8_error ("\xE2\x80\x70") ==
          "invalid UTF-8 sequence third byte (0x70 'p')");

  // 4-byte sequences.
  //
  assert (!utf8 ("\xF5\x80\x80\x80")); // Invalid first byte.
  assert (!utf8 ("\xF0\x80\x80\x80")); // Invalid second byte.
  assert (!utf8 ("\xF0\x90\x70\x80")); // Invalid third byte.
  assert (!utf8 ("\xF1\x80\x80\xC0")); // Invalid fourth byte.

  // Note that the expected message spells it "forth" and is compared
  // verbatim, so the literal below must not be "corrected".
  //
  assert (utf8_error ("\xF1\x80\x80\xC0") ==
          "invalid UTF-8 sequence forth byte (0xC0)");

  // Incomplete sequences (the string ends before the sequence does).
  //
  assert (!utf8 ("\xD0"));         // 2-byte sequence.
  assert (!utf8 ("\xE4\xBA"));     // 3-byte sequence.
  assert (!utf8 ("\xF0\x90\x8C")); // 4-byte sequence.

  assert (utf8_error ("\xD0") == "incomplete UTF-8 sequence");

  // Missing sequence leading bytes (the string starts with one or more
  // trailing bytes of the valid sequences used above).
  //
  assert (!utf8 ("\xB0xyz"));            // 2-byte sequence.
  assert (!utf8 ("\xBA\x8C\xD0\xB0yz")); // 3-byte sequence.
  assert (!utf8 ("\x8Cxyz"));            // 3-byte sequence.
  assert (!utf8 ("\x90\x8C\x82xyz"));    // 4-byte sequence.
  assert (!utf8 ("\x8C\x82xyz"));        // 4-byte sequence.
  assert (!utf8 ("\x82xyz"));            // 4-byte sequence.

  // A stray trailing byte is expected to be diagnosed as an invalid first
  // byte.
  //
  assert (utf8_error ("\xB0") == "invalid UTF-8 sequence first byte (0xB0)");

  // Above the valid codepoint range (0x10ffff + 1).
  //
  assert (!utf8 ("\xF4\x90\x80\x80"));

  // Expected to be detected at the second byte already.
  //
  assert (utf8_error ("\xF4\x90\x80\x80") ==
          "invalid UTF-8 sequence second byte (0x90)");

  // Whitelisting.
  //
  // NOTE(review): no whitelist is passed here, so this call relies on the
  // utf8() defaults -- confirm that this is the intended check.
  //
  assert (utf8 ("\r\t\n"));

  // Matched codepoint types (each string consists only of codepoints of the
  // type(s) being allowed).
  //
  // Control.
  //
  assert (utf8 ("\r",   codepoint_types::control));
  assert (utf8 ("\x7F", codepoint_types::control));

  // Non-character.
  //
  assert (utf8 ("\xF4\x8F\xBF\xBF", codepoint_types::non_character));
  assert (utf8 ("\xEF\xB7\x90",     codepoint_types::non_character));

  // Private-use.
  //
  assert (utf8 ("\xEE\x80\x80",     codepoint_types::private_use));
  assert (utf8 ("\xF3\xB0\x80\x80", codepoint_types::private_use));

  // Reserved.
  //
  assert (utf8 ("\xF3\xA1\x80\x80", codepoint_types::reserved));
  assert (utf8 ("\xF0\xB0\x80\x80", codepoint_types::reserved));
  assert (utf8 ("\xF3\xA0\x82\x80", codepoint_types::reserved));

  // Format.
  //
  assert (utf8 ("\xC2\xAD",         codepoint_types::format));
  assert (utf8 ("\xD8\x80",         codepoint_types::format));
  assert (utf8 ("\xD8\x81",         codepoint_types::format));
  assert (utf8 ("\xD8\x85",         codepoint_types::format));
  assert (utf8 ("\xF3\xA0\x81\xBF", codepoint_types::format));

  // Graphic.
  //
  assert (utf8 ("\xC2\xAC",         codepoint_types::graphic));
  assert (utf8 ("\xC2\xAE",         codepoint_types::graphic));
  assert (utf8 ("\xD8\x86",         codepoint_types::graphic));
  assert (utf8 ("\xF3\xA0\x84\x80", codepoint_types::graphic));

  // Private-use & graphic (two private-use codepoints followed by a graphic
  // one, with both types allowed).
  //
  assert (utf8 ("\xEE\x80\x80\xF3\xB0\x80\x80\xC2\xAC",
                codepoint_types::private_use | codepoint_types::graphic));

  // None (no type is allowed so the tab only passes due to the whitelist).
  //
  assert (utf8 ("\t", codepoint_types::none, U"\t")); // Whitelisted.

  // Any.
  //
  assert (utf8 ("\t"));

  // Unmatched codepoint types (the trailing comment is the actual type of the
  // codepoint being rejected).
  //
  // Note that in the first check the whitelist only contains the tab and so
  // does not help the 0x7F character.
  //
  assert (!utf8 ("\x7F", codepoint_types::graphic, U"\t"));      // Control.
  assert (!utf8 ("\xEF\xB7\x90", codepoint_types::graphic));     // Non-char.
  assert (!utf8 ("\xEE\x80\x80", codepoint_types::graphic));     // Private.
  assert (!utf8 ("\xF3\xA1\x80\x80", codepoint_types::graphic)); // Reserved.
  assert (!utf8 ("\xF3\xA0\x81\xBF", codepoint_types::graphic)); // Format.

  // The expected message names the type of the offending codepoint.
  //
  assert (utf8_error ("\xF3\xA0\x81\xBF", codepoint_types::graphic) ==
          "invalid Unicode codepoint (format)");

  assert (!utf8 ("\xC2\xAC", codepoint_types::format)); // Graphic.

  // Private-use & Graphic.
  //
  assert (!utf8 ("\xEE\x80\x80\xF3\xB0\x80\x80\xC2\xAC",
                 codepoint_types::format));

  assert (!utf8 ("a", codepoint_types::none)); // None.

  assert (utf8_error ("a", codepoint_types::none) ==
          "invalid Unicode codepoint (graphic)");

  // UTF-8 string length.
  //

  // Return true if utf8_length() throws invalid_argument for the specified
  // string and codepoint types and false if it returns normally.
  //
  auto invalid_utf8 = [] (string s, codepoint_types ts = codepoint_types::any)
  {
    try
    {
      utf8_length (s, ts);
      return false;
    }
    catch (const invalid_argument&)
    {
      return true;
    }
  };

  // The length is in codepoints, not bytes: the second string is 'x', the 2,
  // 3, and 4-byte sequences, and 'y'.
  //
  assert (utf8_length ("") == 0);
  assert (utf8_length ("x\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82y") == 5);

  assert (invalid_utf8 ("\xFE"));                         // Invalid byte.
  assert (invalid_utf8 ("\xD0"));                         // Incomplete.
  assert (invalid_utf8 ("\n", codepoint_types::graphic)); // Invalid codepoint.

  // to_utf8() tests.
  //

  // Return true if to_utf8() (called with '?' as the second argument) leaves
  // a copy of the specified string unchanged.
  //
  auto roundtrip = [] (const char* s)
  {
    string r (s);
    to_utf8 (r, '?');
    return r == s;
  };

  // Return the specified string after it has been processed by to_utf8()
  // called with '?' and the specified codepoint types. Judging by the
  // expectations below, '?' is what ends up in place of the invalid data.
  //
  auto sanitize = [] (string s, codepoint_types ts = codepoint_types::any)
  {
    to_utf8 (s, '?', ts);
    return s;
  };

  // Empty.
  //
  assert (roundtrip (""));

  // 1 code point.
  //
  assert (roundtrip ("a"));                // 1 byte.
  assert (roundtrip ("\xD0\xB0"));         // 2 bytes.
  assert (roundtrip ("\xE4\xBA\x8C"));     // 3 bytes.
  assert (roundtrip ("\xF0\x90\x8C\x82")); // 4 bytes.

  // Multiple code points.
  //
  assert (roundtrip ("a\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82"));

  // Ill-formed sequences.
  //
  // Long sequence.
  //
  assert (sanitize ("\xF8") == "?"); // 5-byte sequence.

  // Invalid first byte followed by a second byte which ...
  //
  assert (sanitize ("\xC1\x80")     == "??");        // is a trailing byte.
  assert (sanitize ("\xC1y")        == "?y");        // starts 1-byte sequence.
  assert (sanitize ("\xC1\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence.
  assert (sanitize ("\xC1\xFE")     == "??");        // is not UTF-8.

  // Invalid second byte which ...
  //
  assert (sanitize ("\xD0y")        == "?y");        // starts 1-byte sequence.
  assert (sanitize ("\xD0\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence.
  assert (sanitize ("\xD0\xFE")     == "??");        // is not UTF-8.

  // Incomplete sequences (each byte of the unfinished sequence is expected to
  // be replaced individually).
  //
  assert (sanitize ("\xD0")     == "?");   // 2-byte sequence.
  assert (sanitize ("y\xD0")    == "y?");  // 2-byte sequence.
  assert (sanitize ("\xE4\xBA") == "??");  // 3-byte sequence.
  assert (sanitize ("\xD0\xD0") == "??");  // 2-byte sequence.

  // Incomplete recovery.
  //
  assert (sanitize ("\xD0\xFE")     == "??");  // 2-byte sequence.
  assert (sanitize ("\xD0\xFE\xFE") == "???"); // 2-byte sequence.

  // Sequences that are rejected by utf8() above are expected to be replaced
  // byte by byte.
  //
  assert (sanitize ("\xF4\x90\x80\x80") == "????"); // Above the range.
  assert (sanitize ("\xED\xA0\x80")     == "???");  // Min UTF-16 surrogate.
  assert (sanitize ("\xED\xBF\xBF")     == "???");  // Max UTF-16 surrogate.

  // Invalid codepoints.
  //

  // Same as sanitize() but only allowing the graphic codepoints.
  //
  auto sanitize_g = [&sanitize] (string s)
  {
    return sanitize (move (s), codepoint_types::graphic);
  };

  // A well-formed sequence that encodes a disallowed (here non-character)
  // codepoint is expected to be replaced with a single '?'.
  //
  assert (sanitize_g ("\xEF\xB7\x90")  == "?");
  assert (sanitize_g ("y\xEF\xB7\x90") == "y?");
  assert (sanitize_g ("\xEF\xB7\x90y") == "?y");

  // Invalid during recovery (the byte that breaks the sequence is itself, or
  // starts, a disallowed codepoint).
  //
  assert (sanitize_g ("\xD0\n")     == "??");
  assert (sanitize_g ("\xD0\ny")    == "??y");
  assert (sanitize_g ("\xD0\xFE\n") == "???");

  // One '?' for the broken 2-byte sequence and one for the whole
  // non-character codepoint that follows it.
  //
  assert (sanitize_g ("\xD0\xEF\xB7\x90") == "??");

  // utf8_validator::codepoint() tests.
  //
  {
    u32string r;                    // Codepoints accepted by the validator.
    size_t invalid_codepoints (0);  // Number of rejected validate() calls.

    string s ("a"
              "\xD0\xB0"
              "\n"                 // Control.
              "\xE4\xBA\x8C"
              "\xEE\x80\x80"       // Private-use.
              "\xF0\x90\x8C\x82");

    utf8_validator val (codepoint_types::graphic);

    // Feed the string to the validator one byte at a time.
    //
    // NOTE(review): judging by how the result is used here, the first member
    // indicates whether the byte was accepted and the second whether it
    // completes a codepoint (so that codepoint() can be called) -- confirm
    // against the utf8_validator::validate() documentation.
    //
    for (char c: s)
    {
      pair<bool, bool> v (val.validate (c));

      if (v.first)
      {
        if (v.second)
          r.push_back (val.codepoint ());
      }
      else
        ++invalid_codepoints;
    }

    // Only the four graphic codepoints are expected to be collected, with the
    // control and private-use ones accounting for the two rejections.
    //
    assert (r == U"a\x430\x4E8C\x10302");
    assert (invalid_codepoints == 2);
  }
}