Add JSON parser (copy of libstud-json)

author: Boris Kolpackov <boris@codesynthesis.com> 2022-09-30 13:31:25 +0200
committer: Boris Kolpackov <boris@codesynthesis.com> 2022-09-30 13:31:25 +0200
commit: d9dd84487bda8303590d5b30987f1d76b93867ba (patch)
tree: 7fd0a926dea644a2ddb70c863574af386b9d7f79 /libbutl
parent: e53da6ae4665ce49dddcc9aaa97a4e87bb94f48d (diff)
6 files changed, 2295 insertions, 2 deletions
diff --git a/libbutl/buildfile b/libbutl/buildfile
index 6c490af..ba4ad96 100644
--- a/libbutl/buildfile
+++ b/libbutl/buildfile
@@ -29,9 +29,9 @@ lib{butl}: {hxx ixx cxx}{win32-utility}: include = $windows
 lib{butl}: hxx{mingw-*}: include = $mingw_stdthread
 
 # Our C-files are always included into C++-files that wrap the corresponding
-# API so treat them as files exclude from the compilation.
+# API so treat them as files to exclude from the compilation.
 #
-lib{butl}: file{*.c *.h}
+lib{butl}: file{**.c **.h}
 
 # Platform-specific UUID implementations.
 #
diff --git a/libbutl/json/parser.cxx b/libbutl/json/parser.cxx
new file mode 100644
index 0000000..fa8916b
--- /dev/null
+++ b/libbutl/json/parser.cxx
@@ -0,0 +1,502 @@
+#define PDJSON_SYMEXPORT static // See below.
+
+#include <libbutl/json/parser.hxx>
+
+#include <istream>
+
+// There is an issue (segfault) with using std::current_exception() and
+// std::rethrow_exception() with older versions of libc++ on Linux. While the
+// exact root cause hasn't been determined, the suspicion is that something
+// gets messed up if we "smuggle" std::exception_ptr through extern "C" call
+// frames (we cannot even destroy such an exception without a segfault). We
+// also could not determine in which version exactly this has been fixed but
+// we know that libc++ 6.0.0 doesn't appear to have this issue (though we are
+// not entirely sure the issue is (only) in libc++; libgcc_s could also be
+// involved).
+//
+// The workaround is to just catch (and note) the exception and then throw a
+// new instance of generic std::istream::failure. In order not to drag the
+// below test into the header, we wrap exception_ptr with optional<> and use
+// NULL to indicate the presence of the exception when the workaround is
+// required.
+//
+// Note that if/when we drop this workaround, we should also get rid of
+// optional<> in stream::exception member.
+//
+#undef LIBBUTL_JSON_NO_EXCEPTION_PTR
+
+#if defined (__linux__) && defined(__clang__)
+#  if __has_include(<__config>)
+#    include <__config> // _LIBCPP_VERSION
+#    if _LIBCPP_VERSION < 6000
+#      define LIBBUTL_JSON_NO_EXCEPTION_PTR 1
+#    endif
+#  endif
+#endif
+
+namespace butl
+{
+  namespace json
+  {
+    using namespace std;
+
+    parser::
+    ~parser ()
+    {
+      json_close (impl_); // Counterpart of json_open_*() in constructors.
+    }
+
+    static int
+    stream_get (void* x)
+    {
+      auto& s (*static_cast<parser::stream*> (x));
+
+      // In the multi-value mode reading of whitespaces/separators is split
+      // between our code and pdjson's. As a result, this function may end up
+      // being called more than once after EOF is reached, which is something
+      // iostream does not handle gracefully. So only read if not yet at EOF.
+      //
+      if (!s.is->eof ())
+      {
+        try
+        {
+          // We first peek not to trip failbit on EOF.
+          //
+          if (s.is->peek () != istream::traits_type::eof ())
+            return static_cast<char> (s.is->get ());
+        }
+        catch (...)
+        {
+#ifndef LIBBUTL_JSON_NO_EXCEPTION_PTR
+          s.exception = current_exception ();
+#else
+          s.exception = nullptr; // Engaged but null: exception was noted.
+#endif
+        }
+      }
+
+      return EOF; // End of input or read failure.
+    }
+
+    static int
+    stream_peek (void* x)
+    {
+      auto& s (*static_cast<parser::stream*> (x));
+
+      if (!s.is->eof ()) // Don't touch the stream once at EOF.
+      {
+        try
+        {
+          auto c (s.is->peek ());
+          if (c != istream::traits_type::eof ())
+            return static_cast<char> (c);
+        }
+        catch (...)
+        {
+#ifndef LIBBUTL_JSON_NO_EXCEPTION_PTR
+          s.exception = current_exception ();
+#else
+          s.exception = nullptr; // Engaged but null: exception was noted.
+#endif
+        }
+      }
+
+      return EOF; // End of input or read failure.
+    }
+
+    // NOTE: watch out for exception safety (specifically, doing anything that
+    // might throw after opening the stream).
+    //
+    parser::
+    parser (istream& is, const char* n, bool mv, const char* sep) noexcept
+        : input_name (n),          // Pointer is kept, not copied.
+          stream_ {&is, nullopt},  // No pending exception.
+          multi_value_ (mv),
+          separators_ (sep),       // Pointer is kept, not copied.
+          raw_s_ (nullptr),
+          raw_n_ (0)
+    {
+      json_open_user (impl_, &stream_get, &stream_peek, &stream_);
+      json_set_streaming (impl_, multi_value_); // Pass on the mode.
+    }
+
+    parser::
+    parser (const void* t,
+            size_t s,
+            const char* n,
+            bool mv,
+            const char* sep) noexcept
+        : input_name (n),              // Pointer is kept, not copied.
+          stream_ {nullptr, nullopt},  // No stream: parsing a buffer.
+          multi_value_ (mv),
+          separators_ (sep),           // Pointer is kept, not copied.
+          raw_s_ (nullptr),
+          raw_n_ (0)
+    {
+      json_open_buffer (impl_, t, s);
+      json_set_streaming (impl_, multi_value_); // Pass on the mode.
+    }
+
+    optional<event> parser::
+    next ()
+    {
+      name_p_ = value_p_ = location_p_ = false; // Invalidate the caches.
+
+      // Note that for now we don't worry about the state of the parser if
+      // next_impl() throws, assuming it is not going to be reused.
+      //
+      if (peeked_)
+      {
+        parsed_ = peeked_; // The peeked event becomes the parsed one.
+        peeked_ = nullopt;
+      }
+      else
+        parsed_ = next_impl ();
+
+      return translate (*parsed_);
+    }
+
+    optional<event> parser::
+    peek ()
+    {
+      if (!peeked_) // Otherwise return the already peeked event.
+      {
+        if (parsed_) // Save the current event before next_impl() resets it.
+        {
+          cache_parsed_data ();
+          cache_parsed_location ();
+        }
+        peeked_ = next_impl ();
+      }
+      return translate (*peeked_);
+    }
+
+    std::uint64_t parser::
+    line () const noexcept
+    {
+      if (!location_p_)
+      {
+        if (!parsed_)
+          return 0; // Nothing has been parsed yet.
+
+        assert (!peeked_); // Otherwise the location would have been cached.
+
+        return static_cast<uint64_t> (
+            json_get_lineno (const_cast<json_stream*> (impl_)));
+      }
+
+      return line_; // Cached by cache_parsed_location().
+    }
+
+    std::uint64_t parser::
+    column () const noexcept
+    {
+      if (!location_p_)
+      {
+        if (!parsed_)
+          return 0; // Nothing has been parsed yet.
+
+        assert (!peeked_); // Otherwise the location would have been cached.
+
+        return static_cast<uint64_t> (
+            json_get_column (const_cast<json_stream*> (impl_)));
+      }
+
+      return column_; // Cached by cache_parsed_location().
+    }
+
+    std::uint64_t parser::
+    position () const noexcept
+    {
+      if (!location_p_)
+      {
+        if (!parsed_)
+          return 0; // Nothing has been parsed yet.
+
+        assert (!peeked_); // Otherwise the location would have been cached.
+
+        return static_cast<uint64_t> (
+            json_get_position (const_cast<json_stream*> (impl_)));
+      }
+
+      return position_; // Cached by cache_parsed_location().
+    }
+
+    json_type parser::
+    next_impl ()
+    {
+      raw_s_ = nullptr;
+      raw_n_ = 0;
+      json_type e;
+
+      // Read characters between values skipping required separators and JSON
+      // whitespaces. Return whether a required separator was encountered as
+      // well as the first non-separator/whitespace character (which, if EOF,
+      // should trigger a check for input/output errors).
+      //
+      // Note that the returned non-separator will not have been extracted
+      // from the input (so position, column, etc. will still refer to its
+      // predecessor).
+      //
+      auto skip_separators = [this] () -> pair<bool, int>
+      {
+        bool r (separators_ == nullptr);
+
+        int c;
+        for (; (c = json_source_peek (impl_)) != EOF; json_source_get (impl_))
+        {
+          // User separator.
+          //
+          if (separators_ != nullptr && *separators_ != '\0')
+          {
+            if (strchr (separators_, c) != nullptr)
+            {
+              r = true;
+              continue;
+            }
+          }
+
+          // JSON separator.
+          //
+          if (json_isspace (c))
+          {
+            if (separators_ != nullptr && *separators_ == '\0')
+              r = true;
+
+            continue;
+          }
+
+          break;
+        }
+
+        return make_pair (r, c);
+      };
+
+      // In the multi-value mode skip any instances of required separators
+      // (and any other JSON whitespace) preceding the first JSON value.
+      //
+      if (multi_value_ && !parsed_ && !peeked_)
+      {
+        if (skip_separators ().second == EOF && stream_.is != nullptr)
+        {
+          if (stream_.exception)   goto fail_rethrow;
+          if (stream_.is->fail ()) goto fail_stream;
+        }
+      }
+
+      e = json_next (impl_);
+
+      // First check for a pending input/output error.
+      //
+      if (stream_.is != nullptr)
+      {
+        if (stream_.exception)   goto fail_rethrow;
+        if (stream_.is->fail ()) goto fail_stream;
+      }
+
+      // There are two ways to view separation between two values: as following
+      // the first value or as preceding the second value. And one aspect that
+      // is determined by this is whether a separation violation is a problem
+      // with the first value or with the second, which becomes important if
+      // the user bails out before parsing the second value.
+      //
+      // Consider these two unseparated values (yes, in JSON they are two
+      // values, leading zeros are not allowed in JSON numbers):
+      //
+      // 01
+      //
+      // If the user bails out after parsing 0 in a stream that should have
+      // been newline-delimited, they most likely would want to get an error
+      // since this is most definitely an invalid value rather than two
+      // values that are not properly separated. So in this light we handle
+      // separators at the end of the first value.
+      //
+      switch (e)
+      {
+      case JSON_DONE:
+        {
+          // Deal with the following value separators.
+          //
+          // Note that we must not do this for the second JSON_DONE (or the
+          // first one in case there are no values) that signals the end of
+          // input.
+          //
+          if (multi_value_         &&
+              (parsed_ || peeked_) &&
+              (peeked_ ? *peeked_ : *parsed_) != JSON_DONE)
+          {
+            auto p (skip_separators ());
+
+            if (p.second == EOF && stream_.is != nullptr)
+            {
+              if (stream_.exception)   goto fail_rethrow;
+              if (stream_.is->fail ()) goto fail_stream;
+            }
+
+            // Note that we don't require separators after the last value.
+            //
+            if (!p.first && p.second != EOF)
+            {
+              json_source_get (impl_); // Consume to update column number.
+              goto fail_separation;
+            }
+
+            json_reset (impl_);
+          }
+          break;
+        }
+      case JSON_ERROR: goto fail_json;
+      case JSON_STRING:
+      case JSON_NUMBER:
+        raw_s_ = json_get_string (impl_, &raw_n_);
+        raw_n_--; // Includes terminating `\0`.
+        break;
+      case JSON_TRUE:  raw_s_ = "true";  raw_n_ = 4; break;
+      case JSON_FALSE: raw_s_ = "false"; raw_n_ = 5; break;
+      case JSON_NULL:  raw_s_ = "null";  raw_n_ = 4; break;
+      default: break;
+      }
+
+      return e;
+
+    fail_json:
+      throw invalid_json_input (
+          input_name != nullptr ? input_name : "",
+          static_cast<uint64_t> (json_get_lineno (impl_)),
+          static_cast<uint64_t> (json_get_column (impl_)),
+          static_cast<uint64_t> (json_get_position (impl_)),
+          json_get_error (impl_));
+
+    fail_separation:
+      throw invalid_json_input (
+          input_name != nullptr ? input_name : "",
+          static_cast<uint64_t> (json_get_lineno (impl_)),
+          static_cast<uint64_t> (json_get_column (impl_)),
+          static_cast<uint64_t> (json_get_position (impl_)),
+          "missing separator between JSON values");
+
+    fail_stream:
+      throw invalid_json_input (
+          input_name != nullptr ? input_name : "",
+          static_cast<uint64_t> (json_get_lineno (impl_)),
+          static_cast<uint64_t> (json_get_column (impl_)),
+          static_cast<uint64_t> (json_get_position (impl_)),
+          "unable to read JSON input text");
+
+    fail_rethrow:
+#ifndef LIBBUTL_JSON_NO_EXCEPTION_PTR
+      rethrow_exception (move (*stream_.exception));
+#else
+      throw istream::failure ("unable to read");
+#endif
+    }
+
+    optional<event> parser::
+    translate (json_type e) const noexcept
+    {
+      switch (e)
+      {
+      case JSON_DONE: return nullopt;
+      case JSON_OBJECT: return event::begin_object;
+      case JSON_OBJECT_END: return event::end_object;
+      case JSON_ARRAY: return event::begin_array;
+      case JSON_ARRAY_END: return event::end_array;
+      case JSON_STRING:
+        {
+          // This can be a value or, inside an object, a name from the
+          // name/value pair (in which case the context count is odd).
+          //
+          size_t n;
+          return json_get_context (const_cast<json_stream*> (impl_), &n) ==
+                             JSON_OBJECT &&
+                         n % 2 == 1
+                     ? event::name
+                     : event::string;
+        }
+      case JSON_NUMBER: return event::number;
+      case JSON_TRUE: return event::boolean;
+      case JSON_FALSE: return event::boolean;
+      case JSON_NULL: return event::null;
+      case JSON_ERROR: assert (false); // Should've been handled by caller.
+      }
+
+      return nullopt; // Should never reach.
+    }
+
+    void parser::
+    cache_parsed_data ()
+    {
+      name_p_ = value_p_ = false; // Nothing cached unless set below.
+      if (const optional<event> e = translate (*parsed_))
+      {
+        if (e == event::name)
+        {
+          name_.assign (raw_s_, raw_n_); // Copy the raw data.
+          name_p_ = true;
+        }
+        else if (value_event (e))
+        {
+          value_.assign (raw_s_, raw_n_); // Copy the raw data.
+          value_p_ = true;
+        }
+      }
+    }
+
+    void parser::
+    cache_parsed_location () noexcept
+    {
+      line_ = static_cast<uint64_t> (json_get_lineno (impl_));
+      column_ = static_cast<uint64_t> (json_get_column (impl_));
+      position_ = static_cast<uint64_t> (json_get_position (impl_));
+      location_p_ = true; // line(), etc., now return the cached values.
+    }
+
+    bool parser::
+    value_event (optional<event> e) noexcept
+    {
+      if (!e) // No event (nullopt) is not a value.
+        return false;
+
+      switch (*e)
+      {
+      case event::string:
+      case event::number:
+      case event::boolean:
+      case event::null:
+        return true;
+      default: // Everything else, including event::name.
+        return false;
+      }
+    }
+
+    [[noreturn]] void parser::
+    throw_invalid_value (const char* type, const char* v, size_t n) const
+    {
+      string d (string ("invalid ") + type + " value: '");
+      d.append (v, n); // Exactly n characters of v.
+      d += '\'';
+
+      throw invalid_json_input (input_name != nullptr ? input_name : "",
+                                line (),
+                                column (),
+                                position (),
+                                move (d));
+    }
+  } // namespace json
+} // namespace butl
+
+// Include the implementation into our translation unit (instead of compiling
+// it separately) to (hopefully) get function inlining without LTO.
+//
+// Let's keep it last since the implementation defines a couple of macros.
+//
+#if defined(__clang__) || defined(__GNUC__)
+#  pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
+extern "C"
+{
+#define PDJSON_STACK_INC 16
+#define PDJSON_STACK_MAX 2048
+#include "pdjson.c"
+}
diff --git a/libbutl/json/parser.hxx b/libbutl/json/parser.hxx
new file mode 100644
index 0000000..241ca46
--- /dev/null
+++ b/libbutl/json/parser.hxx
@@ -0,0 +1,408 @@
+#pragma once
+
+#ifdef BUILD2_BOOTSTRAP
+#  error JSON parser not available during bootstrap
+#endif
+
+#include <iosfwd>
+#include <string>
+#include <cstddef>   // size_t
+#include <cstdint>   // uint64_t
+#include <utility>   // pair
+#include <exception> // exception_ptr
+#include <stdexcept> // invalid_argument
+
+#include <libbutl/optional.hxx> // butl::optional is std::optional or similar.
+
+#include <libbutl/json/event.hxx>
+
+#include <libbutl/json/pdjson.h> // Implementation details.
+
+#include <libbutl/export.hxx>
+
+namespace butl
+{
+  // Using the RFC8259 terminology: JSON (input) text, JSON value, object
+  // member.
+  //
+  namespace json
+  {
+    class invalid_json_input: public std::invalid_argument
+    {
+    public:
+      std::string   name;     // Input name (empty if none was specified).
+      std::uint64_t line;     // See parser::line().
+      std::uint64_t column;   // See parser::column().
+      std::uint64_t position; // See parser::position().
+
+      invalid_json_input (std::string name,
+                          std::uint64_t line,
+                          std::uint64_t column,
+                          std::uint64_t position,
+                          const std::string& description);
+
+      invalid_json_input (std::string name,
+                          std::uint64_t line,
+                          std::uint64_t column,
+                          std::uint64_t position,
+                          const char* description);
+    };
+
+    class LIBBUTL_SYMEXPORT parser
+    {
+    public:
+      const char* input_name;
+
+      // Construction.
+      //
+
+      // Parse JSON input text from std::istream.
+      //
+      // The name argument is used to identify the input being parsed. Note
+      // that the stream, name, and separators are kept as references so they
+      // must outlive the parser instance.
+      //
+      // If stream exceptions are enabled then the std::ios_base::failure
+      // exception is used to report input/output errors (badbit and failbit).
+      // Otherwise, those are reported as the invalid_json_input exception.
+      //
+      // If multi_value is true, enable the multi-value mode in which case the
+      // input stream may contain multiple JSON values (more precisely, zero
+      // or more). If false (the default), parsing will fail unless there is
+      // exactly one JSON value in the input stream.
+      //
+      // If multi_value is true, the separators argument specifies the
+      // required separator characters between JSON values. At least one of
+      // them must be present between every pair of JSON values (in addition
+      // to any number of JSON whitespaces). No separators are required after
+      // the last JSON value (but any found will be skipped).
+      //
+      // Specifically, if it is NULL, then no separation is required (that is,
+      // both `{...}{...}` and `{...}  {...}` would be valid). If it is empty,
+      // then at least one JSON whitespace is required. And if it is non-
+      // empty, then at least one of its characters must be present (for
+      // example, "\n\t" would require at least one newline or TAB character
+      // between JSON values).
+      //
+      // Note that a separator need not be valid JSON whitespace: any
+      // character is acceptable (though it probably shouldn't be an object,
+      // array, or string delimiter and should not occur within a non-self-
+      // delimited top-level value, such as `true`, `false`, `null`, or a
+      // number). All instances of required separators before and after a
+      // value are skipped. Therefore JSON Text Sequences (RFC 7464; AKA
+      // Record Separator-delimited JSON), which requires the RS (0x1E)
+      // character before each value, can be handled as well.
+      //
+      parser (std::istream&,
+              const std::string& name,
+              bool multi_value = false,
+              const char* separators = nullptr) noexcept;
+
+      parser (std::istream&,
+              const char* name,
+              bool multi_value = false,
+              const char* separators = nullptr) noexcept;
+
+      parser (std::istream&,
+              std::string&&,
+              bool = false,
+              const char* = nullptr) = delete;
+
+      // Parse a memory buffer that contains the entire JSON input text.
+      //
+      // The name argument is used to identify the input being parsed. Note
+      // that the buffer, name, and separators are kept as references so they
+      // must outlive the parser instance.
+      //
+      parser (const void* text,
+              std::size_t size,
+              const std::string& name,
+              bool multi_value = false,
+              const char* separators = nullptr) noexcept;
+
+      parser (const void* text,
+              std::size_t size,
+              const char* name,
+              bool multi_value = false,
+              const char* separators = nullptr) noexcept;
+
+      parser (const void*,
+              std::size_t,
+              std::string&&,
+              bool = false,
+              const char* = nullptr) = delete;
+
+      // Similar to the above but parse a string.
+      //
+      parser (const std::string& text,
+              const std::string& name,
+              bool multi_value = false,
+              const char* separators = nullptr) noexcept;
+
+      parser (const std::string& text,
+              const char* name,
+              bool multi_value = false,
+              const char* separators = nullptr) noexcept;
+
+      parser (const std::string&,
+              std::string&&,
+              bool = false,
+              const char* = nullptr) = delete;
+
+      // Similar to the above but parse a C-string.
+      //
+      parser (const char* text,
+              const std::string& name,
+              bool multi_value = false,
+              const char* separators = nullptr) noexcept;
+
+      parser (const char* text,
+              const char* name,
+              bool multi_value = false,
+              const char* separators = nullptr) noexcept;
+
+      parser (const char*,
+              std::string&&,
+              bool = false,
+              const char* = nullptr) = delete;
+
+      parser (parser&&) = delete;
+      parser (const parser&) = delete;
+
+      parser& operator= (parser&&) = delete;
+      parser& operator= (const parser&) = delete;
+
+      // Return the next event or nullopt if end of input is reached.
+      //
+      // In the single-value parsing mode (default) the parsing code could
+      // look like this:
+      //
+      //     while (optional<event> e = p.next ())
+      //     {
+      //       switch (*e)
+      //       {
+      //         // ...
+      //       }
+      //     }
+      //
+      // In the multi-value mode the parser additionally returns nullopt after
+      // every JSON value parsed (so there will be two nullopt's after the
+      // last JSON value, the second indicating the end of input).
+      //
+      // One way to perform multi-value parsing is with the help of the peek()
+      // function (see below):
+      //
+      //     while (p.peek ())
+      //     {
+      //       while (optional<event> e = p.next ())
+      //       {
+      //         switch (*e)
+      //         {
+      //           //...
+      //         }
+      //       }
+      //     }
+      //
+      // Note that while the single-value mode will always parse exactly one
+      // value, the multi-value mode will accept zero values in which case a
+      // single nullopt is returned.
+      //
+      optional<event>
+      next ();
+
+      // The range-based for loop support.
+      //
+      // In the single-value parsing mode (default) the parsing code could
+      // look like this:
+      //
+      //     for (event e: p)
+      //     {
+      //       switch (e)
+      //       {
+      //         //...
+      //       }
+      //     }
+      //
+      // And in the multi-value mode (see next() for more information) like
+      // this:
+      //
+      //     while (p.peek ())
+      //     {
+      //       for (event e: p)
+      //       {
+      //         switch (e)
+      //         {
+      //           //...
+      //         }
+      //       }
+      //     }
+      //
+      // Note that generally, the iterator interface doesn't make much sense
+      // for the parser so for now we have an implementation that is just
+      // enough for the range-based for.
+      //
+      struct iterator;
+
+      iterator begin () {return iterator (this, next ());}
+      iterator end ()   {return iterator (nullptr, nullopt);}
+
+      // Return the next event without considering it parsed. In other words,
+      // after this call, any subsequent calls to peek() and the next call to
+      // next() (if any) will all return the same event.
+      //
+      // Note that the name, value, and location corresponding to the peeked
+      // event are not accessible with name(), value(), line(), etc.; these
+      // functions will still return values corresponding to the most recent
+      // call to next(). The peeked values, however, can be accessed in the
+      // raw form using data().
+      //
+      optional<event>
+      peek ();
+
+      // Event data.
+      //
+
+      // Return the object member name.
+      //
+      const std::string&
+      name ();
+
+      // Any value (string, number, boolean, and null) can be retrieved as a
+      // string. Calling this function after any non-value events is illegal.
+      //
+      // Note that the value is returned as a non-const string reference and
+      // you are allowed to move the value out of it. However, this should not
+      // be done unnecessarily or in cases where the small string optimization
+      // is likely since the string's buffer is reused to store subsequent
+      // values.
+      //
+      std::string&
+      value ();
+
+      // Convert the value to an integer, floating point, or bool. Throw
+      // invalid_json_input if the conversion is impossible without a loss.
+      //
+      template <typename T>
+      T
+      value () const;
+
+      // Return the value or object member name in the raw form.
+      //
+      // Calling this function on non-value/name events is legal in which case
+      // NULL is returned. Note also that the returned data corresponds to the
+      // most recent event, whether peeked or parsed.
+      //
+      std::pair<const char*, std::size_t>
+      data () const {return std::make_pair (raw_s_, raw_n_);}
+
+      // Return the line number (1-based) corresponding to the most recently
+      // parsed event or 0 if nothing has been parsed yet.
+      //
+      std::uint64_t
+      line () const noexcept;
+
+      // Return the column number (1-based) corresponding to the beginning of
+      // the most recently parsed event or 0 if nothing has been parsed yet.
+      //
+      std::uint64_t
+      column () const noexcept;
+
+      // Return the position (byte offset) pointing immediately after the most
+      // recently parsed event or 0 if nothing has been parsed yet.
+      //
+      std::uint64_t
+      position () const noexcept;
+
+      // Implementation details.
+      //
+    public:
+      struct iterator
+      {
+        using value_type = event;
+
+        explicit
+        iterator (parser* p = nullptr, optional<event> e = nullopt)
+            : p_ (p), e_ (e) {}
+
+        event operator* () const {return *e_;}
+        iterator& operator++ () {e_ = p_->next (); return *this;}
+
+        // Comparison only makes sense when comparing to end (eof).
+        //
+        bool operator== (iterator y) const {return !e_ && !y.e_;}
+        bool operator!= (iterator y) const {return !(*this == y);}
+
+      private:
+        parser* p_;
+        optional<event> e_;
+      };
+
+      struct stream
+      {
+        std::istream*                is;        // NULL if parsing a buffer.
+        optional<std::exception_ptr> exception; // Pending read exception.
+      };
+
+      [[noreturn]] void
+      throw_invalid_value (const char* type, const char*, std::size_t) const;
+
+      ~parser ();
+
+    private:
+      // Functionality shared by next() and peek().
+      //
+      json_type
+      next_impl ();
+
+      // Translate the event produced by the most recent call to next_impl().
+      //
+      // Note that the underlying parser state determines whether name or
+      // value is returned when translating JSON_STRING.
+      //
+      optional<event>
+      translate (json_type) const noexcept;
+
+      // Cache state (name/value) produced by the most recent call to
+      // next_impl().
+      //
+      void
+      cache_parsed_data ();
+
+      // Cache the location numbers as determined by the most recent call to
+      // next_impl().
+      //
+      void
+      cache_parsed_location () noexcept;
+
+      // Return true if this is a value event (string, number, boolean, or
+      // null).
+      //
+      static bool
+      value_event (optional<event>) noexcept;
+
+      stream stream_;
+
+      bool multi_value_;
+      const char* separators_;
+
+      // The *_p_ members indicate whether the value is present (cached).
+      // Note: optional<> is not used so as not to reallocate string buffers.
+      //
+      std::string name_;                       bool name_p_     = false;
+      std::string value_;                      bool value_p_    = false;
+      std::uint64_t line_, column_, position_; bool location_p_ = false;
+
+      optional<json_type> parsed_; // Current parsed event if any.
+      optional<json_type> peeked_; // Current peeked event if any.
+
+      ::json_stream impl_[1];
+
+      // Cached raw value.
+      //
+      const char* raw_s_;
+      std::size_t raw_n_;
+    };
+  }
+}
+
+#include <libbutl/json/parser.ixx>
diff --git a/libbutl/json/parser.ixx b/libbutl/json/parser.ixx
new file mode 100644
index 0000000..3f02a1e
--- /dev/null
+++ b/libbutl/json/parser.ixx
@@ -0,0 +1,216 @@
+#include <cerrno>
+#include <limits>      // numeric_limits
+#include <utility>     // move()
+#include <cassert>
+#include <cstdlib>     // strto*()
+#include <type_traits> // enable_if, is_*
+#include <cstring>     // strlen()
+
+namespace butl
+{
+  namespace json
+  {
+    // Construct the exception from a description held in std::string by
+    // delegating to the const char* overload.
+    //
+    // Note: std::move() is qualified explicitly (rather than relying on ADL)
+    // to be consistent with the other constructor.
+    //
+    inline invalid_json_input::
+    invalid_json_input (std::string n,
+                        std::uint64_t l,
+                        std::uint64_t c,
+                        std::uint64_t p,
+                        const std::string& d)
+        : invalid_json_input (std::move (n), l, c, p, d.c_str ())
+    {
+    }
+
+    // Construct the exception from the input name (n), the location numbers
+    // (line l, column c, position p), and a NUL-terminated description (d)
+    // that is passed to the invalid_argument base.
+    //
+    inline invalid_json_input::
+    invalid_json_input (std::string n,
+                        std::uint64_t l,
+                        std::uint64_t c,
+                        std::uint64_t p,
+                        const char* d)
+        : invalid_argument (d),
+          name (std::move (n)),
+          line (l),
+          column (c),
+          position (p)
+    {
+    }
+
+    // Parse from an input stream with the input name given as std::string.
+    // Delegates to the overload taking the name as const char*.
+    //
+    inline parser::
+    parser (std::istream& is,
+            const std::string& n,
+            bool mv,
+            const char* sep) noexcept
+        : parser (is, n.c_str (), mv, sep)
+    {
+    }
+
+    // Parse from a memory buffer (t, s) with the input name given as
+    // std::string. Delegates to the overload taking the name as const char*.
+    //
+    inline parser::
+    parser (const void* t,
+            std::size_t s,
+            const std::string& n,
+            bool mv,
+            const char* sep) noexcept
+        : parser (t, s, n.c_str (), mv, sep)
+    {
+    }
+
+    // Parse from text held in std::string with the input name given as
+    // std::string. Delegates to the (buffer, size, const char* name)
+    // overload passing the string's data() and size().
+    //
+    inline parser::
+    parser (const std::string& t,
+            const std::string& n,
+            bool mv,
+            const char* sep) noexcept
+        : parser (t.data (), t.size (), n.c_str (), mv, sep)
+    {
+    }
+
+    // Parse from text held in std::string with the input name given as
+    // const char*. Delegates to the (buffer, size, const char* name)
+    // overload passing the string's data() and size().
+    //
+    inline parser::
+    parser (const std::string& t,
+            const char* n,
+            bool mv,
+            const char* sep) noexcept
+        : parser (t.data (), t.size (), n, mv, sep)
+    {
+    }
+
+    // Parse from a C string with the input name given as std::string.
+    //
+    // Note that the text length is determined with strlen() so t must be a
+    // non-null, NUL-terminated string.
+    //
+    inline parser::
+    parser (const char* t,
+            const std::string& n,
+            bool mv,
+            const char* sep) noexcept
+        : parser (t, std::strlen (t), n.c_str (), mv, sep)
+    {
+    }
+
+    // Parse from a C string with the input name given as const char*.
+    //
+    // Note that the text length is determined with strlen() so t must be a
+    // non-null, NUL-terminated string.
+    //
+    inline parser::
+    parser (const char* t,
+            const char* n,
+            bool mv,
+            const char* sep) noexcept
+        : parser (t, std::strlen (t), n, mv, sep)
+    {
+    }
+
+    // Return the cached name, caching it first from the most recently parsed
+    // event if it is not yet present.
+    //
+    inline const std::string& parser::
+    name ()
+    {
+      if (name_p_)
+        return name_;
+
+      // Not cached yet: there must be a parsed (and not peeked) event whose
+      // data has not been cached as a value.
+      //
+      assert (parsed_ && !peeked_ && !value_p_);
+      cache_parsed_data ();
+      assert (name_p_);
+
+      return name_;
+    }
+
+    // Return the cached value, caching it first from the most recently parsed
+    // event if it is not yet present.
+    //
+    inline std::string& parser::
+    value ()
+    {
+      if (value_p_)
+        return value_;
+
+      // Not cached yet: there must be a parsed (and not peeked) event whose
+      // data has not been cached as a name.
+      //
+      assert (parsed_ && !peeked_ && !name_p_);
+      cache_parsed_data ();
+      assert (value_p_);
+
+      return value_;
+    }
+
+    // Note: one day we will be able to use C++17 from_chars() which was made
+    // exactly for this.
+    //
+    // Boolean overload: only the first character is examined ('t' means
+    // true, anything else means false). The length and parser arguments are
+    // unused. Assumes the caller passes a JSON boolean literal.
+    //
+    template <typename T>
+    inline typename std::enable_if<std::is_same<T, bool>::value, T>::type
+    parse_value (const char* b, size_t, const parser&)
+    {
+      return b[0] == 't';
+    }
+
+    // Signed integer overload: parse the [b, b + n) text as a base-10 integer
+    // and throw (via parser::throw_invalid_value()) if it is not entirely an
+    // integer or does not fit into T.
+    //
+    template <typename T>
+    inline typename std::enable_if<
+      std::is_integral<T>::value &&
+      std::is_signed<T>::value &&
+      !std::is_same<T, bool>::value, T>::type
+    parse_value (const char* b, size_t n, const parser& p)
+    {
+      using limits = std::numeric_limits<T>;
+
+      char* end (nullptr);
+      errno = 0; // We must clear it according to POSIX.
+      const std::int64_t r (strtoll (b, &end, 10)); // Can't throw.
+
+      // The entire text must have been consumed without overflowing.
+      //
+      const bool parsed (end != b && end == b + n && errno != ERANGE);
+
+      if (!parsed || r < limits::min () || r > limits::max ())
+        p.throw_invalid_value ("signed integer", b, n);
+
+      return static_cast<T> (r);
+    }
+
+    // Unsigned integer overload: parse the [b, b + n) text as a base-10
+    // integer and throw (via parser::throw_invalid_value()) if it is not
+    // entirely an integer, is negative, or does not fit into T.
+    //
+    template <typename T>
+    inline typename std::enable_if<
+      std::is_integral<T>::value &&
+      std::is_unsigned<T>::value &&
+      !std::is_same<T, bool>::value, T>::type
+    parse_value (const char* b, size_t n, const parser& p)
+    {
+      char* e (nullptr);
+      errno = 0; // We must clear it according to POSIX.
+      std::uint64_t v (strtoull (b, &e, 10)); // Can't throw.
+
+      // Note that strtoull() accepts a leading minus sign and returns the
+      // negated (wrapped around) value without setting errno. So we have to
+      // detect negative numbers ourselves (but still accept -0 as 0).
+      //
+      if (e == b || e != b + n || errno == ERANGE ||
+          (*b == '-' && v != 0) ||
+          v > std::numeric_limits<T>::max ())
+        p.throw_invalid_value ("unsigned integer", b, n);
+
+      return static_cast<T> (v);
+    }
+
+    // float overload: parse the [b, b + n) text with strtof() and throw (via
+    // parser::throw_invalid_value()) if it was not consumed entirely or the
+    // result is out of range.
+    //
+    template <typename T>
+    inline typename std::enable_if<std::is_same<T, float>::value, T>::type
+    parse_value (const char* b, size_t n, const parser& p)
+    {
+      char* end (nullptr);
+      errno = 0; // We must clear it according to POSIX.
+      const T v (std::strtof (b, &end));
+
+      const bool parsed (end != b && end == b + n && errno != ERANGE);
+
+      if (!parsed)
+        p.throw_invalid_value ("float", b, n);
+
+      return v;
+    }
+
+    // double overload: parse the [b, b + n) text with strtod() and throw (via
+    // parser::throw_invalid_value()) if it was not consumed entirely or the
+    // result is out of range.
+    //
+    template <typename T>
+    inline typename std::enable_if<std::is_same<T, double>::value, T>::type
+    parse_value (const char* b, size_t n, const parser& p)
+    {
+      char* end (nullptr);
+      errno = 0; // We must clear it according to POSIX.
+      const T v (std::strtod (b, &end));
+
+      const bool parsed (end != b && end == b + n && errno != ERANGE);
+
+      if (!parsed)
+        p.throw_invalid_value ("double", b, n);
+
+      return v;
+    }
+
+    // long double overload: parse the [b, b + n) text with strtold() and
+    // throw (via parser::throw_invalid_value()) if it was not consumed
+    // entirely or the result is out of range.
+    //
+    template <typename T>
+    inline typename std::enable_if<std::is_same<T, long double>::value, T>::type
+    parse_value (const char* b, size_t n, const parser& p)
+    {
+      char* end (nullptr);
+      errno = 0; // We must clear it according to POSIX.
+      const T v (std::strtold (b, &end));
+
+      const bool parsed (end != b && end == b + n && errno != ERANGE);
+
+      if (!parsed)
+        p.throw_invalid_value ("long double", b, n);
+
+      return v;
+    }
+
+    // Return the current value converted to T. If the value has already been
+    // cached as a string, convert that; otherwise convert the raw value
+    // without caching it.
+    //
+    template <typename T>
+    inline T parser::
+    value () const
+    {
+      if (value_p_)
+        return parse_value<T> (value_.data (), value_.size (), *this);
+
+      // Not cached: the most recent event must be a parsed (and not peeked)
+      // value event.
+      //
+      assert (parsed_ && !peeked_ && value_event (translate (*parsed_)));
+
+      return parse_value<T> (raw_s_, raw_n_, *this);
+    }
+  }
+}
diff --git a/libbutl/json/pdjson.c b/libbutl/json/pdjson.c
new file mode 100644
index 0000000..279d169
--- /dev/null
+++ b/libbutl/json/pdjson.c
@@ -0,0 +1,1020 @@
+#ifndef _POSIX_C_SOURCE
+#  define _POSIX_C_SOURCE 200112L
+#elif _POSIX_C_SOURCE < 200112L
+#  error incompatible _POSIX_C_SOURCE level
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#ifndef PDJSON_H
+#  include "pdjson.h"
+#endif
+
+#define JSON_FLAG_ERROR      (1u << 0)
+#define JSON_FLAG_STREAMING  (1u << 1)
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+
+#define json_error(json, format, ...)                             \
+    if (!(json->flags & JSON_FLAG_ERROR)) {                       \
+        json->flags |= JSON_FLAG_ERROR;                           \
+        _snprintf_s(json->errmsg, sizeof(json->errmsg),           \
+                 _TRUNCATE,                                       \
+                 format,                                          \
+                 __VA_ARGS__);                                    \
+    }                                                             \
+
+#else
+
+#define json_error(json, format, ...)                             \
+    if (!(json->flags & JSON_FLAG_ERROR)) {                       \
+        json->flags |= JSON_FLAG_ERROR;                           \
+        snprintf(json->errmsg, sizeof(json->errmsg),              \
+                 format,                                          \
+                 __VA_ARGS__);                                    \
+    }                                                             \
+
+#endif /* _MSC_VER */
+
+/* See also PDJSON_STACK_MAX below. */
+#ifndef PDJSON_STACK_INC
+#  define PDJSON_STACK_INC 4
+#endif
+
+/* One entry of the nesting stack, describing an array or object that is
+ * currently open.
+ */
+struct json_stack {
+    enum json_type type; /* JSON_ARRAY or JSON_OBJECT (as passed to push()). */
+    long count;          /* Number of events observed at this level so far. */
+};
+
+/* Enter a new nesting level of the given type, growing the stack by
+ * PDJSON_STACK_INC entries if necessary.
+ *
+ * Return type on success. On failure (nesting limit reached or out of
+ * memory) record the error and return JSON_ERROR, leaving stack_top
+ * unchanged so that it never refers to an entry that was not allocated.
+ */
+static enum json_type
+push(json_stream *json, enum json_type type)
+{
+    /* Note: stack_top is (size_t)-1 when the stack is empty, so this wraps
+     * around to 0 for the first level. */
+    size_t top = json->stack_top + 1;
+
+#ifdef PDJSON_STACK_MAX
+    if (top > PDJSON_STACK_MAX) {
+        json_error(json, "%s", "maximum depth of nesting reached");
+        return JSON_ERROR;
+    }
+#endif
+
+    if (top >= json->stack_size) {
+        struct json_stack *stack;
+        size_t size = (json->stack_size + PDJSON_STACK_INC) * sizeof(*json->stack);
+        stack = (struct json_stack *)json->alloc.realloc(json->stack, size);
+        if (stack == NULL) {
+            json_error(json, "%s", "out of memory");
+            return JSON_ERROR;
+        }
+
+        json->stack_size += PDJSON_STACK_INC;
+        json->stack = stack;
+    }
+
+    /* Only commit the new top once the entry is known to exist. */
+    json->stack_top = top;
+    json->stack[top].type = type;
+    json->stack[top].count = 0;
+
+    return type;
+}
+
+/* Leave the current nesting level, which must be of the expected type
+ * (JSON_ARRAY or JSON_OBJECT). Return the corresponding end event or, if
+ * the current level does not match, record an error mentioning the byte c
+ * and return JSON_ERROR.
+ */
+static enum json_type
+pop(json_stream *json, int c, enum json_type expected)
+{
+    int matches = json->stack != NULL &&
+                  json->stack[json->stack_top].type == expected;
+
+    if (!matches) {
+        json_error(json, "unexpected byte '%c'", c);
+        return JSON_ERROR;
+    }
+
+    json->stack_top--;
+
+    if (expected == JSON_ARRAY)
+        return JSON_ARRAY_END;
+
+    return JSON_OBJECT_END;
+}
+
+/* Return the next byte of the memory buffer without consuming it or EOF if
+ * the end of the buffer has been reached.
+ *
+ * The byte is returned as an unsigned char converted to int (like fgetc())
+ * so that bytes >= 0x80 are not negative on platforms where plain char is
+ * signed and, in particular, so that 0xFF is not mistaken for EOF.
+ */
+static int buffer_peek(struct json_source *source)
+{
+    if (source->position < source->source.buffer.length)
+        return (unsigned char)source->source.buffer.buffer[source->position];
+    else
+        return EOF;
+}
+
+/* Return the next byte via the source's peek function and, unless it is
+ * EOF, consume it by advancing the position.
+ */
+static int buffer_get(struct json_source *source)
+{
+    int c = source->peek(source);
+
+    if (c == EOF)
+        return EOF;
+
+    source->position++;
+    return c;
+}
+
+/* Read the next byte from the FILE stream, advancing the position unless
+ * EOF is returned.
+ */
+static int stream_get(struct json_source *source)
+{
+    int c = fgetc(source->source.stream.stream);
+
+    if (c == EOF)
+        return EOF;
+
+    source->position++;
+    return c;
+}
+
+/* Return the next byte of the FILE stream without consuming it: read it
+ * with fgetc() and push it back with ungetc().
+ */
+static int stream_peek(struct json_source *source)
+{
+    FILE *fp = source->source.stream.stream;
+    int c = fgetc(fp);
+
+    ungetc(c, fp);
+    return c;
+}
+
+/* Reset all the source-independent state of the stream to its initial
+ * values: location tracking, flags (streaming mode on), the nesting stack,
+ * the string buffer, and the allocator (malloc/realloc/free).
+ */
+static void init(json_stream *json)
+{
+    /* Allocator. */
+    json->alloc.malloc = malloc;
+    json->alloc.realloc = realloc;
+    json->alloc.free = free;
+
+    /* Location tracking. */
+    json->source.position = 0;
+    json->lineno = 1;
+    json->linepos = 0;
+    json->lineadj = 0;
+    json->linecon = 0;
+    json->colno = 0;
+
+    /* Parsing state. */
+    json->flags = JSON_FLAG_STREAMING;
+    json->errmsg[0] = '\0';
+    json->ntokens = 0;
+    json->next = (enum json_type)0; /* No peeked event. */
+
+    /* Nesting stack (empty; allocated on first push). */
+    json->stack = NULL;
+    json->stack_size = 0;
+    json->stack_top = -1;
+
+    /* String buffer (allocated on first use). */
+    json->data.string = NULL;
+    json->data.string_size = 0;
+    json->data.string_fill = 0;
+}
+
+/* Consume bytes from the source, expecting them to spell out pattern.
+ * Return type if they all match; otherwise record an error for the first
+ * mismatching byte and return JSON_ERROR.
+ */
+static enum json_type
+is_match(json_stream *json, const char *pattern, enum json_type type)
+{
+    const char *p = pattern;
+
+    while (*p != '\0') {
+        int c = json->source.get(&json->source);
+
+        if (c != *p) {
+            json_error(json, "expected '%c' instead of byte '%c'", *p, c);
+            return JSON_ERROR;
+        }
+
+        p++;
+    }
+
+    return type;
+}
+
+/* Append the byte c to the string buffer, doubling the buffer's size first
+ * if it is full. Return 0 on success and -1 (with the error recorded) if
+ * the buffer could not be grown.
+ */
+static int pushchar(json_stream *json, int c)
+{
+    if (json->data.string_fill == json->data.string_size) {
+        size_t grown = json->data.string_size * 2;
+        char *p = (char *)json->alloc.realloc(json->data.string, grown);
+
+        if (p == NULL) {
+            json_error(json, "%s", "out of memory");
+            return -1;
+        }
+
+        json->data.string = p;
+        json->data.string_size = grown;
+    }
+
+    json->data.string[json->data.string_fill++] = c;
+    return 0;
+}
+
+/* Prepare the string buffer for accumulating a new value: reset the fill
+ * to 0, allocating the initial 1024-byte buffer if there is none yet.
+ * Return 0 on success and -1 (with the error recorded) if out of memory.
+ */
+static int init_string(json_stream *json)
+{
+    json->data.string_fill = 0;
+
+    if (json->data.string == NULL) {
+        char *p;
+
+        json->data.string_size = 1024;
+        p = (char *)json->alloc.malloc(json->data.string_size);
+        json->data.string = p;
+
+        if (p == NULL) {
+            json_error(json, "%s", "out of memory");
+            return -1;
+        }
+    }
+
+    json->data.string[0] = '\0';
+    return 0;
+}
+
+/* Append the codepoint c to the string buffer encoded as UTF-8 (1 to 4
+ * bytes). Return 0 on success and non-zero on failure (surrogate or
+ * out-of-range codepoint, or out of memory).
+ */
+static int encode_utf8(json_stream *json, unsigned long c)
+{
+    /* 1 byte. */
+    if (c < 0x80UL)
+        return pushchar(json, c);
+
+    /* 2 bytes. */
+    if (c < 0x0800UL)
+        return pushchar(json, (c >> 6 & 0x1F) | 0xC0) != 0 ||
+               pushchar(json, (c >> 0 & 0x3F) | 0x80) != 0;
+
+    /* 3 bytes (excluding the surrogate range). */
+    if (c < 0x010000UL) {
+        if (c >= 0xd800 && c <= 0xdfff) {
+            json_error(json, "invalid codepoint %06lx", c);
+            return -1;
+        }
+
+        return pushchar(json, (c >> 12 & 0x0F) | 0xE0) != 0 ||
+               pushchar(json, (c >>  6 & 0x3F) | 0x80) != 0 ||
+               pushchar(json, (c >>  0 & 0x3F) | 0x80) != 0;
+    }
+
+    /* 4 bytes. */
+    if (c < 0x110000UL)
+        return pushchar(json, (c >> 18 & 0x07) | 0xF0) != 0 ||
+               pushchar(json, (c >> 12 & 0x3F) | 0x80) != 0 ||
+               pushchar(json, (c >> 6  & 0x3F) | 0x80) != 0 ||
+               pushchar(json, (c >> 0  & 0x3F) | 0x80) != 0;
+
+    json_error(json, "unable to encode %06lx as UTF-8", c);
+    return -1;
+}
+
+/* Return the value (0-15) of the hexadecimal digit c (either case) or -1
+ * if c is not a hexadecimal digit.
+ */
+static int hexchar(int c)
+{
+    if (c >= '0' && c <= '9')
+        return c - '0';
+
+    if (c >= 'a' && c <= 'f')
+        return c - 'a' + 10;
+
+    if (c >= 'A' && c <= 'F')
+        return c - 'A' + 10;
+
+    return -1;
+}
+
+/* Read the four hexadecimal digits of a \uXXXX escape and return their
+ * value. Return -1 (with the error recorded) on EOF or a non-hexadecimal
+ * byte.
+ */
+static long
+read_unicode_cp(json_stream *json)
+{
+    long cp = 0;
+
+    for (int shift = 12; shift >= 0; shift -= 4) {
+        int c = json->source.get(&json->source);
+
+        if (c == EOF) {
+            json_error(json, "%s", "unterminated string literal in Unicode");
+            return -1;
+        }
+
+        int hc = hexchar(c);
+        if (hc == -1) {
+            json_error(json, "invalid escape Unicode byte '%c'", c);
+            return -1;
+        }
+
+        /* Most significant digit first. */
+        cp = (cp << 4) | hc;
+    }
+
+    return cp;
+}
+
+/* Read the remainder of a \uXXXX escape (the \u has already been consumed)
+ * and append the resulting codepoint to the string buffer as UTF-8. A high
+ * surrogate must be followed by a \uXXXX low surrogate, in which case the
+ * pair is combined into a single codepoint. Return 0 on success and
+ * non-zero (with the error recorded) on failure.
+ */
+static int read_unicode(json_stream *json)
+{
+    long cp = read_unicode_cp(json);
+
+    if (cp == -1)
+        return -1;
+
+    /* A low surrogate that is not preceded by a high one. */
+    if (cp >= 0xdc00 && cp <= 0xdfff) {
+        json_error(json, "dangling surrogate \\u%04lx", cp);
+        return -1;
+    }
+
+    if (cp >= 0xd800 && cp <= 0xdbff) {
+        /* This is the high portion of a surrogate pair; we need to read the
+         * lower portion to get the codepoint
+         */
+        long high = cp;
+        long low;
+        int c;
+
+        c = json->source.get(&json->source);
+        if (c == EOF) {
+            json_error(json, "%s", "unterminated string literal in Unicode");
+            return -1;
+        }
+        if (c != '\\') {
+            json_error(json, "invalid continuation for surrogate pair '%c', "
+                             "expected '\\'", c);
+            return -1;
+        }
+
+        c = json->source.get(&json->source);
+        if (c == EOF) {
+            json_error(json, "%s", "unterminated string literal in Unicode");
+            return -1;
+        }
+        if (c != 'u') {
+            json_error(json, "invalid continuation for surrogate pair '%c', "
+                             "expected 'u'", c);
+            return -1;
+        }
+
+        low = read_unicode_cp(json);
+        if (low == -1)
+            return -1;
+
+        if (low < 0xdc00 || low > 0xdfff) {
+            json_error(json, "surrogate pair continuation \\u%04lx out "
+                             "of range (dc00-dfff)", low);
+            return -1;
+        }
+
+        cp = ((high - 0xd800) * 0x400) + ((low - 0xdc00) + 0x10000);
+    }
+
+    return encode_utf8(json, cp);
+}
+
+/* Read the escape sequence that follows a backslash in a string (the
+ * backslash has already been consumed) and append the character it denotes
+ * to the string buffer. Return 0 on success and -1 (with the error
+ * recorded) on failure.
+ */
+static int
+read_escaped(json_stream *json)
+{
+    int c = json->source.get(&json->source);
+    int unescaped;
+
+    switch (c) {
+    case EOF:
+        json_error(json, "%s", "unterminated string literal in escape");
+        return -1;
+    case 'u':
+        return read_unicode(json) != 0 ? -1 : 0;
+    case '\\': unescaped = '\\'; break;
+    case 'b':  unescaped = '\b'; break;
+    case 'f':  unescaped = '\f'; break;
+    case 'n':  unescaped = '\n'; break;
+    case 'r':  unescaped = '\r'; break;
+    case 't':  unescaped = '\t'; break;
+    case '/':  unescaped = '/';  break;
+    case '"':  unescaped = '"';  break;
+    default:
+        json_error(json, "invalid escaped byte '%c'", c);
+        return -1;
+    }
+
+    return pushchar(json, unescaped) != 0 ? -1 : 0;
+}
+
+/* Return 1 if the byte c cannot appear unescaped in a JSON string (control
+ * characters below 0x20, the double quote, and the backslash) and 0
+ * otherwise (including for negative values).
+ */
+static int
+char_needs_escaping(int c)
+{
+    return c >= 0 && (c < 0x20 || c == '"' || c == '\\');
+}
+
+/* Return the length (1 to 4) of the UTF-8 sequence that starts with the
+ * given byte or 0 if the byte cannot start a sequence.
+ */
+static int
+utf8_seq_length(char byte)
+{
+    unsigned char u = (unsigned char) byte;
+
+    /* ASCII. */
+    if (u < 0x80)
+        return 1;
+
+    /* 0x80-0xBF: continuation byte (second, third or fourth byte of a
+     * multi-byte sequence); 0xC0-0xC1: overlong encoding of an ASCII byte. */
+    if (u < 0xC2)
+        return 0;
+
+    /* 0xC2-0xDF: 2-byte sequence. */
+    if (u <= 0xDF)
+        return 2;
+
+    /* 0xE0-0xEF: 3-byte sequence. */
+    if (u <= 0xEF)
+        return 3;
+
+    /* 0xF0-0xF4: 4-byte sequence. */
+    if (u <= 0xF4)
+        return 4;
+
+    /* 0xF5 and above: restricted (start of 4-, 5- or 6-byte sequence) or
+     * invalid UTF-8. */
+    return 0;
+}
+
+/* Return 1 if the length bytes starting at bytes form a single well-formed
+ * UTF-8 sequence and 0 otherwise (including for a null pointer or a length
+ * outside of 1-4).
+ */
+static int
+is_legal_utf8(const unsigned char *bytes, int length)
+{
+    if (!bytes || length < 1 || length > 4)
+        return 0;
+
+    /* The fourth and third bytes (if present) are plain continuation
+     * bytes. */
+    if (length == 4 && (bytes[3] < 0x80 || bytes[3] > 0xBF))
+        return 0;
+
+    if (length >= 3 && (bytes[2] < 0x80 || bytes[2] > 0xBF))
+        return 0;
+
+    /* The valid range of the second byte depends on the first byte. */
+    if (length >= 2) {
+        unsigned char lo = 0x80;
+        unsigned char hi = 0xBF;
+
+        switch (bytes[0]) {
+        case 0xE0: lo = 0xA0; break;
+        case 0xED: hi = 0x9F; break;
+        case 0xF0: lo = 0x90; break;
+        case 0xF4: hi = 0x8F; break;
+        default:              break;
+        }
+
+        if (bytes[1] < lo || bytes[1] > hi)
+            return 0;
+    }
+
+    /* The first byte must not be a continuation byte or 0xC0/0xC1. */
+    if (bytes[0] >= 0x80 && bytes[0] < 0xC2)
+        return 0;
+
+    return bytes[0] <= 0xF4;
+}
+
+/* Read the remainder of the multi-byte UTF-8 sequence that starts with
+ * next_char (already consumed), validate the sequence, and append it to the
+ * string buffer. Each continuation byte read is counted in lineadj so that
+ * the whole sequence occupies a single column. Return 0 on success and -1
+ * (with the error recorded) on failure.
+ */
+static int
+read_utf8(json_stream* json, int next_char)
+{
+    int count = utf8_seq_length(next_char);
+    if (!count)
+    {
+        json_error(json, "%s", "invalid UTF-8 character");
+        return -1;
+    }
+
+    char buffer[4];
+    buffer[0] = next_char;
+    int i;
+    for (i = 1; i < count; ++i)
+    {
+        /* Test for EOF on the int result rather than on the value converted
+         * to char: otherwise the test is always true if plain char is
+         * unsigned and mistakes the 0xFF byte for EOF if it is signed. */
+        int b = json->source.get(&json->source);
+        if (b != EOF)
+            json->lineadj++;
+
+        /* Note: EOF is stored as 0xFF, which is rejected as invalid below. */
+        buffer[i] = (char) b;
+    }
+
+    if (!is_legal_utf8((unsigned char*) buffer, count))
+    {
+        json_error(json, "%s", "invalid UTF-8 text");
+        return -1;
+    }
+
+    for (i = 0; i < count; ++i)
+    {
+        if (pushchar(json, buffer[i]) != 0)
+            return -1;
+    }
+    return 0;
+}
+
+/* Read a string literal (the opening quote has already been consumed) into
+ * the string buffer, decoding escape sequences and validating UTF-8. The
+ * result is NUL-terminated. Return JSON_STRING on success and JSON_ERROR
+ * (with the error recorded) on failure.
+ */
+static enum json_type
+read_string(json_stream *json)
+{
+    if (init_string(json) != 0)
+        return JSON_ERROR;
+
+    for (;;) {
+        int c = json->source.get(&json->source);
+
+        if (c == EOF) {
+            json_error(json, "%s", "unterminated string literal");
+            return JSON_ERROR;
+        }
+
+        /* Closing quote. */
+        if (c == '"')
+            return pushchar(json, '\0') == 0 ? JSON_STRING : JSON_ERROR;
+
+        if (c == '\\') {
+            if (read_escaped(json) != 0)
+                return JSON_ERROR;
+            continue;
+        }
+
+        /* Start of a multi-byte UTF-8 sequence (or an invalid byte). */
+        if ((unsigned) c >= 0x80) {
+            if (read_utf8(json, c) != 0)
+                return JSON_ERROR;
+            continue;
+        }
+
+        if (char_needs_escaping(c)) {
+            json_error(json, "%s", "unescaped control character in string");
+            return JSON_ERROR;
+        }
+
+        if (pushchar(json, c) != 0)
+            return JSON_ERROR;
+    }
+}
+
+/* Return 1 if c is an ASCII decimal digit (codes 48-57) and 0 otherwise. */
+static int
+is_digit(int c)
+{
+    return !(c < 48 /*0*/ || c > 57 /*9*/);
+}
+
+/* Consume a run of one or more digits from the source, appending them to
+ * the string buffer. Return 0 on success and -1 (with the error recorded)
+ * if there are no digits or if out of memory.
+ */
+static int
+read_digits(json_stream *json)
+{
+    unsigned nread = 0;
+    int c = json->source.peek(&json->source);
+
+    while (is_digit(c)) {
+        if (pushchar(json, json->source.get(&json->source)) != 0)
+            return -1;
+
+        nread++;
+        c = json->source.peek(&json->source);
+    }
+
+    if (nread == 0) {
+        json_error(json, "expected digit instead of byte '%c'", c);
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Read a number that starts with the already consumed byte c (a digit or
+ * '-') into the string buffer, which must have been initialized by the
+ * caller. The text is stored as is and NUL-terminated. Return JSON_NUMBER
+ * on success and JSON_ERROR (with the error recorded) on failure.
+ */
+static enum json_type
+read_number(json_stream *json, int c)
+{
+    if (pushchar(json, c) != 0)
+        return JSON_ERROR;
+    if (c == '-') {
+        /* The sign must be followed by a digit, which is then handled as the
+         * first byte of an unsigned number. */
+        c = json->source.get(&json->source);
+        if (is_digit(c)) {
+            return read_number(json, c);
+        } else {
+            json_error(json, "unexpected byte '%c' in number", c);
+            return JSON_ERROR;
+        }
+    } else if (strchr("123456789", c) != NULL) {
+        /* A non-zero leading digit can be followed by more digits (while a
+         * leading zero cannot). */
+        c = json->source.peek(&json->source);
+        if (is_digit(c)) {
+            if (read_digits(json) != 0)
+                return JSON_ERROR;
+        }
+    }
+    /* Up to decimal or exponent has been read. */
+    c = json->source.peek(&json->source);
+    if (strchr(".eE", c) == NULL) {
+        if (pushchar(json, '\0') != 0)
+            return JSON_ERROR;
+        else
+            return JSON_NUMBER;
+    }
+    /* Fraction: a dot followed by at least one digit. */
+    if (c == '.') {
+        json->source.get(&json->source); // consume .
+        if (pushchar(json, c) != 0)
+            return JSON_ERROR;
+        if (read_digits(json) != 0)
+            return JSON_ERROR;
+    }
+    /* Check for exponent. */
+    c = json->source.peek(&json->source);
+    if (c == 'e' || c == 'E') {
+        json->source.get(&json->source); // consume e/E
+        if (pushchar(json, c) != 0)
+            return JSON_ERROR;
+        /* An optional sign followed by at least one digit. */
+        c = json->source.peek(&json->source);
+        if (c == '+' || c == '-') {
+            json->source.get(&json->source); // consume
+            if (pushchar(json, c) != 0)
+                return JSON_ERROR;
+            if (read_digits(json) != 0)
+                return JSON_ERROR;
+        } else if (is_digit(c)) {
+            if (read_digits(json) != 0)
+                return JSON_ERROR;
+        } else {
+            json->source.get(&json->source); // consume (for column)
+            json_error(json, "unexpected byte '%c' in number", c);
+            return JSON_ERROR;
+        }
+    }
+    if (pushchar(json, '\0') != 0)
+        return JSON_ERROR;
+    else
+        return JSON_NUMBER;
+}
+
+/* Return true if c is a JSON whitespace character: horizontal tab (0x09),
+ * line feed (0x0a), carriage return (0x0d), or space (0x20).
+ */
+bool
+json_isspace(int c)
+{
+    return c == 0x09 || c == 0x0a || c == 0x0d || c == 0x20;
+}
+
+/* Note the start of a new line: bump the line number, remember the
+ * position at which the line starts, and reset the per-line column
+ * adjustments.
+ */
+static void newline(json_stream *json)
+{
+    json->linepos = json->source.position;
+    json->lineno++;
+    json->linecon = 0;
+    json->lineadj = 0;
+}
+
+/* Skip whitespace and return the first non-whitespace byte (or EOF)
+ * consumed from the source.
+ *
+ * Note that this is the only function (besides user-facing json_source_get())
+ * that needs to worry about newline housekeeping.
+ */
+static int next(json_stream *json)
+{
+    int c = json->source.get(&json->source);
+
+    while (json_isspace(c)) {
+        if (c == '\n')
+            newline(json);
+
+        c = json->source.get(&json->source);
+    }
+
+    return c;
+}
+
+/* Read the value that starts with the already consumed byte c and return
+ * its event type (JSON_ERROR on failure, with the error recorded). On
+ * success also save the column at which the value starts.
+ */
+static enum json_type
+read_value(json_stream *json, int c)
+{
+    /* Note: sample the column before consuming the rest of the value. */
+    size_t start_col = json_get_column(json);
+    enum json_type result;
+
+    json->ntokens++;
+
+    if (c == EOF) {
+        json_error(json, "%s", "unexpected end of text");
+        result = JSON_ERROR;
+    } else if (c == '{') {
+        result = push(json, JSON_OBJECT);
+    } else if (c == '[') {
+        result = push(json, JSON_ARRAY);
+    } else if (c == '"') {
+        result = read_string(json);
+    } else if (c == 'n') {
+        result = is_match(json, "ull", JSON_NULL);
+    } else if (c == 'f') {
+        result = is_match(json, "alse", JSON_FALSE);
+    } else if (c == 't') {
+        result = is_match(json, "rue", JSON_TRUE);
+    } else if (c == '-' || (c >= '0' && c <= '9')) {
+        result = init_string(json) == 0 ? read_number(json, c) : JSON_ERROR;
+    } else {
+        json_error(json, "unexpected byte '%c' in value", c);
+        result = JSON_ERROR;
+    }
+
+    if (result != JSON_ERROR)
+        json->colno = start_col;
+
+    return result;
+}
+
+/* Return the next event without consuming it: the event is read with
+ * json_next() (unless one is already pending) and remembered in json->next
+ * to be returned by the next call to json_next() or json_peek().
+ */
+enum json_type json_peek(json_stream *json)
+{
+    if (!json->next)
+        json->next = json_next(json);
+
+    return json->next;
+}
+
+/* Return the next parsing event.
+ *
+ * If the stream is in the error state, return JSON_ERROR. If there is a
+ * pending event from json_peek(), return (and clear) it. Otherwise parse the
+ * next token according to the current nesting context (top level, inside an
+ * array, or inside an object) and return its type, JSON_DONE once a complete
+ * top-level value has been parsed, or JSON_ERROR (with the error recorded).
+ */
+enum json_type json_next(json_stream *json)
+{
+    if (json->flags & JSON_FLAG_ERROR)
+        return JSON_ERROR;
+    if (json->next != 0) {
+        enum json_type next = json->next;
+        json->next = (enum json_type)0;
+        return next;
+    }
+
+    /* Reset the saved column so that json_get_column() reports the current
+     * position until the next value is successfully read. */
+    json->colno = 0;
+
+    /* A complete top-level value has been parsed (stack_top is (size_t)-1
+     * when no array or object is open). */
+    if (json->ntokens > 0 && json->stack_top == (size_t)-1) {
+
+        /* In the streaming mode leave any trailing whitespaces in the stream.
+         * This allows the user to validate any desired separation between
+         * values (such as newlines) using json_source_get/peek() with any
+         * remaining whitespaces ignored as leading when we parse the next
+         * value. */
+        if (!(json->flags & JSON_FLAG_STREAMING)) {
+            int c = next(json);
+            if (c != EOF) {
+                json_error(json, "expected end of text instead of byte '%c'", c);
+                return JSON_ERROR;
+            }
+        }
+
+        return JSON_DONE;
+    }
+    int c = next(json);
+    /* Top level, no tokens seen yet. */
+    if (json->stack_top == (size_t)-1) {
+        if (c == EOF && (json->flags & JSON_FLAG_STREAMING))
+            return JSON_DONE;
+
+        return read_value(json, c);
+    }
+    /* Inside an array: count is the number of elements seen so far. */
+    if (json->stack[json->stack_top].type == JSON_ARRAY) {
+        if (json->stack[json->stack_top].count == 0) {
+            /* Either an empty array or the first element. */
+            if (c == ']') {
+                return pop(json, c, JSON_ARRAY);
+            }
+            json->stack[json->stack_top].count++;
+            return read_value(json, c);
+        } else if (c == ',') {
+            json->stack[json->stack_top].count++;
+            return read_value(json, next(json));
+        } else if (c == ']') {
+            return pop(json, c, JSON_ARRAY);
+        } else {
+            json_error(json, "unexpected byte '%c'", c);
+            return JSON_ERROR;
+        }
+    /* Inside an object: count is the number of names and values seen so far
+     * (so it is odd after a member name and even after a member value). */
+    } else if (json->stack[json->stack_top].type == JSON_OBJECT) {
+        if (json->stack[json->stack_top].count == 0) {
+            if (c == '}') {
+                return pop(json, c, JSON_OBJECT);
+            }
+
+            /* No member name/value pairs yet. */
+            enum json_type value = read_value(json, c);
+            if (value != JSON_STRING) {
+                /* Note: json_error() only records the first error. */
+                if (value != JSON_ERROR)
+                    json_error(json, "%s", "expected member name or '}'");
+                return JSON_ERROR;
+            } else {
+                json->stack[json->stack_top].count++;
+                return value;
+            }
+        } else if ((json->stack[json->stack_top].count % 2) == 0) {
+            /* Expecting comma followed by member name. */
+            if (c != ',' && c != '}') {
+                json_error(json, "%s", "expected ',' or '}' after member value");
+                return JSON_ERROR;
+            } else if (c == '}') {
+                return pop(json, c, JSON_OBJECT);
+            } else {
+                enum json_type value = read_value(json, next(json));
+                if (value != JSON_STRING) {
+                    if (value != JSON_ERROR)
+                        json_error(json, "%s", "expected member name");
+                    return JSON_ERROR;
+                } else {
+                    json->stack[json->stack_top].count++;
+                    return value;
+                }
+            }
+        } else if ((json->stack[json->stack_top].count % 2) == 1) {
+            /* Expecting colon followed by value. */
+            if (c != ':') {
+                json_error(json, "%s", "expected ':' after member name");
+                return JSON_ERROR;
+            } else {
+                json->stack[json->stack_top].count++;
+                return read_value(json, next(json));
+            }
+        }
+    }
+    json_error(json, "%s", "invalid parser state");
+    return JSON_ERROR;
+}
+
+/* Reset the parsing state so that a new top-level value can be parsed:
+ * clear the error flag and message, empty the nesting stack, and zero the
+ * token count. The source and its position are not touched.
+ */
+void json_reset(json_stream *json)
+{
+    /* Clear the error state. */
+    json->flags &= ~JSON_FLAG_ERROR;
+    json->errmsg[0] = '\0';
+
+    /* Back to the top level with no tokens seen. */
+    json->stack_top = -1;
+    json->ntokens = 0;
+}
+
+/* Skip the next value: read the next event and, if it starts an array or
+ * object, keep reading events until the matching end. Return the type of
+ * the first event read or JSON_ERROR/JSON_DONE if encountered along the way.
+ */
+enum json_type json_skip(json_stream *json)
+{
+    enum json_type first = json_next(json);
+    enum json_type cur = first;
+    size_t arrays = 0;  /* Number of arrays currently open. */
+    size_t objects = 0; /* Number of objects currently open. */
+
+    for (;;) {
+        if (cur == JSON_ERROR || cur == JSON_DONE)
+            return cur;
+
+        switch (cur) {
+        case JSON_ARRAY:
+            ++arrays;
+            break;
+        case JSON_ARRAY_END:
+            if (arrays > 0)
+                --arrays;
+            break;
+        case JSON_OBJECT:
+            ++objects;
+            break;
+        case JSON_OBJECT_END:
+            if (objects > 0)
+                --objects;
+            break;
+        default:
+            break;
+        }
+
+        /* Everything that was opened has been closed. */
+        if (arrays == 0 && objects == 0)
+            return first;
+
+        cur = json_next(json);
+    }
+}
+
+/* Keep skipping values with json_skip() until it returns the specified
+ * type. Return type in this case or JSON_ERROR/JSON_DONE if encountered
+ * first.
+ */
+enum json_type json_skip_until(json_stream *json, enum json_type type)
+{
+    for (;;) {
+        enum json_type skipped = json_skip(json);
+
+        if (skipped == JSON_ERROR || skipped == JSON_DONE)
+            return skipped;
+
+        if (skipped == type)
+            return type;
+    }
+}
+
+/* Return the contents of the string buffer or an empty string if the
+ * buffer has not been allocated yet. If length is not NULL, also store the
+ * buffer's fill (number of bytes stored) in *length.
+ */
+const char *json_get_string(json_stream *json, size_t *length)
+{
+    if (length != NULL)
+        *length = json->data.string_fill;
+
+    return json->data.string != NULL ? json->data.string : "";
+}
+
+/* Return the contents of the string buffer converted with strtod() or 0 if
+ * the buffer has not been allocated yet.
+ */
+double json_get_number(json_stream *json)
+{
+    if (json->data.string == NULL)
+        return 0;
+
+    return strtod(json->data.string, NULL);
+}
+
+const char *json_get_error(json_stream *json)
+{
+    return json->flags & JSON_FLAG_ERROR ? json->errmsg : NULL;
+}
+
+/* Return the current line number (starts at 1 and is incremented for each
+ * newline consumed).
+ */
+size_t json_get_lineno(json_stream *json)
+{
+    return json->lineno;
+}
+
+/* Return the current position in the source, that is, the number of bytes
+ * consumed so far.
+ */
+size_t json_get_position(json_stream *json)
+{
+    return json->source.position;
+}
+
+/* Return the column number: the column saved for the most recently read
+ * value, if any, and otherwise the column derived from the current position
+ * (1 if nothing has been consumed yet).
+ */
+size_t json_get_column(json_stream *json)
+{
+    /* Saved by read_value(). */
+    if (json->colno != 0)
+        return json->colno;
+
+    if (json->source.position == 0)
+        return 1;
+
+    /* Bytes consumed on this line minus the UTF-8 continuation bytes. */
+    return json->source.position - json->linepos - json->lineadj;
+}
+
+/* Return the current nesting depth, that is, the number of arrays and
+ * objects currently open (stack_top is (size_t)-1 when there are none, in
+ * which case the result wraps around to 0).
+ */
+size_t json_get_depth(json_stream *json)
+{
+    return json->stack_top + 1;
+}
+
+/* Return the current parsing context, that is, JSON_OBJECT if we are inside
+   an object, JSON_ARRAY if we are inside an array, and JSON_DONE if we are
+   not yet/anymore in either.
+
+   Additionally, for the first two cases and if count is not NULL, also
+   return the number of parsing events that have already been observed at
+   this level with json_next/peek(). In particular, inside an object, an odd
+   number would indicate that the just observed JSON_STRING event is a member
+   name.
+*/
+enum json_type json_get_context(json_stream *json, size_t *count)
+{
+    /* Nothing is open. */
+    if (json->stack_top == (size_t)-1)
+        return JSON_DONE;
+
+    struct json_stack *top = &json->stack[json->stack_top];
+
+    if (count != NULL)
+        *count = top->count;
+
+    return top->type;
+}
+
+/* Consume and return the next byte of the source (or EOF) while keeping
+ * the line and column information up to date.
+ */
+int json_source_get(json_stream *json)
+{
+    /* If the caller reads a multi-byte UTF-8 sequence, we expect them to read
+     * it in its entirety. We also assume that any invalid bytes within such a
+     * sequence belong to the same column (as opposed to starting a new column
+     * or some such). */
+
+    int c = json->source.get(&json->source);
+
+    if (json->linecon > 0) {
+        /* Expecting a continuation byte within a multi-byte UTF-8 sequence. */
+        json->linecon--;
+        if (c != EOF)
+            json->lineadj++;
+        return c;
+    }
+
+    if (c == '\n') {
+        newline(json);
+        return c;
+    }
+
+    /* First in multi-byte UTF-8 sequence. */
+    if (c >= 0xC2 && c <= 0xF4)
+        json->linecon = utf8_seq_length(c) - 1;
+
+    return c;
+}
+
+/* Return the next byte of the source (or EOF) without consuming it. */
+int json_source_peek(json_stream *json)
+{
+    return json->source.peek(&json->source);
+}
+
+/* Initialize the stream to parse the memory buffer of the given size. Note
+ * that the buffer is not copied, only the pointer to it is stored.
+ */
+void json_open_buffer(json_stream *json, const void *buffer, size_t size)
+{
+    init(json);
+
+    json->source.source.buffer.buffer = (const char *)buffer;
+    json->source.source.buffer.length = size;
+
+    json->source.peek = buffer_peek;
+    json->source.get = buffer_get;
+}
+
+/* Initialize the stream to parse the NUL-terminated string (its length is
+ * determined with strlen()). See json_open_buffer().
+ */
+void json_open_string(json_stream *json, const char *string)
+{
+    json_open_buffer(json, string, strlen(string));
+}
+
+/* Initialize the stream to parse the contents of the FILE stream, which is
+ * read with fgetc().
+ */
+void json_open_stream(json_stream *json, FILE * stream)
+{
+    init(json);
+
+    json->source.source.stream.stream = stream;
+
+    json->source.peek = stream_peek;
+    json->source.get = stream_get;
+}
+
+/* Read the next byte with the user-supplied get function, advancing the
+ * position unless EOF is returned.
+ */
+static int user_get(struct json_source *json)
+{
+    int c = json->source.user.get(json->source.user.ptr);
+
+    if (c == EOF)
+        return EOF;
+
+    json->position++;
+    return c;
+}
+
+/* Return the result of the user-supplied peek function called with the
+ * user pointer.
+ */
+static int user_peek(struct json_source *json)
+{
+    return json->source.user.peek(json->source.user.ptr);
+}
+
+/* Initialize the stream to parse the input provided by the user-supplied
+ * get and peek functions, each of which is called with the user pointer.
+ */
+void json_open_user(json_stream *json, json_user_io get, json_user_io peek, void *user)
+{
+    init(json);
+
+    json->source.source.user.get = get;
+    json->source.source.user.peek = peek;
+    json->source.source.user.ptr = user;
+
+    json->source.peek = user_peek;
+    json->source.get = user_get;
+}
+
+/* Replace the stream's allocator with a copy of *a. */
+void json_set_allocator(json_stream *json, json_allocator *a)
+{
+    json->alloc = *a;
+}
+
+/* Enable or disable the streaming mode (enabled by default). See
+ * json_next() for the effect of this mode.
+ */
+void json_set_streaming(json_stream *json, bool streaming)
+{
+    json->flags = streaming
+                      ? (json->flags | JSON_FLAG_STREAMING)
+                      : (json->flags & ~JSON_FLAG_STREAMING);
+}
+
+/* Release the memory held by the stream (the nesting stack and the string
+ * buffer) using the stream's allocator. Note that the pointers are not
+ * reset.
+ */
+void json_close(json_stream *json)
+{
+    json->alloc.free(json->stack);
+    json->alloc.free(json->data.string);
+}
diff --git a/libbutl/json/pdjson.h b/libbutl/json/pdjson.h
new file mode 100644
index 0000000..ac698e4
--- /dev/null
+++ b/libbutl/json/pdjson.h
@@ -0,0 +1,147 @@
+#ifndef PDJSON_H
+#define PDJSON_H
+
+#ifndef PDJSON_SYMEXPORT
+#   define PDJSON_SYMEXPORT
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#else
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+    #include <stdbool.h>
+#else
+    #ifndef bool
+        #define bool int
+        #define true 1
+        #define false 0
+    #endif /* bool */
+#endif /* __STDC_VERSION__ */
+#endif /* __cplusplus */
+
+#include <stdio.h>
+
+/* Kinds of values/events used throughout the API; this is the return type of
+ * json_next(), json_peek(), json_skip(), json_skip_until(), and
+ * json_get_context() declared below.
+ *
+ * Enumerators are numbered from 1 (JSON_ERROR), so 0 does not correspond to
+ * any of them.
+ */
+enum json_type {
+    JSON_ERROR = 1, JSON_DONE,
+    JSON_OBJECT, JSON_OBJECT_END, JSON_ARRAY, JSON_ARRAY_END,
+    JSON_STRING, JSON_NUMBER, JSON_TRUE, JSON_FALSE, JSON_NULL
+};
+
+/* Memory management callbacks with the same signatures as the standard
+ * malloc(), realloc(), and free(). A copy of this structure is stored in the
+ * stream by json_set_allocator(), and json_close() releases the stream's
+ * buffers through the free member.
+ */
+struct json_allocator {
+    void *(*malloc)(size_t);
+    void *(*realloc)(void *, size_t);
+    void (*free)(void *);
+};
+
+/* Signature of the get and peek callbacks accepted by json_open_user(). The
+ * argument is the opaque user pointer that was passed to json_open_user();
+ * a return value of EOF is treated as "nothing read" (the source position is
+ * not advanced for it).
+ */
+typedef int (*json_user_io)(void *user);
+
+/* Shorthand names so that the struct keyword can be omitted. */
+typedef struct json_stream json_stream;
+typedef struct json_allocator json_allocator;
+
+/* Opening a stream. Each of these initializes the stream and then selects
+ * the input source: a memory buffer of the given size (the pointer is stored,
+ * the data is not copied), a NUL-terminated string (measured with strlen()),
+ * a stdio FILE, or a pair of user callbacks that are invoked with the opaque
+ * user pointer.
+ *
+ * json_close() releases the stream's parser stack and string buffer through
+ * the stream allocator's free callback.
+ */
+PDJSON_SYMEXPORT void json_open_buffer(json_stream *json, const void *buffer, size_t size);
+PDJSON_SYMEXPORT void json_open_string(json_stream *json, const char *string);
+PDJSON_SYMEXPORT void json_open_stream(json_stream *json, FILE *stream);
+PDJSON_SYMEXPORT void json_open_user(json_stream *json, json_user_io get, json_user_io peek, void *user);
+PDJSON_SYMEXPORT void json_close(json_stream *json);
+
+/* json_set_allocator() copies *a into the stream. json_set_streaming() sets
+ * (true) or clears (false) the stream's streaming flag; note that the
+ * definition names this parameter `streaming` rather than `mode`.
+ */
+PDJSON_SYMEXPORT void json_set_allocator(json_stream *json, json_allocator *a);
+PDJSON_SYMEXPORT void json_set_streaming(json_stream *json, bool mode);
+
+PDJSON_SYMEXPORT enum json_type json_next(json_stream *json);
+PDJSON_SYMEXPORT enum json_type json_peek(json_stream *json);
+PDJSON_SYMEXPORT void json_reset(json_stream *json);
+PDJSON_SYMEXPORT const char *json_get_string(json_stream *json, size_t *length);
+PDJSON_SYMEXPORT double json_get_number(json_stream *json);
+
+PDJSON_SYMEXPORT enum json_type json_skip(json_stream *json);
+PDJSON_SYMEXPORT enum json_type json_skip_until(json_stream *json, enum json_type type);
+
+PDJSON_SYMEXPORT size_t json_get_lineno(json_stream *json);
+PDJSON_SYMEXPORT size_t json_get_position(json_stream *json);
+PDJSON_SYMEXPORT size_t json_get_column(json_stream *json);
+PDJSON_SYMEXPORT size_t json_get_depth(json_stream *json);
+PDJSON_SYMEXPORT enum json_type json_get_context(json_stream *json, size_t *count);
+PDJSON_SYMEXPORT const char *json_get_error(json_stream *json);
+
+/* Raw access to the underlying input. json_source_peek() forwards to the
+ * peek callback of the stream's source.
+ */
+PDJSON_SYMEXPORT int json_source_get(json_stream *json);
+PDJSON_SYMEXPORT int json_source_peek(json_stream *json);
+PDJSON_SYMEXPORT bool json_isspace(int c);
+
+/* internal */
+
+/* Input abstraction: a pair of callbacks plus the state they operate on.
+ * The json_open_*() functions fill in the callbacks and the union member
+ * that matches the chosen kind of input.
+ */
+struct json_source {
+    /* Both callbacks receive a pointer to this structure. */
+    int (*get)(struct json_source *);
+    int (*peek)(struct json_source *);
+
+    /* Advanced by one for each non-EOF value obtained by the user-callback
+     * get; presumably maintained the same way for the other kinds of input
+     * (their callbacks are defined elsewhere -- confirm).
+     */
+    size_t position;
+
+    /* State specific to the kind of input; only the member set by the
+     * corresponding json_open_*() call is meaningful.
+     */
+    union {
+        struct {
+            FILE *stream;        /* Set by json_open_stream(). */
+        } stream;
+        struct {
+            const char *buffer;  /* Set by json_open_buffer(); not copied. */
+            size_t length;       /* Size passed to json_open_buffer(). */
+        } buffer;
+        struct {
+            void *ptr;           /* Opaque argument for get and peek. */
+            json_user_io get;
+            json_user_io peek;
+        } user;
+    } source;
+};
+
+/* Complete state of one parser instance. It is set up by the json_open_*()
+ * functions and its buffers are released by json_close().
+ */
+struct json_stream {
+    /* Line counter; see the discussion of column tracking below. */
+    size_t lineno;
+
+    /* While counting lines is straightforward, columns are tricky because we
+     * have to count codepoints, not bytes. We could have peppered the code
+     * with increments in all the relevant places but that seems inelegant.
+     * So instead we calculate the column dynamically, based on the current
+     * position.
+     *
+     * Specifically, we will remember the position at the beginning of each
+     * line (linepos) and, assuming only the ASCII characters on the line, the
+     * column will be the difference between the current position and linepos.
+     * Of course there could also be multi-byte UTF-8 sequences which we will
+     * handle by keeping an adjustment (lineadj) -- the number of continuation
+     * bytes encountered on this line so far. Finally, for json_source_get()
+     * we also have to keep the number of remaining continuation bytes in the
+     * current multi-byte UTF-8 sequence (linecon).
+     *
+     * This is not the end of the story, however: with only the just described
+     * approach we will always end up with the column of the latest character
+     * read which is not what we want when returning potentially multi-
+     * character value events (string, number, etc); in these cases we want to
+     * return the column of the first character (note that if the value itself
+     * is invalid and we are returning JSON_ERROR, we still want the current
+     * column). So to handle this we will cache the start column (colno) for
+     * such events.
+     */
+    size_t linepos; /* Position at the beginning of the current line. */
+    size_t lineadj; /* Adjustment for multi-byte UTF-8 sequences. */
+    size_t linecon; /* Number of remaining UTF-8 continuation bytes. */
+    size_t colno;   /* Start column for value events or 0. */
+
+    /* Parser stack; the memory is released by json_close(). The element type
+     * is not defined in this header.
+     */
+    struct json_stack *stack;
+    size_t stack_top;
+    size_t stack_size;
+    enum json_type next;
+
+    /* Option bits; JSON_FLAG_STREAMING is toggled by json_set_streaming(). */
+    unsigned flags;
+
+    /* String buffer; data.string is released by json_close().
+     * NOTE(review): string_fill/string_size look like used length and
+     * capacity -- confirm against the code that grows the buffer.
+     */
+    struct {
+        char *string;
+        size_t string_fill;
+        size_t string_size;
+    } data;
+
+    size_t ntokens;
+
+    struct json_source source;   /* Input; configured by json_open_*(). */
+    struct json_allocator alloc; /* Overwritten by json_set_allocator(). */
+
+    /* NOTE(review): presumably the text returned by json_get_error() --
+     * confirm.
+     */
+    char errmsg[128];
+};
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif
author	Boris Kolpackov <boris@codesynthesis.com>	2022-09-30 13:31:25 +0200
committer	Boris Kolpackov <boris@codesynthesis.com>	2022-09-30 13:31:25 +0200
commit	d9dd84487bda8303590d5b30987f1d76b93867ba (patch)
tree	7fd0a926dea644a2ddb70c863574af386b9d7f79 /libbutl
parent	e53da6ae4665ce49dddcc9aaa97a4e87bb94f48d (diff)