From e5bfd17637bf297c3cfe509d51027916864092d5 Mon Sep 17 00:00:00 2001
From: Karen Arutyunov <karen@codesynthesis.com>
Date: Sun, 10 Dec 2017 10:02:19 +0300
Subject: Add basic_url<H,T> class template

---
 libbutl/url.mxx | 476 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 476 insertions(+)
 create mode 100644 libbutl/url.mxx

(limited to 'libbutl/url.mxx')
diff --git a/libbutl/url.mxx b/libbutl/url.mxx
new file mode 100644
index 0000000..fe091f1
--- /dev/null
+++ b/libbutl/url.mxx
@@ -0,0 +1,476 @@
+// file      : libbutl/url.mxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules
+#pragma once
+#endif
+
+// C includes.
+
+#include <cassert>
+
+#ifndef __cpp_lib_modules
+#include <string>
+#include <cstdint>  // uint*_t
+#include <utility>  // move()
+#include <ostream>
+#include <iterator> // back_inserter
+
+#include <cstddef>   // size_t
+#include <stdexcept> // invalid_argument
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules
+export module butl.url;
+#ifdef __cpp_lib_modules
+import std.core;
+import std.io;
+#endif
+import butl.path;
+import butl.utility;
+import butl.optional;
+#else
+#include <libbutl/path.mxx>
+#include <libbutl/utility.mxx>
+#include <libbutl/optional.mxx>
+#endif
+
+#include <libbutl/export.hxx>
+
+LIBBUTL_MODEXPORT namespace butl
+{
+  // RFC3986 Uniform Resource Locator (URL).
+  //
+  // <url>       = <scheme>:[//[<authority>]][/<path>][?<query>][#<fragment>]
+  // <authority> = [<user>@]<host>[:<port>]
+  //
+  // Some examples of equivalent URLs to meditate upon:
+  //
+  // file://localhost/tmp     (localhost authority)
+  // file:///tmp              (empty     authority)
+  // file:/tmp                (absent    authority)
+  //
+  // file://localhost/c:/tmp
+  // file:///c:/tmp
+  // file:/c:/tmp
+  //
+  // We think of the slash between <authority> and <path> as a separator but
+  // with the path always interpreted as starting from the "root" of the
+  // authority. Thus:
+  //
+  // file://localhost/tmp     ->  'file'://'localhost'/'tmp'    ->  /tmp
+  // file://localhost/c:/tmp  ->  'file'://'localhost'/'c:/tmp' ->  c:/tmp
+  //
+  // This means that the <path> component is represented as a relative path
+  // and, in the general case, we cannot use our path type for its storage
+  // since it assumes the path is for the host platform. In other words, the
+  // interpretation of the path has to take into account the platform of the
+  // authority host. Note, however, that a custom url_traits implementation
+  // can choose to use the path type if local paths are to be interpreted as
+  // relative to the host.
+  //
+  // Note that we currently forbid one-character schemes in order to support
+  // scheme-less (Windows) paths, which can be done by
+  // url_traits::translate_scheme() (see below). Otherwise, a Windows path
+  // that uses forward slashes would be parsed as a valid authority-less URL.
+
+  // A URL host component can be an IPv4 address (if it matches the dotted-
+  // decimal notation), an IPv6 address (if enclosed in [square brackets]), or
+  // otherwise a name.
+  //
+  // Note that non-ASCII host names are allowed in URLs. They must be
+  // UTF8-encoded and URL-encoded afterwards. Currently we store the parsed
+  // host name UTF8-encoded without regard to the template argument string
+  // type. Later we may add support for more appropriate encodings for
+  // multi-byte character types.
+  //
+  enum class url_host_kind {ipv4, ipv6, name};
+
+  template <typename S>
+  struct basic_url_host
+  {
+    using string_type = S;
+    using kind_type   = url_host_kind;
+
+    string_type value; // Host text; may be empty only for the name kind.
+    kind_type   kind;  // Classification of the value (see url_host_kind).
+
+    // Can be implicitly treated as const string_type& (the stored value).
+    //
+    operator const string_type& () const noexcept {return value;}
+
+    // Create an empty host (of the name kind).
+    //
+    basic_url_host (): kind (kind_type::name) {}
+
+    // Create the host object from its string representation as it appears in
+    // a URL, throwing std::invalid_argument if invalid. Remove the enclosing
+    // square brackets for IPv6 addresses, and URL-decode host names.
+    //
+    // Note that currently we don't validate IPv6 addresses.
+    //
+    explicit
+    basic_url_host (string_type);
+
+    basic_url_host (string_type v, kind_type k) // Stored as is, unvalidated.
+        : value (std::move (v)), kind (k) {}
+
+    bool
+    empty () const
+    {
+      assert (kind == kind_type::name || !value.empty ());
+      return value.empty (); // Only a name host is expected to be empty.
+    }
+
+    // Return the string representation of the host as it would appear in a
+    // URL.
+    string_type
+    string () const;
+  };
+
+  template <typename S>
+  struct basic_url_authority
+  {
+    using string_type = S;
+    using host_type   = basic_url_host<string_type>;
+
+    string_type   user;  // Empty if not specified.
+    host_type     host;  // An empty host means an empty authority.
+    std::uint16_t port;  // Zero if not specified; has no initializer.
+
+    bool
+    empty () const
+    {
+      assert (!host.empty () || (user.empty () && port == 0));
+      return host.empty (); // User and port must be unset without a host.
+    }
+
+    // Return a string representation of the URL authority. The string
+    // representation of an empty instance is the empty string.
+    //
+    string_type
+    string () const;
+  };
+
+  template <typename H, typename S = H, typename P = S>
+  struct url_traits
+  {
+    using scheme_type = H;
+    using string_type = S;
+    using path_type   = P;
+
+    using authority_type = basic_url_authority<string_type>;
+
+    // Translate the scheme string representation to its type. May throw
+    // std::invalid_argument. May change the URL components.
+    //
+    // This function is called with an empty scheme if the URL has no scheme,
+    // the scheme is invalid, or the URL could not be parsed into components
+    // according to the URL syntax. In this case all the passed components
+    // reference empty/absent values and if they remain unchanged on return,
+    // the URL is considered invalid and the std::invalid_argument exception
+    // with an appropriate description is thrown by the URL object constructor.
+    // This can be used to support scheme-less URLs, local paths, etc.
+    //
+    static scheme_type
+    translate_scheme (const string_type&         /*url*/,
+                      string_type&&              scheme,
+                      optional<authority_type>&  /*authority*/,
+                      optional<path_type>&       /*path*/,
+                      optional<string_type>&     /*query*/,
+                      optional<string_type>&     /*fragment*/)
+    {
+      return scheme_type (std::move (scheme)); // Default: direct conversion.
+    }
+
+    // Translate scheme type back to its string representation.
+    //
+    // Similar to the above, the function is called with an empty URL string
+    // representation. If on return it is no longer empty, then it is assumed
+    // that the URL has been translated in a custom manner (in which case the
+    // returned scheme value is ignored).
+    //
+    static string_type
+    translate_scheme (string_type&                     /*url*/,
+                      const scheme_type& scheme,
+                      const optional<authority_type>&  /*authority*/,
+                      const optional<path_type>&       /*path*/,
+                      const optional<string_type>&     /*query*/,
+                      const optional<string_type>&     /*fragment*/)
+    {
+      return string_type (scheme); // Default: direct conversion.
+    }
+
+    // Translate the path string representation to its type.
+    //
+    static path_type
+    translate_path (string_type&& path)
+    {
+      return path_type (std::move (path)); // Default: direct conversion.
+    }
+
+    // Translate path type back to its string representation.
+    //
+    static string_type
+    translate_path (const path_type& path) {return string_type (path);}
+  };
+
+  template <typename H, // scheme
+            typename T = url_traits<H>>
+  class basic_url
+  {
+  public:
+    using traits = T;
+
+    using string_type = typename traits::string_type;
+    using char_type   = typename string_type::value_type;
+    using path_type   = typename traits::path_type;
+
+    using scheme_type    = typename traits::scheme_type;
+    using authority_type = typename traits::authority_type;
+    using host_type      = typename authority_type::host_type;
+
+    scheme_type              scheme;
+    optional<authority_type> authority;
+    optional<path_type>      path;
+    optional<string_type>    query;
+    optional<string_type>    fragment;
+
+    // Create an empty URL object.
+    //
+    basic_url () = default;
+
+    // Create the URL object from its string representation. If the argument is
+    // empty, then create an empty object. Otherwise verify that the string
+    // complies with the generic URL syntax. URL-decode and validate components
+    // whose syntax is common to all schemes (scheme, host, port, path).
+    // Throw std::invalid_argument if the passed string is not a valid URL
+    // representation.
+    //
+    // Validation and URL-decoding of the scheme-specific components can be
+    // provided by a custom url_traits::translate_scheme() implementation.
+    //
+    explicit
+    basic_url (const string_type&);
+
+    // Create the URL object from individual components. Performs no
+    // URL-decoding or verification of the components.
+    //
+    basic_url (scheme_type,
+               optional<authority_type>,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    basic_url (scheme_type,
+               host_type host,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    basic_url (scheme_type,
+               host_type host,
+               std::uint16_t port,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    basic_url (scheme_type,
+               string_type host,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    basic_url (scheme_type,
+               string_type host,
+               std::uint16_t port,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    bool
+    empty () const noexcept
+    {
+      assert (!fragment || authority || path || query); // No lone fragment.
+      return !(authority || path || query);
+    }
+
+    // Return a string representation of the URL. Note that while this is not
+    // necessarily syntactically the same string as what was used to
+    // initialize this instance, it should be semantically equivalent. String
+    // representation of an empty instance is the empty string.
+    //
+    string_type
+    string () const;
+
+    // The following predicates can be used to classify URL characters while
+    // parsing, validating or encoding scheme-specific components. For the
+    // semantics of character classes see RFC3986.
+    //
+    static bool
+    gen_delim (char_type c) // RFC3986 gen-delims: : / ? # [ ] @
+    {
+      return c == ':' || c == '/' || c == '?' || c == '#' || c == '[' ||
+             c == ']' || c == '@';
+    }
+
+    static bool
+    sub_delim (char_type c) // RFC3986 sub-delims: ! $ & = ( ) * + , ; and quote
+    {
+      return c == '!' || c == '$' || c == '&' || c == '=' || c == '(' ||
+             c == ')' || c == '*' || c == '+' || c == ',' || c == ';' ||
+             c == '\'';
+    }
+
+    static bool // True for a sub-delimiter or a general delimiter.
+    reserved (char_type c) {return sub_delim (c) || gen_delim (c);}
+
+    static bool // True if alnum() accepts the character or it is one of: - . _ ~
+    unreserved (char_type c)
+    {
+      return alnum (c) || c == '-' ||  c == '.' || c =='_' || c == '~';
+    }
+
+    // URL-encode a character sequence.
+    //
+    // Note that the set of characters that should be encoded may differ for
+    // different URL components. The optional callback function must return
+    // true for characters that should be percent-encoded. The function may
+    // encode the passed character in its own way with another character (but
+    // never with '%'), and return false. By default all characters other than
+    // unreserved are percent-encoded.
+    //
+    // Also note that the characters are interpreted as bytes. In other words,
+    // each character may result in a single encoding triplet.
+    //
+    template <typename I, typename O, typename F = bool (*) (char_type&)>
+    static void
+    encode (I b, I e,
+            O o,
+
+            // VC (as of 15u3) doesn't see unreserved() unless qualified.
+            //
+            F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);});
+
+    template <typename F = bool (*) (char_type&)>
+    static string_type
+    encode (const string_type& s,
+            F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);})
+    {
+      string_type encoded; // Filled by appending through back_inserter.
+      encode (s.begin (), s.end (), std::back_inserter (encoded), f);
+      return encoded;
+    }
+
+    template <typename F = bool (*) (char_type&)>
+    static string_type
+    encode (const char_type* s,
+            F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);})
+    {
+      string_type encoded;
+      const char_type* e (s + string_type::traits_type::length (s));
+      encode (s, e, std::back_inserter (encoded), f); // Null-terminated input.
+      return encoded;
+    }
+
+    // URL-decode a character sequence. Throw std::invalid_argument if an
+    // invalid encoding sequence is encountered.
+    //
+    // If some characters in the sequence are encoded with other characters
+    // (rather than percent-encoded), then one must provide the callback
+    // function to decode them.
+    //
+    template <typename I, typename O, typename F = void (*) (char_type&)>
+    static void
+    decode (I b, I e, O o, F&& f = [] (char_type&) {});
+
+    template <typename F = void (*) (char_type&)>
+    static string_type
+    decode (const string_type& s, F&& f = [] (char_type&) {})
+    {
+      string_type decoded; // Filled by appending through back_inserter.
+      decode (s.begin (), s.end (), std::back_inserter (decoded), f);
+      return decoded;
+    }
+
+    template <typename F = void (*) (char_type&)>
+    static string_type
+    decode (const char_type* s, F&& f = [] (char_type&) {})
+    {
+      string_type decoded;
+      const char_type* e (s + string_type::traits_type::length (s));
+      decode (s, e, std::back_inserter (decoded), f); // Null-terminated input.
+      return decoded;
+    }
+  };
+
+  using url_authority = basic_url_authority<std::string>;
+  using url           = basic_url          <std::string>;
+
+  template <typename S>
+  inline bool
+  operator== (const basic_url_host<S>& x, const basic_url_host<S>& y) noexcept
+  {
+    return x.kind == y.kind && x.value == y.value; // Cheaper kind check first.
+  }
+
+  template <typename S>
+  inline bool
+  operator!= (const basic_url_host<S>& x, const basic_url_host<S>& y) noexcept
+  {
+    return !(x == y); // Negation of the host operator==.
+  }
+
+  template <typename S>
+  inline bool // True if the port, host, and user all compare equal.
+  operator== (const basic_url_authority<S>& x,
+              const basic_url_authority<S>& y) noexcept
+  {
+    return x.port == y.port && x.host == y.host && x.user == y.user;
+  }
+
+  template <typename S>
+  inline bool
+  operator!= (const basic_url_authority<S>& x,
+              const basic_url_authority<S>& y) noexcept
+  {
+    return !(x == y); // Negation of the authority operator==.
+  }
+
+  template <typename S, typename T>
+  inline bool
+  operator== (const basic_url<S, T>& x, const basic_url<S, T>& y) noexcept
+  {
+    // Compare all the components but the scheme first: for empty URLs the
+    // schemes are not compared at all.
+    //
+    const bool same (x.authority == y.authority && x.path == y.path &&
+                     x.query == y.query && x.fragment == y.fragment);
+    if (!same)
+      return false;
+    assert (x.empty () == y.empty ()); // Implied by the equal components.
+
+    return x.empty () || x.scheme == y.scheme;
+  }
+
+  template <typename S, typename T>
+  inline bool
+  operator!= (const basic_url<S, T>& x, const basic_url<S, T>& y) noexcept
+  {
+    return !(x == y); // Negation of the URL operator==.
+  }
+
+  template <typename S, typename T>
+  inline auto
+  operator<< (std::basic_ostream<typename T::string_type::value_type>& o,
+              const basic_url<S, T>& u) -> decltype (o)
+  {
+    return o << u.string (); // Prints the URL string representation.
+  }
+}
+
+#include <libbutl/url.ixx>
+#include <libbutl/url.txx>
-- 
cgit v1.1