8 files changed, 1841 insertions, 8 deletions
diff --git a/libbutl/url.ixx b/libbutl/url.ixx
new file mode 100644
index 0000000..4ff7a06
--- /dev/null
+++ b/libbutl/url.ixx
@@ -0,0 +1,84 @@
+// file      : libbutl/url.ixx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
+{
+  // Component-wise constructor: take each part by value and move it in.
+  template <typename S, typename T>
+  inline basic_url<S, T>::
+  basic_url (scheme_type sc,
+             optional<authority_type> au,
+             optional<path_type> pa,
+             optional<string_type> qu,
+             optional<string_type> fr)
+      : scheme (std::move (sc)),
+        authority (std::move (au)),
+        path (std::move (pa)),
+        query (std::move (qu)),
+        fragment (std::move (fr))
+  {}
+
+  // Host-only overload: wrap the host into an authority (no user, no port).
+  template <typename S, typename T>
+  inline basic_url<S, T>::
+  basic_url (scheme_type sc,
+             host_type ho,
+             optional<path_type> pa,
+             optional<string_type> qu,
+             optional<string_type> fr)
+      : basic_url (std::move (sc),
+                   authority_type {string_type (), std::move (ho), 0},
+                   std::move (pa),
+                   std::move (qu),
+                   std::move (fr))
+  {}
+
+  // Host and port overload: wrap them into an authority with no user.
+  template <typename S, typename T>
+  inline basic_url<S, T>::
+  basic_url (scheme_type sc,
+             host_type ho,
+             std::uint16_t po,
+             optional<path_type> pa,
+             optional<string_type> qu,
+             optional<string_type> fr)
+      : basic_url (std::move (sc),
+                   authority_type {string_type (), std::move (ho), po},
+                   std::move (pa),
+                   std::move (qu),
+                   std::move (fr))
+  {}
+
+  // String host overload: parse the host, then use the host_type overload.
+  template <typename S, typename T>
+  inline basic_url<S, T>::
+  basic_url (scheme_type sc,
+             string_type ho,
+             optional<path_type> pa,
+             optional<string_type> qu,
+             optional<string_type> fr)
+      : basic_url (std::move (sc),
+                   host_type (std::move (ho)),
+                   std::move (pa),
+                   std::move (qu),
+                   std::move (fr))
+  {}
+
+  // String host and port overload: parse the host, then delegate further.
+  template <typename S, typename T>
+  inline basic_url<S, T>::
+  basic_url (scheme_type sc,
+             string_type ho,
+             std::uint16_t po,
+             optional<path_type> pa,
+             optional<string_type> qu,
+             optional<string_type> fr)
+      : basic_url (std::move (sc),
+                   host_type (std::move (ho)),
+                   po,
+                   std::move (pa),
+                   std::move (qu),
+                   std::move (fr))
+  {}
+}
diff --git a/libbutl/url.mxx b/libbutl/url.mxx
new file mode 100644
index 0000000..fe091f1
--- /dev/null
+++ b/libbutl/url.mxx
@@ -0,0 +1,476 @@
+// file      : libbutl/url.mxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules
+#pragma once
+#endif
+
+// C includes.
+
+#include <cassert>
+
+#ifndef __cpp_lib_modules
+#include <string>
+#include <cstdint>  // uint*_t
+#include <utility>  // move()
+#include <ostream>
+#include <iterator> // back_inserter
+
+#include <cstddef>   // size_t
+#include <stdexcept> // invalid_argument
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules
+export module butl.url;
+#ifdef __cpp_lib_modules
+import std.core;
+import std.io;
+#endif
+import butl.path;
+import butl.utility;
+import butl.optional;
+#else
+#include <libbutl/path.mxx>
+#include <libbutl/utility.mxx>
+#include <libbutl/optional.mxx>
+#endif
+
+#include <libbutl/export.hxx>
+
+LIBBUTL_MODEXPORT namespace butl
+{
+  // RFC3986 Uniform Resource Locator (URL).
+  //
+  // <url>       = <scheme>:[//[<authority>]][/<path>][?<query>][#<fragment>]
+  // <authority> = [<user>@]<host>[:<port>]
+  //
+  // Some examples of equivalent URLs to meditate upon:
+  //
+  // file://localhost/tmp     (localhost authority)
+  // file:///tmp              (empty     authority)
+  // file:/tmp                (absent    authority)
+  //
+  // file://localhost/c:/tmp
+  // file:///c:/tmp
+  // file:/c:/tmp
+  //
+  // We think of the slash between <authority> and <path> as a separator but
+  // with the path always interpreted as starting from the "root" of the
+  // authority. Thus:
+  //
+  // file://localhost/tmp     ->  'file'://'localhost'/'tmp'    ->  /tmp
+  // file://localhost/c:/tmp  ->  'file'://'localhost'/'c:/tmp' ->  c:/tmp
+  //
+  // This means that the <path> component is represented as a relative path
+  // and, in the general case, we cannot use our path type for its storage
+  // since it assumes the path is for the host platform. In other words, the
+  // interpretation of the path has to take into account the platform of the
+  // authority host. Note, however, that a custom url_traits implementation
+  // can choose to use the path type if local paths are to be interpreted as
+  // relative to the host.
+  //
+  // Note that we currently forbid one character schemes to support scheme-
+  // less (Windows) paths which can be done by url_traits::translate_scheme()
+  // (see below). (A Windows path that uses forward slashes would be parsed as
+  // a valid authority-less URL).
+
+  // URL host component can be an IPv4 address (if it matches the dotted-
+  // decimal notation), an IPv6 address (if enclosed in [square brackets]) or
+  // otherwise a name.
+  //
+  // Note that non-ASCII host names are allowed in URLs. They must be
+  // UTF8-encoded and URL-encoded afterwards. Currently we store the parsed
+  // host name UTF8-encoded without regard to the template argument string
+  // type. Later we may add support for more appropriate encodings for
+  // multi-byte character types.
+  //
+  enum class url_host_kind {ipv4, ipv6, name};
+
+  template <typename S>
+  struct basic_url_host
+  {
+    using string_type = S;
+    using kind_type   = url_host_kind;
+
+    string_type value; // Empty only for the name kind (see empty()).
+    kind_type   kind;
+
+    // Implicitly converts to the host value (as const string_type&).
+    //
+    operator const string_type& () const noexcept {return value;}
+
+    // Create an empty host (empty value of the name kind).
+    //
+    basic_url_host (): kind (kind_type::name) {}
+
+    // Create the host object from its string representation as it appears in
+    // a URL, throwing std::invalid_argument if invalid. Remove the enclosing
+    // square brackets for IPv6 addresses, and URL-decode host names.
+    //
+    // Note that currently we don't validate IPv6 addresses.
+    //
+    explicit
+    basic_url_host (string_type);
+
+    basic_url_host (string_type v, kind_type k)
+        : value (std::move (v)), kind (k) {} // Stored as is, not parsed.
+
+    bool
+    empty () const
+    {
+      assert (kind == kind_type::name || !value.empty ());
+      return value.empty ();
+    }
+
+    // Return the string representation of the host as it appears in a URL.
+    //
+    string_type
+    string () const;
+  };
+
+  template <typename S>
+  struct basic_url_authority
+  {
+    using string_type = S;
+    using host_type   = basic_url_host<string_type>;
+
+    string_type   user;  // Empty if not specified.
+    host_type     host;  // Empty for an empty authority.
+    std::uint16_t port;  // Zero if not specified.
+
+    bool
+    empty () const
+    {
+      assert (!host.empty () || (user.empty () && port == 0));
+      return host.empty ();
+    }
+
+    // Return the string representation of the URL authority, that is,
+    // [<user>@]<host>[:<port>], or the empty string for an empty instance.
+    //
+    string_type
+    string () const;
+  };
+
+  template <typename H, typename S = H, typename P = S>
+  struct url_traits
+  {
+    using scheme_type = H;
+    using string_type = S;
+    using path_type   = P;
+
+    using authority_type = basic_url_authority<string_type>;
+
+    // Translate the scheme string representation to its type. May throw
+    // std::invalid_argument. May change the URL components.
+    //
+    // This function is called with an empty scheme if the URL has no scheme,
+    // the scheme is invalid, or it could not be parsed into components
+    // according to the URL syntax. In this case all the passed components
+    // reference empty/absent values and if they remain unchanged on return,
+    // the URL is considered invalid and the std::invalid_argument exception
+    // with an appropriate description is thrown by the URL object constructor.
+    // This can be used to support scheme-less URLs, local paths, etc.
+    //
+    static scheme_type
+    translate_scheme (const string_type&         /*url*/,
+                      string_type&&              scheme,
+                      optional<authority_type>&  /*authority*/,
+                      optional<path_type>&       /*path*/,
+                      optional<string_type>&     /*query*/,
+                      optional<string_type>&     /*fragment*/)
+    {
+      return scheme_type (std::move (scheme));
+    }
+
+    // Translate scheme type back to its string representation.
+    //
+    // Similar to the above, the function is called with an empty URL string
+    // representation. If on return this value is no longer empty, then it is
+    // assumed the URL has been translated in a custom manner (in which case
+    // the returned scheme value is ignored).
+    //
+    static string_type
+    translate_scheme (string_type&,                    /*url*/
+                      const scheme_type& scheme,
+                      const optional<authority_type>&  /*authority*/,
+                      const optional<path_type>&       /*path*/,
+                      const optional<string_type>&     /*query*/,
+                      const optional<string_type>&     /*fragment*/)
+    {
+      return string_type (scheme);
+    }
+
+    // Translate the path string representation to its type.
+    //
+    static path_type
+    translate_path (string_type&& path)
+    {
+      return path_type (std::move (path));
+    }
+
+    // Translate path type back to its string representation.
+    //
+    static string_type
+    translate_path (const path_type& path) {return string_type (path);}
+  };
+
+  template <typename H, // scheme
+            typename T = url_traits<H>>
+  class basic_url
+  {
+  public:
+    using traits = T;
+
+    using string_type = typename traits::string_type;
+    using char_type   = typename string_type::value_type;
+    using path_type   = typename traits::path_type;
+
+    using scheme_type    = typename traits::scheme_type;
+    using authority_type = typename traits::authority_type;
+    using host_type      = typename authority_type::host_type;
+
+    scheme_type              scheme;
+    optional<authority_type> authority;
+    optional<path_type>      path;
+    optional<string_type>    query;
+    optional<string_type>    fragment;
+
+    // Create an empty URL object.
+    //
+    basic_url () = default;
+
+    // Create the URL object from its string representation. If the argument is
+    // empty, then create an empty object. Otherwise verify that the string is
+    // compliant to the generic URL syntax. URL-decode and validate components
+    // with common for all schemes syntax (scheme, host, port, path).
+    // Throw std::invalid_argument if the passed string is not a valid URL
+    // representation.
+    //
+    // Validation and URL-decoding of the scheme-specific components can be
+    // provided by a custom url_traits::translate_scheme() implementation.
+    //
+    explicit
+    basic_url (const string_type&);
+
+    // Create the URL object from individual components. Performs no
+    // URL-decoding or verification, except for parsing the string_type host.
+    //
+    basic_url (scheme_type,
+               optional<authority_type>,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    basic_url (scheme_type,
+               host_type host,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    basic_url (scheme_type,
+               host_type host,
+               std::uint16_t port,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    basic_url (scheme_type,
+               string_type host,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    basic_url (scheme_type,
+               string_type host,
+               std::uint16_t port,
+               optional<path_type> path,
+               optional<string_type> query = nullopt,
+               optional<string_type> fragment = nullopt);
+
+    bool
+    empty () const noexcept
+    {
+      assert (authority || path || query || !fragment);
+      return !authority && !path && !query;
+    }
+
+    // Return a string representation of the URL. Note that while this is not
+    // necessarily syntactically the same string as what was used to
+    // initialize this instance, it should be semantically equivalent. String
+    // representation of an empty instance is the empty string.
+    //
+    string_type
+    string () const;
+
+    // The following predicates can be used to classify URL characters while
+    // parsing, validating or encoding scheme-specific components. For the
+    // semantics of character classes see RFC3986.
+    //
+    static bool
+    gen_delim (char_type c)
+    {
+      return c == ':' || c == '/' || c == '?' || c == '#' || c == '[' ||
+             c == ']' || c == '@';
+    }
+
+    static bool
+    sub_delim (char_type c)
+    {
+      return c == '!' || c == '$' || c == '&' || c == '=' || c == '(' ||
+             c == ')' || c == '*' || c == '+' || c == ',' || c == ';' ||
+             c == '\'';
+    }
+
+    static bool
+    reserved (char_type c) {return sub_delim (c) || gen_delim (c);}
+
+    static bool
+    unreserved (char_type c)
+    {
+      return alnum (c) || c == '-' ||  c == '.' || c =='_' || c == '~';
+    }
+
+    // URL-encode a character sequence.
+    //
+    // Note that the set of characters that should be encoded may differ for
+    // different URL components. The optional callback function must return
+    // true for characters that should be percent-encoded. The function may
+    // encode the passed character in its own way with another character (but
+    // never with '%'), and return false. By default all characters other than
+    // unreserved are percent-encoded.
+    //
+    // Also note that the characters are interpreted as bytes. In other words,
+    // each character results in at most one encoding triplet.
+    //
+    template <typename I, typename O, typename F = bool (*) (char_type&)>
+    static void
+    encode (I b, I e,
+            O o,
+
+            // VC (as of 15u3) doesn't see unreserved() unless qualified.
+            //
+            F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);});
+
+    template <typename F = bool (*) (char_type&)>
+    static string_type
+    encode (const string_type& s,
+            F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);})
+    {
+      string_type r;
+      encode (s.begin (), s.end (), std::back_inserter (r), f);
+      return r;
+    }
+
+    template <typename F = bool (*) (char_type&)>
+    static string_type
+    encode (const char_type* s,
+            F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);})
+    {
+      string_type r;
+      encode (s, s + string_type::traits_type::length (s),
+              std::back_inserter (r), f);
+      return r;
+    }
+
+    // URL-decode a character sequence. Throw std::invalid_argument if an
+    // invalid encoding sequence is encountered.
+    //
+    // If some characters in the sequence are encoded with other characters
+    // (rather than percent-encoded), then one must provide the callback
+    // function to decode them.
+    //
+    template <typename I, typename O, typename F = void (*) (char_type&)>
+    static void
+    decode (I b, I e, O o, F&& f = [] (char_type&) {});
+
+    template <typename F = void (*) (char_type&)>
+    static string_type
+    decode (const string_type& s, F&& f = [] (char_type&) {})
+    {
+      string_type r;
+      decode (s.begin (), s.end (), std::back_inserter (r), f);
+      return r;
+    }
+
+    template <typename F = void (*) (char_type&)>
+    static string_type
+    decode (const char_type* s, F&& f = [] (char_type&) {})
+    {
+      string_type r;
+      decode (s, s + string_type::traits_type::length (s),
+              std::back_inserter (r), f);
+      return r;
+    }
+  };
+
+  using url_authority = basic_url_authority<std::string>;
+  using url           = basic_url          <std::string>;
+
+  // Hosts compare equal if both their kinds and their values match.
+  //
+  template <typename S>
+  inline bool
+  operator== (const basic_url_host<S>& l, const basic_url_host<S>& r) noexcept
+  {return l.kind == r.kind && l.value == r.value;}
+
+  // Host inequality is the negation of the host equality.
+  //
+  template <typename S>
+  inline bool
+  operator!= (const basic_url_host<S>& l, const basic_url_host<S>& r) noexcept
+  {return !(l == r);}
+
+  // Authorities are equal if their users, hosts, and ports all match.
+  //
+  template <typename S>
+  inline bool
+  operator== (const basic_url_authority<S>& l,
+              const basic_url_authority<S>& r) noexcept
+  {return l.port == r.port && l.host == r.host && l.user == r.user;}
+
+  // Authority inequality is the negation of the authority equality.
+  //
+  template <typename S>
+  inline bool
+  operator!= (const basic_url_authority<S>& l,
+              const basic_url_authority<S>& r) noexcept
+  {return !(l == r);}
+
+  template <typename S, typename T>
+  inline bool
+  operator== (const basic_url<S, T>& l, const basic_url<S, T>& r) noexcept
+  {
+    // Compare everything but the scheme first: the scheme is only compared
+    // for non-empty URLs (where it is known to be valid).
+    //
+    if (!(l.authority == r.authority) || !(l.path == r.path) ||
+        !(l.query == r.query) || !(l.fragment == r.fragment))
+      return false;
+
+    assert (l.empty () == r.empty ());
+
+    return l.empty () || l.scheme == r.scheme;
+  }
+
+  // URL inequality is the negation of the URL equality.
+  //
+  template <typename S, typename T>
+  inline bool
+  operator!= (const basic_url<S, T>& l, const basic_url<S, T>& r) noexcept
+  {return !(l == r);}
+
+  // Print the URL string representation (as returned by string()).
+  //
+  template <typename S, typename T>
+  inline auto
+  operator<< (std::basic_ostream<typename T::string_type::value_type>& os,
+              const basic_url<S, T>& v) -> decltype (os)
+  {return os << v.string ();}
+}
+
+#include <libbutl/url.ixx>
+#include <libbutl/url.txx>
diff --git a/libbutl/url.txx b/libbutl/url.txx
new file mode 100644
index 0000000..addfe88
--- /dev/null
+++ b/libbutl/url.txx
@@ -0,0 +1,509 @@
+// file      : libbutl/url.txx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
+{
+  // Convenience functions.
+  //
+  template <typename C>
+  inline bool
+  url_path_char (C ch)
+  {
+    using url_type = basic_url<std::basic_string<C>>;
+    // RFC3986 pchar (less the percent-encoding) plus the '/' separator.
+    return url_type::unreserved (ch) || url_type::sub_delim (ch) ||
+           ch == ':' || ch == '@' || ch == '/';
+  }
+
+  // basic_url_host
+  //
+  template <typename S>
+  basic_url_host<S>::
+  basic_url_host (string_type v) // Throws invalid_argument if v is invalid.
+  {
+    using std::invalid_argument;
+
+    using url       = basic_url<string_type>;
+    using char_type = typename string_type::value_type;
+
+    kind = v[0] == '[' ? kind_type::ipv6 : kind_type::name; // Refined below.
+
+    if (kind == url_host_kind::ipv6)
+    {
+      if (v.size () < 3 || v.back () != ']') // Also reject the empty "[]".
+        throw invalid_argument ("invalid IPv6 address");
+
+      value.assign (v, 1, v.size () - 2); // Strip the square brackets.
+    }
+    else
+    {
+      // Detect the IPv4 address host type (four dot-separated decimal octets,
+      // each of 1-3 digits and not greater than 255).
+      {
+        size_t n (0);    // Number of octets parsed so far.
+        string_type oct; // Digits of the octet being parsed.
+
+        auto ipv4_oct = [&oct, &n] () -> bool
+        {
+          if (n == 4 || oct.empty () || oct.size () > 3 || stoul (oct) > 255)
+            return false;
+
+          ++n;
+          oct.clear ();
+          return true;
+        };
+
+        auto i (v.cbegin ());
+        auto e (v.cend ());
+
+        for (; i != e; ++i)
+        {
+          char_type c (*i);
+
+          if (digit (c))
+            oct += c;
+          else if (c != '.' || !ipv4_oct ())
+            break;
+        }
+
+        if (i == e && ipv4_oct () && n == 4)
+          kind = url_host_kind::ipv4;
+      }
+
+      // Verify and decode the host name.
+      //
+      bool dec (false); // True if the name contains percent-encoding.
+      if (kind == url_host_kind::name)
+      {
+        for (auto c: v)
+        {
+          if (!(url::unreserved (c) || url::sub_delim (c) || c == '%'))
+            throw invalid_argument ("invalid host name");
+
+          if (c == '%')
+            dec = true;
+        }
+      }
+
+      value = dec ? url::decode (v) : move (v);
+    }
+  }
+
+  template <typename S>
+  S basic_url_host<S>::
+  string () const
+  {
+    using url_type  = basic_url<string_type>;
+    using char_type = typename string_type::value_type;
+
+    if (empty ())
+      return string_type ();
+
+    switch (kind)
+    {
+    case url_host_kind::ipv4: return value; // Used as is.
+    case url_host_kind::ipv6:
+      {
+        // Enclose the address into the square brackets.
+        //
+        string_type res;
+        res += '[';
+        res += value;
+        res += ']';
+        return res;
+      }
+    case url_host_kind::name:
+      {
+        // RFC3986 requests not to percent-encode all the characters that
+        // are disallowed for the host part:
+        //
+        // URI producing applications must not use percent-encoding in host
+        // unless it is used to represent a UTF-8 character sequence.
+        //
+        // Thus, we only request encoding of the characters outside the ASCII
+        // set, converting the character to an unsigned type that is long
+        // enough for any character type prior to the comparison.
+        //
+        auto non_ascii = [] (char_type& c)
+        {
+          return static_cast<unsigned long> (c) >= 0x80;
+        };
+        return url_type::encode (value, non_ascii);
+      }
+    }
+
+    assert (false); // Can't be here.
+    return string_type ();
+  }
+
+  // basic_url_authority
+  //
+  // Return the decimal representation of the port number as a string of
+  // type S. Only the std::string and std::wstring specializations are
+  // provided (via std::to_string() and std::to_wstring(), respectively).
+  //
+  template <typename S>
+  S
+  port_string (std::uint16_t port);
+
+  template <>
+  inline std::string
+  port_string (std::uint16_t port)
+  {return std::to_string (port);}
+
+  template <>
+  inline std::wstring
+  port_string (std::uint16_t port)
+  {return std::to_wstring (port);}
+
+  template <typename S>
+  S basic_url_authority<S>::
+  string () const
+  {
+    // [<user>@]<host>[:<port>], or the empty string for an empty authority.
+    //
+    string_type s;
+
+    if (!empty ())
+    {
+      if (!user.empty ())
+        (s += user) += '@';
+
+      s += host.string ();
+
+      if (port != 0)
+      {
+        s += ':';
+        s += port_string<string_type> (port);
+      }
+    }
+
+    return s;
+  }
+
+  // basic_url
+  //
+  template <typename S, typename T>
+  basic_url<S, T>::
+  basic_url (const string_type& u) // Throws invalid_argument if u is invalid.
+  {
+    using namespace std;
+
+    using iterator  = typename string_type::const_iterator;
+
+    // Create an empty URL object for the empty argument. Note that the scheme
+    // is default-constructed, and so may stay undefined in this case.
+    //
+    if (u.empty ())
+      return;
+
+    try
+    {
+      // At the end of a component parsing 'i' points to the next component
+      // start, and 'b' stays unchanged.
+      //
+      iterator b (u.cbegin ());
+      iterator i (b);
+      iterator e (u.cend ());
+
+      // Extract scheme.
+      //
+      for(char_type c; i != e && (c = *i) != ':'; ++i)
+      {
+        if (!(i == b
+              ? alpha (c)
+              : (alnum (c) || c == '+' || c == '-' || c == '.')))
+          throw invalid_argument ("invalid scheme");
+      }
+
+      if (i == b || i == e || i == b + 1) // Forbids one letter length schemes.
+        throw invalid_argument ("no scheme");
+
+      string_type sc (b, i++); // Skip ':'.
+
+      // Parse authority.
+      //
+      if (i != e && i + 1 != e && *i == '/' && *(i + 1) == '/')
+      {
+        i += 2; // Skip '//'.
+
+        // Find the authority end.
+        //
+        size_t p (u.find_first_of (string_type ({'/', '?', '#'}), i - b));
+        iterator ae (p != string_type::npos ? b + p : e);
+
+        string_type auth (i, ae);
+        i = ae;
+
+        // Extract user information.
+        //
+        string_type user;
+        p = auth.find ('@');
+        if (p != string_type::npos)
+        {
+          // Don't URL-decode the user information (scheme-specific).
+          //
+          user = string_type (auth, 0, p);
+          auth = string_type (auth, p + 1);
+        }
+
+        // Extract host.
+        //
+        string_type host;
+        p = auth.find_last_of({']', ':'}); // Note: ':' can belong to IPv6.
+
+        if (p != string_type::npos && auth[p] == ']') // There is no port.
+          p = string_type::npos;
+
+        if (p != string_type::npos)
+        {
+          host = string_type (auth, 0, p);
+          auth = string_type (auth, p + 1);
+        }
+        else
+        {
+          host = move (auth);
+          auth = string_type ();
+        }
+
+        // Extract port.
+        //
+        uint16_t port (0);
+        if (!auth.empty ())
+        {
+          auto bad_port = [] () {throw invalid_argument ("invalid port");};
+
+          unsigned long n (0); // Accumulated manually: stoull() could throw
+          for (auto c: auth)   // out_of_range for an overly long sequence.
+          {
+            if (!digit (c) || (n = n * 10 + (c - '0')) > UINT16_MAX)
+              bad_port ();
+          }
+
+          if (n == 0)
+            bad_port ();
+
+          port = static_cast<uint16_t> (n);
+        }
+
+        // User information and port are only meaningful if the host part is
+        // present.
+        //
+        if (host.empty () && (!user.empty () || port != 0))
+          throw invalid_argument ("no host");
+
+        authority = {move (user), host_type (move (host)), port};
+      }
+
+      // Extract path.
+      //
+      if (i != e && *i == '/')
+      {
+        ++i; // Skip '/'.
+
+        // Verify and URL-decode the path.
+        //
+        iterator j (i);
+        for(char_type c; j != e && (c = *j) != '?' && c != '#'; ++j)
+        {
+          if (!(url_path_char (c) || c == '%'))
+            throw invalid_argument ("invalid path");
+        }
+
+        // Note that encoding for non-ASCII path is not specified (in contrast
+        // to the host name), and presumably is local to the referenced
+        // authority.
+        //
+        string_type s;
+        decode (i, j, back_inserter (s));
+        path = traits::translate_path (move (s));
+        i = j;
+      }
+
+      // Extract query.
+      //
+      if (i != e && *i == '?')
+      {
+        ++i; // Skip '?'.
+
+        // Find the query component end.
+        //
+        size_t p (u.find ('#', i - b));
+        iterator qe (p != string_type::npos ? b + p : e);
+
+        // Don't URL-decode the query (scheme-specific).
+        //
+        query = string_type (i, qe);
+        i = qe;
+      }
+
+      // We are not supposed to end up with an empty URL.
+      //
+      if (empty ())
+        throw invalid_argument ("no authority, path or query");
+
+      // Parse fragment.
+      //
+      if (i != e)
+      {
+        ++i; // Skip '#'.
+
+        // Don't URL-decode the fragment (media type-specific).
+        //
+        fragment = string_type (i, e);
+        i = e;
+      }
+
+      assert (i == e);
+
+      // Translate the scheme string representation to its type.
+      //
+      scheme = traits::translate_scheme (u,
+                                         move (sc),
+                                         authority,
+                                         path,
+                                         query,
+                                         fragment);
+    }
+    // If we fail to parse the URL, then delegate this job to
+    // traits::translate_scheme(). If it also fails, leaving the components
+    // absent, then we re-throw.
+    //
+    catch (const invalid_argument&)
+    {
+      authority = nullopt;
+      path      = nullopt;
+      query     = nullopt;
+      fragment  = nullopt;
+
+      scheme = traits::translate_scheme (u,
+                                         string_type () /* scheme */,
+                                         authority,
+                                         path,
+                                         query,
+                                         fragment);
+
+      if (!authority && !path && !query && !fragment)
+        throw;
+    }
+  }
+
+  template <typename S, typename T>
+  typename basic_url<S, T>::string_type basic_url<S, T>::
+  string () const
+  {
+    if (empty ())
+      return string_type ();
+
+    // Let the traits either provide the complete custom representation (in
+    // cust) or just translate the scheme.
+    //
+    string_type cust;
+    string_type res (traits::translate_scheme (cust,
+                                               scheme,
+                                               authority,
+                                               path,
+                                               query,
+                                               fragment));
+
+    if (!cust.empty ())
+      return cust;
+
+    // <scheme>:[//<authority>][/<path>][?<query>][#<fragment>]
+    //
+    res += ':';
+
+    if (authority)
+    {
+      res += '/';
+      res += '/';
+      res += authority->string ();
+    }
+
+    if (path)
+    {
+      // Percent-encode everything but the valid path characters.
+      //
+      auto enc = [] (char_type& c) {return !url_path_char (c);};
+
+      res += '/';
+      res += encode (traits::translate_path (*path), enc);
+    }
+
+    if (query)
+      (res += '?') += *query;
+
+    if (fragment)
+      (res += '#') += *fragment;
+
+    return res;
+  }
+
+  template <typename S, typename T>
+  template <typename I, typename O, typename F>
+  void basic_url<S, T>::
+  encode (I b, I e, O o, F&& f)
+  {
+    const char_type digits[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8',
+                                '9', 'A', 'B', 'C', 'D', 'E', 'F'};
+
+    for (; b != e; ++b)
+    {
+      char_type c (*b); // Copy since f() may replace it (custom encoding).
+
+      if (c == '%' || f (c)) // Note: '%' is always percent-encoded.
+      {
+        assert (c == *b); // Must not be custom-encoded.
+
+        *o++ = '%';
+        *o++ = digits[(c >> 4) & 0xF]; // High nibble.
+        *o++ = digits[c & 0xF];        // Low nibble.
+      }
+      else
+      {
+        assert (c != '%'); // Otherwise decoding will be ambiguous.
+        *o++ = c;
+      }
+    }
+  }
+
+  template <typename S, typename T>
+  template <typename I, typename O, typename F>
+  void basic_url<S, T>::
+  decode (I b, I e, O o, F&& f)
+  {
+    using namespace std;
+
+    for (; b != e; ++b)
+    {
+      char_type c (*b);
+
+      // URL-decode the %XX triplet into a single character.
+      //
+      if (c == '%')
+      {
+        // Note that we can't use (potentially more efficient) strtoul() here
+        // as it doesn't have an overload for the wide character string.
+        // However, the code below shouldn't be inefficient, given that the
+        // string is short, and so is (probably) stack-allocated.
+        //
+        // Note that stoul() throws if no conversion could be performed, so we
+        // explicitly check that '%' is followed by two xdigits.
+        //
+        if (++b != e && xdigit (*b) && b + 1 != e && xdigit (*(b + 1)))
+          c = static_cast<char_type> (stoul (string_type (b, b + 2),
+                                             nullptr,
+                                             16));
+        else
+          throw invalid_argument ("invalid URL-encoding");
+
+        ++b; // Position to the second xdigit.
+      }
+      else
+        f (c); // Let the callback decode a custom-encoded character.
+
+      *o++ = c;
+    }
+  }
+}
diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx
index d703211..fcb8789 100644
--- a/libbutl/utility.ixx
+++ b/libbutl/utility.ixx
@@ -137,4 +137,34 @@ namespace butl
   {
     return std::isalnum (c);
   }
+
+  inline bool
+  xdigit (char c) // True if c is a hexadecimal digit.
+  {
+    return std::isxdigit (static_cast<unsigned char> (c)); // Negative: UB.
+  }
+
+  inline bool
+  alpha (wchar_t c) // True if std::iswalpha() accepts c.
+  {
+    return std::iswalpha (c) != 0;
+  }
+
+  inline bool
+  digit (wchar_t c) // True if std::iswdigit() accepts c.
+  {
+    return std::iswdigit (c) != 0;
+  }
+
+  inline bool
+  alnum (wchar_t c) // True if std::iswalnum() accepts c.
+  {
+    return std::iswalnum (c) != 0;
+  }
+
+  inline bool
+  xdigit (wchar_t c) // True if std::iswxdigit() accepts c.
+  {
+    return std::iswxdigit (c) != 0;
+  }
 }
diff --git a/libbutl/utility.mxx b/libbutl/utility.mxx
index 3f23581..988ca22 100644
--- a/libbutl/utility.mxx
+++ b/libbutl/utility.mxx
@@ -22,7 +22,8 @@
 #include <exception>    // exception, uncaught_exception[s]()
 //#include <functional> // hash
 
-#include <cctype> // toupper(), tolower(), isalpha(), isdigit(), isalnum()
+#include <cctype>  // toupper(), tolower(), is*()
+#include <cwctype> // isw*()
 #endif
 
 #include <libbutl/ft/lang.hxx>      // thread_local
@@ -138,14 +139,15 @@ LIBBUTL_MODEXPORT namespace butl
     }
   };
 
-  bool
-  alpha (char);
-
-  bool
-  digit (char);
+  bool alpha  (char);
+  bool digit  (char);
+  bool alnum  (char);
+  bool xdigit (char);
 
-  bool
-  alnum (char);
+  bool alpha  (wchar_t);
+  bool digit  (wchar_t);
+  bool alnum  (wchar_t);
+  bool xdigit (wchar_t);
 
   // Key comparators (i.e., to be used in sets, maps, etc).
   //
diff --git a/tests/url/buildfile b/tests/url/buildfile
new file mode 100644
index 0000000..ed8380c
--- /dev/null
+++ b/tests/url/buildfile
@@ -0,0 +1,8 @@
+# file      : tests/url/buildfile
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
+
+import libs = libbutl%lib{butl}
+libs += $stdmod_lib
+
+exe{driver}: {hxx cxx}{*} $libs test{testscript}
diff --git a/tests/url/driver.cxx b/tests/url/driver.cxx
new file mode 100644
index 0000000..95fe9cb
--- /dev/null
+++ b/tests/url/driver.cxx
@@ -0,0 +1,346 @@
+// file      : tests/url/driver.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#include <cassert>
+
+#ifndef __cpp_lib_modules
+#include <string>
+#include <utility>   // move()
+#include <iostream>
+#include <iterator>  // back_inserter
+#include <stdexcept> // invalid_argument
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules
+#ifdef __cpp_lib_modules
+import std.core;
+import std.io;
+#endif
+import butl.url;
+import butl.utility; // operator<<(ostream, exception)
+#else
+#include <libbutl/url.mxx>
+#include <libbutl/utility.mxx>
+#endif
+
+using namespace std;
+using namespace butl;
+
+enum class scheme // Scheme type for the custom (wstring-based) URL traits.
+{
+  http,
+  https,
+  file
+};
+
+namespace butl
+{
+  template <>
+  struct url_traits<scheme> // Traits for basic_url<scheme> (see main()).
+  {
+    using string_type = wstring;
+    using path_type   = wstring;
+
+    using scheme_type    = scheme;
+    using authority_type = basic_url_authority<string_type>;
+
+    static scheme_type // Translate the scheme string to its enum value.
+    translate_scheme (const string_type&         url,
+                      string_type&&              scheme,
+                      optional<authority_type>&  /*authority*/,
+                      optional<path_type>&       path,
+                      optional<string_type>&     /*query*/,
+                      optional<string_type>&     /*fragment*/)
+    {
+      // Note that a real program must compare the scheme case-insensitively.
+      //
+      if (scheme == L"http")
+        return scheme_type::http;
+      else if (scheme == L"https")
+        return scheme_type::https;
+      else if (scheme == L"file")
+        return scheme_type::file;
+      else if (scheme.empty ())
+      {
+        // If the URL looks like an absolute filesystem path, then translate it
+        // to the file URL. If it is not, then leave all the components absent
+        // to fail with a proper exception description.
+        //
+        wchar_t c;
+        if ((c = url[0]) == '/' || // E.g., /a.
+            (url.size () > 2 && alpha (c) && url[1] == ':' && url[2] == '/'))
+          path = url; // The whole URL string becomes the path.
+
+        return scheme_type::file;
+      }
+      else
+        throw invalid_argument ("unknown scheme");
+    }
+
+    // Translate scheme type back to its string representation.
+    //
+    static string_type
+    translate_scheme (string_type&,                    /*url*/
+                      const scheme_type& scheme,
+                      const optional<authority_type>&  /*authority*/,
+                      const optional<path_type>&       /*path*/,
+                      const optional<string_type>&     /*query*/,
+                      const optional<string_type>&     /*fragment*/)
+    {
+      switch (scheme)
+      {
+      case scheme_type::http:  return L"http";
+      case scheme_type::https: return L"https";
+      case scheme_type::file:  return L"file";
+      }
+
+      assert (false); // Can't be here.
+      return L"";
+    }
+
+    static path_type // String to path: moved through unchanged.
+    translate_path (string_type&& path)
+    {
+      return path_type (move (path));
+    }
+
+    static string_type // Path to string: a plain copy.
+    translate_path (const path_type& path) {return string_type (path);}
+  };
+}
+
+// Usages:
+//
+// argv[0]
+// argv[0] [-c|-s|-w] <url>
+//
+// Perform some basic tests if no URL is provided. Otherwise round-trip the URL
+// to STDOUT. URL must contain only ASCII characters. Exit with zero code on
+// success. Exit with code one on parsing failure, printing error description
+// to STDERR.
+//
+// -c
+//    Print the URL components one per line. Print the special '[null]' string
+//    for absent components. This is the default option if URL is provided.
+//
+// -s
+//    Print stringified url object representation.
+//
+// -w
+//    Same as above, but use the custom wstring-based url_traits
+//    implementation for the basic_url template.
+//
+int
+main (int argc, const char* argv[]) // See the usage description above.
+try
+{
+  using wurl           = basic_url<scheme>;
+  using wurl_authority = wurl::authority_type;
+  using wurl_host      = wurl::host_type;
+
+  enum class print_mode
+  {
+    str,
+    wstr,
+    comp
+  } mode (print_mode::comp);
+
+  int i (1); // Index of the first non-option argument after the loop.
+  for (; i != argc; ++i)
+  {
+    string o (argv[i]);
+    if (o == "-s")
+      mode = print_mode::str;
+    else if (o == "-w")
+      mode = print_mode::wstr;
+    else if (o == "-c")
+      mode = print_mode::comp;
+    else
+      break; // End of options.
+  }
+
+  if (i == argc) // No URL specified: perform the basic tests.
+  {
+    // Test ctors and operators.
+    //
+    {
+      wurl u0 ((wstring ()));
+      assert (u0.empty ());
+      assert (u0 == wurl ());
+
+      wurl u1 (scheme::http,
+               wurl_authority {wstring (), wurl_host (L"[123]"), 0},
+               wstring (L"login"),
+               wstring (L"q="),
+               wstring (L"f"));
+
+      assert (!u1.empty ());
+      assert (u1 != u0);
+
+      wurl u2 (scheme::http,
+               wurl_host (L"123", url_host_kind::ipv6),
+               wstring (L"login"),
+               wstring (L"q="),
+               wstring (L"f"));
+
+      assert (u2 == u1);
+
+      wurl u3 (scheme::http,
+               wurl_host (L"123", url_host_kind::ipv6),
+               0,
+               wstring (L"login"),
+               wstring (L"q="),
+               wstring (L"f"));
+
+      assert (u3 == u2);
+
+      wurl u4 (scheme::http,
+               L"[123]",
+               wstring (L"login"),
+               wstring (L"q="),
+               wstring (L"f"));
+
+      assert (u4 == u3);
+
+      wurl u5 (scheme::http,
+               L"[123]",
+               0,
+               wstring (L"login"),
+               wstring (L"q="),
+               wstring (L"f"));
+
+      assert (u5 == u4);
+    }
+
+    // Test encode and decode.
+    //
+    {
+      const char* s ("ABC +");
+      string es (url::encode (s));
+
+      assert (es == "ABC%20%2B");
+      string ds (url::decode (es));
+
+      assert (ds == s);
+    }
+
+    {
+      const char* s ("ABC +");
+
+      string es (url::encode (s,
+                              [] (char& c) -> bool
+                              {
+                                if (c == ' ')
+                                {
+                                  c = '+';
+                                  return false;
+                                }
+                                return !url::unreserved (c);
+                              }));
+
+      assert (es == "ABC+%2B");
+
+      string ds (url::decode (es.c_str (),
+                              [] (char& c)
+                              {
+                                if (c == '+')
+                                  c = ' ';
+                              }));
+      assert (ds == s);
+    }
+    {
+      const wchar_t s[] = L"ABC ";
+
+      wstring es;
+      wurl::encode (s, s + 4,
+                    back_inserter (es),
+                    [] (wchar_t& c) -> bool
+                    {
+                      if (!alnum (c))
+                        return true;
+
+                      ++c;
+                      return false;
+                    });
+      assert (es == L"BCD%20");
+
+      wstring ds (wurl::decode (es,
+                                [] (wchar_t& c)
+                                {
+                                  if (alnum (c))
+                                    --c;
+                                }));
+      assert (ds == s);
+    }
+  }
+  else // Round-trip the URL.
+  {
+    assert (i + 1 == argc);
+
+    const char* ua (argv[i]);
+
+    switch (mode)
+    {
+    case print_mode::str:
+      {
+        cout << url (ua) << endl;
+        break;
+      }
+    case print_mode::wstr:
+      {
+        // Convert ASCII string to wstring (no <cstring> dependency).
+        //
+        wstring s (ua, ua + char_traits<char>::length (ua));
+
+        wcout << wurl (s) << endl;
+        break;
+      }
+    case print_mode::comp:
+      {
+        // Convert ASCII string to wstring (no <cstring> dependency).
+        //
+        wstring s (ua, ua + char_traits<char>::length (ua));
+        wurl u (s);
+
+        if (!u.empty ())
+        {
+          wstring us; // Receives the URL argument of translate_scheme().
+          wcout << wurl::traits::translate_scheme (us,
+                                                   u.scheme,
+                                                   nullopt,
+                                                   nullopt,
+                                                   nullopt,
+                                                   nullopt) << endl;
+        }
+        else
+          wcout << L"[null]" << endl;
+
+        if (u.authority)
+        {
+          const wchar_t* kinds[] = {L"ipv4", L"ipv6", L"name"};
+          const wurl_authority& a (*u.authority);
+
+          wcout << a.user << L'@' << a.host.value << L':' << a.port
+                << " " << kinds[static_cast<size_t> (a.host.kind)] << endl;
+        }
+        else
+          wcout << L"[null]" << endl;
+
+        wcout << (u.path     ? *u.path     : L"[null]") << endl
+              << (u.query    ? *u.query    : L"[null]") << endl
+              << (u.fragment ? *u.fragment : L"[null]") << endl;
+        break;
+      }
+    }
+  }
+
+  return 0;
+}
+catch (const invalid_argument& e) // Parsing failure: report and exit 1.
+{
+  cerr << e << endl;
+  return 1;
+}
diff --git a/tests/url/testscript b/tests/url/testscript
new file mode 100644
index 0000000..d81f282
--- /dev/null
+++ b/tests/url/testscript
@@ -0,0 +1,378 @@
+# file      : tests/url/testscript
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
+
+:basic
+:
+$*
+
+: components
+:
+{
+  : all
+  :
+  $* 'https://user@stage.b2.org:443/libbutl?f=full#description' >>EOO
+  https
+  user@stage.b2.org:443 name
+  libbutl
+  f=full
+  description
+  EOO
+
+  : empty-url
+  :
+  $* '' >>EOO
+  [null]
+  [null]
+  [null]
+  [null]
+  [null]
+  EOO
+
+  : no-id
+  :
+  {
+    $* 'file:#f'  2>'no authority, path or query' != 0 : fragment
+    $* 'file:aaa' 2>'no authority, path or query' != 0 : junk
+    $* 'file:'    2>'no authority, path or query' != 0 : none
+  }
+
+  : scheme
+  :
+  {
+    : detected
+    :
+    $* 'http://build2.org' >>EOO
+    http
+    @build2.org:0 name
+    [null]
+    [null]
+    [null]
+    EOO
+
+    : deduced
+    :
+    $* 'c:/a' >>EOO
+    file
+    [null]
+    c:/a
+    [null]
+    [null]
+    EOO
+
+    $* ':/a'           2>'no scheme'      != 0 : none
+    $* 'http'          2>'no scheme'      != 0 : unterminated
+    $* 'ht~tp://a.com' 2>'invalid scheme' != 0 : invalid-char
+    $* '1http://a.com' 2>'invalid scheme' != 0 : invalid-first-char
+  }
+
+  : authority
+  {
+    : absent
+    :
+    $* 'file:/tmp/a' >>EOO
+    file
+    [null]
+    tmp/a
+    [null]
+    [null]
+    EOO
+
+    : empty
+    :
+    $* 'file:///tmp/a' >>EOO
+    file
+    @:0 name
+    tmp/a
+    [null]
+    [null]
+    EOO
+
+    : trailing
+    :
+    $* 'http://localhost' >>EOO
+    http
+    @localhost:0 name
+    [null]
+    [null]
+    [null]
+    EOO
+
+    : user
+    :
+    {
+      : non-empty
+      :
+      $* 'http://admin@localhost' >>EOO
+      http
+      admin@localhost:0 name
+      [null]
+      [null]
+      [null]
+      EOO
+
+      : empty
+      :
+      $* 'http://@localhost' >>EOO
+      http
+      @localhost:0 name
+      [null]
+      [null]
+      [null]
+      EOO
+    }
+
+    : host
+    :
+    {
+      : ipv6
+      :
+      {
+        : port
+        :
+        $* 'http://[1:23]:443' >>EOO
+        http
+        @1:23:443 ipv6
+        [null]
+        [null]
+        [null]
+        EOO
+
+        : no-port
+        :
+        $* 'http://[1:23]' >>EOO
+        http
+        @1:23:0 ipv6
+        [null]
+        [null]
+        [null]
+        EOO
+
+        $* 'http://[123'      2>'invalid IPv6 address' != 0 : missed-bracket
+        $* 'http://[123] :80' 2>'invalid IPv6 address' != 0 : extra-char
+      }
+
+      : ipv4
+      :
+      {
+        : valid
+        :
+        $* 'http://0.10.200.255' >>EOO
+        http
+        @0.10.200.255:0 ipv4
+        [null]
+        [null]
+        [null]
+        EOO
+
+        : long
+        :
+        $* 'http://0.10.200.255.30' >>EOO
+        http
+        @0.10.200.255.30:0 name
+        [null]
+        [null]
+        [null]
+        EOO
+
+        : short
+        :
+        $* 'http://0.10.200' >>EOO
+        http
+        @0.10.200:0 name
+        [null]
+        [null]
+        [null]
+        EOO
+
+        : missed
+        :
+        $* 'http://0.10..200' >>EOO
+        http
+        @0.10..200:0 name
+        [null]
+        [null]
+        [null]
+        EOO
+
+        : out-of-range
+        :
+        $* 'http://0.10.200.256' >>EOO
+        http
+        @0.10.200.256:0 name
+        [null]
+        [null]
+        [null]
+        EOO
+      }
+
+      : name
+      :
+      {
+        : valid
+        :
+        $* 'https://www.b2.org' >>EOO
+        https
+        @www.b2.org:0 name
+        [null]
+        [null]
+        [null]
+        EOO
+
+        : encoded
+        :
+        {
+          : valid
+          :
+          $* 'https://www.%62%32.org' >>EOO
+          https
+          @www.b2.org:0 name
+          [null]
+          [null]
+          [null]
+          EOO
+
+          $* 'https://www.%62%3.org'  2>'invalid URL-encoding' != 0 : short
+          $* 'https://www.%62%3x.org' 2>'invalid URL-encoding' != 0 : invalid
+          $* 'https://www.%62%.org'   2>'invalid URL-encoding' != 0 : absent
+        }
+
+        $* 'https://www.b|2.org' 2>'invalid host name' != 0 : invalid-char
+      }
+
+      $* 'http://admin@:80?q=' 2>'no host' != 0: no-host
+    }
+
+    : port
+    :
+    {
+      : valid
+      :
+      $* 'http://build2.org:443' >>EOO
+      http
+      @build2.org:443 name
+      [null]
+      [null]
+      [null]
+      EOO
+
+      $* 'http://build2.org:-433'  2>'invalid port' != 0 : invalid-char
+      $* 'http://build2.org:70000' 2>'invalid port' != 0 : exceeds-max
+      $* 'http://build2.org:0'     2>'invalid port' != 0 : zero
+    }
+  }
+
+  : path
+  :
+  {
+    : absent
+    :
+    $* 'http://b2.org' >>EOO
+    http
+    @b2.org:0 name
+    [null]
+    [null]
+    [null]
+    EOO
+
+    : empty
+    :
+    $* 'http://b2.org/' >>EOO
+    http
+    @b2.org:0 name
+
+    [null]
+    [null]
+    EOO
+
+    : non-empty
+    :
+    $* 'http://b2.org/s/q' >>EOO
+    http
+    @b2.org:0 name
+    s/q
+    [null]
+    [null]
+    EOO
+
+    : encoded
+    :
+    $* 'http://b2.org/%6F/s' >>EOO
+    http
+    @b2.org:0 name
+    o/s
+    [null]
+    [null]
+    EOO
+  }
+
+  : query
+  :
+  {
+    : no-fragment
+    :
+    $* 'http://b2.org/a?x=foo&y=bar' >>EOO
+    http
+    @b2.org:0 name
+    a
+    x=foo&y=bar
+    [null]
+    EOO
+
+    : fragment
+    :
+    $* 'http://b2.org/a?foo#bar' >>EOO
+    http
+    @b2.org:0 name
+    a
+    foo
+    bar
+    EOO
+  }
+
+  : fragment
+  :
+  {
+    $* 'http://b2.org#foo' >>EOO
+    http
+    @b2.org:0 name
+    [null]
+    [null]
+    foo
+    EOO
+  }
+}
+
+: string
+{
+  test.options += -s
+
+  : authority
+  :
+  {
+    : host
+    :
+    {
+      $* 'file:///a'       >'file:///a'       : empty
+      $* 'http://1.1.1.1'  >'http://1.1.1.1'  : ipv4
+      $* 'https://[1:2:3]' >'https://[1:2:3]' : ipv6
+      $* 'file://a%d1%84'  >'file://a%D1%84'  : name
+    }
+
+    $* 'http://admin@localhost' >'http://admin@localhost' : user
+    $* 'http://localhost:8080'  >'http://localhost:8080'  : port
+    $* 'file:/a'                >'file:/a'                : absent
+  }
+
+  $* ''            >''            : empty
+  $* 'file:/b%7C2' >'file:/b%7C2' : path
+  $* 'http://a?q=' >'http://a?q=' : query
+  $* 'http://a#f'  >'http://a#f'  : fragment
+}
+
+: wstring
+:
+{
+  u = 'https://user@stage.b2.org:443/libbutl?f=full#description'
+  $* -w "$u" >"$u"
+}