aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--libbutl/url.ixx84
-rw-r--r--libbutl/url.mxx476
-rw-r--r--libbutl/url.txx509
-rw-r--r--libbutl/utility.ixx30
-rw-r--r--libbutl/utility.mxx18
-rw-r--r--tests/url/buildfile8
-rw-r--r--tests/url/driver.cxx346
-rw-r--r--tests/url/testscript378
8 files changed, 1841 insertions, 8 deletions
diff --git a/libbutl/url.ixx b/libbutl/url.ixx
new file mode 100644
index 0000000..4ff7a06
--- /dev/null
+++ b/libbutl/url.ixx
@@ -0,0 +1,84 @@
+// file : libbutl/url.ixx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
+{
+ // Component-wise constructor: take every component by value and move it
+ // into the corresponding data member. The components are stored as is.
+ //
+ template <typename H, typename T>
+ inline basic_url<H, T>::
+ basic_url (scheme_type sc,
+            optional<authority_type> au,
+            optional<path_type> pa,
+            optional<string_type> qu,
+            optional<string_type> fr)
+     : scheme (std::move (sc)),
+       authority (std::move (au)),
+       path (std::move (pa)),
+       query (std::move (qu)),
+       fragment (std::move (fr))
+ {
+ }
+
+ // Construct from a ready-made host object without user information or port
+ // (the port is passed on as zero). Delegates to the component-wise
+ // constructor.
+ //
+ template <typename H, typename T>
+ inline basic_url<H, T>::
+ basic_url (scheme_type sc,
+            host_type ho,
+            optional<path_type> pa,
+            optional<string_type> qu,
+            optional<string_type> fr)
+     : basic_url (std::move (sc),
+                  authority_type {string_type (), std::move (ho), 0},
+                  std::move (pa),
+                  std::move (qu),
+                  std::move (fr))
+ {
+ }
+
+ // Construct from a ready-made host object and a port, without user
+ // information. Delegates to the component-wise constructor.
+ //
+ template <typename H, typename T>
+ inline basic_url<H, T>::
+ basic_url (scheme_type sc,
+            host_type ho,
+            std::uint16_t po,
+            optional<path_type> pa,
+            optional<string_type> qu,
+            optional<string_type> fr)
+     : basic_url (std::move (sc),
+                  authority_type {string_type (), std::move (ho), po},
+                  std::move (pa),
+                  std::move (qu),
+                  std::move (fr))
+ {
+ }
+
+ // Construct from the host string representation. The string is converted
+ // to the host object with the explicit host_type(string_type) constructor
+ // and the result is passed on to the host_type overload (no port).
+ //
+ template <typename H, typename T>
+ inline basic_url<H, T>::
+ basic_url (scheme_type sc,
+            string_type ho,
+            optional<path_type> pa,
+            optional<string_type> qu,
+            optional<string_type> fr)
+     : basic_url (std::move (sc),
+                  host_type (std::move (ho)),
+                  std::move (pa),
+                  std::move (qu),
+                  std::move (fr))
+ {
+ }
+
+ // Construct from the host string representation and a port. The string is
+ // converted to the host object with the explicit host_type(string_type)
+ // constructor and the result is passed on to the host_type/port overload.
+ //
+ template <typename H, typename T>
+ inline basic_url<H, T>::
+ basic_url (scheme_type sc,
+            string_type ho,
+            std::uint16_t po,
+            optional<path_type> pa,
+            optional<string_type> qu,
+            optional<string_type> fr)
+     : basic_url (std::move (sc),
+                  host_type (std::move (ho)),
+                  po,
+                  std::move (pa),
+                  std::move (qu),
+                  std::move (fr))
+ {
+ }
+}
diff --git a/libbutl/url.mxx b/libbutl/url.mxx
new file mode 100644
index 0000000..fe091f1
--- /dev/null
+++ b/libbutl/url.mxx
@@ -0,0 +1,476 @@
+// file : libbutl/url.mxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules
+#pragma once
+#endif
+
+// C includes.
+
+#include <cassert>
+
+#ifndef __cpp_lib_modules
+#include <string>
+#include <cstdint> // uint*_t
+#include <utility> // move()
+#include <ostream>
+#include <iterator> // back_inserter
+
+#include <cstddef> // size_t
+#include <stdexcept> // invalid_argument
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules
+export module butl.url;
+#ifdef __cpp_lib_modules
+import std.core;
+import std.io;
+#endif
+import butl.path;
+import butl.utility;
+import butl.optional;
+#else
+#include <libbutl/path.mxx>
+#include <libbutl/utility.mxx>
+#include <libbutl/optional.mxx>
+#endif
+
+#include <libbutl/export.hxx>
+
+LIBBUTL_MODEXPORT namespace butl
+{
+ // RFC3986 Uniform Resource Locator (URL).
+ //
+ // <url> = <scheme>:[//[<authority>]][/<path>][?<query>][#<fragment>]
+ // <authority> = [<user>@]<host>[:<port>]
+ //
+ // Some examples of equivalent URLs to meditate upon:
+ //
+ // file://localhost/tmp (localhost authority)
+ // file:///tmp (empty authority)
+ // file:/tmp (absent authority)
+ //
+ // file://localhost/c:/tmp
+ // file:///c:/tmp
+ // file:/c:/tmp
+ //
+ // We think of the slash between <authority> and <path> as a separator but
+ // with the path always interpreted as starting from the "root" of the
+ // authority. Thus:
+ //
+ // file://localhost/tmp -> 'file'://'localhost'/'tmp' -> /tmp
+ // file://localhost/c:/tmp -> 'file'://'localhost'/'c:/tmp' -> c:/tmp
+ //
+ // This means that the <path> component is represented as a relative path
+ // and, in the general case, we cannot use our path type for its storage
+ // since it assumes the path is for the host platform. In other words, the
+ // interpretation of the path has to take into account the platform of the
+ // authority host. Note, however, that a custom url_traits implementation
+ // can choose to use the path type if local paths are to be interpreted as
+ // relative to the host.
+ //
+ // Note that we currently forbid one character schemes to support scheme-
+ // less (Windows) paths which can be done by url_traits::translate_scheme()
+ // (see below). (A Windows path that uses forward slashes would be parsed as
+ // a valid authority-less URL).
+
+ // URL host component can be an IPv4 address (if matches its dotted-decimal
+ // notation), an IPv6 address (if enclosed in [square brackets]) or
+ // otherwise a name.
+ //
+ // Note that non-ASCII host names are allowed in URLs. They must be
+ // UTF8-encoded and URL-encoded afterwards. Currently we store the parsed
+ // host name UTF8-encoded without regards to the template argument string
+ // type. Later we may add support for more appropriate encodings for
+ // multi-byte character types.
+ //
+ enum class url_host_kind {ipv4, ipv6, name};
+
+ // URL host: the host value together with its kind (IPv4 address, IPv6
+ // address, or name).
+ //
+ template <typename S>
+ struct basic_url_host
+ {
+ using string_type = S;
+ using kind_type = url_host_kind;
+
+ // Host value and its kind. For a non-name kind the value is expected to be
+ // non-empty (see the assertion in empty()).
+ //
+ string_type value;
+ kind_type kind;
+
+ // Can be treated as const string_type&.
+ //
+ operator const string_type& () const noexcept {return value;}
+
+ // Create an empty host.
+ //
+ basic_url_host (): kind (kind_type::name) {}
+
+ // Create the host object from its string representation as it appears in
+ // a URL, throwing std::invalid_argument if invalid. Remove the enclosing
+ // square brackets for IPv6 addresses, and URL-decode host names.
+ //
+ // Note that currently we don't validate IPv6 addresses.
+ //
+ explicit
+ basic_url_host (string_type);
+
+ // Create the host object from the value and kind, storing them as is.
+ //
+ basic_url_host (string_type v, kind_type k)
+ : value (std::move (v)), kind (k) {}
+
+ // Return true if the host value is empty. Only a host of the name kind is
+ // allowed to be empty.
+ //
+ bool
+ empty () const
+ {
+ assert (kind == kind_type::name || !value.empty ());
+ return value.empty ();
+ }
+
+ // Return string representation of the host as it would appear in a URL.
+ //
+ string_type
+ string () const;
+ };
+
+ // URL authority component: [<user>@]<host>[:<port>].
+ //
+ // Note that this is an aggregate and it is initialized as such (user, host,
+ // port), so the order of the data members matters.
+ //
+ template <typename S>
+ struct basic_url_authority
+ {
+ using string_type = S;
+ using host_type = basic_url_host<string_type>;
+
+ string_type user; // Empty if not specified.
+ host_type host;
+ std::uint16_t port; // Zero if not specified.
+
+ // Return true if the host is empty. The user and port may only be
+ // specified together with the host, which is asserted.
+ //
+ bool
+ empty () const
+ {
+ assert (!host.empty () || (user.empty () && port == 0));
+ return host.empty ();
+ }
+
+ // Return a string representation of the URL authority. String
+ // representation of an empty instance is the empty string.
+ //
+ string_type
+ string () const;
+ };
+
+ // Default URL traits: the scheme and path types are constructed directly
+ // from, and converted directly back to, the string type.
+ //
+ // H is the scheme type, S is the string type, and P is the path type.
+ //
+ template <typename H, typename S = H, typename P = S>
+ struct url_traits
+ {
+ using scheme_type = H;
+ using string_type = S;
+ using path_type = P;
+
+ using authority_type = basic_url_authority<string_type>;
+
+ // Translate the scheme string representation to its type. May throw
+ // std::invalid_argument. May change the URL components.
+ //
+ // This function is called with an empty scheme if the URL has no scheme,
+ // the scheme is invalid, or it could not be parsed into components
+ // according to the URL syntax. In this case all the passed components
+ // reference empty/absent values and if they remain unchanged on return,
+ // the URL is considered invalid and the std::invalid_argument exception
+ // with an appropriate description is thrown by the URL object constructor.
+ // This can be used to support scheme-less URLs, local paths, etc.
+ //
+ static scheme_type
+ translate_scheme (const string_type& /*url*/,
+ string_type&& scheme,
+ optional<authority_type>& /*authority*/,
+ optional<path_type>& /*path*/,
+ optional<string_type>& /*query*/,
+ optional<string_type>& /*fragment*/)
+ {
+ return scheme_type (std::move (scheme));
+ }
+
+ // Translate scheme type back to its string representation.
+ //
+ // Similar to the above, the function is called with an empty URL string
+ // representation. If on return this value is no longer empty, then it is
+ // assumed the URL has been translated in a custom manner (in which case
+ // the returned scheme value is ignored).
+ //
+ static string_type
+ translate_scheme (string_type&, /*url*/
+ const scheme_type& scheme,
+ const optional<authority_type>& /*authority*/,
+ const optional<path_type>& /*path*/,
+ const optional<string_type>& /*query*/,
+ const optional<string_type>& /*fragment*/)
+ {
+ return string_type (scheme);
+ }
+
+ // Translate the path string representation to its type.
+ //
+ static path_type
+ translate_path (string_type&& path)
+ {
+ return path_type (std::move (path));
+ }
+
+ // Translate path type back to its string representation.
+ //
+ static string_type
+ translate_path (const path_type& path) {return string_type (path);}
+ };
+
+ // URL object: the scheme plus optional authority, path, query, and fragment
+ // components. The component types and the scheme/path translation are
+ // provided by the traits type.
+ //
+ template <typename H, // scheme
+ typename T = url_traits<H>>
+ class basic_url
+ {
+ public:
+ using traits = T;
+
+ using string_type = typename traits::string_type;
+ using char_type = typename string_type::value_type;
+ using path_type = typename traits::path_type;
+
+ using scheme_type = typename traits::scheme_type;
+ using authority_type = typename traits::authority_type;
+ using host_type = typename authority_type::host_type;
+
+ // URL components. An absent component is represented with nullopt.
+ //
+ scheme_type scheme;
+ optional<authority_type> authority;
+ optional<path_type> path;
+ optional<string_type> query;
+ optional<string_type> fragment;
+
+ // Create an empty URL object.
+ //
+ basic_url () = default;
+
+ // Create the URL object from its string representation. If the argument is
+ // empty, then create an empty object. Otherwise verify that the string is
+ // compliant with the generic URL syntax. URL-decode and validate the
+ // components whose syntax is common to all schemes (scheme, host, port,
+ // path). Throw std::invalid_argument if the passed string is not a valid
+ // URL representation.
+ //
+ // Validation and URL-decoding of the scheme-specific components can be
+ // provided by a custom url_traits::translate_scheme() implementation.
+ //
+ explicit
+ basic_url (const string_type&);
+
+ // Create the URL object from individual components. Performs no
+ // components URL-decoding or verification.
+ //
+ basic_url (scheme_type,
+ optional<authority_type>,
+ optional<path_type> path,
+ optional<string_type> query = nullopt,
+ optional<string_type> fragment = nullopt);
+
+ basic_url (scheme_type,
+ host_type host,
+ optional<path_type> path,
+ optional<string_type> query = nullopt,
+ optional<string_type> fragment = nullopt);
+
+ basic_url (scheme_type,
+ host_type host,
+ std::uint16_t port,
+ optional<path_type> path,
+ optional<string_type> query = nullopt,
+ optional<string_type> fragment = nullopt);
+
+ basic_url (scheme_type,
+ string_type host,
+ optional<path_type> path,
+ optional<string_type> query = nullopt,
+ optional<string_type> fragment = nullopt);
+
+ basic_url (scheme_type,
+ string_type host,
+ std::uint16_t port,
+ optional<path_type> path,
+ optional<string_type> query = nullopt,
+ optional<string_type> fragment = nullopt);
+
+ // Return true if the authority, path, and query are all absent. Note that
+ // the fragment may only be present if at least one of these components is
+ // present, which is asserted.
+ //
+ bool
+ empty () const noexcept
+ {
+ assert (authority || path || query || !fragment);
+ return !authority && !path && !query;
+ }
+
+ // Return a string representation of the URL. Note that while this is not
+ // necessarily syntactically the same string as what was used to
+ // initialize this instance, it should be semantically equivalent. String
+ // representation of an empty instance is the empty string.
+ //
+ string_type
+ string () const;
+
+ // The following predicates can be used to classify URL characters while
+ // parsing, validating or encoding scheme-specific components. For the
+ // semantics of character classes see RFC3986.
+ //
+ static bool
+ gen_delim (char_type c)
+ {
+ return c == ':' || c == '/' || c == '?' || c == '#' || c == '[' ||
+ c == ']' || c == '@';
+ }
+
+ static bool
+ sub_delim (char_type c)
+ {
+ return c == '!' || c == '$' || c == '&' || c == '=' || c == '(' ||
+ c == ')' || c == '*' || c == '+' || c == ',' || c == ';' ||
+ c == '\'';
+ }
+
+ static bool
+ reserved (char_type c) {return sub_delim (c) || gen_delim (c);}
+
+ static bool
+ unreserved (char_type c)
+ {
+ return alnum (c) || c == '-' || c == '.' || c =='_' || c == '~';
+ }
+
+ // URL-encode a character sequence.
+ //
+ // Note that the set of characters that should be encoded may differ for
+ // different URL components. The optional callback function must return
+ // true for characters that should be percent-encoded. The function may
+ // encode the passed character in its own way with another character (but
+ // never with '%'), and return false. By default all characters other than
+ // unreserved are percent-encoded.
+ //
+ // Also note that the characters are interpreted as bytes. In other words,
+ // each character may result in a single encoding triplet.
+ //
+ template <typename I, typename O, typename F = bool (*) (char_type&)>
+ static void
+ encode (I b, I e,
+ O o,
+
+ // VC (as of 15u3) doesn't see unreserved() unless qualified.
+ //
+ F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);});
+
+ template <typename F = bool (*) (char_type&)>
+ static string_type
+ encode (const string_type& s,
+ F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);})
+ {
+ string_type r;
+ encode (s.begin (), s.end (), std::back_inserter (r), f);
+ return r;
+ }
+
+ template <typename F = bool (*) (char_type&)>
+ static string_type
+ encode (const char_type* s,
+ F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);})
+ {
+ string_type r;
+ encode (s, s + string_type::traits_type::length (s),
+ std::back_inserter (r), f);
+ return r;
+ }
+
+ // URL-decode a character sequence. Throw std::invalid_argument if an
+ // invalid encoding sequence is encountered.
+ //
+ // If some characters in the sequence are encoded with other characters
+ // (rather than percent-encoded), then one must provide the callback
+ // function to decode them.
+ //
+ template <typename I, typename O, typename F = void (*) (char_type&)>
+ static void
+ decode (I b, I e, O o, F&& f = [] (char_type&) {});
+
+ template <typename F = void (*) (char_type&)>
+ static string_type
+ decode (const string_type& s, F&& f = [] (char_type&) {})
+ {
+ string_type r;
+ decode (s.begin (), s.end (), std::back_inserter (r), f);
+ return r;
+ }
+
+ template <typename F = void (*) (char_type&)>
+ static string_type
+ decode (const char_type* s, F&& f = [] (char_type&) {})
+ {
+ string_type r;
+ decode (s, s + string_type::traits_type::length (s),
+ std::back_inserter (r), f);
+ return r;
+ }
+ };
+
+ using url_authority = basic_url_authority<std::string>;
+ using url = basic_url <std::string>;
+
+ // Two hosts are equal if both their values and their kinds are equal.
+ //
+ template <typename S>
+ inline bool
+ operator== (const basic_url_host<S>& l, const basic_url_host<S>& r) noexcept
+ {
+   if (!(l.value == r.value))
+     return false;
+
+   return l.kind == r.kind;
+ }
+
+ // Host inequality, defined as the negation of operator==.
+ //
+ template <typename S>
+ inline bool
+ operator!= (const basic_url_host<S>& l, const basic_url_host<S>& r) noexcept
+ {
+   const bool same (l == r);
+   return !same;
+ }
+
+ // Two authorities are equal if their users, hosts, and ports are all equal.
+ //
+ template <typename S>
+ inline bool
+ operator== (const basic_url_authority<S>& l,
+             const basic_url_authority<S>& r) noexcept
+ {
+   if (!(l.user == r.user))
+     return false;
+
+   if (!(l.host == r.host))
+     return false;
+
+   return l.port == r.port;
+ }
+
+ // Authority inequality, defined as the negation of operator==.
+ //
+ template <typename S>
+ inline bool
+ operator!= (const basic_url_authority<S>& l,
+             const basic_url_authority<S>& r) noexcept
+ {
+   const bool same (l == r);
+   return !same;
+ }
+
+ // Two URLs are equal if all their components are equal. The schemes are
+ // only compared for non-empty URLs since the scheme of an empty URL is not
+ // meaningful.
+ //
+ template <typename S, typename T>
+ inline bool
+ operator== (const basic_url<S, T>& l, const basic_url<S, T>& r) noexcept
+ {
+   // Compare the optional components first.
+   //
+   if (!(l.authority == r.authority))
+     return false;
+
+   if (!(l.path == r.path))
+     return false;
+
+   if (!(l.query == r.query))
+     return false;
+
+   if (!(l.fragment == r.fragment))
+     return false;
+
+   // With equal components both URLs are either empty or not.
+   //
+   assert (l.empty () == r.empty ());
+
+   // If neither is empty, then the schemes are valid and can be compared.
+   //
+   return l.empty () || l.scheme == r.scheme;
+ }
+
+ // URL inequality, defined as the negation of operator==.
+ //
+ template <typename S, typename T>
+ inline bool
+ operator!= (const basic_url<S, T>& l, const basic_url<S, T>& r) noexcept
+ {
+   const bool same (l == r);
+   return !same;
+ }
+
+ // Print the URL string representation (as returned by string()) to the
+ // stream and return the stream.
+ //
+ template <typename S, typename T>
+ inline auto
+ operator<< (std::basic_ostream<typename T::string_type::value_type>& o,
+             const basic_url<S, T>& u) -> decltype (o)
+ {
+   o << u.string ();
+   return o;
+ }
+}
+
+#include <libbutl/url.ixx>
+#include <libbutl/url.txx>
diff --git a/libbutl/url.txx b/libbutl/url.txx
new file mode 100644
index 0000000..addfe88
--- /dev/null
+++ b/libbutl/url.txx
@@ -0,0 +1,509 @@
+// file : libbutl/url.txx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason.
+{
+ // Convenience functions.
+ //
+ // Return true if the character is allowed to appear unencoded in the URL
+ // path component, that is, it is '/', ':', '@', an unreserved character, or
+ // a sub-delimiter.
+ //
+ template <typename C>
+ inline bool
+ url_path_char (C c)
+ {
+   using url_type = basic_url<std::basic_string<C>>;
+
+   if (c == '/' || c == ':')
+     return true;
+
+   if (url_type::unreserved (c))
+     return true;
+
+   return c == '@' || url_type::sub_delim (c);
+ }
+
+ // basic_url_host
+ //
+ // Create the host object from its string representation as it appears in a
+ // URL. Throw std::invalid_argument if the representation is invalid.
+ //
+ // A value that starts with '[' is treated as an IPv6 address and is stored
+ // without the enclosing square brackets (the address itself is not
+ // validated). A value that matches the dotted-decimal notation (four octets
+ // in the 0-255 range) is treated as an IPv4 address and stored as is.
+ // Anything else is treated as a name, which is verified and URL-decoded.
+ //
+ template <typename S>
+ basic_url_host<S>::
+ basic_url_host (string_type v)
+ {
+   using std::invalid_argument;
+
+   using url = basic_url<string_type>;
+   using char_type = typename string_type::value_type;
+
+   // Note that for an empty string v[0] refers to the terminating null
+   // character, so the comparison below is well-defined.
+   //
+   kind = v[0] == '[' ? kind_type::ipv6 : kind_type::name;
+
+   if (kind == url_host_kind::ipv6)
+   {
+     // Require the closing bracket and a non-empty address in between. An
+     // empty IPv6 address ("[]") would otherwise produce a host that
+     // violates the invariant asserted in empty().
+     //
+     if (v.size () < 3 || v.back () != ']')
+       throw invalid_argument ("invalid IPv6 address");
+
+     value.assign (v, 1, v.size () - 2);
+   }
+   else
+   {
+     // Detect the IPv4 address host type.
+     //
+     {
+       size_t n (0);    // Number of octets parsed so far.
+       string_type oct; // Digits of the octet being parsed.
+
+       // Complete the current octet returning false if it is invalid or is
+       // the fifth one.
+       //
+       auto ipv4_oct = [&oct, &n] () -> bool
+       {
+         if (n == 4 || oct.empty () || oct.size () > 3 || stoul (oct) > 255)
+           return false;
+
+         ++n;
+         oct.clear ();
+         return true;
+       };
+
+       auto i (v.cbegin ());
+       auto e (v.cend ());
+
+       for (; i != e; ++i)
+       {
+         char_type c (*i);
+
+         if (digit (c))
+           oct += c;
+         else if (c != '.' || !ipv4_oct ())
+           break;
+       }
+
+       // The whole string must be consumed, and the last octet completed
+       // must be the fourth one.
+       //
+       if (i == e && ipv4_oct () && n == 4)
+         kind = url_host_kind::ipv4;
+     }
+
+     // Verify and decode the host name.
+     //
+     bool dec (false);
+     if (kind == url_host_kind::name)
+     {
+       for (auto c: v)
+       {
+         if (!(url::unreserved (c) || url::sub_delim (c) || c == '%'))
+           throw invalid_argument ("invalid host name");
+
+         if (c == '%')
+           dec = true;
+       }
+     }
+
+     // Only decode if there is something to decode.
+     //
+     value = dec ? url::decode (v) : move (v);
+   }
+ }
+
+ // Return the host representation as it would appear in a URL: an IPv4
+ // address as is, an IPv6 address enclosed in square brackets, and a name
+ // with the characters outside the ASCII set percent-encoded. The
+ // representation of an empty host is the empty string.
+ //
+ template <typename S>
+ S basic_url_host<S>::
+ string () const
+ {
+   using url = basic_url<string_type>;
+   using char_type = typename string_type::value_type;
+
+   if (empty ())
+     return string_type ();
+
+   if (kind == url_host_kind::ipv4)
+     return value;
+
+   if (kind == url_host_kind::ipv6)
+   {
+     string_type res;
+     res += '[';
+     res += value;
+     res += ']';
+     return res;
+   }
+
+   if (kind == url_host_kind::name)
+   {
+     // We don't encode all characters that are disallowed for the host
+     // part as RFC3986 requests:
+     //
+     // URI producing applications must not use percent-encoding in host
+     // unless it is used to represent a UTF-8 character sequence.
+     //
+     // So only request encoding of the characters outside the ASCII
+     // character set. The character is converted to the unsigned numeric
+     // type that is long enough to hold any character type.
+     //
+     auto non_ascii = [] (char_type& c)
+     {
+       return static_cast<unsigned long> (c) >= 0x80;
+     };
+
+     return url::encode (value, non_ascii);
+   }
+
+   assert (false); // Can't be here.
+   return string_type ();
+ }
+
+ // basic_url_authority
+ //
+ // Convert the port number to its string representation for the string type
+ // S. Only the std::string and std::wstring specializations are provided.
+ //
+ template <typename S>
+ S
+ port_string (std::uint16_t p);
+
+ // Narrow string specialization.
+ //
+ template <>
+ inline std::string
+ port_string (std::uint16_t p)
+ {
+ return std::to_string (p);
+ }
+
+ // Wide string specialization.
+ //
+ template <>
+ inline std::wstring
+ port_string (std::uint16_t p)
+ {
+ return std::to_wstring (p);
+ }
+
+ // Return the authority representation in the [<user>@]<host>[:<port>]
+ // form. The user is only added if not empty and the port is only added if
+ // not zero. The representation of an empty authority is the empty string.
+ //
+ template <typename S>
+ S basic_url_authority<S>::
+ string () const
+ {
+   string_type res;
+
+   if (!empty ())
+   {
+     if (!user.empty ())
+     {
+       res += user;
+       res += '@';
+     }
+
+     res += host.string ();
+
+     if (port != 0)
+     {
+       res += ':';
+       res += port_string<string_type> (port);
+     }
+   }
+
+   return res;
+ }
+
+ // basic_url
+ //
+ // Create the URL object from its string representation.
+ //
+ // For the empty string create an empty object. Otherwise parse the string
+ // according to the generic syntax:
+ //
+ // <scheme>:[//[<authority>]][/<path>][?<query>][#<fragment>]
+ //
+ // The host and path are verified and URL-decoded while the user
+ // information, query, and fragment are stored as is. The scheme is
+ // converted to its type with traits::translate_scheme().
+ //
+ // If parsing fails with std::invalid_argument, then all the components are
+ // reset and traits::translate_scheme() is called with the empty scheme,
+ // giving it a chance to translate the string in a custom manner. If it
+ // leaves all the components absent, then the exception is re-thrown.
+ //
+ template <typename S, typename T>
+ basic_url<S, T>::
+ basic_url (const string_type& u)
+ {
+   using namespace std;
+
+   using iterator = typename string_type::const_iterator;
+
+   // Create an empty URL object for the empty argument. Note that the scheme
+   // is default-constructed, and so may stay undefined in this case.
+   //
+   if (u.empty ())
+     return;
+
+   try
+   {
+     // At the end of a component parsing 'i' points to the next component
+     // start, and 'b' stays unchanged.
+     //
+     iterator b (u.cbegin ());
+     iterator i (b);
+     iterator e (u.cend ());
+
+     // Extract scheme.
+     //
+     for(char_type c; i != e && (c = *i) != ':'; ++i)
+     {
+       if (!(i == b
+             ? alpha (c)
+             : (alnum (c) || c == '+' || c == '-' || c == '.')))
+         throw invalid_argument ("invalid scheme");
+     }
+
+     if (i == b || i == e || i == b + 1) // Forbids one letter length schemes.
+       throw invalid_argument ("no scheme");
+
+     string_type sc (b, i++); // Skip ':'.
+
+     // Parse authority.
+     //
+     if (i != e && i + 1 != e && *i == '/' && *(i + 1) == '/')
+     {
+       i += 2; // Skip '//'.
+
+       // Find the authority end.
+       //
+       size_t p (u.find_first_of (string_type ({'/', '?', '#'}), i - b));
+       iterator ae (p != string_type::npos ? b + p : e);
+
+       string_type auth (i, ae);
+       i = ae;
+
+       // Extract user information.
+       //
+       string_type user;
+       p = auth.find ('@');
+       if (p != string_type::npos)
+       {
+         // Don't URL-decode the user information (scheme-specific).
+         //
+         user = string_type (auth, 0, p);
+         auth = string_type (auth, p + 1);
+       }
+
+       // Extract host.
+       //
+       string_type host;
+       p = auth.find_last_of({']', ':'}); // Note: ':' can belong to IPv6.
+
+       if (p != string_type::npos && auth[p] == ']') // There is no port.
+         p = string_type::npos;
+
+       if (p != string_type::npos)
+       {
+         host = string_type (auth, 0, p);
+         auth = string_type (auth, p + 1);
+       }
+       else
+       {
+         host = move (auth);
+         auth = string_type ();
+       }
+
+       // Extract port (what is left in auth, if anything).
+       //
+       uint16_t port (0);
+       if (!auth.empty ())
+       {
+         auto bad_port = [] () {throw invalid_argument ("invalid port");};
+
+         // Accumulate the value manually rather than calling stoull() since
+         // the latter throws std::out_of_range for an overly long sequence
+         // of digits. Such an exception would not be caught by the
+         // invalid_argument handler below and would be thrown to the caller
+         // instead of the documented std::invalid_argument.
+         //
+         unsigned long n (0);
+         for (auto c: auth)
+         {
+           if (!digit (c))
+             bad_port ();
+
+           n = n * 10 + static_cast<unsigned long> (c - '0');
+
+           // Bail out early so that the accumulator cannot overflow.
+           //
+           if (n > UINT16_MAX)
+             bad_port ();
+         }
+
+         // Zero is reserved for the unspecified port.
+         //
+         if (n == 0)
+           bad_port ();
+
+         port = static_cast<uint16_t> (n);
+       }
+
+       // User information and port are only meaningful if the host part is
+       // present.
+       //
+       if (host.empty () && (!user.empty () || port != 0))
+         throw invalid_argument ("no host");
+
+       authority = {move (user), host_type (move (host)), port};
+     }
+
+     // Extract path.
+     //
+     if (i != e && *i == '/')
+     {
+       ++i; // Skip '/'.
+
+       // Verify and URL-decode the path.
+       //
+       iterator j (i);
+       for(char_type c; j != e && (c = *j) != '?' && c != '#'; ++j)
+       {
+         if (!(url_path_char (c) || c == '%'))
+           throw invalid_argument ("invalid path");
+       }
+
+       // Note that encoding for non-ASCII path is not specified (in contrast
+       // to the host name), and presumably is local to the referenced
+       // authority.
+       //
+       string_type s;
+       decode (i, j, back_inserter (s));
+       path = traits::translate_path (move (s));
+       i = j;
+     }
+
+     // Extract query.
+     //
+     if (i != e && *i == '?')
+     {
+       ++i; // Skip '?'.
+
+       // Find the query component end.
+       //
+       size_t p (u.find ('#', i - b));
+       iterator qe (p != string_type::npos ? b + p : e);
+
+       // Don't URL-decode the query (scheme-specific).
+       //
+       query = string_type (i, qe);
+       i = qe;
+     }
+
+     // We are not supposed to end up with an empty URL.
+     //
+     if (empty ())
+       throw invalid_argument ("no authority, path or query");
+
+     // Parse fragment.
+     //
+     if (i != e)
+     {
+       ++i; // Skip '#'.
+
+       // Don't URL-decode the fragment (media type-specific).
+       //
+       fragment = string_type (i, e);
+       i = e;
+     }
+
+     assert (i == e);
+
+     // Translate the scheme string representation to its type.
+     //
+     scheme = traits::translate_scheme (u,
+                                        move (sc),
+                                        authority,
+                                        path,
+                                        query,
+                                        fragment);
+   }
+   // If we fail to parse the URL, then delegate this job to
+   // traits::translate_scheme(). If it also fails, leaving the components
+   // absent, then we re-throw.
+   //
+   catch (const invalid_argument&)
+   {
+     authority = nullopt;
+     path = nullopt;
+     query = nullopt;
+     fragment = nullopt;
+
+     scheme = traits::translate_scheme (u,
+                                        string_type () /* scheme */,
+                                        authority,
+                                        path,
+                                        query,
+                                        fragment);
+
+     if (!authority && !path && !query && !fragment)
+       throw;
+   }
+ }
+
+ // Return the URL string representation. The representation of an empty URL
+ // is the empty string.
+ //
+ // The scheme is converted with traits::translate_scheme(), which can also
+ // provide the representation of the whole URL, in which case it is returned
+ // as is. Otherwise the representation is assembled from the components with
+ // the path being URL-encoded and the query and fragment added as is.
+ //
+ template <typename S, typename T>
+ typename basic_url<S, T>::string_type basic_url<S, T>::
+ string () const
+ {
+   if (empty ())
+     return string_type ();
+
+   string_type custom;
+   string_type res (traits::translate_scheme (custom,
+                                              scheme,
+                                              authority,
+                                              path,
+                                              query,
+                                              fragment));
+
+   // Return the custom URL object representation if provided.
+   //
+   if (!custom.empty ())
+     return custom;
+
+   res += ':';
+
+   if (authority)
+   {
+     res += '/';
+     res += '/';
+     res += authority->string ();
+   }
+
+   if (path)
+   {
+     // Percent-encode the characters that are not allowed in the path.
+     //
+     auto escape = [] (char_type& c) {return !url_path_char (c);};
+
+     res += '/';
+     res += encode (traits::translate_path (*path), escape);
+   }
+
+   if (query)
+   {
+     res += '?';
+     res += *query;
+   }
+
+   if (fragment)
+   {
+     res += '#';
+     res += *fragment;
+   }
+
+   return res;
+ }
+
+ // URL-encode the [b, e) character sequence writing the result to o.
+ //
+ // The '%' character as well as the characters for which the callback
+ // returns true are percent-encoded as the '%' character followed by two
+ // uppercase hexadecimal digits of the character's low byte. All other
+ // characters are written as left by the callback (which can substitute a
+ // character with another one, but not with '%').
+ //
+ template <typename S, typename T>
+ template <typename I, typename O, typename F>
+ void basic_url<S, T>::
+ encode (I b, I e, O o, F&& f)
+ {
+   const char_type hex[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8',
+                            '9', 'A', 'B', 'C', 'D', 'E', 'F'};
+
+   while (b != e)
+   {
+     char_type c (*b);
+
+     // Note that the callback may change the character.
+     //
+     const bool esc (c == '%' || f (c));
+
+     if (!esc)
+     {
+       assert (c != '%'); // Otherwise decoding will be ambiguous.
+       *o++ = c;
+     }
+     else
+     {
+       assert (c == *b); // Must not be custom-encoded.
+
+       const auto hi ((c >> 4) & 0xF);
+       const auto lo (c & 0xF);
+
+       *o++ = '%';
+       *o++ = hex[hi];
+       *o++ = hex[lo];
+     }
+
+     ++b;
+   }
+ }
+
+ // URL-decode the [b, e) character sequence writing the result to o.
+ //
+ // The '%' character must be followed by two hexadecimal digits and such a
+ // triplet is replaced with the character having this code. Otherwise
+ // std::invalid_argument is thrown. All other characters are passed to the
+ // callback (which can change them) before being written.
+ //
+ template <typename S, typename T>
+ template <typename I, typename O, typename F>
+ void basic_url<S, T>::
+ decode (I b, I e, O o, F&& f)
+ {
+   using namespace std;
+
+   while (b != e)
+   {
+     char_type c (*b);
+
+     if (c != '%')
+       f (c);
+     else
+     {
+       // URL-decode the character.
+       //
+       // Note that we can't use (potentially more efficient) strtoul() here
+       // as it doesn't have an overload for the wide character string.
+       // However, the code below shouldn't be inefficient, given that the
+       // string is short, and so is (probably) stack-allocated.
+       //
+       // Note that stoul() throws if no conversion could be performed, so we
+       // explicitly check for xdigits.
+       //
+       if (++b == e || !xdigit (*b) || b + 1 == e || !xdigit (*(b + 1)))
+         throw invalid_argument ("invalid URL-encoding");
+
+       c = static_cast<char_type> (stoul (string_type (b, b + 2),
+                                          nullptr,
+                                          16));
+
+       ++b; // Position to the second xdigit.
+     }
+
+     *o++ = c;
+     ++b;
+   }
+ }
+}
diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx
index d703211..fcb8789 100644
--- a/libbutl/utility.ixx
+++ b/libbutl/utility.ixx
@@ -137,4 +137,34 @@ namespace butl
{
return std::isalnum (c);
}
+
+ // Return true if the character is a hexadecimal digit.
+ //
+ // Note that passing std::isxdigit() a value that is neither representable
+ // as unsigned char nor equal to EOF is undefined behavior, which is the
+ // case for a negative char. So convert the character to unsigned char
+ // first.
+ //
+ inline bool
+ xdigit (char c)
+ {
+   return std::isxdigit (static_cast<unsigned char> (c)) != 0;
+ }
+
+ // Return true if the wide character is alphabetic according to
+ // std::iswalpha().
+ //
+ inline bool
+ alpha (wchar_t c)
+ {
+   return std::iswalpha (c) != 0;
+ }
+
+ // Return true if the wide character is a decimal digit according to
+ // std::iswdigit().
+ //
+ inline bool
+ digit (wchar_t c)
+ {
+   return std::iswdigit (c) != 0;
+ }
+
+ // Return true if the wide character is alphanumeric according to
+ // std::iswalnum().
+ //
+ inline bool
+ alnum (wchar_t c)
+ {
+   return std::iswalnum (c) != 0;
+ }
+
+ // Return true if the wide character is a hexadecimal digit according to
+ // std::iswxdigit().
+ //
+ inline bool
+ xdigit (wchar_t c)
+ {
+   return std::iswxdigit (c) != 0;
+ }
}
diff --git a/libbutl/utility.mxx b/libbutl/utility.mxx
index 3f23581..988ca22 100644
--- a/libbutl/utility.mxx
+++ b/libbutl/utility.mxx
@@ -22,7 +22,8 @@
#include <exception> // exception, uncaught_exception[s]()
//#include <functional> // hash
-#include <cctype> // toupper(), tolower(), isalpha(), isdigit(), isalnum()
+#include <cctype> // toupper(), tolower(), is*()
+#include <cwctype> // isw*()
#endif
#include <libbutl/ft/lang.hxx> // thread_local
@@ -138,14 +139,15 @@ LIBBUTL_MODEXPORT namespace butl
}
};
- bool
- alpha (char);
-
- bool
- digit (char);
+ bool alpha (char);
+ bool digit (char);
+ bool alnum (char);
+ bool xdigit (char);
- bool
- alnum (char);
+ bool alpha (wchar_t);
+ bool digit (wchar_t);
+ bool alnum (wchar_t);
+ bool xdigit (wchar_t);
// Key comparators (i.e., to be used in sets, maps, etc).
//
diff --git a/tests/url/buildfile b/tests/url/buildfile
new file mode 100644
index 0000000..ed8380c
--- /dev/null
+++ b/tests/url/buildfile
@@ -0,0 +1,8 @@
+# file : tests/url/buildfile
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+import libs = libbutl%lib{butl}
+libs += $stdmod_lib
+
+exe{driver}: {hxx cxx}{*} $libs test{testscript}
diff --git a/tests/url/driver.cxx b/tests/url/driver.cxx
new file mode 100644
index 0000000..95fe9cb
--- /dev/null
+++ b/tests/url/driver.cxx
@@ -0,0 +1,346 @@
+// file : tests/url/driver.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <cassert>
+
+#ifndef __cpp_lib_modules
+#include <string>
+#include <utility> // move()
+#include <iostream>
+#include <iterator> // back_inserter
+#include <stdexcept> // invalid_argument
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules
+#ifdef __cpp_lib_modules
+import std.core;
+import std.io;
+#endif
+import butl.url;
+import butl.utility; // operator<<(ostream, exception)
+#else
+#include <libbutl/url.mxx>
+#include <libbutl/utility.mxx>
+#endif
+
+using namespace std;
+using namespace butl;
+
+// Custom URL scheme type. It is used in this driver to specialize url_traits
+// and to instantiate the basic_url template (see the wurl alias in main()).
+//
+enum class scheme
+{
+ http,
+ https,
+ file
+};
+
+namespace butl
+{
+ // URL traits for the custom scheme enum. All the string-based URL
+ // components, including the path, are represented as wide strings.
+ //
+ template <>
+ struct url_traits<scheme>
+ {
+ using string_type = wstring;
+ using path_type = wstring;
+
+ using scheme_type = scheme;
+ using authority_type = basic_url_authority<string_type>;
+
+ // Translate the scheme string to the scheme enumerator. Throw
+ // invalid_argument if the scheme is not empty and is not recognized.
+ //
+ static scheme_type
+ translate_scheme (const string_type& url,
+ string_type&& scheme,
+ optional<authority_type>& /*authority*/,
+ optional<path_type>& path,
+ optional<string_type>& /*query*/,
+ optional<string_type>& /*fragment*/)
+ {
+ // Note that a real program must compare case-insensitively.
+ //
+ if (scheme == L"http")
+ return scheme_type::http;
+
+ if (scheme == L"https")
+ return scheme_type::https;
+
+ if (scheme == L"file")
+ return scheme_type::file;
+
+ if (!scheme.empty ())
+ throw invalid_argument ("unknown scheme");
+
+ // No scheme. If the URL looks like an absolute filesystem path (starts
+ // with '/' or with a letter followed by ":/"), then translate it to the
+ // file URL by using the whole URL as the path. Otherwise, leave all the
+ // components absent to fail with a proper exception description.
+ //
+ const wchar_t first (url[0]);
+
+ if (first == '/' ||
+ (url.size () > 2 && alpha (first) && url[1] == ':' && url[2] == '/'))
+ path = url;
+
+ return scheme_type::file;
+ }
+
+ // Translate the scheme enumerator back to its string representation.
+ //
+ static string_type
+ translate_scheme (string_type& /*url*/,
+ const scheme_type& scheme,
+ const optional<authority_type>& /*authority*/,
+ const optional<path_type>& /*path*/,
+ const optional<string_type>& /*query*/,
+ const optional<string_type>& /*fragment*/)
+ {
+ const wchar_t* name (nullptr);
+
+ switch (scheme)
+ {
+ case scheme_type::http: name = L"http"; break;
+ case scheme_type::https: name = L"https"; break;
+ case scheme_type::file: name = L"file"; break;
+ }
+
+ assert (name != nullptr); // Can't be here otherwise.
+ return name != nullptr ? name : L"";
+ }
+
+ // Convert the URL path string to the path type (the same type here).
+ //
+ static path_type
+ translate_path (string_type&& path)
+ {
+ path_type r (move (path));
+ return r;
+ }
+
+ // Convert the path back to its URL string representation.
+ //
+ static string_type
+ translate_path (const path_type& path)
+ {
+ string_type r (path);
+ return r;
+ }
+ };
+}
+
+// Usages:
+//
+// argv[0]
+// argv[0] [-c|-s|-w] <url>
+//
+// Perform some basic tests if no URL is provided. Otherwise round-trip the URL
+// to STDOUT. URL must contain only ASCII characters. Exit with zero code on
+// success. Exit with code one on parsing failure, printing error description
+// to STDERR.
+//
+// -c
+// Print the URL components one per line. Print the special '[null]' string
+// for absent components. This is the default option if a URL is provided.
+//
+// -s
+// Print stringified url object representation.
+//
+// -w
+// Same as above, but use the custom wstring-based url_traits
+// implementation for the basic_url template.
+//
+int
+main (int argc, const char* argv[])
+try
+{
+ using wurl = basic_url<scheme>;
+ using wurl_authority = wurl::authority_type;
+ using wurl_host = wurl::host_type;
+
+ enum class print_mode
+ {
+ str,
+ wstr,
+ comp
+ } mode (print_mode::comp);
+
+ // Parse the options. The last mode option specified wins and the first
+ // argument that is not a recognized option ends the options.
+ //
+ int i (1);
+ for (; i != argc; ++i)
+ {
+ string o (argv[i]);
+ if (o == "-s")
+ mode = print_mode::str;
+ else if (o == "-w")
+ mode = print_mode::wstr;
+ else if (o == "-c")
+ mode = print_mode::comp;
+ else
+ break; // End of options.
+ }
+
+ if (i == argc)
+ {
+ // Test ctors and operators.
+ //
+ {
+ wurl u0 ((wstring ()));
+ assert (u0.empty ());
+ assert (u0 == wurl ());
+
+ wurl u1 (scheme::http,
+ wurl_authority {wstring (), wurl_host (L"[123]"), 0},
+ wstring (L"login"),
+ wstring (L"q="),
+ wstring (L"f"));
+
+ assert (!u1.empty ());
+ assert (u1 != u0);
+
+ wurl u2 (scheme::http,
+ wurl_host (L"123", url_host_kind::ipv6),
+ wstring (L"login"),
+ wstring (L"q="),
+ wstring (L"f"));
+
+ assert (u2 == u1);
+
+ wurl u3 (scheme::http,
+ wurl_host (L"123", url_host_kind::ipv6),
+ 0,
+ wstring (L"login"),
+ wstring (L"q="),
+ wstring (L"f"));
+
+ assert (u3 == u2);
+
+ wurl u4 (scheme::http,
+ L"[123]",
+ wstring (L"login"),
+ wstring (L"q="),
+ wstring (L"f"));
+
+ assert (u4 == u3);
+
+ wurl u5 (scheme::http,
+ L"[123]",
+ 0,
+ wstring (L"login"),
+ wstring (L"q="),
+ wstring (L"f"));
+
+ assert (u5 == u4);
+ }
+
+ // Test encode and decode.
+ //
+ {
+ const char* s ("ABC +");
+ string es (url::encode (s));
+
+ assert (es == "ABC%20%2B");
+ string ds (url::decode (es));
+
+ assert (ds == s);
+ }
+
+ // Same but with custom callbacks that represent the space character as
+ // '+' in the encoded string.
+ //
+ {
+ const char* s ("ABC +");
+
+ string es (url::encode (s,
+ [] (char& c) -> bool
+ {
+ if (c == ' ')
+ {
+ c = '+';
+ return false;
+ }
+ return !url::unreserved (c);
+ }));
+
+ assert (es == "ABC+%2B");
+
+ string ds (url::decode (es.c_str (),
+ [] (char& c)
+ {
+ if (c == '+')
+ c = ' ';
+ }));
+ assert (ds == s);
+ }
+
+ // Same for the wide character string using the iterator-based encode()
+ // and callbacks that shift alphanumeric characters by one.
+ //
+ {
+ const wchar_t s[] = L"ABC ";
+
+ wstring es;
+ wurl::encode (s, s + 4,
+ back_inserter (es),
+ [] (wchar_t& c) -> bool
+ {
+ if (!alnum (c))
+ return true;
+
+ ++c;
+ return false;
+ });
+ assert (es == L"BCD%20");
+
+ wstring ds (wurl::decode (es,
+ [] (wchar_t& c)
+ {
+ if (alnum (c))
+ --c;
+ }));
+ assert (ds == s);
+ }
+ }
+ else // Round-trip the URL.
+ {
+ assert (i + 1 == argc);
+
+ const char* ua (argv[i]);
+
+ // Convert ASCII string to wstring.
+ //
+ // Note that we use char_traits<char>::length() rather than strlen() since
+ // <cstring> is not included while <string> is.
+ //
+ auto widen = [] (const char* s)
+ {
+ return wstring (s, s + char_traits<char>::length (s));
+ };
+
+ switch (mode)
+ {
+ case print_mode::str:
+ {
+ cout << url (ua) << endl;
+ break;
+ }
+ case print_mode::wstr:
+ {
+ wcout << wurl (widen (ua)) << endl;
+ break;
+ }
+ case print_mode::comp:
+ {
+ wurl u (widen (ua));
+
+ // Print the scheme, translating it back to the string representation.
+ //
+ if (!u.empty ())
+ {
+ wstring s;
+ wcout << wurl::traits::translate_scheme (s,
+ u.scheme,
+ nullopt,
+ nullopt,
+ nullopt,
+ nullopt) << endl;
+ }
+ else
+ wcout << L"[null]" << endl;
+
+ // Print the authority as <user>@<host>:<port> <host-kind>.
+ //
+ if (u.authority)
+ {
+ const wchar_t* kinds[] = {L"ipv4", L"ipv6", L"name"};
+ const wurl_authority& a (*u.authority);
+
+ wcout << a.user << L'@' << a.host.value << L':' << a.port
+ << L" " << kinds[static_cast<size_t> (a.host.kind)] << endl;
+ }
+ else
+ wcout << L"[null]" << endl;
+
+ wcout << (u.path ? *u.path : L"[null]") << endl
+ << (u.query ? *u.query : L"[null]") << endl
+ << (u.fragment ? *u.fragment : L"[null]") << endl;
+ break;
+ }
+ }
+ }
+
+ return 0;
+}
+catch (const invalid_argument& e)
+{
+ // Parsing failure: print the error description and exit with code one.
+ //
+ cerr << e << endl;
+ return 1;
+}
diff --git a/tests/url/testscript b/tests/url/testscript
new file mode 100644
index 0000000..d81f282
--- /dev/null
+++ b/tests/url/testscript
@@ -0,0 +1,378 @@
+# file : tests/url/testscript
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+# Run the driver without arguments, in which case it performs its built-in
+# basic tests (ctors, operators, encode/decode).
+#
+:basic
+:
+$*
+
+: components
+:
+{
+ : all
+ :
+ $* 'https://user@stage.b2.org:443/libbutl?f=full#description' >>EOO
+ https
+ user@stage.b2.org:443 name
+ libbutl
+ f=full
+ description
+ EOO
+
+ : empty-url
+ :
+ $* '' >>EOO
+ [null]
+ [null]
+ [null]
+ [null]
+ [null]
+ EOO
+
+ : no-id
+ :
+ {
+ $* 'file:#f' 2>'no authority, path or query' != 0 : fragment
+ $* 'file:aaa' 2>'no authority, path or query' != 0 : junk
+ $* 'file:' 2>'no authority, path or query' != 0 : none
+ }
+
+ : scheme
+ :
+ {
+ : detected
+ :
+ $* 'http://build2.org' >>EOO
+ http
+ @build2.org:0 name
+ [null]
+ [null]
+ [null]
+ EOO
+
+ : deduced
+ :
+ $* 'c:/a' >>EOO
+ file
+ [null]
+ c:/a
+ [null]
+ [null]
+ EOO
+
+ $* ':/a' 2>'no scheme' != 0 : none
+ $* 'http' 2>'no scheme' != 0 : unterminated
+ $* 'ht~tp://a.com' 2>'invalid scheme' != 0 : invalid-char
+ $* '1http://a.com' 2>'invalid scheme' != 0 : invalid-first-char
+ }
+
+ : authority
+ {
+ : absent
+ :
+ $* 'file:/tmp/a' >>EOO
+ file
+ [null]
+ tmp/a
+ [null]
+ [null]
+ EOO
+
+ : empty
+ :
+ $* 'file:///tmp/a' >>EOO
+ file
+ @:0 name
+ tmp/a
+ [null]
+ [null]
+ EOO
+
+ : trailing
+ :
+ $* 'http://localhost' >>EOO
+ http
+ @localhost:0 name
+ [null]
+ [null]
+ [null]
+ EOO
+
+ : user
+ :
+ {
+ : non-empty
+ :
+ $* 'http://admin@localhost' >>EOO
+ http
+ admin@localhost:0 name
+ [null]
+ [null]
+ [null]
+ EOO
+
+ : empty
+ :
+ $* 'http://@localhost' >>EOO
+ http
+ @localhost:0 name
+ [null]
+ [null]
+ [null]
+ EOO
+ }
+
+ : host
+ :
+ {
+ : ipv6
+ :
+ {
+ : port
+ :
+ $* 'http://[1:23]:443' >>EOO
+ http
+ @1:23:443 ipv6
+ [null]
+ [null]
+ [null]
+ EOO
+
+ : no-port
+ :
+ $* 'http://[1:23]' >>EOO
+ http
+ @1:23:0 ipv6
+ [null]
+ [null]
+ [null]
+ EOO
+
+ $* 'http://[123' 2>'invalid IPv6 address' != 0 : missed-bracket
+ $* 'http://[123] :80' 2>'invalid IPv6 address' != 0 : extra-char
+ }
+
+ : ipv4
+ :
+ {
+ : valid
+ :
+ $* 'http://0.10.200.255' >>EOO
+ http
+ @0.10.200.255:0 ipv4
+ [null]
+ [null]
+ [null]
+ EOO
+
+ : long
+ :
+ $* 'http://0.10.200.255.30' >>EOO
+ http
+ @0.10.200.255.30:0 name
+ [null]
+ [null]
+ [null]
+ EOO
+
+ : short
+ :
+ $* 'http://0.10.200' >>EOO
+ http
+ @0.10.200:0 name
+ [null]
+ [null]
+ [null]
+ EOO
+
+ : missed
+ :
+ $* 'http://0.10..200' >>EOO
+ http
+ @0.10..200:0 name
+ [null]
+ [null]
+ [null]
+ EOO
+
+ : out-of-range
+ :
+ $* 'http://0.10.200.256' >>EOO
+ http
+ @0.10.200.256:0 name
+ [null]
+ [null]
+ [null]
+ EOO
+ }
+
+ : name
+ :
+ {
+ : valid
+ :
+ $* 'https://www.b2.org' >>EOO
+ https
+ @www.b2.org:0 name
+ [null]
+ [null]
+ [null]
+ EOO
+
+ : encoded
+ :
+ {
+ : valid
+ :
+ $* 'https://www.%62%32.org' >>EOO
+ https
+ @www.b2.org:0 name
+ [null]
+ [null]
+ [null]
+ EOO
+
+ $* 'https://www.%62%3.org' 2>'invalid URL-encoding' != 0 : short
+ $* 'https://www.%62%3x.org' 2>'invalid URL-encoding' != 0 : invalid
+ $* 'https://www.%62%.org' 2>'invalid URL-encoding' != 0 : absent
+ }
+
+ $* 'https://www.b|2.org' 2>'invalid host name' != 0 : invalid-char
+ }
+
+ $* 'http://admin@:80?q=' 2>'no host' != 0: no-host
+ }
+
+ : port
+ :
+ {
+ : valid
+ :
+ $* 'http://build2.org:443' >>EOO
+ http
+ @build2.org:443 name
+ [null]
+ [null]
+ [null]
+ EOO
+
+ $* 'http://build2.org:-433' 2>'invalid port' != 0 : invalid-char
+ $* 'http://build2.org:70000' 2>'invalid port' != 0 : exceeds-max
+ $* 'http://build2.org:0' 2>'invalid port' != 0 : zero
+ }
+ }
+
+ : path
+ :
+ {
+ : absent
+ :
+ $* 'http://b2.org' >>EOO
+ http
+ @b2.org:0 name
+ [null]
+ [null]
+ [null]
+ EOO
+
+ : empty
+ :
+ $* 'http://b2.org/' >>EOO
+ http
+ @b2.org:0 name
+
+ [null]
+ [null]
+ EOO
+
+ : non-empty
+ :
+ $* 'http://b2.org/s/q' >>EOO
+ http
+ @b2.org:0 name
+ s/q
+ [null]
+ [null]
+ EOO
+
+ : encoded
+ :
+ $* 'http://b2.org/%6F/s' >>EOO
+ http
+ @b2.org:0 name
+ o/s
+ [null]
+ [null]
+ EOO
+ }
+
+ : query
+ :
+ {
+ : no-fragment
+ :
+ $* 'http://b2.org/a?x=foo&y=bar' >>EOO
+ http
+ @b2.org:0 name
+ a
+ x=foo&y=bar
+ [null]
+ EOO
+
+ : fragment
+ :
+ $* 'http://b2.org/a?foo#bar' >>EOO
+ http
+ @b2.org:0 name
+ a
+ foo
+ bar
+ EOO
+ }
+
+ : fragment
+ :
+ {
+ $* 'http://b2.org#foo' >>EOO
+ http
+ @b2.org:0 name
+ [null]
+ [null]
+ foo
+ EOO
+ }
+}
+
+# Round-trip URLs via the stringified url object representation (the driver's
+# -s option) and compare the output with the expected URL string. Note that
+# the host name test expects the lower-case hex digits of the URL-encoding to
+# be printed in the upper case.
+#
+: string
+{
+ test.options += -s
+
+ : authority
+ :
+ {
+ : host
+ :
+ {
+ $* 'file:///a' >'file:///a' : empty
+ $* 'http://1.1.1.1' >'http://1.1.1.1' : ipv4
+ $* 'https://[1:2:3]' >'https://[1:2:3]' : ipv6
+ $* 'file://a%d1%84' >'file://a%D1%84' : name
+ }
+
+ $* 'http://admin@localhost' >'http://admin@localhost' : user
+ $* 'http://localhost:8080' >'http://localhost:8080' : port
+ $* 'file:/a' >'file:/a' : absent
+ }
+
+ $* '' >'' : empty
+ $* 'file:/b%7C2' >'file:/b%7C2' : path
+ $* 'http://a?q=' >'http://a?q=' : query
+ $* 'http://a#f' >'http://a#f' : fragment
+}
+
+# Round-trip a URL that has all the components present using the driver's -w
+# option (custom wstring-based url_traits) and expect the same string back.
+#
+: wstring
+:
+{
+ u = 'https://user@stage.b2.org:443/libbutl?f=full#description'
+ $* -w "$u" >"$u"
+}