diff options
-rw-r--r-- | libbutl/url.ixx | 84 | ||||
-rw-r--r-- | libbutl/url.mxx | 476 | ||||
-rw-r--r-- | libbutl/url.txx | 509 | ||||
-rw-r--r-- | libbutl/utility.ixx | 30 | ||||
-rw-r--r-- | libbutl/utility.mxx | 18 | ||||
-rw-r--r-- | tests/url/buildfile | 8 | ||||
-rw-r--r-- | tests/url/driver.cxx | 346 | ||||
-rw-r--r-- | tests/url/testscript | 378 |
8 files changed, 1841 insertions, 8 deletions
diff --git a/libbutl/url.ixx b/libbutl/url.ixx new file mode 100644 index 0000000..4ff7a06 --- /dev/null +++ b/libbutl/url.ixx @@ -0,0 +1,84 @@ +// file : libbutl/url.ixx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +{ + template <typename S, typename T> + inline basic_url<S, T>:: + basic_url (scheme_type s, + optional<authority_type> a, + optional<path_type> p, + optional<string_type> q, + optional<string_type> f) + : scheme (std::move (s)), + authority (std::move (a)), + path (std::move (p)), + query (std::move (q)), + fragment (std::move (f)) + { + } + + template <typename S, typename T> + inline basic_url<S, T>:: + basic_url (scheme_type s, + host_type h, + optional<path_type> p, + optional<string_type> q, + optional<string_type> f) + : basic_url (std::move (s), + authority_type {string_type (), std::move (h), 0}, + std::move (p), + std::move (q), + std::move (f)) + { + } + + template <typename S, typename T> + inline basic_url<S, T>:: + basic_url (scheme_type s, + host_type h, + std::uint16_t o, + optional<path_type> p, + optional<string_type> q, + optional<string_type> f) + : basic_url (std::move (s), + authority_type {string_type (), std::move (h), o}, + std::move (p), + std::move (q), + std::move (f)) + { + } + + template <typename S, typename T> + inline basic_url<S, T>:: + basic_url (scheme_type s, + string_type h, + optional<path_type> p, + optional<string_type> q, + optional<string_type> f) + : basic_url (std::move (s), + host_type (std::move (h)), + std::move (p), + std::move (q), + std::move (f)) + { + } + + template <typename S, typename T> + inline basic_url<S, T>:: + basic_url (scheme_type s, + string_type h, + std::uint16_t o, + optional<path_type> p, + optional<string_type> q, + optional<string_type> f) + : basic_url (std::move (s), + host_type (std::move (h)), + o, + std::move (p), + std::move 
(q), + std::move (f)) + { + } +} diff --git a/libbutl/url.mxx b/libbutl/url.mxx new file mode 100644 index 0000000..fe091f1 --- /dev/null +++ b/libbutl/url.mxx @@ -0,0 +1,476 @@ +// file : libbutl/url.mxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules +#pragma once +#endif + +// C includes. + +#include <cassert> + +#ifndef __cpp_lib_modules +#include <string> +#include <cstdint> // uint*_t +#include <utility> // move() +#include <ostream> +#include <iterator> // back_inserter + +#include <cstddef> // size_t +#include <stdexcept> // invalid_argument +#endif + +// Other includes. + +#ifdef __cpp_modules +export module butl.url; +#ifdef __cpp_lib_modules +import std.core; +import std.io; +#endif +import butl.path; +import butl.utility; +import butl.optional; +#else +#include <libbutl/path.mxx> +#include <libbutl/utility.mxx> +#include <libbutl/optional.mxx> +#endif + +#include <libbutl/export.hxx> + +LIBBUTL_MODEXPORT namespace butl +{ + // RFC3986 Uniform Resource Locator (URL). + // + // <url> = <scheme>:[//[<authority>]][/<path>][?<query>][#<fragment>] + // <authority> = [<user>@]<host>[:<port>] + // + // Some examples of equivalent URLs to meditate upon: + // + // file://localhost/tmp (localhost authority) + // file:///tmp (empty authority) + // file:/tmp (absent authority) + // + // file://localhost/c:/tmp + // file:///c:/tmp + // file:/c:/tmp + // + // We think of the slash between <authority> and <path> as a separator but + // with the path always interpreted as starting from the "root" of the + // authority. Thus: + // + // file://localhost/tmp -> 'file'://'localhost'/'tmp' -> /tmp + // file://localhost/c:/tmp -> 'file'://'localhost'/'c:/tmp' -> c:/tmp + // + // This means that the <path> component is represented as a relative path + // and, in the general case, we cannot use our path type for its storage + // since it assumes the path is for the host platform. 
In other words, the + // interpretation of the path has to take into account the platform of the + // authority host. Note, however, that a custom url_traits implementation + // can choose to use the path type if local paths are to be interpreted as + // relative to the host. + // + // Note that we currently forbid one character schemes to support scheme- + // less (Windows) paths which can be done by url_traits::translate_scheme() + // (see below). (A Windows path that uses forward slashes would be parsed as + // a valid authority-less URL). + + // URL host component can be an IPv4 address (if matches its dotted-decimal + // notation), an IPv6 address (if enclosed in [square brackets]) or + // otherwise a name. + // + // Note that non-ASCII host names are allowed in URLs. They must be + // UTF8-encoded and URL-encoded afterwards. Currently we store the parsed + // host name UTF8-encoded without regard to the template argument string + // type. Later we may add support for more appropriate encodings for + // multi-byte character types. + // + enum class url_host_kind {ipv4, ipv6, name}; + + template <typename S> + struct basic_url_host + { + using string_type = S; + using kind_type = url_host_kind; + + string_type value; + kind_type kind; + + // Can be treated as const string_type&. + // + operator const string_type& () const noexcept {return value;} + + // Create an empty host. + // + basic_url_host (): kind (kind_type::name) {} + + // Create the host object from its string representation as it appears in + // a URL, throwing std::invalid_argument if invalid. Remove the enclosing + // square brackets for IPv6 addresses, and URL-decode host names. + // + // Note that currently we don't validate IPv6 addresses. 
+ // + explicit + basic_url_host (string_type); + + basic_url_host (string_type v, kind_type k) + : value (std::move (v)), kind (k) {} + + bool + empty () const + { + assert (kind == kind_type::name || !value.empty ()); + return value.empty (); + } + + // Return string representation of the host as it would appear in a URL. + // + string_type + string () const; + }; + + template <typename S> + struct basic_url_authority + { + using string_type = S; + using host_type = basic_url_host<string_type>; + + string_type user; // Empty if not specified. + host_type host; + std::uint16_t port; // Zero if not specified. + + bool + empty () const + { + assert (!host.empty () || (user.empty () && port == 0)); + return host.empty (); + } + + // Return a string representation of the URL authority. String + // representation of an empty instance is the empty string. + // + string_type + string () const; + }; + + template <typename H, typename S = H, typename P = S> + struct url_traits + { + using scheme_type = H; + using string_type = S; + using path_type = P; + + using authority_type = basic_url_authority<string_type>; + + // Translate the scheme string representation to its type. May throw + // std::invalid_argument. May change the URL components. + // + // This function is called with an empty scheme if the URL has no scheme, + // the scheme is invalid, or it could not be parsed into components + // according to the URL syntax. In this case all the passed components + // reference empty/absent values and if they remain unchanged on return, + // the URL is considered invalid and the std::invalid_argument exception + // with an appropriate description is thrown by the URL object constructor. + // This can be used to support scheme-less URLs, local paths, etc. 
+ // + static scheme_type + translate_scheme (const string_type& /*url*/, + string_type&& scheme, + optional<authority_type>& /*authority*/, + optional<path_type>& /*path*/, + optional<string_type>& /*query*/, + optional<string_type>& /*fragment*/) + { + return scheme_type (std::move (scheme)); + } + + // Translate scheme type back to its string representation. + // + // Similar to the above the function is called with an empty string + // representation. If on return this value is no longer empty, then it is + // assumed the URL has been translated in a custom manner (in which case + // the returned scheme value is ignored). + // + static string_type + translate_scheme (string_type&, /*url*/ + const scheme_type& scheme, + const optional<authority_type>& /*authority*/, + const optional<path_type>& /*path*/, + const optional<string_type>& /*query*/, + const optional<string_type>& /*fragment*/) + { + return string_type (scheme); + } + + // Translate the path string representation to its type. + // + static path_type + translate_path (string_type&& path) + { + return path_type (std::move (path)); + } + + // Translate path type back to its string representation. + // + static string_type + translate_path (const path_type& path) {return string_type (path);} + }; + + template <typename H, // scheme + typename T = url_traits<H>> + class basic_url + { + public: + using traits = T; + + using string_type = typename traits::string_type; + using char_type = typename string_type::value_type; + using path_type = typename traits::path_type; + + using scheme_type = typename traits::scheme_type; + using authority_type = typename traits::authority_type; + using host_type = typename authority_type::host_type; + + scheme_type scheme; + optional<authority_type> authority; + optional<path_type> path; + optional<string_type> query; + optional<string_type> fragment; + + // Create an empty URL object. 
+ // + basic_url () = default; + + // Create the URL object from its string representation. If the argument is + // empty, then create an empty object. Otherwise verify that the string is + // compliant to the generic URL syntax. URL-decode and validate components + // with common for all schemes syntax (scheme, host, port, path). + // Throw std::invalid_argument if the passed string is not a valid URL + // representation. + // + // Validation and URL-decoding of the scheme-specific components can be + // provided by a custom url_traits::translate_scheme() implementation. + // + explicit + basic_url (const string_type&); + + // Create the URL object from individual components. Performs no + // components URL-decoding or verification. + // + basic_url (scheme_type, + optional<authority_type>, + optional<path_type> path, + optional<string_type> query = nullopt, + optional<string_type> fragment = nullopt); + + basic_url (scheme_type, + host_type host, + optional<path_type> path, + optional<string_type> query = nullopt, + optional<string_type> fragment = nullopt); + + basic_url (scheme_type, + host_type host, + std::uint16_t port, + optional<path_type> path, + optional<string_type> query = nullopt, + optional<string_type> fragment = nullopt); + + basic_url (scheme_type, + string_type host, + optional<path_type> path, + optional<string_type> query = nullopt, + optional<string_type> fragment = nullopt); + + basic_url (scheme_type, + string_type host, + std::uint16_t port, + optional<path_type> path, + optional<string_type> query = nullopt, + optional<string_type> fragment = nullopt); + + bool + empty () const noexcept + { + assert (authority || path || query || !fragment); + return !authority && !path && !query; + } + + // Return a string representation of the URL. Note that while this is not + // necessarily syntactically the same string as what was used to + // initialize this instance, it should be semantically equivalent. 
String + // representation of an empty instance is the empty string. + // + string_type + string () const; + + // The following predicates can be used to classify URL characters while + // parsing, validating or encoding scheme-specific components. For the + // semantics of character classes see RFC3986. + // + static bool + gen_delim (char_type c) + { + return c == ':' || c == '/' || c == '?' || c == '#' || c == '[' || + c == ']' || c == '@'; + } + + static bool + sub_delim (char_type c) + { + return c == '!' || c == '$' || c == '&' || c == '=' || c == '(' || + c == ')' || c == '*' || c == '+' || c == ',' || c == ';' || + c == '\''; + } + + static bool + reserved (char_type c) {return sub_delim (c) || gen_delim (c);} + + static bool + unreserved (char_type c) + { + return alnum (c) || c == '-' || c == '.' || c =='_' || c == '~'; + } + + // URL-encode a character sequence. + // + // Note that the set of characters that should be encoded may differ for + // different URL components. The optional callback function must return + // true for characters that should be percent-encoded. The function may + // encode the passed character in its own way with another character (but + // never with '%'), and return false. By default all characters other than + // unreserved are percent-encoded. + // + // Also note that the characters are interpreted as bytes. In other words, + // each character may result in a single encoding triplet. + // + template <typename I, typename O, typename F = bool (*) (char_type&)> + static void + encode (I b, I e, + O o, + + // VC (as of 15u3) doesn't see unreserved() unless qualified. 
+ // + F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);}); + + template <typename F = bool (*) (char_type&)> + static string_type + encode (const string_type& s, + F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);}) + { + string_type r; + encode (s.begin (), s.end (), std::back_inserter (r), f); + return r; + } + + template <typename F = bool (*) (char_type&)> + static string_type + encode (const char_type* s, + F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);}) + { + string_type r; + encode (s, s + string_type::traits_type::length (s), + std::back_inserter (r), f); + return r; + } + + // URL-decode a character sequence. Throw std::invalid_argument if an + // invalid encoding sequence is encountered. + // + // If some characters in the sequence are encoded with another characters + // (rather than percent-encoded), then one must provide the callback + // function to decode them. + // + template <typename I, typename O, typename F = void (*) (char_type&)> + static void + decode (I b, I e, O o, F&& f = [] (char_type&) {}); + + template <typename F = void (*) (char_type&)> + static string_type + decode (const string_type& s, F&& f = [] (char_type&) {}) + { + string_type r; + decode (s.begin (), s.end (), std::back_inserter (r), f); + return r; + } + + template <typename F = void (*) (char_type&)> + static string_type + decode (const char_type* s, F&& f = [] (char_type&) {}) + { + string_type r; + decode (s, s + string_type::traits_type::length (s), + std::back_inserter (r), f); + return r; + } + }; + + using url_authority = basic_url_authority<std::string>; + using url = basic_url <std::string>; + + template <typename S> + inline bool + operator== (const basic_url_host<S>& x, const basic_url_host<S>& y) noexcept + { + return x.value == y.value && x.kind == y.kind; + } + + template <typename S> + inline bool + operator!= (const basic_url_host<S>& x, const basic_url_host<S>& y) noexcept + { + return !(x == y); + } + + 
template <typename S> + inline bool + operator== (const basic_url_authority<S>& x, + const basic_url_authority<S>& y) noexcept + { + return x.user == y.user && x.host == y.host && x.port == y.port; + } + + template <typename S> + inline bool + operator!= (const basic_url_authority<S>& x, + const basic_url_authority<S>& y) noexcept + { + return !(x == y); + } + + template <typename S, typename T> + inline bool + operator== (const basic_url<S, T>& x, const basic_url<S, T>& y) noexcept + { + if (!(x.authority == y.authority && x.path == y.path && + x.query == y.query && x.fragment == y.fragment)) + return false; + + assert (x.empty () == y.empty ()); + + if (x.empty ()) + return true; + + return x.scheme == y.scheme; // None is empty, so schemes are valid. + } + + template <typename S, typename T> + inline bool + operator!= (const basic_url<S, T>& x, const basic_url<S, T>& y) noexcept + { + return !(x == y); + } + + template <typename S, typename T> + inline auto + operator<< (std::basic_ostream<typename T::string_type::value_type>& o, + const basic_url<S, T>& u) -> decltype (o) + { + return o << u.string (); + } +} + +#include <libbutl/url.ixx> +#include <libbutl/url.txx> diff --git a/libbutl/url.txx b/libbutl/url.txx new file mode 100644 index 0000000..addfe88 --- /dev/null +++ b/libbutl/url.txx @@ -0,0 +1,509 @@ +// file : libbutl/url.txx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. +{ + // Convenience functions. 
+ // + template <typename C> + inline bool + url_path_char (C c) + { + using url = basic_url<std::basic_string<C>>; + + return c == '/' || c == ':' || url::unreserved (c) || + c == '@' || url::sub_delim (c); + } + + // basic_url_host + // + template <typename S> + basic_url_host<S>:: + basic_url_host (string_type v) + { + using std::invalid_argument; + + using url = basic_url<string_type>; + using char_type = typename string_type::value_type; + + kind = v[0] == '[' ? kind_type::ipv6 : kind_type::name; + + if (kind == url_host_kind::ipv6) + { + if (v.back () != ']') + throw invalid_argument ("invalid IPv6 address"); + + value.assign (v, 1, v.size () - 2); + } + else + { + // Detect the IPv4 address host type. + // + { + size_t n (0); + string_type oct; + + auto ipv4_oct = [&oct, &n] () -> bool + { + if (n == 4 || oct.empty () || oct.size () > 3 || stoul (oct) > 255) + return false; + + ++n; + oct.clear (); + return true; + }; + + auto i (v.cbegin ()); + auto e (v.cend ()); + + for (; i != e; ++i) + { + char_type c (*i); + + if (digit (c)) + oct += c; + else if (c != '.' || !ipv4_oct ()) + break; + } + + if (i == e && ipv4_oct () && n == 4) + kind = url_host_kind::ipv4; + } + + // Verify and decode the host name. + // + bool dec (false); + if (kind == url_host_kind::name) + { + for (auto c: v) + { + if (!(url::unreserved (c) || url::sub_delim (c) || c == '%')) + throw invalid_argument ("invalid host name"); + + if (c == '%') + dec = true; + } + } + + value = dec ? 
url::decode (v) : move (v); + } + } + + template <typename S> + S basic_url_host<S>:: + string () const + { + using url = basic_url<string_type>; + using char_type = typename string_type::value_type; + + if (empty ()) + return string_type (); + + switch (kind) + { + case url_host_kind::ipv4: return value; + case url_host_kind::ipv6: + { + string_type r; + r += '['; + r += value; + r += ']'; + return r; + } + case url_host_kind::name: + { + // We don't encode all characters that are disallowed for the host + // part as RFC3986 requests: + // + // URI producing applications must not use percent-encoding in host + // unless it is used to represent a UTF-8 character sequence. + // + // The callback requests to encode characters outside the ASCII + // character set. + // + return url::encode (value, + [] (char_type& c) + { + // Convert to the unsigned numeric type, that is + // long enough to hold any character type. + // + return static_cast<unsigned long> (c) >= 0x80; + }); + } + } + + assert (false); // Can't be here. + return string_type (); + } + + // basic_url_authority + // + template <typename S> + S + port_string (std::uint16_t p); + + template <> + inline std::string + port_string (std::uint16_t p) + { + return std::to_string (p); + } + + template <> + inline std::wstring + port_string (std::uint16_t p) + { + return std::to_wstring (p); + } + + template <typename S> + S basic_url_authority<S>:: + string () const + { + if (empty ()) + return string_type (); + + string_type r; + if (!user.empty ()) + { + r += user; + r += '@'; + } + + r += host.string (); + + if (port != 0) + { + r += ':'; + r += port_string<string_type> (port); + } + + return r; + } + + // basic_url + // + template <typename S, typename T> + basic_url<S, T>:: + basic_url (const string_type& u) + { + using namespace std; + + using iterator = typename string_type::const_iterator; + + // Create an empty URL object for the empty argument. 
Note that the scheme + // is default-constructed, and so may stay undefined in this case. + // + if (u.empty ()) + return; + + try + { + // At the end of a component parsing 'i' points to the next component + // start, and 'b' stays unchanged. + // + iterator b (u.cbegin ()); + iterator i (b); + iterator e (u.cend ()); + + // Extract scheme. + // + for(char_type c; i != e && (c = *i) != ':'; ++i) + { + if (!(i == b + ? alpha (c) + : (alnum (c) || c == '+' || c == '-' || c == '.'))) + throw invalid_argument ("invalid scheme"); + } + + if (i == b || i == e || i == b + 1) // Forbids one letter length schemes. + throw invalid_argument ("no scheme"); + + string_type sc (b, i++); // Skip ':'. + + // Parse authority. + // + if (i != e && i + 1 != e && *i == '/' && *(i + 1) == '/') + { + i += 2; // Skip '//'. + + // Find the authority end. + // + size_t p (u.find_first_of (string_type ({'/', '?', '#'}), i - b)); + iterator ae (p != string_type::npos ? b + p : e); + + string_type auth (i, ae); + i = ae; + + // Extract user information. + // + string_type user; + p = auth.find ('@'); + if (p != string_type::npos) + { + // Don't URL-decode the user information (scheme-specific). + // + user = string_type (auth, 0, p); + auth = string_type (auth, p + 1); + } + + // Extract host. + // + string_type host; + p = auth.find_last_of({']', ':'}); // Note: ':' can belong to IPv6. + + if (p != string_type::npos && auth[p] == ']') // There is no port. + p = string_type::npos; + + if (p != string_type::npos) + { + host = string_type (auth, 0, p); + auth = string_type (auth, p + 1); + } + else + { + host = move (auth); + auth = string_type (); + } + + // Extract port. 
+ // + uint16_t port (0); + if (!auth.empty ()) + { + auto bad_port = [] () {throw invalid_argument ("invalid port");}; + + for (auto c: auth) + { + if (!digit (c)) + bad_port (); + } + + unsigned long long n (stoull (auth)); + if (n == 0 || n > UINT16_MAX) + bad_port (); + + port = static_cast<uint16_t> (n); + } + + // User information and port are only meaningful if the host part is + // present. + // + if (host.empty () && (!user.empty () || port != 0)) + throw invalid_argument ("no host"); + + authority = {move (user), host_type (move (host)), port}; + } + + // Extract path. + // + if (i != e && *i == '/') + { + ++i; // Skip '/'. + + // Verify and URL-decode the path. + // + iterator j (i); + for(char_type c; j != e && (c = *j) != '?' && c != '#'; ++j) + { + if (!(url_path_char (c) || c == '%')) + throw invalid_argument ("invalid path"); + } + + // Note that encoding for non-ASCII path is not specified (in contrast + // to the host name), and presumably is local to the referenced + // authority. + // + string_type s; + decode (i, j, back_inserter (s)); + path = traits::translate_path (move (s)); + i = j; + } + + // Extract query. + // + if (i != e && *i == '?') + { + ++i; // Skip '?'. + + // Find the query component end. + // + size_t p (u.find ('#', i - b)); + iterator qe (p != string_type::npos ? b + p : e); + + // Don't URL-decode the query (scheme-specific). + // + query = string_type (i, qe); + i = qe; + } + + // We don't suppose to end up with an empty URL. + // + if (empty ()) + throw invalid_argument ("no authority, path or query"); + + // Parse fragment. + // + if (i != e) + { + ++i; // Skip '#'. + + // Don't URL-decode the fragment (media type-specific). + // + fragment = string_type (i, e); + i = e; + } + + assert (i == e); + + // Translate the scheme string representation to its type. 
+ // + scheme = traits::translate_scheme (u, + move (sc), + authority, + path, + query, + fragment); + } + // If we fail to parse the URL, then delegate this job to + // traits::translate_scheme(). If it also fails, leaving the components + // absent, then we re-throw. + // + catch (const invalid_argument&) + { + authority = nullopt; + path = nullopt; + query = nullopt; + fragment = nullopt; + + scheme = traits::translate_scheme (u, + string_type () /* scheme */, + authority, + path, + query, + fragment); + + if (!authority && !path && !query && !fragment) + throw; + } + } + + template <typename S, typename T> + typename basic_url<S, T>::string_type basic_url<S, T>:: + string () const + { + if (empty ()) + return string_type (); + + string_type u; + string_type r (traits::translate_scheme (u, + scheme, + authority, + path, + query, + fragment)); + + // Return the custom URL object representation if provided. + // + if (!u.empty ()) + return u; + + r += ':'; + + if (authority) + { + r += '/'; + r += '/'; + r += authority->string (); + } + + if (path) + { + r += '/'; + r += encode (traits::translate_path (*path), + [] (char_type& c) {return !url_path_char (c);}); + } + + if (query) + { + r += '?'; + r += *query; + } + + if (fragment) + { + r += '#'; + r += *fragment; + } + + return r; + } + + template <typename S, typename T> + template <typename I, typename O, typename F> + void basic_url<S, T>:: + encode (I b, I e, O o, F&& f) + { + const char_type digits[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', + '9', 'A', 'B', 'C', 'D', 'E', 'F'}; + + for (; b != e; ++b) + { + char_type c (*b); + + if (c == '%' || f (c)) + { + assert (c == *b); // Must not be custom-encoded. + + *o++ = '%'; + *o++ = digits[(c >> 4) & 0xF]; + *o++ = digits[c & 0xF]; + } + else + { + assert (c != '%'); // Otherwise decoding will be ambiguous. 
+ *o++ = c; + } + } + } + + template <typename S, typename T> + template <typename I, typename O, typename F> + void basic_url<S, T>:: + decode (I b, I e, O o, F&& f) + { + using namespace std; + + for (; b != e; ++b) + { + char_type c (*b); + + // URL-decode the character. + // + if (c == '%') + { + // Note that we can't use (potentially more efficient) strtoul() here + // as it doesn't have an overload for the wide character string. + // However, the code below shouldn't be inefficient, given that the + // string is short, and so is (probably) stack-allocated. + // + // Note that stoul() throws if no conversion could be performed, so we + // explicitly check for xdigits. + // + if (++b != e && xdigit (*b) && b + 1 != e && xdigit (*(b + 1))) + c = static_cast<char_type> (stoul (string_type (b, b + 2), + nullptr, + 16)); + else + throw invalid_argument ("invalid URL-encoding"); + + ++b; // Position to the second xdigit. + } + else + f (c); + + *o++ = c; + } + } +} diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx index d703211..fcb8789 100644 --- a/libbutl/utility.ixx +++ b/libbutl/utility.ixx @@ -137,4 +137,34 @@ namespace butl { return std::isalnum (c); } + + inline bool + xdigit (char c) + { + return std::isxdigit (c); + } + + inline bool + alpha (wchar_t c) + { + return std::iswalpha (c); + } + + inline bool + digit (wchar_t c) + { + return std::iswdigit (c); + } + + inline bool + alnum (wchar_t c) + { + return std::iswalnum (c); + } + + inline bool + xdigit (wchar_t c) + { + return std::iswxdigit (c); + } } diff --git a/libbutl/utility.mxx b/libbutl/utility.mxx index 3f23581..988ca22 100644 --- a/libbutl/utility.mxx +++ b/libbutl/utility.mxx @@ -22,7 +22,8 @@ #include <exception> // exception, uncaught_exception[s]() //#include <functional> // hash -#include <cctype> // toupper(), tolower(), isalpha(), isdigit(), isalnum() +#include <cctype> // toupper(), tolower(), is*() +#include <cwctype> // isw*() #endif #include <libbutl/ft/lang.hxx> // thread_local 
@@ -138,14 +139,15 @@ LIBBUTL_MODEXPORT namespace butl } }; - bool - alpha (char); - - bool - digit (char); + bool alpha (char); + bool digit (char); + bool alnum (char); + bool xdigit (char); - bool - alnum (char); + bool alpha (wchar_t); + bool digit (wchar_t); + bool alnum (wchar_t); + bool xdigit (wchar_t); // Key comparators (i.e., to be used in sets, maps, etc). // diff --git a/tests/url/buildfile b/tests/url/buildfile new file mode 100644 index 0000000..ed8380c --- /dev/null +++ b/tests/url/buildfile @@ -0,0 +1,8 @@ +# file : tests/url/buildfile +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +import libs = libbutl%lib{butl} +libs += $stdmod_lib + +exe{driver}: {hxx cxx}{*} $libs test{testscript} diff --git a/tests/url/driver.cxx b/tests/url/driver.cxx new file mode 100644 index 0000000..95fe9cb --- /dev/null +++ b/tests/url/driver.cxx @@ -0,0 +1,346 @@ +// file : tests/url/driver.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include <cassert> + +#ifndef __cpp_lib_modules +#include <string> +#include <utility> // move() +#include <iostream> +#include <iterator> // back_inserter +#include <stdexcept> // invalid_argument +#endif + +// Other includes. 
+ +#ifdef __cpp_modules +#ifdef __cpp_lib_modules +import std.core; +import std.io; +#endif +import butl.url; +import butl.utility; // operator<<(ostream, exception) +#else +#include <libbutl/url.mxx> +#include <libbutl/utility.mxx> +#endif + +using namespace std; +using namespace butl; + +enum class scheme +{ + http, + https, + file +}; + +namespace butl +{ + template <> + struct url_traits<scheme> + { + using string_type = wstring; + using path_type = wstring; + + using scheme_type = scheme; + using authority_type = basic_url_authority<string_type>; + + static scheme_type + translate_scheme (const string_type& url, + string_type&& scheme, + optional<authority_type>& /*authority*/, + optional<path_type>& path, + optional<string_type>& /*query*/, + optional<string_type>& /*fragment*/) + { + // Note that we must compare case-insensitive in the real program. + // + if (scheme == L"http") + return scheme_type::http; + else if (scheme == L"https") + return scheme_type::https; + else if (scheme == L"file") + return scheme_type::file; + else if (scheme.empty ()) + { + // If the URL looks like an absolute filesystem path, then translate it + // to the file URL. If it is not, then leave all the components absent + // to fail with a proper exception description. + // + wchar_t c; + if ((c = url[0]) == '/' || + (url.size () > 2 && alpha (c) && url[1] == ':' && url[2] == '/')) + path = url; + + return scheme_type::file; + } + else + throw invalid_argument ("unknown scheme"); + } + + // Translate scheme type back to its string representation. 
+ // + static string_type + translate_scheme (string_type&, /*url*/ + const scheme_type& scheme, + const optional<authority_type>& /*authority*/, + const optional<path_type>& /*path*/, + const optional<string_type>& /*query*/, + const optional<string_type>& /*fragment*/) + { + switch (scheme) + { + case scheme_type::http: return L"http"; + case scheme_type::https: return L"https"; + case scheme_type::file: return L"file"; + } + + assert (false); // Can't be here. + return L""; + } + + static path_type + translate_path (string_type&& path) + { + return path_type (move (path)); + } + + static string_type + translate_path (const path_type& path) {return string_type (path);} + }; +} + +// Usages: +// +// argv[0] +// argv[0] [-c|-s|-w] <url> +// +// Perform some basic tests if no URL is provided. Otherwise round-trip the URL +// to STDOUT. URL must contain only ASCII characters. Exit with zero code on +// success. Exit with code one on parsing failure, printing error description +// to STDERR. +// +// -c +// Print the URL components one per line. Print the special '[null]' string +// for absent components. This is the default option if URL is provided. +// +// -s +// Print stringified url object representation. +// +// -w +// Same as above, but use the custom wstring-based url_traits +// implementation for the basic_url template. +// +int +main (int argc, const char* argv[]) +try +{ + using wurl = basic_url<scheme>; + using wurl_authority = wurl::authority_type; + using wurl_host = wurl::host_type; + + enum class print_mode + { + str, + wstr, + comp + } mode (print_mode::comp); + + int i (1); + for (; i != argc; ++i) + { + string o (argv[i]); + if (o == "-s") + mode = print_mode::str; + else if (o == "-w") + mode = print_mode::wstr; + else if (o == "-c") + mode = print_mode::comp; + else + break; // End of options. + } + + if (i == argc) + { + // Test ctors and operators. 
+ // + { + wurl u0 ((wstring ())); + assert (u0.empty ()); + assert (u0 == wurl ()); + + wurl u1 (scheme::http, + wurl_authority {wstring (), wurl_host (L"[123]"), 0}, + wstring (L"login"), + wstring (L"q="), + wstring (L"f")); + + assert (!u1.empty ()); + assert (u1 != u0); + + wurl u2 (scheme::http, + wurl_host (L"123", url_host_kind::ipv6), + wstring (L"login"), + wstring (L"q="), + wstring (L"f")); + + assert (u2 == u1); + + wurl u3 (scheme::http, + wurl_host (L"123", url_host_kind::ipv6), + 0, + wstring (L"login"), + wstring (L"q="), + wstring (L"f")); + + assert (u3 == u2); + + wurl u4 (scheme::http, + L"[123]", + wstring (L"login"), + wstring (L"q="), + wstring (L"f")); + + assert (u4 == u3); + + wurl u5 (scheme::http, + L"[123]", + 0, + wstring (L"login"), + wstring (L"q="), + wstring (L"f")); + + assert (u5 == u4); + } + + // Test encode and decode. + // + { + const char* s ("ABC +"); + string es (url::encode (s)); + + assert (es == "ABC%20%2B"); + string ds (url::decode (es)); + + assert (ds == s); + } + + { + const char* s ("ABC +"); + + string es (url::encode (s, + [] (char& c) -> bool + { + if (c == ' ') + { + c = '+'; + return false; + } + return !url::unreserved (c); + })); + + assert (es == "ABC+%2B"); + + string ds (url::decode (es.c_str (), + [] (char& c) + { + if (c == '+') + c = ' '; + })); + assert (ds == s); + } + { + const wchar_t s[] = L"ABC "; + + wstring es; + wurl::encode (s, s + 4, + back_inserter (es), + [] (wchar_t& c) -> bool + { + if (!alnum (c)) + return true; + + ++c; + return false; + }); + assert (es == L"BCD%20"); + + wstring ds (wurl::decode (es, + [] (wchar_t& c) + { + if (alnum (c)) + --c; + })); + assert (ds == s); + } + } + else // Round-trip the URL. + { + assert (i + 1 == argc); + + const char* ua (argv[i]); + + switch (mode) + { + case print_mode::str: + { + cout << url (ua) << endl; + break; + } + case print_mode::wstr: + { + // Convert ASCII string to wstring. 
+ // + wstring s (ua, ua + strlen (ua)); + + wcout << wurl (s) << endl; + break; + } + case print_mode::comp: + { + // Convert ASCII string to wstring. + // + wstring s (ua, ua + strlen (ua)); + wurl u (s); + + if (!u.empty ()) + { + wstring s; + wcout << wurl::traits::translate_scheme (s, + u.scheme, + nullopt, + nullopt, + nullopt, + nullopt) << endl; + } + else + wcout << L"[null]" << endl; + + if (u.authority) + { + const wchar_t* kinds[] = {L"ipv4", L"ipv6", L"name"}; + const wurl_authority& a (*u.authority); + + wcout << a.user << L'@' << a.host.value << L':' << a.port + << " " << kinds[static_cast<size_t> (a.host.kind)] << endl; + } + else + wcout << L"[null]" << endl; + + wcout << (u.path ? *u.path : L"[null]") << endl + << (u.query ? *u.query : L"[null]") << endl + << (u.fragment ? *u.fragment : L"[null]") << endl; + break; + } + } + } + + return 0; +} +catch (const invalid_argument& e) +{ + cerr << e << endl; + return 1; +} diff --git a/tests/url/testscript b/tests/url/testscript new file mode 100644 index 0000000..d81f282 --- /dev/null +++ b/tests/url/testscript @@ -0,0 +1,378 @@ +# file : tests/url/testscript +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +:basic +: +$* + +: components +: +{ + : all + : + $* 'https://user@stage.b2.org:443/libbutl?f=full#description' >>EOO + https + user@stage.b2.org:443 name + libbutl + f=full + description + EOO + + : empty-url + : + $* '' >>EOO + [null] + [null] + [null] + [null] + [null] + EOO + + : no-id + : + { + $* 'file:#f' 2>'no authority, path or query' != 0 : fragment + $* 'file:aaa' 2>'no authority, path or query' != 0 : junk + $* 'file:' 2>'no authority, path or query' != 0 : none + } + + : scheme + : + { + : detected + : + $* 'http://build2.org' >>EOO + http + @build2.org:0 name + [null] + [null] + [null] + EOO + + : deduced + : + $* 'c:/a' >>EOO + file + [null] + c:/a + [null] + [null] + EOO + + $* ':/a' 2>'no scheme' != 0 : none + $* 'http' 
2>'no scheme' != 0 : unterminated + $* 'ht~tp://a.com' 2>'invalid scheme' != 0 : invalid-char + $* '1http://a.com' 2>'invalid scheme' != 0 : invalid-first-char + } + + : authority + { + : absent + : + $* 'file:/tmp/a' >>EOO + file + [null] + tmp/a + [null] + [null] + EOO + + : empty + : + $* 'file:///tmp/a' >>EOO + file + @:0 name + tmp/a + [null] + [null] + EOO + + : trailing + : + $* 'http://localhost' >>EOO + http + @localhost:0 name + [null] + [null] + [null] + EOO + + : user + : + { + : non-empty + : + $* 'http://admin@localhost' >>EOO + http + admin@localhost:0 name + [null] + [null] + [null] + EOO + + : empty + : + $* 'http://@localhost' >>EOO + http + @localhost:0 name + [null] + [null] + [null] + EOO + } + + : host + : + { + : ipv6 + : + { + : port + : + $* 'http://[1:23]:443' >>EOO + http + @1:23:443 ipv6 + [null] + [null] + [null] + EOO + + : no-port + : + $* 'http://[1:23]' >>EOO + http + @1:23:0 ipv6 + [null] + [null] + [null] + EOO + + $* 'http://[123' 2>'invalid IPv6 address' != 0 : missed-bracket + $* 'http://[123] :80' 2>'invalid IPv6 address' != 0 : extra-char + } + + : ipv4 + : + { + : valid + : + $* 'http://0.10.200.255' >>EOO + http + @0.10.200.255:0 ipv4 + [null] + [null] + [null] + EOO + + : long + : + $* 'http://0.10.200.255.30' >>EOO + http + @0.10.200.255.30:0 name + [null] + [null] + [null] + EOO + + : short + : + $* 'http://0.10.200' >>EOO + http + @0.10.200:0 name + [null] + [null] + [null] + EOO + + : missed + : + $* 'http://0.10..200' >>EOO + http + @0.10..200:0 name + [null] + [null] + [null] + EOO + + : out-of-range + : + $* 'http://0.10.200.256' >>EOO + http + @0.10.200.256:0 name + [null] + [null] + [null] + EOO + } + + : name + : + { + : valid + : + $* 'https://www.b2.org' >>EOO + https + @www.b2.org:0 name + [null] + [null] + [null] + EOO + + : encoded + : + { + : valid + : + $* 'https://www.%62%32.org' >>EOO + https + @www.b2.org:0 name + [null] + [null] + [null] + EOO + + $* 'https://www.%62%3.org' 2>'invalid URL-encoding' != 
0 : short + $* 'https://www.%62%3x.org' 2>'invalid URL-encoding' != 0 : invalid + $* 'https://www.%62%.org' 2>'invalid URL-encoding' != 0 : absent + } + + $* 'https://www.b|2.org' 2>'invalid host name' != 0 : invalid-char + } + + $* 'http://admin@:80?q=' 2>'no host' != 0: no-host + } + + : port + : + { + : valid + : + $* 'http://build2.org:443' >>EOO + http + @build2.org:443 name + [null] + [null] + [null] + EOO + + $* 'http://build2.org:-433' 2>'invalid port' != 0 : invalid-char + $* 'http://build2.org:70000' 2>'invalid port' != 0 : exceeds-max + $* 'http://build2.org:0' 2>'invalid port' != 0 : zero + } + } + + : path + : + { + : absent + : + $* 'http://b2.org' >>EOO + http + @b2.org:0 name + [null] + [null] + [null] + EOO + + : empty + : + $* 'http://b2.org/' >>EOO + http + @b2.org:0 name + + [null] + [null] + EOO + + : non-empty + : + $* 'http://b2.org/s/q' >>EOO + http + @b2.org:0 name + s/q + [null] + [null] + EOO + + : encoded + : + $* 'http://b2.org/%6F/s' >>EOO + http + @b2.org:0 name + o/s + [null] + [null] + EOO + } + + : query + : + { + : no-fragment + : + $* 'http://b2.org/a?x=foo&y=bar' >>EOO + http + @b2.org:0 name + a + x=foo&y=bar + [null] + EOO + + : fragment + : + $* 'http://b2.org/a?foo#bar' >>EOO + http + @b2.org:0 name + a + foo + bar + EOO + } + + : fragment + : + { + $* 'http://b2.org#foo' >>EOO + http + @b2.org:0 name + [null] + [null] + foo + EOO + } +} + +: string +{ + test.options += -s + + : authority + : + { + : host + : + { + $* 'file:///a' >'file:///a' : empty + $* 'http://1.1.1.1' >'http://1.1.1.1' : ipv4 + $* 'https://[1:2:3]' >'https://[1:2:3]' : ipv6 + $* 'file://a%d1%84' >'file://a%D1%84' : name + } + + $* 'http://admin@localhost' >'http://admin@localhost' : user + $* 'http://localhost:8080' >'http://localhost:8080' : port + $* 'file:/a' >'file:/a' : absent + } + + $* '' >'' : empty + $* 'file:/b%7C2' >'file:/b%7C2' : path + $* 'http://a?q=' >'http://a?q=' : query + $* 'http://a#f' >'http://a#f' : fragment +} + +: wstring +: +{ + 
u = 'https://user@stage.b2.org:443/libbutl?f=full#description' + $* -w "$u" >"$u" +} |