diff options
Diffstat (limited to 'libbutl/url.mxx')
-rw-r--r-- | libbutl/url.mxx | 476 |
1 files changed, 476 insertions, 0 deletions
diff --git a/libbutl/url.mxx b/libbutl/url.mxx new file mode 100644 index 0000000..fe091f1 --- /dev/null +++ b/libbutl/url.mxx @@ -0,0 +1,476 @@ +// file : libbutl/url.mxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules +#pragma once +#endif + +// C includes. + +#include <cassert> + +#ifndef __cpp_lib_modules +#include <string> +#include <cstdint> // uint*_t +#include <utility> // move() +#include <ostream> +#include <iterator> // back_inserter + +#include <cstddef> // size_t +#include <stdexcept> // invalid_argument +#endif + +// Other includes. + +#ifdef __cpp_modules +export module butl.url; +#ifdef __cpp_lib_modules +import std.core; +import std.io; +#endif +import butl.path; +import butl.utility; +import butl.optional; +#else +#include <libbutl/path.mxx> +#include <libbutl/utility.mxx> +#include <libbutl/optional.mxx> +#endif + +#include <libbutl/export.hxx> + +LIBBUTL_MODEXPORT namespace butl +{ + // RFC3986 Uniform Resource Locator (URL). + // + // <url> = <scheme>:[//[<authority>]][/<path>][?<query>][#<fragment>] + // <authority> = [<user>@]<host>[:<port>] + // + // Some examples of equivalent URLs to meditate upon: + // + // file://localhost/tmp (localhost authority) + // file:///tmp (empty authority) + // file:/tmp (absent authority) + // + // file://localhost/c:/tmp + // file:///c:/tmp + // file:/c:/tmp + // + // We think of the slash between <authority> and <path> as a separator but + // with the path always interpreted as starting from the "root" of the + // authority. Thus: + // + // file://localhost/tmp -> 'file'://'localhost'/'tmp' -> /tmp + // file://localhost/c:/tmp -> 'file'://'localhost'/'c:/tmp' -> c:/tmp + // + // This means that the <path> component is represented as a relative path + // and, in the general case, we cannot use our path type for its storage + // since it assumes the path is for the host platform. 
In other words, the + // interpretation of the path has to take into account the platform of the + // authority host. Note, however, that a custom url_traits implementation + // can choose to use the path type if local paths are to be interpreted as + // relative to the host. + // + // Note that we currently forbid one-character schemes to support scheme- + // less (Windows) paths which can be done by url_traits::translate_scheme() + // (see below). (A Windows path that uses forward slashes would be parsed as + // a valid authority-less URL). + + // URL host component can be an IPv4 address (if it matches its dotted-decimal + // notation), an IPv6 address (if enclosed in [square brackets]) or + // otherwise a name. + // + // Note that non-ASCII host names are allowed in URLs. They must be + // UTF8-encoded and URL-encoded afterwards. Currently we store the parsed + // host name UTF8-encoded without regard to the template argument string + // type. Later we may add support for more appropriate encodings for + // multi-byte character types. + // + enum class url_host_kind {ipv4, ipv6, name}; + + template <typename S> + struct basic_url_host + { + using string_type = S; + using kind_type = url_host_kind; + + string_type value; + kind_type kind; + + // Can be treated as const string_type&. + // + operator const string_type& () const noexcept {return value;} + + // Create an empty host. + // + basic_url_host (): kind (kind_type::name) {} + + // Create the host object from its string representation as it appears in + // a URL, throwing std::invalid_argument if invalid. Remove the enclosing + // square brackets for IPv6 addresses, and URL-decode host names. + // + // Note that currently we don't validate IPv6 addresses. 
+ // + explicit + basic_url_host (string_type); + + basic_url_host (string_type v, kind_type k) + : value (std::move (v)), kind (k) {} + + bool + empty () const + { + assert (kind == kind_type::name || !value.empty ()); + return value.empty (); + } + + // Return the string representation of the host as it would appear in a URL. + // + string_type + string () const; + }; + + template <typename S> + struct basic_url_authority + { + using string_type = S; + using host_type = basic_url_host<string_type>; + + string_type user; // Empty if not specified. + host_type host; + std::uint16_t port; // Zero if not specified. + + bool + empty () const + { + assert (!host.empty () || (user.empty () && port == 0)); + return host.empty (); + } + + // Return a string representation of the URL authority. String + // representation of an empty instance is the empty string. + // + string_type + string () const; + }; + + template <typename H, typename S = H, typename P = S> + struct url_traits + { + using scheme_type = H; + using string_type = S; + using path_type = P; + + using authority_type = basic_url_authority<string_type>; + + // Translate the scheme string representation to its type. May throw + // std::invalid_argument. May change the URL components. + // + // This function is called with an empty scheme if the URL has no scheme, + // the scheme is invalid, or it could not be parsed into components + // according to the URL syntax. In this case all the passed components + // reference empty/absent values and if they remain unchanged on return, + // the URL is considered invalid and the std::invalid_argument exception + // with an appropriate description is thrown by the URL object constructor. + // This can be used to support scheme-less URLs, local paths, etc. 
+ // + static scheme_type + translate_scheme (const string_type& /*url*/, + string_type&& scheme, + optional<authority_type>& /*authority*/, + optional<path_type>& /*path*/, + optional<string_type>& /*query*/, + optional<string_type>& /*fragment*/) + { + return scheme_type (std::move (scheme)); + } + + // Translate scheme type back to its string representation. + // + // Similar to the above, the function is called with an empty string + // representation. If on return this value is no longer empty, then it is + // assumed the URL has been translated in a custom manner (in which case + // the returned scheme value is ignored). + // + static string_type + translate_scheme (string_type&, /*url*/ + const scheme_type& scheme, + const optional<authority_type>& /*authority*/, + const optional<path_type>& /*path*/, + const optional<string_type>& /*query*/, + const optional<string_type>& /*fragment*/) + { + return string_type (scheme); + } + + // Translate the path string representation to its type. + // + static path_type + translate_path (string_type&& path) + { + return path_type (std::move (path)); + } + + // Translate path type back to its string representation. + // + static string_type + translate_path (const path_type& path) {return string_type (path);} + }; + + template <typename H, // scheme + typename T = url_traits<H>> + class basic_url + { + public: + using traits = T; + + using string_type = typename traits::string_type; + using char_type = typename string_type::value_type; + using path_type = typename traits::path_type; + + using scheme_type = typename traits::scheme_type; + using authority_type = typename traits::authority_type; + using host_type = typename authority_type::host_type; + + scheme_type scheme; + optional<authority_type> authority; + optional<path_type> path; + optional<string_type> query; + optional<string_type> fragment; + + // Create an empty URL object. 
+ // + basic_url () = default; + + // Create the URL object from its string representation. If the argument is + // empty, then create an empty object. Otherwise verify that the string is + // compliant with the generic URL syntax. URL-decode and validate components + // whose syntax is common to all schemes (scheme, host, port, path). + // Throw std::invalid_argument if the passed string is not a valid URL + // representation. + // + // Validation and URL-decoding of the scheme-specific components can be + // provided by a custom url_traits::translate_scheme() implementation. + // + explicit + basic_url (const string_type&); + + // Create the URL object from individual components. Performs no + // URL-decoding or verification of the components. + // + basic_url (scheme_type, + optional<authority_type>, + optional<path_type> path, + optional<string_type> query = nullopt, + optional<string_type> fragment = nullopt); + + basic_url (scheme_type, + host_type host, + optional<path_type> path, + optional<string_type> query = nullopt, + optional<string_type> fragment = nullopt); + + basic_url (scheme_type, + host_type host, + std::uint16_t port, + optional<path_type> path, + optional<string_type> query = nullopt, + optional<string_type> fragment = nullopt); + + basic_url (scheme_type, + string_type host, + optional<path_type> path, + optional<string_type> query = nullopt, + optional<string_type> fragment = nullopt); + + basic_url (scheme_type, + string_type host, + std::uint16_t port, + optional<path_type> path, + optional<string_type> query = nullopt, + optional<string_type> fragment = nullopt); + + bool + empty () const noexcept + { + assert (authority || path || query || !fragment); + return !authority && !path && !query; + } + + // Return a string representation of the URL. Note that while this is not + // necessarily syntactically the same string as what was used to + // initialize this instance, it should be semantically equivalent. 
String + // representation of an empty instance is the empty string. + // + string_type + string () const; + + // The following predicates can be used to classify URL characters while + // parsing, validating or encoding scheme-specific components. For the + // semantics of character classes see RFC3986. + // + static bool + gen_delim (char_type c) + { + return c == ':' || c == '/' || c == '?' || c == '#' || c == '[' || + c == ']' || c == '@'; + } + + static bool + sub_delim (char_type c) + { + return c == '!' || c == '$' || c == '&' || c == '=' || c == '(' || + c == ')' || c == '*' || c == '+' || c == ',' || c == ';' || + c == '\''; + } + + static bool + reserved (char_type c) {return sub_delim (c) || gen_delim (c);} + + static bool + unreserved (char_type c) + { + return alnum (c) || c == '-' || c == '.' || c =='_' || c == '~'; + } + + // URL-encode a character sequence. + // + // Note that the set of characters that should be encoded may differ for + // different URL components. The optional callback function must return + // true for characters that should be percent-encoded. The function may + // encode the passed character in its own way with another character (but + // never with '%'), and return false. By default all characters other than + // unreserved are percent-encoded. + // + // Also note that the characters are interpreted as bytes. In other words, + // each character may result in a single encoding triplet. + // + template <typename I, typename O, typename F = bool (*) (char_type&)> + static void + encode (I b, I e, + O o, + + // VC (as of 15u3) doesn't see unreserved() unless qualified. 
+ // + F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);}); + + template <typename F = bool (*) (char_type&)> + static string_type + encode (const string_type& s, + F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);}) + { + string_type r; + encode (s.begin (), s.end (), std::back_inserter (r), f); + return r; + } + + template <typename F = bool (*) (char_type&)> + static string_type + encode (const char_type* s, + F&& f = [] (char_type& c) {return !basic_url<H,T>::unreserved (c);}) + { + string_type r; + encode (s, s + string_type::traits_type::length (s), + std::back_inserter (r), f); + return r; + } + + // URL-decode a character sequence. Throw std::invalid_argument if an + // invalid encoding sequence is encountered. + // + // If some characters in the sequence are encoded with other characters + // (rather than percent-encoded), then one must provide the callback + // function to decode them. + // + template <typename I, typename O, typename F = void (*) (char_type&)> + static void + decode (I b, I e, O o, F&& f = [] (char_type&) {}); + + template <typename F = void (*) (char_type&)> + static string_type + decode (const string_type& s, F&& f = [] (char_type&) {}) + { + string_type r; + decode (s.begin (), s.end (), std::back_inserter (r), f); + return r; + } + + template <typename F = void (*) (char_type&)> + static string_type + decode (const char_type* s, F&& f = [] (char_type&) {}) + { + string_type r; + decode (s, s + string_type::traits_type::length (s), + std::back_inserter (r), f); + return r; + } + }; + + using url_authority = basic_url_authority<std::string>; + using url = basic_url <std::string>; + + template <typename S> + inline bool + operator== (const basic_url_host<S>& x, const basic_url_host<S>& y) noexcept + { + return x.value == y.value && x.kind == y.kind; + } + + template <typename S> + inline bool + operator!= (const basic_url_host<S>& x, const basic_url_host<S>& y) noexcept + { + return !(x == y); + } + + 
template <typename S> + inline bool + operator== (const basic_url_authority<S>& x, + const basic_url_authority<S>& y) noexcept + { + return x.user == y.user && x.host == y.host && x.port == y.port; + } + + template <typename S> + inline bool + operator!= (const basic_url_authority<S>& x, + const basic_url_authority<S>& y) noexcept + { + return !(x == y); + } + + template <typename S, typename T> + inline bool + operator== (const basic_url<S, T>& x, const basic_url<S, T>& y) noexcept + { + if (!(x.authority == y.authority && x.path == y.path && + x.query == y.query && x.fragment == y.fragment)) + return false; + + assert (x.empty () == y.empty ()); + + if (x.empty ()) + return true; + + return x.scheme == y.scheme; // Neither is empty, so schemes are valid. + } + + template <typename S, typename T> + inline bool + operator!= (const basic_url<S, T>& x, const basic_url<S, T>& y) noexcept + { + return !(x == y); + } + + template <typename S, typename T> + inline auto + operator<< (std::basic_ostream<typename T::string_type::value_type>& o, + const basic_url<S, T>& u) -> decltype (o) + { + return o << u.string (); + } +} + +#include <libbutl/url.ixx> +#include <libbutl/url.txx> |