From e5bfd17637bf297c3cfe509d51027916864092d5 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Sun, 10 Dec 2017 10:02:19 +0300 Subject: Add basic_url class template --- libbutl/url.mxx | 476 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 476 insertions(+) create mode 100644 libbutl/url.mxx (limited to 'libbutl/url.mxx') diff --git a/libbutl/url.mxx b/libbutl/url.mxx new file mode 100644 index 0000000..fe091f1 --- /dev/null +++ b/libbutl/url.mxx @@ -0,0 +1,476 @@ +// file : libbutl/url.mxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules +#pragma once +#endif + +// C includes. + +#include + +#ifndef __cpp_lib_modules +#include +#include // uint*_t +#include // move() +#include +#include // back_inserter + +#include // size_t +#include // invalid_argument +#endif + +// Other includes. + +#ifdef __cpp_modules +export module butl.url; +#ifdef __cpp_lib_modules +import std.core; +import std.io; +#endif +import butl.path; +import butl.utility; +import butl.optional; +#else +#include +#include +#include +#endif + +#include + +LIBBUTL_MODEXPORT namespace butl +{ + // RFC3986 Uniform Resource Locator (URL). + // + // = :[//[]][/][?][#] + // = [@][:] + // + // Some examples of equivalent URLs to meditate upon: + // + // file://localhost/tmp (localhost authority) + // file:///tmp (empty authority) + // file:/tmp (absent authority) + // + // file://localhost/c:/tmp + // file:///c:/tmp + // file:/c:/tmp + // + // We think of the slash between and as a separator but + // with the path always interpreted as starting from the "root" of the + // authority. Thus: + // + // file://localhost/tmp -> 'file'://'localhost'/'tmp' -> /tmp + // file://localhost/c:/tmp -> 'file'://'localhost'/'c:/tmp' -> c:/tmp + // + // This means that the component is represented as a relative path + // and, in the general case, we cannot use our path type for its storage + // since it assumes the path is for the host platform. In other words, the + // interpretation of the path has to take into account the platform of the + // authority host. Note, however, that a custom url_traits implementation + // can choose to use the path type if local paths are to be interpreted as + // relative to the host. + // + // Note that we currently forbid one character schemes to support scheme- + // less (Windows) paths which can be done by url_traits::translate_scheme() + // (see below). (A Windows path that uses forward slashes would be parsed as + // a valid authority-less URL). + + // URL host component can be an IPv4 address (if matches its dotted-decimal + // notation), an IPv6 address (if enclosed in [square brackets]) or + // otherwise a name. + // + // Note that non-ASCII host names are allowed in URLs. They must be + // UTF8-encoded and URL-encoded afterwards. Curently we store the parsed + // host name UTF8-encoded without regards to the template argument string + // type. Later we may add support for more appropriate encodings for + // multi-byte character types. + // + enum class url_host_kind {ipv4, ipv6, name}; + + template + struct basic_url_host + { + using string_type = S; + using kind_type = url_host_kind; + + string_type value; + kind_type kind; + + // Can be treated as const string_type&. + // + operator const string_type& () const noexcept {return value;} + + // Create an empty host. + // + basic_url_host (): kind (kind_type::name) {} + + // Create the host object from its string representation as it appears in + // a URL, throwing std::invalid_argument if invalid. Remove the enclosing + // square brackets for IPv6 addresses, and URL-decode host names. + // + // Note that currently we don't validate IPv6 addresses. + // + explicit + basic_url_host (string_type); + + basic_url_host (string_type v, kind_type k) + : value (std::move (v)), kind (k) {} + + bool + empty () const + { + assert (kind == kind_type::name || !value.empty ()); + return value.empty (); + } + + // Return string representation of the host as it would appear in a URL. + // + string_type + string () const; + }; + + template + struct basic_url_authority + { + using string_type = S; + using host_type = basic_url_host; + + string_type user; // Empty if not specified. + host_type host; + std::uint16_t port; // Zero if not specified. + + bool + empty () const + { + assert (!host.empty () || (user.empty () && port == 0)); + return host.empty (); + } + + // Return a string representation of the URL authority. String + // representation of an empty instance is the empty string. + // + string_type + string () const; + }; + + template + struct url_traits + { + using scheme_type = H; + using string_type = S; + using path_type = P; + + using authority_type = basic_url_authority; + + // Translate the scheme string representation to its type. May throw + // std::invalid_argument. May change the URL components. + // + // This function is called with an empty scheme if the URL has no scheme, + // the scheme is invalid, or it could not be parsed into components + // according to the URL syntax. In this case all the passed components + // reference empty/absent values and if they remain unchanged on return, + // the URL is considered invalid and the std::invalid_argument exception + // with an appropriate description is thrown by the URL object constructor. + // This can be used to support scheme-less URLs, local paths, etc. + // + static scheme_type + translate_scheme (const string_type& /*url*/, + string_type&& scheme, + optional& /*authority*/, + optional& /*path*/, + optional& /*query*/, + optional& /*fragment*/) + { + return scheme_type (std::move (scheme)); + } + + // Translate scheme type back to its string representation. + // + // Similar to the above the function is called with an empty string + // representation. If on return this value is no longer empty, then it is + // assume the URL has been translated in a custom manner (in which case + // the returned scheme value is ignored). + // + static string_type + translate_scheme (string_type&, /*url*/ + const scheme_type& scheme, + const optional& /*authority*/, + const optional& /*path*/, + const optional& /*query*/, + const optional& /*fragment*/) + { + return string_type (scheme); + } + + // Translate the path string representation to its type. + // + static path_type + translate_path (string_type&& path) + { + return path_type (std::move (path)); + } + + // Translate path type back to its string representation. + // + static string_type + translate_path (const path_type& path) {return string_type (path);} + }; + + template > + class basic_url + { + public: + using traits = T; + + using string_type = typename traits::string_type; + using char_type = typename string_type::value_type; + using path_type = typename traits::path_type; + + using scheme_type = typename traits::scheme_type; + using authority_type = typename traits::authority_type; + using host_type = typename authority_type::host_type; + + scheme_type scheme; + optional authority; + optional path; + optional query; + optional fragment; + + // Create an empty URL object. + // + basic_url () = default; + + // Create the URL object from its string representation. If the argument is + // empty, then create an empty object. Otherwise verify that the string is + // compliant to the generic URL syntax. URL-decode and validate components + // with common for all schemes syntax (scheme, host, port, path). + // Throw std::invalid_argument if the passed string is not a valid URL + // representation. + // + // Validation and URL-decoding of the scheme-specific components can be + // provided by a custom url_traits::translate_scheme() implementation. + // + explicit + basic_url (const string_type&); + + // Create the URL object from individual components. Performs no + // components URL-decoding or verification. + // + basic_url (scheme_type, + optional, + optional path, + optional query = nullopt, + optional fragment = nullopt); + + basic_url (scheme_type, + host_type host, + optional path, + optional query = nullopt, + optional fragment = nullopt); + + basic_url (scheme_type, + host_type host, + std::uint16_t port, + optional path, + optional query = nullopt, + optional fragment = nullopt); + + basic_url (scheme_type, + string_type host, + optional path, + optional query = nullopt, + optional fragment = nullopt); + + basic_url (scheme_type, + string_type host, + std::uint16_t port, + optional path, + optional query = nullopt, + optional fragment = nullopt); + + bool + empty () const noexcept + { + assert (authority || path || query || !fragment); + return !authority && !path && !query; + } + + // Return a string representation of the URL. Note that while this is not + // necessarily syntactically the same string as what was used to + // initialize this instance, it should be semantically equivalent. String + // representation of an empty instance is the empty string. + // + string_type + string () const; + + // The following predicates can be used to classify URL characters while + // parsing, validating or encoding scheme-specific components. For the + // semantics of character classes see RFC3986. + // + static bool + gen_delim (char_type c) + { + return c == ':' || c == '/' || c == '?' || c == '#' || c == '[' || + c == ']' || c == '@'; + } + + static bool + sub_delim (char_type c) + { + return c == '!' || c == '$' || c == '&' || c == '=' || c == '(' || + c == ')' || c == '*' || c == '+' || c == ',' || c == ';' || + c == '\''; + } + + static bool + reserved (char_type c) {return sub_delim (c) || gen_delim (c);} + + static bool + unreserved (char_type c) + { + return alnum (c) || c == '-' || c == '.' || c =='_' || c == '~'; + } + + // URL-encode a character sequence. + // + // Note that the set of characters that should be encoded may differ for + // different URL components. The optional callback function must return + // true for characters that should be percent-encoded. The function may + // encode the passed character in it's own way with another character (but + // never with '%'), and return false. By default all characters other than + // unreserved are percent-encoded. + // + // Also note that the characters are interpreted as bytes. In other words, + // each character may result in a single encoding triplet. + // + template + static void + encode (I b, I e, + O o, + + // VC (as of 15u3) doesn't see unreserved() unless qualified. + // + F&& f = [] (char_type& c) {return !basic_url::unreserved (c);}); + + template + static string_type + encode (const string_type& s, + F&& f = [] (char_type& c) {return !basic_url::unreserved (c);}) + { + string_type r; + encode (s.begin (), s.end (), std::back_inserter (r), f); + return r; + } + + template + static string_type + encode (const char_type* s, + F&& f = [] (char_type& c) {return !basic_url::unreserved (c);}) + { + string_type r; + encode (s, s + string_type::traits_type::length (s), + std::back_inserter (r), f); + return r; + } + + // URL-decode a character sequence. Throw std::invalid_argument if an + // invalid encoding sequence is encountered. + // + // If some characters in the sequence are encoded with another characters + // (rather than percent-encoded), then one must provide the callback + // function to decode them. + // + template + static void + decode (I b, I e, O o, F&& f = [] (char_type&) {}); + + template + static string_type + decode (const string_type& s, F&& f = [] (char_type&) {}) + { + string_type r; + decode (s.begin (), s.end (), std::back_inserter (r), f); + return r; + } + + template + static string_type + decode (const char_type* s, F&& f = [] (char_type&) {}) + { + string_type r; + decode (s, s + string_type::traits_type::length (s), + std::back_inserter (r), f); + return r; + } + }; + + using url_authority = basic_url_authority; + using url = basic_url ; + + template + inline bool + operator== (const basic_url_host& x, const basic_url_host& y) noexcept + { + return x.value == y.value && x.kind == y.kind; + } + + template + inline bool + operator!= (const basic_url_host& x, const basic_url_host& y) noexcept + { + return !(x == y); + } + + template + inline bool + operator== (const basic_url_authority& x, + const basic_url_authority& y) noexcept + { + return x.user == y.user && x.host == y.host && x.port == y.port; + } + + template + inline bool + operator!= (const basic_url_authority& x, + const basic_url_authority& y) noexcept + { + return !(x == y); + } + + template + inline bool + operator== (const basic_url& x, const basic_url& y) noexcept + { + if (!(x.authority == y.authority && x.path == y.path && + x.query == y.query && x.fragment == y.fragment)) + return false; + + assert (x.empty () == y.empty ()); + + if (x.empty ()) + return true; + + return x.scheme == y.scheme; // None is empty, so schemes are valid. + } + + template + inline bool + operator!= (const basic_url& x, const basic_url& y) noexcept + { + return !(x == y); + } + + template + inline auto + operator<< (std::basic_ostream& o, + const basic_url& u) -> decltype (o) + { + return o << u.string (); + } +} + +#include +#include -- cgit v1.1