From 2b14f09675c10d999779858ae31934b7eef55b89 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Thu, 3 Sep 2020 20:23:45 +0300 Subject: Add normalize() function to host/URL class templates Also add IPv6 verification to host constructor. --- libbutl/url.ixx | 8 ++ libbutl/url.mxx | 29 +++++- libbutl/url.txx | 269 +++++++++++++++++++++++++++++++++++++++++++++++++-- tests/url/driver.cxx | 57 ++++++++--- tests/url/testscript | 252 +++++++++++++++++++++++++++++++---------------- 5 files changed, 512 insertions(+), 103 deletions(-) diff --git a/libbutl/url.ixx b/libbutl/url.ixx index 9ff3653..b823ee7 100644 --- a/libbutl/url.ixx +++ b/libbutl/url.ixx @@ -116,4 +116,12 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. rootless (true) { } + + template + inline void basic_url:: + normalize () + { + if (authority) + authority->host.normalize (); + } } diff --git a/libbutl/url.mxx b/libbutl/url.mxx index 3ced734..713bc3e 100644 --- a/libbutl/url.mxx +++ b/libbutl/url.mxx @@ -18,6 +18,7 @@ #include // size_t #include // invalid_argument +#include // find(), find_if() #endif // Other includes. @@ -31,10 +32,14 @@ import std.io; import butl.path; import butl.utility; import butl.optional; + +import butl.small_vector; #else #include #include #include + +#include #endif #include @@ -123,7 +128,8 @@ LIBBUTL_MODEXPORT namespace butl // a URL, throwing std::invalid_argument if invalid. Remove the enclosing // square brackets for IPv6 addresses, and URL-decode host names. // - // Note that currently we don't validate IPv6 addresses. + // Note that the 'x:x:x:x:x:x:d.d.d.d' IPv6 address mixed notation is not + // supported. // explicit basic_url_host (string_type); @@ -142,6 +148,22 @@ LIBBUTL_MODEXPORT namespace butl // string_type string () const; + + // Normalize the host value in accordance with its type: + // + // Name - convert to the lower case. Note: only ASCII names are currently + // supported. 
+ // + // IPv4 - strip the leading zeros in its octets. + // + // IPv6 - strip the leading zeros in its groups (hextets), squash the + // longest zero-only hextet sequence, and convert to the lower case + // (as per RFC5952). + // + // Assume that the host value is valid. + // + void + normalize (); }; template @@ -340,6 +362,11 @@ LIBBUTL_MODEXPORT namespace butl string_type string () const; + // Normalize the URL host, if present. + // + void + normalize (); + // The following predicates can be used to classify URL characters while // parsing, validating or encoding scheme-specific components. For the // semantics of character classes see RFC3986. diff --git a/libbutl/url.txx b/libbutl/url.txx index 546ebd1..0951e80 100644 --- a/libbutl/url.txx +++ b/libbutl/url.txx @@ -11,28 +11,99 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. basic_url_host:: basic_url_host (string_type v) { - using std::invalid_argument; + using namespace std; using url = basic_url; using char_type = typename string_type::value_type; kind = v[0] == '[' ? kind_type::ipv6 : kind_type::name; + // Note that an IPv6 address is represented as eight colon-separated + // groups (hextets) of four or less hexadecimal digits. One or more + // consecutive zero hextets can be represented by double-colon (squashed), + // but only once, for example: 1::2:0:0:3. + // if (kind == url_host_kind::ipv6) { + auto bad_ip = [] () {throw invalid_argument ("invalid IPv6 address");}; + if (v.back () != ']') - throw invalid_argument ("invalid IPv6 address"); + bad_ip (); + + // Validate the IPv6 address. + // + // If the address doesn't contain the double-colon, then we will verify + // that it is composed of eight valid hextets. Otherwise, we will split + // the address by the double-colon into two hextet sequences, validate + // their hextets, and verify that their cumulative length is less than + // eight. 
+ // + using iter = typename string_type::const_iterator; + + // Validate a hextet sequence and return its length. + // + auto len = [&bad_ip] (iter b, iter e) + { + size_t r (0); + + if (b == e) + return r; + + size_t n (0); // Current hextet length. + + // Fail if the current hextet is of an invalid length and increment + // the sequence length counter otherwise. + // + auto validate = [&r, &n, &bad_ip] () + { + if (n == 0 || n > 4) + bad_ip (); - value.assign (v, 1, v.size () - 2); + ++r; + n = 0; + }; + + for (iter i (b); i != e; ++i) + { + char_type c (*i); + + if (xdigit (c)) + ++n; + else if (c == ':') + validate (); + else + bad_ip (); + } + + validate (); // Validate the trailing hextet. + return r; + }; + + size_t p (v.find (string_type (2, ':'), 1)); + + size_t n1 (p != string_type::npos + ? len (v.begin () + 1, v.begin () + p) + : len (v.begin () + 1, v.end () - 1)); + + size_t n2 (p != string_type::npos + ? len (v.begin () + p + 2, v.end () - 1) + : 0); + + if (p != string_type::npos ? (n1 + n2 < 8) : (n1 == 8)) + value.assign (v, 1, v.size () - 2); + else + bad_ip (); } - else + else // IPv4 or name. { // Detect the IPv4 address host type. // { - size_t n (0); - string_type oct; + size_t n (0); // Number of octets. + string_type oct; // Current octet. + // Return true if the current octet is valid. + // auto ipv4_oct = [&oct, &n] () -> bool { if (n == 4 || oct.empty () || oct.size () > 3 || stoul (oct) > 255) @@ -126,6 +197,192 @@ LIBBUTL_MODEXPORT namespace butl //@@ MOD Clang needs this for some reason. return string_type (); } + template + void basic_url_host:: + normalize () + { + using namespace std; + + using char_type = typename string_type::value_type; + + switch (kind) + { + case url_host_kind::name: + { + for (char_type& c: value) + c = lcase (static_cast (c)); + + break; + } + case url_host_kind::ipv4: + { + // Strip the leading zeros in the octets. + // + string_type v; // Normalized address. 
+ size_t n (0); // End of the last octet (including dot). + + for (char_type c: value) + { + if (c == '.') + { + // If no digits were added since the last octet was processed, + // then the current octet is zero and so we add it. + // + if (n == v.size ()) + v += '0'; + + v += '.'; + n = v.size (); + } + else if (c != '0' || n != v.size ()) // Not a leading zero? + v += c; + } + + // Handle the trailing zero octet. + // + if (n == v.size ()) + v += '0'; + + value = move (v); + break; + } + case url_host_kind::ipv6: + { + // The overall plan is to (1) normalize the address hextets by + // converting them into lower case and stripping the leading zeros, + // (2) expand the potentially present double-colon into the zero + // hextet sequence, and (3) then squash the longest zero hextet + // sequence into double-colon. For example: + // + // 0ABC::1:0:0:0:0 -> abc:0:0:1:: + + // Parse the address into an array of normalized hextets. + // + // Note that if we meet the double-colon, we cannot expand it into the + // zero hextet sequence right away, since its length is unknown at + // this stage. Instead, we will save its index and expand it later. + // + small_vector v; // Normalized address. + string_type hex; // Current hextet. + optional dci; // Double-colon index, if present. + const string_type z (1, '0'); // Zero hextet. + + // True if any leading zeros are stripped for the current hextet. + // + bool stripped (false); + + auto add_hex = [&v, &hex, &stripped, &dci, &z] () + { + if (!hex.empty ()) + { + v.emplace_back (move (hex)); + hex.clear (); + } + else + { + if (!stripped) // Double-colon? + dci = v.size (); // Note: can be set twice to 0 (think of ::1). + else + v.push_back (z); + } + + stripped = false; + }; + + for (char_type c: value) + { + if (c == ':') + add_hex (); + else if (c == '0' && hex.empty ()) // Leading zero? + stripped = true; + else + hex += lcase (static_cast (c)); + } + + // Handle the trailing hextet. 
+ // + if (!hex.empty ()) + v.emplace_back (move (hex)); + else if (stripped) + v.push_back (z); + // + // Else this is the trailing (already handled) double-colon. + + // Expand double-colon, if present. + // + if (dci) + { + if (v.size () < 8) + v.insert (v.begin () + *dci, 8 - v.size (), z); + else + assert (false); // Too long address. + } + + // Find the longest zero hextet sequence. + // + // Note that the first sequence of zeros is chosen between the two of + // the same length. + // + // Also note that we don't squash the single zero. + // + using iter = typename small_vector::const_iterator; + + iter e (v.end ()); + iter mxi (e); // Longest sequence start. + iter mxe; // Longest sequence end. + size_t mxn (1); // Longest sequence length (must be > 1). + + for (iter i (v.begin ()); i != e; ) + { + i = find (i, e, z); + + if (i != e) + { + iter ze (find_if (i + 1, e, + [&z] (const string_type& h) {return h != z;})); + + size_t n (ze - i); + + if (mxn < n) + { + mxn = n; + mxi = i; + mxe = ze; + } + + i = ze; + } + } + + // Compose the IPv6 string, squashing the longest zero hextet + // sequence, if present. + // + value.clear (); + + for (iter i (v.begin ()); i != e; ) + { + if (i != mxi) + { + // Add ':', unless the hextet is the first or follows double- + // colon. + // + if (!value.empty () && value.back () != ':') + value += ':'; + + value += *i++; + } + else + { + value.append (2, ':'); + i = mxe; + } + } + + break; + } + } + } + // basic_url_authority // template diff --git a/tests/url/driver.cxx b/tests/url/driver.cxx index 5f787bd..95be244 100644 --- a/tests/url/driver.cxx +++ b/tests/url/driver.cxx @@ -151,7 +151,7 @@ namespace butl // Usages: // // argv[0] -// argv[0] [-c|-s|-w] +// argv[0] [-c|-s|-w] [-n] // // Perform some basic tests if no URL is provided. Otherwise round-trip the URL // to STDOUT. URL must contain only ASCII characters. Exit with zero code on @@ -159,7 +159,7 @@ namespace butl // to STDERR. 
// // -c -// Print the URL components one per line. Print the special '[null]' string +// Print the URL components one per line. Print the special '' string // for an absent components. This is the default option if URL is provided. // // -s @@ -169,6 +169,9 @@ namespace butl // Same as above, but use the custom wstring-based url_traits // implementation for the basic_url template. // +// -n +// Normalize the URL. +// int main (int argc, const char* argv[]) try @@ -186,6 +189,8 @@ try comp } mode (print_mode::comp); + bool norm (false); + int i (1); for (; i != argc; ++i) { @@ -196,6 +201,8 @@ try mode = print_mode::wstr; else if (o == "-c") mode = print_mode::comp; + else if (o == "-n") + norm = true; else break; // End of options. } @@ -209,16 +216,18 @@ try assert (u0.empty ()); wurl u1 (scheme::http, - wurl_authority {wstring (), wurl_host (L"[123]"), 0}, + wurl_authority {wstring (), wurl_host (L"[::123]"), 0}, wstring (L"login"), wstring (L"q="), wstring (L"f")); + u1.normalize (); + assert (!u1.empty ()); assert (u1 != u0); wurl u2 (scheme::http, - wurl_host (L"123", url_host_kind::ipv6), + wurl_host (L"::123", url_host_kind::ipv6), wstring (L"login"), wstring (L"q="), wstring (L"f")); @@ -226,16 +235,18 @@ try assert (u2 == u1); wurl u3 (scheme::http, - wurl_host (L"123", url_host_kind::ipv6), + wurl_host (L"::123", url_host_kind::ipv6), 0, wstring (L"login"), wstring (L"q="), wstring (L"f")); + u3.normalize (); + assert (u3 == u2); wurl u4 (scheme::http, - L"[123]", + L"[::123]", wstring (L"login"), wstring (L"q="), wstring (L"f")); @@ -243,7 +254,7 @@ try assert (u4 == u3); wurl u5 (scheme::http, - L"[123]", + L"[::123]", 0, wstring (L"login"), wstring (L"q="), @@ -323,16 +334,31 @@ try { case print_mode::str: { - cout << (*ua != '\0' ? url (ua) : url ()) << endl; + url u; + if (*ua != '\0') + u = url (ua); + + if (norm) + u.normalize (); + + cout << u << endl; break; } case print_mode::wstr: { + wurl u; + // Convert ASCII string to wstring. 
// wstring s (ua, ua + strlen (ua)); - wcout << (!s.empty () ? wurl (s) : wurl ()) << endl; + if (!s.empty ()) + u = wurl (s); + + if (norm) + u.normalize (); + + wcout << u << endl; break; } case print_mode::comp: @@ -345,6 +371,9 @@ try if (!s.empty ()) u = wurl (s); + if (norm) + u.normalize (); + if (!u.empty ()) { wstring s; @@ -357,7 +386,7 @@ try false) << endl; } else - wcout << L"[null]" << endl; + wcout << L"" << endl; if (u.authority) { @@ -368,11 +397,11 @@ try << " " << kinds[static_cast (a.host.kind)] << endl; } else - wcout << L"[null]" << endl; + wcout << L"" << endl; - wcout << (u.path ? *u.path : L"[null]") << endl - << (u.query ? *u.query : L"[null]") << endl - << (u.fragment ? *u.fragment : L"[null]") << endl; + wcout << (u.path ? *u.path : L"") << endl + << (u.query ? *u.query : L"") << endl + << (u.fragment ? *u.fragment : L"") << endl; break; } } diff --git a/tests/url/testscript b/tests/url/testscript index 94f63ff..52c5005 100644 --- a/tests/url/testscript +++ b/tests/url/testscript @@ -21,11 +21,11 @@ $* : empty-url : $* '' >>EOO - [null] - [null] - [null] - [null] - [null] + + + + + EOO : no-id @@ -43,19 +43,19 @@ $* $* 'http://build2.org' >>EOO http @build2.org:0 name - [null] - [null] - [null] + + + EOO : deduced : $* 'c:/a' >>EOO file - [null] + c:/a - [null] - [null] + + EOO $* ':/a' 2>'no scheme' != 0 : none @@ -70,10 +70,10 @@ $* : $* 'file:/tmp/a' >>EOO file - [null] + tmp/a - [null] - [null] + + EOO : empty @@ -82,8 +82,8 @@ $* file @:0 name tmp/a - [null] - [null] + + EOO : query @@ -91,9 +91,9 @@ $* $* 'http://localhost?q' >>EOO http @localhost:0 name - [null] + q - [null] + EOO : fragment @@ -101,8 +101,8 @@ $* $* 'http://localhost#master' >>EOO http @localhost:0 name - [null] - [null] + + master EOO @@ -111,9 +111,9 @@ $* $* 'http://localhost' >>EOO http @localhost:0 name - [null] - [null] - [null] + + + EOO : user @@ -124,9 +124,9 @@ $* $* 'http://admin@localhost' >>EOO http admin@localhost:0 name - [null] - [null] - [null] + 
+ + EOO : empty @@ -134,9 +134,9 @@ $* $* 'http://@localhost' >>EOO http @localhost:0 name - [null] - [null] - [null] + + + EOO } @@ -148,26 +148,97 @@ $* { : port : - $* 'http://[1:23]:443' >>EOO + $* 'http://[1:2:3:4:5:6:7:8]:443' >>EOO http - @1:23:443 ipv6 - [null] - [null] - [null] + @1:2:3:4:5:6:7:8:443 ipv6 + + + EOO : no-port : - $* 'http://[1:23]' >>EOO + $* 'http://[1:2:3:4:5:6:7:abcd]' >>EOO http - @1:23:0 ipv6 - [null] - [null] - [null] + @1:2:3:4:5:6:7:abcd:0 ipv6 + + + + EOO + + : squashed2-begin + : + $* 'http://[::3:4:5:6:7:8]' >>EOO + http + @::3:4:5:6:7:8:0 ipv6 + + + + EOO + + : squashed3-end + : + $* 'http://[1:2:3:4:5::]' >>EOO + http + @1:2:3:4:5:::0 ipv6 + + + + EOO + + : squashed4-middle + : + $* 'http://[1:2::7:8]' >>EOO + http + @1:2::7:8:0 ipv6 + + + + EOO + + : squashed-all + : + $* 'http://[::]' >>EOO + http + @:::0 ipv6 + + + EOO $* 'http://[123' 2>'invalid IPv6 address' != 0 : missed-bracket $* 'http://[123] :80' 2>'invalid IPv6 address' != 0 : extra-char + + $* 'http://[]' 2>'invalid IPv6 address' != 0 : empty + $* 'http://[1:2]' 2>'invalid IPv6 address' != 0 : too-short + $* 'http://[1:2:3:4:5:6:7:8:9]' 2>'invalid IPv6 address' != 0 : too-long1 + $* 'http://[::2:3:4:5:6:7:8:9]' 2>'invalid IPv6 address' != 0 : too-long2 + $* 'http://[::3:4::7:8:9]' 2>'invalid IPv6 address' != 0 : several-squashes + $* 'http://[1:2:3:4::6:7:8:9]' 2>'invalid IPv6 address' != 0 : squash-one-hextet + $* 'http://[12345:2:3:4:5:6:7:8:9]' 2>'invalid IPv6 address' != 0 : long-hextet + $* 'http://[123z:2:3:4:5:6:7:8:9]' 2>'invalid IPv6 address' != 0 : not-hex + + : normalize + : + { + test.options += -n -s + + $* 'http://[::01:0:002:00:0003]' >'http://[::1:0:2:0:3]' : strip-zeros + $* 'http://[::ABC]' >'http://[::abc]' : lower-case + + $* 'http://[::]' >'http://[::]' : squash-all + $* 'http://[::1]' >'http://[::1]' : squash-left + $* 'http://[1::]' >'http://[1::]' : squash-right + $* 'http://[1::2]' >'http://[1::2]' : squash-middle + + $* 
'http://[1::0:2:0:0:3]' >'http://[1::2:0:0:3]' : squash-longest1 + $* 'http://[::0:2:0:0:3]' >'http://[::2:0:0:3]' : squash-longest2 + $* 'http://[::0:2:0:0:0:0]' >'http://[0:0:0:2::]' : squash-longest3 + $* 'http://[0:0:1::2:3:4]' >'http://[::1:0:0:2:3:4]' : squash-first + $* 'http://[0:0:2:0:0:0::]' >'http://[0:0:2::]' : squash-trailing + + $* 'http://[::1:2:3:4:5:6:7]' >'http://[0:1:2:3:4:5:6:7]' : expand-zero + } } : ipv4 @@ -178,9 +249,9 @@ $* $* 'http://0.10.200.255' >>EOO http @0.10.200.255:0 ipv4 - [null] - [null] - [null] + + + EOO : long @@ -188,9 +259,9 @@ $* $* 'http://0.10.200.255.30' >>EOO http @0.10.200.255.30:0 name - [null] - [null] - [null] + + + EOO : short @@ -198,9 +269,9 @@ $* $* 'http://0.10.200' >>EOO http @0.10.200:0 name - [null] - [null] - [null] + + + EOO : missed @@ -208,9 +279,9 @@ $* $* 'http://0.10..200' >>EOO http @0.10..200:0 name - [null] - [null] - [null] + + + EOO : out-of-range @@ -218,10 +289,18 @@ $* $* 'http://0.10.200.256' >>EOO http @0.10.200.256:0 name - [null] - [null] - [null] + + + EOO + + : normalize + : + { + test.options += -n -s + + $* 'http://0.010.000.00' >'http://0.10.0.0' : strip-zeros + } } : name @@ -232,9 +311,9 @@ $* $* 'https://www.b2.org' >>EOO https @www.b2.org:0 name - [null] - [null] - [null] + + + EOO : encoded @@ -245,9 +324,9 @@ $* $* 'https://www.%62%32.org' >>EOO https @www.b2.org:0 name - [null] - [null] - [null] + + + EOO $* 'https://www.%62%3.org' 2>'invalid URL-encoding' != 0 : short @@ -256,6 +335,15 @@ $* } $* 'https://www.b|2.org' 2>'invalid host name' != 0 : invalid-char + + : normalize + : + { + test.options += -n + + $* -s 'http://Build2.org' >'http://build2.org' : lower-case-char + $* -w 'http://Build2.org' >'http://build2.org' : lower-case-wchar + } } $* 'http://admin@:80?q=' 2>'no host' != 0: no-host @@ -269,9 +357,9 @@ $* $* 'http://build2.org:443' >>EOO http @build2.org:443 name - [null] - [null] - [null] + + + EOO $* 'http://build2.org:-433' 2>'invalid port' != 0 : invalid-char @@ 
-288,9 +376,9 @@ $* $* 'http://b2.org' >>EOO http @b2.org:0 name - [null] - [null] - [null] + + + EOO : empty @@ -299,8 +387,8 @@ $* http @b2.org:0 name - [null] - [null] + + EOO : non-empty @@ -309,8 +397,8 @@ $* http @b2.org:0 name s/q - [null] - [null] + + EOO : encoded @@ -319,8 +407,8 @@ $* http @b2.org:0 name o/s - [null] - [null] + + EOO $* 'http:a/b/c' 2>'rootless path' != 0 : rootless-path @@ -335,10 +423,10 @@ $* : $* 'pkcs11:token=sign;object=SIGN%20key' >>EOO pkcs11 - [null] + token=sign;object=SIGN key - [null] - [null] + + EOO } @@ -352,7 +440,7 @@ $* @b2.org:0 name a x=foo&y=bar - [null] + EOO : fragment @@ -372,8 +460,8 @@ $* $* 'http://b2.org#foo' >>EOO http @b2.org:0 name - [null] - [null] + + foo EOO } @@ -389,10 +477,10 @@ $* : host : { - $* 'file:///a' >'file:///a' : empty - $* 'http://1.1.1.1' >'http://1.1.1.1' : ipv4 - $* 'https://[1:2:3]' >'https://[1:2:3]' : ipv6 - $* 'file://a%d1%84' >'file://a%D1%84' : name + $* 'file:///a' >'file:///a' : empty + $* 'https://[1:2:3:4:5:6:7:8]' >'https://[1:2:3:4:5:6:7:8]' : ipv6 + $* 'http://1.1.1.1' >'http://1.1.1.1' : ipv4 + $* 'file://a%d1%84' >'file://a%D1%84' : name } $* 'http://admin@localhost' >'http://admin@localhost' : user -- cgit v1.1