From 2d15efda5db161accd5f572fd4816885bce7c68c Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Thu, 23 Jun 2022 22:55:20 +0300 Subject: Split and merge manifest value/comment pair differently depending on whether it is multiline or not --- libbutl/manifest-parser.cxx | 145 +++++++++++++++++++++++++++++------ libbutl/manifest-serializer.cxx | 91 +++++++++++++++++++--- tests/manifest-parser/driver.cxx | 65 +++++++++++++++- tests/manifest-roundtrip/driver.cxx | 13 ++++ tests/manifest-roundtrip/testscript | 40 ++++++++++ tests/manifest-serializer/driver.cxx | 29 ++++++- 6 files changed, 340 insertions(+), 43 deletions(-) diff --git a/libbutl/manifest-parser.cxx b/libbutl/manifest-parser.cxx index 258a536..904910a 100644 --- a/libbutl/manifest-parser.cxx +++ b/libbutl/manifest-parser.cxx @@ -148,41 +148,136 @@ namespace butl { using iterator = string::const_iterator; - auto space = [] (char c) -> bool {return c == ' ' || c == '\t';}; + // Parse the value differently depending on whether it is multi-line or + // not. + // + if (v.find ('\n') == string::npos) // Single-line. + { + auto space = [] (char c) {return c == ' ' || c == '\t';}; - iterator i (v.begin ()); - iterator e (v.end ()); + iterator i (v.begin ()); + iterator e (v.end ()); - string r; - size_t n (0); - for (char c; i != e && (c = *i) != ';'; ++i) - { - // Unescape ';' character. + string r; + size_t n (0); + for (char c; i != e && (c = *i) != ';'; ++i) + { + // Unescape ';' and '\' characters. + // + if (c == '\\' && i + 1 != e && (*(i + 1) == ';' || *(i + 1) == '\\')) + c = *++i; + + r += c; + + if (!space (c)) + n = r.size (); + } + + // Strip the value trailing spaces. // - if (c == '\\' && i + 1 != e && *(i + 1) == ';') - c = *++i; + if (r.size () != n) + r.resize (n); - r += c; + // Find beginning of a comment (i). + // + if (i != e) + { + // Skip spaces. 
+ // + for (++i; i != e && space (*i); ++i); + } - if (!space (c)) - n = r.size (); + return make_pair (move (r), string (i, e)); } + else // Multi-line. + { + string r; + string c; - // Strip the value trailing spaces. - // - if (r.size () != n) - r.resize (n); + // Parse the value lines until the comment separator is encountered or + // the end of the value is reached. Add these lines to the resulting + // value, unescaping them if required. + // + // Note that we only need to unescape lines which have the '\+;' form. + // + auto i (v.begin ()); + auto e (v.end ()); - // Find beginning of a comment (i). - // - if (i != e) - { - // Skip spaces. + while (i != e) + { + // Find the end of the line and while at it the first non-backslash + // character. + // + auto le (i); + auto nb (e); + for (; le != e && *le != '\n'; ++le) + { + if (nb == e && *le != '\\') + nb = le; + } + + // If the value end is not reached then position to the beginning of + // the next line and to the end of the value otherwise. + // + auto next = [&i, &le, &e] () {i = (le != e ? le + 1 : e);}; + + // If the first non-backslash character is ';' and it is the last + // character on the line, then this is either the comment separator or + // an escape sequence. + // + if (nb != e && *nb == ';' && nb + 1 == le) + { + // If ';' is the first (and thus the only) character on the line, + // then this is the comment separator and we bail out from this + // loop. Note that in this case we need to trim the trailing newline + // (but only one) from the resulting value since it is considered as + // a part of the separator. + // + if (nb == i) + { + if (!r.empty ()) + { + assert (r.back () == '\n'); + r.pop_back (); + } + + next (); + break; + } + // + // Otherwise, this is an escape sequence, so unescape it. 
For that + // just take the rightmost half of the string: + // + // \; -> ; + // \\; -> \; + // \\\; -> \; + // \\\\; -> \\; + // \\\\\; -> \\; + // + else + i += (le - i) / 2; + } + + // Add the line to the resulting value together with the trailing + // newline, if present. + // + r.append (i, le); + + if (le != e) + r += '\n'; + + next (); + } + + // If we haven't reached the end of the value then it means we've + // encountered the comment separator. In this case save the remaining + // value part as a comment. // - for (++i; i != e && space (*i); ++i); - } + if (i != e) + c = string (i, e); - return make_pair (move (r), string (i, e)); + return make_pair (move (r), move (c)); + } } void manifest_parser:: diff --git a/libbutl/manifest-serializer.cxx b/libbutl/manifest-serializer.cxx index b0d0324..26699e0 100644 --- a/libbutl/manifest-serializer.cxx +++ b/libbutl/manifest-serializer.cxx @@ -101,22 +101,89 @@ namespace butl merge_comment (const string& value, const string& comment) { string r; - for (char c: value) + + // Merge the value and comment differently depending on whether any of + // them is multi-line or not. + // + if (value.find ('\n') == string::npos && // Single-line. + comment.find ('\n') == string::npos) { - // Escape ';' character. - // - if (c == ';') - r += '\\'; + for (char c: value) + { + // Escape ';' and '\' characters. + // + if (c == ';' || c == '\\') + r += '\\'; - r += c; - } + r += c; + } - // Add the comment. - // - if (!comment.empty ()) + // Add the comment. + // + if (!comment.empty ()) + { + r += "; "; + r += comment; + } + } + else // Multi-line. { - r += "; "; - r += comment; + // Parse the value lines and add them to the resulting value, escaping + // them if required. + // + // Note that we only need to escape lines which have the '\*;' form. + // + for (auto i (value.begin ()), e (value.end ()); i != e; ) + { + // Find the end of the line and while at it the first non-backslash + // character. 
+ // + auto le (i); + auto nb (e); + for (; le != e && *le != '\n'; ++le) + { + if (nb == e && *le != '\\') + nb = le; + } + + // If the first non-backslash character is ';' and it is the last + // character on the line, then we need to escape the line characters. + // Note that we only escape ';' if it is the only character on the + // line. Otherwise, we only escape backslashes doubling the number of + // them from the left: + // + // ; -> \; + // \; -> \\; + // \\; -> \\\\; + // \\\; -> \\\\\\; + // + if (nb != e && *nb == ';' && nb + 1 == le) + r.append (nb == i ? 1 : nb - i, '\\'); + + // Add the line to the resulting value together with the trailing + // newline, if present. + // + r.append (i, le); + + if (le != e) + r += '\n'; + + // If the value end is not reached then position to the beginning of + // the next line and to the end of the value otherwise. + // + i = (le != e ? le + 1 : e); + } + + // Append the comment, if present. + // + if (!comment.empty ()) + { + if (!r.empty ()) + r += '\n'; + + r += ";\n"; + r += comment; + } } return r; diff --git a/tests/manifest-parser/driver.cxx b/tests/manifest-parser/driver.cxx index 6924321..56c614a 100644 --- a/tests/manifest-parser/driver.cxx +++ b/tests/manifest-parser/driver.cxx @@ -164,14 +164,18 @@ namespace butl // Manifest value splitting (into the value/comment pair). // + // Single-line. 
+ // { - auto p (manifest_parser::split_comment ("value\\; text ; comment text")); - assert (p.first == "value; text" && p.second == "comment text"); + auto p (manifest_parser::split_comment ( + "\\value\\\\\\; text ; comment text")); + + assert (p.first == "\\value\\; text" && p.second == "comment text"); } { - auto p (manifest_parser::split_comment ("value")); - assert (p.first == "value" && p.second == ""); + auto p (manifest_parser::split_comment ("value\\")); + assert (p.first == "value\\" && p.second == ""); } { @@ -179,6 +183,59 @@ namespace butl assert (p.first == "" && p.second == "comment"); } + // Multi-line. + // + { + auto p (manifest_parser::split_comment ("value\n;")); + assert (p.first == "value" && p.second == ""); + } + + { + auto p (manifest_parser::split_comment ("value\ntext\n")); + assert (p.first == "value\ntext\n" && p.second == ""); + } + + { + auto p (manifest_parser::split_comment ("value\ntext\n;")); + assert (p.first == "value\ntext" && p.second == ""); + } + + { + auto p (manifest_parser::split_comment ("value\ntext\n;\n")); + assert (p.first == "value\ntext" && p.second == ""); + } + + { + auto p (manifest_parser::split_comment ("\n\\\nvalue\ntext\n" + ";\n" + "\n\n comment\ntext")); + + assert (p.first == "\n\\\nvalue\ntext" && p.second == + "\n\n comment\ntext"); + } + + { + auto p (manifest_parser::split_comment ("\n;\ncomment")); + assert (p.first == "" && p.second == "comment"); + } + + { + auto p (manifest_parser::split_comment (";\ncomment")); + assert (p.first == "" && p.second == "comment"); + } + + { + auto p (manifest_parser::split_comment (";\n")); + assert (p.first == "" && p.second == ""); + } + + { + auto p (manifest_parser::split_comment ( + "\\;\n\\\\;\n\\\\\\;\n\\\\\\\\;\n\\\\\\\\\\;")); + + assert (p.first == ";\n\\;\n\\;\n\\\\;\n\\\\;" && p.second == ""); + } + // UTF-8. 
// assert (test (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0", diff --git a/tests/manifest-roundtrip/driver.cxx b/tests/manifest-roundtrip/driver.cxx index 5dc5862..c63a729 100644 --- a/tests/manifest-roundtrip/driver.cxx +++ b/tests/manifest-roundtrip/driver.cxx @@ -22,11 +22,16 @@ using namespace butl; // -m // Serialize multi-line manifest values using the v2 form. // +// -s +// Split values into the value/comment pairs and merge them back before +// printing. +// int main (int argc, const char* argv[]) try { bool multiline_v2 (false); + bool split (false); for (int i (1); i != argc; ++i) { @@ -34,6 +39,8 @@ try if (v == "-m") multiline_v2 = true; + else if (v == "-s") + split = true; } // Read/write in binary mode. @@ -61,6 +68,12 @@ try else eom = false; + if (split) + { + const auto& vc (manifest_parser::split_comment (nv.value)); + nv.value = manifest_serializer::merge_comment (vc.first, vc.second); + } + s.next (nv.name, nv.value); } } diff --git a/tests/manifest-roundtrip/testscript b/tests/manifest-roundtrip/testscript index e0a15cc..a228b0f 100644 --- a/tests/manifest-roundtrip/testscript +++ b/tests/manifest-roundtrip/testscript @@ -76,3 +76,43 @@ $* -m <<EOF >>EOF c:\windows\\ \ EOF + +: split-merge-comment +: +$* -s <<EOF >>EOF + : 1 + info:\ + value + text + \ + info:\ + value + text + ; + comment + \ + info:\ + ; + comment + text + \ + info:\ + value + \; + \\ + ; + comment + \ + info:\ + value + \\; + ; + comment + \ + info:\ + value + \\\\; + ; + comment + \ + EOF diff --git a/tests/manifest-serializer/driver.cxx b/tests/manifest-serializer/driver.cxx index be3ae25..a003fa4 100644 --- a/tests/manifest-serializer/driver.cxx +++ b/tests/manifest-serializer/driver.cxx @@ -251,12 +251,37 @@ main () // Manifest value/comment merging. // - assert (manifest_serializer::merge_comment ("value; text", "comment") == - "value\\; text; comment"); + // Single-line. 
+ // + assert (manifest_serializer::merge_comment ("value\\; text", "comment") == + "value\\\\\\; text; comment"); assert (manifest_serializer::merge_comment ("value text", "") == "value text"); + // Multi-line. + // + assert (manifest_serializer::merge_comment ("value\n;\ntext", "comment") == + "value\n\\;\ntext\n;\ncomment"); + + assert (manifest_serializer::merge_comment ("value\n\\;\ntext\n", + "comment") == + "value\n\\\\;\ntext\n\n;\ncomment"); + + assert (manifest_serializer::merge_comment ("value\n\\\\;\ntext\n", + "comment") == + "value\n\\\\\\\\;\ntext\n\n;\ncomment"); + + + assert (manifest_serializer::merge_comment ("value\n\\\ntext", "comment") == + "value\n\\\ntext\n;\ncomment"); + + assert (manifest_serializer::merge_comment ("\\", "comment\n") == + "\\\n;\ncomment\n"); + + assert (manifest_serializer::merge_comment ("", "comment\ntext") == + ";\ncomment\ntext"); + // Filtering. // assert (test ({{"","1"},{"a","abc"},{"b","bca"},{"c","cab"},{"",""},{"",""}}, -- cgit v1.1