From 2d15efda5db161accd5f572fd4816885bce7c68c Mon Sep 17 00:00:00 2001
From: Karen Arutyunov <karen@codesynthesis.com>
Date: Thu, 23 Jun 2022 22:55:20 +0300
Subject: Split and merge manifest value/comment pair differently depending on
 whether it is multiline or not

---
 libbutl/manifest-parser.cxx          | 145 +++++++++++++++++++++++++++++------
 libbutl/manifest-serializer.cxx      |  91 +++++++++++++++++++---
 tests/manifest-parser/driver.cxx     |  65 +++++++++++++++-
 tests/manifest-roundtrip/driver.cxx  |  13 ++++
 tests/manifest-roundtrip/testscript  |  40 ++++++++++
 tests/manifest-serializer/driver.cxx |  29 ++++++-
 6 files changed, 340 insertions(+), 43 deletions(-)

diff --git a/libbutl/manifest-parser.cxx b/libbutl/manifest-parser.cxx
index 258a536..904910a 100644
--- a/libbutl/manifest-parser.cxx
+++ b/libbutl/manifest-parser.cxx
@@ -148,41 +148,136 @@ namespace butl
   {
     using iterator = string::const_iterator;
 
-    auto space = [] (char c) -> bool {return c == ' ' || c == '\t';};
+    // Parse the value differently depending on whether it is multi-line or
+    // not.
+    //
+    if (v.find ('\n') == string::npos) // Single-line.
+    {
+      auto space = [] (char c) {return c == ' ' || c == '\t';};
 
-    iterator i (v.begin ());
-    iterator e (v.end ());
+      iterator i (v.begin ());
+      iterator e (v.end ());
 
-    string r;
-    size_t n (0);
-    for (char c; i != e && (c = *i) != ';'; ++i)
-    {
-      // Unescape ';' character.
+      string r;
+      size_t n (0);
+      for (char c; i != e && (c = *i) != ';'; ++i)
+      {
+        // Unescape ';' and '\' characters.
+        //
+        if (c == '\\' && i + 1 != e && (*(i + 1) == ';' || *(i + 1) == '\\'))
+          c = *++i;
+
+        r += c;
+
+        if (!space (c))
+          n = r.size ();
+      }
+
+      // Strip the value trailing spaces.
       //
-      if (c == '\\' && i + 1 != e && *(i + 1) == ';')
-        c = *++i;
+      if (r.size () != n)
+        r.resize (n);
 
-      r += c;
+      // Find beginning of a comment (i).
+      //
+      if (i != e)
+      {
+        // Skip spaces.
+        //
+        for (++i; i != e && space (*i); ++i);
+      }
 
-      if (!space (c))
-        n = r.size ();
+      return make_pair (move (r), string (i, e));
     }
+    else // Multi-line.
+    {
+      string r;
+      string c;
 
-    // Strip the value trailing spaces.
-    //
-    if (r.size () != n)
-      r.resize (n);
+      // Parse the value lines until the comment separator is encountered or
+      // the end of the value is reached. Add these lines to the resulting
+      // value, unescaping them if required.
+      //
+      // Note that we only need to unescape lines which have the '\+;' form.
+      //
+      auto i (v.begin ());
+      auto e (v.end ());
 
-    // Find beginning of a comment (i).
-    //
-    if (i != e)
-    {
-      // Skip spaces.
+      while (i != e)
+      {
+        // Find the end of the line and while at it the first non-backslash
+        // character.
+        //
+        auto le (i);
+        auto nb (e);
+        for (; le != e && *le != '\n'; ++le)
+        {
+          if (nb == e && *le != '\\')
+            nb = le;
+        }
+
+        // If the value end is not reached then position to the beginning of
+        // the next line and to the end of the value otherwise.
+        //
+        auto next = [&i, &le, &e] () {i = (le != e ? le + 1 : e);};
+
+        // If the first non-backslash character is ';' and it is the last
+        // character on the line, then this is either the comment separator or
+        // an escape sequence.
+        //
+        if (nb != e && *nb == ';' && nb + 1 == le)
+        {
+          // If ';' is the first (and thus the only) character on the line,
+          // then this is the comment separator and we bail out from this
+          // loop. Note that in this case we need to trim the trailing newline
+          // (but only one) from the resulting value since it is considered as
+          // a part of the separator.
+          //
+          if (nb == i)
+          {
+            if (!r.empty ())
+            {
+              assert (r.back () == '\n');
+              r.pop_back ();
+            }
+
+            next ();
+            break;
+          }
+          //
+          // Otherwise, this is an escape sequence, so unescape it by dropping
+          // the leftmost half of the line (rounded down):
+          //
+          // \;     -> ;
+          // \\;    -> \;
+          // \\\;   -> \;
+          // \\\\;  -> \\;
+          // \\\\\; -> \\;
+          //
+          else
+            i += (le - i) / 2;
+        }
+
+        // Add the line to the resulting value together with the trailing
+        // newline, if present.
+        //
+        r.append (i, le);
+
+        if (le != e)
+          r += '\n';
+
+        next ();
+      }
+
+      // If we haven't reached the end of the value then it means we've
+      // encountered the comment separator. In this case save the remaining
+      // value part as a comment.
       //
-      for (++i; i != e && space (*i); ++i);
-    }
+      if (i != e)
+        c = string (i, e);
 
-    return make_pair (move (r), string (i, e));
+      return make_pair (move (r), move (c));
+    }
   }
 
   void manifest_parser::
diff --git a/libbutl/manifest-serializer.cxx b/libbutl/manifest-serializer.cxx
index b0d0324..26699e0 100644
--- a/libbutl/manifest-serializer.cxx
+++ b/libbutl/manifest-serializer.cxx
@@ -101,22 +101,89 @@ namespace butl
   merge_comment (const string& value, const string& comment)
   {
     string r;
-    for (char c: value)
+
+    // Merge the value and comment differently depending on whether either
+    // of them is multi-line or not.
+    //
+    if (value.find ('\n') == string::npos && // Single-line.
+        comment.find ('\n') == string::npos)
     {
-      // Escape ';' character.
-      //
-      if (c == ';')
-        r += '\\';
+      for (char c: value)
+      {
+        // Escape ';' and '\' characters.
+        //
+        if (c == ';' || c == '\\')
+          r += '\\';
 
-      r += c;
-    }
+        r += c;
+      }
 
-    // Add the comment.
-    //
-    if (!comment.empty ())
+      // Add the comment.
+      //
+      if (!comment.empty ())
+      {
+        r += "; ";
+        r += comment;
+      }
+    }
+    else // Multi-line.
     {
-      r += "; ";
-      r += comment;
+      // Parse the value lines and add them to the resulting value, escaping
+      // them if required.
+      //
+      // Note that we only need to escape lines which have the '\*;' form.
+      //
+      for (auto i (value.begin ()), e (value.end ()); i != e; )
+      {
+        // Find the end of the line and while at it the first non-backslash
+        // character.
+        //
+        auto le (i);
+        auto nb (e);
+        for (; le != e && *le != '\n'; ++le)
+        {
+          if (nb == e && *le != '\\')
+            nb = le;
+        }
+
+        // If the first non-backslash character is ';' and it is the last
+        // character on the line, then we need to escape the line characters.
+        // Note that we only escape ';' if it is the only character on the
+        // line. Otherwise, we only escape backslashes doubling the number of
+        // them from the left:
+        //
+        // ;    -> \;
+        // \;   -> \\;
+        // \\;  -> \\\\;
+        // \\\; -> \\\\\\;
+        //
+        if (nb != e && *nb == ';' && nb + 1 == le)
+          r.append (nb == i ? 1 : nb - i, '\\');
+
+        // Add the line to the resulting value together with the trailing
+        // newline, if present.
+        //
+        r.append (i, le);
+
+        if (le != e)
+          r += '\n';
+
+        // If the value end is not reached then position to the beginning of
+        // the next line and to the end of the value otherwise.
+        //
+        i = (le != e ? le + 1 : e);
+      }
+
+      // Append the comment, if present.
+      //
+      if (!comment.empty ())
+      {
+        if (!r.empty ())
+          r += '\n';
+
+        r += ";\n";
+        r += comment;
+      }
     }
 
     return r;
diff --git a/tests/manifest-parser/driver.cxx b/tests/manifest-parser/driver.cxx
index 6924321..56c614a 100644
--- a/tests/manifest-parser/driver.cxx
+++ b/tests/manifest-parser/driver.cxx
@@ -164,14 +164,18 @@ namespace butl
 
     // Manifest value splitting (into the value/comment pair).
     //
+    // Single-line.
+    //
     {
-      auto p (manifest_parser::split_comment ("value\\; text ; comment text"));
-      assert (p.first == "value; text" && p.second == "comment text");
+      auto p (manifest_parser::split_comment (
+                "\\value\\\\\\; text ; comment text"));
+
+      assert (p.first == "\\value\\; text" && p.second == "comment text");
     }
 
     {
-      auto p (manifest_parser::split_comment ("value"));
-      assert (p.first == "value" && p.second == "");
+      auto p (manifest_parser::split_comment ("value\\"));
+      assert (p.first == "value\\" && p.second == "");
     }
 
     {
@@ -179,6 +183,59 @@ namespace butl
       assert (p.first == "" && p.second == "comment");
     }
 
+    // Multi-line.
+    //
+    {
+      auto p (manifest_parser::split_comment ("value\n;"));
+      assert (p.first == "value" && p.second == "");
+    }
+
+    {
+      auto p (manifest_parser::split_comment ("value\ntext\n"));
+      assert (p.first == "value\ntext\n" && p.second == "");
+    }
+
+    {
+      auto p (manifest_parser::split_comment ("value\ntext\n;"));
+      assert (p.first == "value\ntext" && p.second == "");
+    }
+
+    {
+      auto p (manifest_parser::split_comment ("value\ntext\n;\n"));
+      assert (p.first == "value\ntext" && p.second == "");
+    }
+
+    {
+      auto p (manifest_parser::split_comment ("\n\\\nvalue\ntext\n"
+                                              ";\n"
+                                              "\n\n comment\ntext"));
+
+      assert (p.first == "\n\\\nvalue\ntext" && p.second ==
+              "\n\n comment\ntext");
+    }
+
+    {
+      auto p (manifest_parser::split_comment ("\n;\ncomment"));
+      assert (p.first == "" && p.second == "comment");
+    }
+
+    {
+      auto p (manifest_parser::split_comment (";\ncomment"));
+      assert (p.first == "" && p.second == "comment");
+    }
+
+    {
+      auto p (manifest_parser::split_comment (";\n"));
+      assert (p.first == "" && p.second == "");
+    }
+
+    {
+      auto p (manifest_parser::split_comment (
+                "\\;\n\\\\;\n\\\\\\;\n\\\\\\\\;\n\\\\\\\\\\;"));
+
+      assert (p.first == ";\n\\;\n\\;\n\\\\;\n\\\\;" && p.second == "");
+    }
+
     // UTF-8.
     //
     assert (test (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0",
diff --git a/tests/manifest-roundtrip/driver.cxx b/tests/manifest-roundtrip/driver.cxx
index 5dc5862..c63a729 100644
--- a/tests/manifest-roundtrip/driver.cxx
+++ b/tests/manifest-roundtrip/driver.cxx
@@ -22,11 +22,16 @@ using namespace butl;
 // -m
 //    Serialize multi-line manifest values using the v2 form.
 //
+// -s
+//    Split values into the value/comment pairs and merge them back before
+//    printing.
+//
 int
 main (int argc, const char* argv[])
 try
 {
   bool multiline_v2 (false);
+  bool split (false);
 
   for (int i (1); i != argc; ++i)
   {
@@ -34,6 +39,8 @@ try
 
     if (v == "-m")
       multiline_v2 = true;
+    else if (v == "-s")
+      split = true;
   }
 
   // Read/write in binary mode.
@@ -61,6 +68,12 @@ try
     else
       eom = false;
 
+    if (split)
+    {
+      const auto& vc (manifest_parser::split_comment (nv.value));
+      nv.value = manifest_serializer::merge_comment (vc.first, vc.second);
+    }
+
     s.next (nv.name, nv.value);
   }
 }
diff --git a/tests/manifest-roundtrip/testscript b/tests/manifest-roundtrip/testscript
index e0a15cc..a228b0f 100644
--- a/tests/manifest-roundtrip/testscript
+++ b/tests/manifest-roundtrip/testscript
@@ -76,3 +76,43 @@ $* -m <<EOF >>EOF
   c:\windows\\
   \
   EOF
+
+: split-merge-comment
+:
+$* -s <<EOF >>EOF
+  : 1
+  info:\
+  value
+  text
+  \
+  info:\
+  value
+  text
+  ;
+  comment
+  \
+  info:\
+  ;
+  comment
+  text
+  \
+  info:\
+  value
+  \;
+  \\
+  ;
+  comment
+  \
+  info:\
+  value
+  \\;
+  ;
+  comment
+  \
+  info:\
+  value
+  \\\\;
+  ;
+  comment
+  \
+  EOF
diff --git a/tests/manifest-serializer/driver.cxx b/tests/manifest-serializer/driver.cxx
index be3ae25..a003fa4 100644
--- a/tests/manifest-serializer/driver.cxx
+++ b/tests/manifest-serializer/driver.cxx
@@ -251,12 +251,37 @@ main ()
 
   // Manifest value/comment merging.
   //
-  assert (manifest_serializer::merge_comment ("value; text", "comment") ==
-          "value\\; text; comment");
+  // Single-line.
+  //
+  assert (manifest_serializer::merge_comment ("value\\; text", "comment") ==
+          "value\\\\\\; text; comment");
 
   assert (manifest_serializer::merge_comment ("value text", "") ==
           "value text");
 
+  // Multi-line.
+  //
+  assert (manifest_serializer::merge_comment ("value\n;\ntext", "comment") ==
+          "value\n\\;\ntext\n;\ncomment");
+
+  assert (manifest_serializer::merge_comment ("value\n\\;\ntext\n",
+                                              "comment") ==
+          "value\n\\\\;\ntext\n\n;\ncomment");
+
+  assert (manifest_serializer::merge_comment ("value\n\\\\;\ntext\n",
+                                              "comment") ==
+          "value\n\\\\\\\\;\ntext\n\n;\ncomment");
+
+
+  assert (manifest_serializer::merge_comment ("value\n\\\ntext", "comment") ==
+          "value\n\\\ntext\n;\ncomment");
+
+  assert (manifest_serializer::merge_comment ("\\", "comment\n") ==
+          "\\\n;\ncomment\n");
+
+  assert (manifest_serializer::merge_comment ("", "comment\ntext") ==
+          ";\ncomment\ntext");
+
   // Filtering.
   //
   assert (test ({{"","1"},{"a","abc"},{"b","bca"},{"c","cab"},{"",""},{"",""}},
-- 
cgit v1.1