From 78910e3cb0b9cc215e53142c28f8b9f52c30af60 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Fri, 3 Feb 2017 14:57:03 +0200 Subject: Implement path_match() and path_search() --- butl/filesystem | 53 +++++++ butl/filesystem.cxx | 369 ++++++++++++++++++++++++++++++++++++++++++++ butl/path | 2 +- tests/buildfile | 3 +- tests/wildcard/buildfile | 7 + tests/wildcard/driver.cxx | 105 +++++++++++++ tests/wildcard/testscript | 378 ++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 915 insertions(+), 2 deletions(-) create mode 100644 tests/wildcard/buildfile create mode 100644 tests/wildcard/driver.cxx create mode 100644 tests/wildcard/testscript diff --git a/butl/filesystem b/butl/filesystem index ef715c6..a125dab 100644 --- a/butl/filesystem +++ b/butl/filesystem @@ -24,6 +24,7 @@ #include // uint16_t #include // move(), pair #include +#include #include @@ -396,6 +397,58 @@ namespace butl // inline dir_iterator begin (dir_iterator&); inline dir_iterator end (const dir_iterator&); + + // Wildcard pattern match and search (aka glob). + // + + // Return true if name matches pattern. Both must be single path components, + // possibly with a trailing directory separator to indicate a directory. + // + // If the pattern ends with a directory separator, then it only matches a + // directory name (i.e., ends with a directory separator, but potentially + // different). Otherwise, it only matches a non-directory name (no trailing + // directory separator). + // + // Currently the following wildcard characters are supported: + // + // * - match any number of characters (including zero) + // ? - match any single character + // + LIBBUTL_EXPORT bool + path_match (const std::string& pattern, const std::string& name); + + // Search for paths matching the pattern calling the specified function for + // each matching path. Stop the search if the function returns false. + // + // If the pattern is relative, then search in the start directory. If the + // start directory is empty, then search in the current working directory. + // Searching in non-existent directories is not an error. Throw + // std::system_error in case of a failure (insufficient permissions, etc). + // + // The pattern may contain multiple components that include wildcards. On + // Windows the drive letter may not be a wildcard. + // + // In addition to the wildcard characters listed in path_match(), + // path_search() also recognizes the ** and *** wildcard sequences. If a + // path component contains **, then it is matched just like * but in all the + // subdirectories, recursively. The *** wildcard behaves like ** but also + // matches the start directory itself. + // + // So, for example, foo/bar-**.txt will return all the files matching the + // bar-*.txt pattern in all the subdirectoris of foo/. And foo/f***/ will + // return all the subdirectories matching the f*/ pattern plus foo/ itself. + // + // Note that having multiple recursive components in the pattern we can end + // up with calling func() multiple times (once per such a component) for the + // same path. For example the search with pattern f***/b**/ starting in + // directory foo, that has the foo/fox/box/ structure, will result in + // calling func(foo/fox/box/) twice: first time for being a child of fox/, + // second time for being a child of foo/. + // + LIBBUTL_EXPORT void + path_search (const path& pattern, + const std::function&, + const dir_path& start = dir_path ()); } #include diff --git a/butl/filesystem.cxx b/butl/filesystem.cxx index e7e54a8..1553324 100644 --- a/butl/filesystem.cxx +++ b/butl/filesystem.cxx @@ -25,10 +25,16 @@ #include // errno, E* +#include +#include #include // unique_ptr +#include // pair +#include // reverse_iterator #include +#include #include +#include using namespace std; @@ -728,4 +734,367 @@ namespace butl } } #endif + + // Match the name [ni, ne) to the pattern [pi, pe). Ranges can be empty. + // + static bool + match (string::const_iterator pi, string::const_iterator pe, + string::const_iterator ni, string::const_iterator ne) + { + using reverse_iterator = std::reverse_iterator; + + reverse_iterator rpi (pe); + reverse_iterator rpe (pi); + + reverse_iterator rni (ne); + reverse_iterator rne (ni); + + // Match the pattern suffix (follows the last *) to the name trailing + // characters. + // + char pc; + for (; rpi != rpe && (pc = *rpi) != '*' && rni != rne; ++rpi, ++rni) + { + if (*rni != pc && pc != '?') + return false; + } + + // If we got to the (reversed) end of the pattern (no * is encountered) + // than we are done. The success depends on if we got to the (reversed) end + // of the name as well. + // + if (rpi == rpe) + return rni == rne; + + // If we didn't reach * in the pattern then we reached the (reversed) end + // of the name. That means we have unmatched non-star characters in the + // pattern, and so match failed. + // + if (pc != '*') + { + assert (rni == rne); + return false; + } + + // Match the pattern prefix (ends with the first *) to the name leading + // characters. If they mismatch we failed. Otherwise if this is an only * + // in the pattern (matches whatever is left in the name) then we succeed, + // otherwise we perform backtracking (recursively). + // + pe = rpi.base (); + ne = rni.base (); + + // Compare the pattern and the name char by char until the name suffix or + // * is encountered in the pattern (whichever happens first). Fail if a + // char mismatches. + // + for (; (pc = *pi) != '*' && ni != ne; ++pi, ++ni) + { + if (*ni != pc && pc != '?') + return false; + } + + // If we didn't get to * in the pattern then we got to the name suffix. + // That means that the pattern has unmatched non-star characters, and so + // match failed. + // + if (pc != '*') + { + assert (ni == ne); + return false; + } + + // If * that we have reached is the last one, then it matches whatever is + // left in the name (including an empty range). + // + if (++pi == pe) + return true; + + // Perform backtracking. + // + // From now on, we will call the pattern not-yet-matched part (starting + // the leftmost * and ending the rightmost one inclusively) as pattern, and + // the name not-yet-matched part as name. + // + // Here we sequentially assume that * that starts the pattern matches the + // name leading part (staring from an empty one and iterating till the full + // name). So if, at some iteration, the pattern trailing part (that follows + // the leftmost *) matches the name trailing part, then the pattern matches + // the name. + // + bool r; + for (; !(r = match (pi, pe, ni, ne)) && ni != ne; ++ni) ; + return r; + } + + bool + path_match (const string& pattern, const string& name) + { + // Implementation notes: + // + // - This has a good potential of becoming hairy quickly so need to strive + // for an elegant way to implement this. + // + // - Most patterns will contains a single * wildcard with a prefix and/or + // suffix (e.g., *.txt, foo*, f*.txt). Something like this is not very + // common: *foo*. + // + // So it would be nice to have a clever implementation that first + // "anchors" itself with a literal prefix and/or suffix and only then + // continue with backtracking. In other words, reduce: + // + // *.txt vs foo.txt -> * vs foo + // foo* vs foo.txt -> * vs .txt + // f*.txt vs foo.txt -> * vs oo + // + + auto pi (pattern.rbegin ()); + auto pe (pattern.rend ()); + + auto ni (name.rbegin ()); + auto ne (name.rend ()); + + // The name doesn't match the pattern if it is of a different type than the + // pattern is. + // + bool pd (pi != pe && path::traits::is_separator (*pi)); + bool nd (ni != ne && path::traits::is_separator (*ni)); + + if (pd != nd) + return false; + + // Skip trailing separators if present. + // + if (pd) + { + ++pi; + ++ni; + } + + return match (pattern.begin (), pi.base (), name.begin (), ni.base ()); + } + + // Iterate over directory sub-entries, recursively and including itself if + // requested. Note that recursive iterating goes depth-first which make + // sense for the cleanup use cases (@@ maybe this should be controllable + // since for directory creation it won't make sense). + // + // Note that iterating over non-existent directory is not en error. The + // subsequent next() call returns false for such a directory. + // + class recursive_dir_iterator + { + public: + recursive_dir_iterator (dir_path p, bool recursive, bool self) + : recursive_ (recursive), self_ (self), start_ (p) + { + open (dir_path ()); + } + + // Non-copyable, non-movable type. + // + recursive_dir_iterator (const recursive_dir_iterator&) = delete; + recursive_dir_iterator& operator= (const recursive_dir_iterator&) = delete; + + // Return false if no more entries left. Otherwise save the next entry path + // and return true. The path is relative against the directory being + // traversed and contains a trailing separator for sub-directories. Throw + // std::system_error in case of a failure (insufficient permissions, + // dangling symlink encountered, etc). + // + bool + next (path& p) + { + if (iters_.empty ()) + return false; + + auto& i (iters_.back ()); + + // If we got to the end of directory sub-entries, then go one level up + // and return this directory path. + // + if (i.first == dir_iterator ()) + { + path d (move (i.second)); + iters_.pop_back (); + + // Return the path unless it is the last one (the directory we started + // to iterate from) and the self flag is not set. + // + if (iters_.empty () && !self_) + return false; + + p = move (d); + return true; + } + + const dir_entry& de (*i.first); + + // Append separator if a directory. Note that dir_entry::type() can + // throw. + // + path pe (de.type () == entry_type::directory + ? path_cast (i.second / de.path ()) + : i.second / de.path ()); + + ++i.first; + + if (recursive_ && pe.to_directory ()) + { + open (path_cast (move (pe))); + return next (p); + } + + p = move (pe); + return true; + } + + private: + void + open (dir_path p) + { + // We should consider a racing condition here. The directory can be + // removed before we create an iterator for it. In this case we just do + // nothing, so the directory is silently skipped. + // + try + { + dir_path d (start_ / p); + dir_iterator i (!d.empty () ? d : dir_path (".")); + iters_.emplace_back (move (i), move (p)); + } + catch (const system_error& e) + { + // Ignore non-existent directory (ENOENT or ENOTDIR). Rethrow for any + // other error. We consider ENOTDIR as a variety of removal, with a + // new filesystem entry being created afterwards. + // + int ec (e.code ().value ()); + if (ec != ENOENT && ec != ENOTDIR) + throw; + } + } + + private: + bool recursive_; + bool self_; + dir_path start_; + small_vector, 1> iters_; + }; + + // Search for paths matching the pattern and call the specified function for + // each matching path. Return false if the underlying func() call returns + // false. Otherwise the function conforms to the path_search() description. + // + static bool + search (path pattern, dir_path pattern_dir, const dir_path start_dir, + const function& func) + { + // Fast-forward the leftmost pattern non-wildcard components. So, for + // example, search for foo/f* in /bar/ becomes search for f* in /bar/foo/. + // + { + auto b (pattern.begin ()); + auto e (pattern.end ()); + auto i (b); + for (; i != e && (*i).find_first_of ("*?") == string::npos; ++i) ; + + // If the pattern has no wildcards then we reduce to checking for the + // filesystem entry existence. It matches if exists and is of the proper + // type. + // + if (i == e) + { + path p (pattern_dir / pattern); + auto pe (path_entry (start_dir / p, true)); + + if (pe.first && + ((pe.second == entry_type::directory) == p.to_directory ())) + return func (move (p)); + + return true; + } + else if (i != b) // There are non-wildcard components, so fast-forward. + { + path p (b, i); + pattern = pattern.leaf (p); + pattern_dir /= path_cast (move (p)); + } + } + + assert (!pattern.empty ()); + + // The pattern leftmost component. Will use it to match the start directory + // sub-entries. + // + path pc (pattern.begin (), ++pattern.begin ()); + string pcr (pc.representation ()); + + // Note that if the pattern has multiple components (is not a simple path), + // then the leftmost one has a trailing separator, and so will match + // sub-directories only. + // + bool simple (pattern.simple ()); + + recursive_dir_iterator i ( + start_dir / pattern_dir, + pcr.find ("**") != string::npos, // Recursive. + pcr.find ("***") != string::npos); // Self-inclusive. + + path p; + while (i.next (p)) + { + // Skip sub-entry if its name doesn't match the pattern leftmost + // component. + // + // Matching the directory we are iterating through (as for a pattern + // component containing ***) is a bit tricky. This directory is + // represented by the iterator as an empty path, and so we need to + // compute it (the leaf would actually be enough) for matching. This + // leaf can be aquired from the pattern_dir / start_dir path except the + // case when both directories are empty. This is the case when we search + // in the current directory (start_dir is empty) with a pattern that + // starts with *** wildcard (for example f***/bar). All we can do here is + // to fallback to path::current_directory() call. Note that this will be + // the only call per path_search() as the next time pattern_dir will not + // be empty. + // + const path& se (!p.empty () + ? p + : path_cast (!pattern_dir.empty () + ? pattern_dir + : !start_dir.empty () + ? start_dir + : path::current_directory ())); + + if (!path_match (pcr, se.leaf ().representation ())) + continue; + + // If the pattern is a simple path then call func() for the sub-entry. + // Otherwise the sub-entry is a directory (read above), and we search in + // it using the trailing part of the pattern. + // + if (!( + simple + ? func (pattern_dir / p) + : search (pattern.leaf (pc), + pattern_dir / path_cast (move (p)), + start_dir, + func))) + return false; + } + + return true; + } + + void + path_search (const path& pattern, + const function& func, + const dir_path& start) + { + search (pattern, + dir_path (), + pattern.relative () ? start : dir_path (), + func); + } } diff --git a/butl/path b/butl/path index a81848b..19e3d42 100644 --- a/butl/path +++ b/butl/path @@ -27,7 +27,7 @@ namespace butl // p -= "*/"; // leaf // p -= ".*"; // base // - // - Faster normalize() implementation. In many cases (e.g., in buil2) + // - Faster normalize() implementation. In many cases (e.g., in build2) // the path is either already normal or the difference is just slashes // (i.e., there are no '.' or '..' components). So a fast path case // might be in order. diff --git a/tests/buildfile b/tests/buildfile index 8608ed9..6f5dd03 100644 --- a/tests/buildfile +++ b/tests/buildfile @@ -4,7 +4,8 @@ d = base64/ cpfile/ dir-iterator/ fdstream/ link/ manifest-parser/ \ manifest-serializer/ manifest-roundtrip/ pager/ path/ prefix-map/ \ - process/ sha256/ small-vector/ strcase/ timestamp/ target-triplet/ + process/ sha256/ small-vector/ strcase/ timestamp/ target-triplet/ \ + wildcard/ ./: $d include $d diff --git a/tests/wildcard/buildfile b/tests/wildcard/buildfile new file mode 100644 index 0000000..7de67f4 --- /dev/null +++ b/tests/wildcard/buildfile @@ -0,0 +1,7 @@ +# file : tests/wildcard/buildfile +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +exe{driver}: cxx{driver} ../../butl/lib{butl} test{testscript} + +include ../../butl/ diff --git a/tests/wildcard/driver.cxx b/tests/wildcard/driver.cxx new file mode 100644 index 0000000..7df5556 --- /dev/null +++ b/tests/wildcard/driver.cxx @@ -0,0 +1,105 @@ +// file : tests/wildcard/driver.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include +#include +#include +#include +#include // sort() +#include + +#include +#include // operator<<(ostream, exception) +#include + +using namespace std; +using namespace butl; + +// Usage: argv[0] (-m | -s [-n] []) +// +// Execute actions specified by -m or -s options. Exit with code 0 if succeed, +// 1 if fail, 2 on the underlying OS error (print error description to STDERR). +// +// -m +// Match a name against the pattern. +// +// -s +// Search for paths matching the pattern in the directory specified (absent +// directory means the current one). Print the matching canonicalized paths +// to STDOUT in the ascending order. Succeed if at least one matching path +// is found. Note that this option must go first in the command line, +// +// -n +// Do not sort paths found. +// +int +main (int argc, const char* argv[]) +try +{ + assert (argc >= 2); + + string op (argv[1]); + bool match (op == "-m"); + assert (match || op == "-s"); + + if (match) + { + assert (argc == 4); + + string pattern (argv[2]); + string name (argv[3]); + return path_match (pattern, name) ? 0 : 1; + } + else + { + assert (argc >= 3); + + bool sort (true); + int i (2); + for (; i != argc; ++i) + { + string o (argv[i]); + if (o == "-n") + sort = false; + else + break; // End of options. + } + + assert (i != argc); // Still need pattern. + path pattern (argv[i++]); + + dir_path start; + if (i != argc) + start = dir_path (argv[i++]); + + assert (i == argc); // All args parsed, + + vector paths; + auto add = [&paths] (path&& p) -> bool + { + paths.emplace_back (move (p.canonicalize ())); + return true; + }; + + path_search (pattern, add, start); + + if (sort) + std::sort (paths.begin (), paths.end ()); + + for (const auto& p: paths) + cout << p.representation () << endl; + + return paths.empty () ? 1 : 0; + } +} +catch (const invalid_path& e) +{ + cerr << e << ": " << e.path << endl; + return 2; +} +catch (const exception& e) +{ + cerr << e << endl; + return 2; +} diff --git a/tests/wildcard/testscript b/tests/wildcard/testscript new file mode 100644 index 0000000..bcd618f --- /dev/null +++ b/tests/wildcard/testscript @@ -0,0 +1,378 @@ +# file : tests/wildcard/testscript +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +: path-match +: +{ + test.options = -m + + $* foo/ foo == 1 : dir-vs-file + $* foo foo/ == 1 : file-vs-dir + + : no-star + : + { + : match + : + { + $* foo/ foo/ : dir + $* foo foo : file + $* f?o foo : qmark + $* '' '' : empty + } + + : no-match + : + { + $* oo foo == 1 : name-char + $* foo oo == 1 : pattern-char + } + } + + : with-star + : + { + : no-backtracking + : + { + : match + : + { + $* * '' : empty + $* *.txt foo.txt : suffix-only + $* foo* foo.txt : prefix-only + $* f*.txt foo.txt : prefix-suffix + } + + : no-match + : + { + $* fox* foo.txt == 1 : pattern-prefix-char + $* foo* fo == 1 : short-name + } + } + + : backtracking + : + { + : match + : + { + $* ** '' : empty1 + $* *** '' : empty2 + $* f*.* f.txt : empty3 + $* f**.* f.txt : empty4 + $* f*.* foo.txt : non-empty + $* f*?* foo-txt : qmark + } + + : no-match + : + { + $* f*.* foo-txt == 1 + } + } + } +} + +: path-search +: +{ + test.options = -s + + : start + : + { + : empty + : + $* * >>/EOO + stderr + stdout + EOO + + : relative + : + mkdir -p foo/fox/bar; + $* ba*/ foo/fox >>/EOO + bar/ + EOO + + : absolute + : + : When cross-testing we can't guarantee that host absolute paths are + : recognized by the target process. + : + if ($test.target == $build.host) + { + wd = $~ + +mkdir -p foo/bar + + : dir + : + $* ba*/ "$wd/foo" >>/EOO + bar/ + EOO + + : pattern + : + $* "$wd/foo/ba*/" >>/"EOO" + $wd/foo/bar/ + EOO + } + } + + : pattern + : + { + : simple + : + { + : file + : + { + +mkdir -p foo/bar + +touch foo/baz foo/box foo/bar/bar foo/bar/fox + + : immediate + : + $* b* ../foo >>EOO + baz + box + EOO + + : recursive + : + $* ba** ../foo >>/EOO + bar/bar + baz + EOO + + : self-recursive + : + { + : start + : + $* f*** ../../foo >>/EOO + bar/fox + EOO + + : current + : + mkdir -p bar/fox; + touch bar/fox/cox; + $* c*** >>/EOO + bar/fox/cox + EOO + } + } + + : dir + : + { + +mkdir -p foo/bar/bar foo/bar/fox/ + +touch foo/baz + + : immediate + : + $* b*/ ../foo >>/EOO + bar/ + EOO + + : recursive + : + $* -n b**/ ../foo >>/EOO + bar/bar/ + bar/ + EOO + + : self-recursive + : + { + : start + : + : Note that the start dir is represented as an empty path being + : found. + : + $* f***/ ../../foo >>/EOO + + bar/fox/ + EOO + + : current + : + mkdir -p bar/cox/box/; + $* c***/ >>/EOO + + bar/cox/ + EOO + } + } + } + + : compound + : + { + : file + : + { + +mkdir -p foo fox fix/bar baz/foo/zab baz/foo/zab/baz + +touch foo/bar foo/fox fox/baz baz/foo/zab/bar + + : immediate + : + $* f*/b* .. >>/EOO + foo/bar + fox/baz + EOO + + : recursive + : + $* f**/b** .. >>/EOO + baz/foo/zab/bar + foo/bar + fox/baz + EOO + + : self-recursive + : + { + : pattern + : + $* foo/f*** ../.. >>/EOO + foo/fox + EOO + + : start + : + $* f*** ../../foo >>/EOO + fox + EOO + + : current + : + mkdir -p bar; + touch bar/cox; + $* c*** >>/EOO + bar/cox + EOO + } + } + + : dir + : + { + +mkdir -p foo/bar foo/fox/box fox/baz fix baz/foo/zab/baz + +touch fix/bar baz/foo/zab/bar + + : immediate + : + $* f*/b*/ .. >>/EOO + foo/bar/ + fox/baz/ + EOO + + : recursive + : + $* f**/b**/ .. >>/EOO + baz/foo/zab/baz/ + foo/bar/ + foo/fox/box/ + foo/fox/box/ + fox/baz/ + EOO + + : self-recursive + : + { + : pattern + : + $* foo/f***/b**/ ../.. >>/EOO + foo/bar/ + foo/fox/box/ + foo/fox/box/ + EOO + + : start + : + $* f***/b**/ ../../foo >>/EOO + bar/ + fox/box/ + fox/box/ + EOO + + : current + : + mkdir -p bar/cox/box/; + $* c***/b**/ >>/EOO + bar/ + bar/cox/box/ + bar/cox/box/ + EOO + } + } + } + + : fast-forward + : + { + +mkdir -p foo/bar/baz foo/box + +touch foo/bar/baz/fox + + : partial + : + { + wd = ../.. + + : file + : + $* foo/ba*/baz/f* $wd >>/EOO + foo/bar/baz/fox + EOO + + : dir + : + $* foo/b*/baz/ $wd >>/EOO + foo/bar/baz/ + EOO + } + + : reduce + : + { + wd = ../../.. + + : exists + : + { + : file + : + $* fo?/bar/baz/fox $wd >>/EOO + foo/bar/baz/fox + EOO + + : dir + : + $* fo?/bar/baz/ $wd >>/EOO + foo/bar/baz/ + EOO + + : parent + : + $* fo?/box/../bar/baz/fox $wd >>/EOO + foo/box/../bar/baz/fox + EOO + } + + : not-exists + : + { + $* fo?/bar/baz/foz $wd == 1 : file + $* fo?/bar/bax/ $wd == 1 : dir + $* fo?/bar/baz/fox/ $wd == 1 : not-dir + $* fo?/bix/../bar/baz/foz $wd == 1 : parent + } + } + } + } +} -- cgit v1.1