diff options
author | Karen Arutyunov <karen@codesynthesis.com> | 2017-06-07 17:18:54 +0300 |
---|---|---|
committer | Karen Arutyunov <karen@codesynthesis.com> | 2017-06-08 19:57:50 +0300 |
commit | 04f4bb3e2492bb6e4b0769b4c7f020493cfa5cc4 (patch) | |
tree | 9ab6bf532ea9a906fa9d82570dbd8dd63e582f0a | |
parent | 3ea3b1c94f320bb55849d4a9fea1c3009ab3b298 (diff) |
Add path_match() and path_search() overloads
-rw-r--r-- | libbutl/filesystem.cxx | 491 | ||||
-rw-r--r-- | libbutl/filesystem.hxx | 42 | ||||
-rw-r--r-- | tests/wildcard/driver.cxx | 55 |
3 files changed, 447 insertions, 141 deletions
diff --git a/libbutl/filesystem.cxx b/libbutl/filesystem.cxx index 18b4d22..3002c93 100644 --- a/libbutl/filesystem.cxx +++ b/libbutl/filesystem.cxx @@ -1097,6 +1097,192 @@ namespace butl return match (pattern.begin (), pi.base (), name.begin (), ni.base ()); } + // Search for paths matching the pattern and call the specified function for + // each matching path. Return false if the underlying func() call returns + // false. Otherwise the function conforms to the path_search() description. + // + // Note that the access to the traversed directory tree (real or virtual) is + // performed through the provided filesystem object. + // + static const string any_dir ("*/"); + + template <typename FS> + static bool + search ( + path pattern, + dir_path pattern_dir, + bool follow_symlinks, + const function<bool (path&&, const string& pattern, bool interm)>& func, + FS& filesystem) + { + // Fast-forward the leftmost pattern non-wildcard components. So, for + // example, search for foo/f* in /bar/ becomes search for f* in /bar/foo/. + // + { + auto b (pattern.begin ()); + auto e (pattern.end ()); + auto i (b); + for (; i != e && (*i).find_first_of ("*?") == string::npos; ++i) ; + + // If the pattern has no wildcards then we reduce to checking for the + // filesystem entry existence. It matches if exists and is of the proper + // type. + // + if (i == e) + { + path p (pattern_dir / pattern); + auto pe (filesystem.path_entry (p, follow_symlinks)); + + if (pe.first && + ((pe.second.type == entry_type::directory) == p.to_directory ())) + return func (move (p), string (), false); + + return true; + } + else if (i != b) // There are non-wildcard components, so fast-forward. + { + path p (b, i); + pattern = pattern.leaf (p); + pattern_dir /= path_cast<dir_path> (move (p)); + } + } + + assert (!pattern.empty ()); + + // The pattern leftmost component. Will use it to match the start directory + // sub-entries. + // + path pc (pattern.begin (), ++pattern.begin ()); + string pcr (pc.representation ()); + + // Note that if the pattern has multiple components (is not a simple path), + // then the leftmost one has a trailing separator, and so will match + // sub-directories only. + // + bool simple (pattern.simple ()); + + // Note that we rely on "small function object" optimization here. + // + typename FS::iterator_type i (filesystem.iterator ( + pattern_dir, + pcr.find ("**") != string::npos, // Recursive. + pcr.find ("***") != string::npos, // Self-inclusive. + follow_symlinks || !simple, + [&pattern_dir, &func] (const dir_path& p) -> bool // Preopen. + { + return func (pattern_dir / p, any_dir, true); + })); + + // Canonicalize the pattern component collapsing consecutive stars (used to + // express that it is recursive) into a single one. + // + size_t j (0); + size_t n (pcr.size ()); + for (size_t i (0); i < n; ++i) + { + char c (pcr[i]); + if (!(c == '*' && i > 0 && pcr[i - 1] == '*')) + pcr[j++] = c; + } + + if (j != n) + pcr.resize (j); + + // Note that the callback function can be called for the same directory + // twice: first time as intermediate match from iterator's preopen() call, + // and then, if the first call succeed, from the iterating loop (possibly + // as the final match). + // + path p; + while (i.next (p)) + { + // Skip sub-entry if its name doesn't match the pattern leftmost + // component. + // + // Matching the directory we are iterating through (as for a pattern + // component containing ***) is a bit tricky. This directory is + // represented by the iterator as an empty path, and so we need to + // compute it (the leaf would actually be enough) for matching. This + // leaf can be acquired from the start_dir / pattern_dir. We don't expect + // this path to be empty, as the filesystem object must replace an empty + // start directory with the current one. This is the case when we search + // in the current directory (start_dir is empty) with a pattern that + // starts with *** wildcard (for example f***/bar). Note that this will + // be the only case per path_search() as the next time pattern_dir will + // not be empty. + // + const path& se (!p.empty () + ? p + : path_cast<path> (!pattern_dir.empty () + ? pattern_dir + : filesystem.start_dir ())); + + if (!path_match (pcr, se.leaf ().representation ())) + continue; + + // If the callback function returns false, then we stop the entire search + // for the final match, or do not search below the path for the + // intermediate one. + // + if (!func (pattern_dir / p, pcr, !simple)) + { + if (simple) // Final match. + return false; + else + continue; + } + + // If the pattern is not a simple one, and it's leftmost component + // matches the sub-entry, then the sub-entry is a directory (see the note + // above), and we search in it using the trailing part of the pattern. + // + if (!simple && !search (pattern.leaf (pc), + pattern_dir / path_cast<dir_path> (move (p)), + follow_symlinks, + func, + filesystem)) + return false; + } + + return true; + } + + // Path search implementations. + // + static const dir_path empty_dir; + + using preopen = std::function<bool (const dir_path&)>; + + // Base for filesystem (see above) implementations. + // + // Don't copy start directory. It is expected to exist till the end of the + // object lifetime. + // + class filesystem_base + { + protected: + filesystem_base (const dir_path& start): start_ (start) {} + + public: + const dir_path& + start_dir () + { + if (!start_.empty ()) + return start_; + + if (current_.empty ()) + current_ = dir_path::current_directory (); + + return current_; + } + + protected: + const dir_path& start_; + dir_path current_; + }; + + // Search path in the real filesystem. + // // Iterate over directory sub-entries, recursively and including itself if // requested. Note that recursive iterating goes depth-first which make // sense for the cleanup use cases (@@ maybe this should be controllable @@ -1109,8 +1295,6 @@ namespace butl // Note that iterating over non-existent directory is not en error. The // subsequent next() call returns false for such a directory. // - using preopen = std::function<bool (const dir_path&)>; - class recursive_dir_iterator { public: @@ -1128,10 +1312,11 @@ namespace butl open (dir_path (), self_); } - // Non-copyable, non-movable type. + // Move constructible-only, non-assignable type. // recursive_dir_iterator (const recursive_dir_iterator&) = delete; recursive_dir_iterator& operator= (const recursive_dir_iterator&) = delete; + recursive_dir_iterator (recursive_dir_iterator&&) = default; // Return false if no more entries left. Otherwise save the next entry path // and return true. The path is relative against the directory being @@ -1235,165 +1420,207 @@ namespace butl small_vector<pair<dir_iterator, dir_path>, 1> iters_; }; - // Search for paths matching the pattern and call the specified function for - // each matching path. Return false if the underlying func() call returns - // false. Otherwise the function conforms to the path_search() description. + // Provide an access to the real filesystem. // - static const string any_dir ("*/"); - - static bool - search ( - path pattern, - dir_path pattern_dir, - const dir_path start_dir, - bool follow_symlinks, - const function<bool (path&&, const string& pattern, bool interm)>& func) + class real_filesystem: public filesystem_base { - // Fast-forward the leftmost pattern non-wildcard components. So, for - // example, search for foo/f* in /bar/ becomes search for f* in /bar/foo/. - // - { - auto b (pattern.begin ()); - auto e (pattern.end ()); - auto i (b); - for (; i != e && (*i).find_first_of ("*?") == string::npos; ++i) ; + public: + using iterator_type = recursive_dir_iterator; - // If the pattern has no wildcards then we reduce to checking for the - // filesystem entry existence. It matches if exists and is of the proper - // type. - // - if (i == e) - { - path p (pattern_dir / pattern); - auto pe (path_entry (start_dir / p, true)); + real_filesystem (const dir_path& start): filesystem_base (start) {} - if (pe.first && - ((pe.second.type == entry_type::directory) == p.to_directory ())) - return func (move (p), string (), false); + pair<bool, entry_stat> + path_entry (const path& p, bool follow_symlinks) const + { + return butl::path_entry (start_ / p, follow_symlinks); + } - return true; - } - else if (i != b) // There are non-wildcard components, so fast-forward. - { - path p (b, i); - pattern = pattern.leaf (p); - pattern_dir /= path_cast<dir_path> (move (p)); - } + iterator_type + iterator (const dir_path& p, + bool recursive, + bool self, + bool follow_symlinks, + preopen po) const + { + return iterator_type (start_ / p, recursive, self, follow_symlinks, po); } + }; - assert (!pattern.empty ()); + void + path_search ( + const path& pattern, + const function<bool (path&&, const string& pattern, bool interm)>& func, + const dir_path& start, + bool follow_symlinks) + { + real_filesystem fs (pattern.relative () ? start : empty_dir); + search (pattern, dir_path (), follow_symlinks, func, fs); + } - // The pattern leftmost component. Will use it to match the start directory - // sub-entries. - // - path pc (pattern.begin (), ++pattern.begin ()); - string pcr (pc.representation ()); + // Search path in the directory tree represented by a path. + // + // Iterate over path prefixes, as recursive_dir_iterator (see above) would + // iterate over the real directory tree. + // + class path_iterator + { + public: + path_iterator (path p, bool recursive, bool self, preopen po) + : path_ (move (p)), + recursive_ (recursive), + self_ (self), + preopen_ (move (po)), + iter_ (path_.begin ()) + { + open (dir_path (), self_); + } - // Note that if the pattern has multiple components (is not a simple path), - // then the leftmost one has a trailing separator, and so will match - // sub-directories only. + // Move constructible-only, non-assignable type. // - bool simple (pattern.simple ()); + path_iterator (const path_iterator&) = delete; + path_iterator& operator= (const path_iterator&) = delete; + path_iterator (path_iterator&&) = default; - // Note that we rely on "small function object" optimization here. + // Return false if no more entries left. Otherwise save the next entry path + // and return true. // - recursive_dir_iterator i ( - start_dir / pattern_dir, - pcr.find ("**") != string::npos, // Recursive. - pcr.find ("***") != string::npos, // Self-inclusive. - follow_symlinks, - [&pattern_dir, &func] (const dir_path& p) -> bool // Preopen. + bool + next (path& p) + { + if (iter_ == path_.begin ()) { - return func (pattern_dir / p, any_dir, true); - }); + if (!self_) + return false; - // Canonicalize the pattern component collapsing consecutive stars (used to - // express that it is recursive) into a single one. - // - size_t j (0); - size_t n (pcr.size ()); - for (size_t i (0); i < n; ++i) - { - char c (pcr[i]); - if (!(c == '*' && i > 0 && pcr[i - 1] == '*')) - pcr[j++] = c; - } + p = path (); + self_ = false; // To bail out the next time. + return true; + } - if (j != n) - pcr.resize (j); + path pe (path_.begin (), iter_); + if (recursive_ && pe.to_directory ()) + { + open (path_cast<dir_path> (pe), true); + return next (p); + } - // Note that the callback function can be called for the same directory - // twice: first time as intermediate match from iterator's preopen() call, - // and then, if the first call succeed, from the iterating loop (possibly - // as the final match). - // - path p; - while (i.next (p)) + --iter_; // Return one level up. + + p = move (pe); + return true; + } + + private: + void + open (const dir_path& p, bool preopen) { - // Skip sub-entry if its name doesn't match the pattern leftmost - // component. + // If preopen_() returns false, then the directory will not be + // traversed (as we reset the recursive flag) but still be returned by + // the next() call as a sub-entry. // - // Matching the directory we are iterating through (as for a pattern - // component containing ***) is a bit tricky. This directory is - // represented by the iterator as an empty path, and so we need to - // compute it (the leaf would actually be enough) for matching. This - // leaf can be aquired from the pattern_dir / start_dir path except the - // case when both directories are empty. This is the case when we search - // in the current directory (start_dir is empty) with a pattern that - // starts with *** wildcard (for example f***/bar). All we can do here is - // to fallback to path::current_directory() call. Note that this will be - // the only call per path_search() as the next time pattern_dir will not - // be empty. + if (preopen && !preopen_ (p)) + recursive_ = false; + else if (iter_ != path_.end ()) + ++iter_; + + // If the rightmost component is reached, then all the directories were + // traversed, so we reset the recursive flag. // - const path& se (!p.empty () - ? p - : path_cast<path> (!pattern_dir.empty () - ? pattern_dir - : !start_dir.empty () - ? start_dir - : path::current_directory ())); + if (iter_ == path_.end ()) + recursive_ = false; + } - if (!path_match (pcr, se.leaf ().representation ())) - continue; + private: + path path_; + bool recursive_; + bool self_; + preopen preopen_; + path::iterator iter_; + }; - // If the callback function returns false, then we stop the entire search - // for the final match, or do not search below the path for the - // intermediate one. - // - if (!func (pattern_dir / p, pcr, !simple)) - { - if (simple) // Final match. - return false; - else - continue; - } + // Provide an access to a directory tree, that is represented by the path. + // + // Note that symlinks are meaningless for this filesystem. + // + class path_filesystem: public filesystem_base + { + public: + using iterator_type = path_iterator; - // If the pattern is not a simple one, and it's leftmost component - // matches the sub-entry, then the sub-entry is a directory (see the note - // above), and we search in it using the trailing part of the pattern. + path_filesystem (const dir_path& start, const path& p) + : filesystem_base (start), + path_ (p) {} + + pair<bool, entry_stat> + path_entry (const path& p, bool /*follow_symlinks*/) const + { + // Note that paths are not required to be normalized, so we just check + // that one path is a literal prefix of the other one. // - if (!simple && !search (pattern.leaf (pc), - pattern_dir / path_cast<dir_path> (move (p)), - start_dir, - follow_symlinks, - func)) - return false; + if (!path_.sub (p)) + return make_pair (false, entry_stat {entry_type::unknown, 0}); + + entry_type t (p == path_ && !p.to_directory () + ? entry_type::regular + : entry_type::directory); + + return make_pair (true, entry_stat {t, 0}); } - return true; - } + iterator_type + iterator (const dir_path& p, + bool recursive, + bool self, + bool /*follow_symlinks*/, + preopen po) const + { + assert (path_.sub (p)); + return iterator_type (path_.leaf (p), recursive, self, po); + } + + private: + const path& path_; + }; void path_search ( const path& pattern, + const path& entry, const function<bool (path&&, const string& pattern, bool interm)>& func, - const dir_path& start, - bool follow_symlinks) + const dir_path& start) + { + path_filesystem fs (pattern.relative () ? start : empty_dir, entry); + search (pattern, dir_path (), true, func, fs); + } + + bool + path_match (const path& pattern, const path& entry, const dir_path& start) { - search (pattern, - dir_path (), - pattern.relative () ? start : dir_path (), - follow_symlinks, - func); + bool r (false); + + auto match = [&entry, &r] (path&& p, const std::string&, bool interim) + { + if (p == entry) + { + // If we found the entry (possibly through one of the recursive + // components) no need to search further. + // + if (!interim) + { + r = true; + return false; + } + else + // For self-matching the callback is first called in the interim + // mode (through the preopen function) with an empty path. + // + assert (p.empty ()); + } + + return true; + }; + + path_search (pattern, entry, match, start); + return r; } } diff --git a/libbutl/filesystem.hxx b/libbutl/filesystem.hxx index 6ea7d2c..b4f8d96 100644 --- a/libbutl/filesystem.hxx +++ b/libbutl/filesystem.hxx @@ -489,6 +489,10 @@ namespace butl // Wildcard pattern match and search (aka glob). // + // Currently the following wildcard characters are supported: + // + // * - match any number of characters (including zero) + // ? - match any single character // Return true if name matches pattern. Both must be single path components, // possibly with a trailing directory separator to indicate a directory. @@ -498,14 +502,19 @@ namespace butl // different). Otherwise, it only matches a non-directory name (no trailing // directory separator). // - // Currently the following wildcard characters are supported: - // - // * - match any number of characters (including zero) - // ? - match any single character - // LIBBUTL_EXPORT bool path_match (const std::string& pattern, const std::string& name); + // Return true if path entry matches pattern. Note that the match is + // performed literally, with no paths normalization being performed. The + // start directory is used if the first pattern component is a self-matching + // wildcard (see below for the start directory and wildcard semantics). + // + LIBBUTL_EXPORT bool + path_match (const path& pattern, + const path& entry, + const dir_path& start = dir_path ()); + // Search for paths matching the pattern calling the specified function for // each matching path (see below for details). // @@ -521,7 +530,10 @@ namespace butl // path_search() also recognizes the ** and *** wildcard sequences. If a // path component contains **, then it is matched just like * but in all the // subdirectories, recursively. The *** wildcard behaves like ** but also - // matches the start directory itself. + // matches the start directory itself. Note that if the first pattern + // component contains ***, then the start directory must be empty or be + // terminated with a "meaningful" component (e.g., probably not '.' or + // '..'). // // So, for example, foo/bar-**.txt will return all the files matching the // bar-*.txt pattern in all the subdirectoris of foo/. And foo/f***/ will @@ -567,6 +579,12 @@ namespace butl // (a/b/, b*/, true) // (a/b/c/, c*/, false) // + // Symlinks are not followed if the follow_symlinks argument is false. This + // rule is only applied for symlinks that are matched against the rightmost + // component of the pattern. In particular, this mean that such symlinks will + // never match a directory pattern, and some results can be missing for the + // recursive rightmost component. + // // Note that recursive iterating through directories currently goes // depth-first which make sense for the cleanup use cases. In future we may // want to make it controllable. @@ -578,6 +596,18 @@ namespace butl bool interm)>&, const dir_path& start = dir_path (), bool follow_symlinks = true); + + // Same as above, but behaves as if the directory tree being searched + // through contains only the specified entry. The start directory is used if + // the first pattern component is a self-matching wildcard (see above). + // + LIBBUTL_EXPORT void + path_search (const path& pattern, + const path& entry, + const std::function<bool (path&&, + const std::string& pattern, + bool interm)>&, + const dir_path& start = dir_path ()); } #include <libbutl/filesystem.ixx> diff --git a/tests/wildcard/driver.cxx b/tests/wildcard/driver.cxx index 2397fc8..1e600f6 100644 --- a/tests/wildcard/driver.cxx +++ b/tests/wildcard/driver.cxx @@ -2,6 +2,7 @@ // copyright : Copyright (c) 2014-2017 Code Synthesis Ltd // license : MIT; see accompanying LICENSE file +#include <map> #include <string> #include <vector> #include <cassert> @@ -90,8 +91,11 @@ try assert (i == argc); // All args parsed, vector<path> paths; - auto add = - [&paths, &start] (path&& p, const std::string& pt, bool interim) -> bool + map<path, size_t> path_count; + + auto add = [&paths, &path_count, &start] (path&& p, + const string& pt, + bool interim) { bool pd (!pt.empty () && pt[0] == '.'); // Dot-started pattern. @@ -114,13 +118,58 @@ try return !skip; if (!skip) - paths.emplace_back (move (p.canonicalize ())); + { + p.canonicalize (); + + auto i (path_count.find (p)); + if (i == path_count.end ()) + path_count[p] = 1; + else + ++(i->second); + + paths.emplace_back (move (p)); + } return true; }; path_search (pattern, add, start); + // Test search in the directory tree represented by the path. + // + for (const auto& p: path_count) + { + // Will match multiple times if the pattern contains several recursive + // components. + // + size_t match_count (0); + + auto check = [&p, &match_count] (path&& pe, const string&, bool interim) + { + if (pe == p.first) + { + if (!interim) + ++match_count; + else + // For self-matching the callback is first called in the interim + // mode (through the preopen function) with an empty path. + // + assert (pe.empty ()); + } + + return true; + }; + + path_search (pattern, p.first, check, start); + assert (match_count == p.second); + + // Test path match. + // + assert (path_match (pattern, p.first, start)); + } + + // Print the found paths. + // if (sort) std::sort (paths.begin (), paths.end ()); |