aboutsummaryrefslogtreecommitdiff
path: root/libbutl/path-pattern.mxx
blob: 2d37b58df67c2548f053c33557a3e777a71024bb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
// file      : libbutl/path-pattern.mxx -*- C++ -*-
// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
// license   : MIT; see accompanying LICENSE file

#ifndef __cpp_modules_ts
#pragma once
#endif

#include <cassert>

#ifndef __cpp_lib_modules_ts
#include <string>
#include <cstdint>  // uint16_t
#include <cstddef>  // ptrdiff_t, size_t
#include <iterator> // input_iterator_tag
#endif

// Other includes.
#ifdef __cpp_modules_ts
export module butl.path_pattern;

#ifdef __cpp_lib_modules_ts
import std.core;
#endif

import butl.path;
import butl.optional;
#else
#include <libbutl/path.mxx>
#include <libbutl/optional.mxx>
#endif

#include <libbutl/export.hxx>

LIBBUTL_MODEXPORT namespace butl
{
  // Wildcard pattern match (aka glob).
  //
  // The wildcard pattern contains the literal characters that match
  // themselves and the wildcard characters that match a single or multiple
  // characters. Currently the following wildcards are supported:
  //
  // *     - match any number of characters (including zero)
  // ?     - match any single character
  // [...] - match a character with a "bracket expression"; currently we only
  //         support literal characters and ranges (no character/equivalence
  //         classes, etc; see Pattern Matching Notation section of the Shell
  //         Command Language POSIX specification for details)
  //
  // Note also that currently we don't support the special characters
  // backslash-escaping (as mandated by POSIX).

  // Path match/search flags.
  //
  enum class path_match_flags: std::uint16_t
  {
    // Follow symlinks. This only applies to symlinks that are matched against
    // the rightmost component of the pattern. In particular, this mean that
    // such symlinks will never match a directory pattern and some results can
    // be missing for the recursive rightmost component.
    //
    // Note that this flag is only used for path_search().
    //
    follow_symlinks = 0x1,

    // Make wildcard-only pattern component (e.g., `*/...`, `.../*/...`, or
    // `.../*`) match absent path component. For example, with this flag
    // set, the `a/*/b` pattern matches not only `a/x/b` path, but also `a/b`.
    //
    // Note that this does not apply to single-component patterns and the
    // pattern type is always preserved. In particular, the `a/*/` pattern
    // matches `a/` but not `a`.
    //
    // Finally, keep in mind that only absent directory components can be
    // matched this way. In particular, pattern `a*/*` does not match `ab`
    // (but `a*/*/` matches `ab/`).
    //
    match_absent = 0x2,

    none = 0
  };

  inline path_match_flags operator& (path_match_flags, path_match_flags);
  inline path_match_flags operator| (path_match_flags, path_match_flags);
  inline path_match_flags operator&= (path_match_flags&, path_match_flags);
  inline path_match_flags operator|= (path_match_flags&, path_match_flags);

  // Return true if name matches pattern. Both must be single path components,
  // possibly with a trailing directory separator to indicate a directory.
  //
  // If the pattern ends with a directory separator, then it only matches a
  // directory name (i.e., ends with a directory separator, but potentially
  // different). Otherwise, it only matches a non-directory name (no trailing
  // directory separator).
  //
  LIBBUTL_SYMEXPORT bool
  path_match (const std::string& name, const std::string& pattern);

  // Return true if path entry matches pattern. Note that the match is
  // performed literally, with no paths normalization being performed. The
  // start directory is used if the first pattern component is a self-matching
  // wildcard (see below for the start directory and wildcard semantics).
  //
  // In addition to the wildcard characters, it also recognizes the ** and ***
  // wildcard sequences (see path_search() for details).
  //
  LIBBUTL_SYMEXPORT bool
  path_match (const path& entry,
              const path& pattern,
              const dir_path& start = dir_path (),
              path_match_flags = path_match_flags::none);

  // Return true if a name contains the wildcard characters.
  //
  bool
  path_pattern (const std::string&);

  // Return true if a name contains the ** wildcard sequences.
  //
  bool
  path_pattern_recursive (const std::string&);

  // Return true if a name contains the *** wildcard sequences.
  //
  bool
  path_pattern_self_matching (const std::string&);

  // Return true if a path contains the pattern components.
  //
  bool
  path_pattern (const path&);

  // Return the number of recursive pattern components.
  //
  // Knowing the number of such components allows us to make some assumptions
  // regarding the search result. For example, if it is zero or one, then the
  // result contains no duplicates.
  //
  // Also note that the result can be used as bool.
  //
  std::size_t
  path_pattern_recursive (const path&);

  // Return true if the path is not empty and its first component is a self-
  // matching pattern.
  //
  bool
  path_pattern_self_matching (const path&);

  // Iteration over pattern terminals.
  //
  enum class path_pattern_term_type
  {
    literal,  // Literal character.
    question, // Question mark wildcard.
    star,     // Star wildcard.
    bracket   // Bracket expression wildcard.
  };

  class path_pattern_term
  {
  public:
    path_pattern_term_type      type;
    std::string::const_iterator begin;
    std::string::const_iterator end;

    std::size_t
    size () const {return end - begin;}

    // Predicates.
    //
    bool literal  () const {return type == path_pattern_term_type::literal;}
    bool question () const {return type == path_pattern_term_type::question;}
    bool star     () const {return type == path_pattern_term_type::star;}
    bool bracket  () const {return type == path_pattern_term_type::bracket;}
  };

  // Return the literal terminal character.
  //
  char
  get_literal (const path_pattern_term&);

  // Match a character against the bracket expression terminal.
  //
  LIBBUTL_SYMEXPORT bool
  match_bracket (char, const path_pattern_term&);

  class LIBBUTL_SYMEXPORT path_pattern_iterator
  {
  public:
    using value_type = path_pattern_term;
    using pointer = const path_pattern_term*;
    using reference = const path_pattern_term&;
    using difference_type = std::ptrdiff_t;
    using iterator_category = std::input_iterator_tag;

    explicit
    path_pattern_iterator (const std::string&);

    path_pattern_iterator (std::string::const_iterator begin,
                           std::string::const_iterator end);

    path_pattern_iterator () = default; // Create the end iterator.

    path_pattern_iterator& operator++ () {assert (t_); next (); return *this;}

    reference operator*  () const {assert (t_); return *t_;}
    pointer   operator-> () const {assert (t_); return &*t_;}

    friend bool
    operator== (const path_pattern_iterator&, const path_pattern_iterator&);

    friend bool
    operator!= (const path_pattern_iterator&, const path_pattern_iterator&);

  private:
    void
    next ();

  private:
    // nullopt denotes the end iterator.
    //
    // Note that the default-constructed i_ and e_ iterators (having singular
    // values) may not represent the end iterator as are not comparable for
    // equality. That's why we use an absent term to represent such an
    // iterator.
    //
    optional<path_pattern_term> t_;

    std::string::const_iterator i_;
    std::string::const_iterator e_;
  };

  // Range-based for loop support.
  //
  // for (const path_pattern_term& t: path_pattern_iterator (pattern)) ...
  //
  path_pattern_iterator begin (const path_pattern_iterator&);
  path_pattern_iterator end   (const path_pattern_iterator&);
}

#include <libbutl/path-pattern.ixx>