1 | /* GLIB - Library of useful routines for C programming |
2 | * Copyright (C) 1995-1997, 1999 Peter Mattis, Red Hat, Inc. |
3 | * |
4 | * This library is free software; you can redistribute it and/or |
5 | * modify it under the terms of the GNU Lesser General Public |
6 | * License as published by the Free Software Foundation; either |
7 | * version 2.1 of the License, or (at your option) any later version. |
8 | * |
9 | * This library is distributed in the hope that it will be useful, |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | * Lesser General Public License for more details. |
13 | * |
14 | * You should have received a copy of the GNU Lesser General Public |
15 | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
16 | */ |
17 | |
18 | #include "config.h" |
19 | |
20 | #include <string.h> |
21 | |
22 | #include "gpattern.h" |
23 | |
24 | #include "gmacros.h" |
25 | #include "gmessages.h" |
26 | #include "gmem.h" |
27 | #include "gunicode.h" |
28 | #include "gutils.h" |
29 | |
30 | /** |
31 | * SECTION:patterns |
32 | * @title: Glob-style pattern matching |
33 | * @short_description: matches strings against patterns containing '*' |
34 | * (wildcard) and '?' (joker) |
35 | * |
36 | * The g_pattern_match* functions match a string |
37 | * against a pattern containing '*' and '?' wildcards with similar |
38 | * semantics as the standard glob() function: '*' matches an arbitrary, |
39 | * possibly empty, string, '?' matches an arbitrary character. |
40 | * |
41 | * Note that in contrast to glob(), the '/' character can be matched by |
42 | * the wildcards, there are no '[...]' character ranges and '*' and '?' |
43 | * can not be escaped to include them literally in a pattern. |
44 | * |
45 | * When multiple strings must be matched against the same pattern, it |
46 | * is better to compile the pattern to a #GPatternSpec using |
47 | * g_pattern_spec_new() and use g_pattern_match_string() instead of |
48 | * g_pattern_match_simple(). This avoids the overhead of repeated |
49 | * pattern compilation. |
50 | **/ |
51 | |
52 | /** |
53 | * GPatternSpec: |
54 | * |
55 | * A GPatternSpec struct is the 'compiled' form of a pattern. This |
56 | * structure is opaque and its fields cannot be accessed directly. |
57 | */ |
58 | |
59 | /* keep enum and structure of gpattern.c and patterntest.c in sync */ |
60 | typedef enum |
61 | { |
62 | G_MATCH_ALL, /* "*A?A*" */ |
63 | G_MATCH_ALL_TAIL, /* "*A?AA" */ |
64 | G_MATCH_HEAD, /* "AAAA*" */ |
65 | G_MATCH_TAIL, /* "*AAAA" */ |
66 | G_MATCH_EXACT, /* "AAAAA" */ |
67 | G_MATCH_LAST |
68 | } GMatchType; |
69 | |
70 | struct _GPatternSpec |
71 | { |
72 | GMatchType match_type; |
73 | guint pattern_length; |
74 | guint min_length; |
75 | guint max_length; |
76 | gchar *pattern; |
77 | }; |
78 | |
79 | |
80 | /* --- functions --- */ |
81 | static inline gboolean |
82 | g_pattern_ph_match (const gchar *match_pattern, |
83 | const gchar *match_string, |
84 | gboolean *wildcard_reached_p) |
85 | { |
86 | const gchar *pattern, *string; |
87 | gchar ch; |
88 | |
89 | pattern = match_pattern; |
90 | string = match_string; |
91 | |
92 | ch = *pattern; |
93 | pattern++; |
94 | while (ch) |
95 | { |
96 | switch (ch) |
97 | { |
98 | case '?': |
99 | if (!*string) |
100 | return FALSE; |
101 | string = g_utf8_next_char (string); |
102 | break; |
103 | |
104 | case '*': |
105 | *wildcard_reached_p = TRUE; |
106 | do |
107 | { |
108 | ch = *pattern; |
109 | pattern++; |
110 | if (ch == '?') |
111 | { |
112 | if (!*string) |
113 | return FALSE; |
114 | string = g_utf8_next_char (string); |
115 | } |
116 | } |
117 | while (ch == '*' || ch == '?'); |
118 | if (!ch) |
119 | return TRUE; |
120 | do |
121 | { |
122 | gboolean next_wildcard_reached = FALSE; |
123 | while (ch != *string) |
124 | { |
125 | if (!*string) |
126 | return FALSE; |
127 | string = g_utf8_next_char (string); |
128 | } |
129 | string++; |
130 | if (g_pattern_ph_match (match_pattern: pattern, match_string: string, wildcard_reached_p: &next_wildcard_reached)) |
131 | return TRUE; |
132 | if (next_wildcard_reached) |
133 | /* the forthcoming pattern substring up to the next wildcard has |
134 | * been matched, but a mismatch occurred for the rest of the |
135 | * pattern, following the next wildcard. |
136 | * there's no need to advance the current match position any |
137 | * further if the rest pattern will not match. |
138 | */ |
139 | return FALSE; |
140 | } |
141 | while (*string); |
142 | break; |
143 | |
144 | default: |
145 | if (ch == *string) |
146 | string++; |
147 | else |
148 | return FALSE; |
149 | break; |
150 | } |
151 | |
152 | ch = *pattern; |
153 | pattern++; |
154 | } |
155 | |
156 | return *string == 0; |
157 | } |
158 | |
159 | /** |
160 | * g_pattern_match: |
161 | * @pspec: a #GPatternSpec |
162 | * @string_length: the length of @string (in bytes, i.e. strlen(), |
163 | * not g_utf8_strlen()) |
164 | * @string: the UTF-8 encoded string to match |
165 | * @string_reversed: (nullable): the reverse of @string or %NULL |
166 | * |
167 | * Matches a string against a compiled pattern. Passing the correct |
168 | * length of the string given is mandatory. The reversed string can be |
169 | * omitted by passing %NULL, this is more efficient if the reversed |
170 | * version of the string to be matched is not at hand, as |
171 | * g_pattern_match() will only construct it if the compiled pattern |
172 | * requires reverse matches. |
173 | * |
174 | * Note that, if the user code will (possibly) match a string against a |
175 | * multitude of patterns containing wildcards, chances are high that |
176 | * some patterns will require a reversed string. In this case, it's |
177 | * more efficient to provide the reversed string to avoid multiple |
178 | * constructions thereof in the various calls to g_pattern_match(). |
179 | * |
180 | * Note also that the reverse of a UTF-8 encoded string can in general |
181 | * not be obtained by g_strreverse(). This works only if the string |
182 | * does not contain any multibyte characters. GLib offers the |
183 | * g_utf8_strreverse() function to reverse UTF-8 encoded strings. |
184 | * |
185 | * Returns: %TRUE if @string matches @pspec |
186 | **/ |
187 | gboolean |
188 | g_pattern_match (GPatternSpec *pspec, |
189 | guint string_length, |
190 | const gchar *string, |
191 | const gchar *string_reversed) |
192 | { |
193 | g_return_val_if_fail (pspec != NULL, FALSE); |
194 | g_return_val_if_fail (string != NULL, FALSE); |
195 | |
196 | if (string_length < pspec->min_length || |
197 | string_length > pspec->max_length) |
198 | return FALSE; |
199 | |
200 | switch (pspec->match_type) |
201 | { |
202 | gboolean dummy; |
203 | case G_MATCH_ALL: |
204 | return g_pattern_ph_match (match_pattern: pspec->pattern, match_string: string, wildcard_reached_p: &dummy); |
205 | case G_MATCH_ALL_TAIL: |
206 | if (string_reversed) |
207 | return g_pattern_ph_match (match_pattern: pspec->pattern, match_string: string_reversed, wildcard_reached_p: &dummy); |
208 | else |
209 | { |
210 | gboolean result; |
211 | gchar *tmp; |
212 | tmp = g_utf8_strreverse (str: string, len: string_length); |
213 | result = g_pattern_ph_match (match_pattern: pspec->pattern, match_string: tmp, wildcard_reached_p: &dummy); |
214 | g_free (mem: tmp); |
215 | return result; |
216 | } |
217 | case G_MATCH_HEAD: |
218 | if (pspec->pattern_length == string_length) |
219 | return strcmp (s1: pspec->pattern, s2: string) == 0; |
220 | else if (pspec->pattern_length) |
221 | return strncmp (s1: pspec->pattern, s2: string, n: pspec->pattern_length) == 0; |
222 | else |
223 | return TRUE; |
224 | case G_MATCH_TAIL: |
225 | if (pspec->pattern_length) |
226 | return strcmp (s1: pspec->pattern, s2: string + (string_length - pspec->pattern_length)) == 0; |
227 | else |
228 | return TRUE; |
229 | case G_MATCH_EXACT: |
230 | if (pspec->pattern_length != string_length) |
231 | return FALSE; |
232 | else |
233 | return strcmp (s1: pspec->pattern, s2: string) == 0; |
234 | default: |
235 | g_return_val_if_fail (pspec->match_type < G_MATCH_LAST, FALSE); |
236 | return FALSE; |
237 | } |
238 | } |
239 | |
240 | /** |
241 | * g_pattern_spec_new: |
242 | * @pattern: a zero-terminated UTF-8 encoded string |
243 | * |
244 | * Compiles a pattern to a #GPatternSpec. |
245 | * |
246 | * Returns: a newly-allocated #GPatternSpec |
247 | **/ |
248 | GPatternSpec* |
249 | g_pattern_spec_new (const gchar *pattern) |
250 | { |
251 | GPatternSpec *pspec; |
252 | gboolean seen_joker = FALSE, seen_wildcard = FALSE, more_wildcards = FALSE; |
253 | gint hw_pos = -1, tw_pos = -1, hj_pos = -1, tj_pos = -1; |
254 | gboolean follows_wildcard = FALSE; |
255 | guint pending_jokers = 0; |
256 | const gchar *s; |
257 | gchar *d; |
258 | guint i; |
259 | |
260 | g_return_val_if_fail (pattern != NULL, NULL); |
261 | |
262 | /* canonicalize pattern and collect necessary stats */ |
263 | pspec = g_new (GPatternSpec, 1); |
264 | pspec->pattern_length = strlen (s: pattern); |
265 | pspec->min_length = 0; |
266 | pspec->max_length = 0; |
267 | pspec->pattern = g_new (gchar, pspec->pattern_length + 1); |
268 | d = pspec->pattern; |
269 | for (i = 0, s = pattern; *s != 0; s++) |
270 | { |
271 | switch (*s) |
272 | { |
273 | case '*': |
274 | if (follows_wildcard) /* compress multiple wildcards */ |
275 | { |
276 | pspec->pattern_length--; |
277 | continue; |
278 | } |
279 | follows_wildcard = TRUE; |
280 | if (hw_pos < 0) |
281 | hw_pos = i; |
282 | tw_pos = i; |
283 | break; |
284 | case '?': |
285 | pending_jokers++; |
286 | pspec->min_length++; |
287 | pspec->max_length += 4; /* maximum UTF-8 character length */ |
288 | continue; |
289 | default: |
290 | for (; pending_jokers; pending_jokers--, i++) { |
291 | *d++ = '?'; |
292 | if (hj_pos < 0) |
293 | hj_pos = i; |
294 | tj_pos = i; |
295 | } |
296 | follows_wildcard = FALSE; |
297 | pspec->min_length++; |
298 | pspec->max_length++; |
299 | break; |
300 | } |
301 | *d++ = *s; |
302 | i++; |
303 | } |
304 | for (; pending_jokers; pending_jokers--) { |
305 | *d++ = '?'; |
306 | if (hj_pos < 0) |
307 | hj_pos = i; |
308 | tj_pos = i; |
309 | } |
310 | *d++ = 0; |
311 | seen_joker = hj_pos >= 0; |
312 | seen_wildcard = hw_pos >= 0; |
313 | more_wildcards = seen_wildcard && hw_pos != tw_pos; |
314 | if (seen_wildcard) |
315 | pspec->max_length = G_MAXUINT; |
316 | |
317 | /* special case sole head/tail wildcard or exact matches */ |
318 | if (!seen_joker && !more_wildcards) |
319 | { |
320 | if (pspec->pattern[0] == '*') |
321 | { |
322 | pspec->match_type = G_MATCH_TAIL; |
323 | memmove (dest: pspec->pattern, src: pspec->pattern + 1, n: --pspec->pattern_length); |
324 | pspec->pattern[pspec->pattern_length] = 0; |
325 | return pspec; |
326 | } |
327 | if (pspec->pattern_length > 0 && |
328 | pspec->pattern[pspec->pattern_length - 1] == '*') |
329 | { |
330 | pspec->match_type = G_MATCH_HEAD; |
331 | pspec->pattern[--pspec->pattern_length] = 0; |
332 | return pspec; |
333 | } |
334 | if (!seen_wildcard) |
335 | { |
336 | pspec->match_type = G_MATCH_EXACT; |
337 | return pspec; |
338 | } |
339 | } |
340 | |
341 | /* now just need to distinguish between head or tail match start */ |
342 | tw_pos = pspec->pattern_length - 1 - tw_pos; /* last pos to tail distance */ |
343 | tj_pos = pspec->pattern_length - 1 - tj_pos; /* last pos to tail distance */ |
344 | if (seen_wildcard) |
345 | pspec->match_type = tw_pos > hw_pos ? G_MATCH_ALL_TAIL : G_MATCH_ALL; |
346 | else /* seen_joker */ |
347 | pspec->match_type = tj_pos > hj_pos ? G_MATCH_ALL_TAIL : G_MATCH_ALL; |
348 | if (pspec->match_type == G_MATCH_ALL_TAIL) { |
349 | gchar *tmp = pspec->pattern; |
350 | pspec->pattern = g_utf8_strreverse (str: pspec->pattern, len: pspec->pattern_length); |
351 | g_free (mem: tmp); |
352 | } |
353 | return pspec; |
354 | } |
355 | |
356 | /** |
357 | * g_pattern_spec_free: |
358 | * @pspec: a #GPatternSpec |
359 | * |
360 | * Frees the memory allocated for the #GPatternSpec. |
361 | **/ |
362 | void |
363 | g_pattern_spec_free (GPatternSpec *pspec) |
364 | { |
365 | g_return_if_fail (pspec != NULL); |
366 | |
367 | g_free (mem: pspec->pattern); |
368 | g_free (mem: pspec); |
369 | } |
370 | |
371 | /** |
372 | * g_pattern_spec_equal: |
373 | * @pspec1: a #GPatternSpec |
374 | * @pspec2: another #GPatternSpec |
375 | * |
376 | * Compares two compiled pattern specs and returns whether they will |
377 | * match the same set of strings. |
378 | * |
379 | * Returns: Whether the compiled patterns are equal |
380 | **/ |
381 | gboolean |
382 | g_pattern_spec_equal (GPatternSpec *pspec1, |
383 | GPatternSpec *pspec2) |
384 | { |
385 | g_return_val_if_fail (pspec1 != NULL, FALSE); |
386 | g_return_val_if_fail (pspec2 != NULL, FALSE); |
387 | |
388 | return (pspec1->pattern_length == pspec2->pattern_length && |
389 | pspec1->match_type == pspec2->match_type && |
390 | strcmp (s1: pspec1->pattern, s2: pspec2->pattern) == 0); |
391 | } |
392 | |
393 | /** |
394 | * g_pattern_match_string: |
395 | * @pspec: a #GPatternSpec |
396 | * @string: the UTF-8 encoded string to match |
397 | * |
398 | * Matches a string against a compiled pattern. If the string is to be |
399 | * matched against more than one pattern, consider using |
400 | * g_pattern_match() instead while supplying the reversed string. |
401 | * |
402 | * Returns: %TRUE if @string matches @pspec |
403 | **/ |
404 | gboolean |
405 | g_pattern_match_string (GPatternSpec *pspec, |
406 | const gchar *string) |
407 | { |
408 | g_return_val_if_fail (pspec != NULL, FALSE); |
409 | g_return_val_if_fail (string != NULL, FALSE); |
410 | |
411 | return g_pattern_match (pspec, string_length: strlen (s: string), string, NULL); |
412 | } |
413 | |
414 | /** |
415 | * g_pattern_match_simple: |
416 | * @pattern: the UTF-8 encoded pattern |
417 | * @string: the UTF-8 encoded string to match |
418 | * |
419 | * Matches a string against a pattern given as a string. If this |
420 | * function is to be called in a loop, it's more efficient to compile |
421 | * the pattern once with g_pattern_spec_new() and call |
422 | * g_pattern_match_string() repeatedly. |
423 | * |
424 | * Returns: %TRUE if @string matches @pspec |
425 | **/ |
426 | gboolean |
427 | g_pattern_match_simple (const gchar *pattern, |
428 | const gchar *string) |
429 | { |
430 | GPatternSpec *pspec; |
431 | gboolean ergo; |
432 | |
433 | g_return_val_if_fail (pattern != NULL, FALSE); |
434 | g_return_val_if_fail (string != NULL, FALSE); |
435 | |
436 | pspec = g_pattern_spec_new (pattern); |
437 | ergo = g_pattern_match (pspec, string_length: strlen (s: string), string, NULL); |
438 | g_pattern_spec_free (pspec); |
439 | |
440 | return ergo; |
441 | } |
442 | |