1 | /* GRegex -- regular expression API wrapper around PCRE. |
2 | * |
3 | * Copyright (C) 1999, 2000 Scott Wimer |
4 | * Copyright (C) 2004, Matthias Clasen <mclasen@redhat.com> |
5 | * Copyright (C) 2005 - 2007, Marco Barisione <marco@barisione.org> |
6 | * |
7 | * This library is free software; you can redistribute it and/or |
8 | * modify it under the terms of the GNU Lesser General Public |
9 | * License as published by the Free Software Foundation; either |
10 | * version 2.1 of the License, or (at your option) any later version. |
11 | * |
12 | * This library is distributed in the hope that it will be useful, |
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | * Lesser General Public License for more details. |
16 | * |
17 | * You should have received a copy of the GNU Lesser General Public License |
18 | * along with this library; if not, see <http://www.gnu.org/licenses/>. |
19 | */ |
20 | |
21 | #include "config.h" |
22 | |
23 | #include <string.h> |
24 | |
25 | #ifdef USE_SYSTEM_PCRE |
26 | #include <pcre.h> |
27 | #else |
28 | #include "pcre/pcre.h" |
29 | #endif |
30 | |
31 | #include "gtypes.h" |
32 | #include "gregex.h" |
33 | #include "glibintl.h" |
34 | #include "glist.h" |
35 | #include "gmessages.h" |
36 | #include "gstrfuncs.h" |
37 | #include "gatomic.h" |
38 | #include "gthread.h" |
39 | |
40 | /** |
41 | * SECTION:gregex |
42 | * @title: Perl-compatible regular expressions |
43 | * @short_description: matches strings against regular expressions |
44 | * @see_also: [Regular expression syntax][glib-regex-syntax] |
45 | * |
46 | * The g_regex_*() functions implement regular |
47 | * expression pattern matching using syntax and semantics similar to |
48 | * Perl regular expression. |
49 | * |
 * Some functions accept a @start_position argument; setting it differs
51 | * from just passing over a shortened string and setting #G_REGEX_MATCH_NOTBOL |
52 | * in the case of a pattern that begins with any kind of lookbehind assertion. |
53 | * For example, consider the pattern "\Biss\B" which finds occurrences of "iss" |
54 | * in the middle of words. ("\B" matches only if the current position in the |
55 | * subject is not a word boundary.) When applied to the string "Mississipi" |
56 | * from the fourth byte, namely "issipi", it does not match, because "\B" is |
57 | * always false at the start of the subject, which is deemed to be a word |
 * boundary. However, if the entire string is passed, but with
59 | * @start_position set to 4, it finds the second occurrence of "iss" because |
60 | * it is able to look behind the starting point to discover that it is |
61 | * preceded by a letter. |
62 | * |
63 | * Note that, unless you set the #G_REGEX_RAW flag, all the strings passed |
64 | * to these functions must be encoded in UTF-8. The lengths and the positions |
65 | * inside the strings are in bytes and not in characters, so, for instance, |
66 | * "\xc3\xa0" (i.e. "à") is two bytes long but it is treated as a |
67 | * single character. If you set #G_REGEX_RAW the strings can be non-valid |
68 | * UTF-8 strings and a byte is treated as a character, so "\xc3\xa0" is two |
69 | * bytes and two characters long. |
70 | * |
71 | * When matching a pattern, "\n" matches only against a "\n" character in |
72 | * the string, and "\r" matches only a "\r" character. To match any newline |
73 | * sequence use "\R". This particular group matches either the two-character |
74 | * sequence CR + LF ("\r\n"), or one of the single characters LF (linefeed, |
 * U+000A, "\n"), VT (vertical tab, U+000B, "\v"), FF (formfeed, U+000C, "\f"),
76 | * CR (carriage return, U+000D, "\r"), NEL (next line, U+0085), LS (line |
77 | * separator, U+2028), or PS (paragraph separator, U+2029). |
78 | * |
 * The behaviour of the dot, circumflex, and dollar metacharacters is
 * affected by newline characters; the default is to recognize any newline
81 | * character (the same characters recognized by "\R"). This can be changed |
82 | * with #G_REGEX_NEWLINE_CR, #G_REGEX_NEWLINE_LF and #G_REGEX_NEWLINE_CRLF |
83 | * compile options, and with #G_REGEX_MATCH_NEWLINE_ANY, |
84 | * #G_REGEX_MATCH_NEWLINE_CR, #G_REGEX_MATCH_NEWLINE_LF and |
85 | * #G_REGEX_MATCH_NEWLINE_CRLF match options. These settings are also |
86 | * relevant when compiling a pattern if #G_REGEX_EXTENDED is set, and an |
87 | * unescaped "#" outside a character class is encountered. This indicates |
88 | * a comment that lasts until after the next newline. |
89 | * |
90 | * When setting the %G_REGEX_JAVASCRIPT_COMPAT flag, pattern syntax and pattern |
91 | * matching is changed to be compatible with the way that regular expressions |
92 | * work in JavaScript. More precisely, a lonely ']' character in the pattern |
93 | * is a syntax error; the '\x' escape only allows 0 to 2 hexadecimal digits, and |
94 | * you must use the '\u' escape sequence with 4 hex digits to specify a unicode |
 * codepoint instead of '\x' or '\x{....}'. If '\x' or '\u' are not followed by
96 | * the specified number of hex digits, they match 'x' and 'u' literally; also |
97 | * '\U' always matches 'U' instead of being an error in the pattern. Finally, |
98 | * pattern matching is modified so that back references to an unset subpattern |
 * group produce a match with the empty string instead of an error. See
100 | * pcreapi(3) for more information. |
101 | * |
102 | * Creating and manipulating the same #GRegex structure from different |
103 | * threads is not a problem as #GRegex does not modify its internal |
104 | * state between creation and destruction, on the other hand #GMatchInfo |
105 | * is not threadsafe. |
106 | * |
107 | * The regular expressions low-level functionalities are obtained through |
108 | * the excellent |
109 | * [PCRE](http://www.pcre.org/) |
110 | * library written by Philip Hazel. |
111 | */ |
112 | |
/* Union of every bit that is a valid GRegexCompileFlags value. */
#define G_REGEX_COMPILE_MASK \
  (G_REGEX_CASELESS        | G_REGEX_MULTILINE       | \
   G_REGEX_DOTALL          | G_REGEX_EXTENDED        | \
   G_REGEX_ANCHORED        | G_REGEX_DOLLAR_ENDONLY  | \
   G_REGEX_UNGREEDY        | G_REGEX_RAW             | \
   G_REGEX_NO_AUTO_CAPTURE | G_REGEX_OPTIMIZE        | \
   G_REGEX_FIRSTLINE       | G_REGEX_DUPNAMES        | \
   G_REGEX_NEWLINE_CR      | G_REGEX_NEWLINE_LF      | \
   G_REGEX_NEWLINE_CRLF    | G_REGEX_NEWLINE_ANYCRLF | \
   G_REGEX_BSR_ANYCRLF     | G_REGEX_JAVASCRIPT_COMPAT)

/* Compile flags that are NOT passed through to PCRE. */
#define G_REGEX_COMPILE_NONPCRE_MASK \
  (G_REGEX_RAW | G_REGEX_OPTIMIZE)

/* Compile flags that ARE passed through to PCRE: everything valid minus
 * the non-PCRE set above. */
#define G_REGEX_COMPILE_PCRE_MASK \
  (G_REGEX_COMPILE_MASK & ~G_REGEX_COMPILE_NONPCRE_MASK)
137 | |
/* Union of every bit that is a valid GRegexMatchFlags value. */
#define G_REGEX_MATCH_MASK \
  (G_REGEX_MATCH_ANCHORED        | G_REGEX_MATCH_NOTBOL       | \
   G_REGEX_MATCH_NOTEOL          | G_REGEX_MATCH_NOTEMPTY     | \
   G_REGEX_MATCH_PARTIAL         | G_REGEX_MATCH_NEWLINE_CR   | \
   G_REGEX_MATCH_NEWLINE_LF      | G_REGEX_MATCH_NEWLINE_CRLF | \
   G_REGEX_MATCH_NEWLINE_ANY     | G_REGEX_MATCH_NEWLINE_ANYCRLF | \
   G_REGEX_MATCH_BSR_ANYCRLF     | G_REGEX_MATCH_BSR_ANY      | \
   G_REGEX_MATCH_PARTIAL_SOFT    | G_REGEX_MATCH_PARTIAL_HARD | \
   G_REGEX_MATCH_NOTEMPTY_ATSTART)
154 | |
/* We rely on these GRegex flags having the same numeric values as their
 * PCRE counterparts; the build fails here if any pair drifts apart.
 * First the compile-time flags. */
G_STATIC_ASSERT (G_REGEX_CASELESS == PCRE_CASELESS);
G_STATIC_ASSERT (G_REGEX_MULTILINE == PCRE_MULTILINE);
G_STATIC_ASSERT (G_REGEX_DOTALL == PCRE_DOTALL);
G_STATIC_ASSERT (G_REGEX_EXTENDED == PCRE_EXTENDED);
G_STATIC_ASSERT (G_REGEX_ANCHORED == PCRE_ANCHORED);
G_STATIC_ASSERT (G_REGEX_DOLLAR_ENDONLY == PCRE_DOLLAR_ENDONLY);
G_STATIC_ASSERT (G_REGEX_UNGREEDY == PCRE_UNGREEDY);
G_STATIC_ASSERT (G_REGEX_NO_AUTO_CAPTURE == PCRE_NO_AUTO_CAPTURE);
G_STATIC_ASSERT (G_REGEX_FIRSTLINE == PCRE_FIRSTLINE);
G_STATIC_ASSERT (G_REGEX_DUPNAMES == PCRE_DUPNAMES);
G_STATIC_ASSERT (G_REGEX_NEWLINE_CR == PCRE_NEWLINE_CR);
G_STATIC_ASSERT (G_REGEX_NEWLINE_LF == PCRE_NEWLINE_LF);
G_STATIC_ASSERT (G_REGEX_NEWLINE_CRLF == PCRE_NEWLINE_CRLF);
G_STATIC_ASSERT (G_REGEX_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF);
G_STATIC_ASSERT (G_REGEX_BSR_ANYCRLF == PCRE_BSR_ANYCRLF);
G_STATIC_ASSERT (G_REGEX_JAVASCRIPT_COMPAT == PCRE_JAVASCRIPT_COMPAT);

/* Same requirement for the match-time flags. */
G_STATIC_ASSERT (G_REGEX_MATCH_ANCHORED == PCRE_ANCHORED);
G_STATIC_ASSERT (G_REGEX_MATCH_NOTBOL == PCRE_NOTBOL);
G_STATIC_ASSERT (G_REGEX_MATCH_NOTEOL == PCRE_NOTEOL);
G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY == PCRE_NOTEMPTY);
G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL == PCRE_PARTIAL);
G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CR == PCRE_NEWLINE_CR);
G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_LF == PCRE_NEWLINE_LF);
G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_CRLF == PCRE_NEWLINE_CRLF);
G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANY == PCRE_NEWLINE_ANY);
G_STATIC_ASSERT (G_REGEX_MATCH_NEWLINE_ANYCRLF == PCRE_NEWLINE_ANYCRLF);
G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANYCRLF == PCRE_BSR_ANYCRLF);
G_STATIC_ASSERT (G_REGEX_MATCH_BSR_ANY == PCRE_BSR_UNICODE);
G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL_SOFT == PCRE_PARTIAL_SOFT);
G_STATIC_ASSERT (G_REGEX_MATCH_PARTIAL_HARD == PCRE_PARTIAL_HARD);
G_STATIC_ASSERT (G_REGEX_MATCH_NOTEMPTY_ATSTART == PCRE_NOTEMPTY_ATSTART);

/* These PCRE flags are unused or not exposed publicly in GRegexFlags, so
 * it should be ok to reuse their bit values for GRegex-only flags. Both
 * G_REGEX_OPTIMIZE and G_REGEX_RAW are in G_REGEX_COMPILE_NONPCRE_MASK,
 * i.e. they are not handed to PCRE under these meanings.
 */
G_STATIC_ASSERT (G_REGEX_OPTIMIZE == PCRE_NO_UTF8_CHECK);
G_STATIC_ASSERT (G_REGEX_RAW == PCRE_UTF8);
194 | |
/* Step one character forward (NEXT_CHAR) or backward (PREV_CHAR) from s.
 * When the regex was compiled with G_REGEX_RAW a character is one byte,
 * so just move by +/- 1; otherwise use the g_utf8_ helpers. Both macros
 * expand s more than once. */
#define NEXT_CHAR(re, s) \
  ((((re)->compile_opts & G_REGEX_RAW) != 0) ? ((s) + 1) \
                                              : g_utf8_next_char (s))
#define PREV_CHAR(re, s) \
  ((((re)->compile_opts & G_REGEX_RAW) != 0) ? ((s) - 1) \
                                              : g_utf8_prev_char (s))
203 | |
/* State of one matching operation: the regex and subject string being
 * scanned, where scanning resumes, and the buffers PCRE writes into. */
struct _GMatchInfo
{
  gint ref_count; /* the ref count (atomic) */
  GRegex *regex; /* the regex; a reference is held on it */
  GRegexMatchFlags match_opts; /* options used at match time on the regex */
  gint matches; /* number of matching sub patterns, or a negative PCRE status code */
  gint pos; /* position in the string where last match left off; -1 once the end was reached */
  gint n_offsets; /* number of elements in offsets */
  gint *offsets; /* array of offsets paired 0,1 ; 2,3 ; 4,5 etc */
  gint *workspace; /* workspace for pcre_dfa_exec() */
  gint n_workspace; /* number of workspace elements */
  const gchar *string; /* string passed to the match function (not copied) */
  gssize string_len; /* length of string, in bytes */
};
218 | |
219 | struct _GRegex |
220 | { |
221 | gint ref_count; /* the ref count for the immutable part (atomic) */ |
222 | gchar *pattern; /* the pattern */ |
223 | pcre *pcre_re; /* compiled form of the pattern */ |
224 | GRegexCompileFlags compile_opts; /* options used at compile time on the pattern */ |
225 | GRegexMatchFlags match_opts; /* options used at match time on the regex */ |
226 | pcre_extra *; /* data stored when G_REGEX_OPTIMIZE is used */ |
227 | }; |
228 | |
/* TRUE if ret is a genuine PCRE error code, FALSE otherwise. Anything at
 * or above PCRE_ERROR_NOMATCH is an ordinary result, and so is
 * PCRE_ERROR_PARTIAL. ret may be expanded twice. */
#define IS_PCRE_ERROR(ret) \
  (!((ret) >= PCRE_ERROR_NOMATCH || (ret) == PCRE_ERROR_PARTIAL))
231 | |
/* Forward declarations for the file-local replacement/interpolation
 * helpers. Their definitions (and that of struct _InterpolationData) are
 * not part of this section of the file. */
typedef struct _InterpolationData InterpolationData;
static gboolean interpolation_list_needs_match (GList *list);
static gboolean interpolate_replacement (const GMatchInfo *match_info,
                                         GString *result,
                                         gpointer data);
static GList *split_replacement (const gchar *replacement,
                                 GError **error);
static void free_interpolation_data (InterpolationData *data);
240 | |
241 | |
242 | static const gchar * |
243 | match_error (gint errcode) |
244 | { |
245 | switch (errcode) |
246 | { |
247 | case PCRE_ERROR_NOMATCH: |
248 | /* not an error */ |
249 | break; |
250 | case PCRE_ERROR_NULL: |
251 | /* NULL argument, this should not happen in GRegex */ |
252 | g_warning ("A NULL argument was passed to PCRE" ); |
253 | break; |
254 | case PCRE_ERROR_BADOPTION: |
255 | return "bad options" ; |
256 | case PCRE_ERROR_BADMAGIC: |
257 | return _("corrupted object" ); |
258 | case PCRE_ERROR_UNKNOWN_OPCODE: |
259 | return N_("internal error or corrupted object" ); |
260 | case PCRE_ERROR_NOMEMORY: |
261 | return _("out of memory" ); |
262 | case PCRE_ERROR_NOSUBSTRING: |
263 | /* not used by pcre_exec() */ |
264 | break; |
265 | case PCRE_ERROR_MATCHLIMIT: |
266 | return _("backtracking limit reached" ); |
267 | case PCRE_ERROR_CALLOUT: |
268 | /* callouts are not implemented */ |
269 | break; |
270 | case PCRE_ERROR_BADUTF8: |
271 | case PCRE_ERROR_BADUTF8_OFFSET: |
272 | /* we do not check if strings are valid */ |
273 | break; |
274 | case PCRE_ERROR_PARTIAL: |
275 | /* not an error */ |
276 | break; |
277 | case PCRE_ERROR_BADPARTIAL: |
278 | return _("the pattern contains items not supported for partial matching" ); |
279 | case PCRE_ERROR_INTERNAL: |
280 | return _("internal error" ); |
281 | case PCRE_ERROR_BADCOUNT: |
282 | /* negative ovecsize, this should not happen in GRegex */ |
283 | g_warning ("A negative ovecsize was passed to PCRE" ); |
284 | break; |
285 | case PCRE_ERROR_DFA_UITEM: |
286 | return _("the pattern contains items not supported for partial matching" ); |
287 | case PCRE_ERROR_DFA_UCOND: |
288 | return _("back references as conditions are not supported for partial matching" ); |
289 | case PCRE_ERROR_DFA_UMLIMIT: |
290 | /* the match_field field is not used in GRegex */ |
291 | break; |
292 | case PCRE_ERROR_DFA_WSSIZE: |
293 | /* handled expanding the workspace */ |
294 | break; |
295 | case PCRE_ERROR_DFA_RECURSE: |
296 | case PCRE_ERROR_RECURSIONLIMIT: |
297 | return _("recursion limit reached" ); |
298 | case PCRE_ERROR_BADNEWLINE: |
299 | return _("invalid combination of newline flags" ); |
300 | case PCRE_ERROR_BADOFFSET: |
301 | return _("bad offset" ); |
302 | case PCRE_ERROR_SHORTUTF8: |
303 | return _("short utf8" ); |
304 | case PCRE_ERROR_RECURSELOOP: |
305 | return _("recursion loop" ); |
306 | default: |
307 | break; |
308 | } |
309 | return _("unknown error" ); |
310 | } |
311 | |
/*
 * translate_compile_error:
 * @errcode: in: the error number PCRE reported for a failed compile;
 *   out: that number plus 100, possibly remapped to another code
 * @errmsg: in: PCRE's own untranslated message; out: replaced by a
 *   translated message for recognised codes, otherwise left untouched
 *
 * Rewrites a PCRE compile-error number and message in place.
 */
static void
translate_compile_error (gint *errcode, const gchar **errmsg)
{
  /* Compile errors are created adding 100 to the error code returned
   * by PCRE.
   * If errcode is known we put the translatable error message in
   * errmsg. If errcode is unknown we put the generic
   * G_REGEX_ERROR_COMPILE error code in errcode and keep the
   * untranslated error message returned by PCRE.
   * Note that there can be more PCRE errors with the same GRegexError
   * and that some PCRE errors are useless for us.
   */
  *errcode += 100;

  switch (*errcode)
    {
    case G_REGEX_ERROR_STRAY_BACKSLASH:
      *errmsg = _("\\ at end of pattern" );
      break;
    case G_REGEX_ERROR_MISSING_CONTROL_CHAR:
      *errmsg = _("\\c at end of pattern" );
      break;
    case G_REGEX_ERROR_UNRECOGNIZED_ESCAPE:
      *errmsg = _("unrecognized character following \\" );
      break;
    case G_REGEX_ERROR_QUANTIFIERS_OUT_OF_ORDER:
      *errmsg = _("numbers out of order in {} quantifier" );
      break;
    case G_REGEX_ERROR_QUANTIFIER_TOO_BIG:
      *errmsg = _("number too big in {} quantifier" );
      break;
    case G_REGEX_ERROR_UNTERMINATED_CHARACTER_CLASS:
      *errmsg = _("missing terminating ] for character class" );
      break;
    case G_REGEX_ERROR_INVALID_ESCAPE_IN_CHARACTER_CLASS:
      *errmsg = _("invalid escape sequence in character class" );
      break;
    case G_REGEX_ERROR_RANGE_OUT_OF_ORDER:
      *errmsg = _("range out of order in character class" );
      break;
    case G_REGEX_ERROR_NOTHING_TO_REPEAT:
      *errmsg = _("nothing to repeat" );
      break;
    case 111: /* internal error: unexpected repeat */
      *errcode = G_REGEX_ERROR_INTERNAL;
      *errmsg = _("unexpected repeat" );
      break;
    case G_REGEX_ERROR_UNRECOGNIZED_CHARACTER:
      *errmsg = _("unrecognized character after (? or (?-" );
      break;
    case G_REGEX_ERROR_POSIX_NAMED_CLASS_OUTSIDE_CLASS:
      *errmsg = _("POSIX named classes are supported only within a class" );
      break;
    case G_REGEX_ERROR_UNMATCHED_PARENTHESIS:
      *errmsg = _("missing terminating )" );
      break;
    case G_REGEX_ERROR_INEXISTENT_SUBPATTERN_REFERENCE:
      *errmsg = _("reference to non-existent subpattern" );
      break;
    case G_REGEX_ERROR_UNTERMINATED_COMMENT:
      *errmsg = _("missing ) after comment" );
      break;
    case G_REGEX_ERROR_EXPRESSION_TOO_LARGE:
      *errmsg = _("regular expression is too large" );
      break;
    case G_REGEX_ERROR_MEMORY_ERROR:
      *errmsg = _("failed to get memory" );
      break;
    case 122: /* unmatched parentheses */
      *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
      *errmsg = _(") without opening (" );
      break;
    case 123: /* internal error: code overflow */
      *errcode = G_REGEX_ERROR_INTERNAL;
      *errmsg = _("code overflow" );
      break;
    case 124: /* "unrecognized character after (?<\0 */
      *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
      *errmsg = _("unrecognized character after (?<" );
      break;
    case G_REGEX_ERROR_VARIABLE_LENGTH_LOOKBEHIND:
      *errmsg = _("lookbehind assertion is not fixed length" );
      break;
    case G_REGEX_ERROR_MALFORMED_CONDITION:
      *errmsg = _("malformed number or name after (?(" );
      break;
    case G_REGEX_ERROR_TOO_MANY_CONDITIONAL_BRANCHES:
      *errmsg = _("conditional group contains more than two branches" );
      break;
    case G_REGEX_ERROR_ASSERTION_EXPECTED:
      *errmsg = _("assertion expected after (?(" );
      break;
    case 129:
      *errcode = G_REGEX_ERROR_UNMATCHED_PARENTHESIS;
      /* translators: '(?R' and '(?[+-]digits' are both meant as (groups of)
       * sequences here, '(?-54' would be an example for the second group.
       */
      *errmsg = _("(?R or (?[+-]digits must be followed by )" );
      break;
    case G_REGEX_ERROR_UNKNOWN_POSIX_CLASS_NAME:
      *errmsg = _("unknown POSIX class name" );
      break;
    case G_REGEX_ERROR_POSIX_COLLATING_ELEMENTS_NOT_SUPPORTED:
      *errmsg = _("POSIX collating elements are not supported" );
      break;
    case G_REGEX_ERROR_HEX_CODE_TOO_LARGE:
      *errmsg = _("character value in \\x{...} sequence is too large" );
      break;
    case G_REGEX_ERROR_INVALID_CONDITION:
      *errmsg = _("invalid condition (?(0)" );
      break;
    case G_REGEX_ERROR_SINGLE_BYTE_MATCH_IN_LOOKBEHIND:
      *errmsg = _("\\C not allowed in lookbehind assertion" );
      break;
    case 137: /* PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0 */
      /* A number of Perl escapes are not handled by PCRE.
       * Therefore it explicitly raises ERR37.
       */
      *errcode = G_REGEX_ERROR_UNRECOGNIZED_ESCAPE;
      *errmsg = _("escapes \\L, \\l, \\N{name}, \\U, and \\u are not supported" );
      break;
    case G_REGEX_ERROR_INFINITE_LOOP:
      *errmsg = _("recursive call could loop indefinitely" );
      break;
    case 141: /* unrecognized character after (?P\0 */
      *errcode = G_REGEX_ERROR_UNRECOGNIZED_CHARACTER;
      *errmsg = _("unrecognized character after (?P" );
      break;
    case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME_TERMINATOR:
      *errmsg = _("missing terminator in subpattern name" );
      break;
    case G_REGEX_ERROR_DUPLICATE_SUBPATTERN_NAME:
      *errmsg = _("two named subpatterns have the same name" );
      break;
    case G_REGEX_ERROR_MALFORMED_PROPERTY:
      *errmsg = _("malformed \\P or \\p sequence" );
      break;
    case G_REGEX_ERROR_UNKNOWN_PROPERTY:
      *errmsg = _("unknown property name after \\P or \\p" );
      break;
    case G_REGEX_ERROR_SUBPATTERN_NAME_TOO_LONG:
      *errmsg = _("subpattern name is too long (maximum 32 characters)" );
      break;
    case G_REGEX_ERROR_TOO_MANY_SUBPATTERNS:
      *errmsg = _("too many named subpatterns (maximum 10,000)" );
      break;
    case G_REGEX_ERROR_INVALID_OCTAL_VALUE:
      *errmsg = _("octal value is greater than \\377" );
      break;
    case 152: /* internal error: overran compiling workspace */
      *errcode = G_REGEX_ERROR_INTERNAL;
      *errmsg = _("overran compiling workspace" );
      break;
    case 153: /* internal error: previously-checked referenced subpattern not found */
      *errcode = G_REGEX_ERROR_INTERNAL;
      *errmsg = _("previously-checked referenced subpattern not found" );
      break;
    case G_REGEX_ERROR_TOO_MANY_BRANCHES_IN_DEFINE:
      *errmsg = _("DEFINE group contains more than one branch" );
      break;
    case G_REGEX_ERROR_INCONSISTENT_NEWLINE_OPTIONS:
      *errmsg = _("inconsistent NEWLINE options" );
      break;
    case G_REGEX_ERROR_MISSING_BACK_REFERENCE:
      *errmsg = _("\\g is not followed by a braced, angle-bracketed, or quoted name or "
                  "number, or by a plain number" );
      break;
    case G_REGEX_ERROR_INVALID_RELATIVE_REFERENCE:
      *errmsg = _("a numbered reference must not be zero" );
      break;
    case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_FORBIDDEN:
      *errmsg = _("an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)" );
      break;
    case G_REGEX_ERROR_UNKNOWN_BACKTRACKING_CONTROL_VERB:
      *errmsg = _("(*VERB) not recognized" );
      break;
    case G_REGEX_ERROR_NUMBER_TOO_BIG:
      *errmsg = _("number is too big" );
      break;
    case G_REGEX_ERROR_MISSING_SUBPATTERN_NAME:
      *errmsg = _("missing subpattern name after (?&" );
      break;
    case G_REGEX_ERROR_MISSING_DIGIT:
      *errmsg = _("digit expected after (?+" );
      break;
    case G_REGEX_ERROR_INVALID_DATA_CHARACTER:
      *errmsg = _("] is an invalid data character in JavaScript compatibility mode" );
      break;
    case G_REGEX_ERROR_EXTRA_SUBPATTERN_NAME:
      *errmsg = _("different names for subpatterns of the same number are not allowed" );
      break;
    case G_REGEX_ERROR_BACKTRACKING_CONTROL_VERB_ARGUMENT_REQUIRED:
      *errmsg = _("(*MARK) must have an argument" );
      break;
    case G_REGEX_ERROR_INVALID_CONTROL_CHAR:
      *errmsg = _( "\\c must be followed by an ASCII character" );
      break;
    case G_REGEX_ERROR_MISSING_NAME:
      *errmsg = _("\\k is not followed by a braced, angle-bracketed, or quoted name" );
      break;
    case G_REGEX_ERROR_NOT_SUPPORTED_IN_CLASS:
      *errmsg = _("\\N is not supported in a class" );
      break;
    case G_REGEX_ERROR_TOO_MANY_FORWARD_REFERENCES:
      *errmsg = _("too many forward references" );
      break;
    case G_REGEX_ERROR_NAME_TOO_LONG:
      *errmsg = _("name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)" );
      break;
    case G_REGEX_ERROR_CHARACTER_VALUE_TOO_LARGE:
      *errmsg = _("character value in \\u.... sequence is too large" );
      break;

    /* From here on the PCRE message is kept; only the code is remapped. */
    case 116: /* erroffset passed as NULL */
      /* This should not happen as we never pass a NULL erroffset */
      g_warning ("erroffset passed as NULL" );
      *errcode = G_REGEX_ERROR_COMPILE;
      break;
    case 117: /* unknown option bit(s) set */
      /* This should not happen as we check options before passing them
       * to pcre_compile2() */
      g_warning ("unknown option bit(s) set" );
      *errcode = G_REGEX_ERROR_COMPILE;
      break;
    case 132: /* this version of PCRE is compiled without UTF support */
    case 144: /* invalid UTF-8 string */
    case 145: /* support for \\P, \\p, and \\X has not been compiled */
    case 167: /* this version of PCRE is not compiled with Unicode property support */
    case 173: /* disallowed Unicode code point (>= 0xd800 && <= 0xdfff) */
    case 174: /* invalid UTF-16 string */
      /* These errors should not happen as we are using an UTF-8 and UCP-enabled PCRE
       * and we do not check if strings are valid */
    case 170: /* internal error: unknown opcode in find_fixedlength() */
      *errcode = G_REGEX_ERROR_INTERNAL;
      break;

    default:
      /* unknown code: report the generic compile error */
      *errcode = G_REGEX_ERROR_COMPILE;
    }
}
552 | |
553 | /* GMatchInfo */ |
554 | |
555 | static GMatchInfo * |
556 | match_info_new (const GRegex *regex, |
557 | const gchar *string, |
558 | gint string_len, |
559 | gint start_position, |
560 | gint match_options, |
561 | gboolean is_dfa) |
562 | { |
563 | GMatchInfo *match_info; |
564 | |
565 | if (string_len < 0) |
566 | string_len = strlen (s: string); |
567 | |
568 | match_info = g_new0 (GMatchInfo, 1); |
569 | match_info->ref_count = 1; |
570 | match_info->regex = g_regex_ref (regex: (GRegex *)regex); |
571 | match_info->string = string; |
572 | match_info->string_len = string_len; |
573 | match_info->matches = PCRE_ERROR_NOMATCH; |
574 | match_info->pos = start_position; |
575 | match_info->match_opts = match_options; |
576 | |
577 | if (is_dfa) |
578 | { |
579 | /* These values should be enough for most cases, if they are not |
580 | * enough g_regex_match_all_full() will expand them. */ |
581 | match_info->n_offsets = 24; |
582 | match_info->n_workspace = 100; |
583 | match_info->workspace = g_new (gint, match_info->n_workspace); |
584 | } |
585 | else |
586 | { |
587 | gint capture_count; |
588 | pcre_fullinfo (regex->pcre_re, regex->extra, |
589 | PCRE_INFO_CAPTURECOUNT, &capture_count); |
590 | match_info->n_offsets = (capture_count + 1) * 3; |
591 | } |
592 | |
593 | match_info->offsets = g_new0 (gint, match_info->n_offsets); |
594 | /* Set an invalid position for the previous match. */ |
595 | match_info->offsets[0] = -1; |
596 | match_info->offsets[1] = -1; |
597 | |
598 | return match_info; |
599 | } |
600 | |
601 | /** |
602 | * g_match_info_get_regex: |
603 | * @match_info: a #GMatchInfo |
604 | * |
605 | * Returns #GRegex object used in @match_info. It belongs to Glib |
606 | * and must not be freed. Use g_regex_ref() if you need to keep it |
607 | * after you free @match_info object. |
608 | * |
609 | * Returns: #GRegex object used in @match_info |
610 | * |
611 | * Since: 2.14 |
612 | */ |
613 | GRegex * |
614 | g_match_info_get_regex (const GMatchInfo *match_info) |
615 | { |
616 | g_return_val_if_fail (match_info != NULL, NULL); |
617 | return match_info->regex; |
618 | } |
619 | |
620 | /** |
621 | * g_match_info_get_string: |
622 | * @match_info: a #GMatchInfo |
623 | * |
624 | * Returns the string searched with @match_info. This is the |
625 | * string passed to g_regex_match() or g_regex_replace() so |
626 | * you may not free it before calling this function. |
627 | * |
628 | * Returns: the string searched with @match_info |
629 | * |
630 | * Since: 2.14 |
631 | */ |
632 | const gchar * |
633 | g_match_info_get_string (const GMatchInfo *match_info) |
634 | { |
635 | g_return_val_if_fail (match_info != NULL, NULL); |
636 | return match_info->string; |
637 | } |
638 | |
639 | /** |
640 | * g_match_info_ref: |
641 | * @match_info: a #GMatchInfo |
642 | * |
643 | * Increases reference count of @match_info by 1. |
644 | * |
645 | * Returns: @match_info |
646 | * |
647 | * Since: 2.30 |
648 | */ |
649 | GMatchInfo * |
650 | g_match_info_ref (GMatchInfo *match_info) |
651 | { |
652 | g_return_val_if_fail (match_info != NULL, NULL); |
653 | g_atomic_int_inc (&match_info->ref_count); |
654 | return match_info; |
655 | } |
656 | |
657 | /** |
658 | * g_match_info_unref: |
659 | * @match_info: a #GMatchInfo |
660 | * |
661 | * Decreases reference count of @match_info by 1. When reference count drops |
662 | * to zero, it frees all the memory associated with the match_info structure. |
663 | * |
664 | * Since: 2.30 |
665 | */ |
666 | void |
667 | g_match_info_unref (GMatchInfo *match_info) |
668 | { |
669 | if (g_atomic_int_dec_and_test (&match_info->ref_count)) |
670 | { |
671 | g_regex_unref (regex: match_info->regex); |
672 | g_free (mem: match_info->offsets); |
673 | g_free (mem: match_info->workspace); |
674 | g_free (mem: match_info); |
675 | } |
676 | } |
677 | |
678 | /** |
679 | * g_match_info_free: |
680 | * @match_info: (nullable): a #GMatchInfo, or %NULL |
681 | * |
682 | * If @match_info is not %NULL, calls g_match_info_unref(); otherwise does |
683 | * nothing. |
684 | * |
685 | * Since: 2.14 |
686 | */ |
687 | void |
688 | g_match_info_free (GMatchInfo *match_info) |
689 | { |
690 | if (match_info == NULL) |
691 | return; |
692 | |
693 | g_match_info_unref (match_info); |
694 | } |
695 | |
696 | /** |
697 | * g_match_info_next: |
698 | * @match_info: a #GMatchInfo structure |
699 | * @error: location to store the error occurring, or %NULL to ignore errors |
700 | * |
701 | * Scans for the next match using the same parameters of the previous |
702 | * call to g_regex_match_full() or g_regex_match() that returned |
703 | * @match_info. |
704 | * |
705 | * The match is done on the string passed to the match function, so you |
706 | * cannot free it before calling this function. |
707 | * |
708 | * Returns: %TRUE is the string matched, %FALSE otherwise |
709 | * |
710 | * Since: 2.14 |
711 | */ |
712 | gboolean |
713 | g_match_info_next (GMatchInfo *match_info, |
714 | GError **error) |
715 | { |
716 | gint prev_match_start; |
717 | gint prev_match_end; |
718 | |
719 | g_return_val_if_fail (match_info != NULL, FALSE); |
720 | g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
721 | g_return_val_if_fail (match_info->pos >= 0, FALSE); |
722 | |
723 | prev_match_start = match_info->offsets[0]; |
724 | prev_match_end = match_info->offsets[1]; |
725 | |
726 | if (match_info->pos > match_info->string_len) |
727 | { |
728 | /* we have reached the end of the string */ |
729 | match_info->pos = -1; |
730 | match_info->matches = PCRE_ERROR_NOMATCH; |
731 | return FALSE; |
732 | } |
733 | |
734 | match_info->matches = pcre_exec (match_info->regex->pcre_re, |
735 | match_info->regex->extra, |
736 | match_info->string, |
737 | match_info->string_len, |
738 | match_info->pos, |
739 | match_info->regex->match_opts | match_info->match_opts, |
740 | match_info->offsets, |
741 | match_info->n_offsets); |
742 | if (IS_PCRE_ERROR (match_info->matches)) |
743 | { |
744 | g_set_error (err: error, G_REGEX_ERROR, code: G_REGEX_ERROR_MATCH, |
745 | _("Error while matching regular expression %s: %s" ), |
746 | match_info->regex->pattern, match_error (errcode: match_info->matches)); |
747 | return FALSE; |
748 | } |
749 | |
750 | /* avoid infinite loops if the pattern is an empty string or something |
751 | * equivalent */ |
752 | if (match_info->pos == match_info->offsets[1]) |
753 | { |
754 | if (match_info->pos > match_info->string_len) |
755 | { |
756 | /* we have reached the end of the string */ |
757 | match_info->pos = -1; |
758 | match_info->matches = PCRE_ERROR_NOMATCH; |
759 | return FALSE; |
760 | } |
761 | |
762 | match_info->pos = NEXT_CHAR (match_info->regex, |
763 | &match_info->string[match_info->pos]) - |
764 | match_info->string; |
765 | } |
766 | else |
767 | { |
768 | match_info->pos = match_info->offsets[1]; |
769 | } |
770 | |
771 | /* it's possible to get two identical matches when we are matching |
772 | * empty strings, for instance if the pattern is "(?=[A-Z0-9])" and |
773 | * the string is "RegExTest" we have: |
774 | * - search at position 0: match from 0 to 0 |
775 | * - search at position 1: match from 3 to 3 |
776 | * - search at position 3: match from 3 to 3 (duplicate) |
777 | * - search at position 4: match from 5 to 5 |
778 | * - search at position 5: match from 5 to 5 (duplicate) |
779 | * - search at position 6: no match -> stop |
780 | * so we have to ignore the duplicates. |
781 | * see bug #515944: http://bugzilla.gnome.org/show_bug.cgi?id=515944 */ |
782 | if (match_info->matches >= 0 && |
783 | prev_match_start == match_info->offsets[0] && |
784 | prev_match_end == match_info->offsets[1]) |
785 | { |
786 | /* ignore this match and search the next one */ |
787 | return g_match_info_next (match_info, error); |
788 | } |
789 | |
790 | return match_info->matches >= 0; |
791 | } |
792 | |
793 | /** |
794 | * g_match_info_matches: |
795 | * @match_info: a #GMatchInfo structure |
796 | * |
797 | * Returns whether the previous match operation succeeded. |
798 | * |
799 | * Returns: %TRUE if the previous match operation succeeded, |
800 | * %FALSE otherwise |
801 | * |
802 | * Since: 2.14 |
803 | */ |
804 | gboolean |
805 | g_match_info_matches (const GMatchInfo *match_info) |
806 | { |
807 | g_return_val_if_fail (match_info != NULL, FALSE); |
808 | |
809 | return match_info->matches >= 0; |
810 | } |
811 | |
812 | /** |
813 | * g_match_info_get_match_count: |
814 | * @match_info: a #GMatchInfo structure |
815 | * |
816 | * Retrieves the number of matched substrings (including substring 0, |
817 | * that is the whole matched text), so 1 is returned if the pattern |
818 | * has no substrings in it and 0 is returned if the match failed. |
819 | * |
820 | * If the last match was obtained using the DFA algorithm, that is |
821 | * using g_regex_match_all() or g_regex_match_all_full(), the retrieved |
822 | * count is not that of the number of capturing parentheses but that of |
823 | * the number of matched substrings. |
824 | * |
825 | * Returns: Number of matched substrings, or -1 if an error occurred |
826 | * |
827 | * Since: 2.14 |
828 | */ |
829 | gint |
830 | g_match_info_get_match_count (const GMatchInfo *match_info) |
831 | { |
832 | g_return_val_if_fail (match_info, -1); |
833 | |
834 | if (match_info->matches == PCRE_ERROR_NOMATCH) |
835 | /* no match */ |
836 | return 0; |
837 | else if (match_info->matches < PCRE_ERROR_NOMATCH) |
838 | /* error */ |
839 | return -1; |
840 | else |
841 | /* match */ |
842 | return match_info->matches; |
843 | } |
844 | |
845 | /** |
846 | * g_match_info_is_partial_match: |
847 | * @match_info: a #GMatchInfo structure |
848 | * |
849 | * Usually if the string passed to g_regex_match*() matches as far as |
850 | * it goes, but is too short to match the entire pattern, %FALSE is |
851 | * returned. There are circumstances where it might be helpful to |
852 | * distinguish this case from other cases in which there is no match. |
853 | * |
854 | * Consider, for example, an application where a human is required to |
855 | * type in data for a field with specific formatting requirements. An |
856 | * example might be a date in the form ddmmmyy, defined by the pattern |
857 | * "^\d?\d(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\d\d$". |
858 | * If the application sees the user’s keystrokes one by one, and can |
859 | * check that what has been typed so far is potentially valid, it is |
860 | * able to raise an error as soon as a mistake is made. |
861 | * |
862 | * GRegex supports the concept of partial matching by means of the |
863 | * #G_REGEX_MATCH_PARTIAL_SOFT and #G_REGEX_MATCH_PARTIAL_HARD flags. |
864 | * When they are used, the return code for |
865 | * g_regex_match() or g_regex_match_full() is, as usual, %TRUE |
866 | * for a complete match, %FALSE otherwise. But, when these functions |
867 | * return %FALSE, you can check if the match was partial calling |
868 | * g_match_info_is_partial_match(). |
869 | * |
870 | * The difference between #G_REGEX_MATCH_PARTIAL_SOFT and |
871 | * #G_REGEX_MATCH_PARTIAL_HARD is that when a partial match is encountered |
872 | * with #G_REGEX_MATCH_PARTIAL_SOFT, matching continues to search for a |
873 | * possible complete match, while with #G_REGEX_MATCH_PARTIAL_HARD matching |
874 | * stops at the partial match. |
875 | * When both #G_REGEX_MATCH_PARTIAL_SOFT and #G_REGEX_MATCH_PARTIAL_HARD |
876 | * are set, the latter takes precedence. |
877 | * |
878 | * There were formerly some restrictions on the pattern for partial matching. |
879 | * The restrictions no longer apply. |
880 | * |
881 | * See pcrepartial(3) for more information on partial matching. |
882 | * |
883 | * Returns: %TRUE if the match was partial, %FALSE otherwise |
884 | * |
885 | * Since: 2.14 |
886 | */ |
887 | gboolean |
888 | g_match_info_is_partial_match (const GMatchInfo *match_info) |
889 | { |
890 | g_return_val_if_fail (match_info != NULL, FALSE); |
891 | |
892 | return match_info->matches == PCRE_ERROR_PARTIAL; |
893 | } |
894 | |
895 | /** |
896 | * g_match_info_expand_references: |
897 | * @match_info: (nullable): a #GMatchInfo or %NULL |
898 | * @string_to_expand: the string to expand |
899 | * @error: location to store the error occurring, or %NULL to ignore errors |
900 | * |
901 | * Returns a new string containing the text in @string_to_expand with |
902 | * references and escape sequences expanded. References refer to the last |
903 | * match done with @string against @regex and have the same syntax used by |
904 | * g_regex_replace(). |
905 | * |
906 | * The @string_to_expand must be UTF-8 encoded even if #G_REGEX_RAW was |
907 | * passed to g_regex_new(). |
908 | * |
909 | * The backreferences are extracted from the string passed to the match |
910 | * function, so you cannot call this function after freeing the string. |
911 | * |
912 | * @match_info may be %NULL in which case @string_to_expand must not |
913 | * contain references. For instance "foo\n" does not refer to an actual |
914 | * pattern and '\n' merely will be replaced with \n character, |
915 | * while to expand "\0" (whole match) one needs the result of a match. |
916 | * Use g_regex_check_replacement() to find out whether @string_to_expand |
917 | * contains references. |
918 | * |
919 | * Returns: (nullable): the expanded string, or %NULL if an error occurred |
920 | * |
921 | * Since: 2.14 |
922 | */ |
923 | gchar * |
924 | g_match_info_expand_references (const GMatchInfo *match_info, |
925 | const gchar *string_to_expand, |
926 | GError **error) |
927 | { |
928 | GString *result; |
929 | GList *list; |
930 | GError *tmp_error = NULL; |
931 | |
932 | g_return_val_if_fail (string_to_expand != NULL, NULL); |
933 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
934 | |
935 | list = split_replacement (replacement: string_to_expand, error: &tmp_error); |
936 | if (tmp_error != NULL) |
937 | { |
938 | g_propagate_error (dest: error, src: tmp_error); |
939 | return NULL; |
940 | } |
941 | |
942 | if (!match_info && interpolation_list_needs_match (list)) |
943 | { |
944 | g_critical ("String '%s' contains references to the match, can't " |
945 | "expand references without GMatchInfo object" , |
946 | string_to_expand); |
947 | return NULL; |
948 | } |
949 | |
950 | result = g_string_sized_new (dfl_size: strlen (s: string_to_expand)); |
951 | interpolate_replacement (match_info, result, data: list); |
952 | |
953 | g_list_free_full (list, free_func: (GDestroyNotify) free_interpolation_data); |
954 | |
955 | return g_string_free (string: result, FALSE); |
956 | } |
957 | |
958 | /** |
959 | * g_match_info_fetch: |
960 | * @match_info: #GMatchInfo structure |
961 | * @match_num: number of the sub expression |
962 | * |
963 | * Retrieves the text matching the @match_num'th capturing |
964 | * parentheses. 0 is the full text of the match, 1 is the first paren |
965 | * set, 2 the second, and so on. |
966 | * |
967 | * If @match_num is a valid sub pattern but it didn't match anything |
968 | * (e.g. sub pattern 1, matching "b" against "(a)?b") then an empty |
969 | * string is returned. |
970 | * |
971 | * If the match was obtained using the DFA algorithm, that is using |
972 | * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
973 | * string is not that of a set of parentheses but that of a matched |
974 | * substring. Substrings are matched in reverse order of length, so |
975 | * 0 is the longest match. |
976 | * |
977 | * The string is fetched from the string passed to the match function, |
978 | * so you cannot call this function after freeing the string. |
979 | * |
980 | * Returns: (nullable): The matched substring, or %NULL if an error |
981 | * occurred. You have to free the string yourself |
982 | * |
983 | * Since: 2.14 |
984 | */ |
985 | gchar * |
986 | g_match_info_fetch (const GMatchInfo *match_info, |
987 | gint match_num) |
988 | { |
989 | /* we cannot use pcre_get_substring() because it allocates the |
990 | * string using pcre_malloc(). */ |
991 | gchar *match = NULL; |
992 | gint start, end; |
993 | |
994 | g_return_val_if_fail (match_info != NULL, NULL); |
995 | g_return_val_if_fail (match_num >= 0, NULL); |
996 | |
997 | /* match_num does not exist or it didn't matched, i.e. matching "b" |
998 | * against "(a)?b" then group 0 is empty. */ |
999 | if (!g_match_info_fetch_pos (match_info, match_num, start_pos: &start, end_pos: &end)) |
1000 | match = NULL; |
1001 | else if (start == -1) |
1002 | match = g_strdup (str: "" ); |
1003 | else |
1004 | match = g_strndup (str: &match_info->string[start], n: end - start); |
1005 | |
1006 | return match; |
1007 | } |
1008 | |
1009 | /** |
1010 | * g_match_info_fetch_pos: |
1011 | * @match_info: #GMatchInfo structure |
1012 | * @match_num: number of the sub expression |
1013 | * @start_pos: (out) (optional): pointer to location where to store |
1014 | * the start position, or %NULL |
1015 | * @end_pos: (out) (optional): pointer to location where to store |
1016 | * the end position, or %NULL |
1017 | * |
1018 | * Retrieves the position in bytes of the @match_num'th capturing |
1019 | * parentheses. 0 is the full text of the match, 1 is the first |
1020 | * paren set, 2 the second, and so on. |
1021 | * |
1022 | * If @match_num is a valid sub pattern but it didn't match anything |
1023 | * (e.g. sub pattern 1, matching "b" against "(a)?b") then @start_pos |
1024 | * and @end_pos are set to -1 and %TRUE is returned. |
1025 | * |
1026 | * If the match was obtained using the DFA algorithm, that is using |
1027 | * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1028 | * position is not that of a set of parentheses but that of a matched |
1029 | * substring. Substrings are matched in reverse order of length, so |
1030 | * 0 is the longest match. |
1031 | * |
1032 | * Returns: %TRUE if the position was fetched, %FALSE otherwise. If |
1033 | * the position cannot be fetched, @start_pos and @end_pos are left |
1034 | * unchanged |
1035 | * |
1036 | * Since: 2.14 |
1037 | */ |
1038 | gboolean |
1039 | g_match_info_fetch_pos (const GMatchInfo *match_info, |
1040 | gint match_num, |
1041 | gint *start_pos, |
1042 | gint *end_pos) |
1043 | { |
1044 | g_return_val_if_fail (match_info != NULL, FALSE); |
1045 | g_return_val_if_fail (match_num >= 0, FALSE); |
1046 | |
1047 | /* make sure the sub expression number they're requesting is less than |
1048 | * the total number of sub expressions that were matched. */ |
1049 | if (match_num >= match_info->matches) |
1050 | return FALSE; |
1051 | |
1052 | if (start_pos != NULL) |
1053 | *start_pos = match_info->offsets[2 * match_num]; |
1054 | |
1055 | if (end_pos != NULL) |
1056 | *end_pos = match_info->offsets[2 * match_num + 1]; |
1057 | |
1058 | return TRUE; |
1059 | } |
1060 | |
1061 | /* |
1062 | * Returns number of first matched subpattern with name @name. |
1063 | * There may be more than one in case when DUPNAMES is used, |
1064 | * and not all subpatterns with that name match; |
1065 | * pcre_get_stringnumber() does not work in that case. |
1066 | */ |
1067 | static gint |
1068 | get_matched_substring_number (const GMatchInfo *match_info, |
1069 | const gchar *name) |
1070 | { |
1071 | gint entrysize; |
1072 | gchar *first, *last; |
1073 | guchar *entry; |
1074 | |
1075 | if (!(match_info->regex->compile_opts & G_REGEX_DUPNAMES)) |
1076 | return pcre_get_stringnumber (match_info->regex->pcre_re, name); |
1077 | |
1078 | /* This code is copied from pcre_get.c: get_first_set() */ |
1079 | entrysize = pcre_get_stringtable_entries (match_info->regex->pcre_re, |
1080 | name, |
1081 | &first, |
1082 | &last); |
1083 | |
1084 | if (entrysize <= 0) |
1085 | return entrysize; |
1086 | |
1087 | for (entry = (guchar*) first; entry <= (guchar*) last; entry += entrysize) |
1088 | { |
1089 | gint n = (entry[0] << 8) + entry[1]; |
1090 | if (match_info->offsets[n*2] >= 0) |
1091 | return n; |
1092 | } |
1093 | |
1094 | return (first[0] << 8) + first[1]; |
1095 | } |
1096 | |
1097 | /** |
1098 | * g_match_info_fetch_named: |
1099 | * @match_info: #GMatchInfo structure |
1100 | * @name: name of the subexpression |
1101 | * |
1102 | * Retrieves the text matching the capturing parentheses named @name. |
1103 | * |
1104 | * If @name is a valid sub pattern name but it didn't match anything |
1105 | * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b") |
1106 | * then an empty string is returned. |
1107 | * |
1108 | * The string is fetched from the string passed to the match function, |
1109 | * so you cannot call this function after freeing the string. |
1110 | * |
1111 | * Returns: (nullable): The matched substring, or %NULL if an error |
1112 | * occurred. You have to free the string yourself |
1113 | * |
1114 | * Since: 2.14 |
1115 | */ |
1116 | gchar * |
1117 | g_match_info_fetch_named (const GMatchInfo *match_info, |
1118 | const gchar *name) |
1119 | { |
1120 | /* we cannot use pcre_get_named_substring() because it allocates the |
1121 | * string using pcre_malloc(). */ |
1122 | gint num; |
1123 | |
1124 | g_return_val_if_fail (match_info != NULL, NULL); |
1125 | g_return_val_if_fail (name != NULL, NULL); |
1126 | |
1127 | num = get_matched_substring_number (match_info, name); |
1128 | if (num < 0) |
1129 | return NULL; |
1130 | else |
1131 | return g_match_info_fetch (match_info, match_num: num); |
1132 | } |
1133 | |
1134 | /** |
1135 | * g_match_info_fetch_named_pos: |
1136 | * @match_info: #GMatchInfo structure |
1137 | * @name: name of the subexpression |
1138 | * @start_pos: (out) (optional): pointer to location where to store |
1139 | * the start position, or %NULL |
1140 | * @end_pos: (out) (optional): pointer to location where to store |
1141 | * the end position, or %NULL |
1142 | * |
1143 | * Retrieves the position in bytes of the capturing parentheses named @name. |
1144 | * |
1145 | * If @name is a valid sub pattern name but it didn't match anything |
1146 | * (e.g. sub pattern "X", matching "b" against "(?P<X>a)?b") |
1147 | * then @start_pos and @end_pos are set to -1 and %TRUE is returned. |
1148 | * |
1149 | * Returns: %TRUE if the position was fetched, %FALSE otherwise. |
1150 | * If the position cannot be fetched, @start_pos and @end_pos |
1151 | * are left unchanged. |
1152 | * |
1153 | * Since: 2.14 |
1154 | */ |
1155 | gboolean |
1156 | g_match_info_fetch_named_pos (const GMatchInfo *match_info, |
1157 | const gchar *name, |
1158 | gint *start_pos, |
1159 | gint *end_pos) |
1160 | { |
1161 | gint num; |
1162 | |
1163 | g_return_val_if_fail (match_info != NULL, FALSE); |
1164 | g_return_val_if_fail (name != NULL, FALSE); |
1165 | |
1166 | num = get_matched_substring_number (match_info, name); |
1167 | if (num < 0) |
1168 | return FALSE; |
1169 | |
1170 | return g_match_info_fetch_pos (match_info, match_num: num, start_pos, end_pos); |
1171 | } |
1172 | |
1173 | /** |
1174 | * g_match_info_fetch_all: |
1175 | * @match_info: a #GMatchInfo structure |
1176 | * |
1177 | * Bundles up pointers to each of the matching substrings from a match |
1178 | * and stores them in an array of gchar pointers. The first element in |
1179 | * the returned array is the match number 0, i.e. the entire matched |
1180 | * text. |
1181 | * |
1182 | * If a sub pattern didn't match anything (e.g. sub pattern 1, matching |
1183 | * "b" against "(a)?b") then an empty string is inserted. |
1184 | * |
1185 | * If the last match was obtained using the DFA algorithm, that is using |
1186 | * g_regex_match_all() or g_regex_match_all_full(), the retrieved |
1187 | * strings are not that matched by sets of parentheses but that of the |
1188 | * matched substring. Substrings are matched in reverse order of length, |
1189 | * so the first one is the longest match. |
1190 | * |
1191 | * The strings are fetched from the string passed to the match function, |
1192 | * so you cannot call this function after freeing the string. |
1193 | * |
1194 | * Returns: (transfer full): a %NULL-terminated array of gchar * |
1195 | * pointers. It must be freed using g_strfreev(). If the previous |
1196 | * match failed %NULL is returned |
1197 | * |
1198 | * Since: 2.14 |
1199 | */ |
1200 | gchar ** |
1201 | g_match_info_fetch_all (const GMatchInfo *match_info) |
1202 | { |
1203 | /* we cannot use pcre_get_substring_list() because the returned value |
1204 | * isn't suitable for g_strfreev(). */ |
1205 | gchar **result; |
1206 | gint i; |
1207 | |
1208 | g_return_val_if_fail (match_info != NULL, NULL); |
1209 | |
1210 | if (match_info->matches < 0) |
1211 | return NULL; |
1212 | |
1213 | result = g_new (gchar *, match_info->matches + 1); |
1214 | for (i = 0; i < match_info->matches; i++) |
1215 | result[i] = g_match_info_fetch (match_info, match_num: i); |
1216 | result[i] = NULL; |
1217 | |
1218 | return result; |
1219 | } |
1220 | |
1221 | |
1222 | /* GRegex */ |
1223 | |
/* Defines the quark behind the G_REGEX_ERROR domain that the g_set_error()
 * and g_error_new() calls in this file report errors in (presumably via a
 * generated g_regex_error_quark() function -- see G_DEFINE_QUARK). */
G_DEFINE_QUARK (g-regex-error-quark, g_regex_error)
1225 | |
1226 | /** |
1227 | * g_regex_ref: |
1228 | * @regex: a #GRegex |
1229 | * |
1230 | * Increases reference count of @regex by 1. |
1231 | * |
1232 | * Returns: @regex |
1233 | * |
1234 | * Since: 2.14 |
1235 | */ |
1236 | GRegex * |
1237 | g_regex_ref (GRegex *regex) |
1238 | { |
1239 | g_return_val_if_fail (regex != NULL, NULL); |
1240 | g_atomic_int_inc (®ex->ref_count); |
1241 | return regex; |
1242 | } |
1243 | |
1244 | /** |
1245 | * g_regex_unref: |
1246 | * @regex: a #GRegex |
1247 | * |
1248 | * Decreases reference count of @regex by 1. When reference count drops |
1249 | * to zero, it frees all the memory associated with the regex structure. |
1250 | * |
1251 | * Since: 2.14 |
1252 | */ |
1253 | void |
1254 | g_regex_unref (GRegex *regex) |
1255 | { |
1256 | g_return_if_fail (regex != NULL); |
1257 | |
1258 | if (g_atomic_int_dec_and_test (®ex->ref_count)) |
1259 | { |
1260 | g_free (mem: regex->pattern); |
1261 | if (regex->pcre_re != NULL) |
1262 | pcre_free (regex->pcre_re); |
1263 | if (regex->extra != NULL) |
1264 | pcre_free (regex->extra); |
1265 | g_free (mem: regex); |
1266 | } |
1267 | } |
1268 | |
/*
 * Forward declaration of the compile helper defined after g_regex_new().
 *
 * @compile_options_out: if not %NULL, receives the compile options the
 *   compiled pattern ended up with
 * @match_options: (inout) (optional): if not %NULL and the pattern is not
 *   compiled with G_REGEX_RAW, PCRE_NO_UTF8_CHECK is added to the flags
 *
 * Returns the compiled pattern, or %NULL with @error set if compilation
 * failed.
 */
static pcre *regex_compile (const gchar         *pattern,
                            GRegexCompileFlags   compile_options,
                            GRegexCompileFlags  *compile_options_out,
                            GRegexMatchFlags    *match_options,
                            GError             **error);
1277 | |
1278 | /** |
1279 | * g_regex_new: |
1280 | * @pattern: the regular expression |
1281 | * @compile_options: compile options for the regular expression, or 0 |
1282 | * @match_options: match options for the regular expression, or 0 |
1283 | * @error: return location for a #GError |
1284 | * |
1285 | * Compiles the regular expression to an internal form, and does |
1286 | * the initial setup of the #GRegex structure. |
1287 | * |
1288 | * Returns: (nullable): a #GRegex structure or %NULL if an error occurred. Call |
1289 | * g_regex_unref() when you are done with it |
1290 | * |
1291 | * Since: 2.14 |
1292 | */ |
1293 | GRegex * |
1294 | g_regex_new (const gchar *pattern, |
1295 | GRegexCompileFlags compile_options, |
1296 | GRegexMatchFlags match_options, |
1297 | GError **error) |
1298 | { |
1299 | GRegex *regex; |
1300 | pcre *re; |
1301 | const gchar *errmsg; |
1302 | gboolean optimize = FALSE; |
1303 | static gsize initialised = 0; |
1304 | |
1305 | g_return_val_if_fail (pattern != NULL, NULL); |
1306 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
1307 | g_return_val_if_fail ((compile_options & ~G_REGEX_COMPILE_MASK) == 0, NULL); |
1308 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
1309 | |
1310 | if (g_once_init_enter (&initialised)) |
1311 | { |
1312 | int supports_utf8, supports_ucp; |
1313 | |
1314 | pcre_config (PCRE_CONFIG_UTF8, &supports_utf8); |
1315 | if (!supports_utf8) |
1316 | g_critical (_("PCRE library is compiled without UTF8 support" )); |
1317 | |
1318 | pcre_config (PCRE_CONFIG_UNICODE_PROPERTIES, &supports_ucp); |
1319 | if (!supports_ucp) |
1320 | g_critical (_("PCRE library is compiled without UTF8 properties support" )); |
1321 | |
1322 | g_once_init_leave (&initialised, supports_utf8 && supports_ucp ? 1 : 2); |
1323 | } |
1324 | |
1325 | if (G_UNLIKELY (initialised != 1)) |
1326 | { |
1327 | g_set_error_literal (err: error, G_REGEX_ERROR, code: G_REGEX_ERROR_COMPILE, |
1328 | _("PCRE library is compiled with incompatible options" )); |
1329 | return NULL; |
1330 | } |
1331 | |
1332 | /* G_REGEX_OPTIMIZE has the same numeric value of PCRE_NO_UTF8_CHECK, |
1333 | * as we do not need to wrap PCRE_NO_UTF8_CHECK. */ |
1334 | if (compile_options & G_REGEX_OPTIMIZE) |
1335 | optimize = TRUE; |
1336 | |
1337 | re = regex_compile (pattern, compile_options, compile_options_out: &compile_options, |
1338 | match_options: &match_options, error); |
1339 | |
1340 | if (re == NULL) |
1341 | return NULL; |
1342 | |
1343 | regex = g_new0 (GRegex, 1); |
1344 | regex->ref_count = 1; |
1345 | regex->pattern = g_strdup (str: pattern); |
1346 | regex->pcre_re = re; |
1347 | regex->compile_opts = compile_options; |
1348 | regex->match_opts = match_options; |
1349 | |
1350 | if (optimize) |
1351 | { |
1352 | regex->extra = pcre_study (regex->pcre_re, 0, &errmsg); |
1353 | if (errmsg != NULL) |
1354 | { |
1355 | GError *tmp_error = g_error_new (G_REGEX_ERROR, |
1356 | code: G_REGEX_ERROR_OPTIMIZE, |
1357 | _("Error while optimizing " |
1358 | "regular expression %s: %s" ), |
1359 | regex->pattern, |
1360 | errmsg); |
1361 | g_propagate_error (dest: error, src: tmp_error); |
1362 | |
1363 | g_regex_unref (regex); |
1364 | return NULL; |
1365 | } |
1366 | } |
1367 | |
1368 | return regex; |
1369 | } |
1370 | |
1371 | static pcre * |
1372 | regex_compile (const gchar *pattern, |
1373 | GRegexCompileFlags compile_options, |
1374 | GRegexCompileFlags *compile_options_out, |
1375 | GRegexMatchFlags *match_options, |
1376 | GError **error) |
1377 | { |
1378 | pcre *re; |
1379 | const gchar *errmsg; |
1380 | gint erroffset; |
1381 | gint errcode; |
1382 | GRegexCompileFlags nonpcre_compile_options; |
1383 | unsigned long int pcre_compile_options; |
1384 | |
1385 | nonpcre_compile_options = compile_options & G_REGEX_COMPILE_NONPCRE_MASK; |
1386 | |
1387 | /* In GRegex the string are, by default, UTF-8 encoded. PCRE |
1388 | * instead uses UTF-8 only if required with PCRE_UTF8. */ |
1389 | if (compile_options & G_REGEX_RAW) |
1390 | { |
1391 | /* disable utf-8 */ |
1392 | compile_options &= ~G_REGEX_RAW; |
1393 | } |
1394 | else |
1395 | { |
1396 | /* enable utf-8 */ |
1397 | compile_options |= PCRE_UTF8 | PCRE_NO_UTF8_CHECK; |
1398 | |
1399 | if (match_options != NULL) |
1400 | *match_options |= PCRE_NO_UTF8_CHECK; |
1401 | } |
1402 | |
1403 | /* PCRE_NEWLINE_ANY is the default for the internal PCRE but |
1404 | * not for the system one. */ |
1405 | if (!(compile_options & G_REGEX_NEWLINE_CR) && |
1406 | !(compile_options & G_REGEX_NEWLINE_LF)) |
1407 | { |
1408 | compile_options |= PCRE_NEWLINE_ANY; |
1409 | } |
1410 | |
1411 | compile_options |= PCRE_UCP; |
1412 | |
1413 | /* PCRE_BSR_UNICODE is the default for the internal PCRE but |
1414 | * possibly not for the system one. |
1415 | */ |
1416 | if (~compile_options & G_REGEX_BSR_ANYCRLF) |
1417 | compile_options |= PCRE_BSR_UNICODE; |
1418 | |
1419 | /* compile the pattern */ |
1420 | re = pcre_compile2 (pattern, compile_options, &errcode, |
1421 | &errmsg, &erroffset, NULL); |
1422 | |
1423 | /* if the compilation failed, set the error member and return |
1424 | * immediately */ |
1425 | if (re == NULL) |
1426 | { |
1427 | GError *tmp_error; |
1428 | |
1429 | /* Translate the PCRE error code to GRegexError and use a translated |
1430 | * error message if possible */ |
1431 | translate_compile_error (errcode: &errcode, errmsg: &errmsg); |
1432 | |
1433 | /* PCRE uses byte offsets but we want to show character offsets */ |
1434 | erroffset = g_utf8_pointer_to_offset (str: pattern, pos: &pattern[erroffset]); |
1435 | |
1436 | tmp_error = g_error_new (G_REGEX_ERROR, code: errcode, |
1437 | _("Error while compiling regular " |
1438 | "expression %s at char %d: %s" ), |
1439 | pattern, erroffset, errmsg); |
1440 | g_propagate_error (dest: error, src: tmp_error); |
1441 | |
1442 | return NULL; |
1443 | } |
1444 | |
1445 | /* For options set at the beginning of the pattern, pcre puts them into |
1446 | * compile options, e.g. "(?i)foo" will make the pcre structure store |
1447 | * PCRE_CASELESS even though it wasn't explicitly given for compilation. */ |
1448 | pcre_fullinfo (re, NULL, PCRE_INFO_OPTIONS, &pcre_compile_options); |
1449 | compile_options = pcre_compile_options & G_REGEX_COMPILE_PCRE_MASK; |
1450 | |
1451 | /* Don't leak PCRE_NEWLINE_ANY, which is part of PCRE_NEWLINE_ANYCRLF */ |
1452 | if ((pcre_compile_options & PCRE_NEWLINE_ANYCRLF) != PCRE_NEWLINE_ANYCRLF) |
1453 | compile_options &= ~PCRE_NEWLINE_ANY; |
1454 | |
1455 | compile_options |= nonpcre_compile_options; |
1456 | |
1457 | if (!(compile_options & G_REGEX_DUPNAMES)) |
1458 | { |
1459 | gboolean jchanged = FALSE; |
1460 | pcre_fullinfo (re, NULL, PCRE_INFO_JCHANGED, &jchanged); |
1461 | if (jchanged) |
1462 | compile_options |= G_REGEX_DUPNAMES; |
1463 | } |
1464 | |
1465 | if (compile_options_out != 0) |
1466 | *compile_options_out = compile_options; |
1467 | |
1468 | return re; |
1469 | } |
1470 | |
1471 | /** |
1472 | * g_regex_get_pattern: |
1473 | * @regex: a #GRegex structure |
1474 | * |
1475 | * Gets the pattern string associated with @regex, i.e. a copy of |
1476 | * the string passed to g_regex_new(). |
1477 | * |
1478 | * Returns: the pattern of @regex |
1479 | * |
1480 | * Since: 2.14 |
1481 | */ |
1482 | const gchar * |
1483 | g_regex_get_pattern (const GRegex *regex) |
1484 | { |
1485 | g_return_val_if_fail (regex != NULL, NULL); |
1486 | |
1487 | return regex->pattern; |
1488 | } |
1489 | |
1490 | /** |
1491 | * g_regex_get_max_backref: |
1492 | * @regex: a #GRegex |
1493 | * |
1494 | * Returns the number of the highest back reference |
1495 | * in the pattern, or 0 if the pattern does not contain |
1496 | * back references. |
1497 | * |
1498 | * Returns: the number of the highest back reference |
1499 | * |
1500 | * Since: 2.14 |
1501 | */ |
1502 | gint |
1503 | g_regex_get_max_backref (const GRegex *regex) |
1504 | { |
1505 | gint value; |
1506 | |
1507 | pcre_fullinfo (regex->pcre_re, regex->extra, |
1508 | PCRE_INFO_BACKREFMAX, &value); |
1509 | |
1510 | return value; |
1511 | } |
1512 | |
1513 | /** |
1514 | * g_regex_get_capture_count: |
1515 | * @regex: a #GRegex |
1516 | * |
1517 | * Returns the number of capturing subpatterns in the pattern. |
1518 | * |
1519 | * Returns: the number of capturing subpatterns |
1520 | * |
1521 | * Since: 2.14 |
1522 | */ |
1523 | gint |
1524 | g_regex_get_capture_count (const GRegex *regex) |
1525 | { |
1526 | gint value; |
1527 | |
1528 | pcre_fullinfo (regex->pcre_re, regex->extra, |
1529 | PCRE_INFO_CAPTURECOUNT, &value); |
1530 | |
1531 | return value; |
1532 | } |
1533 | |
1534 | /** |
1535 | * g_regex_get_has_cr_or_lf: |
1536 | * @regex: a #GRegex structure |
1537 | * |
1538 | * Checks whether the pattern contains explicit CR or LF references. |
1539 | * |
1540 | * Returns: %TRUE if the pattern contains explicit CR or LF references |
1541 | * |
1542 | * Since: 2.34 |
1543 | */ |
1544 | gboolean |
1545 | g_regex_get_has_cr_or_lf (const GRegex *regex) |
1546 | { |
1547 | gint value; |
1548 | |
1549 | pcre_fullinfo (regex->pcre_re, regex->extra, |
1550 | PCRE_INFO_HASCRORLF, &value); |
1551 | |
1552 | return !!value; |
1553 | } |
1554 | |
1555 | /** |
1556 | * g_regex_get_max_lookbehind: |
1557 | * @regex: a #GRegex structure |
1558 | * |
1559 | * Gets the number of characters in the longest lookbehind assertion in the |
1560 | * pattern. This information is useful when doing multi-segment matching using |
1561 | * the partial matching facilities. |
1562 | * |
1563 | * Returns: the number of characters in the longest lookbehind assertion. |
1564 | * |
1565 | * Since: 2.38 |
1566 | */ |
1567 | gint |
1568 | g_regex_get_max_lookbehind (const GRegex *regex) |
1569 | { |
1570 | gint max_lookbehind; |
1571 | |
1572 | pcre_fullinfo (regex->pcre_re, regex->extra, |
1573 | PCRE_INFO_MAXLOOKBEHIND, &max_lookbehind); |
1574 | |
1575 | return max_lookbehind; |
1576 | } |
1577 | |
1578 | /** |
1579 | * g_regex_get_compile_flags: |
1580 | * @regex: a #GRegex |
1581 | * |
1582 | * Returns the compile options that @regex was created with. |
1583 | * |
1584 | * Depending on the version of PCRE that is used, this may or may not |
1585 | * include flags set by option expressions such as `(?i)` found at the |
1586 | * top-level within the compiled pattern. |
1587 | * |
1588 | * Returns: flags from #GRegexCompileFlags |
1589 | * |
1590 | * Since: 2.26 |
1591 | */ |
1592 | GRegexCompileFlags |
1593 | g_regex_get_compile_flags (const GRegex *regex) |
1594 | { |
1595 | g_return_val_if_fail (regex != NULL, 0); |
1596 | |
1597 | return regex->compile_opts; |
1598 | } |
1599 | |
1600 | /** |
1601 | * g_regex_get_match_flags: |
1602 | * @regex: a #GRegex |
1603 | * |
1604 | * Returns the match options that @regex was created with. |
1605 | * |
1606 | * Returns: flags from #GRegexMatchFlags |
1607 | * |
1608 | * Since: 2.26 |
1609 | */ |
1610 | GRegexMatchFlags |
1611 | g_regex_get_match_flags (const GRegex *regex) |
1612 | { |
1613 | g_return_val_if_fail (regex != NULL, 0); |
1614 | |
1615 | return regex->match_opts & G_REGEX_MATCH_MASK; |
1616 | } |
1617 | |
1618 | /** |
1619 | * g_regex_match_simple: |
1620 | * @pattern: the regular expression |
1621 | * @string: the string to scan for matches |
1622 | * @compile_options: compile options for the regular expression, or 0 |
1623 | * @match_options: match options, or 0 |
1624 | * |
1625 | * Scans for a match in @string for @pattern. |
1626 | * |
1627 | * This function is equivalent to g_regex_match() but it does not |
1628 | * require to compile the pattern with g_regex_new(), avoiding some |
1629 | * lines of code when you need just to do a match without extracting |
1630 | * substrings, capture counts, and so on. |
1631 | * |
1632 | * If this function is to be called on the same @pattern more than |
1633 | * once, it's more efficient to compile the pattern once with |
1634 | * g_regex_new() and then use g_regex_match(). |
1635 | * |
1636 | * Returns: %TRUE if the string matched, %FALSE otherwise |
1637 | * |
1638 | * Since: 2.14 |
1639 | */ |
1640 | gboolean |
1641 | g_regex_match_simple (const gchar *pattern, |
1642 | const gchar *string, |
1643 | GRegexCompileFlags compile_options, |
1644 | GRegexMatchFlags match_options) |
1645 | { |
1646 | GRegex *regex; |
1647 | gboolean result; |
1648 | |
1649 | regex = g_regex_new (pattern, compile_options, match_options: 0, NULL); |
1650 | if (!regex) |
1651 | return FALSE; |
1652 | result = g_regex_match_full (regex, string, string_len: -1, start_position: 0, match_options, NULL, NULL); |
1653 | g_regex_unref (regex); |
1654 | return result; |
1655 | } |
1656 | |
1657 | /** |
1658 | * g_regex_match: |
1659 | * @regex: a #GRegex structure from g_regex_new() |
1660 | * @string: the string to scan for matches |
1661 | * @match_options: match options |
1662 | * @match_info: (out) (optional): pointer to location where to store |
1663 | * the #GMatchInfo, or %NULL if you do not need it |
1664 | * |
1665 | * Scans for a match in @string for the pattern in @regex. |
1666 | * The @match_options are combined with the match options specified |
1667 | * when the @regex structure was created, letting you have more |
1668 | * flexibility in reusing #GRegex structures. |
1669 | * |
1670 | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
1671 | * |
1672 | * A #GMatchInfo structure, used to get information on the match, |
1673 | * is stored in @match_info if not %NULL. Note that if @match_info |
1674 | * is not %NULL then it is created even if the function returns %FALSE, |
1675 | * i.e. you must free it regardless if regular expression actually matched. |
1676 | * |
1677 | * To retrieve all the non-overlapping matches of the pattern in |
1678 | * string you can use g_match_info_next(). |
1679 | * |
1680 | * |[<!-- language="C" --> |
1681 | * static void |
1682 | * print_uppercase_words (const gchar *string) |
1683 | * { |
1684 | * // Print all uppercase-only words. |
1685 | * GRegex *regex; |
1686 | * GMatchInfo *match_info; |
1687 | * |
1688 | * regex = g_regex_new ("[A-Z]+", 0, 0, NULL); |
1689 | * g_regex_match (regex, string, 0, &match_info); |
1690 | * while (g_match_info_matches (match_info)) |
1691 | * { |
1692 | * gchar *word = g_match_info_fetch (match_info, 0); |
1693 | * g_print ("Found: %s\n", word); |
1694 | * g_free (word); |
1695 | * g_match_info_next (match_info, NULL); |
1696 | * } |
1697 | * g_match_info_free (match_info); |
1698 | * g_regex_unref (regex); |
1699 | * } |
1700 | * ]| |
1701 | * |
1702 | * @string is not copied and is used in #GMatchInfo internally. If |
1703 | * you use any #GMatchInfo method (except g_match_info_free()) after |
1704 | * freeing or modifying @string then the behaviour is undefined. |
1705 | * |
 * Returns: %TRUE if the string matched, %FALSE otherwise
1707 | * |
1708 | * Since: 2.14 |
1709 | */ |
1710 | gboolean |
1711 | g_regex_match (const GRegex *regex, |
1712 | const gchar *string, |
1713 | GRegexMatchFlags match_options, |
1714 | GMatchInfo **match_info) |
1715 | { |
1716 | return g_regex_match_full (regex, string, string_len: -1, start_position: 0, match_options, |
1717 | match_info, NULL); |
1718 | } |
1719 | |
1720 | /** |
1721 | * g_regex_match_full: |
1722 | * @regex: a #GRegex structure from g_regex_new() |
1723 | * @string: (array length=string_len): the string to scan for matches |
1724 | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
1725 | * @start_position: starting index of the string to match, in bytes |
1726 | * @match_options: match options |
1727 | * @match_info: (out) (optional): pointer to location where to store |
1728 | * the #GMatchInfo, or %NULL if you do not need it |
1729 | * @error: location to store the error occurring, or %NULL to ignore errors |
1730 | * |
1731 | * Scans for a match in @string for the pattern in @regex. |
1732 | * The @match_options are combined with the match options specified |
1733 | * when the @regex structure was created, letting you have more |
1734 | * flexibility in reusing #GRegex structures. |
1735 | * |
1736 | * Setting @start_position differs from just passing over a shortened |
1737 | * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern |
1738 | * that begins with any kind of lookbehind assertion, such as "\b". |
1739 | * |
1740 | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
1741 | * |
1742 | * A #GMatchInfo structure, used to get information on the match, is |
1743 | * stored in @match_info if not %NULL. Note that if @match_info is |
1744 | * not %NULL then it is created even if the function returns %FALSE, |
1745 | * i.e. you must free it regardless if regular expression actually |
1746 | * matched. |
1747 | * |
1748 | * @string is not copied and is used in #GMatchInfo internally. If |
1749 | * you use any #GMatchInfo method (except g_match_info_free()) after |
1750 | * freeing or modifying @string then the behaviour is undefined. |
1751 | * |
1752 | * To retrieve all the non-overlapping matches of the pattern in |
1753 | * string you can use g_match_info_next(). |
1754 | * |
1755 | * |[<!-- language="C" --> |
1756 | * static void |
1757 | * print_uppercase_words (const gchar *string) |
1758 | * { |
1759 | * // Print all uppercase-only words. |
1760 | * GRegex *regex; |
1761 | * GMatchInfo *match_info; |
1762 | * GError *error = NULL; |
1763 | * |
1764 | * regex = g_regex_new ("[A-Z]+", 0, 0, NULL); |
1765 | * g_regex_match_full (regex, string, -1, 0, 0, &match_info, &error); |
1766 | * while (g_match_info_matches (match_info)) |
1767 | * { |
1768 | * gchar *word = g_match_info_fetch (match_info, 0); |
1769 | * g_print ("Found: %s\n", word); |
1770 | * g_free (word); |
1771 | * g_match_info_next (match_info, &error); |
1772 | * } |
1773 | * g_match_info_free (match_info); |
1774 | * g_regex_unref (regex); |
1775 | * if (error != NULL) |
1776 | * { |
1777 | * g_printerr ("Error while matching: %s\n", error->message); |
1778 | * g_error_free (error); |
1779 | * } |
1780 | * } |
1781 | * ]| |
1782 | * |
 * Returns: %TRUE if the string matched, %FALSE otherwise
1784 | * |
1785 | * Since: 2.14 |
1786 | */ |
1787 | gboolean |
1788 | g_regex_match_full (const GRegex *regex, |
1789 | const gchar *string, |
1790 | gssize string_len, |
1791 | gint start_position, |
1792 | GRegexMatchFlags match_options, |
1793 | GMatchInfo **match_info, |
1794 | GError **error) |
1795 | { |
1796 | GMatchInfo *info; |
1797 | gboolean match_ok; |
1798 | |
1799 | g_return_val_if_fail (regex != NULL, FALSE); |
1800 | g_return_val_if_fail (string != NULL, FALSE); |
1801 | g_return_val_if_fail (start_position >= 0, FALSE); |
1802 | g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
1803 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); |
1804 | |
1805 | info = match_info_new (regex, string, string_len, start_position, |
1806 | match_options, FALSE); |
1807 | match_ok = g_match_info_next (match_info: info, error); |
1808 | if (match_info != NULL) |
1809 | *match_info = info; |
1810 | else |
1811 | g_match_info_free (match_info: info); |
1812 | |
1813 | return match_ok; |
1814 | } |
1815 | |
1816 | /** |
1817 | * g_regex_match_all: |
1818 | * @regex: a #GRegex structure from g_regex_new() |
1819 | * @string: the string to scan for matches |
1820 | * @match_options: match options |
1821 | * @match_info: (out) (optional): pointer to location where to store |
1822 | * the #GMatchInfo, or %NULL if you do not need it |
1823 | * |
1824 | * Using the standard algorithm for regular expression matching only |
1825 | * the longest match in the string is retrieved. This function uses |
1826 | * a different algorithm so it can retrieve all the possible matches. |
1827 | * For more documentation see g_regex_match_all_full(). |
1828 | * |
1829 | * A #GMatchInfo structure, used to get information on the match, is |
1830 | * stored in @match_info if not %NULL. Note that if @match_info is |
1831 | * not %NULL then it is created even if the function returns %FALSE, |
1832 | * i.e. you must free it regardless if regular expression actually |
1833 | * matched. |
1834 | * |
1835 | * @string is not copied and is used in #GMatchInfo internally. If |
1836 | * you use any #GMatchInfo method (except g_match_info_free()) after |
1837 | * freeing or modifying @string then the behaviour is undefined. |
1838 | * |
 * Returns: %TRUE if the string matched, %FALSE otherwise
1840 | * |
1841 | * Since: 2.14 |
1842 | */ |
1843 | gboolean |
1844 | g_regex_match_all (const GRegex *regex, |
1845 | const gchar *string, |
1846 | GRegexMatchFlags match_options, |
1847 | GMatchInfo **match_info) |
1848 | { |
1849 | return g_regex_match_all_full (regex, string, string_len: -1, start_position: 0, match_options, |
1850 | match_info, NULL); |
1851 | } |
1852 | |
1853 | /** |
1854 | * g_regex_match_all_full: |
1855 | * @regex: a #GRegex structure from g_regex_new() |
1856 | * @string: (array length=string_len): the string to scan for matches |
1857 | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
1858 | * @start_position: starting index of the string to match, in bytes |
1859 | * @match_options: match options |
1860 | * @match_info: (out) (optional): pointer to location where to store |
1861 | * the #GMatchInfo, or %NULL if you do not need it |
1862 | * @error: location to store the error occurring, or %NULL to ignore errors |
1863 | * |
1864 | * Using the standard algorithm for regular expression matching only |
1865 | * the longest match in the @string is retrieved, it is not possible |
1866 | * to obtain all the available matches. For instance matching |
1867 | * "<a> <b> <c>" against the pattern "<.*>" |
1868 | * you get "<a> <b> <c>". |
1869 | * |
1870 | * This function uses a different algorithm (called DFA, i.e. deterministic |
1871 | * finite automaton), so it can retrieve all the possible matches, all |
1872 | * starting at the same point in the string. For instance matching |
 * "<a> <b> <c>" against the pattern "<.*>"
1874 | * you would obtain three matches: "<a> <b> <c>", |
1875 | * "<a> <b>" and "<a>". |
1876 | * |
1877 | * The number of matched strings is retrieved using |
1878 | * g_match_info_get_match_count(). To obtain the matched strings and |
1879 | * their position you can use, respectively, g_match_info_fetch() and |
1880 | * g_match_info_fetch_pos(). Note that the strings are returned in |
1881 | * reverse order of length; that is, the longest matching string is |
1882 | * given first. |
1883 | * |
1884 | * Note that the DFA algorithm is slower than the standard one and it |
1885 | * is not able to capture substrings, so backreferences do not work. |
1886 | * |
1887 | * Setting @start_position differs from just passing over a shortened |
1888 | * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern |
1889 | * that begins with any kind of lookbehind assertion, such as "\b". |
1890 | * |
1891 | * Unless %G_REGEX_RAW is specified in the options, @string must be valid UTF-8. |
1892 | * |
1893 | * A #GMatchInfo structure, used to get information on the match, is |
1894 | * stored in @match_info if not %NULL. Note that if @match_info is |
1895 | * not %NULL then it is created even if the function returns %FALSE, |
1896 | * i.e. you must free it regardless if regular expression actually |
1897 | * matched. |
1898 | * |
1899 | * @string is not copied and is used in #GMatchInfo internally. If |
1900 | * you use any #GMatchInfo method (except g_match_info_free()) after |
1901 | * freeing or modifying @string then the behaviour is undefined. |
1902 | * |
 * Returns: %TRUE if the string matched, %FALSE otherwise
1904 | * |
1905 | * Since: 2.14 |
1906 | */ |
1907 | gboolean |
1908 | g_regex_match_all_full (const GRegex *regex, |
1909 | const gchar *string, |
1910 | gssize string_len, |
1911 | gint start_position, |
1912 | GRegexMatchFlags match_options, |
1913 | GMatchInfo **match_info, |
1914 | GError **error) |
1915 | { |
1916 | GMatchInfo *info; |
1917 | gboolean done; |
1918 | pcre *pcre_re; |
1919 | pcre_extra *; |
1920 | gboolean retval; |
1921 | |
1922 | g_return_val_if_fail (regex != NULL, FALSE); |
1923 | g_return_val_if_fail (string != NULL, FALSE); |
1924 | g_return_val_if_fail (start_position >= 0, FALSE); |
1925 | g_return_val_if_fail (error == NULL || *error == NULL, FALSE); |
1926 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, FALSE); |
1927 | |
1928 | #ifdef PCRE_NO_AUTO_POSSESS |
1929 | /* For PCRE >= 8.34 we need to turn off PCRE_NO_AUTO_POSSESS, which |
1930 | * is an optimization for normal regex matching, but results in omitting |
1931 | * some shorter matches here, and an observable behaviour change. |
1932 | * |
1933 | * DFA matching is rather niche, and very rarely used according to |
1934 | * codesearch.debian.net, so don't bother caching the recompiled RE. */ |
1935 | pcre_re = regex_compile (pattern: regex->pattern, |
1936 | compile_options: regex->compile_opts | PCRE_NO_AUTO_POSSESS, |
1937 | NULL, NULL, error); |
1938 | |
1939 | if (pcre_re == NULL) |
1940 | return FALSE; |
1941 | |
1942 | /* Not bothering to cache the optimization data either, with similar |
1943 | * reasoning */ |
1944 | extra = NULL; |
1945 | #else |
1946 | /* For PCRE < 8.33 the precompiled regex is fine. */ |
1947 | pcre_re = regex->pcre_re; |
1948 | extra = regex->extra; |
1949 | #endif |
1950 | |
1951 | info = match_info_new (regex, string, string_len, start_position, |
1952 | match_options, TRUE); |
1953 | |
1954 | done = FALSE; |
1955 | while (!done) |
1956 | { |
1957 | done = TRUE; |
1958 | info->matches = pcre_dfa_exec (pcre_re, extra, |
1959 | info->string, info->string_len, |
1960 | info->pos, |
1961 | regex->match_opts | match_options, |
1962 | info->offsets, info->n_offsets, |
1963 | info->workspace, info->n_workspace); |
1964 | if (info->matches == PCRE_ERROR_DFA_WSSIZE) |
1965 | { |
1966 | /* info->workspace is too small. */ |
1967 | info->n_workspace *= 2; |
1968 | info->workspace = g_realloc (mem: info->workspace, |
1969 | n_bytes: info->n_workspace * sizeof (gint)); |
1970 | done = FALSE; |
1971 | } |
1972 | else if (info->matches == 0) |
1973 | { |
1974 | /* info->offsets is too small. */ |
1975 | info->n_offsets *= 2; |
1976 | info->offsets = g_realloc (mem: info->offsets, |
1977 | n_bytes: info->n_offsets * sizeof (gint)); |
1978 | done = FALSE; |
1979 | } |
1980 | else if (IS_PCRE_ERROR (info->matches)) |
1981 | { |
1982 | g_set_error (err: error, G_REGEX_ERROR, code: G_REGEX_ERROR_MATCH, |
1983 | _("Error while matching regular expression %s: %s" ), |
1984 | regex->pattern, match_error (errcode: info->matches)); |
1985 | } |
1986 | } |
1987 | |
1988 | #ifdef PCRE_NO_AUTO_POSSESS |
1989 | pcre_free (pcre_re); |
1990 | #endif |
1991 | |
1992 | /* set info->pos to -1 so that a call to g_match_info_next() fails. */ |
1993 | info->pos = -1; |
1994 | retval = info->matches >= 0; |
1995 | |
1996 | if (match_info != NULL) |
1997 | *match_info = info; |
1998 | else |
1999 | g_match_info_free (match_info: info); |
2000 | |
2001 | return retval; |
2002 | } |
2003 | |
2004 | /** |
2005 | * g_regex_get_string_number: |
2006 | * @regex: #GRegex structure |
2007 | * @name: name of the subexpression |
2008 | * |
2009 | * Retrieves the number of the subexpression named @name. |
2010 | * |
 * Returns: The number of the subexpression or -1 if @name
 *   does not exist
2013 | * |
2014 | * Since: 2.14 |
2015 | */ |
2016 | gint |
2017 | g_regex_get_string_number (const GRegex *regex, |
2018 | const gchar *name) |
2019 | { |
2020 | gint num; |
2021 | |
2022 | g_return_val_if_fail (regex != NULL, -1); |
2023 | g_return_val_if_fail (name != NULL, -1); |
2024 | |
2025 | num = pcre_get_stringnumber (regex->pcre_re, name); |
2026 | if (num == PCRE_ERROR_NOSUBSTRING) |
2027 | num = -1; |
2028 | |
2029 | return num; |
2030 | } |
2031 | |
2032 | /** |
2033 | * g_regex_split_simple: |
2034 | * @pattern: the regular expression |
2035 | * @string: the string to scan for matches |
2036 | * @compile_options: compile options for the regular expression, or 0 |
2037 | * @match_options: match options, or 0 |
2038 | * |
2039 | * Breaks the string on the pattern, and returns an array of |
2040 | * the tokens. If the pattern contains capturing parentheses, |
2041 | * then the text for each of the substrings will also be returned. |
2042 | * If the pattern does not match anywhere in the string, then the |
2043 | * whole string is returned as the first token. |
2044 | * |
 * This function is equivalent to g_regex_split() but it does
 * not require you to compile the pattern with g_regex_new(), saving
 * some lines of code when you just need to do a split without
 * extracting substrings, capture counts, and so on.
2049 | * |
2050 | * If this function is to be called on the same @pattern more than |
2051 | * once, it's more efficient to compile the pattern once with |
2052 | * g_regex_new() and then use g_regex_split(). |
2053 | * |
2054 | * As a special case, the result of splitting the empty string "" |
2055 | * is an empty vector, not a vector containing a single string. |
2056 | * The reason for this special case is that being able to represent |
2057 | * an empty vector is typically more useful than consistent handling |
2058 | * of empty elements. If you do need to represent empty elements, |
2059 | * you'll need to check for the empty string before calling this |
2060 | * function. |
2061 | * |
2062 | * A pattern that can match empty strings splits @string into |
2063 | * separate characters wherever it matches the empty string between |
2064 | * characters. For example splitting "ab c" using as a separator |
2065 | * "\s*", you will get "a", "b" and "c". |
2066 | * |
2067 | * Returns: (transfer full): a %NULL-terminated array of strings. Free |
2068 | * it using g_strfreev() |
2069 | * |
2070 | * Since: 2.14 |
2071 | **/ |
2072 | gchar ** |
2073 | g_regex_split_simple (const gchar *pattern, |
2074 | const gchar *string, |
2075 | GRegexCompileFlags compile_options, |
2076 | GRegexMatchFlags match_options) |
2077 | { |
2078 | GRegex *regex; |
2079 | gchar **result; |
2080 | |
2081 | regex = g_regex_new (pattern, compile_options, match_options: 0, NULL); |
2082 | if (!regex) |
2083 | return NULL; |
2084 | |
2085 | result = g_regex_split_full (regex, string, string_len: -1, start_position: 0, match_options, max_tokens: 0, NULL); |
2086 | g_regex_unref (regex); |
2087 | return result; |
2088 | } |
2089 | |
2090 | /** |
2091 | * g_regex_split: |
2092 | * @regex: a #GRegex structure |
2093 | * @string: the string to split with the pattern |
2094 | * @match_options: match time option flags |
2095 | * |
2096 | * Breaks the string on the pattern, and returns an array of the tokens. |
2097 | * If the pattern contains capturing parentheses, then the text for each |
2098 | * of the substrings will also be returned. If the pattern does not match |
2099 | * anywhere in the string, then the whole string is returned as the first |
2100 | * token. |
2101 | * |
2102 | * As a special case, the result of splitting the empty string "" is an |
2103 | * empty vector, not a vector containing a single string. The reason for |
2104 | * this special case is that being able to represent an empty vector is |
2105 | * typically more useful than consistent handling of empty elements. If |
2106 | * you do need to represent empty elements, you'll need to check for the |
2107 | * empty string before calling this function. |
2108 | * |
2109 | * A pattern that can match empty strings splits @string into separate |
2110 | * characters wherever it matches the empty string between characters. |
2111 | * For example splitting "ab c" using as a separator "\s*", you will get |
2112 | * "a", "b" and "c". |
2113 | * |
2114 | * Returns: (transfer full): a %NULL-terminated gchar ** array. Free |
2115 | * it using g_strfreev() |
2116 | * |
2117 | * Since: 2.14 |
2118 | **/ |
2119 | gchar ** |
2120 | g_regex_split (const GRegex *regex, |
2121 | const gchar *string, |
2122 | GRegexMatchFlags match_options) |
2123 | { |
2124 | return g_regex_split_full (regex, string, string_len: -1, start_position: 0, |
2125 | match_options, max_tokens: 0, NULL); |
2126 | } |
2127 | |
2128 | /** |
2129 | * g_regex_split_full: |
2130 | * @regex: a #GRegex structure |
2131 | * @string: (array length=string_len): the string to split with the pattern |
2132 | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2133 | * @start_position: starting index of the string to match, in bytes |
2134 | * @match_options: match time option flags |
2135 | * @max_tokens: the maximum number of tokens to split @string into. |
2136 | * If this is less than 1, the string is split completely |
2137 | * @error: return location for a #GError |
2138 | * |
2139 | * Breaks the string on the pattern, and returns an array of the tokens. |
2140 | * If the pattern contains capturing parentheses, then the text for each |
2141 | * of the substrings will also be returned. If the pattern does not match |
2142 | * anywhere in the string, then the whole string is returned as the first |
2143 | * token. |
2144 | * |
2145 | * As a special case, the result of splitting the empty string "" is an |
2146 | * empty vector, not a vector containing a single string. The reason for |
2147 | * this special case is that being able to represent an empty vector is |
2148 | * typically more useful than consistent handling of empty elements. If |
2149 | * you do need to represent empty elements, you'll need to check for the |
2150 | * empty string before calling this function. |
2151 | * |
2152 | * A pattern that can match empty strings splits @string into separate |
2153 | * characters wherever it matches the empty string between characters. |
2154 | * For example splitting "ab c" using as a separator "\s*", you will get |
2155 | * "a", "b" and "c". |
2156 | * |
2157 | * Setting @start_position differs from just passing over a shortened |
2158 | * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern |
2159 | * that begins with any kind of lookbehind assertion, such as "\b". |
2160 | * |
2161 | * Returns: (transfer full): a %NULL-terminated gchar ** array. Free |
2162 | * it using g_strfreev() |
2163 | * |
2164 | * Since: 2.14 |
2165 | **/ |
2166 | gchar ** |
2167 | g_regex_split_full (const GRegex *regex, |
2168 | const gchar *string, |
2169 | gssize string_len, |
2170 | gint start_position, |
2171 | GRegexMatchFlags match_options, |
2172 | gint max_tokens, |
2173 | GError **error) |
2174 | { |
2175 | GError *tmp_error = NULL; |
2176 | GMatchInfo *match_info; |
2177 | GList *list, *last; |
2178 | gint i; |
2179 | gint token_count; |
2180 | gboolean match_ok; |
2181 | /* position of the last separator. */ |
2182 | gint last_separator_end; |
2183 | /* was the last match 0 bytes long? */ |
2184 | gboolean last_match_is_empty; |
2185 | /* the returned array of char **s */ |
2186 | gchar **string_list; |
2187 | |
2188 | g_return_val_if_fail (regex != NULL, NULL); |
2189 | g_return_val_if_fail (string != NULL, NULL); |
2190 | g_return_val_if_fail (start_position >= 0, NULL); |
2191 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
2192 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
2193 | |
2194 | if (max_tokens <= 0) |
2195 | max_tokens = G_MAXINT; |
2196 | |
2197 | if (string_len < 0) |
2198 | string_len = strlen (s: string); |
2199 | |
2200 | /* zero-length string */ |
2201 | if (string_len - start_position == 0) |
2202 | return g_new0 (gchar *, 1); |
2203 | |
2204 | if (max_tokens == 1) |
2205 | { |
2206 | string_list = g_new0 (gchar *, 2); |
2207 | string_list[0] = g_strndup (str: &string[start_position], |
2208 | n: string_len - start_position); |
2209 | return string_list; |
2210 | } |
2211 | |
2212 | list = NULL; |
2213 | token_count = 0; |
2214 | last_separator_end = start_position; |
2215 | last_match_is_empty = FALSE; |
2216 | |
2217 | match_ok = g_regex_match_full (regex, string, string_len, start_position, |
2218 | match_options, match_info: &match_info, error: &tmp_error); |
2219 | |
2220 | while (tmp_error == NULL) |
2221 | { |
2222 | if (match_ok) |
2223 | { |
2224 | last_match_is_empty = |
2225 | (match_info->offsets[0] == match_info->offsets[1]); |
2226 | |
2227 | /* we need to skip empty separators at the same position of the end |
2228 | * of another separator. e.g. the string is "a b" and the separator |
2229 | * is " *", so from 1 to 2 we have a match and at position 2 we have |
2230 | * an empty match. */ |
2231 | if (last_separator_end != match_info->offsets[1]) |
2232 | { |
2233 | gchar *token; |
2234 | gint match_count; |
2235 | |
2236 | token = g_strndup (str: string + last_separator_end, |
2237 | n: match_info->offsets[0] - last_separator_end); |
2238 | list = g_list_prepend (list, data: token); |
2239 | token_count++; |
2240 | |
2241 | /* if there were substrings, these need to be added to |
2242 | * the list. */ |
2243 | match_count = g_match_info_get_match_count (match_info); |
2244 | if (match_count > 1) |
2245 | { |
2246 | for (i = 1; i < match_count; i++) |
2247 | list = g_list_prepend (list, data: g_match_info_fetch (match_info, match_num: i)); |
2248 | } |
2249 | } |
2250 | } |
2251 | else |
2252 | { |
2253 | /* if there was no match, copy to end of string. */ |
2254 | if (!last_match_is_empty) |
2255 | { |
2256 | gchar *token = g_strndup (str: string + last_separator_end, |
2257 | n: match_info->string_len - last_separator_end); |
2258 | list = g_list_prepend (list, data: token); |
2259 | } |
2260 | /* no more tokens, end the loop. */ |
2261 | break; |
2262 | } |
2263 | |
2264 | /* -1 to leave room for the last part. */ |
2265 | if (token_count >= max_tokens - 1) |
2266 | { |
2267 | /* we have reached the maximum number of tokens, so we copy |
2268 | * the remaining part of the string. */ |
2269 | if (last_match_is_empty) |
2270 | { |
2271 | /* the last match was empty, so we have moved one char |
2272 | * after the real position to avoid empty matches at the |
2273 | * same position. */ |
2274 | match_info->pos = PREV_CHAR (regex, &string[match_info->pos]) - string; |
2275 | } |
2276 | /* the if is needed in the case we have terminated the available |
2277 | * tokens, but we are at the end of the string, so there are no |
2278 | * characters left to copy. */ |
2279 | if (string_len > match_info->pos) |
2280 | { |
2281 | gchar *token = g_strndup (str: string + match_info->pos, |
2282 | n: string_len - match_info->pos); |
2283 | list = g_list_prepend (list, data: token); |
2284 | } |
2285 | /* end the loop. */ |
2286 | break; |
2287 | } |
2288 | |
2289 | last_separator_end = match_info->pos; |
2290 | if (last_match_is_empty) |
2291 | /* if the last match was empty, g_match_info_next() has moved |
2292 | * forward to avoid infinite loops, but we still need to copy that |
2293 | * character. */ |
2294 | last_separator_end = PREV_CHAR (regex, &string[last_separator_end]) - string; |
2295 | |
2296 | match_ok = g_match_info_next (match_info, error: &tmp_error); |
2297 | } |
2298 | g_match_info_free (match_info); |
2299 | if (tmp_error != NULL) |
2300 | { |
2301 | g_propagate_error (dest: error, src: tmp_error); |
2302 | g_list_free_full (list, free_func: g_free); |
2303 | return NULL; |
2304 | } |
2305 | |
2306 | string_list = g_new (gchar *, g_list_length (list) + 1); |
2307 | i = 0; |
2308 | for (last = g_list_last (list); last; last = g_list_previous (last)) |
2309 | string_list[i++] = last->data; |
2310 | string_list[i] = NULL; |
2311 | g_list_free (list); |
2312 | |
2313 | return string_list; |
2314 | } |
2315 | |
/* Tags stored in InterpolationData.type; each one tells which of the other
 * InterpolationData fields carries the payload. */
enum
{
  REPL_TYPE_STRING,             /* text to emit, held in InterpolationData.text */
  REPL_TYPE_CHARACTER,          /* a single character, held in InterpolationData.c */
  REPL_TYPE_SYMBOLIC_REFERENCE, /* reference by name, name held in InterpolationData.text */
  REPL_TYPE_NUMERIC_REFERENCE,  /* reference by number, held in InterpolationData.num */
  REPL_TYPE_CHANGE_CASE         /* case switch, mode held in InterpolationData.change_case */
};
2324 | |
/* Case-conversion modes selected by the \l, \u, \L, \U and \E escapes.
 * Every mode is a distinct bit so that the *_MASK values can test for a
 * group of modes at once. */
typedef enum
{
  CHANGE_CASE_NONE         = 1 << 0, /* set by \E */
  CHANGE_CASE_UPPER        = 1 << 1, /* set by \U */
  CHANGE_CASE_LOWER        = 1 << 2, /* set by \L */
  CHANGE_CASE_UPPER_SINGLE = 1 << 3, /* set by \u; presumably affects one character only */
  CHANGE_CASE_LOWER_SINGLE = 1 << 4, /* set by \l; presumably affects one character only */
  /* either of the two *_SINGLE modes */
  CHANGE_CASE_SINGLE_MASK  = CHANGE_CASE_UPPER_SINGLE | CHANGE_CASE_LOWER_SINGLE,
  /* either of the two lower-casing modes */
  CHANGE_CASE_LOWER_MASK   = CHANGE_CASE_LOWER | CHANGE_CASE_LOWER_SINGLE,
  /* either of the two upper-casing modes */
  CHANGE_CASE_UPPER_MASK   = CHANGE_CASE_UPPER | CHANGE_CASE_UPPER_SINGLE
} ChangeCase;
2336 | |
/* One parsed element, as filled in by expand_escape(). @type selects which
 * of the remaining fields is meaningful. */
struct _InterpolationData
{
  gchar     *text;        /* allocated string (literal text or reference name);
                           * released by free_interpolation_data() */
  gint       type;        /* one of the REPL_TYPE_* values */
  gint       num;         /* reference number for REPL_TYPE_NUMERIC_REFERENCE */
  gchar      c;           /* character for REPL_TYPE_CHARACTER */
  ChangeCase change_case; /* mode for REPL_TYPE_CHANGE_CASE */
};
2345 | |
2346 | static void |
2347 | free_interpolation_data (InterpolationData *data) |
2348 | { |
2349 | g_free (mem: data->text); |
2350 | g_free (mem: data); |
2351 | } |
2352 | |
2353 | static const gchar * |
2354 | expand_escape (const gchar *replacement, |
2355 | const gchar *p, |
2356 | InterpolationData *data, |
2357 | GError **error) |
2358 | { |
2359 | const gchar *q, *r; |
2360 | gint x, d, h, i; |
2361 | const gchar *error_detail; |
2362 | gint base = 0; |
2363 | GError *tmp_error = NULL; |
2364 | |
2365 | p++; |
2366 | switch (*p) |
2367 | { |
2368 | case 't': |
2369 | p++; |
2370 | data->c = '\t'; |
2371 | data->type = REPL_TYPE_CHARACTER; |
2372 | break; |
2373 | case 'n': |
2374 | p++; |
2375 | data->c = '\n'; |
2376 | data->type = REPL_TYPE_CHARACTER; |
2377 | break; |
2378 | case 'v': |
2379 | p++; |
2380 | data->c = '\v'; |
2381 | data->type = REPL_TYPE_CHARACTER; |
2382 | break; |
2383 | case 'r': |
2384 | p++; |
2385 | data->c = '\r'; |
2386 | data->type = REPL_TYPE_CHARACTER; |
2387 | break; |
2388 | case 'f': |
2389 | p++; |
2390 | data->c = '\f'; |
2391 | data->type = REPL_TYPE_CHARACTER; |
2392 | break; |
2393 | case 'a': |
2394 | p++; |
2395 | data->c = '\a'; |
2396 | data->type = REPL_TYPE_CHARACTER; |
2397 | break; |
2398 | case 'b': |
2399 | p++; |
2400 | data->c = '\b'; |
2401 | data->type = REPL_TYPE_CHARACTER; |
2402 | break; |
2403 | case '\\': |
2404 | p++; |
2405 | data->c = '\\'; |
2406 | data->type = REPL_TYPE_CHARACTER; |
2407 | break; |
2408 | case 'x': |
2409 | p++; |
2410 | x = 0; |
2411 | if (*p == '{') |
2412 | { |
2413 | p++; |
2414 | do |
2415 | { |
2416 | h = g_ascii_xdigit_value (c: *p); |
2417 | if (h < 0) |
2418 | { |
2419 | error_detail = _("hexadecimal digit or “}” expected" ); |
2420 | goto error; |
2421 | } |
2422 | x = x * 16 + h; |
2423 | p++; |
2424 | } |
2425 | while (*p != '}'); |
2426 | p++; |
2427 | } |
2428 | else |
2429 | { |
2430 | for (i = 0; i < 2; i++) |
2431 | { |
2432 | h = g_ascii_xdigit_value (c: *p); |
2433 | if (h < 0) |
2434 | { |
2435 | error_detail = _("hexadecimal digit expected" ); |
2436 | goto error; |
2437 | } |
2438 | x = x * 16 + h; |
2439 | p++; |
2440 | } |
2441 | } |
2442 | data->type = REPL_TYPE_STRING; |
2443 | data->text = g_new0 (gchar, 8); |
2444 | g_unichar_to_utf8 (c: x, outbuf: data->text); |
2445 | break; |
2446 | case 'l': |
2447 | p++; |
2448 | data->type = REPL_TYPE_CHANGE_CASE; |
2449 | data->change_case = CHANGE_CASE_LOWER_SINGLE; |
2450 | break; |
2451 | case 'u': |
2452 | p++; |
2453 | data->type = REPL_TYPE_CHANGE_CASE; |
2454 | data->change_case = CHANGE_CASE_UPPER_SINGLE; |
2455 | break; |
2456 | case 'L': |
2457 | p++; |
2458 | data->type = REPL_TYPE_CHANGE_CASE; |
2459 | data->change_case = CHANGE_CASE_LOWER; |
2460 | break; |
2461 | case 'U': |
2462 | p++; |
2463 | data->type = REPL_TYPE_CHANGE_CASE; |
2464 | data->change_case = CHANGE_CASE_UPPER; |
2465 | break; |
2466 | case 'E': |
2467 | p++; |
2468 | data->type = REPL_TYPE_CHANGE_CASE; |
2469 | data->change_case = CHANGE_CASE_NONE; |
2470 | break; |
2471 | case 'g': |
2472 | p++; |
2473 | if (*p != '<') |
2474 | { |
2475 | error_detail = _("missing “<” in symbolic reference" ); |
2476 | goto error; |
2477 | } |
2478 | q = p + 1; |
2479 | do |
2480 | { |
2481 | p++; |
2482 | if (!*p) |
2483 | { |
2484 | error_detail = _("unfinished symbolic reference" ); |
2485 | goto error; |
2486 | } |
2487 | } |
2488 | while (*p != '>'); |
2489 | if (p - q == 0) |
2490 | { |
2491 | error_detail = _("zero-length symbolic reference" ); |
2492 | goto error; |
2493 | } |
2494 | if (g_ascii_isdigit (*q)) |
2495 | { |
2496 | x = 0; |
2497 | do |
2498 | { |
2499 | h = g_ascii_digit_value (c: *q); |
2500 | if (h < 0) |
2501 | { |
2502 | error_detail = _("digit expected" ); |
2503 | p = q; |
2504 | goto error; |
2505 | } |
2506 | x = x * 10 + h; |
2507 | q++; |
2508 | } |
2509 | while (q != p); |
2510 | data->num = x; |
2511 | data->type = REPL_TYPE_NUMERIC_REFERENCE; |
2512 | } |
2513 | else |
2514 | { |
2515 | r = q; |
2516 | do |
2517 | { |
2518 | if (!g_ascii_isalnum (*r)) |
2519 | { |
2520 | error_detail = _("illegal symbolic reference" ); |
2521 | p = r; |
2522 | goto error; |
2523 | } |
2524 | r++; |
2525 | } |
2526 | while (r != p); |
2527 | data->text = g_strndup (str: q, n: p - q); |
2528 | data->type = REPL_TYPE_SYMBOLIC_REFERENCE; |
2529 | } |
2530 | p++; |
2531 | break; |
2532 | case '0': |
2533 | /* if \0 is followed by a number is an octal number representing a |
2534 | * character, else it is a numeric reference. */ |
2535 | if (g_ascii_digit_value (c: *g_utf8_next_char (p)) >= 0) |
2536 | { |
2537 | base = 8; |
2538 | p = g_utf8_next_char (p); |
2539 | } |
2540 | G_GNUC_FALLTHROUGH; |
2541 | case '1': |
2542 | case '2': |
2543 | case '3': |
2544 | case '4': |
2545 | case '5': |
2546 | case '6': |
2547 | case '7': |
2548 | case '8': |
2549 | case '9': |
2550 | x = 0; |
2551 | d = 0; |
2552 | for (i = 0; i < 3; i++) |
2553 | { |
2554 | h = g_ascii_digit_value (c: *p); |
2555 | if (h < 0) |
2556 | break; |
2557 | if (h > 7) |
2558 | { |
2559 | if (base == 8) |
2560 | break; |
2561 | else |
2562 | base = 10; |
2563 | } |
2564 | if (i == 2 && base == 10) |
2565 | break; |
2566 | x = x * 8 + h; |
2567 | d = d * 10 + h; |
2568 | p++; |
2569 | } |
2570 | if (base == 8 || i == 3) |
2571 | { |
2572 | data->type = REPL_TYPE_STRING; |
2573 | data->text = g_new0 (gchar, 8); |
2574 | g_unichar_to_utf8 (c: x, outbuf: data->text); |
2575 | } |
2576 | else |
2577 | { |
2578 | data->type = REPL_TYPE_NUMERIC_REFERENCE; |
2579 | data->num = d; |
2580 | } |
2581 | break; |
2582 | case 0: |
2583 | error_detail = _("stray final “\\”" ); |
2584 | goto error; |
2585 | break; |
2586 | default: |
2587 | error_detail = _("unknown escape sequence" ); |
2588 | goto error; |
2589 | } |
2590 | |
2591 | return p; |
2592 | |
2593 | error: |
2594 | /* G_GSSIZE_FORMAT doesn't work with gettext, so we use %lu */ |
2595 | tmp_error = g_error_new (G_REGEX_ERROR, |
2596 | code: G_REGEX_ERROR_REPLACE, |
2597 | _("Error while parsing replacement " |
2598 | "text “%s” at char %lu: %s" ), |
2599 | replacement, |
2600 | (gulong)(p - replacement), |
2601 | error_detail); |
2602 | g_propagate_error (dest: error, src: tmp_error); |
2603 | |
2604 | return NULL; |
2605 | } |
2606 | |
2607 | static GList * |
2608 | split_replacement (const gchar *replacement, |
2609 | GError **error) |
2610 | { |
2611 | GList *list = NULL; |
2612 | InterpolationData *data; |
2613 | const gchar *p, *start; |
2614 | |
2615 | start = p = replacement; |
2616 | while (*p) |
2617 | { |
2618 | if (*p == '\\') |
2619 | { |
2620 | data = g_new0 (InterpolationData, 1); |
2621 | start = p = expand_escape (replacement, p, data, error); |
2622 | if (p == NULL) |
2623 | { |
2624 | g_list_free_full (list, free_func: (GDestroyNotify) free_interpolation_data); |
2625 | free_interpolation_data (data); |
2626 | |
2627 | return NULL; |
2628 | } |
2629 | list = g_list_prepend (list, data); |
2630 | } |
2631 | else |
2632 | { |
2633 | p++; |
2634 | if (*p == '\\' || *p == '\0') |
2635 | { |
2636 | if (p - start > 0) |
2637 | { |
2638 | data = g_new0 (InterpolationData, 1); |
2639 | data->text = g_strndup (str: start, n: p - start); |
2640 | data->type = REPL_TYPE_STRING; |
2641 | list = g_list_prepend (list, data); |
2642 | } |
2643 | } |
2644 | } |
2645 | } |
2646 | |
2647 | return g_list_reverse (list); |
2648 | } |
2649 | |
/* Converts the character c according to change_case: lower case when any
 * bit of CHANGE_CASE_LOWER_MASK is set, upper case for every other value.
 */
#define CHANGE_CASE(c, change_case) \
        ((((change_case) & CHANGE_CASE_LOWER_MASK) != 0) \
           ? g_unichar_tolower (c) \
           : g_unichar_toupper (c))
2655 | |
2656 | static void |
2657 | string_append (GString *string, |
2658 | const gchar *text, |
2659 | ChangeCase *change_case) |
2660 | { |
2661 | gunichar c; |
2662 | |
2663 | if (text[0] == '\0') |
2664 | return; |
2665 | |
2666 | if (*change_case == CHANGE_CASE_NONE) |
2667 | { |
2668 | g_string_append (string, val: text); |
2669 | } |
2670 | else if (*change_case & CHANGE_CASE_SINGLE_MASK) |
2671 | { |
2672 | c = g_utf8_get_char (p: text); |
2673 | g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); |
2674 | g_string_append (string, g_utf8_next_char (text)); |
2675 | *change_case = CHANGE_CASE_NONE; |
2676 | } |
2677 | else |
2678 | { |
2679 | while (*text != '\0') |
2680 | { |
2681 | c = g_utf8_get_char (p: text); |
2682 | g_string_append_unichar (string, CHANGE_CASE (c, *change_case)); |
2683 | text = g_utf8_next_char (text); |
2684 | } |
2685 | } |
2686 | } |
2687 | |
2688 | static gboolean |
2689 | interpolate_replacement (const GMatchInfo *match_info, |
2690 | GString *result, |
2691 | gpointer data) |
2692 | { |
2693 | GList *list; |
2694 | InterpolationData *idata; |
2695 | gchar *match; |
2696 | ChangeCase change_case = CHANGE_CASE_NONE; |
2697 | |
2698 | for (list = data; list; list = list->next) |
2699 | { |
2700 | idata = list->data; |
2701 | switch (idata->type) |
2702 | { |
2703 | case REPL_TYPE_STRING: |
2704 | string_append (string: result, text: idata->text, change_case: &change_case); |
2705 | break; |
2706 | case REPL_TYPE_CHARACTER: |
2707 | g_string_append_c (result, CHANGE_CASE (idata->c, change_case)); |
2708 | if (change_case & CHANGE_CASE_SINGLE_MASK) |
2709 | change_case = CHANGE_CASE_NONE; |
2710 | break; |
2711 | case REPL_TYPE_NUMERIC_REFERENCE: |
2712 | match = g_match_info_fetch (match_info, match_num: idata->num); |
2713 | if (match) |
2714 | { |
2715 | string_append (string: result, text: match, change_case: &change_case); |
2716 | g_free (mem: match); |
2717 | } |
2718 | break; |
2719 | case REPL_TYPE_SYMBOLIC_REFERENCE: |
2720 | match = g_match_info_fetch_named (match_info, name: idata->text); |
2721 | if (match) |
2722 | { |
2723 | string_append (string: result, text: match, change_case: &change_case); |
2724 | g_free (mem: match); |
2725 | } |
2726 | break; |
2727 | case REPL_TYPE_CHANGE_CASE: |
2728 | change_case = idata->change_case; |
2729 | break; |
2730 | } |
2731 | } |
2732 | |
2733 | return FALSE; |
2734 | } |
2735 | |
2736 | /* whether actual match_info is needed for replacement, i.e. |
2737 | * whether there are references |
2738 | */ |
2739 | static gboolean |
2740 | interpolation_list_needs_match (GList *list) |
2741 | { |
2742 | while (list != NULL) |
2743 | { |
2744 | InterpolationData *data = list->data; |
2745 | |
2746 | if (data->type == REPL_TYPE_SYMBOLIC_REFERENCE || |
2747 | data->type == REPL_TYPE_NUMERIC_REFERENCE) |
2748 | { |
2749 | return TRUE; |
2750 | } |
2751 | |
2752 | list = list->next; |
2753 | } |
2754 | |
2755 | return FALSE; |
2756 | } |
2757 | |
2758 | /** |
2759 | * g_regex_replace: |
2760 | * @regex: a #GRegex structure |
2761 | * @string: (array length=string_len): the string to perform matches against |
2762 | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2763 | * @start_position: starting index of the string to match, in bytes |
2764 | * @replacement: text to replace each match with |
2765 | * @match_options: options for the match |
2766 | * @error: location to store the error occurring, or %NULL to ignore errors |
2767 | * |
2768 | * Replaces all occurrences of the pattern in @regex with the |
2769 | * replacement text. Backreferences of the form '\number' or |
2770 | * '\g<number>' in the replacement text are interpolated by the |
2771 | * number-th captured subexpression of the match, '\g<name>' refers |
2772 | * to the captured subexpression with the given name. '\0' refers |
2773 | * to the complete match, but '\0' followed by a number is the octal |
2774 | * representation of a character. To include a literal '\' in the |
2775 | * replacement, write '\\\\'. |
2776 | * |
2777 | * There are also escapes that changes the case of the following text: |
2778 | * |
2779 | * - \l: Convert to lower case the next character |
2780 | * - \u: Convert to upper case the next character |
2781 | * - \L: Convert to lower case till \E |
2782 | * - \U: Convert to upper case till \E |
2783 | * - \E: End case modification |
2784 | * |
2785 | * If you do not need to use backreferences use g_regex_replace_literal(). |
2786 | * |
2787 | * The @replacement string must be UTF-8 encoded even if #G_REGEX_RAW was |
2788 | * passed to g_regex_new(). If you want to use not UTF-8 encoded strings |
2789 | * you can use g_regex_replace_literal(). |
2790 | * |
2791 | * Setting @start_position differs from just passing over a shortened |
2792 | * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern that |
2793 | * begins with any kind of lookbehind assertion, such as "\b". |
2794 | * |
2795 | * Returns: a newly allocated string containing the replacements |
2796 | * |
2797 | * Since: 2.14 |
2798 | */ |
2799 | gchar * |
2800 | g_regex_replace (const GRegex *regex, |
2801 | const gchar *string, |
2802 | gssize string_len, |
2803 | gint start_position, |
2804 | const gchar *replacement, |
2805 | GRegexMatchFlags match_options, |
2806 | GError **error) |
2807 | { |
2808 | gchar *result; |
2809 | GList *list; |
2810 | GError *tmp_error = NULL; |
2811 | |
2812 | g_return_val_if_fail (regex != NULL, NULL); |
2813 | g_return_val_if_fail (string != NULL, NULL); |
2814 | g_return_val_if_fail (start_position >= 0, NULL); |
2815 | g_return_val_if_fail (replacement != NULL, NULL); |
2816 | g_return_val_if_fail (error == NULL || *error == NULL, NULL); |
2817 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
2818 | |
2819 | list = split_replacement (replacement, error: &tmp_error); |
2820 | if (tmp_error != NULL) |
2821 | { |
2822 | g_propagate_error (dest: error, src: tmp_error); |
2823 | return NULL; |
2824 | } |
2825 | |
2826 | result = g_regex_replace_eval (regex, |
2827 | string, string_len, start_position, |
2828 | match_options, |
2829 | eval: interpolate_replacement, |
2830 | user_data: (gpointer)list, |
2831 | error: &tmp_error); |
2832 | if (tmp_error != NULL) |
2833 | g_propagate_error (dest: error, src: tmp_error); |
2834 | |
2835 | g_list_free_full (list, free_func: (GDestroyNotify) free_interpolation_data); |
2836 | |
2837 | return result; |
2838 | } |
2839 | |
2840 | static gboolean |
2841 | literal_replacement (const GMatchInfo *match_info, |
2842 | GString *result, |
2843 | gpointer data) |
2844 | { |
2845 | g_string_append (string: result, val: data); |
2846 | return FALSE; |
2847 | } |
2848 | |
2849 | /** |
2850 | * g_regex_replace_literal: |
2851 | * @regex: a #GRegex structure |
2852 | * @string: (array length=string_len): the string to perform matches against |
2853 | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2854 | * @start_position: starting index of the string to match, in bytes |
2855 | * @replacement: text to replace each match with |
2856 | * @match_options: options for the match |
2857 | * @error: location to store the error occurring, or %NULL to ignore errors |
2858 | * |
2859 | * Replaces all occurrences of the pattern in @regex with the |
2860 | * replacement text. @replacement is replaced literally, to |
2861 | * include backreferences use g_regex_replace(). |
2862 | * |
2863 | * Setting @start_position differs from just passing over a |
2864 | * shortened string and setting #G_REGEX_MATCH_NOTBOL in the |
2865 | * case of a pattern that begins with any kind of lookbehind |
2866 | * assertion, such as "\b". |
2867 | * |
2868 | * Returns: a newly allocated string containing the replacements |
2869 | * |
2870 | * Since: 2.14 |
2871 | */ |
2872 | gchar * |
2873 | g_regex_replace_literal (const GRegex *regex, |
2874 | const gchar *string, |
2875 | gssize string_len, |
2876 | gint start_position, |
2877 | const gchar *replacement, |
2878 | GRegexMatchFlags match_options, |
2879 | GError **error) |
2880 | { |
2881 | g_return_val_if_fail (replacement != NULL, NULL); |
2882 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
2883 | |
2884 | return g_regex_replace_eval (regex, |
2885 | string, string_len, start_position, |
2886 | match_options, |
2887 | eval: literal_replacement, |
2888 | user_data: (gpointer)replacement, |
2889 | error); |
2890 | } |
2891 | |
2892 | /** |
2893 | * g_regex_replace_eval: |
2894 | * @regex: a #GRegex structure from g_regex_new() |
2895 | * @string: (array length=string_len): string to perform matches against |
2896 | * @string_len: the length of @string, in bytes, or -1 if @string is nul-terminated |
2897 | * @start_position: starting index of the string to match, in bytes |
2898 | * @match_options: options for the match |
2899 | * @eval: a function to call for each match |
2900 | * @user_data: user data to pass to the function |
2901 | * @error: location to store the error occurring, or %NULL to ignore errors |
2902 | * |
2903 | * Replaces occurrences of the pattern in regex with the output of |
2904 | * @eval for that occurrence. |
2905 | * |
2906 | * Setting @start_position differs from just passing over a shortened |
2907 | * string and setting #G_REGEX_MATCH_NOTBOL in the case of a pattern |
2908 | * that begins with any kind of lookbehind assertion, such as "\b". |
2909 | * |
2910 | * The following example uses g_regex_replace_eval() to replace multiple |
2911 | * strings at once: |
2912 | * |[<!-- language="C" --> |
2913 | * static gboolean |
2914 | * eval_cb (const GMatchInfo *info, |
2915 | * GString *res, |
2916 | * gpointer data) |
2917 | * { |
2918 | * gchar *match; |
2919 | * gchar *r; |
2920 | * |
2921 | * match = g_match_info_fetch (info, 0); |
2922 | * r = g_hash_table_lookup ((GHashTable *)data, match); |
2923 | * g_string_append (res, r); |
2924 | * g_free (match); |
2925 | * |
2926 | * return FALSE; |
2927 | * } |
2928 | * |
2929 | * ... |
2930 | * |
2931 | * GRegex *reg; |
2932 | * GHashTable *h; |
2933 | * gchar *res; |
2934 | * |
2935 | * h = g_hash_table_new (g_str_hash, g_str_equal); |
2936 | * |
2937 | * g_hash_table_insert (h, "1", "ONE"); |
2938 | * g_hash_table_insert (h, "2", "TWO"); |
2939 | * g_hash_table_insert (h, "3", "THREE"); |
2940 | * g_hash_table_insert (h, "4", "FOUR"); |
2941 | * |
2942 | * reg = g_regex_new ("1|2|3|4", 0, 0, NULL); |
2943 | * res = g_regex_replace_eval (reg, text, -1, 0, 0, eval_cb, h, NULL); |
2944 | * g_hash_table_destroy (h); |
2945 | * |
2946 | * ... |
2947 | * ]| |
2948 | * |
2949 | * Returns: a newly allocated string containing the replacements |
2950 | * |
2951 | * Since: 2.14 |
2952 | */ |
2953 | gchar * |
2954 | g_regex_replace_eval (const GRegex *regex, |
2955 | const gchar *string, |
2956 | gssize string_len, |
2957 | gint start_position, |
2958 | GRegexMatchFlags match_options, |
2959 | GRegexEvalCallback eval, |
2960 | gpointer user_data, |
2961 | GError **error) |
2962 | { |
2963 | GMatchInfo *match_info; |
2964 | GString *result; |
2965 | gint str_pos = 0; |
2966 | gboolean done = FALSE; |
2967 | GError *tmp_error = NULL; |
2968 | |
2969 | g_return_val_if_fail (regex != NULL, NULL); |
2970 | g_return_val_if_fail (string != NULL, NULL); |
2971 | g_return_val_if_fail (start_position >= 0, NULL); |
2972 | g_return_val_if_fail (eval != NULL, NULL); |
2973 | g_return_val_if_fail ((match_options & ~G_REGEX_MATCH_MASK) == 0, NULL); |
2974 | |
2975 | if (string_len < 0) |
2976 | string_len = strlen (s: string); |
2977 | |
2978 | result = g_string_sized_new (dfl_size: string_len); |
2979 | |
2980 | /* run down the string making matches. */ |
2981 | g_regex_match_full (regex, string, string_len, start_position, |
2982 | match_options, match_info: &match_info, error: &tmp_error); |
2983 | while (!done && g_match_info_matches (match_info)) |
2984 | { |
2985 | g_string_append_len (string: result, |
2986 | val: string + str_pos, |
2987 | len: match_info->offsets[0] - str_pos); |
2988 | done = (*eval) (match_info, result, user_data); |
2989 | str_pos = match_info->offsets[1]; |
2990 | g_match_info_next (match_info, error: &tmp_error); |
2991 | } |
2992 | g_match_info_free (match_info); |
2993 | if (tmp_error != NULL) |
2994 | { |
2995 | g_propagate_error (dest: error, src: tmp_error); |
2996 | g_string_free (string: result, TRUE); |
2997 | return NULL; |
2998 | } |
2999 | |
3000 | g_string_append_len (string: result, val: string + str_pos, len: string_len - str_pos); |
3001 | return g_string_free (string: result, FALSE); |
3002 | } |
3003 | |
3004 | /** |
3005 | * g_regex_check_replacement: |
3006 | * @replacement: the replacement string |
3007 | * @has_references: (out) (optional): location to store information about |
3008 | * references in @replacement or %NULL |
3009 | * @error: location to store error |
3010 | * |
3011 | * Checks whether @replacement is a valid replacement string |
3012 | * (see g_regex_replace()), i.e. that all escape sequences in |
3013 | * it are valid. |
3014 | * |
3015 | * If @has_references is not %NULL then @replacement is checked |
3016 | * for pattern references. For instance, replacement text 'foo\n' |
3017 | * does not contain references and may be evaluated without information |
3018 | * about actual match, but '\0\1' (whole match followed by first |
3019 | * subpattern) requires valid #GMatchInfo object. |
3020 | * |
3021 | * Returns: whether @replacement is a valid replacement string |
3022 | * |
3023 | * Since: 2.14 |
3024 | */ |
3025 | gboolean |
3026 | g_regex_check_replacement (const gchar *replacement, |
3027 | gboolean *has_references, |
3028 | GError **error) |
3029 | { |
3030 | GList *list; |
3031 | GError *tmp = NULL; |
3032 | |
3033 | list = split_replacement (replacement, error: &tmp); |
3034 | |
3035 | if (tmp) |
3036 | { |
3037 | g_propagate_error (dest: error, src: tmp); |
3038 | return FALSE; |
3039 | } |
3040 | |
3041 | if (has_references) |
3042 | *has_references = interpolation_list_needs_match (list); |
3043 | |
3044 | g_list_free_full (list, free_func: (GDestroyNotify) free_interpolation_data); |
3045 | |
3046 | return TRUE; |
3047 | } |
3048 | |
3049 | /** |
3050 | * g_regex_escape_nul: |
3051 | * @string: the string to escape |
3052 | * @length: the length of @string |
3053 | * |
3054 | * Escapes the nul characters in @string to "\x00". It can be used |
3055 | * to compile a regex with embedded nul characters. |
3056 | * |
3057 | * For completeness, @length can be -1 for a nul-terminated string. |
3058 | * In this case the output string will be of course equal to @string. |
3059 | * |
3060 | * Returns: a newly-allocated escaped string |
3061 | * |
3062 | * Since: 2.30 |
3063 | */ |
3064 | gchar * |
3065 | g_regex_escape_nul (const gchar *string, |
3066 | gint length) |
3067 | { |
3068 | GString *escaped; |
3069 | const gchar *p, *piece_start, *end; |
3070 | gint backslashes; |
3071 | |
3072 | g_return_val_if_fail (string != NULL, NULL); |
3073 | |
3074 | if (length < 0) |
3075 | return g_strdup (str: string); |
3076 | |
3077 | end = string + length; |
3078 | p = piece_start = string; |
3079 | escaped = g_string_sized_new (dfl_size: length + 1); |
3080 | |
3081 | backslashes = 0; |
3082 | while (p < end) |
3083 | { |
3084 | switch (*p) |
3085 | { |
3086 | case '\0': |
3087 | if (p != piece_start) |
3088 | { |
3089 | /* copy the previous piece. */ |
3090 | g_string_append_len (string: escaped, val: piece_start, len: p - piece_start); |
3091 | } |
3092 | if ((backslashes & 1) == 0) |
3093 | g_string_append_c (escaped, '\\'); |
3094 | g_string_append_c (escaped, 'x'); |
3095 | g_string_append_c (escaped, '0'); |
3096 | g_string_append_c (escaped, '0'); |
3097 | piece_start = ++p; |
3098 | backslashes = 0; |
3099 | break; |
3100 | case '\\': |
3101 | backslashes++; |
3102 | ++p; |
3103 | break; |
3104 | default: |
3105 | backslashes = 0; |
3106 | p = g_utf8_next_char (p); |
3107 | break; |
3108 | } |
3109 | } |
3110 | |
3111 | if (piece_start < end) |
3112 | g_string_append_len (string: escaped, val: piece_start, len: end - piece_start); |
3113 | |
3114 | return g_string_free (string: escaped, FALSE); |
3115 | } |
3116 | |
3117 | /** |
3118 | * g_regex_escape_string: |
3119 | * @string: (array length=length): the string to escape |
3120 | * @length: the length of @string, in bytes, or -1 if @string is nul-terminated |
3121 | * |
3122 | * Escapes the special characters used for regular expressions |
3123 | * in @string, for instance "a.b*c" becomes "a\.b\*c". This |
3124 | * function is useful to dynamically generate regular expressions. |
3125 | * |
3126 | * @string can contain nul characters that are replaced with "\0", |
3127 | * in this case remember to specify the correct length of @string |
3128 | * in @length. |
3129 | * |
3130 | * Returns: a newly-allocated escaped string |
3131 | * |
3132 | * Since: 2.14 |
3133 | */ |
3134 | gchar * |
3135 | g_regex_escape_string (const gchar *string, |
3136 | gint length) |
3137 | { |
3138 | GString *escaped; |
3139 | const char *p, *piece_start, *end; |
3140 | |
3141 | g_return_val_if_fail (string != NULL, NULL); |
3142 | |
3143 | if (length < 0) |
3144 | length = strlen (s: string); |
3145 | |
3146 | end = string + length; |
3147 | p = piece_start = string; |
3148 | escaped = g_string_sized_new (dfl_size: length + 1); |
3149 | |
3150 | while (p < end) |
3151 | { |
3152 | switch (*p) |
3153 | { |
3154 | case '\0': |
3155 | case '\\': |
3156 | case '|': |
3157 | case '(': |
3158 | case ')': |
3159 | case '[': |
3160 | case ']': |
3161 | case '{': |
3162 | case '}': |
3163 | case '^': |
3164 | case '$': |
3165 | case '*': |
3166 | case '+': |
3167 | case '?': |
3168 | case '.': |
3169 | if (p != piece_start) |
3170 | /* copy the previous piece. */ |
3171 | g_string_append_len (string: escaped, val: piece_start, len: p - piece_start); |
3172 | g_string_append_c (escaped, '\\'); |
3173 | if (*p == '\0') |
3174 | g_string_append_c (escaped, '0'); |
3175 | else |
3176 | g_string_append_c (escaped, *p); |
3177 | piece_start = ++p; |
3178 | break; |
3179 | default: |
3180 | p = g_utf8_next_char (p); |
3181 | break; |
3182 | } |
3183 | } |
3184 | |
3185 | if (piece_start < end) |
3186 | g_string_append_len (string: escaped, val: piece_start, len: end - piece_start); |
3187 | |
3188 | return g_string_free (string: escaped, FALSE); |
3189 | } |
3190 | |