1 | // Copyright (C) 2016 The Qt Company Ltd. |
2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
3 | |
4 | #include "qregexp.h" |
5 | |
6 | #include "qalgorithms.h" |
7 | #include "qbitarray.h" |
8 | #include "qcache.h" |
9 | #include "qdatastream.h" |
10 | #include "qdebug.h" |
11 | #include "qhashfunctions.h" |
12 | #include "qlist.h" |
13 | #include "qmap.h" |
14 | #include "qmutex.h" |
15 | #include "qstring.h" |
16 | #include "qstringlist.h" |
17 | #include "qstringmatcher.h" |
18 | #include "private/qlocking_p.h" |
19 | #include "qvarlengtharray.h" |
20 | |
21 | #include <limits.h> |
22 | #include <algorithm> |
23 | |
24 | QT_BEGIN_NAMESPACE |
25 | |
// Messages reported by the regexp parser. Every entry expands to
// QT_TRANSLATE_NOOP with the "QRegExp" context, so the literal is marked for
// translation rather than used as a bare string.
#define RXERR_OK         QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
#define RXERR_DISABLED   QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
#define RXERR_CHARCLASS  QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
#define RXERR_LOOKAHEAD  QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
#define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
#define RXERR_OCTAL      QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
#define RXERR_LEFTDELIM  QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
#define RXERR_END        QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
#define RXERR_LIMIT      QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
#define RXERR_INTERVAL   QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
#define RXERR_CATEGORY   QT_TRANSLATE_NOOP("QRegExp", "invalid category")
39 | |
40 | /*! |
41 | \class QRegExp |
42 | \inmodule QtCore5Compat |
43 | \reentrant |
44 | \brief The QRegExp class provides pattern matching using regular expressions. |
45 | |
46 | \ingroup tools |
47 | \ingroup shared |
48 | |
49 | \keyword regular expression |
50 | |
This class is deprecated in Qt 6. Please use QRegularExpression instead
for all new code. For guidelines on porting old code from QRegExp to
QRegularExpression, see \l{Porting to QRegularExpression}.
54 | |
55 | A regular expression, or "regexp", is a pattern for matching |
56 | substrings in a text. This is useful in many contexts, e.g., |
57 | |
58 | \table |
59 | \row \li Validation |
60 | \li A regexp can test whether a substring meets some criteria, |
61 | e.g. is an integer or contains no whitespace. |
62 | \row \li Searching |
63 | \li A regexp provides more powerful pattern matching than |
64 | simple substring matching, e.g., match one of the words |
65 | \e{mail}, \e{letter} or \e{correspondence}, but none of the |
66 | words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc. |
67 | \row \li Search and Replace |
\li A regexp can replace all occurrences of a substring with a
different substring, e.g., replace all occurrences of \e{&}
with \e{\&amp;} except where the \e{&} is already followed by
an \e{amp;}.
72 | \row \li String Splitting |
73 | \li A regexp can be used to identify where a string should be |
74 | split apart, e.g. splitting tab-delimited strings. |
75 | \endtable |
76 | |
This documentation presents a brief introduction to regexps, a
description of Qt's regexp language, some examples, and the function
documentation itself. QRegExp is modeled on Perl's regexp
80 | language. It fully supports Unicode. QRegExp can also be used in a |
81 | simpler, \e{wildcard mode} that is similar to the functionality |
82 | found in command shells. The syntax rules used by QRegExp can be |
83 | changed with setPatternSyntax(). In particular, the pattern syntax |
84 | can be set to QRegExp::FixedString, which means the pattern to be |
85 | matched is interpreted as a plain string, i.e., special characters |
86 | (e.g., backslash) are not escaped. |
87 | |
88 | A good text on regexps is \e {Mastering Regular Expressions} |
89 | (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4. |
90 | |
91 | \note In Qt 5, the new QRegularExpression class provides a Perl |
92 | compatible implementation of regular expressions and is recommended |
93 | in place of QRegExp. |
94 | |
95 | \section1 Introduction |
96 | |
97 | Regexps are built up from expressions, quantifiers, and |
98 | assertions. The simplest expression is a character, e.g. \b{x} |
99 | or \b{5}. An expression can also be a set of characters |
100 | enclosed in square brackets. \b{[ABCD]} will match an \b{A} |
101 | or a \b{B} or a \b{C} or a \b{D}. We can write this same |
102 | expression as \b{[A-D]}, and an expression to match any |
103 | capital letter in the English alphabet is written as |
104 | \b{[A-Z]}. |
105 | |
106 | A quantifier specifies the number of occurrences of an expression |
107 | that must be matched. \b{x{1,1}} means match one and only one |
108 | \b{x}. \b{x{1,5}} means match a sequence of \b{x} |
109 | characters that contains at least one \b{x} but no more than |
110 | five. |
111 | |
112 | Note that in general regexps cannot be used to check for balanced |
113 | brackets or tags. For example, a regexp can be written to match an |
114 | opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags |
115 | are not nested, but if the \c{<b>} tags are nested, that same |
116 | regexp will match an opening \c{<b>} tag with the wrong closing |
117 | \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the |
118 | first \c{<b>} would be matched with the first \c{</b>}, which is |
119 | not correct. However, it is possible to write a regexp that will |
120 | match nested brackets or tags correctly, but only if the number of |
121 | nesting levels is fixed and known. If the number of nesting levels |
122 | is not fixed and known, it is impossible to write a regexp that |
123 | will not fail. |
124 | |
125 | Suppose we want a regexp to match integers in the range 0 to 99. |
126 | At least one digit is required, so we start with the expression |
127 | \b{[0-9]{1,1}}, which matches a single digit exactly once. This |
128 | regexp matches integers in the range 0 to 9. To match integers up |
129 | to 99, increase the maximum number of occurrences to 2, so the |
130 | regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the |
131 | original requirement to match integers from 0 to 99, but it will |
132 | also match integers that occur in the middle of strings. If we |
133 | want the matched integer to be the whole string, we must use the |
134 | anchor assertions, \b{^} (caret) and \b{$} (dollar). When |
135 | \b{^} is the first character in a regexp, it means the regexp |
136 | must match from the beginning of the string. When \b{$} is the |
137 | last character of the regexp, it means the regexp must match to |
138 | the end of the string. The regexp becomes \b{^[0-9]{1,2}$}. |
139 | Note that assertions, e.g. \b{^} and \b{$}, do not match |
140 | characters but locations in the string. |
141 | |
142 | If you have seen regexps described elsewhere, they may have looked |
143 | different from the ones shown here. This is because some sets of |
144 | characters and some quantifiers are so common that they have been |
145 | given special symbols to represent them. \b{[0-9]} can be |
146 | replaced with the symbol \b{\\d}. The quantifier to match |
147 | exactly one occurrence, \b{{1,1}}, can be replaced with the |
148 | expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So |
149 | our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can |
150 | also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of |
151 | the string, match a digit, followed immediately by 0 or 1 digits}. |
152 | In practice, it would be written as \b{^\\d\\d?$}. The \b{?} |
153 | is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1 |
154 | occurrences. \b{?} makes an expression optional. The regexp |
155 | \b{^\\d\\d?$} means \e{From the beginning of the string, match |
156 | one digit, followed immediately by 0 or 1 more digit, followed |
157 | immediately by end of string}. |
158 | |
159 | To write a regexp that matches one of the words 'mail' \e or |
160 | 'letter' \e or 'correspondence' but does not match words that |
161 | contain these words, e.g., 'email', 'mailman', 'mailer', and |
162 | 'letterbox', start with a regexp that matches 'mail'. Expressed |
163 | fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because |
164 | a character expression is automatically quantified by |
165 | \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an |
166 | 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now |
167 | we can use the vertical bar \b{|}, which means \b{or}, to |
168 | include the other two words, so our regexp for matching any of the |
169 | three words becomes \b{mail|letter|correspondence}. Match |
170 | 'mail' \b{or} 'letter' \b{or} 'correspondence'. While this |
171 | regexp will match one of the three words we want to match, it will |
172 | also match words we don't want to match, e.g., 'email'. To |
173 | prevent the regexp from matching unwanted words, we must tell it |
174 | to begin and end the match at word boundaries. First we enclose |
175 | our regexp in parentheses, \b{(mail|letter|correspondence)}. |
176 | Parentheses group expressions together, and they identify a part |
177 | of the regexp that we wish to \l{capturing text}{capture}. |
178 | Enclosing the expression in parentheses allows us to use it as a |
179 | component in more complex regexps. It also allows us to examine |
180 | which of the three words was actually matched. To force the match |
181 | to begin and end on word boundaries, we enclose the regexp in |
182 | \b{\\b} \e{word boundary} assertions: |
183 | \b{\\b(mail|letter|correspondence)\\b}. Now the regexp means: |
184 | \e{Match a word boundary, followed by the regexp in parentheses, |
185 | followed by a word boundary}. The \b{\\b} assertion matches a |
186 | \e position in the regexp, not a \e character. A word boundary is |
187 | any non-word character, e.g., a space, newline, or the beginning |
188 | or ending of a string. |
189 | |
If we want to replace ampersand characters with the HTML entity
\b{\&amp;}, the regexp to match is simply \b{\&}. But this
192 | regexp will also match ampersands that have already been converted |
193 | to HTML entities. We want to replace only ampersands that are not |
194 | already followed by \b{amp;}. For this, we need the negative |
195 | lookahead assertion, \b{(?!}__\b{)}. The regexp can then be |
196 | written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is} |
197 | \b{not} \e{followed by} \b{amp;}. |
198 | |
199 | If we want to count all the occurrences of 'Eric' and 'Eirik' in a |
200 | string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and |
201 | \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is |
202 | required to avoid matching words that contain either name, |
203 | e.g. 'Ericsson'. Note that the second regexp matches more |
204 | spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'. |
205 | |
206 | Some of the examples discussed above are implemented in the |
207 | \l{#code-examples}{code examples} section. |
208 | |
209 | \target characters-and-abbreviations-for-sets-of-characters |
210 | \section1 Characters and Abbreviations for Sets of Characters |
211 | |
212 | \table |
213 | \header \li Element \li Meaning |
214 | \row \li \b{c} |
215 | \li A character represents itself unless it has a special |
216 | regexp meaning. e.g. \b{c} matches the character \e c. |
217 | \row \li \b{\\c} |
218 | \li A character that follows a backslash matches the character |
219 | itself, except as specified below. e.g., To match a literal |
220 | caret at the beginning of a string, write \b{\\^}. |
221 | \row \li \b{\\a} |
222 | \li Matches the ASCII bell (BEL, 0x07). |
223 | \row \li \b{\\f} |
224 | \li Matches the ASCII form feed (FF, 0x0C). |
225 | \row \li \b{\\n} |
226 | \li Matches the ASCII line feed (LF, 0x0A, Unix newline). |
227 | \row \li \b{\\r} |
228 | \li Matches the ASCII carriage return (CR, 0x0D). |
229 | \row \li \b{\\t} |
230 | \li Matches the ASCII horizontal tab (HT, 0x09). |
231 | \row \li \b{\\v} |
232 | \li Matches the ASCII vertical tab (VT, 0x0B). |
233 | \row \li \b{\\x\e{hhhh}} |
234 | \li Matches the Unicode character corresponding to the |
235 | hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF). |
236 | \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo}) |
237 | \li matches the ASCII/Latin1 character for the octal number |
238 | \e{ooo} (between 0 and 0377). |
239 | \row \li \b{. (dot)} |
240 | \li Matches any character (including newline). |
241 | \row \li \b{\\d} |
242 | \li Matches a digit (QChar::isDigit()). |
243 | \row \li \b{\\D} |
244 | \li Matches a non-digit. |
245 | \row \li \b{\\s} |
246 | \li Matches a whitespace character (QChar::isSpace()). |
247 | \row \li \b{\\S} |
248 | \li Matches a non-whitespace character. |
249 | \row \li \b{\\w} |
250 | \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_'). |
251 | \row \li \b{\\W} |
252 | \li Matches a non-word character. |
253 | \row \li \b{\\\e{n}} |
254 | \li The \e{n}-th backreference, e.g. \\1, \\2, etc. |
255 | \endtable |
256 | |
257 | \b{Note:} The C++ compiler transforms backslashes in strings. |
258 | To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}. |
259 | To match the backslash character itself, enter it four times, i.e. |
260 | \c{\\\\}. |
261 | |
262 | \target sets-of-characters |
263 | \section1 Sets of Characters |
264 | |
265 | Square brackets mean match any character contained in the square |
266 | brackets. The character set abbreviations described above can |
267 | appear in a character set in square brackets. Except for the |
268 | character set abbreviations and the following two exceptions, |
269 | characters do not have special meanings in square brackets. |
270 | |
271 | \table |
272 | \row \li \b{^} |
273 | |
274 | \li The caret negates the character set if it occurs as the |
275 | first character (i.e. immediately after the opening square |
276 | bracket). \b{[abc]} matches 'a' or 'b' or 'c', but |
277 | \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'. |
278 | |
279 | \row \li \b{-} |
280 | |
281 | \li The dash indicates a range of characters. \b{[W-Z]} |
282 | matches 'W' or 'X' or 'Y' or 'Z'. |
283 | |
284 | \endtable |
285 | |
286 | Using the predefined character set abbreviations is more portable |
287 | than using character ranges across platforms and languages. For |
288 | example, \b{[0-9]} matches a digit in Western alphabets but |
289 | \b{\\d} matches a digit in \e any alphabet. |
290 | |
291 | Note: In other regexp documentation, sets of characters are often |
292 | called "character classes". |
293 | |
294 | \target quantifiers |
295 | \section1 Quantifiers |
296 | |
297 | By default, an expression is automatically quantified by |
298 | \b{{1,1}}, i.e. it should occur exactly once. In the following |
299 | list, \b{\e {E}} stands for expression. An expression is a |
300 | character, or an abbreviation for a set of characters, or a set of |
301 | characters in square brackets, or an expression in parentheses. |
302 | |
303 | \table |
304 | \row \li \b{\e {E}?} |
305 | |
\li Matches zero or one occurrence of \e E. This quantifier
307 | means \e{The previous expression is optional}, because it |
308 | will match whether or not the expression is found. \b{\e |
309 | {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?} |
310 | matches 'dent' or 'dents'. |
311 | |
312 | \row \li \b{\e {E}+} |
313 | |
314 | \li Matches one or more occurrences of \e E. \b{\e {E}+} is |
315 | the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0', |
316 | '00', '000', etc. |
317 | |
318 | \row \li \b{\e {E}*} |
319 | |
320 | \li Matches zero or more occurrences of \e E. It is the same |
321 | as \b{\e {E}{0,}}. The \b{*} quantifier is often used |
322 | in error where \b{+} should be used. For example, if |
323 | \b{\\s*$} is used in an expression to match strings that |
324 | end in whitespace, it will match every string because |
325 | \b{\\s*$} means \e{Match zero or more whitespaces followed |
326 | by end of string}. The correct regexp to match strings that |
327 | have at least one trailing whitespace character is |
328 | \b{\\s+$}. |
329 | |
330 | \row \li \b{\e {E}{n}} |
331 | |
332 | \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}} |
333 | is the same as repeating \e E \e n times. For example, |
334 | \b{x{5}} is the same as \b{xxxxx}. It is also the same |
335 | as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}. |
336 | |
337 | \row \li \b{\e {E}{n,}} |
338 | \li Matches at least \e n occurrences of \e E. |
339 | |
340 | \row \li \b{\e {E}{,m}} |
341 | \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}} |
342 | is the same as \b{\e {E}{0,m}}. |
343 | |
344 | \row \li \b{\e {E}{n,m}} |
345 | \li Matches at least \e n and at most \e m occurrences of \e E. |
346 | \endtable |
347 | |
348 | To apply a quantifier to more than just the preceding character, |
349 | use parentheses to group characters together in an expression. For |
350 | example, \b{tag+} matches a 't' followed by an 'a' followed by |
351 | at least one 'g', whereas \b{(tag)+} matches at least one |
352 | occurrence of 'tag'. |
353 | |
354 | Note: Quantifiers are normally "greedy". They always match as much |
355 | text as they can. For example, \b{0+} matches the first zero it |
356 | finds and all the consecutive zeros after the first zero. Applied |
357 | to '20005', it matches '2\underline{000}5'. Quantifiers can be made |
358 | non-greedy, see setMinimal(). |
359 | |
360 | \target capturing parentheses |
361 | \target backreferences |
362 | \section1 Capturing Text |
363 | |
364 | Parentheses allow us to group elements together so that we can |
365 | quantify and capture them. For example if we have the expression |
366 | \b{mail|letter|correspondence} that matches a string we know |
367 | that \e one of the words matched but not which one. Using |
368 | parentheses allows us to "capture" whatever is matched within |
369 | their bounds, so if we used \b{(mail|letter|correspondence)} |
370 | and matched this regexp against the string "I sent you some email" |
371 | we can use the cap() or capturedTexts() functions to extract the |
372 | matched characters, in this case 'mail'. |
373 | |
374 | We can use captured text within the regexp itself. To refer to the |
375 | captured text we use \e backreferences which are indexed from 1, |
376 | the same as for cap(). For example we could search for duplicate |
377 | words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a |
378 | word boundary followed by one or more word characters followed by |
379 | one or more non-word characters followed by the same text as the |
380 | first parenthesized expression followed by a word boundary. |
381 | |
382 | If we want to use parentheses purely for grouping and not for |
383 | capturing we can use the non-capturing syntax, e.g. |
384 | \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and |
385 | end ')'. In this example we match either 'green' or 'blue' but we |
386 | do not capture the match so we only know whether or not we matched |
387 | but not which color we actually found. Using non-capturing |
388 | parentheses is more efficient than using capturing parentheses |
389 | since the regexp engine has to do less book-keeping. |
390 | |
391 | Both capturing and non-capturing parentheses may be nested. |
392 | |
393 | \target greedy quantifiers |
394 | |
395 | For historical reasons, quantifiers (e.g. \b{*}) that apply to |
396 | capturing parentheses are more "greedy" than other quantifiers. |
397 | For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa". |
398 | This behavior is different from what other regexp engines do |
399 | (notably, Perl). To obtain a more intuitive capturing behavior, |
400 | specify QRegExp::RegExp2 to the QRegExp constructor or call |
401 | setPatternSyntax(QRegExp::RegExp2). |
402 | |
403 | \target cap_in_a_loop |
404 | |
405 | When the number of matches cannot be determined in advance, a |
406 | common idiom is to use cap() in a loop. For example: |
407 | |
408 | \snippet code/src_corelib_text_qregexp.cpp 0 |
409 | |
410 | \target assertions |
411 | \section1 Assertions |
412 | |
413 | Assertions make some statement about the text at the point where |
414 | they occur in the regexp but they do not match any characters. In |
415 | the following list \b{\e {E}} stands for any expression. |
416 | |
417 | \table |
418 | \row \li \b{^} |
419 | \li The caret signifies the beginning of the string. If you |
420 | wish to match a literal \c{^} you must escape it by |
421 | writing \c{\\^}. For example, \b{^#include} will only |
422 | match strings which \e begin with the characters '#include'. |
423 | (When the caret is the first character of a character set it |
424 | has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.) |
425 | |
426 | \row \li \b{$} |
427 | \li The dollar signifies the end of the string. For example |
428 | \b{\\d\\s*$} will match strings which end with a digit |
429 | optionally followed by whitespace. If you wish to match a |
430 | literal \c{$} you must escape it by writing |
431 | \c{\\$}. |
432 | |
433 | \row \li \b{\\b} |
434 | \li A word boundary. For example the regexp |
435 | \b{\\bOK\\b} means match immediately after a word |
436 | boundary (e.g. start of string or whitespace) the letter 'O' |
437 | then the letter 'K' immediately before another word boundary |
438 | (e.g. end of string or whitespace). But note that the |
439 | assertion does not actually match any whitespace so if we |
440 | write \b{(\\bOK\\b)} and we have a match it will only |
441 | contain 'OK' even if the string is "It's \underline{OK} now". |
442 | |
443 | \row \li \b{\\B} |
444 | \li A non-word boundary. This assertion is true wherever |
445 | \b{\\b} is false. For example if we searched for |
446 | \b{\\Bon\\B} in "Left on" the match would fail (space |
447 | and end of string aren't non-word boundaries), but it would |
448 | match in "t\underline{on}ne". |
449 | |
450 | \row \li \b{(?=\e E)} |
451 | \li Positive lookahead. This assertion is true if the |
452 | expression matches at this point in the regexp. For example, |
453 | \b{const(?=\\s+char)} matches 'const' whenever it is |
454 | followed by 'char', as in 'static \underline{const} char *'. |
455 | (Compare with \b{const\\s+char}, which matches 'static |
456 | \underline{const char} *'.) |
457 | |
458 | \row \li \b{(?!\e E)} |
459 | \li Negative lookahead. This assertion is true if the |
460 | expression does not match at this point in the regexp. For |
461 | example, \b{const(?!\\s+char)} matches 'const' \e except |
462 | when it is followed by 'char'. |
463 | \endtable |
464 | |
465 | \target QRegExp wildcard matching |
466 | \section1 Wildcard Matching |
467 | |
468 | Most command shells such as \e bash or \e cmd.exe support "file |
469 | globbing", the ability to identify a group of files by using |
470 | wildcards. The setPatternSyntax() function is used to switch |
471 | between regexp and wildcard mode. Wildcard matching is much |
472 | simpler than full regexps and has only four features: |
473 | |
474 | \table |
475 | \row \li \b{c} |
476 | \li Any character represents itself apart from those mentioned |
477 | below. Thus \b{c} matches the character \e c. |
478 | \row \li \b{?} |
479 | \li Matches any single character. It is the same as |
480 | \b{.} in full regexps. |
481 | \row \li \b{*} |
482 | \li Matches zero or more of any characters. It is the |
483 | same as \b{.*} in full regexps. |
484 | \row \li \b{[...]} |
485 | \li Sets of characters can be represented in square brackets, |
486 | similar to full regexps. Within the character class, like |
487 | outside, backslash has no special meaning. |
488 | \endtable |
489 | |
490 | In the mode Wildcard, the wildcard characters cannot be |
491 | escaped. In the mode WildcardUnix, the character '\\' escapes the |
492 | wildcard. |
493 | |
494 | For example if we are in wildcard mode and have strings which |
495 | contain filenames we could identify HTML files with \b{*.html}. |
496 | This will match zero or more characters followed by a dot followed |
497 | by 'h', 't', 'm' and 'l'. |
498 | |
499 | To test a string against a wildcard expression, use exactMatch(). |
500 | For example: |
501 | |
502 | \snippet code/src_corelib_text_qregexp.cpp 1 |
503 | |
504 | \target perl-users |
505 | \section1 Notes for Perl Users |
506 | |
507 | Most of the character class abbreviations supported by Perl are |
508 | supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters} |
509 | {characters and abbreviations for sets of characters}. |
510 | |
511 | In QRegExp, apart from within character classes, \c{^} always |
512 | signifies the start of the string, so carets must always be |
513 | escaped unless used for that purpose. In Perl the meaning of caret |
514 | varies automagically depending on where it occurs so escaping it |
515 | is rarely necessary. The same applies to \c{$} which in |
516 | QRegExp always signifies the end of the string. |
517 | |
518 | QRegExp's quantifiers are the same as Perl's greedy quantifiers |
519 | (but see the \l{greedy quantifiers}{note above}). Non-greedy |
520 | matching cannot be applied to individual quantifiers, but can be |
521 | applied to all the quantifiers in the pattern. For example, to |
522 | match the Perl regexp \b{ro+?m} requires: |
523 | |
524 | \snippet code/src_corelib_text_qregexp.cpp 2 |
525 | |
526 | The equivalent of Perl's \c{/i} option is |
527 | setCaseSensitivity(Qt::CaseInsensitive). |
528 | |
529 | Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}. |
530 | |
531 | In QRegExp \b{.} matches any character, therefore all QRegExp |
532 | regexps have the equivalent of Perl's \c{/s} option. QRegExp |
533 | does not have an equivalent to Perl's \c{/m} option, but this |
534 | can be emulated in various ways for example by splitting the input |
535 | into lines or by looping with a regexp that searches for newlines. |
536 | |
537 | Because QRegExp is string oriented, there are no \\A, \\Z, or \\z |
538 | assertions. The \\G assertion is not supported but can be emulated |
539 | in a loop. |
540 | |
541 | Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp |
542 | equivalents for $`, $' or $+. Perl's capturing variables, $1, $2, |
543 | ... correspond to cap(1) or capturedTexts()[1], cap(2) or |
544 | capturedTexts()[2], etc. |
545 | |
546 | To substitute a pattern use QString::replace(). |
547 | |
548 | Perl's extended \c{/x} syntax is not supported, nor are |
549 | directives, e.g. (?i), or regexp comments, e.g. (?#comment). On |
550 | the other hand, C++'s rules for literal strings can be used to |
551 | achieve the same: |
552 | |
553 | \snippet code/src_corelib_text_qregexp.cpp 3 |
554 | |
555 | Both zero-width positive and zero-width negative lookahead |
556 | assertions (?=pattern) and (?!pattern) are supported with the same |
557 | syntax as Perl. Perl's lookbehind assertions, "independent" |
558 | subexpressions and conditional expressions are not supported. |
559 | |
560 | Non-capturing parentheses are also supported, with the same |
561 | (?:pattern) syntax. |
562 | |
563 | See QString::split() and QStringList::join() for equivalents |
564 | to Perl's split and join functions. |
565 | |
566 | Note: because C++ transforms \\'s they must be written \e twice in |
567 | code, e.g. \b{\\b} must be written \b{\\\\b}. |
568 | |
569 | \target code-examples |
570 | \section1 Code Examples |
571 | |
572 | \snippet code/src_corelib_text_qregexp.cpp 4 |
573 | |
574 | The third string matches '\underline{6}'. This is a simple validation |
575 | regexp for integers in the range 0 to 99. |
576 | |
577 | \snippet code/src_corelib_text_qregexp.cpp 5 |
578 | |
579 | The second string matches '\underline{This_is-OK}'. We've used the |
580 | character set abbreviation '\\S' (non-whitespace) and the anchors |
581 | to match strings which contain no whitespace. |
582 | |
583 | In the following example we match strings containing 'mail' or |
584 | 'letter' or 'correspondence' but only match whole words i.e. not |
585 | 'email' |
586 | |
587 | \snippet code/src_corelib_text_qregexp.cpp 6 |
588 | |
589 | The second string matches "Please write the \underline{letter}". The |
590 | word 'letter' is also captured (because of the parentheses). We |
591 | can see what text we've captured like this: |
592 | |
593 | \snippet code/src_corelib_text_qregexp.cpp 7 |
594 | |
595 | This will capture the text from the first set of capturing |
596 | parentheses (counting capturing left parentheses from left to |
597 | right). The parentheses are counted from 1 since cap(0) is the |
598 | whole matched regexp (equivalent to '&' in most regexp engines). |
599 | |
600 | \snippet code/src_corelib_text_qregexp.cpp 8 |
601 | |
602 | Here we've passed the QRegExp to QString's replace() function to |
603 | replace the matched text with new text. |
604 | |
605 | \snippet code/src_corelib_text_qregexp.cpp 9 |
606 | |
607 | We've used the indexIn() function to repeatedly match the regexp in |
608 | the string. Note that instead of moving forward by one character |
609 | at a time \c pos++ we could have written \c {pos += |
610 | rx.matchedLength()} to skip over the already matched string. The |
611 | count will equal 3, matching 'One \underline{Eric} another |
612 | \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it |
doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
by word boundaries.
615 | |
616 | One common use of regexps is to split lines of delimited data into |
617 | their component fields. |
618 | |
619 | \snippet code/src_corelib_text_qregexp.cpp 10 |
620 | |
621 | In this example our input lines have the format company name, web |
622 | address and country. Unfortunately the regexp is rather long and |
623 | not very versatile -- the code will break if we add any more |
624 | fields. A simpler and better solution is to look for the |
625 | separator, '\\t' in this case, and take the surrounding text. The |
626 | QString::split() function can take a separator string or regexp |
627 | as an argument and split a string accordingly. |
628 | |
629 | \snippet code/src_corelib_text_qregexp.cpp 11 |
630 | |
631 | Here field[0] is the company, field[1] the web address and so on. |
632 | |
633 | To imitate the matching of a shell we can use wildcard mode. |
634 | |
635 | \snippet code/src_corelib_text_qregexp.cpp 12 |
636 | |
637 | Wildcard matching can be convenient because of its simplicity, but |
638 | any wildcard regexp can be defined using full regexps, e.g. |
639 | \b{.*\\.html$}. Notice that we can't match both \c .html and \c |
640 | .htm files with a wildcard unless we use \b{*.htm*} which will |
641 | also match 'test.html.bak'. A full regexp gives us the precision |
642 | we need, \b{.*\\.html?$}. |
643 | |
644 | QRegExp can match case insensitively using setCaseSensitivity(), |
645 | and can use non-greedy matching, see setMinimal(). By |
646 | default QRegExp uses full regexps but this can be changed with |
647 | setPatternSyntax(). Searching can be done forward with indexIn() or backward |
648 | with lastIndexIn(). Captured text can be accessed using |
649 | capturedTexts() which returns a string list of all captured |
650 | strings, or using cap() which returns the captured string for the |
651 | given index. The pos() function takes a match index and returns |
652 | the position in the string where the match was made (or -1 if |
653 | there was no match). |
654 | |
655 | \sa QString, QStringList, QSortFilterProxyModel |
656 | |
657 | \section1 Porting to QRegularExpression |
658 | |
659 | \include corelib/port-from-qregexp.qdocinc porting-to-qregularexpression |
660 | */ |
661 | |
// When building for VxWorks with an EOS macro already defined, drop the macro
// so that the EOS constant declared below can use the name.
#if defined(Q_OS_VXWORKS) && defined(EOS)
# undef EOS
#endif

// Number of buckets that BadChar() folds character codes into.
const int NumBadChars = 64;
// Maps a QChar to its bucket index in [0, NumBadChars).
#define BadChar(ch) ((ch).unicode() % NumBadChars)

// Marker stored in first-occurrence (occ1) arrays; see heuristicallyChooseHeuristic().
const int NoOccurrence = INT_MAX;
// Marker compared against capBegin[] entries in QRegExpMatchState::match().
const int EmptyCapture = INT_MAX;
// Value used for an unbounded length (see Box::maxl in QRegExpEngine).
const int InftyLen = INT_MAX;
// NOTE(review): presumably the sentinel for an unbounded repetition count -- confirm against the parser.
const int InftyRep = 1025;
// NOTE(review): presumably the "end of input" value returned by the tokenizer -- confirm against getChar().
const int EOS = -1;
674 | |
675 | static bool isWord(QChar ch) |
676 | { |
677 | return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_'); |
678 | } |
679 | |
680 | /* |
681 | Merges two vectors of ints and puts the result into the first |
682 | one. |
683 | */ |
684 | static void mergeInto(QList<int> *a, const QList<int> &b) |
685 | { |
686 | int asize = a->size(); |
687 | int bsize = b.size(); |
688 | if (asize == 0) { |
689 | *a = b; |
690 | #ifndef QT_NO_REGEXP_OPTIM |
691 | } else if (bsize == 1 && a->at(i: asize - 1) < b.at(i: 0)) { |
692 | a->resize(size: asize + 1); |
693 | (*a)[asize] = b.at(i: 0); |
694 | #endif |
695 | } else if (bsize >= 1) { |
696 | int csize = asize + bsize; |
697 | QList<int> c(csize); |
698 | int i = 0, j = 0, k = 0; |
699 | while (i < asize) { |
700 | if (j < bsize) { |
701 | if (a->at(i) == b.at(i: j)) { |
702 | ++i; |
703 | --csize; |
704 | } else if (a->at(i) < b.at(i: j)) { |
705 | c[k++] = a->at(i: i++); |
706 | } else { |
707 | c[k++] = b.at(i: j++); |
708 | } |
709 | } else { |
710 | memcpy(dest: c.data() + k, src: a->constData() + i, n: (asize - i) * sizeof(int)); |
711 | break; |
712 | } |
713 | } |
714 | c.resize(size: csize); |
715 | if (j < bsize) |
716 | memcpy(dest: c.data() + k, src: b.constData() + j, n: (bsize - j) * sizeof(int)); |
717 | *a = c; |
718 | } |
719 | } |
720 | |
721 | #ifndef QT_NO_REGEXP_WILDCARD |
722 | /* |
723 | Translates a wildcard pattern to an equivalent regular expression |
724 | pattern (e.g., *.cpp to .*\.cpp). |
725 | |
726 | If enableEscaping is true, it is possible to escape the wildcard |
727 | characters with \ |
728 | */ |
729 | static QString wc2rx(const QString &wc_str, const bool enableEscaping) |
730 | { |
731 | const int wclen = wc_str.size(); |
732 | QString rx; |
733 | int i = 0; |
734 | bool isEscaping = false; // the previous character is '\' |
735 | const QChar *wc = wc_str.unicode(); |
736 | |
737 | while (i < wclen) { |
738 | const QChar c = wc[i++]; |
739 | switch (c.unicode()) { |
740 | case '\\': |
741 | if (enableEscaping) { |
742 | if (isEscaping) { |
743 | rx += QLatin1String("\\\\" ); |
744 | } // we insert the \\ later if necessary |
745 | if (i == wclen) { // the end |
746 | rx += QLatin1String("\\\\" ); |
747 | } |
748 | } else { |
749 | rx += QLatin1String("\\\\" ); |
750 | } |
751 | isEscaping = true; |
752 | break; |
753 | case '*': |
754 | if (isEscaping) { |
755 | rx += QLatin1String("\\*" ); |
756 | isEscaping = false; |
757 | } else { |
758 | rx += QLatin1String(".*" ); |
759 | } |
760 | break; |
761 | case '?': |
762 | if (isEscaping) { |
763 | rx += QLatin1String("\\?" ); |
764 | isEscaping = false; |
765 | } else { |
766 | rx += QLatin1Char('.'); |
767 | } |
768 | |
769 | break; |
770 | case '$': |
771 | case '(': |
772 | case ')': |
773 | case '+': |
774 | case '.': |
775 | case '^': |
776 | case '{': |
777 | case '|': |
778 | case '}': |
779 | if (isEscaping) { |
780 | isEscaping = false; |
781 | rx += QLatin1String("\\\\" ); |
782 | } |
783 | rx += QLatin1Char('\\'); |
784 | rx += c; |
785 | break; |
786 | case '[': |
787 | if (isEscaping) { |
788 | isEscaping = false; |
789 | rx += QLatin1String("\\[" ); |
790 | } else { |
791 | rx += c; |
792 | if (wc[i] == QLatin1Char('^')) |
793 | rx += wc[i++]; |
794 | if (i < wclen) { |
795 | if (wc[i] == QLatin1Char(']')) |
796 | rx += wc[i++]; |
797 | while (i < wclen && wc[i] != QLatin1Char(']')) { |
798 | if (wc[i] == QLatin1Char('\\')) |
799 | rx += QLatin1Char('\\'); |
800 | rx += wc[i++]; |
801 | } |
802 | } |
803 | } |
804 | break; |
805 | |
806 | case ']': |
807 | if (isEscaping){ |
808 | isEscaping = false; |
809 | rx += QLatin1String("\\" ); |
810 | } |
811 | rx += c; |
812 | break; |
813 | |
814 | default: |
815 | if (isEscaping){ |
816 | isEscaping = false; |
817 | rx += QLatin1String("\\\\" ); |
818 | } |
819 | rx += c; |
820 | } |
821 | } |
822 | return rx; |
823 | } |
824 | #endif |
825 | |
826 | static int caretIndex(int offset, QRegExp::CaretMode caretMode) |
827 | { |
828 | if (caretMode == QRegExp::CaretAtZero) { |
829 | return 0; |
830 | } else if (caretMode == QRegExp::CaretAtOffset) { |
831 | return offset; |
832 | } else { // QRegExp::CaretWontMatch |
833 | return -1; |
834 | } |
835 | } |
836 | |
837 | /* |
838 | The QRegExpEngineKey struct uniquely identifies an engine. |
839 | */ |
840 | struct QRegExpEngineKey |
841 | { |
842 | QString pattern; |
843 | QRegExp::PatternSyntax patternSyntax; |
844 | Qt::CaseSensitivity cs; |
845 | |
846 | inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax, |
847 | Qt::CaseSensitivity cs) |
848 | : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {} |
849 | |
850 | inline void clear() { |
851 | pattern.clear(); |
852 | patternSyntax = QRegExp::RegExp; |
853 | cs = Qt::CaseSensitive; |
854 | } |
855 | }; |
856 | |
857 | static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2) |
858 | { |
859 | return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax |
860 | && key1.cs == key2.cs; |
861 | } |
862 | |
/*
  Hashes a QRegExpEngineKey by combining, via qHashMulti(), the same
  three fields that operator==() compares: pattern, patternSyntax and
  cs. seed is forwarded as the initial hash seed.
*/
static size_t qHash(const QRegExpEngineKey &key, size_t seed = 0) noexcept
{
    return qHashMulti(seed, args: key.pattern, args: key.patternSyntax, args: key.cs);
}
867 | |
868 | class QRegExpEngine; |
869 | |
/*
  This is the engine state during matching.

  All int arrays below are slices of the single heap block bigArray,
  which prepareForMatch() (re)allocates and carves up. The block is
  released by drain() or by the destructor.
*/
struct QRegExpMatchState
{
    const QChar *in; // a pointer to the input string data
    int pos; // the current position in the string
    int caretPos; // the caret index handed to match()
    int len; // the length of the input string
    bool minimal; // minimal matching?
    int *bigArray; // big array holding the data for the next pointers
    int *inNextStack; // is state in nextStack?
    int *curStack; // stack of current states
    int *nextStack; // stack of next states
    int *curCapBegin; // start of current states' captures
    int *nextCapBegin; // start of next states' captures
    int *curCapEnd; // end of current states' captures
    int *nextCapEnd; // end of next states' captures
    int *tempCapBegin; // start of temporary captures
    int *tempCapEnd; // end of temporary captures
    int *capBegin; // start of captures for a next state
    int *capEnd; // end of captures for a next state
    int *slideTab; // bump-along slide table for bad-character heuristic
    int *captured; // what match() returned last
    int slideTabSize; // size of slide table
    int capturedSize; // number of ints in captured (2 + 2 * capture count, see prepareForMatch())
#ifndef QT_NO_REGEXP_BACKREF
    QList<QList<int>> sleeping; // list of back-reference sleepers
#endif
    int matchLen; // length of match
    int oneTestMatchedLen; // length of partial match

    const QRegExpEngine *eng; // the engine set by prepareForMatch()

    // Only the two pointers that drain()/the destructor look at are
    // initialized here; everything else is set by prepareForMatch()/match().
    inline QRegExpMatchState() : bigArray(nullptr), captured(nullptr) {}
    inline ~QRegExpMatchState() { free(ptr: bigArray); }

    void drain() { free(ptr: bigArray); bigArray = nullptr; captured = nullptr; } // to save memory
    void prepareForMatch(QRegExpEngine *eng);
    void match(const QChar *str, int len, int pos, bool minimal,
        bool oneTest, int caretIndex);
    bool matchHere();
    bool testAnchor(int i, int a, const int *capBegin);
};
914 | |
/*
  The struct QRegExpAutomatonState represents one state in a modified NFA. The
  input characters matched are stored in the state instead of on
  the transitions, something possible for an automaton
  constructed from a regular expression.
*/
struct QRegExpAutomatonState
{
#ifndef QT_NO_REGEXP_CAPTURE
    int atom; // which atom does this state belong to?
#endif
    int match; // what does it match? (see CharClassBit and BackRefBit)
    QList<int> outs; // out-transitions
    QMap<int, int> reenter; // atoms reentered when transiting out
    QMap<int, int> anchors; // anchors met when transiting out

    // The default constructor leaves atom and match uninitialized.
    inline QRegExpAutomatonState() { }
#ifndef QT_NO_REGEXP_CAPTURE
    // a is the atom the state belongs to, m is what the state matches.
    inline QRegExpAutomatonState(int a, int m)
        : atom(a), match(m) { }
#else
    // m is what the state matches.
    inline QRegExpAutomatonState(int m)
        : match(m) { }
#endif
};

Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_RELOCATABLE_TYPE);
942 | |
/*
  The struct QRegExpCharClassRange represents a range of characters (e.g.,
  [0-9] denotes range 48 to 57). The range is stored as its first
  character code plus the number of characters it covers; the values
  in the member comments are those of the [0-9] example.
*/
struct QRegExpCharClassRange
{
    ushort from; // 48
    ushort len; // 10
};

Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
954 | |
#ifndef QT_NO_REGEXP_CAPTURE
/*
  The struct QRegExpAtom represents one node in the hierarchy of regular
  expression atoms. Atoms are linked to their parent by index into the
  engine's array of atoms.
*/
struct QRegExpAtom
{
    // Negative marker values for the capture member; non-negative
    // values are capture indexes.
    enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };

    int parent; // index of parent in array of atoms
    int capture; // index of capture, from 1 to ncap - 1
};

Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
#endif
970 | |
971 | struct QRegExpLookahead; |
972 | |
#ifndef QT_NO_REGEXP_ANCHOR_ALT
/*
  The struct QRegExpAnchorAlternation represents a pair of anchors with
  OR semantics. Instances are stored in the engine's aa list and are
  referred to by index (see QRegExpEngine::anchorAlternation()).
*/
struct QRegExpAnchorAlternation
{
    int a; // this anchor...
    int b; // ...or this one
};

Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
#endif
986 | |
#ifndef QT_NO_REGEXP_CCLASS

// Bit mask with only bit x set.
#define FLAG(x) (1 << (x))
/*
  The class QRegExpCharClass represents a set of characters, such as can
  be found in regular expressions (e.g., [a-z] denotes the set
  {a, b, ..., z}).

  A set is described by a list of character ranges, a bit mask of
  character categories and a "negative" flag.
*/
class QRegExpCharClass
{
public:
    QRegExpCharClass();

    void clear();
    bool negative() const { return n; }
    void setNegative(bool negative);
    void addCategories(uint cats);
    void addRange(ushort from, ushort to);
    // Adds the single character ch, as the range ch..ch.
    void addSingleton(ushort ch) { addRange(from: ch, to: ch); }

    bool in(QChar ch) const;
#ifndef QT_NO_REGEXP_OPTIM
    const QList<int> &firstOccurrence() const { return occ1; }
#endif

#if defined(QT_DEBUG)
    void dump() const;
#endif

private:
    QList<QRegExpCharClassRange> r; // character ranges
#ifndef QT_NO_REGEXP_OPTIM
    QList<int> occ1; // first-occurrence array
#endif
    uint c; // character classes
    bool n; // negative?
};
#else
/*
  Placeholder used when character classes are compiled out. It only
  carries the first-occurrence array (filled with NumBadChars zeros)
  when optimizations are enabled.
*/
struct QRegExpCharClass
{
    int dummy;

#ifndef QT_NO_REGEXP_OPTIM
    QRegExpCharClass() { occ1.fill(0, NumBadChars); }

    const QList<int> &firstOccurrence() const { return occ1; }
    QList<int> occ1;
#endif
};
#endif

Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_RELOCATABLE_TYPE);
1039 | |
1040 | /* |
1041 | The QRegExpEngine class encapsulates a modified nondeterministic |
1042 | finite automaton (NFA). |
1043 | */ |
1044 | class QRegExpEngine |
1045 | { |
1046 | public: |
1047 | QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers) |
1048 | : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); } |
1049 | |
1050 | QRegExpEngine(const QRegExpEngineKey &key); |
1051 | ~QRegExpEngine(); |
1052 | |
1053 | bool isValid() const { return valid; } |
1054 | const QString &errorString() const { return yyError; } |
1055 | int captureCount() const { return officialncap; } |
1056 | |
1057 | int createState(QChar ch); |
1058 | int createState(const QRegExpCharClass &cc); |
1059 | #ifndef QT_NO_REGEXP_BACKREF |
1060 | int createState(int bref); |
1061 | #endif |
1062 | |
1063 | void addCatTransitions(const QList<int> &from, const QList<int> &to); |
1064 | #ifndef QT_NO_REGEXP_CAPTURE |
1065 | void addPlusTransitions(const QList<int> &from, const QList<int> &to, int atom); |
1066 | #endif |
1067 | |
1068 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1069 | int anchorAlternation(int a, int b); |
1070 | int anchorConcatenation(int a, int b); |
1071 | #else |
1072 | int anchorAlternation(int a, int b) { return a & b; } |
1073 | int anchorConcatenation(int a, int b) { return a | b; } |
1074 | #endif |
1075 | void addAnchors(int from, int to, int a); |
1076 | |
1077 | #ifndef QT_NO_REGEXP_OPTIM |
1078 | void heuristicallyChooseHeuristic(); |
1079 | #endif |
1080 | |
1081 | #if defined(QT_DEBUG) |
1082 | void dump() const; |
1083 | #endif |
1084 | |
1085 | QAtomicInt ref; |
1086 | |
1087 | private: |
1088 | enum { CharClassBit = 0x10000, BackRefBit = 0x20000 }; |
1089 | enum { InitialState = 0, FinalState = 1 }; |
1090 | |
1091 | void setup(); |
1092 | int setupState(int match); |
1093 | |
1094 | /* |
1095 | Let's hope that 13 lookaheads and 14 back-references are |
1096 | enough. |
1097 | */ |
1098 | enum { MaxLookaheads = 13, MaxBackRefs = 14 }; |
1099 | enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004, |
1100 | Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010, |
1101 | Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads, |
1102 | Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1, |
1103 | Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs, |
1104 | |
1105 | Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^ |
1106 | ((Anchor_FirstLookahead << MaxLookaheads) - 1) }; |
1107 | #ifndef QT_NO_REGEXP_CAPTURE |
1108 | int startAtom(bool officialCapture); |
1109 | void finishAtom(int atom, bool needCapture); |
1110 | #endif |
1111 | |
1112 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1113 | int addLookahead(QRegExpEngine *eng, bool negative); |
1114 | #endif |
1115 | |
1116 | #ifndef QT_NO_REGEXP_OPTIM |
1117 | bool goodStringMatch(QRegExpMatchState &matchState) const; |
1118 | bool badCharMatch(QRegExpMatchState &matchState) const; |
1119 | #else |
1120 | bool bruteMatch(QRegExpMatchState &matchState) const; |
1121 | #endif |
1122 | |
1123 | QList<QRegExpAutomatonState> s; // array of states |
1124 | #ifndef QT_NO_REGEXP_CAPTURE |
1125 | QList<QRegExpAtom> f; // atom hierarchy |
1126 | int nf; // number of atoms |
1127 | int cf; // current atom |
1128 | QList<int> captureForOfficialCapture; |
1129 | #endif |
1130 | int officialncap; // number of captures, seen from the outside |
1131 | int ncap; // number of captures, seen from the inside |
1132 | #ifndef QT_NO_REGEXP_CCLASS |
1133 | QList<QRegExpCharClass> cl; // array of character classes |
1134 | #endif |
1135 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1136 | QList<QRegExpLookahead *> ahead; // array of lookaheads |
1137 | #endif |
1138 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1139 | QList<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors |
1140 | #endif |
1141 | #ifndef QT_NO_REGEXP_OPTIM |
1142 | bool caretAnchored; // does the regexp start with ^? |
1143 | bool trivial; // is the good-string all that needs to match? |
1144 | #endif |
1145 | bool valid; // is the regular expression valid? |
1146 | Qt::CaseSensitivity cs; // case sensitive? |
1147 | bool greedyQuantifiers; // RegExp2? |
1148 | bool xmlSchemaExtensions; |
1149 | #ifndef QT_NO_REGEXP_BACKREF |
1150 | int nbrefs; // number of back-references |
1151 | #endif |
1152 | |
1153 | #ifndef QT_NO_REGEXP_OPTIM |
1154 | bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch |
1155 | |
1156 | int goodEarlyStart; // the index where goodStr can first occur in a match |
1157 | int goodLateStart; // the index where goodStr can last occur in a match |
1158 | QString goodStr; // the string that any match has to contain |
1159 | |
1160 | int minl; // the minimum length of a match |
1161 | QList<int> occ1; // first-occurrence array |
1162 | #endif |
1163 | |
1164 | /* |
1165 | The class Box is an abstraction for a regular expression |
1166 | fragment. It can also be seen as one node in the syntax tree of |
1167 | a regular expression with synthetized attributes. |
1168 | |
1169 | Its interface is ugly for performance reasons. |
1170 | */ |
1171 | class Box |
1172 | { |
1173 | public: |
1174 | Box(QRegExpEngine *engine); |
1175 | Box(const Box &b) { operator=(b); } |
1176 | |
1177 | Box &operator=(const Box &b); |
1178 | |
1179 | void clear() { operator=(b: Box(eng)); } |
1180 | void set(QChar ch); |
1181 | void set(const QRegExpCharClass &cc); |
1182 | #ifndef QT_NO_REGEXP_BACKREF |
1183 | void set(int bref); |
1184 | #endif |
1185 | |
1186 | void cat(const Box &b); |
1187 | void orx(const Box &b); |
1188 | void plus(int atom); |
1189 | void opt(); |
1190 | void catAnchor(int a); |
1191 | #ifndef QT_NO_REGEXP_OPTIM |
1192 | void setupHeuristics(); |
1193 | #endif |
1194 | |
1195 | #if defined(QT_DEBUG) |
1196 | void dump() const; |
1197 | #endif |
1198 | |
1199 | private: |
1200 | void addAnchorsToEngine(const Box &to) const; |
1201 | |
1202 | QRegExpEngine *eng; // the automaton under construction |
1203 | QList<int> ls; // the left states (firstpos) |
1204 | QList<int> rs; // the right states (lastpos) |
1205 | QMap<int, int> lanchors; // the left anchors |
1206 | QMap<int, int> ranchors; // the right anchors |
1207 | int skipanchors; // the anchors to match if the box is skipped |
1208 | |
1209 | #ifndef QT_NO_REGEXP_OPTIM |
1210 | int earlyStart; // the index where str can first occur |
1211 | int lateStart; // the index where str can last occur |
1212 | QString str; // a string that has to occur in any match |
1213 | QString leftStr; // a string occurring at the left of this box |
1214 | QString rightStr; // a string occurring at the right of this box |
1215 | int maxl; // the maximum length of this box (possibly InftyLen) |
1216 | #endif |
1217 | |
1218 | int minl; // the minimum length of this box |
1219 | #ifndef QT_NO_REGEXP_OPTIM |
1220 | QList<int> occ1; // first-occurrence array |
1221 | #endif |
1222 | }; |
1223 | |
1224 | friend class Box; |
1225 | |
1226 | /* |
1227 | This is the lexical analyzer for regular expressions. |
1228 | */ |
1229 | enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead, |
1230 | Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar, |
1231 | Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 }; |
1232 | int getChar(); |
1233 | int getEscape(); |
1234 | #ifndef QT_NO_REGEXP_INTERVAL |
1235 | int getRep(int def); |
1236 | #endif |
1237 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1238 | void skipChars(int n); |
1239 | #endif |
1240 | void error(const char *msg); |
1241 | void startTokenizer(const QChar *rx, int len); |
1242 | int getToken(); |
1243 | |
1244 | const QChar *yyIn; // a pointer to the input regular expression pattern |
1245 | int yyPos0; // the position of yyTok in the input pattern |
1246 | int yyPos; // the position of the next character to read |
1247 | int yyLen; // the length of yyIn |
1248 | int yyCh; // the last character read |
1249 | QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens |
1250 | int yyMinRep; // attribute for Tok_Quantifier |
1251 | int yyMaxRep; // ditto |
1252 | QString yyError; // syntax error or overflow during parsing? |
1253 | |
1254 | /* |
1255 | This is the syntactic analyzer for regular expressions. |
1256 | */ |
1257 | int parse(const QChar *rx, int len); |
1258 | void parseAtom(Box *box); |
1259 | void parseFactor(Box *box); |
1260 | void parseTerm(Box *box); |
1261 | void parseExpression(Box *box); |
1262 | |
1263 | int yyTok; // the last token read |
1264 | bool yyMayCapture; // set this to false to disable capturing |
1265 | |
1266 | friend struct QRegExpMatchState; |
1267 | }; |
1268 | |
#ifndef QT_NO_REGEXP_LOOKAHEAD
/*
  The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
  (?=foo) and (?!bar)).

  The lookahead deletes its engine in its destructor, so the engine
  passed to the constructor must not be deleted by anyone else.
*/
struct QRegExpLookahead
{
    QRegExpEngine *eng; // NFA representing the embedded regular expression
    bool neg; // negative lookahead?

    // eng0 is the engine for the embedded expression, neg0 tells
    // whether the lookahead is negative.
    inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
        : eng(eng0), neg(neg0) { }
    inline ~QRegExpLookahead() { delete eng; }
};
#endif
1284 | |
1285 | /*! |
1286 | \internal |
1287 | convert the pattern string to the RegExp syntax. |
1288 | |
1289 | This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan |
1290 | */ |
1291 | Q_CORE5COMPAT_EXPORT QString qt_regexp_toCanonical(const QString &pattern, |
1292 | QRegExp::PatternSyntax patternSyntax) |
1293 | { |
1294 | switch (patternSyntax) { |
1295 | #ifndef QT_NO_REGEXP_WILDCARD |
1296 | case QRegExp::Wildcard: |
1297 | return wc2rx(wc_str: pattern, enableEscaping: false); |
1298 | case QRegExp::WildcardUnix: |
1299 | return wc2rx(wc_str: pattern, enableEscaping: true); |
1300 | #endif |
1301 | case QRegExp::FixedString: |
1302 | return QRegExp::escape(str: pattern); |
1303 | case QRegExp::W3CXmlSchema11: |
1304 | default: |
1305 | return pattern; |
1306 | } |
1307 | } |
1308 | |
1309 | QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key) |
1310 | : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2), |
1311 | xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11) |
1312 | { |
1313 | setup(); |
1314 | |
1315 | QString rx = qt_regexp_toCanonical(pattern: key.pattern, patternSyntax: key.patternSyntax); |
1316 | |
1317 | valid = (parse(rx: rx.unicode(), len: rx.size()) == rx.size()); |
1318 | if (!valid) { |
1319 | #ifndef QT_NO_REGEXP_OPTIM |
1320 | trivial = false; |
1321 | #endif |
1322 | error(RXERR_LEFTDELIM); |
1323 | } |
1324 | } |
1325 | |
1326 | QRegExpEngine::~QRegExpEngine() |
1327 | { |
1328 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1329 | qDeleteAll(c: ahead); |
1330 | #endif |
1331 | } |
1332 | |
1333 | void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng) |
1334 | { |
1335 | /* |
1336 | We use one QList<int> for all the big data used a lot in |
1337 | matchHere() and friends. |
1338 | */ |
1339 | int ns = eng->s.size(); // number of states |
1340 | int ncap = eng->ncap; |
1341 | #ifndef QT_NO_REGEXP_OPTIM |
1342 | int newSlideTabSize = qMax(a: eng->minl + 1, b: 16); |
1343 | #else |
1344 | int newSlideTabSize = 0; |
1345 | #endif |
1346 | int numCaptures = eng->captureCount(); |
1347 | int newCapturedSize = 2 + 2 * numCaptures; |
1348 | bigArray = q_check_ptr(p: (int *)realloc(ptr: bigArray, size: ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int))); |
1349 | |
1350 | // set all internal variables only _after_ bigArray is realloc'ed |
1351 | // to prevent a broken regexp in oom case |
1352 | |
1353 | slideTabSize = newSlideTabSize; |
1354 | capturedSize = newCapturedSize; |
1355 | inNextStack = bigArray; |
1356 | memset(s: inNextStack, c: -1, n: ns * sizeof(int)); |
1357 | curStack = inNextStack + ns; |
1358 | nextStack = inNextStack + 2 * ns; |
1359 | |
1360 | curCapBegin = inNextStack + 3 * ns; |
1361 | nextCapBegin = curCapBegin + ncap * ns; |
1362 | curCapEnd = curCapBegin + 2 * ncap * ns; |
1363 | nextCapEnd = curCapBegin + 3 * ncap * ns; |
1364 | |
1365 | tempCapBegin = curCapBegin + 4 * ncap * ns; |
1366 | tempCapEnd = tempCapBegin + ncap; |
1367 | capBegin = tempCapBegin + 2 * ncap; |
1368 | capEnd = tempCapBegin + 3 * ncap; |
1369 | |
1370 | slideTab = tempCapBegin + 4 * ncap; |
1371 | captured = slideTab + slideTabSize; |
1372 | memset(s: captured, c: -1, n: capturedSize*sizeof(int)); |
1373 | this->eng = eng; |
1374 | } |
1375 | |
/*
  Tries to match in str0 and fills the captured array with
  (begin, length) pairs: first the pair for the whole match, then one
  pair per official capture. If there is no match, all entries are set
  to -1.

  str0 and len0 describe the input, pos0 is the position to start
  from, minimal0 selects minimal matching, and caretIndex is stored
  as the caret position. If oneTest is true, only a single
  matchHere() attempt at pos0 is made instead of a search.
*/
void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
    bool minimal0, bool oneTest, int caretIndex)
{
    bool matched = false;
    QChar char_null;

#ifndef QT_NO_REGEXP_OPTIM
    if (eng->trivial && !oneTest) {
        // The good-string is all that needs to match, so a plain
        // substring search is enough.
        // ### Qt6: qsizetype
        pos = int(QtPrivate::findString(haystack: QStringView(str0, len0), from: pos0, needle: QStringView(eng->goodStr.unicode(), eng->goodStr.size()), cs: eng->cs));
        matchLen = eng->goodStr.size();
        matched = (pos != -1);
    } else
#endif
    {
        in = str0;
        // A null input is replaced by a pointer to a local null QChar;
        // that pointer is only meaningful until this function returns.
        if (in == nullptr)
            in = &char_null;
        pos = pos0;
        caretPos = caretIndex;
        len = len0;
        minimal = minimal0;
        matchLen = 0;
        oneTestMatchedLen = 0;

        if (eng->valid && pos >= 0 && pos <= len) {
#ifndef QT_NO_REGEXP_OPTIM
            if (oneTest) {
                matched = matchHere();
            } else {
                // No search is attempted when fewer than minl
                // characters remain after pos.
                if (pos <= len - eng->minl) {
                    if (eng->caretAnchored) {
                        matched = matchHere();
                    } else if (eng->useGoodStringHeuristic) {
                        matched = eng->goodStringMatch(matchState&: *this);
                    } else {
                        matched = eng->badCharMatch(matchState&: *this);
                    }
                }
            }
#else
            matched = oneTest ? matchHere() : eng->bruteMatch(*this);
#endif
        }
    }

    if (matched) {
        int *c = captured;
        *c++ = pos;
        *c++ = matchLen;

        // capturedSize is 2 + 2 * number of official captures.
        int numCaptures = (capturedSize - 2) >> 1;
#ifndef QT_NO_REGEXP_CAPTURE
        for (int i = 0; i < numCaptures; ++i) {
            // Translate the official capture number into the internal one.
            int j = eng->captureForOfficialCapture.at(i);
            if (capBegin[j] != EmptyCapture) {
                int len = capEnd[j] - capBegin[j];
                // Captures that are not longer than 0 report position 0.
                *c++ = (len > 0) ? pos + capBegin[j] : 0;
                *c++ = len;
            } else {
                *c++ = -1;
                *c++ = -1;
            }
        }
#endif
    } else {
        // we rely on 2's complement here
        memset(s: captured, c: -1, n: capturedSize * sizeof(int));
    }
}
1450 | |
1451 | /* |
1452 | The three following functions add one state to the automaton and |
1453 | return the number of the state. |
1454 | */ |
1455 | |
1456 | int QRegExpEngine::createState(QChar ch) |
1457 | { |
1458 | return setupState(ch.unicode()); |
1459 | } |
1460 | |
1461 | int QRegExpEngine::createState(const QRegExpCharClass &cc) |
1462 | { |
1463 | #ifndef QT_NO_REGEXP_CCLASS |
1464 | int n = cl.size(); |
1465 | cl += QRegExpCharClass(cc); |
1466 | return setupState(CharClassBit | n); |
1467 | #else |
1468 | Q_UNUSED(cc); |
1469 | return setupState(CharClassBit); |
1470 | #endif |
1471 | } |
1472 | |
1473 | #ifndef QT_NO_REGEXP_BACKREF |
1474 | int QRegExpEngine::createState(int bref) |
1475 | { |
1476 | if (bref > nbrefs) { |
1477 | nbrefs = bref; |
1478 | if (nbrefs > MaxBackRefs) { |
1479 | error(RXERR_LIMIT); |
1480 | return 0; |
1481 | } |
1482 | } |
1483 | return setupState(BackRefBit | bref); |
1484 | } |
1485 | #endif |
1486 | |
1487 | /* |
1488 | The two following functions add a transition between all pairs of |
1489 | states (i, j) where i is found in from, and j is found in to. |
1490 | |
1491 | Cat-transitions are distinguished from plus-transitions for |
1492 | capturing. |
1493 | */ |
1494 | |
1495 | void QRegExpEngine::addCatTransitions(const QList<int> &from, const QList<int> &to) |
1496 | { |
1497 | for (int i = 0; i < from.size(); i++) |
1498 | mergeInto(a: &s[from.at(i)].outs, b: to); |
1499 | } |
1500 | |
1501 | #ifndef QT_NO_REGEXP_CAPTURE |
1502 | void QRegExpEngine::addPlusTransitions(const QList<int> &from, const QList<int> &to, int atom) |
1503 | { |
1504 | for (int i = 0; i < from.size(); i++) { |
1505 | QRegExpAutomatonState &st = s[from.at(i)]; |
1506 | const QList<int> oldOuts = st.outs; |
1507 | mergeInto(a: &st.outs, b: to); |
1508 | if (f.at(i: atom).capture != QRegExpAtom::NoCapture) { |
1509 | for (int j = 0; j < to.size(); j++) { |
1510 | // ### st.reenter.contains(to.at(j)) check looks suspicious |
1511 | if (!st.reenter.contains(key: to.at(i: j)) && |
1512 | !std::binary_search(first: oldOuts.constBegin(), last: oldOuts.constEnd(), val: to.at(i: j))) |
1513 | st.reenter.insert(key: to.at(i: j), value: atom); |
1514 | } |
1515 | } |
1516 | } |
1517 | } |
1518 | #endif |
1519 | |
1520 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1521 | /* |
1522 | Returns an anchor that means a OR b. |
1523 | */ |
1524 | int QRegExpEngine::anchorAlternation(int a, int b) |
1525 | { |
1526 | if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0) |
1527 | return a & b; |
1528 | |
1529 | int n = aa.size(); |
1530 | #ifndef QT_NO_REGEXP_OPTIM |
1531 | if (n > 0 && aa.at(i: n - 1).a == a && aa.at(i: n - 1).b == b) |
1532 | return Anchor_Alternation | (n - 1); |
1533 | #endif |
1534 | |
1535 | QRegExpAnchorAlternation element = {.a: a, .b: b}; |
1536 | aa.append(t: element); |
1537 | return Anchor_Alternation | n; |
1538 | } |
1539 | |
1540 | /* |
1541 | Returns an anchor that means a AND b. |
1542 | */ |
1543 | int QRegExpEngine::anchorConcatenation(int a, int b) |
1544 | { |
1545 | if (((a | b) & Anchor_Alternation) == 0) |
1546 | return a | b; |
1547 | if ((b & Anchor_Alternation) != 0) |
1548 | qSwap(value1&: a, value2&: b); |
1549 | |
1550 | int aprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).a, b); |
1551 | int bprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).b, b); |
1552 | return anchorAlternation(a: aprime, b: bprime); |
1553 | } |
1554 | #endif |
1555 | |
1556 | /* |
1557 | Adds anchor a on a transition caracterised by its from state and |
1558 | its to state. |
1559 | */ |
1560 | void QRegExpEngine::addAnchors(int from, int to, int a) |
1561 | { |
1562 | QRegExpAutomatonState &st = s[from]; |
1563 | if (st.anchors.contains(key: to)) |
1564 | a = anchorAlternation(a: st.anchors.value(key: to), b: a); |
1565 | st.anchors.insert(key: to, value: a); |
1566 | } |
1567 | |
1568 | #ifndef QT_NO_REGEXP_OPTIM |
1569 | /* |
1570 | This function chooses between the good-string and the bad-character |
1571 | heuristics. It computes two scores and chooses the heuristic with |
1572 | the highest score. |
1573 | |
1574 | Here are some common-sense constraints on the scores that should be |
1575 | respected if the formulas are ever modified: (1) If goodStr is |
1576 | empty, the good-string heuristic scores 0. (2) If the regular |
1577 | expression is trivial, the good-string heuristic should be used. |
1578 | (3) If the search is case insensitive, the good-string heuristic |
1579 | should be used, unless it scores 0. (Case insensitivity turns all |
1580 | entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is |
1581 | big, the good-string heuristic should score less. |
1582 | */ |
1583 | void QRegExpEngine::heuristicallyChooseHeuristic() |
1584 | { |
1585 | if (minl == 0) { |
1586 | useGoodStringHeuristic = false; |
1587 | } else if (trivial) { |
1588 | useGoodStringHeuristic = true; |
1589 | } else { |
1590 | /* |
1591 | Magic formula: The good string has to constitute a good |
1592 | proportion of the minimum-length string, and appear at a |
1593 | more-or-less known index. |
1594 | */ |
1595 | int goodStringScore = (64 * goodStr.size() / minl) - |
1596 | (goodLateStart - goodEarlyStart); |
1597 | /* |
1598 | Less magic formula: We pick some characters at random, and |
1599 | check whether they are good or bad. |
1600 | */ |
1601 | int badCharScore = 0; |
1602 | int step = qMax(a: 1, b: NumBadChars / 32); |
1603 | for (int i = 1; i < NumBadChars; i += step) { |
1604 | if (occ1.at(i) == NoOccurrence) |
1605 | badCharScore += minl; |
1606 | else |
1607 | badCharScore += occ1.at(i); |
1608 | } |
1609 | badCharScore /= minl; |
1610 | useGoodStringHeuristic = (goodStringScore > badCharScore); |
1611 | } |
1612 | } |
1613 | #endif |
1614 | |
1615 | #if defined(QT_DEBUG) |
1616 | void QRegExpEngine::dump() const |
1617 | { |
1618 | int i, j; |
1619 | qDebug(msg: "Case %ssensitive engine" , cs ? "" : "in" ); |
1620 | qDebug(msg: " States" ); |
1621 | for (i = 0; i < s.size(); i++) { |
1622 | qDebug(msg: " %d%s" , i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "" ); |
1623 | #ifndef QT_NO_REGEXP_CAPTURE |
1624 | if (nf > 0) |
1625 | qDebug(msg: " in atom %d" , s[i].atom); |
1626 | #endif |
1627 | int m = s[i].match; |
1628 | if ((m & CharClassBit) != 0) { |
1629 | qDebug(msg: " match character class %d" , m ^ CharClassBit); |
1630 | #ifndef QT_NO_REGEXP_CCLASS |
1631 | cl[m ^ CharClassBit].dump(); |
1632 | #else |
1633 | qDebug(" negative character class" ); |
1634 | #endif |
1635 | } else if ((m & BackRefBit) != 0) { |
1636 | qDebug(msg: " match back-reference %d" , m ^ BackRefBit); |
1637 | } else if (m >= 0x20 && m <= 0x7e) { |
1638 | qDebug(msg: " match 0x%.4x (%c)" , m, m); |
1639 | } else { |
1640 | qDebug(msg: " match 0x%.4x" , m); |
1641 | } |
1642 | for (j = 0; j < s[i].outs.size(); j++) { |
1643 | int next = s[i].outs[j]; |
1644 | qDebug(msg: " -> %d" , next); |
1645 | if (s[i].reenter.contains(key: next)) |
1646 | qDebug(msg: " [reenter %d]" , s[i].reenter[next]); |
1647 | if (s[i].anchors.value(key: next) != 0) |
1648 | qDebug(msg: " [anchors 0x%.8x]" , s[i].anchors[next]); |
1649 | } |
1650 | } |
1651 | #ifndef QT_NO_REGEXP_CAPTURE |
1652 | if (nf > 0) { |
1653 | qDebug(msg: " Atom Parent Capture" ); |
1654 | for (i = 0; i < nf; i++) { |
1655 | if (f[i].capture == QRegExpAtom::NoCapture) { |
1656 | qDebug(msg: " %6d %6d nil" , i, f[i].parent); |
1657 | } else { |
1658 | int cap = f[i].capture; |
1659 | bool official = captureForOfficialCapture.contains(t: cap); |
1660 | qDebug(msg: " %6d %6d %6d %s" , i, f[i].parent, f[i].capture, |
1661 | official ? "official" : "" ); |
1662 | } |
1663 | } |
1664 | } |
1665 | #endif |
1666 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1667 | for (i = 0; i < aa.size(); i++) |
1668 | qDebug(msg: " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x" , i, aa[i].a, aa[i].b); |
1669 | #endif |
1670 | } |
1671 | #endif |
1672 | |
1673 | void QRegExpEngine::setup() |
1674 | { |
1675 | ref.storeRelaxed(newValue: 1); |
1676 | #ifndef QT_NO_REGEXP_CAPTURE |
1677 | f.resize(size: 32); |
1678 | nf = 0; |
1679 | cf = -1; |
1680 | #endif |
1681 | officialncap = 0; |
1682 | ncap = 0; |
1683 | #ifndef QT_NO_REGEXP_OPTIM |
1684 | caretAnchored = true; |
1685 | trivial = true; |
1686 | #endif |
1687 | valid = false; |
1688 | #ifndef QT_NO_REGEXP_BACKREF |
1689 | nbrefs = 0; |
1690 | #endif |
1691 | #ifndef QT_NO_REGEXP_OPTIM |
1692 | useGoodStringHeuristic = true; |
1693 | minl = 0; |
1694 | occ1.fill(t: 0, newSize: NumBadChars); |
1695 | #endif |
1696 | } |
1697 | |
1698 | int QRegExpEngine::setupState(int match) |
1699 | { |
1700 | #ifndef QT_NO_REGEXP_CAPTURE |
1701 | s += QRegExpAutomatonState(cf, match); |
1702 | #else |
1703 | s += QRegExpAutomatonState(match); |
1704 | #endif |
1705 | return s.size() - 1; |
1706 | } |
1707 | |
1708 | #ifndef QT_NO_REGEXP_CAPTURE |
1709 | /* |
1710 | Functions startAtom() and finishAtom() should be called to delimit |
1711 | atoms. When a state is created, it is assigned to the current atom. |
1712 | The information is later used for capturing. |
1713 | */ |
1714 | int QRegExpEngine::startAtom(bool officialCapture) |
1715 | { |
1716 | if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size()) |
1717 | f.resize(size: (nf + 1) << 1); |
1718 | f[nf].parent = cf; |
1719 | cf = nf++; |
1720 | f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture; |
1721 | return cf; |
1722 | } |
1723 | |
1724 | void QRegExpEngine::finishAtom(int atom, bool needCapture) |
1725 | { |
1726 | if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture) |
1727 | f[atom].capture = QRegExpAtom::UnofficialCapture; |
1728 | cf = f.at(i: atom).parent; |
1729 | } |
1730 | #endif |
1731 | |
1732 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1733 | /* |
1734 | Creates a lookahead anchor. |
1735 | */ |
1736 | int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative) |
1737 | { |
1738 | int n = ahead.size(); |
1739 | if (n == MaxLookaheads) { |
1740 | error(RXERR_LIMIT); |
1741 | return 0; |
1742 | } |
1743 | ahead += new QRegExpLookahead(eng, negative); |
1744 | return Anchor_FirstLookahead << n; |
1745 | } |
1746 | #endif |
1747 | |
1748 | #ifndef QT_NO_REGEXP_CAPTURE |
/*
  We want the longest leftmost captures.

  Returns true if capture set 1 (begin1/end1) is preferable to capture
  set 2 (begin2/end2). The ncap slots are examined in order and the
  first slot in which the two sets differ decides: an earlier begin
  wins, and for equal begins a later end wins. Returns false when the
  sets are identical or ncap is not positive.

  The values are compared directly rather than through their
  difference, so extreme values cannot overflow.
*/
static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
                            const int *end2)
{
    for (int i = 0; i < ncap; i++) {
        if (begin1[i] != begin2[i])
            return begin1[i] < begin2[i]; // it has to start early...
        if (end1[i] != end2[i])
            return end1[i] > end2[i]; // ...and end late
    }
    return false;
}
1765 | #endif |
1766 | |
1767 | /* |
1768 | Returns \c true if anchor a matches at position pos + i in the input |
1769 | string, otherwise false. |
1770 | */ |
1771 | bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin) |
1772 | { |
1773 | int j; |
1774 | |
1775 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1776 | if ((a & QRegExpEngine::Anchor_Alternation) != 0) |
1777 | return testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).a, capBegin) |
1778 | || testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).b, capBegin); |
1779 | #endif |
1780 | |
1781 | if ((a & QRegExpEngine::Anchor_Caret) != 0) { |
1782 | if (pos + i != caretPos) |
1783 | return false; |
1784 | } |
1785 | if ((a & QRegExpEngine::Anchor_Dollar) != 0) { |
1786 | if (pos + i != len) |
1787 | return false; |
1788 | } |
1789 | #ifndef QT_NO_REGEXP_ESCAPE |
1790 | if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) { |
1791 | bool before = false; |
1792 | bool after = false; |
1793 | if (pos + i != 0) |
1794 | before = isWord(ch: in[pos + i - 1]); |
1795 | if (pos + i != len) |
1796 | after = isWord(ch: in[pos + i]); |
1797 | if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after)) |
1798 | return false; |
1799 | if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after)) |
1800 | return false; |
1801 | } |
1802 | #endif |
1803 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1804 | if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) { |
1805 | const QList<QRegExpLookahead *> &ahead = eng->ahead; |
1806 | for (j = 0; j < ahead.size(); j++) { |
1807 | if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) { |
1808 | QRegExpMatchState matchState; |
1809 | matchState.prepareForMatch(eng: ahead[j]->eng); |
1810 | matchState.match(str0: in + pos + i, len0: len - pos - i, pos0: 0, |
1811 | minimal0: true, oneTest: true, caretIndex: caretPos - pos - i); |
1812 | if ((matchState.captured[0] == 0) == ahead[j]->neg) |
1813 | return false; |
1814 | } |
1815 | } |
1816 | } |
1817 | #endif |
1818 | #ifndef QT_NO_REGEXP_CAPTURE |
1819 | #ifndef QT_NO_REGEXP_BACKREF |
1820 | for (j = 0; j < eng->nbrefs; j++) { |
1821 | if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) { |
1822 | int i = eng->captureForOfficialCapture.at(i: j); |
1823 | if (capBegin[i] != EmptyCapture) |
1824 | return false; |
1825 | } |
1826 | } |
1827 | #endif |
1828 | #endif |
1829 | return true; |
1830 | } |
1831 | |
1832 | #ifndef QT_NO_REGEXP_OPTIM |
1833 | /* |
1834 | The three following functions are what Jeffrey Friedl would call |
1835 | transmissions (or bump-alongs). Using one or the other should make |
1836 | no difference except in performance. |
1837 | */ |
1838 | |
1839 | bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const |
1840 | { |
1841 | int k = matchState.pos + goodEarlyStart; |
1842 | QStringMatcher matcher(goodStr.unicode(), goodStr.size(), cs); |
1843 | while ((k = matcher.indexIn(str: matchState.in, length: matchState.len, from: k)) != -1) { |
1844 | int from = k - goodLateStart; |
1845 | int to = k - goodEarlyStart; |
1846 | if (from > matchState.pos) |
1847 | matchState.pos = from; |
1848 | |
1849 | while (matchState.pos <= to) { |
1850 | if (matchState.matchHere()) |
1851 | return true; |
1852 | ++matchState.pos; |
1853 | } |
1854 | ++k; |
1855 | } |
1856 | return false; |
1857 | } |
1858 | |
1859 | bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const |
1860 | { |
1861 | int slideHead = 0; |
1862 | int slideNext = 0; |
1863 | int i; |
1864 | int lastPos = matchState.len - minl; |
1865 | memset(s: matchState.slideTab, c: 0, n: matchState.slideTabSize * sizeof(int)); |
1866 | |
1867 | /* |
1868 | Set up the slide table, used for the bad-character heuristic, |
1869 | using the table of first occurrence of each character. |
1870 | */ |
1871 | for (i = 0; i < minl; i++) { |
1872 | int sk = occ1[BadChar(matchState.in[matchState.pos + i])]; |
1873 | if (sk == NoOccurrence) |
1874 | sk = i + 1; |
1875 | if (sk > 0) { |
1876 | int k = i + 1 - sk; |
1877 | if (k < 0) { |
1878 | sk = i + 1; |
1879 | k = 0; |
1880 | } |
1881 | if (sk > matchState.slideTab[k]) |
1882 | matchState.slideTab[k] = sk; |
1883 | } |
1884 | } |
1885 | |
1886 | if (matchState.pos > lastPos) |
1887 | return false; |
1888 | |
1889 | for (;;) { |
1890 | if (++slideNext >= matchState.slideTabSize) |
1891 | slideNext = 0; |
1892 | if (matchState.slideTab[slideHead] > 0) { |
1893 | if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext]) |
1894 | matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1; |
1895 | matchState.slideTab[slideHead] = 0; |
1896 | } else { |
1897 | if (matchState.matchHere()) |
1898 | return true; |
1899 | } |
1900 | |
1901 | if (matchState.pos == lastPos) |
1902 | break; |
1903 | |
1904 | /* |
1905 | Update the slide table. This code has much in common with |
1906 | the initialization code. |
1907 | */ |
1908 | int sk = occ1[BadChar(matchState.in[matchState.pos + minl])]; |
1909 | if (sk == NoOccurrence) { |
1910 | matchState.slideTab[slideNext] = minl; |
1911 | } else if (sk > 0) { |
1912 | int k = slideNext + minl - sk; |
1913 | if (k >= matchState.slideTabSize) |
1914 | k -= matchState.slideTabSize; |
1915 | if (sk > matchState.slideTab[k]) |
1916 | matchState.slideTab[k] = sk; |
1917 | } |
1918 | slideHead = slideNext; |
1919 | ++matchState.pos; |
1920 | } |
1921 | return false; |
1922 | } |
1923 | #else |
1924 | bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const |
1925 | { |
1926 | while (matchState.pos <= matchState.len) { |
1927 | if (matchState.matchHere()) |
1928 | return true; |
1929 | ++matchState.pos; |
1930 | } |
1931 | return false; |
1932 | } |
1933 | #endif |
1934 | |
1935 | /*
1936 | Here's the core of the engine. It tries to do a match here and now.
1937 | */
// Simulates the automaton starting at input position pos, consuming one
// input character per iteration of the outer loop (i is the offset from pos).
// Returns true when the final state was reached at least once.
// Effects visible in this block: matchLen receives the offset of the last
// step that reached the final state (or stays -1), oneTestMatchedLen receives
// the number of outer iterations minus one, capBegin/capEnd receive the
// capture zones when the final state is queued and captures exist, and the
// sleeping list is emptied before returning.
1938 | bool QRegExpMatchState::matchHere() |
1939 | { |
1940 | int ncur = 1, nnext = 0; |
1941 | int i = 0, j, k, m; |
1942 | bool stop = false; |
1943 | |
1944 | matchLen = -1; |
1945 | oneTestMatchedLen = -1; |
1946 | curStack[0] = QRegExpEngine::InitialState; |
1947 | |
1948 | int ncap = eng->ncap; |
1949 | #ifndef QT_NO_REGEXP_CAPTURE |
// The initial state starts with every capture zone empty.
1950 | if (ncap > 0) { |
1951 | for (j = 0; j < ncap; j++) { |
1952 | curCapBegin[j] = EmptyCapture; |
1953 | curCapEnd[j] = EmptyCapture; |
1954 | } |
1955 | } |
1956 | #endif |
1957 | |
// With back-references enabled the loop also keeps going while states sleep.
1958 | #ifndef QT_NO_REGEXP_BACKREF |
1959 | while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop) |
1960 | #else |
1961 | while (ncur > 0 && i <= len - pos && !stop) |
1962 | #endif |
1963 | { |
// ch is the current input code unit, or 0 once the end of the input is reached.
1964 | int ch = (i < len - pos) ? in[pos + i].unicode() : 0; |
1965 | for (j = 0; j < ncur; j++) { |
1966 | int cur = curStack[j]; |
1967 | const QRegExpAutomatonState &scur = eng->s.at(i: cur); |
1968 | const QList<int> &outs = scur.outs; |
1969 | for (k = 0; k < outs.size(); k++) { |
1970 | int next = outs.at(i: k); |
1971 | const QRegExpAutomatonState &snext = eng->s.at(i: next); |
1972 | bool inside = true; |
1973 | #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE) |
1974 | int needSomeSleep = 0; |
1975 | #endif |
1976 | |
1977 | /*
1978 | First, check if the anchors are anchored properly.
1979 | */
// Captures of the j-th current state live at offset j * ncap.
1980 | int a = scur.anchors.value(key: next); |
1981 | if (a != 0 && !testAnchor(i, a, capBegin: curCapBegin + j * ncap)) |
1982 | inside = false; |
1983 | |
1984 | /*
1985 | If indeed they are, check if the input character is
1986 | correct for this transition.
1987 | */
1988 | if (inside) { |
1989 | m = snext.match; |
1990 | if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) { |
1991 | if (eng->cs) |
1992 | inside = (m == ch); |
1993 | else |
1994 | inside = (QChar(m).toLower() == QChar(ch).toLower()); |
1995 | } else if (next == QRegExpEngine::FinalState) { |
// Reaching the final state records the match length. With minimal matching
// the outer loop ends after the current step.
1996 | matchLen = i; |
1997 | stop = minimal; |
1998 | inside = true; |
1999 | } else if ((m & QRegExpEngine::CharClassBit) != 0) { |
2000 | #ifndef QT_NO_REGEXP_CCLASS |
// Case-insensitive matching: a negated class has to accept both case
// variants of ch, a non-negated class either one.
2001 | const QRegExpCharClass &cc = eng->cl.at(i: m ^ QRegExpEngine::CharClassBit); |
2002 | if (eng->cs) |
2003 | inside = cc.in(ch: QChar(ch)); |
2004 | else if (cc.negative()) |
2005 | inside = cc.in(ch: QChar(ch).toLower()) && |
2006 | cc.in(ch: QChar(ch).toUpper()); |
2007 | else |
2008 | inside = cc.in(ch: QChar(ch).toLower()) || |
2009 | cc.in(ch: QChar(ch).toUpper()); |
2010 | #endif |
2011 | #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE) |
2012 | } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */ |
// The first character of the referenced capture is compared with ch, then the
// remaining delta - 1 characters are compared against the input that follows.
// On success the state is put to sleep for delta - 1 steps.
2013 | int bref = m ^ QRegExpEngine::BackRefBit; |
2014 | int ell = j * ncap + eng->captureForOfficialCapture.at(i: bref - 1); |
2015 | |
2016 | inside = bref <= ncap && curCapBegin[ell] != EmptyCapture; |
2017 | if (inside) { |
2018 | if (eng->cs) |
2019 | inside = (in[pos + curCapBegin[ell]] == QChar(ch)); |
2020 | else |
2021 | inside = (in[pos + curCapBegin[ell]].toLower() |
2022 | == QChar(ch).toLower()); |
2023 | } |
2024 | |
2025 | if (inside) { |
// delta is the length of the referenced capture. A capture that is still
// open is taken to end at the current offset i.
2026 | int delta; |
2027 | if (curCapEnd[ell] == EmptyCapture) |
2028 | delta = i - curCapBegin[ell]; |
2029 | else |
2030 | delta = curCapEnd[ell] - curCapBegin[ell]; |
2031 | |
2032 | inside = (delta <= len - (pos + i)); |
2033 | if (inside && delta > 1) { |
2034 | int n = 1; |
2035 | if (eng->cs) { |
2036 | while (n < delta) { |
2037 | if (in[pos + curCapBegin[ell] + n] |
2038 | != in[pos + i + n]) |
2039 | break; |
2040 | ++n; |
2041 | } |
2042 | } else { |
2043 | while (n < delta) { |
2044 | QChar a = in[pos + curCapBegin[ell] + n]; |
2045 | QChar b = in[pos + i + n]; |
2046 | if (a.toLower() != b.toLower()) |
2047 | break; |
2048 | ++n; |
2049 | } |
2050 | } |
2051 | inside = (n == delta); |
2052 | if (inside) |
2053 | needSomeSleep = delta - 1; |
2054 | } |
2055 | } |
2056 | #endif |
2057 | } |
2058 | } |
2059 | |
2060 | /*
2061 | We must now update our data structures.
2062 | */
2063 | if (inside) { |
2064 | #ifndef QT_NO_REGEXP_CAPTURE |
2065 | int *capBegin, *capEnd; |
2066 | #endif |
2067 | /*
2068 | If the next state was not encountered yet, all
2069 | is fine.
2070 | */
// inNextStack maps a state to its slot in nextStack, or -1 when it is not queued.
2071 | if ((m = inNextStack[next]) == -1) { |
2072 | m = nnext++; |
2073 | nextStack[m] = next; |
2074 | inNextStack[next] = m; |
2075 | #ifndef QT_NO_REGEXP_CAPTURE |
2076 | capBegin = nextCapBegin + m * ncap; |
2077 | capEnd = nextCapEnd + m * ncap; |
2078 | |
2079 | /*
2080 | Otherwise, we'll first maintain captures in
2081 | temporary arrays, and decide at the end whether
2082 | it's best to keep the previous capture zones or
2083 | the new ones.
2084 | */
2085 | } else { |
2086 | capBegin = tempCapBegin; |
2087 | capEnd = tempCapEnd; |
2088 | #endif |
2089 | } |
2090 | |
2091 | #ifndef QT_NO_REGEXP_CAPTURE |
2092 | /*
2093 | Updating the capture zones is much of a task.
2094 | */
2095 | if (ncap > 0) { |
// Start from the capture zones of the current state, then adjust them for
// the atoms that this transition leaves and enters.
2096 | memcpy(dest: capBegin, src: curCapBegin + j * ncap, n: ncap * sizeof(int)); |
2097 | memcpy(dest: capEnd, src: curCapEnd + j * ncap, n: ncap * sizeof(int)); |
2098 | int c = scur.atom, n = snext.atom; |
2099 | int p = -1, q = -1; |
2100 | int cap; |
2101 | |
2102 | /*
2103 | Lemma 1. For any x in the range [0..nf), we
2104 | have f[x].parent < x.
2105 | 
2106 | Proof. By looking at startAtom(), it is
2107 | clear that cf < nf holds all the time, and
2108 | thus that f[nf].parent < nf.
2109 | */
2110 | |
2111 | /*
2112 | If we are reentering an atom, we empty all
2113 | capture zones inside it.
2114 | */
2115 | if ((q = scur.reenter.value(key: next)) != 0) { |
// b marks atom q and, as the scan proceeds, every atom whose parent is
// already marked. Scanning upwards from q + 1 is enough because of Lemma 1.
2116 | QBitArray b(eng->nf, false); |
2117 | b.setBit(i: q, val: true); |
2118 | for (int ell = q + 1; ell < eng->nf; ell++) { |
2119 | if (b.testBit(i: eng->f.at(i: ell).parent)) { |
2120 | b.setBit(i: ell, val: true); |
2121 | cap = eng->f.at(i: ell).capture; |
2122 | if (cap >= 0) { |
2123 | capBegin[cap] = EmptyCapture; |
2124 | capEnd[cap] = EmptyCapture; |
2125 | } |
2126 | } |
2127 | } |
2128 | p = eng->f.at(i: q).parent; |
2129 | |
2130 | /*
2131 | Otherwise, close the capture zones we are
2132 | leaving. We are leaving f[c].capture,
2133 | f[f[c].parent].capture,
2134 | f[f[f[c].parent].parent].capture, ...,
2135 | until f[x].capture, with x such that
2136 | f[x].parent is the youngest common ancestor
2137 | for c and n.
2138 | 
2139 | We go up along c's and n's ancestry until
2140 | we find x.
2141 | */
2142 | } else { |
2143 | p = c; |
2144 | q = n; |
2145 | while (p != q) { |
2146 | if (p > q) { |
2147 | cap = eng->f.at(i: p).capture; |
2148 | if (cap >= 0) { |
// A zone that was opened at this very offset is reset to empty instead of
// being closed with zero length.
2149 | if (capBegin[cap] == i) { |
2150 | capBegin[cap] = EmptyCapture; |
2151 | capEnd[cap] = EmptyCapture; |
2152 | } else { |
2153 | capEnd[cap] = i; |
2154 | } |
2155 | } |
2156 | p = eng->f.at(i: p).parent; |
2157 | } else { |
2158 | q = eng->f.at(i: q).parent; |
2159 | } |
2160 | } |
2161 | } |
2162 | |
2163 | /*
2164 | In any case, we now open the capture zones
2165 | we are entering. We work upwards from n
2166 | until we reach p (the parent of the atom we
2167 | reenter or the youngest common ancestor).
2168 | */
2169 | while (n > p) { |
2170 | cap = eng->f.at(i: n).capture; |
2171 | if (cap >= 0) { |
2172 | capBegin[cap] = i; |
2173 | capEnd[cap] = EmptyCapture; |
2174 | } |
2175 | n = eng->f.at(i: n).parent; |
2176 | } |
2177 | /*
2178 | If the next state was already in
2179 | nextStack, we must choose carefully which
2180 | capture zones we want to keep.
2181 | */
2182 | if (capBegin == tempCapBegin && |
2183 | isBetterCapture(ncap, begin1: capBegin, end1: capEnd, begin2: nextCapBegin + m * ncap, |
2184 | end2: nextCapEnd + m * ncap)) { |
2185 | memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int)); |
2186 | memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int)); |
2187 | } |
2188 | } |
2189 | #ifndef QT_NO_REGEXP_BACKREF |
2190 | /*
2191 | We are done with updating the capture zones.
2192 | It's now time to put the next state to sleep,
2193 | if it needs to, and to remove it from
2194 | nextStack.
2195 | */
2196 | if (needSomeSleep > 0) { |
// Sleeper record layout: [0] wake-up offset, [1] target state, then ncap
// capture begins followed by ncap capture ends.
2197 | QList<int> zzZ(2 + 2 * ncap); |
2198 | zzZ[0] = i + needSomeSleep; |
2199 | zzZ[1] = next; |
2200 | if (ncap > 0) { |
2201 | memcpy(dest: zzZ.data() + 2, src: capBegin, n: ncap * sizeof(int)); |
2202 | memcpy(dest: zzZ.data() + 2 + ncap, src: capEnd, n: ncap * sizeof(int)); |
2203 | } |
// NOTE(review): this pops the most recent nextStack entry, which is next only
// when it was pushed just above - confirm for the case where next was already queued.
2204 | inNextStack[nextStack[--nnext]] = -1; |
2205 | sleeping.append(t: zzZ); |
2206 | } |
2207 | #endif |
2208 | #endif |
2209 | } |
2210 | } |
2211 | } |
2212 | #ifndef QT_NO_REGEXP_CAPTURE |
2213 | /*
2214 | If we reached the final state, hurray! Copy the captured
2215 | zone.
2216 | */
2217 | if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) { |
2218 | memcpy(dest: capBegin, src: nextCapBegin + m * ncap, n: ncap * sizeof(int)); |
2219 | memcpy(dest: capEnd, src: nextCapEnd + m * ncap, n: ncap * sizeof(int)); |
2220 | } |
2221 | #ifndef QT_NO_REGEXP_BACKREF |
2222 | /*
2223 | It's time to wake up the sleepers.
2224 | */
// Every sleeper whose wake-up offset equals i is queued into nextStack. If its
// state is already queued, the capture zones are overwritten only when
// isBetterCapture() says so.
2225 | j = 0; |
2226 | while (j < sleeping.size()) { |
2227 | if (sleeping.at(i: j)[0] == i) { |
2228 | const QList<int> &zzZ = sleeping.at(i: j); |
2229 | int next = zzZ[1]; |
2230 | const int *capBegin = zzZ.data() + 2; |
2231 | const int *capEnd = zzZ.data() + 2 + ncap; |
2232 | bool copyOver = true; |
2233 | |
2234 | if ((m = inNextStack[next]) == -1) { |
2235 | m = nnext++; |
2236 | nextStack[m] = next; |
2237 | inNextStack[next] = m; |
2238 | } else { |
2239 | copyOver = isBetterCapture(ncap, begin1: nextCapBegin + m * ncap, end1: nextCapEnd + m * ncap, |
2240 | begin2: capBegin, end2: capEnd); |
2241 | } |
2242 | if (copyOver) { |
2243 | memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int)); |
2244 | memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int)); |
2245 | } |
2246 | |
// The entry is removed only after its data has been copied. j is not advanced
// because the following entry moves into slot j.
2247 | sleeping.removeAt(i: j); |
2248 | } else { |
2249 | ++j; |
2250 | } |
2251 | } |
2252 | #endif |
2253 | #endif |
// Reset the membership table for the states queued during this step.
2254 | for (j = 0; j < nnext; j++) |
2255 | inNextStack[nextStack[j]] = -1; |
2256 | |
2257 | // avoid needless iteration that confuses oneTestMatchedLen
2258 | if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState |
2259 | #ifndef QT_NO_REGEXP_BACKREF |
2260 | && sleeping.isEmpty() |
2261 | #endif |
2262 | ) |
2263 | stop = true; |
2264 | |
// The next buffers become the current ones for the following step.
2265 | qSwap(value1&: curStack, value2&: nextStack); |
2266 | #ifndef QT_NO_REGEXP_CAPTURE |
2267 | qSwap(value1&: curCapBegin, value2&: nextCapBegin); |
2268 | qSwap(value1&: curCapEnd, value2&: nextCapEnd); |
2269 | #endif |
2270 | ncur = nnext; |
2271 | nnext = 0; |
2272 | ++i; |
2273 | } |
2274 | |
2275 | #ifndef QT_NO_REGEXP_BACKREF |
2276 | /*
2277 | If minimal matching is enabled, we might have some sleepers
2278 | left.
2279 | */
2280 | if (!sleeping.isEmpty()) |
2281 | sleeping.clear(); |
2282 | #endif |
2283 | |
// i was incremented once more after the last processed step.
2284 | oneTestMatchedLen = i - 1; |
2285 | return (matchLen >= 0); |
2286 | } |
2287 | |
2288 | #ifndef QT_NO_REGEXP_CCLASS |
2289 | |
2290 | QRegExpCharClass::QRegExpCharClass() |
2291 | : c(0), n(false) |
2292 | { |
2293 | #ifndef QT_NO_REGEXP_OPTIM |
2294 | occ1.fill(t: NoOccurrence, newSize: NumBadChars); |
2295 | #endif |
2296 | } |
2297 | |
2298 | void QRegExpCharClass::clear() |
2299 | { |
2300 | c = 0; |
2301 | r.clear(); |
2302 | n = false; |
2303 | } |
2304 | |
2305 | void QRegExpCharClass::setNegative(bool negative) |
2306 | { |
2307 | n = negative; |
2308 | #ifndef QT_NO_REGEXP_OPTIM |
2309 | occ1.fill(t: 0, newSize: NumBadChars); |
2310 | #endif |
2311 | } |
2312 | |
2313 | void QRegExpCharClass::addCategories(uint cats) |
2314 | { |
2315 | static const int all_cats = FLAG(QChar::Mark_NonSpacing) | |
2316 | FLAG(QChar::Mark_SpacingCombining) | |
2317 | FLAG(QChar::Mark_Enclosing) | |
2318 | FLAG(QChar::Number_DecimalDigit) | |
2319 | FLAG(QChar::Number_Letter) | |
2320 | FLAG(QChar::Number_Other) | |
2321 | FLAG(QChar::Separator_Space) | |
2322 | FLAG(QChar::Separator_Line) | |
2323 | FLAG(QChar::Separator_Paragraph) | |
2324 | FLAG(QChar::Other_Control) | |
2325 | FLAG(QChar::Other_Format) | |
2326 | FLAG(QChar::Other_Surrogate) | |
2327 | FLAG(QChar::Other_PrivateUse) | |
2328 | FLAG(QChar::Other_NotAssigned) | |
2329 | FLAG(QChar::Letter_Uppercase) | |
2330 | FLAG(QChar::Letter_Lowercase) | |
2331 | FLAG(QChar::Letter_Titlecase) | |
2332 | FLAG(QChar::Letter_Modifier) | |
2333 | FLAG(QChar::Letter_Other) | |
2334 | FLAG(QChar::Punctuation_Connector) | |
2335 | FLAG(QChar::Punctuation_Dash) | |
2336 | FLAG(QChar::Punctuation_Open) | |
2337 | FLAG(QChar::Punctuation_Close) | |
2338 | FLAG(QChar::Punctuation_InitialQuote) | |
2339 | FLAG(QChar::Punctuation_FinalQuote) | |
2340 | FLAG(QChar::Punctuation_Other) | |
2341 | FLAG(QChar::Symbol_Math) | |
2342 | FLAG(QChar::Symbol_Currency) | |
2343 | FLAG(QChar::Symbol_Modifier) | |
2344 | FLAG(QChar::Symbol_Other); |
2345 | c |= (all_cats & cats); |
2346 | #ifndef QT_NO_REGEXP_OPTIM |
2347 | occ1.fill(t: 0, newSize: NumBadChars); |
2348 | #endif |
2349 | } |
2350 | |
2351 | void QRegExpCharClass::addRange(ushort from, ushort to) |
2352 | { |
2353 | if (from > to) |
2354 | qSwap(value1&: from, value2&: to); |
2355 | int m = r.size(); |
2356 | r.resize(size: m + 1); |
2357 | r[m].from = from; |
2358 | r[m].len = to - from + 1; |
2359 | |
2360 | #ifndef QT_NO_REGEXP_OPTIM |
2361 | int i; |
2362 | |
2363 | if (to - from < NumBadChars) { |
2364 | if (from % NumBadChars <= to % NumBadChars) { |
2365 | for (i = from % NumBadChars; i <= to % NumBadChars; i++) |
2366 | occ1[i] = 0; |
2367 | } else { |
2368 | for (i = 0; i <= to % NumBadChars; i++) |
2369 | occ1[i] = 0; |
2370 | for (i = from % NumBadChars; i < NumBadChars; i++) |
2371 | occ1[i] = 0; |
2372 | } |
2373 | } else { |
2374 | occ1.fill(t: 0, newSize: NumBadChars); |
2375 | } |
2376 | #endif |
2377 | } |
2378 | |
2379 | bool QRegExpCharClass::in(QChar ch) const |
2380 | { |
2381 | #ifndef QT_NO_REGEXP_OPTIM |
2382 | if (occ1.at(BadChar(ch)) == NoOccurrence) |
2383 | return n; |
2384 | #endif |
2385 | |
2386 | if (c != 0 && (c & FLAG(ch.category())) != 0) |
2387 | return !n; |
2388 | |
2389 | const int uc = ch.unicode(); |
2390 | int size = r.size(); |
2391 | |
2392 | for (int i = 0; i < size; ++i) { |
2393 | const QRegExpCharClassRange &range = r.at(i); |
2394 | if (uint(uc - range.from) < uint(r.at(i).len)) |
2395 | return !n; |
2396 | } |
2397 | return n; |
2398 | } |
2399 | |
2400 | #if defined(QT_DEBUG) |
2401 | void QRegExpCharClass::dump() const |
2402 | { |
2403 | int i; |
2404 | qDebug(msg: " %stive character class" , n ? "nega" : "posi" ); |
2405 | #ifndef QT_NO_REGEXP_CCLASS |
2406 | if (c != 0) |
2407 | qDebug(msg: " categories 0x%.8x" , c); |
2408 | #endif |
2409 | for (i = 0; i < r.size(); i++) |
2410 | qDebug(msg: " 0x%.4x through 0x%.4x" , r[i].from, r[i].from + r[i].len - 1); |
2411 | } |
2412 | #endif |
2413 | #endif |
2414 | |
2415 | QRegExpEngine::Box::Box(QRegExpEngine *engine) |
2416 | : eng(engine), skipanchors(0) |
2417 | #ifndef QT_NO_REGEXP_OPTIM |
2418 | , earlyStart(0), lateStart(0), maxl(0) |
2419 | #endif |
2420 | { |
2421 | #ifndef QT_NO_REGEXP_OPTIM |
2422 | occ1.fill(t: NoOccurrence, newSize: NumBadChars); |
2423 | #endif |
2424 | minl = 0; |
2425 | } |
2426 | |
2427 | QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b) |
2428 | { |
2429 | eng = b.eng; |
2430 | ls = b.ls; |
2431 | rs = b.rs; |
2432 | lanchors = b.lanchors; |
2433 | ranchors = b.ranchors; |
2434 | skipanchors = b.skipanchors; |
2435 | #ifndef QT_NO_REGEXP_OPTIM |
2436 | earlyStart = b.earlyStart; |
2437 | lateStart = b.lateStart; |
2438 | str = b.str; |
2439 | leftStr = b.leftStr; |
2440 | rightStr = b.rightStr; |
2441 | maxl = b.maxl; |
2442 | occ1 = b.occ1; |
2443 | #endif |
2444 | minl = b.minl; |
2445 | return *this; |
2446 | } |
2447 | |
2448 | void QRegExpEngine::Box::set(QChar ch) |
2449 | { |
2450 | ls.resize(size: 1); |
2451 | ls[0] = eng->createState(ch); |
2452 | rs = ls; |
2453 | #ifndef QT_NO_REGEXP_OPTIM |
2454 | str = ch; |
2455 | leftStr = ch; |
2456 | rightStr = ch; |
2457 | maxl = 1; |
2458 | occ1[BadChar(ch)] = 0; |
2459 | #endif |
2460 | minl = 1; |
2461 | } |
2462 | |
2463 | void QRegExpEngine::Box::set(const QRegExpCharClass &cc) |
2464 | { |
2465 | ls.resize(size: 1); |
2466 | ls[0] = eng->createState(cc); |
2467 | rs = ls; |
2468 | #ifndef QT_NO_REGEXP_OPTIM |
2469 | maxl = 1; |
2470 | occ1 = cc.firstOccurrence(); |
2471 | #endif |
2472 | minl = 1; |
2473 | } |
2474 | |
2475 | #ifndef QT_NO_REGEXP_BACKREF |
2476 | void QRegExpEngine::Box::set(int bref) |
2477 | { |
2478 | ls.resize(size: 1); |
2479 | ls[0] = eng->createState(bref); |
2480 | rs = ls; |
2481 | if (bref >= 1 && bref <= MaxBackRefs) |
2482 | skipanchors = Anchor_BackRef0Empty << bref; |
2483 | #ifndef QT_NO_REGEXP_OPTIM |
2484 | maxl = InftyLen; |
2485 | #endif |
2486 | minl = 0; |
2487 | } |
2488 | #endif |
2489 | |
2490 | void QRegExpEngine::Box::cat(const Box &b) |
2491 | { |
2492 | eng->addCatTransitions(from: rs, to: b.ls); |
2493 | addAnchorsToEngine(to: b); |
2494 | if (minl == 0) { |
2495 | lanchors.insert(map: b.lanchors); |
2496 | if (skipanchors != 0) { |
2497 | for (int i = 0; i < b.ls.size(); i++) { |
2498 | int a = eng->anchorConcatenation(a: lanchors.value(key: b.ls.at(i), defaultValue: 0), b: skipanchors); |
2499 | lanchors.insert(key: b.ls.at(i), value: a); |
2500 | } |
2501 | } |
2502 | mergeInto(a: &ls, b: b.ls); |
2503 | } |
2504 | if (b.minl == 0) { |
2505 | ranchors.insert(map: b.ranchors); |
2506 | if (b.skipanchors != 0) { |
2507 | for (int i = 0; i < rs.size(); i++) { |
2508 | int a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i), defaultValue: 0), b: b.skipanchors); |
2509 | ranchors.insert(key: rs.at(i), value: a); |
2510 | } |
2511 | } |
2512 | mergeInto(a: &rs, b: b.rs); |
2513 | } else { |
2514 | ranchors = b.ranchors; |
2515 | rs = b.rs; |
2516 | } |
2517 | |
2518 | #ifndef QT_NO_REGEXP_OPTIM |
2519 | if (maxl != InftyLen) { |
2520 | if (rightStr.size() + b.leftStr.size() > |
2521 | qMax(a: str.size(), b: b.str.size())) { |
2522 | earlyStart = minl - rightStr.size(); |
2523 | lateStart = maxl - rightStr.size(); |
2524 | str = rightStr + b.leftStr; |
2525 | } else if (b.str.size() > str.size()) { |
2526 | earlyStart = minl + b.earlyStart; |
2527 | lateStart = maxl + b.lateStart; |
2528 | str = b.str; |
2529 | } |
2530 | } |
2531 | |
2532 | if (leftStr.size() == maxl) |
2533 | leftStr += b.leftStr; |
2534 | |
2535 | if (b.rightStr.size() == b.maxl) { |
2536 | rightStr += b.rightStr; |
2537 | } else { |
2538 | rightStr = b.rightStr; |
2539 | } |
2540 | |
2541 | if (maxl == InftyLen || b.maxl == InftyLen) { |
2542 | maxl = InftyLen; |
2543 | } else { |
2544 | maxl += b.maxl; |
2545 | } |
2546 | |
2547 | for (int i = 0; i < NumBadChars; i++) { |
2548 | if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i)) |
2549 | occ1[i] = minl + b.occ1.at(i); |
2550 | } |
2551 | #endif |
2552 | |
2553 | minl += b.minl; |
2554 | if (minl == 0) |
2555 | skipanchors = eng->anchorConcatenation(a: skipanchors, b: b.skipanchors); |
2556 | else |
2557 | skipanchors = 0; |
2558 | } |
2559 | |
2560 | void QRegExpEngine::Box::orx(const Box &b) |
2561 | { |
2562 | mergeInto(a: &ls, b: b.ls); |
2563 | lanchors.insert(map: b.lanchors); |
2564 | mergeInto(a: &rs, b: b.rs); |
2565 | ranchors.insert(map: b.ranchors); |
2566 | |
2567 | if (b.minl == 0) { |
2568 | if (minl == 0) |
2569 | skipanchors = eng->anchorAlternation(a: skipanchors, b: b.skipanchors); |
2570 | else |
2571 | skipanchors = b.skipanchors; |
2572 | } |
2573 | |
2574 | #ifndef QT_NO_REGEXP_OPTIM |
2575 | for (int i = 0; i < NumBadChars; i++) { |
2576 | if (occ1.at(i) > b.occ1.at(i)) |
2577 | occ1[i] = b.occ1.at(i); |
2578 | } |
2579 | earlyStart = 0; |
2580 | lateStart = 0; |
2581 | str = QString(); |
2582 | leftStr = QString(); |
2583 | rightStr = QString(); |
2584 | if (b.maxl > maxl) |
2585 | maxl = b.maxl; |
2586 | #endif |
2587 | if (b.minl < minl) |
2588 | minl = b.minl; |
2589 | } |
2590 | |
2591 | void QRegExpEngine::Box::plus(int atom) |
2592 | { |
2593 | #ifndef QT_NO_REGEXP_CAPTURE |
2594 | eng->addPlusTransitions(from: rs, to: ls, atom); |
2595 | #else |
2596 | Q_UNUSED(atom); |
2597 | eng->addCatTransitions(rs, ls); |
2598 | #endif |
2599 | addAnchorsToEngine(to: *this); |
2600 | #ifndef QT_NO_REGEXP_OPTIM |
2601 | maxl = InftyLen; |
2602 | #endif |
2603 | } |
2604 | |
2605 | void QRegExpEngine::Box::opt() |
2606 | { |
2607 | #ifndef QT_NO_REGEXP_OPTIM |
2608 | earlyStart = 0; |
2609 | lateStart = 0; |
2610 | str = QString(); |
2611 | leftStr = QString(); |
2612 | rightStr = QString(); |
2613 | #endif |
2614 | skipanchors = 0; |
2615 | minl = 0; |
2616 | } |
2617 | |
2618 | void QRegExpEngine::Box::catAnchor(int a) |
2619 | { |
2620 | if (a != 0) { |
2621 | for (int i = 0; i < rs.size(); i++) { |
2622 | a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i), defaultValue: 0), b: a); |
2623 | ranchors.insert(key: rs.at(i), value: a); |
2624 | } |
2625 | if (minl == 0) |
2626 | skipanchors = eng->anchorConcatenation(a: skipanchors, b: a); |
2627 | } |
2628 | } |
2629 | |
2630 | #ifndef QT_NO_REGEXP_OPTIM |
2631 | void QRegExpEngine::Box::setupHeuristics() |
2632 | { |
2633 | eng->goodEarlyStart = earlyStart; |
2634 | eng->goodLateStart = lateStart; |
2635 | eng->goodStr = eng->cs ? str : str.toLower(); |
2636 | |
2637 | eng->minl = minl; |
2638 | if (eng->cs) { |
2639 | /* |
2640 | A regular expression such as 112|1 has occ1['2'] = 2 and minl = |
2641 | 1 at this point. An entry of occ1 has to be at most minl or |
2642 | infinity for the rest of the algorithm to go well. |
2643 | |
2644 | We waited until here before normalizing these cases (instead of |
2645 | doing it in Box::orx()) because sometimes things improve by |
2646 | themselves. Consider for example (112|1)34. |
2647 | */ |
2648 | for (int i = 0; i < NumBadChars; i++) { |
2649 | if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl) |
2650 | occ1[i] = minl; |
2651 | } |
2652 | eng->occ1 = occ1; |
2653 | } else { |
2654 | eng->occ1.fill(t: 0, newSize: NumBadChars); |
2655 | } |
2656 | |
2657 | eng->heuristicallyChooseHeuristic(); |
2658 | } |
2659 | #endif |
2660 | |
2661 | #if defined(QT_DEBUG) |
2662 | void QRegExpEngine::Box::dump() const |
2663 | { |
2664 | int i; |
2665 | qDebug(msg: "Box of at least %d character%s" , minl, minl == 1 ? "" : "s" ); |
2666 | qDebug(msg: " Left states:" ); |
2667 | for (i = 0; i < ls.size(); i++) { |
2668 | if (lanchors.value(key: ls[i], defaultValue: 0) == 0) |
2669 | qDebug(msg: " %d" , ls[i]); |
2670 | else |
2671 | qDebug(msg: " %d [anchors 0x%.8x]" , ls[i], lanchors[ls[i]]); |
2672 | } |
2673 | qDebug(msg: " Right states:" ); |
2674 | for (i = 0; i < rs.size(); i++) { |
2675 | if (ranchors.value(key: rs[i], defaultValue: 0) == 0) |
2676 | qDebug(msg: " %d" , rs[i]); |
2677 | else |
2678 | qDebug(msg: " %d [anchors 0x%.8x]" , rs[i], ranchors[rs[i]]); |
2679 | } |
2680 | qDebug(msg: " Skip anchors: 0x%.8x" , skipanchors); |
2681 | } |
2682 | #endif |
2683 | |
2684 | void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const |
2685 | { |
2686 | for (int i = 0; i < to.ls.size(); i++) { |
2687 | for (int j = 0; j < rs.size(); j++) { |
2688 | int a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i: j), defaultValue: 0), |
2689 | b: to.lanchors.value(key: to.ls.at(i), defaultValue: 0)); |
2690 | eng->addAnchors(from: rs[j], to: to.ls[i], a); |
2691 | } |
2692 | } |
2693 | } |
2694 | |
2695 | #ifndef QT_NO_REGEXP_CCLASS |
// Lookup table for the \p{Is<Block>} XML Schema extension: maps a block
// name to the inclusive range [first, second] of code points it covers.
// The entries MUST stay sorted by name in qstrcmp() order, because the
// table is searched with std::lower_bound().
static const struct CategoriesRangeMapEntry {
    const char name[40];
    uint first, second;
} categoriesRangeMap[] = {
    { "AegeanNumbers",                        0x10100,  0x1013F  },
    { "AlphabeticPresentationForms",          0xFB00,   0xFB4F   },
    { "AncientGreekMusicalNotation",          0x1D200,  0x1D24F  },
    { "AncientGreekNumbers",                  0x10140,  0x1018F  },
    { "Arabic",                               0x0600,   0x06FF   },
    { "ArabicPresentationForms-A",            0xFB50,   0xFDFF   },
    { "ArabicPresentationForms-B",            0xFE70,   0xFEFF   },
    { "ArabicSupplement",                     0x0750,   0x077F   },
    { "Armenian",                             0x0530,   0x058F   },
    { "Arrows",                               0x2190,   0x21FF   },
    { "BasicLatin",                           0x0000,   0x007F   },
    { "Bengali",                              0x0980,   0x09FF   },
    { "BlockElements",                        0x2580,   0x259F   },
    { "Bopomofo",                             0x3100,   0x312F   },
    { "BopomofoExtended",                     0x31A0,   0x31BF   },
    { "BoxDrawing",                           0x2500,   0x257F   },
    { "BraillePatterns",                      0x2800,   0x28FF   },
    { "Buginese",                             0x1A00,   0x1A1F   },
    { "Buhid",                                0x1740,   0x175F   },
    { "ByzantineMusicalSymbols",              0x1D000,  0x1D0FF  },
    { "CJKCompatibility",                     0x3300,   0x33FF   },
    { "CJKCompatibilityForms",                0xFE30,   0xFE4F   },
    { "CJKCompatibilityIdeographs",           0xF900,   0xFAFF   },
    { "CJKCompatibilityIdeographsSupplement", 0x2F800,  0x2FA1F  },
    { "CJKRadicalsSupplement",                0x2E80,   0x2EFF   },
    { "CJKStrokes",                           0x31C0,   0x31EF   },
    { "CJKSymbolsandPunctuation",             0x3000,   0x303F   },
    { "CJKUnifiedIdeographs",                 0x4E00,   0x9FFF   },
    { "CJKUnifiedIdeographsExtensionA",       0x3400,   0x4DB5   },
    { "CJKUnifiedIdeographsExtensionB",       0x20000,  0x2A6DF  },
    { "Cherokee",                             0x13A0,   0x13FF   },
    { "CombiningDiacriticalMarks",            0x0300,   0x036F   },
    { "CombiningDiacriticalMarksSupplement",  0x1DC0,   0x1DFF   },
    { "CombiningHalfMarks",                   0xFE20,   0xFE2F   },
    { "CombiningMarksforSymbols",             0x20D0,   0x20FF   },
    { "ControlPictures",                      0x2400,   0x243F   },
    { "Coptic",                               0x2C80,   0x2CFF   },
    { "CurrencySymbols",                      0x20A0,   0x20CF   },
    { "CypriotSyllabary",                     0x10800,  0x1083F  },
    { "Cyrillic",                             0x0400,   0x04FF   },
    { "CyrillicSupplement",                   0x0500,   0x052F   },
    { "Deseret",                              0x10400,  0x1044F  },
    { "Devanagari",                           0x0900,   0x097F   },
    { "Dingbats",                             0x2700,   0x27BF   },
    { "EnclosedAlphanumerics",                0x2460,   0x24FF   },
    { "EnclosedCJKLettersandMonths",          0x3200,   0x32FF   },
    { "Ethiopic",                             0x1200,   0x137F   },
    { "EthiopicExtended",                     0x2D80,   0x2DDF   },
    { "EthiopicSupplement",                   0x1380,   0x139F   },
    { "GeneralPunctuation",                   0x2000,   0x206F   },
    { "GeometricShapes",                      0x25A0,   0x25FF   },
    { "Georgian",                             0x10A0,   0x10FF   },
    { "GeorgianSupplement",                   0x2D00,   0x2D2F   },
    { "Glagolitic",                           0x2C00,   0x2C5F   },
    { "Gothic",                               0x10330,  0x1034F  },
    { "Greek",                                0x0370,   0x03FF   },
    { "GreekExtended",                        0x1F00,   0x1FFF   },
    { "Gujarati",                             0x0A80,   0x0AFF   },
    { "Gurmukhi",                             0x0A00,   0x0A7F   },
    { "HalfwidthandFullwidthForms",           0xFF00,   0xFFEF   },
    { "HangulCompatibilityJamo",              0x3130,   0x318F   },
    { "HangulJamo",                           0x1100,   0x11FF   },
    { "HangulSyllables",                      0xAC00,   0xD7A3   },
    { "Hanunoo",                              0x1720,   0x173F   },
    { "Hebrew",                               0x0590,   0x05FF   },
    { "Hiragana",                             0x3040,   0x309F   },
    { "IPAExtensions",                        0x0250,   0x02AF   },
    { "IdeographicDescriptionCharacters",     0x2FF0,   0x2FFF   },
    { "Kanbun",                               0x3190,   0x319F   },
    { "KangxiRadicals",                       0x2F00,   0x2FDF   },
    { "Kannada",                              0x0C80,   0x0CFF   },
    { "Katakana",                             0x30A0,   0x30FF   },
    { "KatakanaPhoneticExtensions",           0x31F0,   0x31FF   },
    { "Kharoshthi",                           0x10A00,  0x10A5F  },
    { "Khmer",                                0x1780,   0x17FF   },
    { "KhmerSymbols",                         0x19E0,   0x19FF   },
    { "Lao",                                  0x0E80,   0x0EFF   },
    { "Latin-1Supplement",                    0x0080,   0x00FF   },
    { "LatinExtended-A",                      0x0100,   0x017F   },
    { "LatinExtended-B",                      0x0180,   0x024F   },
    { "LatinExtendedAdditional",              0x1E00,   0x1EFF   },
    { "LetterlikeSymbols",                    0x2100,   0x214F   },
    { "Limbu",                                0x1900,   0x194F   },
    { "LinearBIdeograms",                     0x10080,  0x100FF  },
    { "LinearBSyllabary",                     0x10000,  0x1007F  },
    { "Malayalam",                            0x0D00,   0x0D7F   },
    { "MathematicalAlphanumericSymbols",      0x1D400,  0x1D7FF  },
    { "MathematicalOperators",                0x2200,   0x22FF   },
    { "MiscellaneousMathematicalSymbols-A",   0x27C0,   0x27EF   },
    { "MiscellaneousMathematicalSymbols-B",   0x2980,   0x29FF   },
    { "MiscellaneousSymbols",                 0x2600,   0x26FF   },
    { "MiscellaneousSymbolsandArrows",        0x2B00,   0x2BFF   },
    { "MiscellaneousTechnical",               0x2300,   0x23FF   },
    { "ModifierToneLetters",                  0xA700,   0xA71F   },
    { "Mongolian",                            0x1800,   0x18AF   },
    { "MusicalSymbols",                       0x1D100,  0x1D1FF  },
    { "Myanmar",                              0x1000,   0x109F   },
    { "NewTaiLue",                            0x1980,   0x19DF   },
    { "NumberForms",                          0x2150,   0x218F   },
    { "Ogham",                                0x1680,   0x169F   },
    { "OldItalic",                            0x10300,  0x1032F  },
    { "OldPersian",                           0x103A0,  0x103DF  },
    { "OpticalCharacterRecognition",          0x2440,   0x245F   },
    { "Oriya",                                0x0B00,   0x0B7F   },
    { "Osmanya",                              0x10480,  0x104AF  },
    { "PhoneticExtensions",                   0x1D00,   0x1D7F   },
    { "PhoneticExtensionsSupplement",         0x1D80,   0x1DBF   },
    { "PrivateUse",                           0xE000,   0xF8FF   },
    { "Runic",                                0x16A0,   0x16FF   },
    { "Shavian",                              0x10450,  0x1047F  },
    { "Sinhala",                              0x0D80,   0x0DFF   },
    { "SmallFormVariants",                    0xFE50,   0xFE6F   },
    { "SpacingModifierLetters",               0x02B0,   0x02FF   },
    { "Specials",                             0xFFF0,   0xFFFF   },
    { "SuperscriptsandSubscripts",            0x2070,   0x209F   },
    { "SupplementalArrows-A",                 0x27F0,   0x27FF   },
    { "SupplementalArrows-B",                 0x2900,   0x297F   },
    { "SupplementalMathematicalOperators",    0x2A00,   0x2AFF   },
    { "SupplementalPunctuation",              0x2E00,   0x2E7F   },
    { "SupplementaryPrivateUseArea-A",        0xF0000,  0xFFFFF  },
    { "SupplementaryPrivateUseArea-B",        0x100000, 0x10FFFF },
    { "SylotiNagri",                          0xA800,   0xA82F   },
    { "Syriac",                               0x0700,   0x074F   },
    { "Tagalog",                              0x1700,   0x171F   },
    { "Tagbanwa",                             0x1760,   0x177F   },
    { "Tags",                                 0xE0000,  0xE007F  },
    { "TaiLe",                                0x1950,   0x197F   },
    { "TaiXuanJingSymbols",                   0x1D300,  0x1D35F  },
    { "Tamil",                                0x0B80,   0x0BFF   },
    { "Telugu",                               0x0C00,   0x0C7F   },
    { "Thaana",                               0x0780,   0x07BF   },
    { "Thai",                                 0x0E00,   0x0E7F   },
    { "Tibetan",                              0x0F00,   0x0FFF   },
    { "Tifinagh",                             0x2D30,   0x2D7F   },
    { "Ugaritic",                             0x10380,  0x1039F  },
    { "UnifiedCanadianAboriginalSyllabics",   0x1400,   0x167F   },
    { "VariationSelectors",                   0xFE00,   0xFE0F   },
    { "VariationSelectorsSupplement",         0xE0100,  0xE01EF  },
    { "VerticalForms",                        0xFE10,   0xFE1F   },
    { "YiRadicals",                           0xA490,   0xA4CF   },
    { "YiSyllables",                          0xA000,   0xA48F   },
    { "YijingHexagramSymbols",                0x4DC0,   0x4DFF   }
};
2845 | |
2846 | inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2) |
2847 | { return qstrcmp(str1: entry1.name, str2: entry2.name) < 0; } |
2848 | inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry) |
2849 | { return qstrcmp(str1: name, str2: entry.name) < 0; } |
2850 | inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name) |
2851 | { return qstrcmp(str1: entry.name, str2: name) < 0; } |
2852 | #endif // QT_NO_REGEXP_CCLASS |
2853 | |
2854 | int QRegExpEngine::getChar() |
2855 | { |
2856 | return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode(); |
2857 | } |
2858 | |
2859 | int QRegExpEngine::getEscape() |
2860 | { |
2861 | #ifndef QT_NO_REGEXP_ESCAPE |
2862 | const char tab[] = "afnrtv" ; // no b, as \b means word boundary |
2863 | const char backTab[] = "\a\f\n\r\t\v" ; |
2864 | ushort low; |
2865 | int i; |
2866 | #endif |
2867 | ushort val; |
2868 | int prevCh = yyCh; |
2869 | |
2870 | if (prevCh == EOS) { |
2871 | error(RXERR_END); |
2872 | return Tok_Char | '\\'; |
2873 | } |
2874 | yyCh = getChar(); |
2875 | #ifndef QT_NO_REGEXP_ESCAPE |
2876 | if ((prevCh & ~0xff) == 0) { |
2877 | const char *p = strchr(s: tab, c: prevCh); |
2878 | if (p != nullptr) |
2879 | return Tok_Char | backTab[p - tab]; |
2880 | } |
2881 | #endif |
2882 | |
2883 | switch (prevCh) { |
2884 | #ifndef QT_NO_REGEXP_ESCAPE |
2885 | case '0': |
2886 | val = 0; |
2887 | for (i = 0; i < 3; i++) { |
2888 | if (yyCh >= '0' && yyCh <= '7') |
2889 | val = (val << 3) | (yyCh - '0'); |
2890 | else |
2891 | break; |
2892 | yyCh = getChar(); |
2893 | } |
2894 | if ((val & ~0377) != 0) |
2895 | error(RXERR_OCTAL); |
2896 | return Tok_Char | val; |
2897 | #endif |
2898 | #ifndef QT_NO_REGEXP_ESCAPE |
2899 | case 'B': |
2900 | return Tok_NonWord; |
2901 | #endif |
2902 | #ifndef QT_NO_REGEXP_CCLASS |
2903 | case 'D': |
2904 | // see QChar::isDigit() |
2905 | yyCharClass->addCategories(cats: uint(-1) ^ FLAG(QChar::Number_DecimalDigit)); |
2906 | return Tok_CharClass; |
2907 | case 'S': |
2908 | // see QChar::isSpace() |
2909 | yyCharClass->addCategories(cats: uint(-1) ^ (FLAG(QChar::Separator_Space) | |
2910 | FLAG(QChar::Separator_Line) | |
2911 | FLAG(QChar::Separator_Paragraph) | |
2912 | FLAG(QChar::Other_Control))); |
2913 | yyCharClass->addRange(from: 0x0000, to: 0x0008); |
2914 | yyCharClass->addRange(from: 0x000e, to: 0x001f); |
2915 | yyCharClass->addRange(from: 0x007f, to: 0x0084); |
2916 | yyCharClass->addRange(from: 0x0086, to: 0x009f); |
2917 | return Tok_CharClass; |
2918 | case 'W': |
2919 | // see QChar::isLetterOrNumber() and QChar::isMark() |
2920 | yyCharClass->addCategories(cats: uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) | |
2921 | FLAG(QChar::Mark_SpacingCombining) | |
2922 | FLAG(QChar::Mark_Enclosing) | |
2923 | FLAG(QChar::Number_DecimalDigit) | |
2924 | FLAG(QChar::Number_Letter) | |
2925 | FLAG(QChar::Number_Other) | |
2926 | FLAG(QChar::Letter_Uppercase) | |
2927 | FLAG(QChar::Letter_Lowercase) | |
2928 | FLAG(QChar::Letter_Titlecase) | |
2929 | FLAG(QChar::Letter_Modifier) | |
2930 | FLAG(QChar::Letter_Other) | |
2931 | FLAG(QChar::Punctuation_Connector))); |
2932 | yyCharClass->addRange(from: 0x203f, to: 0x2040); |
2933 | yyCharClass->addSingleton(ch: 0x2040); |
2934 | yyCharClass->addSingleton(ch: 0x2054); |
2935 | yyCharClass->addSingleton(ch: 0x30fb); |
2936 | yyCharClass->addRange(from: 0xfe33, to: 0xfe34); |
2937 | yyCharClass->addRange(from: 0xfe4d, to: 0xfe4f); |
2938 | yyCharClass->addSingleton(ch: 0xff3f); |
2939 | yyCharClass->addSingleton(ch: 0xff65); |
2940 | return Tok_CharClass; |
2941 | #endif |
2942 | #ifndef QT_NO_REGEXP_ESCAPE |
2943 | case 'b': |
2944 | return Tok_Word; |
2945 | #endif |
2946 | #ifndef QT_NO_REGEXP_CCLASS |
2947 | case 'd': |
2948 | // see QChar::isDigit() |
2949 | yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); |
2950 | return Tok_CharClass; |
2951 | case 's': |
2952 | // see QChar::isSpace() |
2953 | yyCharClass->addCategories(FLAG(QChar::Separator_Space) | |
2954 | FLAG(QChar::Separator_Line) | |
2955 | FLAG(QChar::Separator_Paragraph)); |
2956 | yyCharClass->addRange(from: 0x0009, to: 0x000d); |
2957 | yyCharClass->addSingleton(ch: 0x0085); |
2958 | return Tok_CharClass; |
2959 | case 'w': |
2960 | // see QChar::isLetterOrNumber() and QChar::isMark() |
2961 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | |
2962 | FLAG(QChar::Mark_SpacingCombining) | |
2963 | FLAG(QChar::Mark_Enclosing) | |
2964 | FLAG(QChar::Number_DecimalDigit) | |
2965 | FLAG(QChar::Number_Letter) | |
2966 | FLAG(QChar::Number_Other) | |
2967 | FLAG(QChar::Letter_Uppercase) | |
2968 | FLAG(QChar::Letter_Lowercase) | |
2969 | FLAG(QChar::Letter_Titlecase) | |
2970 | FLAG(QChar::Letter_Modifier) | |
2971 | FLAG(QChar::Letter_Other)); |
2972 | yyCharClass->addSingleton(ch: 0x005f); // '_' |
2973 | return Tok_CharClass; |
2974 | case 'I': |
2975 | if (!xmlSchemaExtensions) |
2976 | break; |
2977 | yyCharClass->setNegative(!yyCharClass->negative()); |
2978 | Q_FALLTHROUGH(); |
2979 | case 'i': |
2980 | if (xmlSchemaExtensions) { |
2981 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | |
2982 | FLAG(QChar::Mark_SpacingCombining) | |
2983 | FLAG(QChar::Mark_Enclosing) | |
2984 | FLAG(QChar::Number_DecimalDigit) | |
2985 | FLAG(QChar::Number_Letter) | |
2986 | FLAG(QChar::Number_Other) | |
2987 | FLAG(QChar::Letter_Uppercase) | |
2988 | FLAG(QChar::Letter_Lowercase) | |
2989 | FLAG(QChar::Letter_Titlecase) | |
2990 | FLAG(QChar::Letter_Modifier) | |
2991 | FLAG(QChar::Letter_Other)); |
2992 | yyCharClass->addSingleton(ch: 0x003a); // ':' |
2993 | yyCharClass->addSingleton(ch: 0x005f); // '_' |
2994 | yyCharClass->addRange(from: 0x0041, to: 0x005a); // [A-Z] |
2995 | yyCharClass->addRange(from: 0x0061, to: 0x007a); // [a-z] |
2996 | yyCharClass->addRange(from: 0xc0, to: 0xd6); |
2997 | yyCharClass->addRange(from: 0xd8, to: 0xf6); |
2998 | yyCharClass->addRange(from: 0xf8, to: 0x2ff); |
2999 | yyCharClass->addRange(from: 0x370, to: 0x37d); |
3000 | yyCharClass->addRange(from: 0x37f, to: 0x1fff); |
3001 | yyCharClass->addRange(from: 0x200c, to: 0x200d); |
3002 | yyCharClass->addRange(from: 0x2070, to: 0x218f); |
3003 | yyCharClass->addRange(from: 0x2c00, to: 0x2fef); |
3004 | yyCharClass->addRange(from: 0x3001, to: 0xd7ff); |
3005 | yyCharClass->addRange(from: 0xf900, to: 0xfdcf); |
3006 | yyCharClass->addRange(from: 0xfdf0, to: 0xfffd); |
3007 | yyCharClass->addRange(from: (ushort)0x10000, to: (ushort)0xeffff); |
3008 | return Tok_CharClass; |
3009 | } else { |
3010 | break; |
3011 | } |
3012 | case 'C': |
3013 | if (!xmlSchemaExtensions) |
3014 | break; |
3015 | yyCharClass->setNegative(!yyCharClass->negative()); |
3016 | Q_FALLTHROUGH(); |
3017 | case 'c': |
3018 | if (xmlSchemaExtensions) { |
3019 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | |
3020 | FLAG(QChar::Mark_SpacingCombining) | |
3021 | FLAG(QChar::Mark_Enclosing) | |
3022 | FLAG(QChar::Number_DecimalDigit) | |
3023 | FLAG(QChar::Number_Letter) | |
3024 | FLAG(QChar::Number_Other) | |
3025 | FLAG(QChar::Letter_Uppercase) | |
3026 | FLAG(QChar::Letter_Lowercase) | |
3027 | FLAG(QChar::Letter_Titlecase) | |
3028 | FLAG(QChar::Letter_Modifier) | |
3029 | FLAG(QChar::Letter_Other)); |
3030 | yyCharClass->addSingleton(ch: 0x002d); // '-' |
3031 | yyCharClass->addSingleton(ch: 0x002e); // '.' |
3032 | yyCharClass->addSingleton(ch: 0x003a); // ':' |
3033 | yyCharClass->addSingleton(ch: 0x005f); // '_' |
3034 | yyCharClass->addSingleton(ch: 0xb7); |
3035 | yyCharClass->addRange(from: 0x0030, to: 0x0039); // [0-9] |
3036 | yyCharClass->addRange(from: 0x0041, to: 0x005a); // [A-Z] |
3037 | yyCharClass->addRange(from: 0x0061, to: 0x007a); // [a-z] |
3038 | yyCharClass->addRange(from: 0xc0, to: 0xd6); |
3039 | yyCharClass->addRange(from: 0xd8, to: 0xf6); |
3040 | yyCharClass->addRange(from: 0xf8, to: 0x2ff); |
3041 | yyCharClass->addRange(from: 0x370, to: 0x37d); |
3042 | yyCharClass->addRange(from: 0x37f, to: 0x1fff); |
3043 | yyCharClass->addRange(from: 0x200c, to: 0x200d); |
3044 | yyCharClass->addRange(from: 0x2070, to: 0x218f); |
3045 | yyCharClass->addRange(from: 0x2c00, to: 0x2fef); |
3046 | yyCharClass->addRange(from: 0x3001, to: 0xd7ff); |
3047 | yyCharClass->addRange(from: 0xf900, to: 0xfdcf); |
3048 | yyCharClass->addRange(from: 0xfdf0, to: 0xfffd); |
3049 | yyCharClass->addRange(from: (ushort)0x10000, to: (ushort)0xeffff); |
3050 | yyCharClass->addRange(from: 0x0300, to: 0x036f); |
3051 | yyCharClass->addRange(from: 0x203f, to: 0x2040); |
3052 | return Tok_CharClass; |
3053 | } else { |
3054 | break; |
3055 | } |
3056 | case 'P': |
3057 | if (!xmlSchemaExtensions) |
3058 | break; |
3059 | yyCharClass->setNegative(!yyCharClass->negative()); |
3060 | Q_FALLTHROUGH(); |
3061 | case 'p': |
3062 | if (xmlSchemaExtensions) { |
3063 | if (yyCh != '{') { |
3064 | error(RXERR_CHARCLASS); |
3065 | return Tok_CharClass; |
3066 | } |
3067 | |
3068 | QByteArray category; |
3069 | yyCh = getChar(); |
3070 | while (yyCh != '}') { |
3071 | if (yyCh == EOS) { |
3072 | error(RXERR_END); |
3073 | return Tok_CharClass; |
3074 | } |
3075 | category.append(c: yyCh); |
3076 | yyCh = getChar(); |
3077 | } |
3078 | yyCh = getChar(); // skip closing '}' |
3079 | |
3080 | int catlen = category.size(); |
3081 | if (catlen == 1 || catlen == 2) { |
3082 | switch (category.at(i: 0)) { |
3083 | case 'M': |
3084 | if (catlen == 1) { |
3085 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | |
3086 | FLAG(QChar::Mark_SpacingCombining) | |
3087 | FLAG(QChar::Mark_Enclosing)); |
3088 | } else { |
3089 | switch (category.at(i: 1)) { |
3090 | case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn |
3091 | case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc |
3092 | case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me |
3093 | default: error(RXERR_CATEGORY); break; |
3094 | } |
3095 | } |
3096 | break; |
3097 | case 'N': |
3098 | if (catlen == 1) { |
3099 | yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) | |
3100 | FLAG(QChar::Number_Letter) | |
3101 | FLAG(QChar::Number_Other)); |
3102 | } else { |
3103 | switch (category.at(i: 1)) { |
3104 | case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd |
3105 | case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl |
3106 | case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No |
3107 | default: error(RXERR_CATEGORY); break; |
3108 | } |
3109 | } |
3110 | break; |
3111 | case 'Z': |
3112 | if (catlen == 1) { |
3113 | yyCharClass->addCategories(FLAG(QChar::Separator_Space) | |
3114 | FLAG(QChar::Separator_Line) | |
3115 | FLAG(QChar::Separator_Paragraph)); |
3116 | } else { |
3117 | switch (category.at(i: 1)) { |
3118 | case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs |
3119 | case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl |
3120 | case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp |
3121 | default: error(RXERR_CATEGORY); break; |
3122 | } |
3123 | } |
3124 | break; |
3125 | case 'C': |
3126 | if (catlen == 1) { |
3127 | yyCharClass->addCategories(FLAG(QChar::Other_Control) | |
3128 | FLAG(QChar::Other_Format) | |
3129 | FLAG(QChar::Other_Surrogate) | |
3130 | FLAG(QChar::Other_PrivateUse) | |
3131 | FLAG(QChar::Other_NotAssigned)); |
3132 | } else { |
3133 | switch (category.at(i: 1)) { |
3134 | case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc |
3135 | case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf |
3136 | case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs |
3137 | case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co |
3138 | case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn |
3139 | default: error(RXERR_CATEGORY); break; |
3140 | } |
3141 | } |
3142 | break; |
3143 | case 'L': |
3144 | if (catlen == 1) { |
3145 | yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) | |
3146 | FLAG(QChar::Letter_Lowercase) | |
3147 | FLAG(QChar::Letter_Titlecase) | |
3148 | FLAG(QChar::Letter_Modifier) | |
3149 | FLAG(QChar::Letter_Other)); |
3150 | } else { |
3151 | switch (category.at(i: 1)) { |
3152 | case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu |
3153 | case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll |
3154 | case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt |
3155 | case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm |
3156 | case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo |
3157 | default: error(RXERR_CATEGORY); break; |
3158 | } |
3159 | } |
3160 | break; |
3161 | case 'P': |
3162 | if (catlen == 1) { |
3163 | yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) | |
3164 | FLAG(QChar::Punctuation_Dash) | |
3165 | FLAG(QChar::Punctuation_Open) | |
3166 | FLAG(QChar::Punctuation_Close) | |
3167 | FLAG(QChar::Punctuation_InitialQuote) | |
3168 | FLAG(QChar::Punctuation_FinalQuote) | |
3169 | FLAG(QChar::Punctuation_Other)); |
3170 | } else { |
3171 | switch (category.at(i: 1)) { |
3172 | case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc |
3173 | case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd |
3174 | case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps |
3175 | case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe |
3176 | case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi |
3177 | case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf |
3178 | case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po |
3179 | default: error(RXERR_CATEGORY); break; |
3180 | } |
3181 | } |
3182 | break; |
3183 | case 'S': |
3184 | if (catlen == 1) { |
3185 | yyCharClass->addCategories(FLAG(QChar::Symbol_Math) | |
3186 | FLAG(QChar::Symbol_Currency) | |
3187 | FLAG(QChar::Symbol_Modifier) | |
3188 | FLAG(QChar::Symbol_Other)); |
3189 | } else { |
3190 | switch (category.at(i: 1)) { |
3191 | case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm |
3192 | case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc |
3193 | case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk |
3194 | case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So |
3195 | default: error(RXERR_CATEGORY); break; |
3196 | } |
3197 | } |
3198 | break; |
3199 | default: |
3200 | error(RXERR_CATEGORY); |
3201 | break; |
3202 | } |
3203 | } else if (catlen > 2 && category.at(i: 0) == 'I' && category.at(i: 1) == 's') { |
3204 | static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]); |
3205 | const char * const categoryFamily = category.constData() + 2; |
3206 | const CategoriesRangeMapEntry *r = std::lower_bound(first: categoriesRangeMap, last: categoriesRangeMap + N, val: categoryFamily); |
3207 | if (r != categoriesRangeMap + N && qstrcmp(str1: r->name, str2: categoryFamily) == 0) |
3208 | yyCharClass->addRange(from: r->first, to: r->second); |
3209 | else |
3210 | error(RXERR_CATEGORY); |
3211 | } else { |
3212 | error(RXERR_CATEGORY); |
3213 | } |
3214 | return Tok_CharClass; |
3215 | } else { |
3216 | break; |
3217 | } |
3218 | #endif |
3219 | #ifndef QT_NO_REGEXP_ESCAPE |
3220 | case 'x': |
3221 | val = 0; |
3222 | for (i = 0; i < 4; i++) { |
3223 | low = QChar(yyCh).toLower().unicode(); |
3224 | if (low >= '0' && low <= '9') |
3225 | val = (val << 4) | (low - '0'); |
3226 | else if (low >= 'a' && low <= 'f') |
3227 | val = (val << 4) | (low - 'a' + 10); |
3228 | else |
3229 | break; |
3230 | yyCh = getChar(); |
3231 | } |
3232 | return Tok_Char | val; |
3233 | #endif |
3234 | default: |
3235 | break; |
3236 | } |
3237 | if (prevCh >= '1' && prevCh <= '9') { |
3238 | #ifndef QT_NO_REGEXP_BACKREF |
3239 | val = prevCh - '0'; |
3240 | while (yyCh >= '0' && yyCh <= '9') { |
3241 | val = (val * 10) + (yyCh - '0'); |
3242 | yyCh = getChar(); |
3243 | } |
3244 | return Tok_BackRef | val; |
3245 | #else |
3246 | error(RXERR_DISABLED); |
3247 | #endif |
3248 | } |
3249 | return Tok_Char | prevCh; |
3250 | } |
3251 | |
3252 | #ifndef QT_NO_REGEXP_INTERVAL |
3253 | int QRegExpEngine::getRep(int def) |
3254 | { |
3255 | if (yyCh >= '0' && yyCh <= '9') { |
3256 | int rep = 0; |
3257 | do { |
3258 | rep = 10 * rep + yyCh - '0'; |
3259 | if (rep >= InftyRep) { |
3260 | error(RXERR_REPETITION); |
3261 | rep = def; |
3262 | } |
3263 | yyCh = getChar(); |
3264 | } while (yyCh >= '0' && yyCh <= '9'); |
3265 | return rep; |
3266 | } else { |
3267 | return def; |
3268 | } |
3269 | } |
3270 | #endif |
3271 | |
3272 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
3273 | void QRegExpEngine::skipChars(int n) |
3274 | { |
3275 | if (n > 0) { |
3276 | yyPos += n - 1; |
3277 | yyCh = getChar(); |
3278 | } |
3279 | } |
3280 | #endif |
3281 | |
3282 | void QRegExpEngine::error(const char *msg) |
3283 | { |
3284 | if (yyError.isEmpty()) |
3285 | yyError = QLatin1String(msg); |
3286 | } |
3287 | |
3288 | void QRegExpEngine::startTokenizer(const QChar *rx, int len) |
3289 | { |
3290 | yyIn = rx; |
3291 | yyPos0 = 0; |
3292 | yyPos = 0; |
3293 | yyLen = len; |
3294 | yyCh = getChar(); |
3295 | yyCharClass.reset(other: new QRegExpCharClass); |
3296 | yyMinRep = 0; |
3297 | yyMaxRep = 0; |
3298 | yyError = QString(); |
3299 | } |
3300 | |
3301 | int QRegExpEngine::getToken() |
3302 | { |
3303 | #ifndef QT_NO_REGEXP_CCLASS |
3304 | ushort pendingCh = 0; |
3305 | bool charPending; |
3306 | bool rangePending; |
3307 | int tok; |
3308 | #endif |
3309 | int prevCh = yyCh; |
3310 | |
3311 | yyPos0 = yyPos - 1; |
3312 | #ifndef QT_NO_REGEXP_CCLASS |
3313 | yyCharClass->clear(); |
3314 | #endif |
3315 | yyMinRep = 0; |
3316 | yyMaxRep = 0; |
3317 | yyCh = getChar(); |
3318 | |
3319 | switch (prevCh) { |
3320 | case EOS: |
3321 | yyPos0 = yyPos; |
3322 | return Tok_Eos; |
3323 | case '$': |
3324 | return Tok_Dollar; |
3325 | case '(': |
3326 | if (yyCh == '?') { |
3327 | prevCh = getChar(); |
3328 | yyCh = getChar(); |
3329 | switch (prevCh) { |
3330 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
3331 | case '!': |
3332 | return Tok_NegLookahead; |
3333 | case '=': |
3334 | return Tok_PosLookahead; |
3335 | #endif |
3336 | case ':': |
3337 | return Tok_MagicLeftParen; |
3338 | case '<': |
3339 | error(RXERR_LOOKBEHIND); |
3340 | return Tok_MagicLeftParen; |
3341 | default: |
3342 | error(RXERR_LOOKAHEAD); |
3343 | return Tok_MagicLeftParen; |
3344 | } |
3345 | } else { |
3346 | return Tok_LeftParen; |
3347 | } |
3348 | case ')': |
3349 | return Tok_RightParen; |
3350 | case '*': |
3351 | yyMinRep = 0; |
3352 | yyMaxRep = InftyRep; |
3353 | return Tok_Quantifier; |
3354 | case '+': |
3355 | yyMinRep = 1; |
3356 | yyMaxRep = InftyRep; |
3357 | return Tok_Quantifier; |
3358 | case '.': |
3359 | #ifndef QT_NO_REGEXP_CCLASS |
3360 | yyCharClass->setNegative(true); |
3361 | #endif |
3362 | return Tok_CharClass; |
3363 | case '?': |
3364 | yyMinRep = 0; |
3365 | yyMaxRep = 1; |
3366 | return Tok_Quantifier; |
3367 | case '[': |
3368 | #ifndef QT_NO_REGEXP_CCLASS |
3369 | if (yyCh == '^') { |
3370 | yyCharClass->setNegative(true); |
3371 | yyCh = getChar(); |
3372 | } |
3373 | charPending = false; |
3374 | rangePending = false; |
3375 | do { |
3376 | if (yyCh == '-' && charPending && !rangePending) { |
3377 | rangePending = true; |
3378 | yyCh = getChar(); |
3379 | } else { |
3380 | if (charPending && !rangePending) { |
3381 | yyCharClass->addSingleton(ch: pendingCh); |
3382 | charPending = false; |
3383 | } |
3384 | if (yyCh == '\\') { |
3385 | yyCh = getChar(); |
3386 | tok = getEscape(); |
3387 | if (tok == Tok_Word) |
3388 | tok = '\b'; |
3389 | } else { |
3390 | tok = Tok_Char | yyCh; |
3391 | yyCh = getChar(); |
3392 | } |
3393 | if (tok == Tok_CharClass) { |
3394 | if (rangePending) { |
3395 | yyCharClass->addSingleton(ch: '-'); |
3396 | yyCharClass->addSingleton(ch: pendingCh); |
3397 | charPending = false; |
3398 | rangePending = false; |
3399 | } |
3400 | } else if ((tok & Tok_Char) != 0) { |
3401 | if (rangePending) { |
3402 | yyCharClass->addRange(from: pendingCh, to: tok ^ Tok_Char); |
3403 | charPending = false; |
3404 | rangePending = false; |
3405 | } else { |
3406 | pendingCh = tok ^ Tok_Char; |
3407 | charPending = true; |
3408 | } |
3409 | } else { |
3410 | error(RXERR_CHARCLASS); |
3411 | } |
3412 | } |
3413 | } while (yyCh != ']' && yyCh != EOS); |
3414 | if (rangePending) |
3415 | yyCharClass->addSingleton(ch: '-'); |
3416 | if (charPending) |
3417 | yyCharClass->addSingleton(ch: pendingCh); |
3418 | if (yyCh == EOS) |
3419 | error(RXERR_END); |
3420 | else |
3421 | yyCh = getChar(); |
3422 | return Tok_CharClass; |
3423 | #else |
3424 | error(RXERR_END); |
3425 | return Tok_Char | '['; |
3426 | #endif |
3427 | case '\\': |
3428 | return getEscape(); |
3429 | case ']': |
3430 | error(RXERR_LEFTDELIM); |
3431 | return Tok_Char | ']'; |
3432 | case '^': |
3433 | return Tok_Caret; |
3434 | case '{': |
3435 | #ifndef QT_NO_REGEXP_INTERVAL |
3436 | yyMinRep = getRep(def: 0); |
3437 | yyMaxRep = yyMinRep; |
3438 | if (yyCh == ',') { |
3439 | yyCh = getChar(); |
3440 | yyMaxRep = getRep(def: InftyRep); |
3441 | } |
3442 | if (yyMaxRep < yyMinRep) |
3443 | error(RXERR_INTERVAL); |
3444 | if (yyCh != '}') |
3445 | error(RXERR_REPETITION); |
3446 | yyCh = getChar(); |
3447 | return Tok_Quantifier; |
3448 | #else |
3449 | error(RXERR_DISABLED); |
3450 | return Tok_Char | '{'; |
3451 | #endif |
3452 | case '|': |
3453 | return Tok_Bar; |
3454 | case '}': |
3455 | error(RXERR_LEFTDELIM); |
3456 | return Tok_Char | '}'; |
3457 | default: |
3458 | return Tok_Char | prevCh; |
3459 | } |
3460 | } |
3461 | |
3462 | int QRegExpEngine::parse(const QChar *pattern, int len) |
3463 | { |
3464 | valid = true; |
3465 | startTokenizer(rx: pattern, len); |
3466 | yyTok = getToken(); |
3467 | #ifndef QT_NO_REGEXP_CAPTURE |
3468 | yyMayCapture = true; |
3469 | #else |
3470 | yyMayCapture = false; |
3471 | #endif |
3472 | |
3473 | #ifndef QT_NO_REGEXP_CAPTURE |
3474 | int atom = startAtom(officialCapture: false); |
3475 | #endif |
3476 | QRegExpCharClass anything; |
3477 | Box box(this); // create InitialState |
3478 | box.set(anything); |
3479 | Box rightBox(this); // create FinalState |
3480 | rightBox.set(anything); |
3481 | |
3482 | Box middleBox(this); |
3483 | parseExpression(box: &middleBox); |
3484 | #ifndef QT_NO_REGEXP_CAPTURE |
3485 | finishAtom(atom, needCapture: false); |
3486 | #endif |
3487 | #ifndef QT_NO_REGEXP_OPTIM |
3488 | middleBox.setupHeuristics(); |
3489 | #endif |
3490 | box.cat(b: middleBox); |
3491 | box.cat(b: rightBox); |
3492 | yyCharClass.reset(); |
3493 | |
3494 | #ifndef QT_NO_REGEXP_CAPTURE |
3495 | for (int i = 0; i < nf; ++i) { |
3496 | switch (f[i].capture) { |
3497 | case QRegExpAtom::NoCapture: |
3498 | break; |
3499 | case QRegExpAtom::OfficialCapture: |
3500 | f[i].capture = ncap; |
3501 | captureForOfficialCapture.append(t: ncap); |
3502 | ++ncap; |
3503 | ++officialncap; |
3504 | break; |
3505 | case QRegExpAtom::UnofficialCapture: |
3506 | f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture; |
3507 | } |
3508 | } |
3509 | |
3510 | #ifndef QT_NO_REGEXP_BACKREF |
3511 | #ifndef QT_NO_REGEXP_OPTIM |
3512 | if (officialncap == 0 && nbrefs == 0) { |
3513 | ncap = nf = 0; |
3514 | f.clear(); |
3515 | } |
3516 | #endif |
3517 | // handle the case where there's a \5 with no corresponding capture |
3518 | // (captureForOfficialCapture.size() != officialncap) |
3519 | for (int i = 0; i < nbrefs - officialncap; ++i) { |
3520 | captureForOfficialCapture.append(t: ncap); |
3521 | ++ncap; |
3522 | } |
3523 | #endif |
3524 | #endif |
3525 | |
3526 | if (!yyError.isEmpty()) |
3527 | return -1; |
3528 | |
3529 | #ifndef QT_NO_REGEXP_OPTIM |
3530 | const QRegExpAutomatonState &sinit = s.at(i: InitialState); |
3531 | caretAnchored = !sinit.anchors.isEmpty(); |
3532 | if (caretAnchored) { |
3533 | const QMap<int, int> &anchors = sinit.anchors; |
3534 | QMap<int, int>::const_iterator a; |
3535 | for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) { |
3536 | if ( |
3537 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
3538 | (*a & Anchor_Alternation) != 0 || |
3539 | #endif |
3540 | (*a & Anchor_Caret) == 0) |
3541 | { |
3542 | caretAnchored = false; |
3543 | break; |
3544 | } |
3545 | } |
3546 | } |
3547 | #endif |
3548 | |
3549 | // cleanup anchors |
3550 | int numStates = s.size(); |
3551 | for (int i = 0; i < numStates; ++i) { |
3552 | QRegExpAutomatonState &state = s[i]; |
3553 | if (!state.anchors.isEmpty()) { |
3554 | QMap<int, int>::iterator a = state.anchors.begin(); |
3555 | while (a != state.anchors.end()) { |
3556 | if (a.value() == 0) |
3557 | a = state.anchors.erase(it: a); |
3558 | else |
3559 | ++a; |
3560 | } |
3561 | } |
3562 | } |
3563 | |
3564 | return yyPos0; |
3565 | } |
3566 | |
3567 | void QRegExpEngine::parseAtom(Box *box) |
3568 | { |
3569 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
3570 | QRegExpEngine *eng = nullptr; |
3571 | bool neg; |
3572 | int len; |
3573 | #endif |
3574 | |
3575 | if ((yyTok & Tok_Char) != 0) { |
3576 | box->set(QChar(yyTok ^ Tok_Char)); |
3577 | } else { |
3578 | #ifndef QT_NO_REGEXP_OPTIM |
3579 | trivial = false; |
3580 | #endif |
3581 | switch (yyTok) { |
3582 | case Tok_Dollar: |
3583 | box->catAnchor(a: Anchor_Dollar); |
3584 | break; |
3585 | case Tok_Caret: |
3586 | box->catAnchor(a: Anchor_Caret); |
3587 | break; |
3588 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
3589 | case Tok_PosLookahead: |
3590 | case Tok_NegLookahead: |
3591 | neg = (yyTok == Tok_NegLookahead); |
3592 | eng = new QRegExpEngine(cs, greedyQuantifiers); |
3593 | len = eng->parse(pattern: yyIn + yyPos - 1, len: yyLen - yyPos + 1); |
3594 | if (len >= 0) |
3595 | skipChars(n: len); |
3596 | else |
3597 | error(RXERR_LOOKAHEAD); |
3598 | box->catAnchor(a: addLookahead(eng, negative: neg)); |
3599 | yyTok = getToken(); |
3600 | if (yyTok != Tok_RightParen) |
3601 | error(RXERR_LOOKAHEAD); |
3602 | break; |
3603 | #endif |
3604 | #ifndef QT_NO_REGEXP_ESCAPE |
3605 | case Tok_Word: |
3606 | box->catAnchor(a: Anchor_Word); |
3607 | break; |
3608 | case Tok_NonWord: |
3609 | box->catAnchor(a: Anchor_NonWord); |
3610 | break; |
3611 | #endif |
3612 | case Tok_LeftParen: |
3613 | case Tok_MagicLeftParen: |
3614 | yyTok = getToken(); |
3615 | parseExpression(box); |
3616 | if (yyTok != Tok_RightParen) |
3617 | error(RXERR_END); |
3618 | break; |
3619 | case Tok_CharClass: |
3620 | box->set(*yyCharClass); |
3621 | break; |
3622 | case Tok_Quantifier: |
3623 | error(RXERR_REPETITION); |
3624 | break; |
3625 | default: |
3626 | #ifndef QT_NO_REGEXP_BACKREF |
3627 | if ((yyTok & Tok_BackRef) != 0) |
3628 | box->set(yyTok ^ Tok_BackRef); |
3629 | else |
3630 | #endif |
3631 | error(RXERR_DISABLED); |
3632 | } |
3633 | } |
3634 | yyTok = getToken(); |
3635 | } |
3636 | |
3637 | void QRegExpEngine::parseFactor(Box *box) |
3638 | { |
3639 | #ifndef QT_NO_REGEXP_CAPTURE |
3640 | int outerAtom = greedyQuantifiers ? startAtom(officialCapture: false) : -1; |
3641 | int innerAtom = startAtom(officialCapture: yyMayCapture && yyTok == Tok_LeftParen); |
3642 | bool magicLeftParen = (yyTok == Tok_MagicLeftParen); |
3643 | #else |
3644 | const int innerAtom = -1; |
3645 | #endif |
3646 | |
3647 | #ifndef QT_NO_REGEXP_INTERVAL |
3648 | #define YYREDO() \ |
3649 | yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \ |
3650 | *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok |
3651 | |
3652 | const QChar *in = yyIn; |
3653 | int pos0 = yyPos0; |
3654 | int pos = yyPos; |
3655 | int len = yyLen; |
3656 | int ch = yyCh; |
3657 | QRegExpCharClass charClass; |
3658 | if (yyTok == Tok_CharClass) |
3659 | charClass = *yyCharClass; |
3660 | int tok = yyTok; |
3661 | bool mayCapture = yyMayCapture; |
3662 | #endif |
3663 | |
3664 | parseAtom(box); |
3665 | #ifndef QT_NO_REGEXP_CAPTURE |
3666 | finishAtom(atom: innerAtom, needCapture: magicLeftParen); |
3667 | #endif |
3668 | |
3669 | bool hasQuantifier = (yyTok == Tok_Quantifier); |
3670 | if (hasQuantifier) { |
3671 | #ifndef QT_NO_REGEXP_OPTIM |
3672 | trivial = false; |
3673 | #endif |
3674 | if (yyMaxRep == InftyRep) { |
3675 | box->plus(atom: innerAtom); |
3676 | #ifndef QT_NO_REGEXP_INTERVAL |
3677 | } else if (yyMaxRep == 0) { |
3678 | box->clear(); |
3679 | #endif |
3680 | } |
3681 | if (yyMinRep == 0) |
3682 | box->opt(); |
3683 | |
3684 | #ifndef QT_NO_REGEXP_INTERVAL |
3685 | yyMayCapture = false; |
3686 | int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1; |
3687 | int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1); |
3688 | |
3689 | Box rightBox(this); |
3690 | int i; |
3691 | |
3692 | for (i = 0; i < beta; i++) { |
3693 | YYREDO(); |
3694 | Box leftBox(this); |
3695 | parseAtom(box: &leftBox); |
3696 | leftBox.cat(b: rightBox); |
3697 | leftBox.opt(); |
3698 | rightBox = leftBox; |
3699 | } |
3700 | for (i = 0; i < alpha; i++) { |
3701 | YYREDO(); |
3702 | Box leftBox(this); |
3703 | parseAtom(box: &leftBox); |
3704 | leftBox.cat(b: rightBox); |
3705 | rightBox = leftBox; |
3706 | } |
3707 | rightBox.cat(b: *box); |
3708 | *box = rightBox; |
3709 | #endif |
3710 | yyTok = getToken(); |
3711 | #ifndef QT_NO_REGEXP_INTERVAL |
3712 | yyMayCapture = mayCapture; |
3713 | #endif |
3714 | } |
3715 | #undef YYREDO |
3716 | #ifndef QT_NO_REGEXP_CAPTURE |
3717 | if (greedyQuantifiers) |
3718 | finishAtom(atom: outerAtom, needCapture: hasQuantifier); |
3719 | #endif |
3720 | } |
3721 | |
3722 | void QRegExpEngine::parseTerm(Box *box) |
3723 | { |
3724 | #ifndef QT_NO_REGEXP_OPTIM |
3725 | if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) |
3726 | parseFactor(box); |
3727 | #endif |
3728 | while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) { |
3729 | Box rightBox(this); |
3730 | parseFactor(box: &rightBox); |
3731 | box->cat(b: rightBox); |
3732 | } |
3733 | } |
3734 | |
3735 | void QRegExpEngine::parseExpression(Box *box) |
3736 | { |
3737 | parseTerm(box); |
3738 | while (yyTok == Tok_Bar) { |
3739 | #ifndef QT_NO_REGEXP_OPTIM |
3740 | trivial = false; |
3741 | #endif |
3742 | Box rightBox(this); |
3743 | yyTok = getToken(); |
3744 | parseTerm(box: &rightBox); |
3745 | box->orx(b: rightBox); |
3746 | } |
3747 | } |
3748 | |
3749 | /* |
3750 | The struct QRegExpPrivate contains the private data of a regular |
3751 | expression other than the automaton. It makes it possible for many |
3752 | QRegExp objects to use the same QRegExpEngine object with different |
3753 | QRegExpPrivate objects. |
3754 | */ |
3755 | struct QRegExpPrivate |
3756 | { |
3757 | QRegExpEngine *eng; |
3758 | QRegExpEngineKey engineKey; |
3759 | bool minimal; |
3760 | #ifndef QT_NO_REGEXP_CAPTURE |
3761 | QString t; // last string passed to QRegExp::indexIn() or lastIndexIn() |
3762 | QStringList capturedCache; // what QRegExp::capturedTexts() returned last |
3763 | #endif |
3764 | QRegExpMatchState matchState; |
3765 | |
3766 | inline QRegExpPrivate() |
3767 | : eng(nullptr), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { } |
3768 | inline QRegExpPrivate(const QRegExpEngineKey &key) |
3769 | : eng(nullptr), engineKey(key), minimal(false) {} |
3770 | }; |
3771 | |
3772 | #if !defined(QT_NO_REGEXP_OPTIM) |
3773 | struct QRECache |
3774 | { |
3775 | typedef QHash<QRegExpEngineKey, QRegExpEngine *> EngineCache; |
3776 | typedef QCache<QRegExpEngineKey, QRegExpEngine> UnusedEngineCache; |
3777 | EngineCache usedEngines; |
3778 | UnusedEngineCache unusedEngines; |
3779 | }; |
// Global engine cache; its users in this file check engineCache() for
// null before touching it.
Q_GLOBAL_STATIC(QRECache, engineCache)
// Held by derefEngine() and prepareEngine_helper() while they use the cache.
static QBasicMutex engineCacheMutex;
3782 | #endif // QT_NO_REGEXP_OPTIM |
3783 | |
3784 | static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key) |
3785 | { |
3786 | #if !defined(QT_NO_REGEXP_OPTIM) |
3787 | const auto locker = qt_scoped_lock(mutex&: engineCacheMutex); |
3788 | if (!eng->ref.deref()) { |
3789 | if (QRECache *c = engineCache()) { |
3790 | c->unusedEngines.insert(key, object: eng, cost: 4 + key.pattern.size() / 4); |
3791 | c->usedEngines.remove(key); |
3792 | } else { |
3793 | delete eng; |
3794 | } |
3795 | } |
3796 | #else |
3797 | Q_UNUSED(key); |
3798 | if (!eng->ref.deref()) |
3799 | delete eng; |
3800 | #endif |
3801 | } |
3802 | |
3803 | static void prepareEngine_helper(QRegExpPrivate *priv) |
3804 | { |
3805 | Q_ASSERT(!priv->eng); |
3806 | |
3807 | #if !defined(QT_NO_REGEXP_OPTIM) |
3808 | const auto locker = qt_scoped_lock(mutex&: engineCacheMutex); |
3809 | if (QRECache *c = engineCache()) { |
3810 | priv->eng = c->unusedEngines.take(key: priv->engineKey); |
3811 | if (!priv->eng) |
3812 | priv->eng = c->usedEngines.value(key: priv->engineKey); |
3813 | if (!priv->eng) |
3814 | priv->eng = new QRegExpEngine(priv->engineKey); |
3815 | else |
3816 | priv->eng->ref.ref(); |
3817 | |
3818 | c->usedEngines.insert(key: priv->engineKey, value: priv->eng); |
3819 | return; |
3820 | } |
3821 | #endif // QT_NO_REGEXP_OPTIM |
3822 | |
3823 | priv->eng = new QRegExpEngine(priv->engineKey); |
3824 | } |
3825 | |
3826 | inline static void prepareEngine(QRegExpPrivate *priv) |
3827 | { |
3828 | if (priv->eng) |
3829 | return; |
3830 | prepareEngine_helper(priv); |
3831 | priv->matchState.prepareForMatch(eng: priv->eng); |
3832 | } |
3833 | |
3834 | static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str) |
3835 | { |
3836 | prepareEngine(priv); |
3837 | priv->matchState.prepareForMatch(eng: priv->eng); |
3838 | #ifndef QT_NO_REGEXP_CAPTURE |
3839 | priv->t = str; |
3840 | priv->capturedCache.clear(); |
3841 | #else |
3842 | Q_UNUSED(str); |
3843 | #endif |
3844 | } |
3845 | |
3846 | static void invalidateEngine(QRegExpPrivate *priv) |
3847 | { |
3848 | if (priv->eng) { |
3849 | derefEngine(eng: priv->eng, key: priv->engineKey); |
3850 | priv->eng = nullptr; |
3851 | priv->matchState.drain(); |
3852 | } |
3853 | } |
3854 | |
3855 | /*! |
3856 | \enum QRegExp::CaretMode |
3857 | |
3858 | The CaretMode enum defines the different meanings of the caret |
3859 | (\b{^}) in a regular expression. The possible values are: |
3860 | |
3861 | \value CaretAtZero |
3862 | The caret corresponds to index 0 in the searched string. |
3863 | |
3864 | \value CaretAtOffset |
3865 | The caret corresponds to the start offset of the search. |
3866 | |
3867 | \value CaretWontMatch |
3868 | The caret never matches. |
3869 | */ |
3870 | |
3871 | /*! |
3872 | \enum QRegExp::PatternSyntax |
3873 | |
3874 | The syntax used to interpret the meaning of the pattern. |
3875 | |
3876 | \value RegExp A rich Perl-like pattern matching syntax. This is |
3877 | the default. |
3878 | |
3879 | \value RegExp2 Like RegExp, but with \l{greedy quantifiers}. |
3880 | (Introduced in Qt 4.2.) |
3881 | |
3882 | \value Wildcard This provides a simple pattern matching syntax |
3883 | similar to that used by shells (command interpreters) for "file |
3884 | globbing". See \l{QRegExp wildcard matching}. |
3885 | |
3886 | \value WildcardUnix This is similar to Wildcard but with the |
3887 | behavior of a Unix shell. The wildcard characters can be escaped |
3888 | with the character "\\". |
3889 | |
3890 | \value FixedString The pattern is a fixed string. This is |
3891 | equivalent to using the RegExp pattern on a string in |
3892 | which all metacharacters are escaped using escape(). |
3893 | |
3894 | \value W3CXmlSchema11 The pattern is a regular expression as |
3895 | defined by the W3C XML Schema 1.1 specification. |
3896 | |
3897 | \sa setPatternSyntax() |
3898 | */ |
3899 | |
3900 | /*! |
3901 | Constructs an empty regexp. |
3902 | |
3903 | \sa isValid(), errorString() |
3904 | */ |
3905 | QRegExp::QRegExp() |
3906 | { |
3907 | priv = new QRegExpPrivate; |
3908 | prepareEngine(priv); |
3909 | } |
3910 | |
3911 | /*! |
3912 | Constructs a regular expression object for the given \a pattern |
3913 | string. The pattern must be given using wildcard notation if \a |
3914 | syntax is \l Wildcard; the default is \l RegExp. The pattern is |
3915 | case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is |
3916 | greedy (maximal), but can be changed by calling |
3917 | setMinimal(). |
3918 | |
3919 | \sa setPattern(), setCaseSensitivity(), setPatternSyntax() |
3920 | */ |
3921 | QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax) |
3922 | { |
3923 | priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs)); |
3924 | prepareEngine(priv); |
3925 | } |
3926 | |
3927 | /*! |
3928 | Constructs a regular expression as a copy of \a rx. |
3929 | |
3930 | \sa operator=() |
3931 | */ |
3932 | QRegExp::QRegExp(const QRegExp &rx) |
3933 | { |
3934 | priv = new QRegExpPrivate; |
3935 | operator=(rx); |
3936 | } |
3937 | |
3938 | /*! |
3939 | Destroys the regular expression and cleans up its internal data. |
3940 | */ |
3941 | QRegExp::~QRegExp() |
3942 | { |
3943 | invalidateEngine(priv); |
3944 | delete priv; |
3945 | } |
3946 | |
3947 | /*! |
3948 | Copies the regular expression \a rx and returns a reference to the |
3949 | copy. The case sensitivity, wildcard, and minimal matching options |
3950 | are also copied. |
3951 | */ |
3952 | QRegExp &QRegExp::operator=(const QRegExp &rx) |
3953 | { |
3954 | prepareEngine(priv: rx.priv); // to allow sharing |
3955 | QRegExpEngine *otherEng = rx.priv->eng; |
3956 | if (otherEng) |
3957 | otherEng->ref.ref(); |
3958 | invalidateEngine(priv); |
3959 | priv->eng = otherEng; |
3960 | priv->engineKey = rx.priv->engineKey; |
3961 | priv->minimal = rx.priv->minimal; |
3962 | #ifndef QT_NO_REGEXP_CAPTURE |
3963 | priv->t = rx.priv->t; |
3964 | priv->capturedCache = rx.priv->capturedCache; |
3965 | #endif |
3966 | if (priv->eng) |
3967 | priv->matchState.prepareForMatch(eng: priv->eng); |
3968 | priv->matchState.captured = rx.priv->matchState.captured; |
3969 | return *this; |
3970 | } |
3971 | |
3972 | /*! |
3973 | \fn QRegExp &QRegExp::operator=(QRegExp &&other) |
3974 | |
3975 | Move-assigns \a other to this QRegExp instance. |
3976 | |
3977 | \since 5.2 |
3978 | */ |
3979 | |
3980 | /*! |
3981 | \fn void QRegExp::swap(QRegExp &other) |
3982 | \since 4.8 |
3983 | |
3984 | Swaps regular expression \a other with this regular |
3985 | expression. This operation is very fast and never fails. |
3986 | */ |
3987 | |
3988 | /*! |
3989 | Returns \c true if this regular expression is equal to \a rx; |
3990 | otherwise returns \c false. |
3991 | |
3992 | Two QRegExp objects are equal if they have the same pattern |
3993 | strings and the same settings for case sensitivity, wildcard and |
3994 | minimal matching. |
3995 | */ |
3996 | bool QRegExp::operator==(const QRegExp &rx) const |
3997 | { |
3998 | return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal; |
3999 | } |
4000 | |
4001 | /*! |
4002 | \since 5.6 |
4003 | \relates QRegExp |
4004 | |
4005 | Returns the hash value for \a key, using |
4006 | \a seed to seed the calculation. |
4007 | */ |
4008 | size_t qHash(const QRegExp &key, size_t seed) noexcept |
4009 | { |
4010 | QtPrivate::QHashCombine hash; |
4011 | seed = hash(seed, key.priv->engineKey); |
4012 | seed = hash(seed, key.priv->minimal); |
4013 | return seed; |
4014 | } |
4015 | |
4016 | /*! |
4017 | \fn bool QRegExp::operator!=(const QRegExp &rx) const |
4018 | |
4019 | Returns \c true if this regular expression is not equal to \a rx; |
4020 | otherwise returns \c false. |
4021 | |
4022 | \sa operator==() |
4023 | */ |
4024 | |
4025 | /*! |
4026 | Returns \c true if the pattern string is empty; otherwise returns |
4027 | false. |
4028 | |
4029 | If you call exactMatch() with an empty pattern on an empty string |
4030 | it will return true; otherwise it returns \c false since it operates |
4031 | over the whole string. If you call indexIn() with an empty pattern |
4032 | on \e any string it will return the start offset (0 by default) |
4033 | because the empty pattern matches the 'emptiness' at the start of |
4034 | the string. In this case the length of the match returned by |
4035 | matchedLength() will be 0. |
4036 | |
4037 | See QString::isEmpty(). |
4038 | */ |
4039 | |
4040 | bool QRegExp::isEmpty() const |
4041 | { |
4042 | return priv->engineKey.pattern.isEmpty(); |
4043 | } |
4044 | |
4045 | /*! |
4046 | Returns \c true if the regular expression is valid; otherwise returns |
4047 | false. An invalid regular expression never matches. |
4048 | |
4049 | The pattern \b{[a-z} is an example of an invalid pattern, since |
4050 | it lacks a closing square bracket. |
4051 | |
4052 | Note that the validity of a regexp may also depend on the setting |
4053 | of the wildcard flag, for example \b{*.html} is a valid |
4054 | wildcard regexp but an invalid full regexp. |
4055 | |
4056 | \sa errorString() |
4057 | */ |
4058 | bool QRegExp::isValid() const |
4059 | { |
4060 | if (priv->engineKey.pattern.isEmpty()) { |
4061 | return true; |
4062 | } else { |
4063 | prepareEngine(priv); |
4064 | return priv->eng->isValid(); |
4065 | } |
4066 | } |
4067 | |
4068 | /*! |
4069 | Returns the pattern string of the regular expression. The pattern |
4070 | has either regular expression syntax or wildcard syntax, depending |
4071 | on patternSyntax(). |
4072 | |
4073 | \sa patternSyntax(), caseSensitivity() |
4074 | */ |
4075 | QString QRegExp::pattern() const |
4076 | { |
4077 | return priv->engineKey.pattern; |
4078 | } |
4079 | |
4080 | /*! |
4081 | Sets the pattern string to \a pattern. The case sensitivity, |
4082 | wildcard, and minimal matching options are not changed. |
4083 | |
4084 | \sa setPatternSyntax(), setCaseSensitivity() |
4085 | */ |
4086 | void QRegExp::setPattern(const QString &pattern) |
4087 | { |
4088 | if (priv->engineKey.pattern != pattern) { |
4089 | invalidateEngine(priv); |
4090 | priv->engineKey.pattern = pattern; |
4091 | } |
4092 | } |
4093 | |
4094 | /*! |
4095 | Returns Qt::CaseSensitive if the regexp is matched case |
4096 | sensitively; otherwise returns Qt::CaseInsensitive. |
4097 | |
4098 | \sa patternSyntax(), pattern(), isMinimal() |
4099 | */ |
4100 | Qt::CaseSensitivity QRegExp::caseSensitivity() const |
4101 | { |
4102 | return priv->engineKey.cs; |
4103 | } |
4104 | |
4105 | /*! |
4106 | Sets case sensitive matching to \a cs. |
4107 | |
4108 | If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches |
4109 | \c{readme.txt} but not \c{README.TXT}. |
4110 | |
4111 | \sa setPatternSyntax(), setPattern(), setMinimal() |
4112 | */ |
4113 | void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs) |
4114 | { |
4115 | if ((bool)cs != (bool)priv->engineKey.cs) { |
4116 | invalidateEngine(priv); |
4117 | priv->engineKey.cs = cs; |
4118 | } |
4119 | } |
4120 | |
4121 | /*! |
4122 | Returns the syntax used by the regular expression. The default is |
4123 | QRegExp::RegExp. |
4124 | |
4125 | \sa pattern(), caseSensitivity() |
4126 | */ |
4127 | QRegExp::PatternSyntax QRegExp::patternSyntax() const |
4128 | { |
4129 | return priv->engineKey.patternSyntax; |
4130 | } |
4131 | |
4132 | /*! |
4133 | Sets the syntax mode for the regular expression. The default is |
4134 | QRegExp::RegExp. |
4135 | |
4136 | Setting \a syntax to QRegExp::Wildcard enables simple shell-like |
4137 | \l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the |
4138 | string \c{readme.txt} in wildcard mode, but does not match |
4139 | \c{readme}. |
4140 | |
4141 | Setting \a syntax to QRegExp::FixedString means that the pattern |
4142 | is interpreted as a plain string. Special characters (e.g., |
4143 | backslash) don't need to be escaped then. |
4144 | |
4145 | \sa setPattern(), setCaseSensitivity(), escape() |
4146 | */ |
4147 | void QRegExp::setPatternSyntax(PatternSyntax syntax) |
4148 | { |
4149 | if (syntax != priv->engineKey.patternSyntax) { |
4150 | invalidateEngine(priv); |
4151 | priv->engineKey.patternSyntax = syntax; |
4152 | } |
4153 | } |
4154 | |
4155 | /*! |
4156 | Returns \c true if minimal (non-greedy) matching is enabled; |
4157 | otherwise returns \c false. |
4158 | |
4159 | \sa caseSensitivity(), setMinimal() |
4160 | */ |
4161 | bool QRegExp::isMinimal() const |
4162 | { |
4163 | return priv->minimal; |
4164 | } |
4165 | |
4166 | /*! |
4167 | Enables or disables minimal matching. If \a minimal is false, |
4168 | matching is greedy (maximal) which is the default. |
4169 | |
4170 | For example, suppose we have the input string "We must be |
4171 | <b>bold</b>, very <b>bold</b>!" and the pattern |
4172 | \b{<b>.*</b>}. With the default greedy (maximal) matching, |
4173 | the match is "We must be \underline{<b>bold</b>, very |
4174 | <b>bold</b>}!". But with minimal (non-greedy) matching, the |
4175 | first match is: "We must be \underline{<b>bold</b>}, very |
4176 | <b>bold</b>!" and the second match is "We must be <b>bold</b>, |
4177 | very \underline{<b>bold</b>}!". In practice we might use the pattern |
4178 | \b{<b>[^<]*\</b>} instead, although this will still fail for |
4179 | nested tags. |
4180 | |
4181 | \sa setCaseSensitivity() |
4182 | */ |
4183 | void QRegExp::setMinimal(bool minimal) |
4184 | { |
4185 | priv->minimal = minimal; |
4186 | } |
4187 | |
4188 | // ### Qt 5: make non-const |
4189 | /*! |
4190 | Returns \c true if \a str is matched exactly by this regular |
4191 | expression; otherwise returns \c false. You can determine how much of |
4192 | the string was matched by calling matchedLength(). |
4193 | |
4194 | For a given regexp string R, exactMatch("R") is the equivalent of |
4195 | indexIn("^R$") since exactMatch() effectively encloses the regexp |
4196 | in the start of string and end of string anchors, except that it |
4197 | sets matchedLength() differently. |
4198 | |
4199 | For example, if the regular expression is \b{blue}, then |
4200 | exactMatch() returns \c true only for input \c blue. For inputs \c |
4201 | bluebell, \c blutak and \c lightblue, exactMatch() returns \c false |
4202 | and matchedLength() will return 4, 3 and 0 respectively. |
4203 | |
4204 | Although const, this function sets matchedLength(), |
4205 | capturedTexts(), and pos(). |
4206 | |
4207 | \sa indexIn(), lastIndexIn() |
4208 | */ |
4209 | bool QRegExp::exactMatch(const QString &str) const |
4210 | { |
4211 | prepareEngineForMatch(priv, str); |
4212 | priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: 0, minimal0: priv->minimal, oneTest: true, caretIndex: 0); |
4213 | if (priv->matchState.captured[1] == str.size()) { |
4214 | return true; |
4215 | } else { |
4216 | priv->matchState.captured[0] = 0; |
4217 | priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen; |
4218 | return false; |
4219 | } |
4220 | } |
4221 | |
4222 | /*! |
    Returns the regexp as a QVariant.
4224 | */ |
4225 | QRegExp::operator QVariant() const |
4226 | { |
4227 | QT_WARNING_PUSH QT_WARNING_DISABLE_DEPRECATED |
4228 | QVariant v; |
4229 | v.setValue(*this); |
4230 | return v; |
4231 | QT_WARNING_POP |
4232 | } |
4233 | |
4234 | // ### Qt 5: make non-const |
4235 | /*! |
4236 | Attempts to find a match in \a str from position \a offset (0 by |
4237 | default). If \a offset is -1, the search starts at the last |
4238 | character; if -2, at the next to last character; etc. |
4239 | |
4240 | Returns the position of the first match, or -1 if there was no |
4241 | match. |
4242 | |
4243 | The \a caretMode parameter can be used to instruct whether \b{^} |
4244 | should match at index 0 or at \a offset. |
4245 | |
4246 | You might prefer to use QString::indexOf(), QString::contains(), |
4247 | or even QStringList::filter(). To replace matches use |
4248 | QString::replace(). |
4249 | |
4250 | Example: |
4251 | \snippet code/src_corelib_text_qregexp.cpp 13 |
4252 | |
4253 | Although const, this function sets matchedLength(), |
4254 | capturedTexts() and pos(). |
4255 | |
    If the QRegExp is a wildcard expression (see setPatternSyntax())
    and you want to test a string against the whole wildcard expression,
    use exactMatch() instead of this function.
4259 | |
4260 | \sa lastIndexIn(), exactMatch() |
4261 | */ |
4262 | |
4263 | int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const |
4264 | { |
4265 | prepareEngineForMatch(priv, str); |
4266 | if (offset < 0) |
4267 | offset += str.size(); |
4268 | priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: offset, |
4269 | minimal0: priv->minimal, oneTest: false, caretIndex: caretIndex(offset, caretMode)); |
4270 | return priv->matchState.captured[0]; |
4271 | } |
4272 | |
4273 | // ### Qt 5: make non-const |
4274 | /*! |
4275 | Attempts to find a match backwards in \a str from position \a |
4276 | offset. If \a offset is -1 (the default), the search starts at the |
4277 | last character; if -2, at the next to last character; etc. |
4278 | |
4279 | Returns the position of the first match, or -1 if there was no |
4280 | match. |
4281 | |
4282 | The \a caretMode parameter can be used to instruct whether \b{^} |
4283 | should match at index 0 or at \a offset. |
4284 | |
4285 | Although const, this function sets matchedLength(), |
4286 | capturedTexts() and pos(). |
4287 | |
4288 | \warning Searching backwards is much slower than searching |
4289 | forwards. |
4290 | |
4291 | \sa indexIn(), exactMatch() |
4292 | */ |
4293 | |
4294 | int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const |
4295 | { |
4296 | prepareEngineForMatch(priv, str); |
4297 | if (offset < 0) |
4298 | offset += str.size(); |
4299 | if (offset < 0 || offset > str.size()) { |
4300 | memset(s: priv->matchState.captured, c: -1, n: priv->matchState.capturedSize*sizeof(int)); |
4301 | return -1; |
4302 | } |
4303 | |
4304 | while (offset >= 0) { |
4305 | priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: offset, |
4306 | minimal0: priv->minimal, oneTest: true, caretIndex: caretIndex(offset, caretMode)); |
4307 | if (priv->matchState.captured[0] == offset) |
4308 | return offset; |
4309 | --offset; |
4310 | } |
4311 | return -1; |
4312 | } |
4313 | |
4314 | /*! |
4315 | Returns the length of the last matched string, or -1 if there was |
4316 | no match. |
4317 | |
4318 | \sa exactMatch(), indexIn(), lastIndexIn() |
4319 | */ |
4320 | int QRegExp::matchedLength() const |
4321 | { |
4322 | return priv->matchState.captured[1]; |
4323 | } |
4324 | |
4325 | |
4326 | /*! |
4327 | Replaces every occurrence of this regular expression in |
4328 | \a str with \a after and returns the result. |
4329 | |
4330 | For regular expressions containing \l{capturing parentheses}, |
4331 | occurrences of \b{\\1}, \b{\\2}, ..., in \a after are replaced |
4332 | with \c {rx}.cap(1), cap(2), ... |
4333 | |
4334 | \sa indexIn(), lastIndexIn(), QRegExp::cap() |
4335 | */ |
4336 | QString QRegExp::replaceIn(const QString &str, const QString &after) const |
4337 | { |
4338 | struct QStringCapture |
4339 | { |
4340 | int pos; |
4341 | int len; |
4342 | int no; |
4343 | }; |
4344 | |
4345 | QRegExp rx2(*this); |
4346 | |
4347 | if (str.isEmpty() && rx2.indexIn(str) == -1) |
4348 | return str; |
4349 | |
4350 | QString s(str); |
4351 | |
4352 | int index = 0; |
4353 | int numCaptures = rx2.captureCount(); |
4354 | int al = after.size(); |
4355 | QRegExp::CaretMode caretMode = QRegExp::CaretAtZero; |
4356 | |
4357 | if (numCaptures > 0) { |
4358 | const QChar *uc = after.unicode(); |
4359 | int numBackRefs = 0; |
4360 | |
4361 | for (int i = 0; i < al - 1; i++) { |
4362 | if (uc[i] == QLatin1Char('\\')) { |
4363 | int no = uc[i + 1].digitValue(); |
4364 | if (no > 0 && no <= numCaptures) |
4365 | numBackRefs++; |
4366 | } |
4367 | } |
4368 | |
4369 | /* |
4370 | This is the harder case where we have back-references. |
4371 | */ |
4372 | if (numBackRefs > 0) { |
4373 | QVarLengthArray<QStringCapture, 16> captures(numBackRefs); |
4374 | int j = 0; |
4375 | |
4376 | for (int i = 0; i < al - 1; i++) { |
4377 | if (uc[i] == QLatin1Char('\\')) { |
4378 | int no = uc[i + 1].digitValue(); |
4379 | if (no > 0 && no <= numCaptures) { |
4380 | QStringCapture capture; |
4381 | capture.pos = i; |
4382 | capture.len = 2; |
4383 | |
4384 | if (i < al - 2) { |
4385 | int secondDigit = uc[i + 2].digitValue(); |
4386 | if (secondDigit != -1 && ((no * 10) + secondDigit) <= numCaptures) { |
4387 | no = (no * 10) + secondDigit; |
4388 | ++capture.len; |
4389 | } |
4390 | } |
4391 | |
4392 | capture.no = no; |
4393 | captures[j++] = capture; |
4394 | } |
4395 | } |
4396 | } |
4397 | |
4398 | while (index <= s.size()) { |
4399 | index = rx2.indexIn(str: s, offset: index, caretMode); |
4400 | if (index == -1) |
4401 | break; |
4402 | |
4403 | QString after2(after); |
4404 | for (j = numBackRefs - 1; j >= 0; j--) { |
4405 | const QStringCapture &capture = captures[j]; |
4406 | after2.replace(i: capture.pos, len: capture.len, after: rx2.cap(nth: capture.no)); |
4407 | } |
4408 | |
4409 | s.replace(i: index, len: rx2.matchedLength(), after: after2); |
4410 | index += after2.size(); |
4411 | |
4412 | // avoid infinite loop on 0-length matches (e.g., QRegExp("[a-z]*")) |
4413 | if (rx2.matchedLength() == 0) |
4414 | ++index; |
4415 | |
4416 | caretMode = QRegExp::CaretWontMatch; |
4417 | } |
4418 | return s; |
4419 | } |
4420 | } |
4421 | |
4422 | /* |
4423 | This is the simple and optimized case where we don't have |
4424 | back-references. |
4425 | */ |
4426 | while (index != -1) { |
4427 | struct { |
4428 | int pos; |
4429 | int length; |
4430 | } replacements[2048]; |
4431 | |
4432 | int pos = 0; |
4433 | int adjust = 0; |
4434 | while (pos < 2047) { |
4435 | index = rx2.indexIn(str: s, offset: index, caretMode); |
4436 | if (index == -1) |
4437 | break; |
4438 | int ml = rx2.matchedLength(); |
4439 | replacements[pos].pos = index; |
4440 | replacements[pos++].length = ml; |
4441 | index += ml; |
4442 | adjust += al - ml; |
4443 | // avoid infinite loop |
4444 | if (!ml) |
4445 | index++; |
4446 | } |
4447 | if (!pos) |
4448 | break; |
4449 | replacements[pos].pos = s.size(); |
4450 | int newlen = s.size() + adjust; |
4451 | |
4452 | // to continue searching at the right position after we did |
4453 | // the first round of replacements |
4454 | if (index != -1) |
4455 | index += adjust; |
4456 | QString newstring; |
4457 | newstring.reserve(asize: newlen + 1); |
4458 | QChar *newuc = newstring.data(); |
4459 | QChar *uc = newuc; |
4460 | int copystart = 0; |
4461 | int i = 0; |
4462 | while (i < pos) { |
4463 | int copyend = replacements[i].pos; |
4464 | int size = copyend - copystart; |
4465 | memcpy(dest: static_cast<void*>(uc), src: static_cast<const void *>(s.constData() + copystart), n: size * sizeof(QChar)); |
4466 | uc += size; |
4467 | memcpy(dest: static_cast<void *>(uc), src: static_cast<const void *>(after.constData()), n: al * sizeof(QChar)); |
4468 | uc += al; |
4469 | copystart = copyend + replacements[i].length; |
4470 | i++; |
4471 | } |
4472 | memcpy(dest: static_cast<void *>(uc), src: static_cast<const void *>(s.constData() + copystart), n: (s.size() - copystart) * sizeof(QChar)); |
4473 | newstring.resize(size: newlen); |
4474 | s = newstring; |
4475 | caretMode = QRegExp::CaretWontMatch; |
4476 | } |
4477 | return s; |
4478 | |
4479 | } |
4480 | |
4481 | |
4482 | /*! |
4483 | \fn QString QRegExp::removeIn(const QString &str) const |
4484 | |
    Removes every occurrence of this regular expression in \a str, and
    returns the result.
4487 | |
4488 | Does the same as replaceIn(str, QString()). |
4489 | |
4490 | \sa indexIn(), lastIndexIn(), replaceIn() |
4491 | */ |
4492 | |
4493 | |
4494 | /*! |
    \fn int QRegExp::countIn(const QString &str) const
4496 | |
4497 | Returns the number of times this regular expression matches |
4498 | in \a str. |
4499 | |
4500 | \sa indexIn(), lastIndexIn(), replaceIn() |
4501 | */ |
4502 | |
4503 | int QRegExp::countIn(const QString &str) const |
4504 | { |
4505 | QRegExp rx2(*this); |
4506 | int count = 0; |
4507 | int index = -1; |
4508 | int len = str.size(); |
4509 | while (index < len - 1) { // count overlapping matches |
4510 | index = rx2.indexIn(str, offset: index + 1); |
4511 | if (index == -1) |
4512 | break; |
4513 | count++; |
4514 | } |
4515 | return count; |
4516 | } |
4517 | |
4518 | /*! |
4519 | Splits \a str into substrings wherever this regular expression |
4520 | matches, and returns the list of those strings. If this regular |
4521 | expression does not match anywhere in the string, split() returns a |
4522 | single-element list containing \a str. |
4523 | |
4524 | If \a behavior is set to Qt::KeepEmptyParts, empty fields are |
4525 | included in the resulting list. |
4526 | |
4527 | \sa QStringList::join(), QString::split() |
4528 | */ |
4529 | QStringList QRegExp::splitString(const QString &str, Qt::SplitBehavior behavior) const |
4530 | { |
4531 | QRegExp rx2(*this); |
4532 | QStringList list; |
4533 | int start = 0; |
4534 | int = 0; |
4535 | int end; |
4536 | while ((end = rx2.indexIn(str, offset: start + extra)) != -1) { |
4537 | int matchedLen = rx2.matchedLength(); |
4538 | if (start != end || behavior == Qt::KeepEmptyParts) |
4539 | list.append(t: str.mid(position: start, n: end - start)); |
4540 | start = end + matchedLen; |
4541 | extra = (matchedLen == 0) ? 1 : 0; |
4542 | } |
4543 | if (start != str.size() || behavior == Qt::KeepEmptyParts) |
4544 | list.append(t: str.mid(position: start, n: -1)); |
4545 | return list; |
4546 | } |
4547 | |
4548 | /*! |
4549 | Returns a list of all the strings that match this regular |
4550 | expression in \a stringList. |
4551 | */ |
4552 | QStringList QRegExp::filterList(const QStringList &stringList) const |
4553 | { |
4554 | QStringList res; |
4555 | for (const QString &s : stringList) { |
4556 | if (containedIn(str: s)) |
4557 | res << s; |
4558 | } |
4559 | return res; |
4560 | } |
4561 | |
4562 | /*! |
    Replaces every occurrence of this regexp in each of the strings in
    \a stringList with \a after. Returns the resulting list; \a stringList
    itself is not modified.
4565 | */ |
4566 | QStringList QRegExp::replaceIn(const QStringList &stringList, const QString &after) const |
4567 | { |
4568 | QStringList list; |
4569 | for (const QString &s : stringList) |
4570 | list << replaceIn(str: s, after); |
4571 | return list; |
4572 | } |
4573 | |
4574 | /*! |
4575 | Returns the index position of the first exact match of this regexp in |
4576 | \a list, searching forward from index position \a from. Returns |
4577 | -1 if no item matched. |
4578 | |
4579 | \sa lastIndexIn(), exactMatch() |
4580 | */ |
4581 | int QRegExp::indexIn(const QStringList &list, int from) const |
4582 | { |
4583 | QRegExp rx2(*this); |
4584 | if (from < 0) |
4585 | from = qMax(a: from + list.size(), b: 0); |
4586 | for (int i = from; i < list.size(); ++i) { |
4587 | if (rx2.exactMatch(str: list.at(i))) |
4588 | return i; |
4589 | } |
4590 | return -1; |
4591 | } |
4592 | |
4593 | /*! |
4594 | Returns the index position of the last exact match of this regexp in |
4595 | \a list, searching backward from index position \a from. If \a |
4596 | from is -1 (the default), the search starts at the last item. |
4597 | Returns -1 if no item matched. |
4598 | |
4599 | \sa QRegExp::exactMatch() |
4600 | */ |
4601 | int QRegExp::lastIndexIn(const QStringList &list, int from) const |
4602 | { |
4603 | QRegExp rx2(*this); |
4604 | if (from < 0) |
4605 | from += list.size(); |
4606 | else if (from >= list.size()) |
4607 | from = list.size() - 1; |
4608 | for (int i = from; i >= 0; --i) { |
4609 | if (rx2.exactMatch(str: list.at(i))) |
4610 | return i; |
4611 | } |
4612 | return -1; |
4613 | } |
4614 | |
4615 | #ifndef QT_NO_REGEXP_CAPTURE |
4616 | |
4617 | /*! |
4618 | \since 4.6 |
4619 | Returns the number of captures contained in the regular expression. |
4620 | */ |
4621 | int QRegExp::captureCount() const |
4622 | { |
4623 | prepareEngine(priv); |
4624 | return priv->eng->captureCount(); |
4625 | } |
4626 | |
4627 | /*! |
4628 | Returns a list of the captured text strings. |
4629 | |
4630 | The first string in the list is the entire matched string. Each |
4631 | subsequent list element contains a string that matched a |
4632 | (capturing) subexpression of the regexp. |
4633 | |
4634 | For example: |
4635 | \snippet code/src_corelib_text_qregexp.cpp 14 |
4636 | |
4637 | The above example also captures elements that may be present but |
4638 | which we have no interest in. This problem can be solved by using |
4639 | non-capturing parentheses: |
4640 | |
4641 | \snippet code/src_corelib_text_qregexp.cpp 15 |
4642 | |
4643 | Note that if you want to iterate over the list, you should iterate |
4644 | over a copy, e.g. |
4645 | \snippet code/src_corelib_text_qregexp.cpp 16 |
4646 | |
4647 | Some regexps can match an indeterminate number of times. For |
4648 | example if the input string is "Offsets: 12 14 99 231 7" and the |
4649 | regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of |
4650 | all the numbers matched. However, after calling |
4651 | \c{rx.indexIn(str)}, capturedTexts() will return the list ("12", |
4652 | "12"), i.e. the entire match was "12" and the first subexpression |
4653 | matched was "12". The correct approach is to use cap() in a |
4654 | \l{QRegExp#cap_in_a_loop}{loop}. |
4655 | |
4656 | The order of elements in the string list is as follows. The first |
4657 | element is the entire matching string. Each subsequent element |
4658 | corresponds to the next capturing open left parentheses. Thus |
4659 | capturedTexts()[1] is the text of the first capturing parentheses, |
4660 | capturedTexts()[2] is the text of the second and so on |
4661 | (corresponding to $1, $2, etc., in some other regexp languages). |
4662 | |
4663 | \sa cap(), pos() |
4664 | */ |
4665 | QStringList QRegExp::capturedTexts() const |
4666 | { |
4667 | if (priv->capturedCache.isEmpty()) { |
4668 | prepareEngine(priv); |
4669 | const int *captured = priv->matchState.captured; |
4670 | int n = priv->matchState.capturedSize; |
4671 | |
4672 | for (int i = 0; i < n; i += 2) { |
4673 | QString m; |
4674 | if (captured[i + 1] == 0) |
4675 | m = QLatin1String("" ); // ### Qt 5: don't distinguish between null and empty |
4676 | else if (captured[i] >= 0) |
4677 | m = priv->t.mid(position: captured[i], n: captured[i + 1]); |
4678 | priv->capturedCache.append(t: m); |
4679 | } |
4680 | priv->t.clear(); |
4681 | } |
4682 | return priv->capturedCache; |
4683 | } |
4684 | |
4685 | /*! |
4686 | \internal |
4687 | */ |
4688 | QStringList QRegExp::capturedTexts() |
4689 | { |
4690 | return const_cast<const QRegExp *>(this)->capturedTexts(); |
4691 | } |
4692 | |
4693 | /*! |
4694 | Returns the text captured by the \a nth subexpression. The entire |
4695 | match has index 0 and the parenthesized subexpressions have |
4696 | indexes starting from 1 (excluding non-capturing parentheses). |
4697 | |
4698 | \snippet code/src_corelib_text_qregexp.cpp 17 |
4699 | |
4700 | The order of elements matched by cap() is as follows. The first |
4701 | element, cap(0), is the entire matching string. Each subsequent |
4702 | element corresponds to the next capturing open left parentheses. |
4703 | Thus cap(1) is the text of the first capturing parentheses, cap(2) |
4704 | is the text of the second, and so on. |
4705 | |
4706 | \sa capturedTexts(), pos() |
4707 | */ |
4708 | QString QRegExp::cap(int nth) const |
4709 | { |
4710 | return capturedTexts().value(i: nth); |
4711 | } |
4712 | |
4713 | /*! |
4714 | \internal |
4715 | */ |
4716 | QString QRegExp::cap(int nth) |
4717 | { |
4718 | return const_cast<const QRegExp *>(this)->cap(nth); |
4719 | } |
4720 | |
4721 | /*! |
4722 | Returns the position of the \a nth captured text in the searched |
4723 | string. If \a nth is 0 (the default), pos() returns the position |
4724 | of the whole match. |
4725 | |
4726 | Example: |
4727 | \snippet code/src_corelib_text_qregexp.cpp 18 |
4728 | |
4729 | For zero-length matches, pos() always returns -1. (For example, if |
4730 | cap(4) would return an empty string, pos(4) returns -1.) This is |
4731 | a feature of the implementation. |
4732 | |
4733 | \sa cap(), capturedTexts() |
4734 | */ |
4735 | int QRegExp::pos(int nth) const |
4736 | { |
4737 | if (nth < 0 || nth >= priv->matchState.capturedSize / 2) |
4738 | return -1; |
4739 | else |
4740 | return priv->matchState.captured[2 * nth]; |
4741 | } |
4742 | |
4743 | /*! |
4744 | \internal |
4745 | */ |
4746 | int QRegExp::pos(int nth) |
4747 | { |
4748 | return const_cast<const QRegExp *>(this)->pos(nth); |
4749 | } |
4750 | |
4751 | /*! |
    Returns a text string that explains why a regexp pattern is
    invalid, if that is the case; otherwise returns "no error occurred".
4754 | |
4755 | \sa isValid() |
4756 | */ |
4757 | QString QRegExp::errorString() const |
4758 | { |
4759 | if (isValid()) { |
4760 | return QString::fromLatin1(RXERR_OK); |
4761 | } else { |
4762 | return priv->eng->errorString(); |
4763 | } |
4764 | } |
4765 | |
4766 | /*! |
4767 | \internal |
4768 | */ |
4769 | QString QRegExp::errorString() |
4770 | { |
4771 | return const_cast<const QRegExp *>(this)->errorString(); |
4772 | } |
4773 | |
4774 | #endif |
4775 | |
4776 | /*! |
4777 | Returns the string \a str with every regexp special character |
4778 | escaped with a backslash. The special characters are $, (,), *, +, |
4779 | ., ?, [, \,], ^, {, | and }. |
4780 | |
4781 | Example: |
4782 | |
4783 | \snippet code/src_corelib_text_qregexp.cpp 19 |
4784 | |
4785 | This function is useful to construct regexp patterns dynamically: |
4786 | |
4787 | \snippet code/src_corelib_text_qregexp.cpp 20 |
4788 | |
4789 | \sa setPatternSyntax() |
4790 | */ |
4791 | QString QRegExp::escape(const QString &str) |
4792 | { |
4793 | QString quoted; |
4794 | const int count = str.size(); |
4795 | quoted.reserve(asize: count * 2); |
4796 | const QLatin1Char backslash('\\'); |
4797 | for (int i = 0; i < count; i++) { |
4798 | switch (str.at(i).toLatin1()) { |
4799 | case '$': |
4800 | case '(': |
4801 | case ')': |
4802 | case '*': |
4803 | case '+': |
4804 | case '.': |
4805 | case '?': |
4806 | case '[': |
4807 | case '\\': |
4808 | case ']': |
4809 | case '^': |
4810 | case '{': |
4811 | case '|': |
4812 | case '}': |
4813 | quoted.append(c: backslash); |
4814 | } |
4815 | quoted.append(c: str.at(i)); |
4816 | } |
4817 | return quoted; |
4818 | } |
4819 | |
4820 | |
4821 | #ifndef QT_NO_DATASTREAM |
4822 | /*! |
4823 | \relates QRegExp |
4824 | |
4825 | Writes the regular expression \a regExp to stream \a out. |
4826 | |
4827 | \sa {Serializing Qt Data Types} |
4828 | */ |
4829 | QDataStream &operator<<(QDataStream &out, const QRegExp ®Exp) |
4830 | { |
4831 | return out << regExp.pattern() << (quint8)regExp.caseSensitivity() |
4832 | << (quint8)regExp.patternSyntax() |
4833 | << (quint8)!!regExp.isMinimal(); |
4834 | } |
4835 | |
4836 | /*! |
4837 | \relates QRegExp |
4838 | |
4839 | Reads a regular expression from stream \a in into \a regExp. |
4840 | |
4841 | \sa {Serializing Qt Data Types} |
4842 | */ |
4843 | QDataStream &operator>>(QDataStream &in, QRegExp ®Exp) |
4844 | { |
4845 | QString pattern; |
4846 | quint8 cs; |
4847 | quint8 patternSyntax; |
4848 | quint8 isMinimal; |
4849 | |
4850 | in >> pattern >> cs >> patternSyntax >> isMinimal; |
4851 | |
4852 | QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs), |
4853 | QRegExp::PatternSyntax(patternSyntax)); |
4854 | |
4855 | newRegExp.setMinimal(isMinimal); |
4856 | regExp = newRegExp; |
4857 | return in; |
4858 | } |
4859 | #endif // QT_NO_DATASTREAM |
4860 | |
4861 | #ifndef QT_NO_DEBUG_STREAM |
4862 | QDebug operator<<(QDebug dbg, const QRegExp &r) |
4863 | { |
4864 | QDebugStateSaver saver(dbg); |
4865 | dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax() |
4866 | << ", pattern='" << r.pattern() << "')" ; |
4867 | return dbg; |
4868 | } |
4869 | #endif |
4870 | |
4871 | QT_END_NAMESPACE |
4872 | |