1 | // Copyright (C) 2016 The Qt Company Ltd. |
2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only |
3 | |
4 | #include "qregexp.h" |
5 | |
6 | #include "qalgorithms.h" |
7 | #include "qbitarray.h" |
8 | #include "qcache.h" |
9 | #include "qdatastream.h" |
10 | #include "qdebug.h" |
11 | #include "qhashfunctions.h" |
12 | #include "qlist.h" |
13 | #include "qmap.h" |
14 | #include "qmutex.h" |
15 | #include "qstring.h" |
16 | #include "qstringlist.h" |
17 | #include "qstringmatcher.h" |
18 | #include "private/qlocking_p.h" |
19 | #include "qvarlengtharray.h" |
20 | |
21 | #include <limits.h> |
22 | #include <algorithm> |
23 | |
24 | QT_BEGIN_NAMESPACE |
25 | |
// Error strings for the regexp parser.
//
// Each RXERR_* macro wraps its message in QT_TRANSLATE_NOOP using the
// "QRegExp" translation context, so every macro expands to that
// QT_TRANSLATE_NOOP expression rather than to an already-translated string.
// NOTE(review): presumably the actual translation happens where the error
// is reported to the caller -- confirm at the use sites.
#define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
#define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
#define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
#define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
#define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
#define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
#define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
#define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
#define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")
39 | |
40 | /*! |
41 | \class QRegExp |
42 | \inmodule QtCore5Compat |
43 | \reentrant |
44 | \brief The QRegExp class provides pattern matching using regular expressions. |
45 | |
46 | \ingroup tools |
47 | \ingroup shared |
48 | |
49 | \keyword regular expression |
50 | |
This class is deprecated in Qt 6. Please use QRegularExpression instead
for all new code. For guidelines on porting old code from QRegExp to
QRegularExpression, see \l{Porting to QRegularExpression}.
54 | |
55 | A regular expression, or "regexp", is a pattern for matching |
56 | substrings in a text. This is useful in many contexts, e.g., |
57 | |
58 | \table |
59 | \row \li Validation |
60 | \li A regexp can test whether a substring meets some criteria, |
61 | e.g. is an integer or contains no whitespace. |
62 | \row \li Searching |
63 | \li A regexp provides more powerful pattern matching than |
64 | simple substring matching, e.g., match one of the words |
65 | \e{mail}, \e{letter} or \e{correspondence}, but none of the |
66 | words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc. |
67 | \row \li Search and Replace |
68 | \li A regexp can replace all occurrences of a substring with a |
different substring, e.g., replace all occurrences of \e{&}
with \e{\&amp;} except where the \e{&} is already followed by
an \e{amp;}.
72 | \row \li String Splitting |
73 | \li A regexp can be used to identify where a string should be |
74 | split apart, e.g. splitting tab-delimited strings. |
75 | \endtable |
76 | |
This documentation presents a brief introduction to regexps, a
description of Qt's regexp language, some examples, and the function
documentation itself. QRegExp is modeled on Perl's regexp
80 | language. It fully supports Unicode. QRegExp can also be used in a |
81 | simpler, \e{wildcard mode} that is similar to the functionality |
82 | found in command shells. The syntax rules used by QRegExp can be |
83 | changed with setPatternSyntax(). In particular, the pattern syntax |
84 | can be set to QRegExp::FixedString, which means the pattern to be |
85 | matched is interpreted as a plain string, i.e., special characters |
86 | (e.g., backslash) are not escaped. |
87 | |
88 | A good text on regexps is \e {Mastering Regular Expressions} |
89 | (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4. |
90 | |
91 | \note In Qt 5, the new QRegularExpression class provides a Perl |
92 | compatible implementation of regular expressions and is recommended |
93 | in place of QRegExp. |
94 | |
95 | \tableofcontents |
96 | |
97 | \section1 Introduction |
98 | |
99 | Regexps are built up from expressions, quantifiers, and |
100 | assertions. The simplest expression is a character, e.g. \b{x} |
101 | or \b{5}. An expression can also be a set of characters |
102 | enclosed in square brackets. \b{[ABCD]} will match an \b{A} |
103 | or a \b{B} or a \b{C} or a \b{D}. We can write this same |
104 | expression as \b{[A-D]}, and an expression to match any |
105 | capital letter in the English alphabet is written as |
106 | \b{[A-Z]}. |
107 | |
108 | A quantifier specifies the number of occurrences of an expression |
109 | that must be matched. \b{x{1,1}} means match one and only one |
110 | \b{x}. \b{x{1,5}} means match a sequence of \b{x} |
111 | characters that contains at least one \b{x} but no more than |
112 | five. |
113 | |
114 | Note that in general regexps cannot be used to check for balanced |
115 | brackets or tags. For example, a regexp can be written to match an |
116 | opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags |
117 | are not nested, but if the \c{<b>} tags are nested, that same |
118 | regexp will match an opening \c{<b>} tag with the wrong closing |
119 | \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the |
120 | first \c{<b>} would be matched with the first \c{</b>}, which is |
121 | not correct. However, it is possible to write a regexp that will |
122 | match nested brackets or tags correctly, but only if the number of |
123 | nesting levels is fixed and known. If the number of nesting levels |
124 | is not fixed and known, it is impossible to write a regexp that |
125 | will not fail. |
126 | |
127 | Suppose we want a regexp to match integers in the range 0 to 99. |
128 | At least one digit is required, so we start with the expression |
129 | \b{[0-9]{1,1}}, which matches a single digit exactly once. This |
130 | regexp matches integers in the range 0 to 9. To match integers up |
131 | to 99, increase the maximum number of occurrences to 2, so the |
132 | regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the |
133 | original requirement to match integers from 0 to 99, but it will |
134 | also match integers that occur in the middle of strings. If we |
135 | want the matched integer to be the whole string, we must use the |
136 | anchor assertions, \b{^} (caret) and \b{$} (dollar). When |
137 | \b{^} is the first character in a regexp, it means the regexp |
138 | must match from the beginning of the string. When \b{$} is the |
139 | last character of the regexp, it means the regexp must match to |
140 | the end of the string. The regexp becomes \b{^[0-9]{1,2}$}. |
141 | Note that assertions, e.g. \b{^} and \b{$}, do not match |
142 | characters but locations in the string. |
143 | |
144 | If you have seen regexps described elsewhere, they may have looked |
145 | different from the ones shown here. This is because some sets of |
146 | characters and some quantifiers are so common that they have been |
147 | given special symbols to represent them. \b{[0-9]} can be |
148 | replaced with the symbol \b{\\d}. The quantifier to match |
149 | exactly one occurrence, \b{{1,1}}, can be replaced with the |
150 | expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So |
151 | our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can |
152 | also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of |
153 | the string, match a digit, followed immediately by 0 or 1 digits}. |
154 | In practice, it would be written as \b{^\\d\\d?$}. The \b{?} |
155 | is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1 |
156 | occurrences. \b{?} makes an expression optional. The regexp |
157 | \b{^\\d\\d?$} means \e{From the beginning of the string, match |
158 | one digit, followed immediately by 0 or 1 more digit, followed |
159 | immediately by end of string}. |
160 | |
161 | To write a regexp that matches one of the words 'mail' \e or |
162 | 'letter' \e or 'correspondence' but does not match words that |
163 | contain these words, e.g., 'email', 'mailman', 'mailer', and |
164 | 'letterbox', start with a regexp that matches 'mail'. Expressed |
165 | fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because |
166 | a character expression is automatically quantified by |
167 | \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an |
168 | 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now |
169 | we can use the vertical bar \b{|}, which means \b{or}, to |
170 | include the other two words, so our regexp for matching any of the |
171 | three words becomes \b{mail|letter|correspondence}. Match |
172 | 'mail' \b{or} 'letter' \b{or} 'correspondence'. While this |
173 | regexp will match one of the three words we want to match, it will |
174 | also match words we don't want to match, e.g., 'email'. To |
175 | prevent the regexp from matching unwanted words, we must tell it |
176 | to begin and end the match at word boundaries. First we enclose |
177 | our regexp in parentheses, \b{(mail|letter|correspondence)}. |
178 | Parentheses group expressions together, and they identify a part |
179 | of the regexp that we wish to \l{capturing text}{capture}. |
180 | Enclosing the expression in parentheses allows us to use it as a |
181 | component in more complex regexps. It also allows us to examine |
182 | which of the three words was actually matched. To force the match |
183 | to begin and end on word boundaries, we enclose the regexp in |
184 | \b{\\b} \e{word boundary} assertions: |
185 | \b{\\b(mail|letter|correspondence)\\b}. Now the regexp means: |
186 | \e{Match a word boundary, followed by the regexp in parentheses, |
187 | followed by a word boundary}. The \b{\\b} assertion matches a |
\e position in the string, not a \e character. A word boundary is
the position between a word character and a non-word character, e.g.,
a space or newline, or the beginning or end of the string.
191 | |
If we want to replace ampersand characters with the HTML entity
\b{\&amp;}, the regexp to match is simply \b{\&}. But this
194 | regexp will also match ampersands that have already been converted |
195 | to HTML entities. We want to replace only ampersands that are not |
196 | already followed by \b{amp;}. For this, we need the negative |
197 | lookahead assertion, \b{(?!}__\b{)}. The regexp can then be |
198 | written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is} |
199 | \b{not} \e{followed by} \b{amp;}. |
200 | |
201 | If we want to count all the occurrences of 'Eric' and 'Eirik' in a |
202 | string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and |
203 | \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is |
204 | required to avoid matching words that contain either name, |
205 | e.g. 'Ericsson'. Note that the second regexp matches more |
206 | spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'. |
207 | |
208 | Some of the examples discussed above are implemented in the |
209 | \l{#code-examples}{code examples} section. |
210 | |
211 | \target characters-and-abbreviations-for-sets-of-characters |
212 | \section1 Characters and Abbreviations for Sets of Characters |
213 | |
214 | \table |
215 | \header \li Element \li Meaning |
216 | \row \li \b{c} |
217 | \li A character represents itself unless it has a special |
218 | regexp meaning. e.g. \b{c} matches the character \e c. |
219 | \row \li \b{\\c} |
220 | \li A character that follows a backslash matches the character |
221 | itself, except as specified below. e.g., To match a literal |
222 | caret at the beginning of a string, write \b{\\^}. |
223 | \row \li \b{\\a} |
224 | \li Matches the ASCII bell (BEL, 0x07). |
225 | \row \li \b{\\f} |
226 | \li Matches the ASCII form feed (FF, 0x0C). |
227 | \row \li \b{\\n} |
228 | \li Matches the ASCII line feed (LF, 0x0A, Unix newline). |
229 | \row \li \b{\\r} |
230 | \li Matches the ASCII carriage return (CR, 0x0D). |
231 | \row \li \b{\\t} |
232 | \li Matches the ASCII horizontal tab (HT, 0x09). |
233 | \row \li \b{\\v} |
234 | \li Matches the ASCII vertical tab (VT, 0x0B). |
235 | \row \li \b{\\x\e{hhhh}} |
236 | \li Matches the Unicode character corresponding to the |
237 | hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF). |
238 | \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo}) |
\li Matches the ASCII/Latin1 character for the octal number
\e{ooo} (between 0 and 0377).
241 | \row \li \b{. (dot)} |
242 | \li Matches any character (including newline). |
243 | \row \li \b{\\d} |
244 | \li Matches a digit (QChar::isDigit()). |
245 | \row \li \b{\\D} |
246 | \li Matches a non-digit. |
247 | \row \li \b{\\s} |
248 | \li Matches a whitespace character (QChar::isSpace()). |
249 | \row \li \b{\\S} |
250 | \li Matches a non-whitespace character. |
251 | \row \li \b{\\w} |
252 | \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_'). |
253 | \row \li \b{\\W} |
254 | \li Matches a non-word character. |
255 | \row \li \b{\\\e{n}} |
256 | \li The \e{n}-th backreference, e.g. \\1, \\2, etc. |
257 | \endtable |
258 | |
259 | \b{Note:} The C++ compiler transforms backslashes in strings. |
260 | To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}. |
261 | To match the backslash character itself, enter it four times, i.e. |
262 | \c{\\\\}. |
263 | |
264 | \target sets-of-characters |
265 | \section1 Sets of Characters |
266 | |
267 | Square brackets mean match any character contained in the square |
268 | brackets. The character set abbreviations described above can |
269 | appear in a character set in square brackets. Except for the |
270 | character set abbreviations and the following two exceptions, |
271 | characters do not have special meanings in square brackets. |
272 | |
273 | \table |
274 | \row \li \b{^} |
275 | |
276 | \li The caret negates the character set if it occurs as the |
277 | first character (i.e. immediately after the opening square |
278 | bracket). \b{[abc]} matches 'a' or 'b' or 'c', but |
279 | \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'. |
280 | |
281 | \row \li \b{-} |
282 | |
283 | \li The dash indicates a range of characters. \b{[W-Z]} |
284 | matches 'W' or 'X' or 'Y' or 'Z'. |
285 | |
286 | \endtable |
287 | |
288 | Using the predefined character set abbreviations is more portable |
289 | than using character ranges across platforms and languages. For |
290 | example, \b{[0-9]} matches a digit in Western alphabets but |
291 | \b{\\d} matches a digit in \e any alphabet. |
292 | |
293 | Note: In other regexp documentation, sets of characters are often |
294 | called "character classes". |
295 | |
296 | \target quantifiers |
297 | \section1 Quantifiers |
298 | |
299 | By default, an expression is automatically quantified by |
300 | \b{{1,1}}, i.e. it should occur exactly once. In the following |
301 | list, \b{\e {E}} stands for expression. An expression is a |
302 | character, or an abbreviation for a set of characters, or a set of |
303 | characters in square brackets, or an expression in parentheses. |
304 | |
305 | \table |
306 | \row \li \b{\e {E}?} |
307 | |
308 | \li Matches zero or one occurrences of \e E. This quantifier |
309 | means \e{The previous expression is optional}, because it |
310 | will match whether or not the expression is found. \b{\e |
311 | {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?} |
312 | matches 'dent' or 'dents'. |
313 | |
314 | \row \li \b{\e {E}+} |
315 | |
316 | \li Matches one or more occurrences of \e E. \b{\e {E}+} is |
317 | the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0', |
318 | '00', '000', etc. |
319 | |
320 | \row \li \b{\e {E}*} |
321 | |
322 | \li Matches zero or more occurrences of \e E. It is the same |
323 | as \b{\e {E}{0,}}. The \b{*} quantifier is often used |
324 | in error where \b{+} should be used. For example, if |
325 | \b{\\s*$} is used in an expression to match strings that |
326 | end in whitespace, it will match every string because |
327 | \b{\\s*$} means \e{Match zero or more whitespaces followed |
328 | by end of string}. The correct regexp to match strings that |
329 | have at least one trailing whitespace character is |
330 | \b{\\s+$}. |
331 | |
332 | \row \li \b{\e {E}{n}} |
333 | |
334 | \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}} |
335 | is the same as repeating \e E \e n times. For example, |
336 | \b{x{5}} is the same as \b{xxxxx}. It is also the same |
337 | as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}. |
338 | |
339 | \row \li \b{\e {E}{n,}} |
340 | \li Matches at least \e n occurrences of \e E. |
341 | |
342 | \row \li \b{\e {E}{,m}} |
343 | \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}} |
344 | is the same as \b{\e {E}{0,m}}. |
345 | |
346 | \row \li \b{\e {E}{n,m}} |
347 | \li Matches at least \e n and at most \e m occurrences of \e E. |
348 | \endtable |
349 | |
350 | To apply a quantifier to more than just the preceding character, |
351 | use parentheses to group characters together in an expression. For |
352 | example, \b{tag+} matches a 't' followed by an 'a' followed by |
353 | at least one 'g', whereas \b{(tag)+} matches at least one |
354 | occurrence of 'tag'. |
355 | |
356 | Note: Quantifiers are normally "greedy". They always match as much |
357 | text as they can. For example, \b{0+} matches the first zero it |
358 | finds and all the consecutive zeros after the first zero. Applied |
359 | to '20005', it matches '2\underline{000}5'. Quantifiers can be made |
360 | non-greedy, see setMinimal(). |
361 | |
362 | \target capturing parentheses |
363 | \target backreferences |
364 | \section1 Capturing Text |
365 | |
366 | Parentheses allow us to group elements together so that we can |
367 | quantify and capture them. For example if we have the expression |
368 | \b{mail|letter|correspondence} that matches a string we know |
369 | that \e one of the words matched but not which one. Using |
370 | parentheses allows us to "capture" whatever is matched within |
371 | their bounds, so if we used \b{(mail|letter|correspondence)} |
372 | and matched this regexp against the string "I sent you some email" |
373 | we can use the cap() or capturedTexts() functions to extract the |
374 | matched characters, in this case 'mail'. |
375 | |
376 | We can use captured text within the regexp itself. To refer to the |
377 | captured text we use \e backreferences which are indexed from 1, |
378 | the same as for cap(). For example we could search for duplicate |
379 | words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a |
380 | word boundary followed by one or more word characters followed by |
381 | one or more non-word characters followed by the same text as the |
382 | first parenthesized expression followed by a word boundary. |
383 | |
384 | If we want to use parentheses purely for grouping and not for |
385 | capturing we can use the non-capturing syntax, e.g. |
386 | \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and |
387 | end ')'. In this example we match either 'green' or 'blue' but we |
388 | do not capture the match so we only know whether or not we matched |
389 | but not which color we actually found. Using non-capturing |
390 | parentheses is more efficient than using capturing parentheses |
391 | since the regexp engine has to do less book-keeping. |
392 | |
393 | Both capturing and non-capturing parentheses may be nested. |
394 | |
395 | \target greedy quantifiers |
396 | |
397 | For historical reasons, quantifiers (e.g. \b{*}) that apply to |
398 | capturing parentheses are more "greedy" than other quantifiers. |
399 | For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa". |
400 | This behavior is different from what other regexp engines do |
401 | (notably, Perl). To obtain a more intuitive capturing behavior, |
402 | specify QRegExp::RegExp2 to the QRegExp constructor or call |
403 | setPatternSyntax(QRegExp::RegExp2). |
404 | |
405 | \target cap_in_a_loop |
406 | |
407 | When the number of matches cannot be determined in advance, a |
408 | common idiom is to use cap() in a loop. For example: |
409 | |
410 | \snippet code/src_corelib_text_qregexp.cpp 0 |
411 | |
412 | \target assertions |
413 | \section1 Assertions |
414 | |
415 | Assertions make some statement about the text at the point where |
416 | they occur in the regexp but they do not match any characters. In |
417 | the following list \b{\e {E}} stands for any expression. |
418 | |
419 | \table |
420 | \row \li \b{^} |
421 | \li The caret signifies the beginning of the string. If you |
422 | wish to match a literal \c{^} you must escape it by |
423 | writing \c{\\^}. For example, \b{^#include} will only |
424 | match strings which \e begin with the characters '#include'. |
425 | (When the caret is the first character of a character set it |
426 | has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.) |
427 | |
428 | \row \li \b{$} |
429 | \li The dollar signifies the end of the string. For example |
430 | \b{\\d\\s*$} will match strings which end with a digit |
431 | optionally followed by whitespace. If you wish to match a |
432 | literal \c{$} you must escape it by writing |
433 | \c{\\$}. |
434 | |
435 | \row \li \b{\\b} |
436 | \li A word boundary. For example the regexp |
437 | \b{\\bOK\\b} means match immediately after a word |
438 | boundary (e.g. start of string or whitespace) the letter 'O' |
439 | then the letter 'K' immediately before another word boundary |
440 | (e.g. end of string or whitespace). But note that the |
441 | assertion does not actually match any whitespace so if we |
442 | write \b{(\\bOK\\b)} and we have a match it will only |
443 | contain 'OK' even if the string is "It's \underline{OK} now". |
444 | |
445 | \row \li \b{\\B} |
446 | \li A non-word boundary. This assertion is true wherever |
447 | \b{\\b} is false. For example if we searched for |
448 | \b{\\Bon\\B} in "Left on" the match would fail (space |
449 | and end of string aren't non-word boundaries), but it would |
450 | match in "t\underline{on}ne". |
451 | |
452 | \row \li \b{(?=\e E)} |
453 | \li Positive lookahead. This assertion is true if the |
454 | expression matches at this point in the regexp. For example, |
455 | \b{const(?=\\s+char)} matches 'const' whenever it is |
456 | followed by 'char', as in 'static \underline{const} char *'. |
457 | (Compare with \b{const\\s+char}, which matches 'static |
458 | \underline{const char} *'.) |
459 | |
460 | \row \li \b{(?!\e E)} |
461 | \li Negative lookahead. This assertion is true if the |
462 | expression does not match at this point in the regexp. For |
463 | example, \b{const(?!\\s+char)} matches 'const' \e except |
464 | when it is followed by 'char'. |
465 | \endtable |
466 | |
467 | \target QRegExp wildcard matching |
468 | \section1 Wildcard Matching |
469 | |
470 | Most command shells such as \e bash or \e cmd.exe support "file |
471 | globbing", the ability to identify a group of files by using |
472 | wildcards. The setPatternSyntax() function is used to switch |
473 | between regexp and wildcard mode. Wildcard matching is much |
474 | simpler than full regexps and has only four features: |
475 | |
476 | \table |
477 | \row \li \b{c} |
478 | \li Any character represents itself apart from those mentioned |
479 | below. Thus \b{c} matches the character \e c. |
480 | \row \li \b{?} |
481 | \li Matches any single character. It is the same as |
482 | \b{.} in full regexps. |
483 | \row \li \b{*} |
484 | \li Matches zero or more of any characters. It is the |
485 | same as \b{.*} in full regexps. |
486 | \row \li \b{[...]} |
487 | \li Sets of characters can be represented in square brackets, |
488 | similar to full regexps. Within the character class, like |
489 | outside, backslash has no special meaning. |
490 | \endtable |
491 | |
492 | In the mode Wildcard, the wildcard characters cannot be |
493 | escaped. In the mode WildcardUnix, the character '\\' escapes the |
494 | wildcard. |
495 | |
496 | For example if we are in wildcard mode and have strings which |
497 | contain filenames we could identify HTML files with \b{*.html}. |
498 | This will match zero or more characters followed by a dot followed |
499 | by 'h', 't', 'm' and 'l'. |
500 | |
501 | To test a string against a wildcard expression, use exactMatch(). |
502 | For example: |
503 | |
504 | \snippet code/src_corelib_text_qregexp.cpp 1 |
505 | |
506 | \target perl-users |
507 | \section1 Notes for Perl Users |
508 | |
509 | Most of the character class abbreviations supported by Perl are |
510 | supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters} |
511 | {characters and abbreviations for sets of characters}. |
512 | |
513 | In QRegExp, apart from within character classes, \c{^} always |
514 | signifies the start of the string, so carets must always be |
515 | escaped unless used for that purpose. In Perl the meaning of caret |
516 | varies automagically depending on where it occurs so escaping it |
517 | is rarely necessary. The same applies to \c{$} which in |
518 | QRegExp always signifies the end of the string. |
519 | |
520 | QRegExp's quantifiers are the same as Perl's greedy quantifiers |
521 | (but see the \l{greedy quantifiers}{note above}). Non-greedy |
522 | matching cannot be applied to individual quantifiers, but can be |
523 | applied to all the quantifiers in the pattern. For example, to |
524 | match the Perl regexp \b{ro+?m} requires: |
525 | |
526 | \snippet code/src_corelib_text_qregexp.cpp 2 |
527 | |
528 | The equivalent of Perl's \c{/i} option is |
529 | setCaseSensitivity(Qt::CaseInsensitive). |
530 | |
531 | Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}. |
532 | |
533 | In QRegExp \b{.} matches any character, therefore all QRegExp |
534 | regexps have the equivalent of Perl's \c{/s} option. QRegExp |
535 | does not have an equivalent to Perl's \c{/m} option, but this |
536 | can be emulated in various ways for example by splitting the input |
537 | into lines or by looping with a regexp that searches for newlines. |
538 | |
539 | Because QRegExp is string oriented, there are no \\A, \\Z, or \\z |
540 | assertions. The \\G assertion is not supported but can be emulated |
541 | in a loop. |
542 | |
543 | Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp |
544 | equivalents for $`, $' or $+. Perl's capturing variables, $1, $2, |
545 | ... correspond to cap(1) or capturedTexts()[1], cap(2) or |
546 | capturedTexts()[2], etc. |
547 | |
548 | To substitute a pattern use QString::replace(). |
549 | |
550 | Perl's extended \c{/x} syntax is not supported, nor are |
551 | directives, e.g. (?i), or regexp comments, e.g. (?#comment). On |
552 | the other hand, C++'s rules for literal strings can be used to |
553 | achieve the same: |
554 | |
555 | \snippet code/src_corelib_text_qregexp.cpp 3 |
556 | |
557 | Both zero-width positive and zero-width negative lookahead |
558 | assertions (?=pattern) and (?!pattern) are supported with the same |
559 | syntax as Perl. Perl's lookbehind assertions, "independent" |
560 | subexpressions and conditional expressions are not supported. |
561 | |
562 | Non-capturing parentheses are also supported, with the same |
563 | (?:pattern) syntax. |
564 | |
565 | See QString::split() and QStringList::join() for equivalents |
566 | to Perl's split and join functions. |
567 | |
568 | Note: because C++ transforms \\'s they must be written \e twice in |
569 | code, e.g. \b{\\b} must be written \b{\\\\b}. |
570 | |
571 | \target code-examples |
572 | \section1 Code Examples |
573 | |
574 | \snippet code/src_corelib_text_qregexp.cpp 4 |
575 | |
576 | The third string matches '\underline{6}'. This is a simple validation |
577 | regexp for integers in the range 0 to 99. |
578 | |
579 | \snippet code/src_corelib_text_qregexp.cpp 5 |
580 | |
581 | The second string matches '\underline{This_is-OK}'. We've used the |
582 | character set abbreviation '\\S' (non-whitespace) and the anchors |
583 | to match strings which contain no whitespace. |
584 | |
585 | In the following example we match strings containing 'mail' or |
'letter' or 'correspondence' but only match whole words, i.e. not
'email'.
588 | |
589 | \snippet code/src_corelib_text_qregexp.cpp 6 |
590 | |
591 | The second string matches "Please write the \underline{letter}". The |
592 | word 'letter' is also captured (because of the parentheses). We |
593 | can see what text we've captured like this: |
594 | |
595 | \snippet code/src_corelib_text_qregexp.cpp 7 |
596 | |
597 | This will capture the text from the first set of capturing |
598 | parentheses (counting capturing left parentheses from left to |
599 | right). The parentheses are counted from 1 since cap(0) is the |
600 | whole matched regexp (equivalent to '&' in most regexp engines). |
601 | |
602 | \snippet code/src_corelib_text_qregexp.cpp 8 |
603 | |
604 | Here we've passed the QRegExp to QString's replace() function to |
605 | replace the matched text with new text. |
606 | |
607 | \snippet code/src_corelib_text_qregexp.cpp 9 |
608 | |
609 | We've used the indexIn() function to repeatedly match the regexp in |
610 | the string. Note that instead of moving forward by one character |
611 | at a time \c pos++ we could have written \c {pos += |
612 | rx.matchedLength()} to skip over the already matched string. The |
613 | count will equal 3, matching 'One \underline{Eric} another |
614 | \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it |
doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
by word boundaries.
617 | |
618 | One common use of regexps is to split lines of delimited data into |
619 | their component fields. |
620 | |
621 | \snippet code/src_corelib_text_qregexp.cpp 10 |
622 | |
623 | In this example our input lines have the format company name, web |
624 | address and country. Unfortunately the regexp is rather long and |
625 | not very versatile -- the code will break if we add any more |
626 | fields. A simpler and better solution is to look for the |
627 | separator, '\\t' in this case, and take the surrounding text. The |
628 | QString::split() function can take a separator string or regexp |
629 | as an argument and split a string accordingly. |
630 | |
631 | \snippet code/src_corelib_text_qregexp.cpp 11 |
632 | |
633 | Here field[0] is the company, field[1] the web address and so on. |
634 | |
635 | To imitate the matching of a shell we can use wildcard mode. |
636 | |
637 | \snippet code/src_corelib_text_qregexp.cpp 12 |
638 | |
639 | Wildcard matching can be convenient because of its simplicity, but |
640 | any wildcard regexp can be defined using full regexps, e.g. |
641 | \b{.*\\.html$}. Notice that we can't match both \c .html and \c |
642 | .htm files with a wildcard unless we use \b{*.htm*} which will |
643 | also match 'test.html.bak'. A full regexp gives us the precision |
644 | we need, \b{.*\\.html?$}. |
645 | |
646 | QRegExp can match case insensitively using setCaseSensitivity(), |
647 | and can use non-greedy matching, see setMinimal(). By |
648 | default QRegExp uses full regexps but this can be changed with |
649 | setPatternSyntax(). Searching can be done forward with indexIn() or backward |
650 | with lastIndexIn(). Captured text can be accessed using |
651 | capturedTexts() which returns a string list of all captured |
652 | strings, or using cap() which returns the captured string for the |
653 | given index. The pos() function takes a match index and returns |
654 | the position in the string where the match was made (or -1 if |
655 | there was no match). |
656 | |
657 | \sa QString, QStringList, QSortFilterProxyModel |
658 | |
659 | \section1 Porting to QRegularExpression |
660 | |
661 | \include corelib/port-from-qregexp.qdocinc porting-to-qregularexpression |
662 | */ |
663 | |
#if defined(Q_OS_VXWORKS) && defined(EOS)
#  undef EOS
#endif

// Number of buckets in the bad-character tables; BadChar() folds a
// character's code unit into one of these buckets.
constexpr int NumBadChars = 64;
#define BadChar(ch) ((ch).unicode() % NumBadChars)

// Sentinel stored in first-occurrence (occ1) tables.
constexpr int NoOccurrence = INT_MAX;
// Sentinel stored in capBegin[] for a capture that holds no text.
constexpr int EmptyCapture = INT_MAX;
// Stands for an unbounded maximum length.
constexpr int InftyLen = INT_MAX;
// NOTE(review): presumably the repetition count standing for "unbounded";
// its users are outside this part of the file.
constexpr int InftyRep = 1025;
// NOTE(review): presumably the tokenizer's end-of-input marker; confirm
// against getChar().
constexpr int EOS = -1;
676 | |
677 | static bool isWord(QChar ch) |
678 | { |
679 | return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_'); |
680 | } |
681 | |
682 | /* |
683 | Merges two vectors of ints and puts the result into the first |
684 | one. |
685 | */ |
686 | static void mergeInto(QList<int> *a, const QList<int> &b) |
687 | { |
688 | int asize = a->size(); |
689 | int bsize = b.size(); |
690 | if (asize == 0) { |
691 | *a = b; |
692 | #ifndef QT_NO_REGEXP_OPTIM |
693 | } else if (bsize == 1 && a->at(i: asize - 1) < b.at(i: 0)) { |
694 | a->resize(size: asize + 1); |
695 | (*a)[asize] = b.at(i: 0); |
696 | #endif |
697 | } else if (bsize >= 1) { |
698 | int csize = asize + bsize; |
699 | QList<int> c(csize); |
700 | int i = 0, j = 0, k = 0; |
701 | while (i < asize) { |
702 | if (j < bsize) { |
703 | if (a->at(i) == b.at(i: j)) { |
704 | ++i; |
705 | --csize; |
706 | } else if (a->at(i) < b.at(i: j)) { |
707 | c[k++] = a->at(i: i++); |
708 | } else { |
709 | c[k++] = b.at(i: j++); |
710 | } |
711 | } else { |
712 | memcpy(dest: c.data() + k, src: a->constData() + i, n: (asize - i) * sizeof(int)); |
713 | break; |
714 | } |
715 | } |
716 | c.resize(size: csize); |
717 | if (j < bsize) |
718 | memcpy(dest: c.data() + k, src: b.constData() + j, n: (bsize - j) * sizeof(int)); |
719 | *a = c; |
720 | } |
721 | } |
722 | |
723 | #ifndef QT_NO_REGEXP_WILDCARD |
724 | /* |
725 | Translates a wildcard pattern to an equivalent regular expression |
726 | pattern (e.g., *.cpp to .*\.cpp). |
727 | |
728 | If enableEscaping is true, it is possible to escape the wildcard |
729 | characters with \ |
730 | */ |
731 | static QString wc2rx(const QString &wc_str, const bool enableEscaping) |
732 | { |
733 | const int wclen = wc_str.size(); |
734 | QString rx; |
735 | int i = 0; |
736 | bool isEscaping = false; // the previous character is '\' |
737 | const QChar *wc = wc_str.unicode(); |
738 | |
739 | while (i < wclen) { |
740 | const QChar c = wc[i++]; |
741 | switch (c.unicode()) { |
742 | case '\\': |
743 | if (enableEscaping) { |
744 | if (isEscaping) { |
745 | rx += QLatin1String("\\\\" ); |
746 | } // we insert the \\ later if necessary |
747 | if (i == wclen) { // the end |
748 | rx += QLatin1String("\\\\" ); |
749 | } |
750 | } else { |
751 | rx += QLatin1String("\\\\" ); |
752 | } |
753 | isEscaping = true; |
754 | break; |
755 | case '*': |
756 | if (isEscaping) { |
757 | rx += QLatin1String("\\*" ); |
758 | isEscaping = false; |
759 | } else { |
760 | rx += QLatin1String(".*" ); |
761 | } |
762 | break; |
763 | case '?': |
764 | if (isEscaping) { |
765 | rx += QLatin1String("\\?" ); |
766 | isEscaping = false; |
767 | } else { |
768 | rx += QLatin1Char('.'); |
769 | } |
770 | |
771 | break; |
772 | case '$': |
773 | case '(': |
774 | case ')': |
775 | case '+': |
776 | case '.': |
777 | case '^': |
778 | case '{': |
779 | case '|': |
780 | case '}': |
781 | if (isEscaping) { |
782 | isEscaping = false; |
783 | rx += QLatin1String("\\\\" ); |
784 | } |
785 | rx += QLatin1Char('\\'); |
786 | rx += c; |
787 | break; |
788 | case '[': |
789 | if (isEscaping) { |
790 | isEscaping = false; |
791 | rx += QLatin1String("\\[" ); |
792 | } else { |
793 | rx += c; |
794 | if (wc[i] == QLatin1Char('^')) |
795 | rx += wc[i++]; |
796 | if (i < wclen) { |
797 | if (wc[i] == QLatin1Char(']')) |
798 | rx += wc[i++]; |
799 | while (i < wclen && wc[i] != QLatin1Char(']')) { |
800 | if (wc[i] == QLatin1Char('\\')) |
801 | rx += QLatin1Char('\\'); |
802 | rx += wc[i++]; |
803 | } |
804 | } |
805 | } |
806 | break; |
807 | |
808 | case ']': |
809 | if (isEscaping){ |
810 | isEscaping = false; |
811 | rx += QLatin1String("\\" ); |
812 | } |
813 | rx += c; |
814 | break; |
815 | |
816 | default: |
817 | if (isEscaping){ |
818 | isEscaping = false; |
819 | rx += QLatin1String("\\\\" ); |
820 | } |
821 | rx += c; |
822 | } |
823 | } |
824 | return rx; |
825 | } |
826 | #endif |
827 | |
828 | static int caretIndex(int offset, QRegExp::CaretMode caretMode) |
829 | { |
830 | if (caretMode == QRegExp::CaretAtZero) { |
831 | return 0; |
832 | } else if (caretMode == QRegExp::CaretAtOffset) { |
833 | return offset; |
834 | } else { // QRegExp::CaretWontMatch |
835 | return -1; |
836 | } |
837 | } |
838 | |
839 | /* |
840 | The QRegExpEngineKey struct uniquely identifies an engine. |
841 | */ |
842 | struct QRegExpEngineKey |
843 | { |
844 | QString pattern; |
845 | QRegExp::PatternSyntax patternSyntax; |
846 | Qt::CaseSensitivity cs; |
847 | |
848 | inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax, |
849 | Qt::CaseSensitivity cs) |
850 | : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {} |
851 | |
852 | inline void clear() { |
853 | pattern.clear(); |
854 | patternSyntax = QRegExp::RegExp; |
855 | cs = Qt::CaseSensitive; |
856 | } |
857 | }; |
858 | |
859 | static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2) |
860 | { |
861 | return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax |
862 | && key1.cs == key2.cs; |
863 | } |
864 | |
865 | static size_t qHash(const QRegExpEngineKey &key, size_t seed = 0) noexcept |
866 | { |
867 | return qHashMulti(seed, args: key.pattern, args: key.patternSyntax, args: key.cs); |
868 | } |
869 | |
870 | class QRegExpEngine; |
871 | |
872 | /* |
873 | This is the engine state during matching. |
874 | */ |
875 | struct QRegExpMatchState |
876 | { |
877 | const QChar *in; // a pointer to the input string data |
878 | int pos; // the current position in the string |
879 | int caretPos; |
880 | int len; // the length of the input string |
881 | bool minimal; // minimal matching? |
882 | int *bigArray; // big array holding the data for the next pointers |
883 | int *inNextStack; // is state is nextStack? |
884 | int *curStack; // stack of current states |
885 | int *nextStack; // stack of next states |
886 | int *curCapBegin; // start of current states' captures |
887 | int *nextCapBegin; // start of next states' captures |
888 | int *curCapEnd; // end of current states' captures |
889 | int *nextCapEnd; // end of next states' captures |
890 | int *tempCapBegin; // start of temporary captures |
891 | int *tempCapEnd; // end of temporary captures |
892 | int *capBegin; // start of captures for a next state |
893 | int *capEnd; // end of captures for a next state |
894 | int *slideTab; // bump-along slide table for bad-character heuristic |
895 | int *captured; // what match() returned last |
896 | int slideTabSize; // size of slide table |
897 | int capturedSize; |
898 | #ifndef QT_NO_REGEXP_BACKREF |
899 | QList<QList<int>> sleeping; // list of back-reference sleepers |
900 | #endif |
901 | int matchLen; // length of match |
902 | int oneTestMatchedLen; // length of partial match |
903 | |
904 | const QRegExpEngine *eng; |
905 | |
906 | inline QRegExpMatchState() : bigArray(nullptr), captured(nullptr) {} |
907 | inline ~QRegExpMatchState() { free(ptr: bigArray); } |
908 | |
909 | void drain() { free(ptr: bigArray); bigArray = nullptr; captured = nullptr; } // to save memory |
910 | void prepareForMatch(QRegExpEngine *eng); |
911 | void match(const QChar *str, int len, int pos, bool minimal, |
912 | bool oneTest, int caretIndex); |
913 | bool matchHere(); |
914 | bool testAnchor(int i, int a, const int *capBegin); |
915 | }; |
916 | |
917 | /* |
918 | The struct QRegExpAutomatonState represents one state in a modified NFA. The |
919 | input characters matched are stored in the state instead of on |
920 | the transitions, something possible for an automaton |
921 | constructed from a regular expression. |
922 | */ |
923 | struct QRegExpAutomatonState |
924 | { |
925 | #ifndef QT_NO_REGEXP_CAPTURE |
926 | int atom; // which atom does this state belong to? |
927 | #endif |
928 | int match; // what does it match? (see CharClassBit and BackRefBit) |
929 | QList<int> outs; // out-transitions |
930 | QMap<int, int> reenter; // atoms reentered when transiting out |
931 | QMap<int, int> anchors; // anchors met when transiting out |
932 | |
933 | inline QRegExpAutomatonState() { } |
934 | #ifndef QT_NO_REGEXP_CAPTURE |
935 | inline QRegExpAutomatonState(int a, int m) |
936 | : atom(a), match(m) { } |
937 | #else |
938 | inline QRegExpAutomatonState(int m) |
939 | : match(m) { } |
940 | #endif |
941 | }; |
942 | |
943 | Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_RELOCATABLE_TYPE); |
944 | |
945 | /* |
946 | The struct QRegExpCharClassRange represents a range of characters (e.g., |
947 | [0-9] denotes range 48 to 57). |
948 | */ |
949 | struct QRegExpCharClassRange |
950 | { |
951 | ushort from; // 48 |
952 | ushort len; // 10 |
953 | }; |
954 | |
955 | Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE); |
956 | |
957 | #ifndef QT_NO_REGEXP_CAPTURE |
958 | /* |
959 | The struct QRegExpAtom represents one node in the hierarchy of regular |
960 | expression atoms. |
961 | */ |
962 | struct QRegExpAtom |
963 | { |
964 | enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 }; |
965 | |
966 | int parent; // index of parent in array of atoms |
967 | int capture; // index of capture, from 1 to ncap - 1 |
968 | }; |
969 | |
970 | Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE); |
971 | #endif |
972 | |
973 | struct QRegExpLookahead; |
974 | |
975 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
976 | /* |
977 | The struct QRegExpAnchorAlternation represents a pair of anchors with |
978 | OR semantics. |
979 | */ |
980 | struct QRegExpAnchorAlternation |
981 | { |
982 | int a; // this anchor... |
983 | int b; // ...or this one |
984 | }; |
985 | |
986 | Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE); |
987 | #endif |
988 | |
989 | #ifndef QT_NO_REGEXP_CCLASS |
990 | |
991 | #define FLAG(x) (1 << (x)) |
992 | /* |
993 | The class QRegExpCharClass represents a set of characters, such as can |
994 | be found in regular expressions (e.g., [a-z] denotes the set |
995 | {a, b, ..., z}). |
996 | */ |
997 | class QRegExpCharClass |
998 | { |
999 | public: |
1000 | QRegExpCharClass(); |
1001 | |
1002 | void clear(); |
1003 | bool negative() const { return n; } |
1004 | void setNegative(bool negative); |
1005 | void addCategories(uint cats); |
1006 | void addRange(ushort from, ushort to); |
1007 | void addSingleton(ushort ch) { addRange(from: ch, to: ch); } |
1008 | |
1009 | bool in(QChar ch) const; |
1010 | #ifndef QT_NO_REGEXP_OPTIM |
1011 | const QList<int> &firstOccurrence() const { return occ1; } |
1012 | #endif |
1013 | |
1014 | #if defined(QT_DEBUG) |
1015 | void dump() const; |
1016 | #endif |
1017 | |
1018 | private: |
1019 | QList<QRegExpCharClassRange> r; // character ranges |
1020 | #ifndef QT_NO_REGEXP_OPTIM |
1021 | QList<int> occ1; // first-occurrence array |
1022 | #endif |
1023 | uint c; // character classes |
1024 | bool n; // negative? |
1025 | }; |
1026 | #else |
1027 | struct QRegExpCharClass |
1028 | { |
1029 | int dummy; |
1030 | |
1031 | #ifndef QT_NO_REGEXP_OPTIM |
1032 | QRegExpCharClass() { occ1.fill(0, NumBadChars); } |
1033 | |
1034 | const QList<int> &firstOccurrence() const { return occ1; } |
1035 | QList<int> occ1; |
1036 | #endif |
1037 | }; |
1038 | #endif |
1039 | |
1040 | Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_RELOCATABLE_TYPE); |
1041 | |
1042 | /* |
1043 | The QRegExpEngine class encapsulates a modified nondeterministic |
1044 | finite automaton (NFA). |
1045 | */ |
1046 | class QRegExpEngine |
1047 | { |
1048 | public: |
1049 | QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers) |
1050 | : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); } |
1051 | |
1052 | QRegExpEngine(const QRegExpEngineKey &key); |
1053 | ~QRegExpEngine(); |
1054 | |
1055 | bool isValid() const { return valid; } |
1056 | const QString &errorString() const { return yyError; } |
1057 | int captureCount() const { return officialncap; } |
1058 | |
1059 | int createState(QChar ch); |
1060 | int createState(const QRegExpCharClass &cc); |
1061 | #ifndef QT_NO_REGEXP_BACKREF |
1062 | int createState(int bref); |
1063 | #endif |
1064 | |
1065 | void addCatTransitions(const QList<int> &from, const QList<int> &to); |
1066 | #ifndef QT_NO_REGEXP_CAPTURE |
1067 | void addPlusTransitions(const QList<int> &from, const QList<int> &to, int atom); |
1068 | #endif |
1069 | |
1070 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1071 | int anchorAlternation(int a, int b); |
1072 | int anchorConcatenation(int a, int b); |
1073 | #else |
1074 | int anchorAlternation(int a, int b) { return a & b; } |
1075 | int anchorConcatenation(int a, int b) { return a | b; } |
1076 | #endif |
1077 | void addAnchors(int from, int to, int a); |
1078 | |
1079 | #ifndef QT_NO_REGEXP_OPTIM |
1080 | void heuristicallyChooseHeuristic(); |
1081 | #endif |
1082 | |
1083 | #if defined(QT_DEBUG) |
1084 | void dump() const; |
1085 | #endif |
1086 | |
1087 | QAtomicInt ref; |
1088 | |
1089 | private: |
1090 | enum { CharClassBit = 0x10000, BackRefBit = 0x20000 }; |
1091 | enum { InitialState = 0, FinalState = 1 }; |
1092 | |
1093 | void setup(); |
1094 | int setupState(int match); |
1095 | |
1096 | /* |
1097 | Let's hope that 13 lookaheads and 14 back-references are |
1098 | enough. |
1099 | */ |
1100 | enum { MaxLookaheads = 13, MaxBackRefs = 14 }; |
1101 | enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004, |
1102 | Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010, |
1103 | Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads, |
1104 | Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1, |
1105 | Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs, |
1106 | |
1107 | Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^ |
1108 | ((Anchor_FirstLookahead << MaxLookaheads) - 1) }; |
1109 | #ifndef QT_NO_REGEXP_CAPTURE |
1110 | int startAtom(bool officialCapture); |
1111 | void finishAtom(int atom, bool needCapture); |
1112 | #endif |
1113 | |
1114 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1115 | int addLookahead(QRegExpEngine *eng, bool negative); |
1116 | #endif |
1117 | |
1118 | #ifndef QT_NO_REGEXP_OPTIM |
1119 | bool goodStringMatch(QRegExpMatchState &matchState) const; |
1120 | bool badCharMatch(QRegExpMatchState &matchState) const; |
1121 | #else |
1122 | bool bruteMatch(QRegExpMatchState &matchState) const; |
1123 | #endif |
1124 | |
1125 | QList<QRegExpAutomatonState> s; // array of states |
1126 | #ifndef QT_NO_REGEXP_CAPTURE |
1127 | QList<QRegExpAtom> f; // atom hierarchy |
1128 | int nf; // number of atoms |
1129 | int cf; // current atom |
1130 | QList<int> captureForOfficialCapture; |
1131 | #endif |
1132 | int officialncap; // number of captures, seen from the outside |
1133 | int ncap; // number of captures, seen from the inside |
1134 | #ifndef QT_NO_REGEXP_CCLASS |
1135 | QList<QRegExpCharClass> cl; // array of character classes |
1136 | #endif |
1137 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1138 | QList<QRegExpLookahead *> ahead; // array of lookaheads |
1139 | #endif |
1140 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1141 | QList<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors |
1142 | #endif |
1143 | #ifndef QT_NO_REGEXP_OPTIM |
1144 | bool caretAnchored; // does the regexp start with ^? |
1145 | bool trivial; // is the good-string all that needs to match? |
1146 | #endif |
1147 | bool valid; // is the regular expression valid? |
1148 | Qt::CaseSensitivity cs; // case sensitive? |
1149 | bool greedyQuantifiers; // RegExp2? |
1150 | bool xmlSchemaExtensions; |
1151 | #ifndef QT_NO_REGEXP_BACKREF |
1152 | int nbrefs; // number of back-references |
1153 | #endif |
1154 | |
1155 | #ifndef QT_NO_REGEXP_OPTIM |
1156 | bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch |
1157 | |
1158 | int goodEarlyStart; // the index where goodStr can first occur in a match |
1159 | int goodLateStart; // the index where goodStr can last occur in a match |
1160 | QString goodStr; // the string that any match has to contain |
1161 | |
1162 | int minl; // the minimum length of a match |
1163 | QList<int> occ1; // first-occurrence array |
1164 | #endif |
1165 | |
1166 | /* |
1167 | The class Box is an abstraction for a regular expression |
1168 | fragment. It can also be seen as one node in the syntax tree of |
1169 | a regular expression with synthetized attributes. |
1170 | |
1171 | Its interface is ugly for performance reasons. |
1172 | */ |
1173 | class Box |
1174 | { |
1175 | public: |
1176 | Box(QRegExpEngine *engine); |
1177 | Box(const Box &b) { operator=(b); } |
1178 | |
1179 | Box &operator=(const Box &b); |
1180 | |
1181 | void clear() { operator=(b: Box(eng)); } |
1182 | void set(QChar ch); |
1183 | void set(const QRegExpCharClass &cc); |
1184 | #ifndef QT_NO_REGEXP_BACKREF |
1185 | void set(int bref); |
1186 | #endif |
1187 | |
1188 | void cat(const Box &b); |
1189 | void orx(const Box &b); |
1190 | void plus(int atom); |
1191 | void opt(); |
1192 | void catAnchor(int a); |
1193 | #ifndef QT_NO_REGEXP_OPTIM |
1194 | void setupHeuristics(); |
1195 | #endif |
1196 | |
1197 | #if defined(QT_DEBUG) |
1198 | void dump() const; |
1199 | #endif |
1200 | |
1201 | private: |
1202 | void addAnchorsToEngine(const Box &to) const; |
1203 | |
1204 | QRegExpEngine *eng; // the automaton under construction |
1205 | QList<int> ls; // the left states (firstpos) |
1206 | QList<int> rs; // the right states (lastpos) |
1207 | QMap<int, int> lanchors; // the left anchors |
1208 | QMap<int, int> ranchors; // the right anchors |
1209 | int skipanchors; // the anchors to match if the box is skipped |
1210 | |
1211 | #ifndef QT_NO_REGEXP_OPTIM |
1212 | int earlyStart; // the index where str can first occur |
1213 | int lateStart; // the index where str can last occur |
1214 | QString str; // a string that has to occur in any match |
1215 | QString leftStr; // a string occurring at the left of this box |
1216 | QString rightStr; // a string occurring at the right of this box |
1217 | int maxl; // the maximum length of this box (possibly InftyLen) |
1218 | #endif |
1219 | |
1220 | int minl; // the minimum length of this box |
1221 | #ifndef QT_NO_REGEXP_OPTIM |
1222 | QList<int> occ1; // first-occurrence array |
1223 | #endif |
1224 | }; |
1225 | |
1226 | friend class Box; |
1227 | |
1228 | /* |
1229 | This is the lexical analyzer for regular expressions. |
1230 | */ |
1231 | enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead, |
1232 | Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar, |
1233 | Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 }; |
1234 | int getChar(); |
1235 | int getEscape(); |
1236 | #ifndef QT_NO_REGEXP_INTERVAL |
1237 | int getRep(int def); |
1238 | #endif |
1239 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1240 | void skipChars(int n); |
1241 | #endif |
1242 | void error(const char *msg); |
1243 | void startTokenizer(const QChar *rx, int len); |
1244 | int getToken(); |
1245 | |
1246 | const QChar *yyIn; // a pointer to the input regular expression pattern |
1247 | int yyPos0; // the position of yyTok in the input pattern |
1248 | int yyPos; // the position of the next character to read |
1249 | int yyLen; // the length of yyIn |
1250 | int yyCh; // the last character read |
1251 | QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens |
1252 | int yyMinRep; // attribute for Tok_Quantifier |
1253 | int yyMaxRep; // ditto |
1254 | QString yyError; // syntax error or overflow during parsing? |
1255 | |
1256 | /* |
1257 | This is the syntactic analyzer for regular expressions. |
1258 | */ |
1259 | int parse(const QChar *rx, int len); |
1260 | void parseAtom(Box *box); |
1261 | void parseFactor(Box *box); |
1262 | void parseTerm(Box *box); |
1263 | void parseExpression(Box *box); |
1264 | |
1265 | int yyTok; // the last token read |
1266 | bool yyMayCapture; // set this to false to disable capturing |
1267 | |
1268 | friend struct QRegExpMatchState; |
1269 | }; |
1270 | |
1271 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1272 | /* |
1273 | The struct QRegExpLookahead represents a lookahead a la Perl (e.g., |
1274 | (?=foo) and (?!bar)). |
1275 | */ |
1276 | struct QRegExpLookahead |
1277 | { |
1278 | QRegExpEngine *eng; // NFA representing the embedded regular expression |
1279 | bool neg; // negative lookahead? |
1280 | |
1281 | inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0) |
1282 | : eng(eng0), neg(neg0) { } |
1283 | inline ~QRegExpLookahead() { delete eng; } |
1284 | }; |
1285 | #endif |
1286 | |
1287 | /*! |
1288 | \internal |
1289 | convert the pattern string to the RegExp syntax. |
1290 | |
1291 | This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan |
1292 | */ |
1293 | Q_CORE5COMPAT_EXPORT QString qt_regexp_toCanonical(const QString &pattern, |
1294 | QRegExp::PatternSyntax patternSyntax) |
1295 | { |
1296 | switch (patternSyntax) { |
1297 | #ifndef QT_NO_REGEXP_WILDCARD |
1298 | case QRegExp::Wildcard: |
1299 | return wc2rx(wc_str: pattern, enableEscaping: false); |
1300 | case QRegExp::WildcardUnix: |
1301 | return wc2rx(wc_str: pattern, enableEscaping: true); |
1302 | #endif |
1303 | case QRegExp::FixedString: |
1304 | return QRegExp::escape(str: pattern); |
1305 | case QRegExp::W3CXmlSchema11: |
1306 | default: |
1307 | return pattern; |
1308 | } |
1309 | } |
1310 | |
1311 | QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key) |
1312 | : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2), |
1313 | xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11) |
1314 | { |
1315 | setup(); |
1316 | |
1317 | QString rx = qt_regexp_toCanonical(pattern: key.pattern, patternSyntax: key.patternSyntax); |
1318 | |
1319 | valid = (parse(rx: rx.unicode(), len: rx.size()) == rx.size()); |
1320 | if (!valid) { |
1321 | #ifndef QT_NO_REGEXP_OPTIM |
1322 | trivial = false; |
1323 | #endif |
1324 | error(RXERR_LEFTDELIM); |
1325 | } |
1326 | } |
1327 | |
1328 | QRegExpEngine::~QRegExpEngine() |
1329 | { |
1330 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1331 | qDeleteAll(c: ahead); |
1332 | #endif |
1333 | } |
1334 | |
1335 | void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng) |
1336 | { |
1337 | /* |
1338 | We use one QList<int> for all the big data used a lot in |
1339 | matchHere() and friends. |
1340 | */ |
1341 | int ns = eng->s.size(); // number of states |
1342 | int ncap = eng->ncap; |
1343 | #ifndef QT_NO_REGEXP_OPTIM |
1344 | int newSlideTabSize = qMax(a: eng->minl + 1, b: 16); |
1345 | #else |
1346 | int newSlideTabSize = 0; |
1347 | #endif |
1348 | int numCaptures = eng->captureCount(); |
1349 | int newCapturedSize = 2 + 2 * numCaptures; |
1350 | bigArray = q_check_ptr(p: (int *)realloc(ptr: bigArray, size: ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int))); |
1351 | |
1352 | // set all internal variables only _after_ bigArray is realloc'ed |
1353 | // to prevent a broken regexp in oom case |
1354 | |
1355 | slideTabSize = newSlideTabSize; |
1356 | capturedSize = newCapturedSize; |
1357 | inNextStack = bigArray; |
1358 | memset(s: inNextStack, c: -1, n: ns * sizeof(int)); |
1359 | curStack = inNextStack + ns; |
1360 | nextStack = inNextStack + 2 * ns; |
1361 | |
1362 | curCapBegin = inNextStack + 3 * ns; |
1363 | nextCapBegin = curCapBegin + ncap * ns; |
1364 | curCapEnd = curCapBegin + 2 * ncap * ns; |
1365 | nextCapEnd = curCapBegin + 3 * ncap * ns; |
1366 | |
1367 | tempCapBegin = curCapBegin + 4 * ncap * ns; |
1368 | tempCapEnd = tempCapBegin + ncap; |
1369 | capBegin = tempCapBegin + 2 * ncap; |
1370 | capEnd = tempCapBegin + 3 * ncap; |
1371 | |
1372 | slideTab = tempCapBegin + 4 * ncap; |
1373 | captured = slideTab + slideTabSize; |
1374 | memset(s: captured, c: -1, n: capturedSize*sizeof(int)); |
1375 | this->eng = eng; |
1376 | } |
1377 | |
1378 | /* |
1379 | Tries to match in str and returns an array of (begin, length) pairs |
1380 | for captured text. If there is no match, all pairs are (-1, -1). |
1381 | */ |
1382 | void QRegExpMatchState::match(const QChar *str0, int len0, int pos0, |
1383 | bool minimal0, bool oneTest, int caretIndex) |
1384 | { |
1385 | bool matched = false; |
1386 | QChar char_null; |
1387 | |
1388 | #ifndef QT_NO_REGEXP_OPTIM |
1389 | if (eng->trivial && !oneTest) { |
1390 | // ### Qt6: qsizetype |
1391 | pos = int(QtPrivate::findString(haystack: QStringView(str0, len0), from: pos0, needle: QStringView(eng->goodStr.unicode(), eng->goodStr.size()), cs: eng->cs)); |
1392 | matchLen = eng->goodStr.size(); |
1393 | matched = (pos != -1); |
1394 | } else |
1395 | #endif |
1396 | { |
1397 | in = str0; |
1398 | if (in == nullptr) |
1399 | in = &char_null; |
1400 | pos = pos0; |
1401 | caretPos = caretIndex; |
1402 | len = len0; |
1403 | minimal = minimal0; |
1404 | matchLen = 0; |
1405 | oneTestMatchedLen = 0; |
1406 | |
1407 | if (eng->valid && pos >= 0 && pos <= len) { |
1408 | #ifndef QT_NO_REGEXP_OPTIM |
1409 | if (oneTest) { |
1410 | matched = matchHere(); |
1411 | } else { |
1412 | if (pos <= len - eng->minl) { |
1413 | if (eng->caretAnchored) { |
1414 | matched = matchHere(); |
1415 | } else if (eng->useGoodStringHeuristic) { |
1416 | matched = eng->goodStringMatch(matchState&: *this); |
1417 | } else { |
1418 | matched = eng->badCharMatch(matchState&: *this); |
1419 | } |
1420 | } |
1421 | } |
1422 | #else |
1423 | matched = oneTest ? matchHere() : eng->bruteMatch(*this); |
1424 | #endif |
1425 | } |
1426 | } |
1427 | |
1428 | if (matched) { |
1429 | int *c = captured; |
1430 | *c++ = pos; |
1431 | *c++ = matchLen; |
1432 | |
1433 | int numCaptures = (capturedSize - 2) >> 1; |
1434 | #ifndef QT_NO_REGEXP_CAPTURE |
1435 | for (int i = 0; i < numCaptures; ++i) { |
1436 | int j = eng->captureForOfficialCapture.at(i); |
1437 | if (capBegin[j] != EmptyCapture) { |
1438 | int len = capEnd[j] - capBegin[j]; |
1439 | *c++ = (len > 0) ? pos + capBegin[j] : 0; |
1440 | *c++ = len; |
1441 | } else { |
1442 | *c++ = -1; |
1443 | *c++ = -1; |
1444 | } |
1445 | } |
1446 | #endif |
1447 | } else { |
1448 | // we rely on 2's complement here |
1449 | memset(s: captured, c: -1, n: capturedSize * sizeof(int)); |
1450 | } |
1451 | } |
1452 | |
1453 | /* |
1454 | The three following functions add one state to the automaton and |
1455 | return the number of the state. |
1456 | */ |
1457 | |
1458 | int QRegExpEngine::createState(QChar ch) |
1459 | { |
1460 | return setupState(ch.unicode()); |
1461 | } |
1462 | |
1463 | int QRegExpEngine::createState(const QRegExpCharClass &cc) |
1464 | { |
1465 | #ifndef QT_NO_REGEXP_CCLASS |
1466 | int n = cl.size(); |
1467 | cl += QRegExpCharClass(cc); |
1468 | return setupState(CharClassBit | n); |
1469 | #else |
1470 | Q_UNUSED(cc); |
1471 | return setupState(CharClassBit); |
1472 | #endif |
1473 | } |
1474 | |
1475 | #ifndef QT_NO_REGEXP_BACKREF |
1476 | int QRegExpEngine::createState(int bref) |
1477 | { |
1478 | if (bref > nbrefs) { |
1479 | nbrefs = bref; |
1480 | if (nbrefs > MaxBackRefs) { |
1481 | error(RXERR_LIMIT); |
1482 | return 0; |
1483 | } |
1484 | } |
1485 | return setupState(BackRefBit | bref); |
1486 | } |
1487 | #endif |
1488 | |
1489 | /* |
1490 | The two following functions add a transition between all pairs of |
1491 | states (i, j) where i is found in from, and j is found in to. |
1492 | |
1493 | Cat-transitions are distinguished from plus-transitions for |
1494 | capturing. |
1495 | */ |
1496 | |
1497 | void QRegExpEngine::addCatTransitions(const QList<int> &from, const QList<int> &to) |
1498 | { |
1499 | for (int i = 0; i < from.size(); i++) |
1500 | mergeInto(a: &s[from.at(i)].outs, b: to); |
1501 | } |
1502 | |
1503 | #ifndef QT_NO_REGEXP_CAPTURE |
1504 | void QRegExpEngine::addPlusTransitions(const QList<int> &from, const QList<int> &to, int atom) |
1505 | { |
1506 | for (int i = 0; i < from.size(); i++) { |
1507 | QRegExpAutomatonState &st = s[from.at(i)]; |
1508 | const QList<int> oldOuts = st.outs; |
1509 | mergeInto(a: &st.outs, b: to); |
1510 | if (f.at(i: atom).capture != QRegExpAtom::NoCapture) { |
1511 | for (int j = 0; j < to.size(); j++) { |
1512 | // ### st.reenter.contains(to.at(j)) check looks suspicious |
1513 | if (!st.reenter.contains(key: to.at(i: j)) && |
1514 | !std::binary_search(first: oldOuts.constBegin(), last: oldOuts.constEnd(), val: to.at(i: j))) |
1515 | st.reenter.insert(key: to.at(i: j), value: atom); |
1516 | } |
1517 | } |
1518 | } |
1519 | } |
1520 | #endif |
1521 | |
1522 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1523 | /* |
1524 | Returns an anchor that means a OR b. |
1525 | */ |
1526 | int QRegExpEngine::anchorAlternation(int a, int b) |
1527 | { |
1528 | if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0) |
1529 | return a & b; |
1530 | |
1531 | int n = aa.size(); |
1532 | #ifndef QT_NO_REGEXP_OPTIM |
1533 | if (n > 0 && aa.at(i: n - 1).a == a && aa.at(i: n - 1).b == b) |
1534 | return Anchor_Alternation | (n - 1); |
1535 | #endif |
1536 | |
1537 | QRegExpAnchorAlternation element = {.a: a, .b: b}; |
1538 | aa.append(t: element); |
1539 | return Anchor_Alternation | n; |
1540 | } |
1541 | |
1542 | /* |
1543 | Returns an anchor that means a AND b. |
1544 | */ |
1545 | int QRegExpEngine::anchorConcatenation(int a, int b) |
1546 | { |
1547 | if (((a | b) & Anchor_Alternation) == 0) |
1548 | return a | b; |
1549 | if ((b & Anchor_Alternation) != 0) |
1550 | qSwap(value1&: a, value2&: b); |
1551 | |
1552 | int aprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).a, b); |
1553 | int bprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).b, b); |
1554 | return anchorAlternation(a: aprime, b: bprime); |
1555 | } |
1556 | #endif |
1557 | |
1558 | /* |
1559 | Adds anchor a on a transition caracterised by its from state and |
1560 | its to state. |
1561 | */ |
1562 | void QRegExpEngine::addAnchors(int from, int to, int a) |
1563 | { |
1564 | QRegExpAutomatonState &st = s[from]; |
1565 | if (st.anchors.contains(key: to)) |
1566 | a = anchorAlternation(a: st.anchors.value(key: to), b: a); |
1567 | st.anchors.insert(key: to, value: a); |
1568 | } |
1569 | |
1570 | #ifndef QT_NO_REGEXP_OPTIM |
1571 | /* |
1572 | This function chooses between the good-string and the bad-character |
1573 | heuristics. It computes two scores and chooses the heuristic with |
1574 | the highest score. |
1575 | |
1576 | Here are some common-sense constraints on the scores that should be |
1577 | respected if the formulas are ever modified: (1) If goodStr is |
1578 | empty, the good-string heuristic scores 0. (2) If the regular |
1579 | expression is trivial, the good-string heuristic should be used. |
1580 | (3) If the search is case insensitive, the good-string heuristic |
1581 | should be used, unless it scores 0. (Case insensitivity turns all |
1582 | entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is |
1583 | big, the good-string heuristic should score less. |
1584 | */ |
1585 | void QRegExpEngine::heuristicallyChooseHeuristic() |
1586 | { |
1587 | if (minl == 0) { |
1588 | useGoodStringHeuristic = false; |
1589 | } else if (trivial) { |
1590 | useGoodStringHeuristic = true; |
1591 | } else { |
1592 | /* |
1593 | Magic formula: The good string has to constitute a good |
1594 | proportion of the minimum-length string, and appear at a |
1595 | more-or-less known index. |
1596 | */ |
1597 | int goodStringScore = (64 * goodStr.size() / minl) - |
1598 | (goodLateStart - goodEarlyStart); |
1599 | /* |
1600 | Less magic formula: We pick some characters at random, and |
1601 | check whether they are good or bad. |
1602 | */ |
1603 | int badCharScore = 0; |
1604 | int step = qMax(a: 1, b: NumBadChars / 32); |
1605 | for (int i = 1; i < NumBadChars; i += step) { |
1606 | if (occ1.at(i) == NoOccurrence) |
1607 | badCharScore += minl; |
1608 | else |
1609 | badCharScore += occ1.at(i); |
1610 | } |
1611 | badCharScore /= minl; |
1612 | useGoodStringHeuristic = (goodStringScore > badCharScore); |
1613 | } |
1614 | } |
1615 | #endif |
1616 | |
#if defined(QT_DEBUG)
/*
  Debugging aid: prints the automaton through qDebug(). For every
  state it lists what the state matches and its outgoing transitions
  (with reenter and anchor information), followed by the atom table
  and the anchor alternation table when those features are compiled
  in.
*/
void QRegExpEngine::dump() const
{
    int i, j;
    qDebug("Case %ssensitive engine", cs ? "" : "in");
    qDebug(" States");
    for (i = 0; i < s.size(); i++) {
        qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
#ifndef QT_NO_REGEXP_CAPTURE
        if (nf > 0)
            qDebug(" in atom %d", s[i].atom);
#endif
        int m = s[i].match;
        if ((m & CharClassBit) != 0) {
            qDebug(" match character class %d", m ^ CharClassBit);
#ifndef QT_NO_REGEXP_CCLASS
            cl[m ^ CharClassBit].dump();
#else
            qDebug(" negative character class");
#endif
        } else if ((m & BackRefBit) != 0) {
            qDebug(" match back-reference %d", m ^ BackRefBit);
        } else if (m >= 0x20 && m <= 0x7e) {
            // Printable ASCII: also show the character itself.
            qDebug(" match 0x%.4x (%c)", m, m);
        } else {
            qDebug(" match 0x%.4x", m);
        }
        for (j = 0; j < s[i].outs.size(); j++) {
            int next = s[i].outs[j];
            qDebug(" -> %d", next);
            if (s[i].reenter.contains(next))
                qDebug(" [reenter %d]", s[i].reenter[next]);
            if (s[i].anchors.value(next) != 0)
                qDebug(" [anchors 0x%.8x]", s[i].anchors[next]);
        }
    }
#ifndef QT_NO_REGEXP_CAPTURE
    if (nf > 0) {
        qDebug(" Atom Parent Capture");
        for (i = 0; i < nf; i++) {
            if (f[i].capture == QRegExpAtom::NoCapture) {
                qDebug(" %6d %6d nil", i, f[i].parent);
            } else {
                int cap = f[i].capture;
                bool official = captureForOfficialCapture.contains(cap);
                qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture,
                       official ? "official" : "");
            }
        }
    }
#endif
#ifndef QT_NO_REGEXP_ANCHOR_ALT
    // Both alternatives are printed with the same 8-digit width.
    for (i = 0; i < aa.size(); i++)
        qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.8x", i, aa[i].a, aa[i].b);
#endif
}
#endif
1674 | |
1675 | void QRegExpEngine::setup() |
1676 | { |
1677 | ref.storeRelaxed(newValue: 1); |
1678 | #ifndef QT_NO_REGEXP_CAPTURE |
1679 | f.resize(size: 32); |
1680 | nf = 0; |
1681 | cf = -1; |
1682 | #endif |
1683 | officialncap = 0; |
1684 | ncap = 0; |
1685 | #ifndef QT_NO_REGEXP_OPTIM |
1686 | caretAnchored = true; |
1687 | trivial = true; |
1688 | #endif |
1689 | valid = false; |
1690 | #ifndef QT_NO_REGEXP_BACKREF |
1691 | nbrefs = 0; |
1692 | #endif |
1693 | #ifndef QT_NO_REGEXP_OPTIM |
1694 | useGoodStringHeuristic = true; |
1695 | minl = 0; |
1696 | occ1.fill(t: 0, newSize: NumBadChars); |
1697 | #endif |
1698 | } |
1699 | |
1700 | int QRegExpEngine::setupState(int match) |
1701 | { |
1702 | #ifndef QT_NO_REGEXP_CAPTURE |
1703 | s += QRegExpAutomatonState(cf, match); |
1704 | #else |
1705 | s += QRegExpAutomatonState(match); |
1706 | #endif |
1707 | return s.size() - 1; |
1708 | } |
1709 | |
1710 | #ifndef QT_NO_REGEXP_CAPTURE |
1711 | /* |
1712 | Functions startAtom() and finishAtom() should be called to delimit |
1713 | atoms. When a state is created, it is assigned to the current atom. |
1714 | The information is later used for capturing. |
1715 | */ |
1716 | int QRegExpEngine::startAtom(bool officialCapture) |
1717 | { |
1718 | if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size()) |
1719 | f.resize(size: (nf + 1) << 1); |
1720 | f[nf].parent = cf; |
1721 | cf = nf++; |
1722 | f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture; |
1723 | return cf; |
1724 | } |
1725 | |
1726 | void QRegExpEngine::finishAtom(int atom, bool needCapture) |
1727 | { |
1728 | if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture) |
1729 | f[atom].capture = QRegExpAtom::UnofficialCapture; |
1730 | cf = f.at(i: atom).parent; |
1731 | } |
1732 | #endif |
1733 | |
1734 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1735 | /* |
1736 | Creates a lookahead anchor. |
1737 | */ |
1738 | int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative) |
1739 | { |
1740 | int n = ahead.size(); |
1741 | if (n == MaxLookaheads) { |
1742 | error(RXERR_LIMIT); |
1743 | return 0; |
1744 | } |
1745 | ahead += new QRegExpLookahead(eng, negative); |
1746 | return Anchor_FirstLookahead << n; |
1747 | } |
1748 | #endif |
1749 | |
#ifndef QT_NO_REGEXP_CAPTURE
/*
  We want the longest leftmost captures.

  Compares capture set 1 (begin1/end1) with capture set 2
  (begin2/end2), zone by zone over the first ncap zones. The first
  zone that differs decides: set 1 is better if it starts earlier or,
  for equal starts, ends later. Returns false when all zones are
  equal.
*/
static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
                            const int *end2)
{
    for (int zone = 0; zone < ncap; ++zone) {
        // it has to start early...
        if (begin1[zone] != begin2[zone])
            return begin1[zone] < begin2[zone];
        // ...and end late
        if (end1[zone] != end2[zone])
            return end1[zone] > end2[zone];
    }
    return false;
}
#endif
1768 | |
1769 | /* |
1770 | Returns \c true if anchor a matches at position pos + i in the input |
1771 | string, otherwise false. |
1772 | */ |
1773 | bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin) |
1774 | { |
1775 | int j; |
1776 | |
1777 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1778 | if ((a & QRegExpEngine::Anchor_Alternation) != 0) |
1779 | return testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).a, capBegin) |
1780 | || testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).b, capBegin); |
1781 | #endif |
1782 | |
1783 | if ((a & QRegExpEngine::Anchor_Caret) != 0) { |
1784 | if (pos + i != caretPos) |
1785 | return false; |
1786 | } |
1787 | if ((a & QRegExpEngine::Anchor_Dollar) != 0) { |
1788 | if (pos + i != len) |
1789 | return false; |
1790 | } |
1791 | #ifndef QT_NO_REGEXP_ESCAPE |
1792 | if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) { |
1793 | bool before = false; |
1794 | bool after = false; |
1795 | if (pos + i != 0) |
1796 | before = isWord(ch: in[pos + i - 1]); |
1797 | if (pos + i != len) |
1798 | after = isWord(ch: in[pos + i]); |
1799 | if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after)) |
1800 | return false; |
1801 | if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after)) |
1802 | return false; |
1803 | } |
1804 | #endif |
1805 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1806 | if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) { |
1807 | const QList<QRegExpLookahead *> &ahead = eng->ahead; |
1808 | for (j = 0; j < ahead.size(); j++) { |
1809 | if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) { |
1810 | QRegExpMatchState matchState; |
1811 | matchState.prepareForMatch(eng: ahead[j]->eng); |
1812 | matchState.match(str0: in + pos + i, len0: len - pos - i, pos0: 0, |
1813 | minimal0: true, oneTest: true, caretIndex: caretPos - pos - i); |
1814 | if ((matchState.captured[0] == 0) == ahead[j]->neg) |
1815 | return false; |
1816 | } |
1817 | } |
1818 | } |
1819 | #endif |
1820 | #ifndef QT_NO_REGEXP_CAPTURE |
1821 | #ifndef QT_NO_REGEXP_BACKREF |
1822 | for (j = 0; j < eng->nbrefs; j++) { |
1823 | if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) { |
1824 | int i = eng->captureForOfficialCapture.at(i: j); |
1825 | if (capBegin[i] != EmptyCapture) |
1826 | return false; |
1827 | } |
1828 | } |
1829 | #endif |
1830 | #endif |
1831 | return true; |
1832 | } |
1833 | |
1834 | #ifndef QT_NO_REGEXP_OPTIM |
1835 | /* |
1836 | The three following functions are what Jeffrey Friedl would call |
1837 | transmissions (or bump-alongs). Using one or the other should make |
1838 | no difference except in performance. |
1839 | */ |
1840 | |
1841 | bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const |
1842 | { |
1843 | int k = matchState.pos + goodEarlyStart; |
1844 | QStringMatcher matcher(goodStr.unicode(), goodStr.size(), cs); |
1845 | while ((k = matcher.indexIn(str: matchState.in, length: matchState.len, from: k)) != -1) { |
1846 | int from = k - goodLateStart; |
1847 | int to = k - goodEarlyStart; |
1848 | if (from > matchState.pos) |
1849 | matchState.pos = from; |
1850 | |
1851 | while (matchState.pos <= to) { |
1852 | if (matchState.matchHere()) |
1853 | return true; |
1854 | ++matchState.pos; |
1855 | } |
1856 | ++k; |
1857 | } |
1858 | return false; |
1859 | } |
1860 | |
1861 | bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const |
1862 | { |
1863 | int slideHead = 0; |
1864 | int slideNext = 0; |
1865 | int i; |
1866 | int lastPos = matchState.len - minl; |
1867 | memset(s: matchState.slideTab, c: 0, n: matchState.slideTabSize * sizeof(int)); |
1868 | |
1869 | /* |
1870 | Set up the slide table, used for the bad-character heuristic, |
1871 | using the table of first occurrence of each character. |
1872 | */ |
1873 | for (i = 0; i < minl; i++) { |
1874 | int sk = occ1[BadChar(matchState.in[matchState.pos + i])]; |
1875 | if (sk == NoOccurrence) |
1876 | sk = i + 1; |
1877 | if (sk > 0) { |
1878 | int k = i + 1 - sk; |
1879 | if (k < 0) { |
1880 | sk = i + 1; |
1881 | k = 0; |
1882 | } |
1883 | if (sk > matchState.slideTab[k]) |
1884 | matchState.slideTab[k] = sk; |
1885 | } |
1886 | } |
1887 | |
1888 | if (matchState.pos > lastPos) |
1889 | return false; |
1890 | |
1891 | for (;;) { |
1892 | if (++slideNext >= matchState.slideTabSize) |
1893 | slideNext = 0; |
1894 | if (matchState.slideTab[slideHead] > 0) { |
1895 | if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext]) |
1896 | matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1; |
1897 | matchState.slideTab[slideHead] = 0; |
1898 | } else { |
1899 | if (matchState.matchHere()) |
1900 | return true; |
1901 | } |
1902 | |
1903 | if (matchState.pos == lastPos) |
1904 | break; |
1905 | |
1906 | /* |
1907 | Update the slide table. This code has much in common with |
1908 | the initialization code. |
1909 | */ |
1910 | int sk = occ1[BadChar(matchState.in[matchState.pos + minl])]; |
1911 | if (sk == NoOccurrence) { |
1912 | matchState.slideTab[slideNext] = minl; |
1913 | } else if (sk > 0) { |
1914 | int k = slideNext + minl - sk; |
1915 | if (k >= matchState.slideTabSize) |
1916 | k -= matchState.slideTabSize; |
1917 | if (sk > matchState.slideTab[k]) |
1918 | matchState.slideTab[k] = sk; |
1919 | } |
1920 | slideHead = slideNext; |
1921 | ++matchState.pos; |
1922 | } |
1923 | return false; |
1924 | } |
1925 | #else |
1926 | bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const |
1927 | { |
1928 | while (matchState.pos <= matchState.len) { |
1929 | if (matchState.matchHere()) |
1930 | return true; |
1931 | ++matchState.pos; |
1932 | } |
1933 | return false; |
1934 | } |
1935 | #endif |
1936 | |
1937 | /* |
1938 | Here's the core of the engine. It tries to do a match here and now. |
1939 | */ |
1940 | bool QRegExpMatchState::matchHere() |
1941 | { |
1942 | int ncur = 1, nnext = 0; |
1943 | int i = 0, j, k, m; |
1944 | bool stop = false; |
1945 | |
1946 | matchLen = -1; |
1947 | oneTestMatchedLen = -1; |
1948 | curStack[0] = QRegExpEngine::InitialState; |
1949 | |
1950 | int ncap = eng->ncap; |
1951 | #ifndef QT_NO_REGEXP_CAPTURE |
1952 | if (ncap > 0) { |
1953 | for (j = 0; j < ncap; j++) { |
1954 | curCapBegin[j] = EmptyCapture; |
1955 | curCapEnd[j] = EmptyCapture; |
1956 | } |
1957 | } |
1958 | #endif |
1959 | |
1960 | #ifndef QT_NO_REGEXP_BACKREF |
1961 | while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop) |
1962 | #else |
1963 | while (ncur > 0 && i <= len - pos && !stop) |
1964 | #endif |
1965 | { |
1966 | int ch = (i < len - pos) ? in[pos + i].unicode() : 0; |
1967 | for (j = 0; j < ncur; j++) { |
1968 | int cur = curStack[j]; |
1969 | const QRegExpAutomatonState &scur = eng->s.at(i: cur); |
1970 | const QList<int> &outs = scur.outs; |
1971 | for (k = 0; k < outs.size(); k++) { |
1972 | int next = outs.at(i: k); |
1973 | const QRegExpAutomatonState &snext = eng->s.at(i: next); |
1974 | bool inside = true; |
1975 | #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE) |
1976 | int needSomeSleep = 0; |
1977 | #endif |
1978 | |
1979 | /* |
1980 | First, check if the anchors are anchored properly. |
1981 | */ |
1982 | int a = scur.anchors.value(key: next); |
1983 | if (a != 0 && !testAnchor(i, a, capBegin: curCapBegin + j * ncap)) |
1984 | inside = false; |
1985 | |
1986 | /* |
1987 | If indeed they are, check if the input character is |
1988 | correct for this transition. |
1989 | */ |
1990 | if (inside) { |
1991 | m = snext.match; |
1992 | if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) { |
1993 | if (eng->cs) |
1994 | inside = (m == ch); |
1995 | else |
1996 | inside = (QChar(m).toLower() == QChar(ch).toLower()); |
1997 | } else if (next == QRegExpEngine::FinalState) { |
1998 | matchLen = i; |
1999 | stop = minimal; |
2000 | inside = true; |
2001 | } else if ((m & QRegExpEngine::CharClassBit) != 0) { |
2002 | #ifndef QT_NO_REGEXP_CCLASS |
2003 | const QRegExpCharClass &cc = eng->cl.at(i: m ^ QRegExpEngine::CharClassBit); |
2004 | if (eng->cs) |
2005 | inside = cc.in(ch: QChar(ch)); |
2006 | else if (cc.negative()) |
2007 | inside = cc.in(ch: QChar(ch).toLower()) && |
2008 | cc.in(ch: QChar(ch).toUpper()); |
2009 | else |
2010 | inside = cc.in(ch: QChar(ch).toLower()) || |
2011 | cc.in(ch: QChar(ch).toUpper()); |
2012 | #endif |
2013 | #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE) |
2014 | } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */ |
2015 | int bref = m ^ QRegExpEngine::BackRefBit; |
2016 | int ell = j * ncap + eng->captureForOfficialCapture.at(i: bref - 1); |
2017 | |
2018 | inside = bref <= ncap && curCapBegin[ell] != EmptyCapture; |
2019 | if (inside) { |
2020 | if (eng->cs) |
2021 | inside = (in[pos + curCapBegin[ell]] == QChar(ch)); |
2022 | else |
2023 | inside = (in[pos + curCapBegin[ell]].toLower() |
2024 | == QChar(ch).toLower()); |
2025 | } |
2026 | |
2027 | if (inside) { |
2028 | int delta; |
2029 | if (curCapEnd[ell] == EmptyCapture) |
2030 | delta = i - curCapBegin[ell]; |
2031 | else |
2032 | delta = curCapEnd[ell] - curCapBegin[ell]; |
2033 | |
2034 | inside = (delta <= len - (pos + i)); |
2035 | if (inside && delta > 1) { |
2036 | int n = 1; |
2037 | if (eng->cs) { |
2038 | while (n < delta) { |
2039 | if (in[pos + curCapBegin[ell] + n] |
2040 | != in[pos + i + n]) |
2041 | break; |
2042 | ++n; |
2043 | } |
2044 | } else { |
2045 | while (n < delta) { |
2046 | QChar a = in[pos + curCapBegin[ell] + n]; |
2047 | QChar b = in[pos + i + n]; |
2048 | if (a.toLower() != b.toLower()) |
2049 | break; |
2050 | ++n; |
2051 | } |
2052 | } |
2053 | inside = (n == delta); |
2054 | if (inside) |
2055 | needSomeSleep = delta - 1; |
2056 | } |
2057 | } |
2058 | #endif |
2059 | } |
2060 | } |
2061 | |
2062 | /* |
2063 | We must now update our data structures. |
2064 | */ |
2065 | if (inside) { |
2066 | #ifndef QT_NO_REGEXP_CAPTURE |
2067 | int *capBegin, *capEnd; |
2068 | #endif |
2069 | /* |
2070 | If the next state was not encountered yet, all |
2071 | is fine. |
2072 | */ |
2073 | if ((m = inNextStack[next]) == -1) { |
2074 | m = nnext++; |
2075 | nextStack[m] = next; |
2076 | inNextStack[next] = m; |
2077 | #ifndef QT_NO_REGEXP_CAPTURE |
2078 | capBegin = nextCapBegin + m * ncap; |
2079 | capEnd = nextCapEnd + m * ncap; |
2080 | |
2081 | /* |
2082 | Otherwise, we'll first maintain captures in |
2083 | temporary arrays, and decide at the end whether |
2084 | it's best to keep the previous capture zones or |
2085 | the new ones. |
2086 | */ |
2087 | } else { |
2088 | capBegin = tempCapBegin; |
2089 | capEnd = tempCapEnd; |
2090 | #endif |
2091 | } |
2092 | |
2093 | #ifndef QT_NO_REGEXP_CAPTURE |
2094 | /* |
2095 | Updating the capture zones is much of a task. |
2096 | */ |
2097 | if (ncap > 0) { |
2098 | memcpy(dest: capBegin, src: curCapBegin + j * ncap, n: ncap * sizeof(int)); |
2099 | memcpy(dest: capEnd, src: curCapEnd + j * ncap, n: ncap * sizeof(int)); |
2100 | int c = scur.atom, n = snext.atom; |
2101 | int p = -1, q = -1; |
2102 | int cap; |
2103 | |
2104 | /* |
2105 | Lemma 1. For any x in the range [0..nf), we |
2106 | have f[x].parent < x. |
2107 | |
2108 | Proof. By looking at startAtom(), it is |
2109 | clear that cf < nf holds all the time, and |
2110 | thus that f[nf].parent < nf. |
2111 | */ |
2112 | |
2113 | /* |
2114 | If we are reentering an atom, we empty all |
2115 | capture zones inside it. |
2116 | */ |
2117 | if ((q = scur.reenter.value(key: next)) != 0) { |
2118 | QBitArray b(eng->nf, false); |
2119 | b.setBit(i: q, val: true); |
2120 | for (int ell = q + 1; ell < eng->nf; ell++) { |
2121 | if (b.testBit(i: eng->f.at(i: ell).parent)) { |
2122 | b.setBit(i: ell, val: true); |
2123 | cap = eng->f.at(i: ell).capture; |
2124 | if (cap >= 0) { |
2125 | capBegin[cap] = EmptyCapture; |
2126 | capEnd[cap] = EmptyCapture; |
2127 | } |
2128 | } |
2129 | } |
2130 | p = eng->f.at(i: q).parent; |
2131 | |
2132 | /* |
2133 | Otherwise, close the capture zones we are |
2134 | leaving. We are leaving f[c].capture, |
2135 | f[f[c].parent].capture, |
2136 | f[f[f[c].parent].parent].capture, ..., |
2137 | until f[x].capture, with x such that |
2138 | f[x].parent is the youngest common ancestor |
2139 | for c and n. |
2140 | |
2141 | We go up along c's and n's ancestry until |
2142 | we find x. |
2143 | */ |
2144 | } else { |
2145 | p = c; |
2146 | q = n; |
2147 | while (p != q) { |
2148 | if (p > q) { |
2149 | cap = eng->f.at(i: p).capture; |
2150 | if (cap >= 0) { |
2151 | if (capBegin[cap] == i) { |
2152 | capBegin[cap] = EmptyCapture; |
2153 | capEnd[cap] = EmptyCapture; |
2154 | } else { |
2155 | capEnd[cap] = i; |
2156 | } |
2157 | } |
2158 | p = eng->f.at(i: p).parent; |
2159 | } else { |
2160 | q = eng->f.at(i: q).parent; |
2161 | } |
2162 | } |
2163 | } |
2164 | |
2165 | /* |
2166 | In any case, we now open the capture zones |
2167 | we are entering. We work upwards from n |
2168 | until we reach p (the parent of the atom we |
2169 | reenter or the youngest common ancestor). |
2170 | */ |
2171 | while (n > p) { |
2172 | cap = eng->f.at(i: n).capture; |
2173 | if (cap >= 0) { |
2174 | capBegin[cap] = i; |
2175 | capEnd[cap] = EmptyCapture; |
2176 | } |
2177 | n = eng->f.at(i: n).parent; |
2178 | } |
2179 | /* |
2180 | If the next state was already in |
2181 | nextStack, we must choose carefully which |
2182 | capture zones we want to keep. |
2183 | */ |
2184 | if (capBegin == tempCapBegin && |
2185 | isBetterCapture(ncap, begin1: capBegin, end1: capEnd, begin2: nextCapBegin + m * ncap, |
2186 | end2: nextCapEnd + m * ncap)) { |
2187 | memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int)); |
2188 | memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int)); |
2189 | } |
2190 | } |
2191 | #ifndef QT_NO_REGEXP_BACKREF |
2192 | /* |
2193 | We are done with updating the capture zones. |
2194 | It's now time to put the next state to sleep, |
2195 | if it needs to, and to remove it from |
2196 | nextStack. |
2197 | */ |
2198 | if (needSomeSleep > 0) { |
2199 | QList<int> zzZ(2 + 2 * ncap); |
2200 | zzZ[0] = i + needSomeSleep; |
2201 | zzZ[1] = next; |
2202 | if (ncap > 0) { |
2203 | memcpy(dest: zzZ.data() + 2, src: capBegin, n: ncap * sizeof(int)); |
2204 | memcpy(dest: zzZ.data() + 2 + ncap, src: capEnd, n: ncap * sizeof(int)); |
2205 | } |
2206 | inNextStack[nextStack[--nnext]] = -1; |
2207 | sleeping.append(t: zzZ); |
2208 | } |
2209 | #endif |
2210 | #endif |
2211 | } |
2212 | } |
2213 | } |
2214 | #ifndef QT_NO_REGEXP_CAPTURE |
2215 | /* |
2216 | If we reached the final state, hurray! Copy the captured |
2217 | zone. |
2218 | */ |
2219 | if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) { |
2220 | memcpy(dest: capBegin, src: nextCapBegin + m * ncap, n: ncap * sizeof(int)); |
2221 | memcpy(dest: capEnd, src: nextCapEnd + m * ncap, n: ncap * sizeof(int)); |
2222 | } |
2223 | #ifndef QT_NO_REGEXP_BACKREF |
2224 | /* |
2225 | It's time to wake up the sleepers. |
2226 | */ |
2227 | j = 0; |
2228 | while (j < sleeping.size()) { |
2229 | if (sleeping.at(i: j)[0] == i) { |
2230 | const QList<int> &zzZ = sleeping.at(i: j); |
2231 | int next = zzZ[1]; |
2232 | const int *capBegin = zzZ.data() + 2; |
2233 | const int *capEnd = zzZ.data() + 2 + ncap; |
2234 | bool copyOver = true; |
2235 | |
2236 | if ((m = inNextStack[next]) == -1) { |
2237 | m = nnext++; |
2238 | nextStack[m] = next; |
2239 | inNextStack[next] = m; |
2240 | } else { |
2241 | copyOver = isBetterCapture(ncap, begin1: nextCapBegin + m * ncap, end1: nextCapEnd + m * ncap, |
2242 | begin2: capBegin, end2: capEnd); |
2243 | } |
2244 | if (copyOver) { |
2245 | memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int)); |
2246 | memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int)); |
2247 | } |
2248 | |
2249 | sleeping.removeAt(i: j); |
2250 | } else { |
2251 | ++j; |
2252 | } |
2253 | } |
2254 | #endif |
2255 | #endif |
2256 | for (j = 0; j < nnext; j++) |
2257 | inNextStack[nextStack[j]] = -1; |
2258 | |
2259 | // avoid needless iteration that confuses oneTestMatchedLen |
2260 | if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState |
2261 | #ifndef QT_NO_REGEXP_BACKREF |
2262 | && sleeping.isEmpty() |
2263 | #endif |
2264 | ) |
2265 | stop = true; |
2266 | |
2267 | qSwap(value1&: curStack, value2&: nextStack); |
2268 | #ifndef QT_NO_REGEXP_CAPTURE |
2269 | qSwap(value1&: curCapBegin, value2&: nextCapBegin); |
2270 | qSwap(value1&: curCapEnd, value2&: nextCapEnd); |
2271 | #endif |
2272 | ncur = nnext; |
2273 | nnext = 0; |
2274 | ++i; |
2275 | } |
2276 | |
2277 | #ifndef QT_NO_REGEXP_BACKREF |
2278 | /* |
2279 | If minimal matching is enabled, we might have some sleepers |
2280 | left. |
2281 | */ |
2282 | if (!sleeping.isEmpty()) |
2283 | sleeping.clear(); |
2284 | #endif |
2285 | |
2286 | oneTestMatchedLen = i - 1; |
2287 | return (matchLen >= 0); |
2288 | } |
2289 | |
2290 | #ifndef QT_NO_REGEXP_CCLASS |
2291 | |
2292 | QRegExpCharClass::QRegExpCharClass() |
2293 | : c(0), n(false) |
2294 | { |
2295 | #ifndef QT_NO_REGEXP_OPTIM |
2296 | occ1.fill(t: NoOccurrence, newSize: NumBadChars); |
2297 | #endif |
2298 | } |
2299 | |
2300 | void QRegExpCharClass::clear() |
2301 | { |
2302 | c = 0; |
2303 | r.clear(); |
2304 | n = false; |
2305 | } |
2306 | |
2307 | void QRegExpCharClass::setNegative(bool negative) |
2308 | { |
2309 | n = negative; |
2310 | #ifndef QT_NO_REGEXP_OPTIM |
2311 | occ1.fill(t: 0, newSize: NumBadChars); |
2312 | #endif |
2313 | } |
2314 | |
2315 | void QRegExpCharClass::addCategories(uint cats) |
2316 | { |
2317 | static const int all_cats = FLAG(QChar::Mark_NonSpacing) | |
2318 | FLAG(QChar::Mark_SpacingCombining) | |
2319 | FLAG(QChar::Mark_Enclosing) | |
2320 | FLAG(QChar::Number_DecimalDigit) | |
2321 | FLAG(QChar::Number_Letter) | |
2322 | FLAG(QChar::Number_Other) | |
2323 | FLAG(QChar::Separator_Space) | |
2324 | FLAG(QChar::Separator_Line) | |
2325 | FLAG(QChar::Separator_Paragraph) | |
2326 | FLAG(QChar::Other_Control) | |
2327 | FLAG(QChar::Other_Format) | |
2328 | FLAG(QChar::Other_Surrogate) | |
2329 | FLAG(QChar::Other_PrivateUse) | |
2330 | FLAG(QChar::Other_NotAssigned) | |
2331 | FLAG(QChar::Letter_Uppercase) | |
2332 | FLAG(QChar::Letter_Lowercase) | |
2333 | FLAG(QChar::Letter_Titlecase) | |
2334 | FLAG(QChar::Letter_Modifier) | |
2335 | FLAG(QChar::Letter_Other) | |
2336 | FLAG(QChar::Punctuation_Connector) | |
2337 | FLAG(QChar::Punctuation_Dash) | |
2338 | FLAG(QChar::Punctuation_Open) | |
2339 | FLAG(QChar::Punctuation_Close) | |
2340 | FLAG(QChar::Punctuation_InitialQuote) | |
2341 | FLAG(QChar::Punctuation_FinalQuote) | |
2342 | FLAG(QChar::Punctuation_Other) | |
2343 | FLAG(QChar::Symbol_Math) | |
2344 | FLAG(QChar::Symbol_Currency) | |
2345 | FLAG(QChar::Symbol_Modifier) | |
2346 | FLAG(QChar::Symbol_Other); |
2347 | c |= (all_cats & cats); |
2348 | #ifndef QT_NO_REGEXP_OPTIM |
2349 | occ1.fill(t: 0, newSize: NumBadChars); |
2350 | #endif |
2351 | } |
2352 | |
2353 | void QRegExpCharClass::addRange(ushort from, ushort to) |
2354 | { |
2355 | if (from > to) |
2356 | qSwap(value1&: from, value2&: to); |
2357 | int m = r.size(); |
2358 | r.resize(size: m + 1); |
2359 | r[m].from = from; |
2360 | r[m].len = to - from + 1; |
2361 | |
2362 | #ifndef QT_NO_REGEXP_OPTIM |
2363 | int i; |
2364 | |
2365 | if (to - from < NumBadChars) { |
2366 | if (from % NumBadChars <= to % NumBadChars) { |
2367 | for (i = from % NumBadChars; i <= to % NumBadChars; i++) |
2368 | occ1[i] = 0; |
2369 | } else { |
2370 | for (i = 0; i <= to % NumBadChars; i++) |
2371 | occ1[i] = 0; |
2372 | for (i = from % NumBadChars; i < NumBadChars; i++) |
2373 | occ1[i] = 0; |
2374 | } |
2375 | } else { |
2376 | occ1.fill(t: 0, newSize: NumBadChars); |
2377 | } |
2378 | #endif |
2379 | } |
2380 | |
2381 | bool QRegExpCharClass::in(QChar ch) const |
2382 | { |
2383 | #ifndef QT_NO_REGEXP_OPTIM |
2384 | if (occ1.at(BadChar(ch)) == NoOccurrence) |
2385 | return n; |
2386 | #endif |
2387 | |
2388 | if (c != 0 && (c & FLAG(ch.category())) != 0) |
2389 | return !n; |
2390 | |
2391 | const int uc = ch.unicode(); |
2392 | int size = r.size(); |
2393 | |
2394 | for (int i = 0; i < size; ++i) { |
2395 | const QRegExpCharClassRange &range = r.at(i); |
2396 | if (uint(uc - range.from) < uint(r.at(i).len)) |
2397 | return !n; |
2398 | } |
2399 | return n; |
2400 | } |
2401 | |
#if defined(QT_DEBUG)
/*
  Debugging aid: prints whether the class is negated, its category
  mask (if any) and each of its ranges through qDebug().
*/
void QRegExpCharClass::dump() const
{
    int i;
    qDebug(" %stive character class", n ? "nega" : "posi");
#ifndef QT_NO_REGEXP_CCLASS
    if (c != 0)
        qDebug(" categories 0x%.8x", c);
#endif
    for (i = 0; i < r.size(); i++)
        qDebug(" 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
}
#endif
2415 | #endif |
2416 | |
2417 | QRegExpEngine::Box::Box(QRegExpEngine *engine) |
2418 | : eng(engine), skipanchors(0) |
2419 | #ifndef QT_NO_REGEXP_OPTIM |
2420 | , earlyStart(0), lateStart(0), maxl(0) |
2421 | #endif |
2422 | { |
2423 | #ifndef QT_NO_REGEXP_OPTIM |
2424 | occ1.fill(t: NoOccurrence, newSize: NumBadChars); |
2425 | #endif |
2426 | minl = 0; |
2427 | } |
2428 | |
2429 | QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b) |
2430 | { |
2431 | eng = b.eng; |
2432 | ls = b.ls; |
2433 | rs = b.rs; |
2434 | lanchors = b.lanchors; |
2435 | ranchors = b.ranchors; |
2436 | skipanchors = b.skipanchors; |
2437 | #ifndef QT_NO_REGEXP_OPTIM |
2438 | earlyStart = b.earlyStart; |
2439 | lateStart = b.lateStart; |
2440 | str = b.str; |
2441 | leftStr = b.leftStr; |
2442 | rightStr = b.rightStr; |
2443 | maxl = b.maxl; |
2444 | occ1 = b.occ1; |
2445 | #endif |
2446 | minl = b.minl; |
2447 | return *this; |
2448 | } |
2449 | |
2450 | void QRegExpEngine::Box::set(QChar ch) |
2451 | { |
2452 | ls.resize(size: 1); |
2453 | ls[0] = eng->createState(ch); |
2454 | rs = ls; |
2455 | #ifndef QT_NO_REGEXP_OPTIM |
2456 | str = ch; |
2457 | leftStr = ch; |
2458 | rightStr = ch; |
2459 | maxl = 1; |
2460 | occ1[BadChar(ch)] = 0; |
2461 | #endif |
2462 | minl = 1; |
2463 | } |
2464 | |
2465 | void QRegExpEngine::Box::set(const QRegExpCharClass &cc) |
2466 | { |
2467 | ls.resize(size: 1); |
2468 | ls[0] = eng->createState(cc); |
2469 | rs = ls; |
2470 | #ifndef QT_NO_REGEXP_OPTIM |
2471 | maxl = 1; |
2472 | occ1 = cc.firstOccurrence(); |
2473 | #endif |
2474 | minl = 1; |
2475 | } |
2476 | |
2477 | #ifndef QT_NO_REGEXP_BACKREF |
2478 | void QRegExpEngine::Box::set(int bref) |
2479 | { |
2480 | ls.resize(size: 1); |
2481 | ls[0] = eng->createState(bref); |
2482 | rs = ls; |
2483 | if (bref >= 1 && bref <= MaxBackRefs) |
2484 | skipanchors = Anchor_BackRef0Empty << bref; |
2485 | #ifndef QT_NO_REGEXP_OPTIM |
2486 | maxl = InftyLen; |
2487 | #endif |
2488 | minl = 0; |
2489 | } |
2490 | #endif |
2491 | |
2492 | void QRegExpEngine::Box::cat(const Box &b) |
2493 | { |
2494 | eng->addCatTransitions(from: rs, to: b.ls); |
2495 | addAnchorsToEngine(to: b); |
2496 | if (minl == 0) { |
2497 | lanchors.insert(map: b.lanchors); |
2498 | if (skipanchors != 0) { |
2499 | for (int i = 0; i < b.ls.size(); i++) { |
2500 | int a = eng->anchorConcatenation(a: lanchors.value(key: b.ls.at(i), defaultValue: 0), b: skipanchors); |
2501 | lanchors.insert(key: b.ls.at(i), value: a); |
2502 | } |
2503 | } |
2504 | mergeInto(a: &ls, b: b.ls); |
2505 | } |
2506 | if (b.minl == 0) { |
2507 | ranchors.insert(map: b.ranchors); |
2508 | if (b.skipanchors != 0) { |
2509 | for (int i = 0; i < rs.size(); i++) { |
2510 | int a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i), defaultValue: 0), b: b.skipanchors); |
2511 | ranchors.insert(key: rs.at(i), value: a); |
2512 | } |
2513 | } |
2514 | mergeInto(a: &rs, b: b.rs); |
2515 | } else { |
2516 | ranchors = b.ranchors; |
2517 | rs = b.rs; |
2518 | } |
2519 | |
2520 | #ifndef QT_NO_REGEXP_OPTIM |
2521 | if (maxl != InftyLen) { |
2522 | if (rightStr.size() + b.leftStr.size() > |
2523 | qMax(a: str.size(), b: b.str.size())) { |
2524 | earlyStart = minl - rightStr.size(); |
2525 | lateStart = maxl - rightStr.size(); |
2526 | str = rightStr + b.leftStr; |
2527 | } else if (b.str.size() > str.size()) { |
2528 | earlyStart = minl + b.earlyStart; |
2529 | lateStart = maxl + b.lateStart; |
2530 | str = b.str; |
2531 | } |
2532 | } |
2533 | |
2534 | if (leftStr.size() == maxl) |
2535 | leftStr += b.leftStr; |
2536 | |
2537 | if (b.rightStr.size() == b.maxl) { |
2538 | rightStr += b.rightStr; |
2539 | } else { |
2540 | rightStr = b.rightStr; |
2541 | } |
2542 | |
2543 | if (maxl == InftyLen || b.maxl == InftyLen) { |
2544 | maxl = InftyLen; |
2545 | } else { |
2546 | maxl += b.maxl; |
2547 | } |
2548 | |
2549 | for (int i = 0; i < NumBadChars; i++) { |
2550 | if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i)) |
2551 | occ1[i] = minl + b.occ1.at(i); |
2552 | } |
2553 | #endif |
2554 | |
2555 | minl += b.minl; |
2556 | if (minl == 0) |
2557 | skipanchors = eng->anchorConcatenation(a: skipanchors, b: b.skipanchors); |
2558 | else |
2559 | skipanchors = 0; |
2560 | } |
2561 | |
2562 | void QRegExpEngine::Box::orx(const Box &b) |
2563 | { |
2564 | mergeInto(a: &ls, b: b.ls); |
2565 | lanchors.insert(map: b.lanchors); |
2566 | mergeInto(a: &rs, b: b.rs); |
2567 | ranchors.insert(map: b.ranchors); |
2568 | |
2569 | if (b.minl == 0) { |
2570 | if (minl == 0) |
2571 | skipanchors = eng->anchorAlternation(a: skipanchors, b: b.skipanchors); |
2572 | else |
2573 | skipanchors = b.skipanchors; |
2574 | } |
2575 | |
2576 | #ifndef QT_NO_REGEXP_OPTIM |
2577 | for (int i = 0; i < NumBadChars; i++) { |
2578 | if (occ1.at(i) > b.occ1.at(i)) |
2579 | occ1[i] = b.occ1.at(i); |
2580 | } |
2581 | earlyStart = 0; |
2582 | lateStart = 0; |
2583 | str = QString(); |
2584 | leftStr = QString(); |
2585 | rightStr = QString(); |
2586 | if (b.maxl > maxl) |
2587 | maxl = b.maxl; |
2588 | #endif |
2589 | if (b.minl < minl) |
2590 | minl = b.minl; |
2591 | } |
2592 | |
2593 | void QRegExpEngine::Box::plus(int atom) |
2594 | { |
2595 | #ifndef QT_NO_REGEXP_CAPTURE |
2596 | eng->addPlusTransitions(from: rs, to: ls, atom); |
2597 | #else |
2598 | Q_UNUSED(atom); |
2599 | eng->addCatTransitions(rs, ls); |
2600 | #endif |
2601 | addAnchorsToEngine(to: *this); |
2602 | #ifndef QT_NO_REGEXP_OPTIM |
2603 | maxl = InftyLen; |
2604 | #endif |
2605 | } |
2606 | |
2607 | void QRegExpEngine::Box::opt() |
2608 | { |
2609 | #ifndef QT_NO_REGEXP_OPTIM |
2610 | earlyStart = 0; |
2611 | lateStart = 0; |
2612 | str = QString(); |
2613 | leftStr = QString(); |
2614 | rightStr = QString(); |
2615 | #endif |
2616 | skipanchors = 0; |
2617 | minl = 0; |
2618 | } |
2619 | |
2620 | void QRegExpEngine::Box::catAnchor(int a) |
2621 | { |
2622 | if (a != 0) { |
2623 | for (int i = 0; i < rs.size(); i++) { |
2624 | a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i), defaultValue: 0), b: a); |
2625 | ranchors.insert(key: rs.at(i), value: a); |
2626 | } |
2627 | if (minl == 0) |
2628 | skipanchors = eng->anchorConcatenation(a: skipanchors, b: a); |
2629 | } |
2630 | } |
2631 | |
2632 | #ifndef QT_NO_REGEXP_OPTIM |
2633 | void QRegExpEngine::Box::setupHeuristics() |
2634 | { |
2635 | eng->goodEarlyStart = earlyStart; |
2636 | eng->goodLateStart = lateStart; |
2637 | eng->goodStr = eng->cs ? str : str.toLower(); |
2638 | |
2639 | eng->minl = minl; |
2640 | if (eng->cs) { |
2641 | /* |
2642 | A regular expression such as 112|1 has occ1['2'] = 2 and minl = |
2643 | 1 at this point. An entry of occ1 has to be at most minl or |
2644 | infinity for the rest of the algorithm to go well. |
2645 | |
2646 | We waited until here before normalizing these cases (instead of |
2647 | doing it in Box::orx()) because sometimes things improve by |
2648 | themselves. Consider for example (112|1)34. |
2649 | */ |
2650 | for (int i = 0; i < NumBadChars; i++) { |
2651 | if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl) |
2652 | occ1[i] = minl; |
2653 | } |
2654 | eng->occ1 = occ1; |
2655 | } else { |
2656 | eng->occ1.fill(t: 0, newSize: NumBadChars); |
2657 | } |
2658 | |
2659 | eng->heuristicallyChooseHeuristic(); |
2660 | } |
2661 | #endif |
2662 | |
#if defined(QT_DEBUG)
/*
    Debug helper: prints the minimum length, the left and right states
    (with their anchors, when non-zero) and the skip anchors via qDebug().
*/
void QRegExpEngine::Box::dump() const
{
    qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");

    qDebug(" Left states:");
    for (int idx = 0; idx < ls.size(); ++idx) {
        const int state = ls[idx];
        const int anchors = lanchors.value(state, 0);
        if (anchors != 0)
            qDebug(" %d [anchors 0x%.8x]", state, anchors);
        else
            qDebug(" %d", state);
    }

    qDebug(" Right states:");
    for (int idx = 0; idx < rs.size(); ++idx) {
        const int state = rs[idx];
        const int anchors = ranchors.value(state, 0);
        if (anchors != 0)
            qDebug(" %d [anchors 0x%.8x]", state, anchors);
        else
            qDebug(" %d", state);
    }

    qDebug(" Skip anchors: 0x%.8x", skipanchors);
}
#endif
2685 | |
2686 | void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const |
2687 | { |
2688 | for (int i = 0; i < to.ls.size(); i++) { |
2689 | for (int j = 0; j < rs.size(); j++) { |
2690 | int a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i: j), defaultValue: 0), |
2691 | b: to.lanchors.value(key: to.ls.at(i), defaultValue: 0)); |
2692 | eng->addAnchors(from: rs[j], to: to.ls[i], a); |
2693 | } |
2694 | } |
2695 | } |
2696 | |
2697 | #ifndef QT_NO_REGEXP_CCLASS |
2698 | // fast lookup hash for xml schema extensions |
2699 | // sorted by name for b-search |
2700 | static const struct CategoriesRangeMapEntry { |
2701 | const char name[40]; |
2702 | uint first, second; |
2703 | } categoriesRangeMap[] = { |
2704 | { .name: "AegeanNumbers" , .first: 0x10100, .second: 0x1013F }, |
2705 | { .name: "AlphabeticPresentationForms" , .first: 0xFB00, .second: 0xFB4F }, |
2706 | { .name: "AncientGreekMusicalNotation" , .first: 0x1D200, .second: 0x1D24F }, |
2707 | { .name: "AncientGreekNumbers" , .first: 0x10140, .second: 0x1018F }, |
2708 | { .name: "Arabic" , .first: 0x0600, .second: 0x06FF }, |
2709 | { .name: "ArabicPresentationForms-A" , .first: 0xFB50, .second: 0xFDFF }, |
2710 | { .name: "ArabicPresentationForms-B" , .first: 0xFE70, .second: 0xFEFF }, |
2711 | { .name: "ArabicSupplement" , .first: 0x0750, .second: 0x077F }, |
2712 | { .name: "Armenian" , .first: 0x0530, .second: 0x058F }, |
2713 | { .name: "Arrows" , .first: 0x2190, .second: 0x21FF }, |
2714 | { .name: "BasicLatin" , .first: 0x0000, .second: 0x007F }, |
2715 | { .name: "Bengali" , .first: 0x0980, .second: 0x09FF }, |
2716 | { .name: "BlockElements" , .first: 0x2580, .second: 0x259F }, |
2717 | { .name: "Bopomofo" , .first: 0x3100, .second: 0x312F }, |
2718 | { .name: "BopomofoExtended" , .first: 0x31A0, .second: 0x31BF }, |
2719 | { .name: "BoxDrawing" , .first: 0x2500, .second: 0x257F }, |
2720 | { .name: "BraillePatterns" , .first: 0x2800, .second: 0x28FF }, |
2721 | { .name: "Buginese" , .first: 0x1A00, .second: 0x1A1F }, |
2722 | { .name: "Buhid" , .first: 0x1740, .second: 0x175F }, |
2723 | { .name: "ByzantineMusicalSymbols" , .first: 0x1D000, .second: 0x1D0FF }, |
2724 | { .name: "CJKCompatibility" , .first: 0x3300, .second: 0x33FF }, |
2725 | { .name: "CJKCompatibilityForms" , .first: 0xFE30, .second: 0xFE4F }, |
2726 | { .name: "CJKCompatibilityIdeographs" , .first: 0xF900, .second: 0xFAFF }, |
2727 | { .name: "CJKCompatibilityIdeographsSupplement" , .first: 0x2F800, .second: 0x2FA1F }, |
2728 | { .name: "CJKRadicalsSupplement" , .first: 0x2E80, .second: 0x2EFF }, |
2729 | { .name: "CJKStrokes" , .first: 0x31C0, .second: 0x31EF }, |
2730 | { .name: "CJKSymbolsandPunctuation" , .first: 0x3000, .second: 0x303F }, |
2731 | { .name: "CJKUnifiedIdeographs" , .first: 0x4E00, .second: 0x9FFF }, |
2732 | { .name: "CJKUnifiedIdeographsExtensionA" , .first: 0x3400, .second: 0x4DB5 }, |
2733 | { .name: "CJKUnifiedIdeographsExtensionB" , .first: 0x20000, .second: 0x2A6DF }, |
2734 | { .name: "Cherokee" , .first: 0x13A0, .second: 0x13FF }, |
2735 | { .name: "CombiningDiacriticalMarks" , .first: 0x0300, .second: 0x036F }, |
2736 | { .name: "CombiningDiacriticalMarksSupplement" , .first: 0x1DC0, .second: 0x1DFF }, |
2737 | { .name: "CombiningHalfMarks" , .first: 0xFE20, .second: 0xFE2F }, |
2738 | { .name: "CombiningMarksforSymbols" , .first: 0x20D0, .second: 0x20FF }, |
2739 | { .name: "ControlPictures" , .first: 0x2400, .second: 0x243F }, |
2740 | { .name: "Coptic" , .first: 0x2C80, .second: 0x2CFF }, |
2741 | { .name: "CurrencySymbols" , .first: 0x20A0, .second: 0x20CF }, |
2742 | { .name: "CypriotSyllabary" , .first: 0x10800, .second: 0x1083F }, |
2743 | { .name: "Cyrillic" , .first: 0x0400, .second: 0x04FF }, |
2744 | { .name: "CyrillicSupplement" , .first: 0x0500, .second: 0x052F }, |
2745 | { .name: "Deseret" , .first: 0x10400, .second: 0x1044F }, |
2746 | { .name: "Devanagari" , .first: 0x0900, .second: 0x097F }, |
2747 | { .name: "Dingbats" , .first: 0x2700, .second: 0x27BF }, |
2748 | { .name: "EnclosedAlphanumerics" , .first: 0x2460, .second: 0x24FF }, |
2749 | { .name: "EnclosedCJKLettersandMonths" , .first: 0x3200, .second: 0x32FF }, |
2750 | { .name: "Ethiopic" , .first: 0x1200, .second: 0x137F }, |
2751 | { .name: "EthiopicExtended" , .first: 0x2D80, .second: 0x2DDF }, |
2752 | { .name: "EthiopicSupplement" , .first: 0x1380, .second: 0x139F }, |
2753 | { .name: "GeneralPunctuation" , .first: 0x2000, .second: 0x206F }, |
2754 | { .name: "GeometricShapes" , .first: 0x25A0, .second: 0x25FF }, |
2755 | { .name: "Georgian" , .first: 0x10A0, .second: 0x10FF }, |
2756 | { .name: "GeorgianSupplement" , .first: 0x2D00, .second: 0x2D2F }, |
2757 | { .name: "Glagolitic" , .first: 0x2C00, .second: 0x2C5F }, |
2758 | { .name: "Gothic" , .first: 0x10330, .second: 0x1034F }, |
2759 | { .name: "Greek" , .first: 0x0370, .second: 0x03FF }, |
2760 | { .name: "GreekExtended" , .first: 0x1F00, .second: 0x1FFF }, |
2761 | { .name: "Gujarati" , .first: 0x0A80, .second: 0x0AFF }, |
2762 | { .name: "Gurmukhi" , .first: 0x0A00, .second: 0x0A7F }, |
2763 | { .name: "HalfwidthandFullwidthForms" , .first: 0xFF00, .second: 0xFFEF }, |
2764 | { .name: "HangulCompatibilityJamo" , .first: 0x3130, .second: 0x318F }, |
2765 | { .name: "HangulJamo" , .first: 0x1100, .second: 0x11FF }, |
2766 | { .name: "HangulSyllables" , .first: 0xAC00, .second: 0xD7A3 }, |
2767 | { .name: "Hanunoo" , .first: 0x1720, .second: 0x173F }, |
2768 | { .name: "Hebrew" , .first: 0x0590, .second: 0x05FF }, |
2769 | { .name: "Hiragana" , .first: 0x3040, .second: 0x309F }, |
2770 | { .name: "IPAExtensions" , .first: 0x0250, .second: 0x02AF }, |
2771 | { .name: "IdeographicDescriptionCharacters" , .first: 0x2FF0, .second: 0x2FFF }, |
2772 | { .name: "Kanbun" , .first: 0x3190, .second: 0x319F }, |
2773 | { .name: "KangxiRadicals" , .first: 0x2F00, .second: 0x2FDF }, |
2774 | { .name: "Kannada" , .first: 0x0C80, .second: 0x0CFF }, |
2775 | { .name: "Katakana" , .first: 0x30A0, .second: 0x30FF }, |
2776 | { .name: "KatakanaPhoneticExtensions" , .first: 0x31F0, .second: 0x31FF }, |
2777 | { .name: "Kharoshthi" , .first: 0x10A00, .second: 0x10A5F }, |
2778 | { .name: "Khmer" , .first: 0x1780, .second: 0x17FF }, |
2779 | { .name: "KhmerSymbols" , .first: 0x19E0, .second: 0x19FF }, |
2780 | { .name: "Lao" , .first: 0x0E80, .second: 0x0EFF }, |
2781 | { .name: "Latin-1Supplement" , .first: 0x0080, .second: 0x00FF }, |
2782 | { .name: "LatinExtended-A" , .first: 0x0100, .second: 0x017F }, |
2783 | { .name: "LatinExtended-B" , .first: 0x0180, .second: 0x024F }, |
2784 | { .name: "LatinExtendedAdditional" , .first: 0x1E00, .second: 0x1EFF }, |
2785 | { .name: "LetterlikeSymbols" , .first: 0x2100, .second: 0x214F }, |
2786 | { .name: "Limbu" , .first: 0x1900, .second: 0x194F }, |
2787 | { .name: "LinearBIdeograms" , .first: 0x10080, .second: 0x100FF }, |
2788 | { .name: "LinearBSyllabary" , .first: 0x10000, .second: 0x1007F }, |
2789 | { .name: "Malayalam" , .first: 0x0D00, .second: 0x0D7F }, |
2790 | { .name: "MathematicalAlphanumericSymbols" , .first: 0x1D400, .second: 0x1D7FF }, |
2791 | { .name: "MathematicalOperators" , .first: 0x2200, .second: 0x22FF }, |
2792 | { .name: "MiscellaneousMathematicalSymbols-A" , .first: 0x27C0, .second: 0x27EF }, |
2793 | { .name: "MiscellaneousMathematicalSymbols-B" , .first: 0x2980, .second: 0x29FF }, |
2794 | { .name: "MiscellaneousSymbols" , .first: 0x2600, .second: 0x26FF }, |
2795 | { .name: "MiscellaneousSymbolsandArrows" , .first: 0x2B00, .second: 0x2BFF }, |
2796 | { .name: "MiscellaneousTechnical" , .first: 0x2300, .second: 0x23FF }, |
2797 | { .name: "ModifierToneLetters" , .first: 0xA700, .second: 0xA71F }, |
2798 | { .name: "Mongolian" , .first: 0x1800, .second: 0x18AF }, |
2799 | { .name: "MusicalSymbols" , .first: 0x1D100, .second: 0x1D1FF }, |
2800 | { .name: "Myanmar" , .first: 0x1000, .second: 0x109F }, |
2801 | { .name: "NewTaiLue" , .first: 0x1980, .second: 0x19DF }, |
2802 | { .name: "NumberForms" , .first: 0x2150, .second: 0x218F }, |
2803 | { .name: "Ogham" , .first: 0x1680, .second: 0x169F }, |
2804 | { .name: "OldItalic" , .first: 0x10300, .second: 0x1032F }, |
2805 | { .name: "OldPersian" , .first: 0x103A0, .second: 0x103DF }, |
2806 | { .name: "OpticalCharacterRecognition" , .first: 0x2440, .second: 0x245F }, |
2807 | { .name: "Oriya" , .first: 0x0B00, .second: 0x0B7F }, |
2808 | { .name: "Osmanya" , .first: 0x10480, .second: 0x104AF }, |
2809 | { .name: "PhoneticExtensions" , .first: 0x1D00, .second: 0x1D7F }, |
2810 | { .name: "PhoneticExtensionsSupplement" , .first: 0x1D80, .second: 0x1DBF }, |
2811 | { .name: "PrivateUse" , .first: 0xE000, .second: 0xF8FF }, |
2812 | { .name: "Runic" , .first: 0x16A0, .second: 0x16FF }, |
2813 | { .name: "Shavian" , .first: 0x10450, .second: 0x1047F }, |
2814 | { .name: "Sinhala" , .first: 0x0D80, .second: 0x0DFF }, |
2815 | { .name: "SmallFormVariants" , .first: 0xFE50, .second: 0xFE6F }, |
2816 | { .name: "SpacingModifierLetters" , .first: 0x02B0, .second: 0x02FF }, |
2817 | { .name: "Specials" , .first: 0xFFF0, .second: 0xFFFF }, |
2818 | { .name: "SuperscriptsandSubscripts" , .first: 0x2070, .second: 0x209F }, |
2819 | { .name: "SupplementalArrows-A" , .first: 0x27F0, .second: 0x27FF }, |
2820 | { .name: "SupplementalArrows-B" , .first: 0x2900, .second: 0x297F }, |
2821 | { .name: "SupplementalMathematicalOperators" , .first: 0x2A00, .second: 0x2AFF }, |
2822 | { .name: "SupplementalPunctuation" , .first: 0x2E00, .second: 0x2E7F }, |
2823 | { .name: "SupplementaryPrivateUseArea-A" , .first: 0xF0000, .second: 0xFFFFF }, |
2824 | { .name: "SupplementaryPrivateUseArea-B" , .first: 0x100000, .second: 0x10FFFF }, |
2825 | { .name: "SylotiNagri" , .first: 0xA800, .second: 0xA82F }, |
2826 | { .name: "Syriac" , .first: 0x0700, .second: 0x074F }, |
2827 | { .name: "Tagalog" , .first: 0x1700, .second: 0x171F }, |
2828 | { .name: "Tagbanwa" , .first: 0x1760, .second: 0x177F }, |
2829 | { .name: "Tags" , .first: 0xE0000, .second: 0xE007F }, |
2830 | { .name: "TaiLe" , .first: 0x1950, .second: 0x197F }, |
2831 | { .name: "TaiXuanJingSymbols" , .first: 0x1D300, .second: 0x1D35F }, |
2832 | { .name: "Tamil" , .first: 0x0B80, .second: 0x0BFF }, |
2833 | { .name: "Telugu" , .first: 0x0C00, .second: 0x0C7F }, |
2834 | { .name: "Thaana" , .first: 0x0780, .second: 0x07BF }, |
2835 | { .name: "Thai" , .first: 0x0E00, .second: 0x0E7F }, |
2836 | { .name: "Tibetan" , .first: 0x0F00, .second: 0x0FFF }, |
2837 | { .name: "Tifinagh" , .first: 0x2D30, .second: 0x2D7F }, |
2838 | { .name: "Ugaritic" , .first: 0x10380, .second: 0x1039F }, |
2839 | { .name: "UnifiedCanadianAboriginalSyllabics" , .first: 0x1400, .second: 0x167F }, |
2840 | { .name: "VariationSelectors" , .first: 0xFE00, .second: 0xFE0F }, |
2841 | { .name: "VariationSelectorsSupplement" , .first: 0xE0100, .second: 0xE01EF }, |
2842 | { .name: "VerticalForms" , .first: 0xFE10, .second: 0xFE1F }, |
2843 | { .name: "YiRadicals" , .first: 0xA490, .second: 0xA4CF }, |
2844 | { .name: "YiSyllables" , .first: 0xA000, .second: 0xA48F }, |
2845 | { .name: "YijingHexagramSymbols" , .first: 0x4DC0, .second: 0x4DFF } |
2846 | }; |
2847 | |
2848 | inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2) |
2849 | { return qstrcmp(str1: entry1.name, str2: entry2.name) < 0; } |
2850 | inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry) |
2851 | { return qstrcmp(str1: name, str2: entry.name) < 0; } |
2852 | inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name) |
2853 | { return qstrcmp(str1: entry.name, str2: name) < 0; } |
2854 | #endif // QT_NO_REGEXP_CCLASS |
2855 | |
2856 | int QRegExpEngine::getChar() |
2857 | { |
2858 | return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode(); |
2859 | } |
2860 | |
2861 | int QRegExpEngine::getEscape() |
2862 | { |
2863 | #ifndef QT_NO_REGEXP_ESCAPE |
2864 | const char tab[] = "afnrtv" ; // no b, as \b means word boundary |
2865 | const char backTab[] = "\a\f\n\r\t\v" ; |
2866 | ushort low; |
2867 | int i; |
2868 | #endif |
2869 | ushort val; |
2870 | int prevCh = yyCh; |
2871 | |
2872 | if (prevCh == EOS) { |
2873 | error(RXERR_END); |
2874 | return Tok_Char | '\\'; |
2875 | } |
2876 | yyCh = getChar(); |
2877 | #ifndef QT_NO_REGEXP_ESCAPE |
2878 | if ((prevCh & ~0xff) == 0) { |
2879 | const char *p = strchr(s: tab, c: prevCh); |
2880 | if (p != nullptr) |
2881 | return Tok_Char | backTab[p - tab]; |
2882 | } |
2883 | #endif |
2884 | |
2885 | switch (prevCh) { |
2886 | #ifndef QT_NO_REGEXP_ESCAPE |
2887 | case '0': |
2888 | val = 0; |
2889 | for (i = 0; i < 3; i++) { |
2890 | if (yyCh >= '0' && yyCh <= '7') |
2891 | val = (val << 3) | (yyCh - '0'); |
2892 | else |
2893 | break; |
2894 | yyCh = getChar(); |
2895 | } |
2896 | if ((val & ~0377) != 0) |
2897 | error(RXERR_OCTAL); |
2898 | return Tok_Char | val; |
2899 | #endif |
2900 | #ifndef QT_NO_REGEXP_ESCAPE |
2901 | case 'B': |
2902 | return Tok_NonWord; |
2903 | #endif |
2904 | #ifndef QT_NO_REGEXP_CCLASS |
2905 | case 'D': |
2906 | // see QChar::isDigit() |
2907 | yyCharClass->addCategories(cats: uint(-1) ^ FLAG(QChar::Number_DecimalDigit)); |
2908 | return Tok_CharClass; |
2909 | case 'S': |
2910 | // see QChar::isSpace() |
2911 | yyCharClass->addCategories(cats: uint(-1) ^ (FLAG(QChar::Separator_Space) | |
2912 | FLAG(QChar::Separator_Line) | |
2913 | FLAG(QChar::Separator_Paragraph) | |
2914 | FLAG(QChar::Other_Control))); |
2915 | yyCharClass->addRange(from: 0x0000, to: 0x0008); |
2916 | yyCharClass->addRange(from: 0x000e, to: 0x001f); |
2917 | yyCharClass->addRange(from: 0x007f, to: 0x0084); |
2918 | yyCharClass->addRange(from: 0x0086, to: 0x009f); |
2919 | return Tok_CharClass; |
2920 | case 'W': |
2921 | // see QChar::isLetterOrNumber() and QChar::isMark() |
2922 | yyCharClass->addCategories(cats: uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) | |
2923 | FLAG(QChar::Mark_SpacingCombining) | |
2924 | FLAG(QChar::Mark_Enclosing) | |
2925 | FLAG(QChar::Number_DecimalDigit) | |
2926 | FLAG(QChar::Number_Letter) | |
2927 | FLAG(QChar::Number_Other) | |
2928 | FLAG(QChar::Letter_Uppercase) | |
2929 | FLAG(QChar::Letter_Lowercase) | |
2930 | FLAG(QChar::Letter_Titlecase) | |
2931 | FLAG(QChar::Letter_Modifier) | |
2932 | FLAG(QChar::Letter_Other) | |
2933 | FLAG(QChar::Punctuation_Connector))); |
2934 | yyCharClass->addRange(from: 0x203f, to: 0x2040); |
2935 | yyCharClass->addSingleton(ch: 0x2040); |
2936 | yyCharClass->addSingleton(ch: 0x2054); |
2937 | yyCharClass->addSingleton(ch: 0x30fb); |
2938 | yyCharClass->addRange(from: 0xfe33, to: 0xfe34); |
2939 | yyCharClass->addRange(from: 0xfe4d, to: 0xfe4f); |
2940 | yyCharClass->addSingleton(ch: 0xff3f); |
2941 | yyCharClass->addSingleton(ch: 0xff65); |
2942 | return Tok_CharClass; |
2943 | #endif |
2944 | #ifndef QT_NO_REGEXP_ESCAPE |
2945 | case 'b': |
2946 | return Tok_Word; |
2947 | #endif |
2948 | #ifndef QT_NO_REGEXP_CCLASS |
2949 | case 'd': |
2950 | // see QChar::isDigit() |
2951 | yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); |
2952 | return Tok_CharClass; |
2953 | case 's': |
2954 | // see QChar::isSpace() |
2955 | yyCharClass->addCategories(FLAG(QChar::Separator_Space) | |
2956 | FLAG(QChar::Separator_Line) | |
2957 | FLAG(QChar::Separator_Paragraph)); |
2958 | yyCharClass->addRange(from: 0x0009, to: 0x000d); |
2959 | yyCharClass->addSingleton(ch: 0x0085); |
2960 | return Tok_CharClass; |
2961 | case 'w': |
2962 | // see QChar::isLetterOrNumber() and QChar::isMark() |
2963 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | |
2964 | FLAG(QChar::Mark_SpacingCombining) | |
2965 | FLAG(QChar::Mark_Enclosing) | |
2966 | FLAG(QChar::Number_DecimalDigit) | |
2967 | FLAG(QChar::Number_Letter) | |
2968 | FLAG(QChar::Number_Other) | |
2969 | FLAG(QChar::Letter_Uppercase) | |
2970 | FLAG(QChar::Letter_Lowercase) | |
2971 | FLAG(QChar::Letter_Titlecase) | |
2972 | FLAG(QChar::Letter_Modifier) | |
2973 | FLAG(QChar::Letter_Other)); |
2974 | yyCharClass->addSingleton(ch: 0x005f); // '_' |
2975 | return Tok_CharClass; |
2976 | case 'I': |
2977 | if (!xmlSchemaExtensions) |
2978 | break; |
2979 | yyCharClass->setNegative(!yyCharClass->negative()); |
2980 | Q_FALLTHROUGH(); |
2981 | case 'i': |
2982 | if (xmlSchemaExtensions) { |
2983 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | |
2984 | FLAG(QChar::Mark_SpacingCombining) | |
2985 | FLAG(QChar::Mark_Enclosing) | |
2986 | FLAG(QChar::Number_DecimalDigit) | |
2987 | FLAG(QChar::Number_Letter) | |
2988 | FLAG(QChar::Number_Other) | |
2989 | FLAG(QChar::Letter_Uppercase) | |
2990 | FLAG(QChar::Letter_Lowercase) | |
2991 | FLAG(QChar::Letter_Titlecase) | |
2992 | FLAG(QChar::Letter_Modifier) | |
2993 | FLAG(QChar::Letter_Other)); |
2994 | yyCharClass->addSingleton(ch: 0x003a); // ':' |
2995 | yyCharClass->addSingleton(ch: 0x005f); // '_' |
2996 | yyCharClass->addRange(from: 0x0041, to: 0x005a); // [A-Z] |
2997 | yyCharClass->addRange(from: 0x0061, to: 0x007a); // [a-z] |
2998 | yyCharClass->addRange(from: 0xc0, to: 0xd6); |
2999 | yyCharClass->addRange(from: 0xd8, to: 0xf6); |
3000 | yyCharClass->addRange(from: 0xf8, to: 0x2ff); |
3001 | yyCharClass->addRange(from: 0x370, to: 0x37d); |
3002 | yyCharClass->addRange(from: 0x37f, to: 0x1fff); |
3003 | yyCharClass->addRange(from: 0x200c, to: 0x200d); |
3004 | yyCharClass->addRange(from: 0x2070, to: 0x218f); |
3005 | yyCharClass->addRange(from: 0x2c00, to: 0x2fef); |
3006 | yyCharClass->addRange(from: 0x3001, to: 0xd7ff); |
3007 | yyCharClass->addRange(from: 0xf900, to: 0xfdcf); |
3008 | yyCharClass->addRange(from: 0xfdf0, to: 0xfffd); |
3009 | yyCharClass->addRange(from: (ushort)0x10000, to: (ushort)0xeffff); |
3010 | return Tok_CharClass; |
3011 | } else { |
3012 | break; |
3013 | } |
3014 | case 'C': |
3015 | if (!xmlSchemaExtensions) |
3016 | break; |
3017 | yyCharClass->setNegative(!yyCharClass->negative()); |
3018 | Q_FALLTHROUGH(); |
3019 | case 'c': |
3020 | if (xmlSchemaExtensions) { |
3021 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | |
3022 | FLAG(QChar::Mark_SpacingCombining) | |
3023 | FLAG(QChar::Mark_Enclosing) | |
3024 | FLAG(QChar::Number_DecimalDigit) | |
3025 | FLAG(QChar::Number_Letter) | |
3026 | FLAG(QChar::Number_Other) | |
3027 | FLAG(QChar::Letter_Uppercase) | |
3028 | FLAG(QChar::Letter_Lowercase) | |
3029 | FLAG(QChar::Letter_Titlecase) | |
3030 | FLAG(QChar::Letter_Modifier) | |
3031 | FLAG(QChar::Letter_Other)); |
3032 | yyCharClass->addSingleton(ch: 0x002d); // '-' |
3033 | yyCharClass->addSingleton(ch: 0x002e); // '.' |
3034 | yyCharClass->addSingleton(ch: 0x003a); // ':' |
3035 | yyCharClass->addSingleton(ch: 0x005f); // '_' |
3036 | yyCharClass->addSingleton(ch: 0xb7); |
3037 | yyCharClass->addRange(from: 0x0030, to: 0x0039); // [0-9] |
3038 | yyCharClass->addRange(from: 0x0041, to: 0x005a); // [A-Z] |
3039 | yyCharClass->addRange(from: 0x0061, to: 0x007a); // [a-z] |
3040 | yyCharClass->addRange(from: 0xc0, to: 0xd6); |
3041 | yyCharClass->addRange(from: 0xd8, to: 0xf6); |
3042 | yyCharClass->addRange(from: 0xf8, to: 0x2ff); |
3043 | yyCharClass->addRange(from: 0x370, to: 0x37d); |
3044 | yyCharClass->addRange(from: 0x37f, to: 0x1fff); |
3045 | yyCharClass->addRange(from: 0x200c, to: 0x200d); |
3046 | yyCharClass->addRange(from: 0x2070, to: 0x218f); |
3047 | yyCharClass->addRange(from: 0x2c00, to: 0x2fef); |
3048 | yyCharClass->addRange(from: 0x3001, to: 0xd7ff); |
3049 | yyCharClass->addRange(from: 0xf900, to: 0xfdcf); |
3050 | yyCharClass->addRange(from: 0xfdf0, to: 0xfffd); |
3051 | yyCharClass->addRange(from: (ushort)0x10000, to: (ushort)0xeffff); |
3052 | yyCharClass->addRange(from: 0x0300, to: 0x036f); |
3053 | yyCharClass->addRange(from: 0x203f, to: 0x2040); |
3054 | return Tok_CharClass; |
3055 | } else { |
3056 | break; |
3057 | } |
3058 | case 'P': |
3059 | if (!xmlSchemaExtensions) |
3060 | break; |
3061 | yyCharClass->setNegative(!yyCharClass->negative()); |
3062 | Q_FALLTHROUGH(); |
3063 | case 'p': |
3064 | if (xmlSchemaExtensions) { |
3065 | if (yyCh != '{') { |
3066 | error(RXERR_CHARCLASS); |
3067 | return Tok_CharClass; |
3068 | } |
3069 | |
3070 | QByteArray category; |
3071 | yyCh = getChar(); |
3072 | while (yyCh != '}') { |
3073 | if (yyCh == EOS) { |
3074 | error(RXERR_END); |
3075 | return Tok_CharClass; |
3076 | } |
3077 | category.append(c: yyCh); |
3078 | yyCh = getChar(); |
3079 | } |
3080 | yyCh = getChar(); // skip closing '}' |
3081 | |
3082 | int catlen = category.size(); |
3083 | if (catlen == 1 || catlen == 2) { |
3084 | switch (category.at(i: 0)) { |
3085 | case 'M': |
3086 | if (catlen == 1) { |
3087 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | |
3088 | FLAG(QChar::Mark_SpacingCombining) | |
3089 | FLAG(QChar::Mark_Enclosing)); |
3090 | } else { |
3091 | switch (category.at(i: 1)) { |
3092 | case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn |
3093 | case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc |
3094 | case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me |
3095 | default: error(RXERR_CATEGORY); break; |
3096 | } |
3097 | } |
3098 | break; |
3099 | case 'N': |
3100 | if (catlen == 1) { |
3101 | yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) | |
3102 | FLAG(QChar::Number_Letter) | |
3103 | FLAG(QChar::Number_Other)); |
3104 | } else { |
3105 | switch (category.at(i: 1)) { |
3106 | case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd |
3107 | case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl |
3108 | case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No |
3109 | default: error(RXERR_CATEGORY); break; |
3110 | } |
3111 | } |
3112 | break; |
3113 | case 'Z': |
3114 | if (catlen == 1) { |
3115 | yyCharClass->addCategories(FLAG(QChar::Separator_Space) | |
3116 | FLAG(QChar::Separator_Line) | |
3117 | FLAG(QChar::Separator_Paragraph)); |
3118 | } else { |
3119 | switch (category.at(i: 1)) { |
3120 | case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs |
3121 | case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl |
3122 | case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp |
3123 | default: error(RXERR_CATEGORY); break; |
3124 | } |
3125 | } |
3126 | break; |
3127 | case 'C': |
3128 | if (catlen == 1) { |
3129 | yyCharClass->addCategories(FLAG(QChar::Other_Control) | |
3130 | FLAG(QChar::Other_Format) | |
3131 | FLAG(QChar::Other_Surrogate) | |
3132 | FLAG(QChar::Other_PrivateUse) | |
3133 | FLAG(QChar::Other_NotAssigned)); |
3134 | } else { |
3135 | switch (category.at(i: 1)) { |
3136 | case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc |
3137 | case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf |
3138 | case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs |
3139 | case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co |
3140 | case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn |
3141 | default: error(RXERR_CATEGORY); break; |
3142 | } |
3143 | } |
3144 | break; |
3145 | case 'L': |
3146 | if (catlen == 1) { |
3147 | yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) | |
3148 | FLAG(QChar::Letter_Lowercase) | |
3149 | FLAG(QChar::Letter_Titlecase) | |
3150 | FLAG(QChar::Letter_Modifier) | |
3151 | FLAG(QChar::Letter_Other)); |
3152 | } else { |
3153 | switch (category.at(i: 1)) { |
3154 | case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu |
3155 | case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll |
3156 | case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt |
3157 | case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm |
3158 | case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo |
3159 | default: error(RXERR_CATEGORY); break; |
3160 | } |
3161 | } |
3162 | break; |
3163 | case 'P': |
3164 | if (catlen == 1) { |
3165 | yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) | |
3166 | FLAG(QChar::Punctuation_Dash) | |
3167 | FLAG(QChar::Punctuation_Open) | |
3168 | FLAG(QChar::Punctuation_Close) | |
3169 | FLAG(QChar::Punctuation_InitialQuote) | |
3170 | FLAG(QChar::Punctuation_FinalQuote) | |
3171 | FLAG(QChar::Punctuation_Other)); |
3172 | } else { |
3173 | switch (category.at(i: 1)) { |
3174 | case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc |
3175 | case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd |
3176 | case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps |
3177 | case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe |
3178 | case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi |
3179 | case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf |
3180 | case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po |
3181 | default: error(RXERR_CATEGORY); break; |
3182 | } |
3183 | } |
3184 | break; |
3185 | case 'S': |
3186 | if (catlen == 1) { |
3187 | yyCharClass->addCategories(FLAG(QChar::Symbol_Math) | |
3188 | FLAG(QChar::Symbol_Currency) | |
3189 | FLAG(QChar::Symbol_Modifier) | |
3190 | FLAG(QChar::Symbol_Other)); |
3191 | } else { |
3192 | switch (category.at(i: 1)) { |
3193 | case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm |
3194 | case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc |
3195 | case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk |
3196 | case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So |
3197 | default: error(RXERR_CATEGORY); break; |
3198 | } |
3199 | } |
3200 | break; |
3201 | default: |
3202 | error(RXERR_CATEGORY); |
3203 | break; |
3204 | } |
3205 | } else if (catlen > 2 && category.at(i: 0) == 'I' && category.at(i: 1) == 's') { |
3206 | static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]); |
3207 | const char * const categoryFamily = category.constData() + 2; |
3208 | const CategoriesRangeMapEntry *r = std::lower_bound(first: categoriesRangeMap, last: categoriesRangeMap + N, val: categoryFamily); |
3209 | if (r != categoriesRangeMap + N && qstrcmp(str1: r->name, str2: categoryFamily) == 0) |
3210 | yyCharClass->addRange(from: r->first, to: r->second); |
3211 | else |
3212 | error(RXERR_CATEGORY); |
3213 | } else { |
3214 | error(RXERR_CATEGORY); |
3215 | } |
3216 | return Tok_CharClass; |
3217 | } else { |
3218 | break; |
3219 | } |
3220 | #endif |
3221 | #ifndef QT_NO_REGEXP_ESCAPE |
3222 | case 'x': |
3223 | val = 0; |
3224 | for (i = 0; i < 4; i++) { |
3225 | low = QChar(yyCh).toLower().unicode(); |
3226 | if (low >= '0' && low <= '9') |
3227 | val = (val << 4) | (low - '0'); |
3228 | else if (low >= 'a' && low <= 'f') |
3229 | val = (val << 4) | (low - 'a' + 10); |
3230 | else |
3231 | break; |
3232 | yyCh = getChar(); |
3233 | } |
3234 | return Tok_Char | val; |
3235 | #endif |
3236 | default: |
3237 | break; |
3238 | } |
3239 | if (prevCh >= '1' && prevCh <= '9') { |
3240 | #ifndef QT_NO_REGEXP_BACKREF |
3241 | val = prevCh - '0'; |
3242 | while (yyCh >= '0' && yyCh <= '9') { |
3243 | val = (val * 10) + (yyCh - '0'); |
3244 | yyCh = getChar(); |
3245 | } |
3246 | return Tok_BackRef | val; |
3247 | #else |
3248 | error(RXERR_DISABLED); |
3249 | #endif |
3250 | } |
3251 | return Tok_Char | prevCh; |
3252 | } |
3253 | |
3254 | #ifndef QT_NO_REGEXP_INTERVAL |
3255 | int QRegExpEngine::getRep(int def) |
3256 | { |
3257 | if (yyCh >= '0' && yyCh <= '9') { |
3258 | int rep = 0; |
3259 | do { |
3260 | rep = 10 * rep + yyCh - '0'; |
3261 | if (rep >= InftyRep) { |
3262 | error(RXERR_REPETITION); |
3263 | rep = def; |
3264 | } |
3265 | yyCh = getChar(); |
3266 | } while (yyCh >= '0' && yyCh <= '9'); |
3267 | return rep; |
3268 | } else { |
3269 | return def; |
3270 | } |
3271 | } |
3272 | #endif |
3273 | |
3274 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
3275 | void QRegExpEngine::skipChars(int n) |
3276 | { |
3277 | if (n > 0) { |
3278 | yyPos += n - 1; |
3279 | yyCh = getChar(); |
3280 | } |
3281 | } |
3282 | #endif |
3283 | |
3284 | void QRegExpEngine::error(const char *msg) |
3285 | { |
3286 | if (yyError.isEmpty()) |
3287 | yyError = QLatin1String(msg); |
3288 | } |
3289 | |
3290 | void QRegExpEngine::startTokenizer(const QChar *rx, int len) |
3291 | { |
3292 | yyIn = rx; |
3293 | yyPos0 = 0; |
3294 | yyPos = 0; |
3295 | yyLen = len; |
3296 | yyCh = getChar(); |
3297 | yyCharClass.reset(other: new QRegExpCharClass); |
3298 | yyMinRep = 0; |
3299 | yyMaxRep = 0; |
3300 | yyError = QString(); |
3301 | } |
3302 | |
3303 | int QRegExpEngine::getToken() |
3304 | { |
3305 | #ifndef QT_NO_REGEXP_CCLASS |
3306 | ushort pendingCh = 0; |
3307 | bool charPending; |
3308 | bool rangePending; |
3309 | int tok; |
3310 | #endif |
3311 | int prevCh = yyCh; |
3312 | |
3313 | yyPos0 = yyPos - 1; |
3314 | #ifndef QT_NO_REGEXP_CCLASS |
3315 | yyCharClass->clear(); |
3316 | #endif |
3317 | yyMinRep = 0; |
3318 | yyMaxRep = 0; |
3319 | yyCh = getChar(); |
3320 | |
3321 | switch (prevCh) { |
3322 | case EOS: |
3323 | yyPos0 = yyPos; |
3324 | return Tok_Eos; |
3325 | case '$': |
3326 | return Tok_Dollar; |
3327 | case '(': |
3328 | if (yyCh == '?') { |
3329 | prevCh = getChar(); |
3330 | yyCh = getChar(); |
3331 | switch (prevCh) { |
3332 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
3333 | case '!': |
3334 | return Tok_NegLookahead; |
3335 | case '=': |
3336 | return Tok_PosLookahead; |
3337 | #endif |
3338 | case ':': |
3339 | return Tok_MagicLeftParen; |
3340 | case '<': |
3341 | error(RXERR_LOOKBEHIND); |
3342 | return Tok_MagicLeftParen; |
3343 | default: |
3344 | error(RXERR_LOOKAHEAD); |
3345 | return Tok_MagicLeftParen; |
3346 | } |
3347 | } else { |
3348 | return Tok_LeftParen; |
3349 | } |
3350 | case ')': |
3351 | return Tok_RightParen; |
3352 | case '*': |
3353 | yyMinRep = 0; |
3354 | yyMaxRep = InftyRep; |
3355 | return Tok_Quantifier; |
3356 | case '+': |
3357 | yyMinRep = 1; |
3358 | yyMaxRep = InftyRep; |
3359 | return Tok_Quantifier; |
3360 | case '.': |
3361 | #ifndef QT_NO_REGEXP_CCLASS |
3362 | yyCharClass->setNegative(true); |
3363 | #endif |
3364 | return Tok_CharClass; |
3365 | case '?': |
3366 | yyMinRep = 0; |
3367 | yyMaxRep = 1; |
3368 | return Tok_Quantifier; |
3369 | case '[': |
3370 | #ifndef QT_NO_REGEXP_CCLASS |
3371 | if (yyCh == '^') { |
3372 | yyCharClass->setNegative(true); |
3373 | yyCh = getChar(); |
3374 | } |
3375 | charPending = false; |
3376 | rangePending = false; |
3377 | do { |
3378 | if (yyCh == '-' && charPending && !rangePending) { |
3379 | rangePending = true; |
3380 | yyCh = getChar(); |
3381 | } else { |
3382 | if (charPending && !rangePending) { |
3383 | yyCharClass->addSingleton(ch: pendingCh); |
3384 | charPending = false; |
3385 | } |
3386 | if (yyCh == '\\') { |
3387 | yyCh = getChar(); |
3388 | tok = getEscape(); |
3389 | if (tok == Tok_Word) |
3390 | tok = '\b'; |
3391 | } else { |
3392 | tok = Tok_Char | yyCh; |
3393 | yyCh = getChar(); |
3394 | } |
3395 | if (tok == Tok_CharClass) { |
3396 | if (rangePending) { |
3397 | yyCharClass->addSingleton(ch: '-'); |
3398 | yyCharClass->addSingleton(ch: pendingCh); |
3399 | charPending = false; |
3400 | rangePending = false; |
3401 | } |
3402 | } else if ((tok & Tok_Char) != 0) { |
3403 | if (rangePending) { |
3404 | yyCharClass->addRange(from: pendingCh, to: tok ^ Tok_Char); |
3405 | charPending = false; |
3406 | rangePending = false; |
3407 | } else { |
3408 | pendingCh = tok ^ Tok_Char; |
3409 | charPending = true; |
3410 | } |
3411 | } else { |
3412 | error(RXERR_CHARCLASS); |
3413 | } |
3414 | } |
3415 | } while (yyCh != ']' && yyCh != EOS); |
3416 | if (rangePending) |
3417 | yyCharClass->addSingleton(ch: '-'); |
3418 | if (charPending) |
3419 | yyCharClass->addSingleton(ch: pendingCh); |
3420 | if (yyCh == EOS) |
3421 | error(RXERR_END); |
3422 | else |
3423 | yyCh = getChar(); |
3424 | return Tok_CharClass; |
3425 | #else |
3426 | error(RXERR_END); |
3427 | return Tok_Char | '['; |
3428 | #endif |
3429 | case '\\': |
3430 | return getEscape(); |
3431 | case ']': |
3432 | error(RXERR_LEFTDELIM); |
3433 | return Tok_Char | ']'; |
3434 | case '^': |
3435 | return Tok_Caret; |
3436 | case '{': |
3437 | #ifndef QT_NO_REGEXP_INTERVAL |
3438 | yyMinRep = getRep(def: 0); |
3439 | yyMaxRep = yyMinRep; |
3440 | if (yyCh == ',') { |
3441 | yyCh = getChar(); |
3442 | yyMaxRep = getRep(def: InftyRep); |
3443 | } |
3444 | if (yyMaxRep < yyMinRep) |
3445 | error(RXERR_INTERVAL); |
3446 | if (yyCh != '}') |
3447 | error(RXERR_REPETITION); |
3448 | yyCh = getChar(); |
3449 | return Tok_Quantifier; |
3450 | #else |
3451 | error(RXERR_DISABLED); |
3452 | return Tok_Char | '{'; |
3453 | #endif |
3454 | case '|': |
3455 | return Tok_Bar; |
3456 | case '}': |
3457 | error(RXERR_LEFTDELIM); |
3458 | return Tok_Char | '}'; |
3459 | default: |
3460 | return Tok_Char | prevCh; |
3461 | } |
3462 | } |
3463 | |
3464 | int QRegExpEngine::parse(const QChar *pattern, int len) |
3465 | { |
3466 | valid = true; |
3467 | startTokenizer(rx: pattern, len); |
3468 | yyTok = getToken(); |
3469 | #ifndef QT_NO_REGEXP_CAPTURE |
3470 | yyMayCapture = true; |
3471 | #else |
3472 | yyMayCapture = false; |
3473 | #endif |
3474 | |
3475 | #ifndef QT_NO_REGEXP_CAPTURE |
3476 | int atom = startAtom(officialCapture: false); |
3477 | #endif |
3478 | QRegExpCharClass anything; |
3479 | Box box(this); // create InitialState |
3480 | box.set(anything); |
3481 | Box rightBox(this); // create FinalState |
3482 | rightBox.set(anything); |
3483 | |
3484 | Box middleBox(this); |
3485 | parseExpression(box: &middleBox); |
3486 | #ifndef QT_NO_REGEXP_CAPTURE |
3487 | finishAtom(atom, needCapture: false); |
3488 | #endif |
3489 | #ifndef QT_NO_REGEXP_OPTIM |
3490 | middleBox.setupHeuristics(); |
3491 | #endif |
3492 | box.cat(b: middleBox); |
3493 | box.cat(b: rightBox); |
3494 | yyCharClass.reset(); |
3495 | |
3496 | #ifndef QT_NO_REGEXP_CAPTURE |
3497 | for (int i = 0; i < nf; ++i) { |
3498 | switch (f[i].capture) { |
3499 | case QRegExpAtom::NoCapture: |
3500 | break; |
3501 | case QRegExpAtom::OfficialCapture: |
3502 | f[i].capture = ncap; |
3503 | captureForOfficialCapture.append(t: ncap); |
3504 | ++ncap; |
3505 | ++officialncap; |
3506 | break; |
3507 | case QRegExpAtom::UnofficialCapture: |
3508 | f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture; |
3509 | } |
3510 | } |
3511 | |
3512 | #ifndef QT_NO_REGEXP_BACKREF |
3513 | #ifndef QT_NO_REGEXP_OPTIM |
3514 | if (officialncap == 0 && nbrefs == 0) { |
3515 | ncap = nf = 0; |
3516 | f.clear(); |
3517 | } |
3518 | #endif |
3519 | // handle the case where there's a \5 with no corresponding capture |
3520 | // (captureForOfficialCapture.size() != officialncap) |
3521 | for (int i = 0; i < nbrefs - officialncap; ++i) { |
3522 | captureForOfficialCapture.append(t: ncap); |
3523 | ++ncap; |
3524 | } |
3525 | #endif |
3526 | #endif |
3527 | |
3528 | if (!yyError.isEmpty()) |
3529 | return -1; |
3530 | |
3531 | #ifndef QT_NO_REGEXP_OPTIM |
3532 | const QRegExpAutomatonState &sinit = s.at(i: InitialState); |
3533 | caretAnchored = !sinit.anchors.isEmpty(); |
3534 | if (caretAnchored) { |
3535 | const QMap<int, int> &anchors = sinit.anchors; |
3536 | QMap<int, int>::const_iterator a; |
3537 | for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) { |
3538 | if ( |
3539 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
3540 | (*a & Anchor_Alternation) != 0 || |
3541 | #endif |
3542 | (*a & Anchor_Caret) == 0) |
3543 | { |
3544 | caretAnchored = false; |
3545 | break; |
3546 | } |
3547 | } |
3548 | } |
3549 | #endif |
3550 | |
3551 | // cleanup anchors |
3552 | int numStates = s.size(); |
3553 | for (int i = 0; i < numStates; ++i) { |
3554 | QRegExpAutomatonState &state = s[i]; |
3555 | if (!state.anchors.isEmpty()) { |
3556 | QMap<int, int>::iterator a = state.anchors.begin(); |
3557 | while (a != state.anchors.end()) { |
3558 | if (a.value() == 0) |
3559 | a = state.anchors.erase(it: a); |
3560 | else |
3561 | ++a; |
3562 | } |
3563 | } |
3564 | } |
3565 | |
3566 | return yyPos0; |
3567 | } |
3568 | |
3569 | void QRegExpEngine::parseAtom(Box *box) |
3570 | { |
3571 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
3572 | QRegExpEngine *eng = nullptr; |
3573 | bool neg; |
3574 | int len; |
3575 | #endif |
3576 | |
3577 | if ((yyTok & Tok_Char) != 0) { |
3578 | box->set(QChar(yyTok ^ Tok_Char)); |
3579 | } else { |
3580 | #ifndef QT_NO_REGEXP_OPTIM |
3581 | trivial = false; |
3582 | #endif |
3583 | switch (yyTok) { |
3584 | case Tok_Dollar: |
3585 | box->catAnchor(a: Anchor_Dollar); |
3586 | break; |
3587 | case Tok_Caret: |
3588 | box->catAnchor(a: Anchor_Caret); |
3589 | break; |
3590 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
3591 | case Tok_PosLookahead: |
3592 | case Tok_NegLookahead: |
3593 | neg = (yyTok == Tok_NegLookahead); |
3594 | eng = new QRegExpEngine(cs, greedyQuantifiers); |
3595 | len = eng->parse(pattern: yyIn + yyPos - 1, len: yyLen - yyPos + 1); |
3596 | if (len >= 0) |
3597 | skipChars(n: len); |
3598 | else |
3599 | error(RXERR_LOOKAHEAD); |
3600 | box->catAnchor(a: addLookahead(eng, negative: neg)); |
3601 | yyTok = getToken(); |
3602 | if (yyTok != Tok_RightParen) |
3603 | error(RXERR_LOOKAHEAD); |
3604 | break; |
3605 | #endif |
3606 | #ifndef QT_NO_REGEXP_ESCAPE |
3607 | case Tok_Word: |
3608 | box->catAnchor(a: Anchor_Word); |
3609 | break; |
3610 | case Tok_NonWord: |
3611 | box->catAnchor(a: Anchor_NonWord); |
3612 | break; |
3613 | #endif |
3614 | case Tok_LeftParen: |
3615 | case Tok_MagicLeftParen: |
3616 | yyTok = getToken(); |
3617 | parseExpression(box); |
3618 | if (yyTok != Tok_RightParen) |
3619 | error(RXERR_END); |
3620 | break; |
3621 | case Tok_CharClass: |
3622 | box->set(*yyCharClass); |
3623 | break; |
3624 | case Tok_Quantifier: |
3625 | error(RXERR_REPETITION); |
3626 | break; |
3627 | default: |
3628 | #ifndef QT_NO_REGEXP_BACKREF |
3629 | if ((yyTok & Tok_BackRef) != 0) |
3630 | box->set(yyTok ^ Tok_BackRef); |
3631 | else |
3632 | #endif |
3633 | error(RXERR_DISABLED); |
3634 | } |
3635 | } |
3636 | yyTok = getToken(); |
3637 | } |
3638 | |
3639 | void QRegExpEngine::parseFactor(Box *box) |
3640 | { |
3641 | #ifndef QT_NO_REGEXP_CAPTURE |
3642 | int outerAtom = greedyQuantifiers ? startAtom(officialCapture: false) : -1; |
3643 | int innerAtom = startAtom(officialCapture: yyMayCapture && yyTok == Tok_LeftParen); |
3644 | bool magicLeftParen = (yyTok == Tok_MagicLeftParen); |
3645 | #else |
3646 | const int innerAtom = -1; |
3647 | #endif |
3648 | |
3649 | #ifndef QT_NO_REGEXP_INTERVAL |
3650 | #define YYREDO() \ |
3651 | yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \ |
3652 | *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok |
3653 | |
3654 | const QChar *in = yyIn; |
3655 | int pos0 = yyPos0; |
3656 | int pos = yyPos; |
3657 | int len = yyLen; |
3658 | int ch = yyCh; |
3659 | QRegExpCharClass charClass; |
3660 | if (yyTok == Tok_CharClass) |
3661 | charClass = *yyCharClass; |
3662 | int tok = yyTok; |
3663 | bool mayCapture = yyMayCapture; |
3664 | #endif |
3665 | |
3666 | parseAtom(box); |
3667 | #ifndef QT_NO_REGEXP_CAPTURE |
3668 | finishAtom(atom: innerAtom, needCapture: magicLeftParen); |
3669 | #endif |
3670 | |
3671 | bool hasQuantifier = (yyTok == Tok_Quantifier); |
3672 | if (hasQuantifier) { |
3673 | #ifndef QT_NO_REGEXP_OPTIM |
3674 | trivial = false; |
3675 | #endif |
3676 | if (yyMaxRep == InftyRep) { |
3677 | box->plus(atom: innerAtom); |
3678 | #ifndef QT_NO_REGEXP_INTERVAL |
3679 | } else if (yyMaxRep == 0) { |
3680 | box->clear(); |
3681 | #endif |
3682 | } |
3683 | if (yyMinRep == 0) |
3684 | box->opt(); |
3685 | |
3686 | #ifndef QT_NO_REGEXP_INTERVAL |
3687 | yyMayCapture = false; |
3688 | int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1; |
3689 | int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1); |
3690 | |
3691 | Box rightBox(this); |
3692 | int i; |
3693 | |
3694 | for (i = 0; i < beta; i++) { |
3695 | YYREDO(); |
3696 | Box leftBox(this); |
3697 | parseAtom(box: &leftBox); |
3698 | leftBox.cat(b: rightBox); |
3699 | leftBox.opt(); |
3700 | rightBox = leftBox; |
3701 | } |
3702 | for (i = 0; i < alpha; i++) { |
3703 | YYREDO(); |
3704 | Box leftBox(this); |
3705 | parseAtom(box: &leftBox); |
3706 | leftBox.cat(b: rightBox); |
3707 | rightBox = leftBox; |
3708 | } |
3709 | rightBox.cat(b: *box); |
3710 | *box = rightBox; |
3711 | #endif |
3712 | yyTok = getToken(); |
3713 | #ifndef QT_NO_REGEXP_INTERVAL |
3714 | yyMayCapture = mayCapture; |
3715 | #endif |
3716 | } |
3717 | #undef YYREDO |
3718 | #ifndef QT_NO_REGEXP_CAPTURE |
3719 | if (greedyQuantifiers) |
3720 | finishAtom(atom: outerAtom, needCapture: hasQuantifier); |
3721 | #endif |
3722 | } |
3723 | |
3724 | void QRegExpEngine::parseTerm(Box *box) |
3725 | { |
3726 | #ifndef QT_NO_REGEXP_OPTIM |
3727 | if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) |
3728 | parseFactor(box); |
3729 | #endif |
3730 | while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) { |
3731 | Box rightBox(this); |
3732 | parseFactor(box: &rightBox); |
3733 | box->cat(b: rightBox); |
3734 | } |
3735 | } |
3736 | |
3737 | void QRegExpEngine::parseExpression(Box *box) |
3738 | { |
3739 | parseTerm(box); |
3740 | while (yyTok == Tok_Bar) { |
3741 | #ifndef QT_NO_REGEXP_OPTIM |
3742 | trivial = false; |
3743 | #endif |
3744 | Box rightBox(this); |
3745 | yyTok = getToken(); |
3746 | parseTerm(box: &rightBox); |
3747 | box->orx(b: rightBox); |
3748 | } |
3749 | } |
3750 | |
3751 | /* |
3752 | The struct QRegExpPrivate contains the private data of a regular |
3753 | expression other than the automaton. It makes it possible for many |
3754 | QRegExp objects to use the same QRegExpEngine object with different |
3755 | QRegExpPrivate objects. |
3756 | */ |
3757 | struct QRegExpPrivate |
3758 | { |
3759 | QRegExpEngine *eng; |
3760 | QRegExpEngineKey engineKey; |
3761 | bool minimal; |
3762 | #ifndef QT_NO_REGEXP_CAPTURE |
3763 | QString t; // last string passed to QRegExp::indexIn() or lastIndexIn() |
3764 | QStringList capturedCache; // what QRegExp::capturedTexts() returned last |
3765 | #endif |
3766 | QRegExpMatchState matchState; |
3767 | |
3768 | inline QRegExpPrivate() |
3769 | : eng(nullptr), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { } |
3770 | inline QRegExpPrivate(const QRegExpEngineKey &key) |
3771 | : eng(nullptr), engineKey(key), minimal(false) {} |
3772 | }; |
3773 | |
3774 | #if !defined(QT_NO_REGEXP_OPTIM) |
3775 | struct QRECache |
3776 | { |
3777 | typedef QHash<QRegExpEngineKey, QRegExpEngine *> EngineCache; |
3778 | typedef QCache<QRegExpEngineKey, QRegExpEngine> UnusedEngineCache; |
3779 | EngineCache usedEngines; |
3780 | UnusedEngineCache unusedEngines; |
3781 | }; |
3782 | Q_GLOBAL_STATIC(QRECache, engineCache) |
3783 | static QBasicMutex engineCacheMutex; |
3784 | #endif // QT_NO_REGEXP_OPTIM |
3785 | |
3786 | static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key) |
3787 | { |
3788 | #if !defined(QT_NO_REGEXP_OPTIM) |
3789 | const auto locker = qt_scoped_lock(mutex&: engineCacheMutex); |
3790 | if (!eng->ref.deref()) { |
3791 | if (QRECache *c = engineCache()) { |
3792 | c->unusedEngines.insert(key, object: eng, cost: 4 + key.pattern.size() / 4); |
3793 | c->usedEngines.remove(key); |
3794 | } else { |
3795 | delete eng; |
3796 | } |
3797 | } |
3798 | #else |
3799 | Q_UNUSED(key); |
3800 | if (!eng->ref.deref()) |
3801 | delete eng; |
3802 | #endif |
3803 | } |
3804 | |
3805 | static void prepareEngine_helper(QRegExpPrivate *priv) |
3806 | { |
3807 | Q_ASSERT(!priv->eng); |
3808 | |
3809 | #if !defined(QT_NO_REGEXP_OPTIM) |
3810 | const auto locker = qt_scoped_lock(mutex&: engineCacheMutex); |
3811 | if (QRECache *c = engineCache()) { |
3812 | priv->eng = c->unusedEngines.take(key: priv->engineKey); |
3813 | if (!priv->eng) |
3814 | priv->eng = c->usedEngines.value(key: priv->engineKey); |
3815 | if (!priv->eng) |
3816 | priv->eng = new QRegExpEngine(priv->engineKey); |
3817 | else |
3818 | priv->eng->ref.ref(); |
3819 | |
3820 | c->usedEngines.insert(key: priv->engineKey, value: priv->eng); |
3821 | return; |
3822 | } |
3823 | #endif // QT_NO_REGEXP_OPTIM |
3824 | |
3825 | priv->eng = new QRegExpEngine(priv->engineKey); |
3826 | } |
3827 | |
3828 | inline static void prepareEngine(QRegExpPrivate *priv) |
3829 | { |
3830 | if (priv->eng) |
3831 | return; |
3832 | prepareEngine_helper(priv); |
3833 | priv->matchState.prepareForMatch(eng: priv->eng); |
3834 | } |
3835 | |
3836 | static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str) |
3837 | { |
3838 | prepareEngine(priv); |
3839 | priv->matchState.prepareForMatch(eng: priv->eng); |
3840 | #ifndef QT_NO_REGEXP_CAPTURE |
3841 | priv->t = str; |
3842 | priv->capturedCache.clear(); |
3843 | #else |
3844 | Q_UNUSED(str); |
3845 | #endif |
3846 | } |
3847 | |
3848 | static void invalidateEngine(QRegExpPrivate *priv) |
3849 | { |
3850 | if (priv->eng) { |
3851 | derefEngine(eng: priv->eng, key: priv->engineKey); |
3852 | priv->eng = nullptr; |
3853 | priv->matchState.drain(); |
3854 | } |
3855 | } |
3856 | |
3857 | /*! |
3858 | \enum QRegExp::CaretMode |
3859 | |
3860 | The CaretMode enum defines the different meanings of the caret |
3861 | (\b{^}) in a regular expression. The possible values are: |
3862 | |
3863 | \value CaretAtZero |
3864 | The caret corresponds to index 0 in the searched string. |
3865 | |
3866 | \value CaretAtOffset |
3867 | The caret corresponds to the start offset of the search. |
3868 | |
3869 | \value CaretWontMatch |
3870 | The caret never matches. |
3871 | */ |
3872 | |
3873 | /*! |
3874 | \enum QRegExp::PatternSyntax |
3875 | |
3876 | The syntax used to interpret the meaning of the pattern. |
3877 | |
3878 | \value RegExp A rich Perl-like pattern matching syntax. This is |
3879 | the default. |
3880 | |
3881 | \value RegExp2 Like RegExp, but with \l{greedy quantifiers}. |
3882 | (Introduced in Qt 4.2.) |
3883 | |
3884 | \value Wildcard This provides a simple pattern matching syntax |
3885 | similar to that used by shells (command interpreters) for "file |
3886 | globbing". See \l{QRegExp wildcard matching}. |
3887 | |
3888 | \value WildcardUnix This is similar to Wildcard but with the |
3889 | behavior of a Unix shell. The wildcard characters can be escaped |
3890 | with the character "\\". |
3891 | |
3892 | \value FixedString The pattern is a fixed string. This is |
3893 | equivalent to using the RegExp pattern on a string in |
3894 | which all metacharacters are escaped using escape(). |
3895 | |
3896 | \value W3CXmlSchema11 The pattern is a regular expression as |
3897 | defined by the W3C XML Schema 1.1 specification. |
3898 | |
3899 | \sa setPatternSyntax() |
3900 | */ |
3901 | |
3902 | /*! |
3903 | Constructs an empty regexp. |
3904 | |
3905 | \sa isValid(), errorString() |
3906 | */ |
3907 | QRegExp::QRegExp() |
3908 | { |
3909 | priv = new QRegExpPrivate; |
3910 | prepareEngine(priv); |
3911 | } |
3912 | |
3913 | /*! |
3914 | Constructs a regular expression object for the given \a pattern |
3915 | string. The pattern must be given using wildcard notation if \a |
3916 | syntax is \l Wildcard; the default is \l RegExp. The pattern is |
3917 | case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is |
3918 | greedy (maximal), but can be changed by calling |
3919 | setMinimal(). |
3920 | |
3921 | \sa setPattern(), setCaseSensitivity(), setPatternSyntax() |
3922 | */ |
3923 | QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax) |
3924 | { |
3925 | priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs)); |
3926 | prepareEngine(priv); |
3927 | } |
3928 | |
3929 | /*! |
3930 | Constructs a regular expression as a copy of \a rx. |
3931 | |
3932 | \sa operator=() |
3933 | */ |
3934 | QRegExp::QRegExp(const QRegExp &rx) |
3935 | { |
3936 | priv = new QRegExpPrivate; |
3937 | operator=(rx); |
3938 | } |
3939 | |
3940 | /*! |
3941 | Destroys the regular expression and cleans up its internal data. |
3942 | */ |
3943 | QRegExp::~QRegExp() |
3944 | { |
3945 | invalidateEngine(priv); |
3946 | delete priv; |
3947 | } |
3948 | |
3949 | /*! |
3950 | Copies the regular expression \a rx and returns a reference to the |
3951 | copy. The case sensitivity, wildcard, and minimal matching options |
3952 | are also copied. |
3953 | */ |
3954 | QRegExp &QRegExp::operator=(const QRegExp &rx) |
3955 | { |
3956 | prepareEngine(priv: rx.priv); // to allow sharing |
3957 | QRegExpEngine *otherEng = rx.priv->eng; |
3958 | if (otherEng) |
3959 | otherEng->ref.ref(); |
3960 | invalidateEngine(priv); |
3961 | priv->eng = otherEng; |
3962 | priv->engineKey = rx.priv->engineKey; |
3963 | priv->minimal = rx.priv->minimal; |
3964 | #ifndef QT_NO_REGEXP_CAPTURE |
3965 | priv->t = rx.priv->t; |
3966 | priv->capturedCache = rx.priv->capturedCache; |
3967 | #endif |
3968 | if (priv->eng) |
3969 | priv->matchState.prepareForMatch(eng: priv->eng); |
3970 | priv->matchState.captured = rx.priv->matchState.captured; |
3971 | return *this; |
3972 | } |
3973 | |
3974 | /*! |
3975 | \fn QRegExp &QRegExp::operator=(QRegExp &&other) |
3976 | |
3977 | Move-assigns \a other to this QRegExp instance. |
3978 | |
3979 | \since 5.2 |
3980 | */ |
3981 | |
3982 | /*! |
3983 | \fn void QRegExp::swap(QRegExp &other) |
3984 | \since 4.8 |
3985 | |
3986 | Swaps regular expression \a other with this regular |
3987 | expression. This operation is very fast and never fails. |
3988 | */ |
3989 | |
3990 | /*! |
3991 | Returns \c true if this regular expression is equal to \a rx; |
3992 | otherwise returns \c false. |
3993 | |
3994 | Two QRegExp objects are equal if they have the same pattern |
3995 | strings and the same settings for case sensitivity, wildcard and |
3996 | minimal matching. |
3997 | */ |
3998 | bool QRegExp::operator==(const QRegExp &rx) const |
3999 | { |
4000 | return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal; |
4001 | } |
4002 | |
4003 | /*! |
4004 | \since 5.6 |
4005 | \relates QRegExp |
4006 | |
4007 | Returns the hash value for \a key, using |
4008 | \a seed to seed the calculation. |
4009 | */ |
4010 | size_t qHash(const QRegExp &key, size_t seed) noexcept |
4011 | { |
4012 | QtPrivate::QHashCombine hash; |
4013 | seed = hash(seed, key.priv->engineKey); |
4014 | seed = hash(seed, key.priv->minimal); |
4015 | return seed; |
4016 | } |
4017 | |
4018 | /*! |
4019 | \fn bool QRegExp::operator!=(const QRegExp &rx) const |
4020 | |
4021 | Returns \c true if this regular expression is not equal to \a rx; |
4022 | otherwise returns \c false. |
4023 | |
4024 | \sa operator==() |
4025 | */ |
4026 | |
4027 | /*! |
    Returns \c true if the pattern string is empty; otherwise returns
    \c false.
4030 | |
4031 | If you call exactMatch() with an empty pattern on an empty string |
4032 | it will return true; otherwise it returns \c false since it operates |
4033 | over the whole string. If you call indexIn() with an empty pattern |
4034 | on \e any string it will return the start offset (0 by default) |
4035 | because the empty pattern matches the 'emptiness' at the start of |
4036 | the string. In this case the length of the match returned by |
4037 | matchedLength() will be 0. |
4038 | |
4039 | See QString::isEmpty(). |
4040 | */ |
4041 | |
4042 | bool QRegExp::isEmpty() const |
4043 | { |
4044 | return priv->engineKey.pattern.isEmpty(); |
4045 | } |
4046 | |
4047 | /*! |
    Returns \c true if the regular expression is valid; otherwise returns
    \c false. An invalid regular expression never matches.
4050 | |
4051 | The pattern \b{[a-z} is an example of an invalid pattern, since |
4052 | it lacks a closing square bracket. |
4053 | |
4054 | Note that the validity of a regexp may also depend on the setting |
4055 | of the wildcard flag, for example \b{*.html} is a valid |
4056 | wildcard regexp but an invalid full regexp. |
4057 | |
4058 | \sa errorString() |
4059 | */ |
4060 | bool QRegExp::isValid() const |
4061 | { |
4062 | if (priv->engineKey.pattern.isEmpty()) { |
4063 | return true; |
4064 | } else { |
4065 | prepareEngine(priv); |
4066 | return priv->eng->isValid(); |
4067 | } |
4068 | } |
4069 | |
4070 | /*! |
4071 | Returns the pattern string of the regular expression. The pattern |
4072 | has either regular expression syntax or wildcard syntax, depending |
4073 | on patternSyntax(). |
4074 | |
4075 | \sa patternSyntax(), caseSensitivity() |
4076 | */ |
4077 | QString QRegExp::pattern() const |
4078 | { |
4079 | return priv->engineKey.pattern; |
4080 | } |
4081 | |
4082 | /*! |
4083 | Sets the pattern string to \a pattern. The case sensitivity, |
4084 | wildcard, and minimal matching options are not changed. |
4085 | |
4086 | \sa setPatternSyntax(), setCaseSensitivity() |
4087 | */ |
4088 | void QRegExp::setPattern(const QString &pattern) |
4089 | { |
4090 | if (priv->engineKey.pattern != pattern) { |
4091 | invalidateEngine(priv); |
4092 | priv->engineKey.pattern = pattern; |
4093 | } |
4094 | } |
4095 | |
4096 | /*! |
4097 | Returns Qt::CaseSensitive if the regexp is matched case |
4098 | sensitively; otherwise returns Qt::CaseInsensitive. |
4099 | |
4100 | \sa patternSyntax(), pattern(), isMinimal() |
4101 | */ |
4102 | Qt::CaseSensitivity QRegExp::caseSensitivity() const |
4103 | { |
4104 | return priv->engineKey.cs; |
4105 | } |
4106 | |
4107 | /*! |
4108 | Sets case sensitive matching to \a cs. |
4109 | |
4110 | If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches |
4111 | \c{readme.txt} but not \c{README.TXT}. |
4112 | |
4113 | \sa setPatternSyntax(), setPattern(), setMinimal() |
4114 | */ |
4115 | void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs) |
4116 | { |
4117 | if ((bool)cs != (bool)priv->engineKey.cs) { |
4118 | invalidateEngine(priv); |
4119 | priv->engineKey.cs = cs; |
4120 | } |
4121 | } |
4122 | |
4123 | /*! |
4124 | Returns the syntax used by the regular expression. The default is |
4125 | QRegExp::RegExp. |
4126 | |
4127 | \sa pattern(), caseSensitivity() |
4128 | */ |
4129 | QRegExp::PatternSyntax QRegExp::patternSyntax() const |
4130 | { |
4131 | return priv->engineKey.patternSyntax; |
4132 | } |
4133 | |
4134 | /*! |
4135 | Sets the syntax mode for the regular expression. The default is |
4136 | QRegExp::RegExp. |
4137 | |
4138 | Setting \a syntax to QRegExp::Wildcard enables simple shell-like |
4139 | \l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the |
4140 | string \c{readme.txt} in wildcard mode, but does not match |
4141 | \c{readme}. |
4142 | |
4143 | Setting \a syntax to QRegExp::FixedString means that the pattern |
4144 | is interpreted as a plain string. Special characters (e.g., |
4145 | backslash) don't need to be escaped then. |
4146 | |
4147 | \sa setPattern(), setCaseSensitivity(), escape() |
4148 | */ |
4149 | void QRegExp::setPatternSyntax(PatternSyntax syntax) |
4150 | { |
4151 | if (syntax != priv->engineKey.patternSyntax) { |
4152 | invalidateEngine(priv); |
4153 | priv->engineKey.patternSyntax = syntax; |
4154 | } |
4155 | } |
4156 | |
4157 | /*! |
4158 | Returns \c true if minimal (non-greedy) matching is enabled; |
4159 | otherwise returns \c false. |
4160 | |
4161 | \sa caseSensitivity(), setMinimal() |
4162 | */ |
4163 | bool QRegExp::isMinimal() const |
4164 | { |
4165 | return priv->minimal; |
4166 | } |
4167 | |
4168 | /*! |
4169 | Enables or disables minimal matching. If \a minimal is false, |
4170 | matching is greedy (maximal) which is the default. |
4171 | |
4172 | For example, suppose we have the input string "We must be |
4173 | <b>bold</b>, very <b>bold</b>!" and the pattern |
4174 | \b{<b>.*</b>}. With the default greedy (maximal) matching, |
4175 | the match is "We must be \underline{<b>bold</b>, very |
4176 | <b>bold</b>}!". But with minimal (non-greedy) matching, the |
4177 | first match is: "We must be \underline{<b>bold</b>}, very |
4178 | <b>bold</b>!" and the second match is "We must be <b>bold</b>, |
4179 | very \underline{<b>bold</b>}!". In practice we might use the pattern |
4180 | \b{<b>[^<]*\</b>} instead, although this will still fail for |
4181 | nested tags. |
4182 | |
4183 | \sa setCaseSensitivity() |
4184 | */ |
4185 | void QRegExp::setMinimal(bool minimal) |
4186 | { |
4187 | priv->minimal = minimal; |
4188 | } |
4189 | |
4190 | // ### Qt 5: make non-const |
4191 | /*! |
4192 | Returns \c true if \a str is matched exactly by this regular |
4193 | expression; otherwise returns \c false. You can determine how much of |
4194 | the string was matched by calling matchedLength(). |
4195 | |
4196 | For a given regexp string R, exactMatch("R") is the equivalent of |
4197 | indexIn("^R$") since exactMatch() effectively encloses the regexp |
4198 | in the start of string and end of string anchors, except that it |
4199 | sets matchedLength() differently. |
4200 | |
4201 | For example, if the regular expression is \b{blue}, then |
4202 | exactMatch() returns \c true only for input \c blue. For inputs \c |
4203 | bluebell, \c blutak and \c lightblue, exactMatch() returns \c false |
4204 | and matchedLength() will return 4, 3 and 0 respectively. |
4205 | |
4206 | Although const, this function sets matchedLength(), |
4207 | capturedTexts(), and pos(). |
4208 | |
4209 | \sa indexIn(), lastIndexIn() |
4210 | */ |
4211 | bool QRegExp::exactMatch(const QString &str) const |
4212 | { |
4213 | prepareEngineForMatch(priv, str); |
4214 | priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: 0, minimal0: priv->minimal, oneTest: true, caretIndex: 0); |
4215 | if (priv->matchState.captured[1] == str.size()) { |
4216 | return true; |
4217 | } else { |
4218 | priv->matchState.captured[0] = 0; |
4219 | priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen; |
4220 | return false; |
4221 | } |
4222 | } |
4223 | |
4224 | /*! |
    Returns the regexp as a QVariant.
4226 | */ |
4227 | QRegExp::operator QVariant() const |
4228 | { |
4229 | QT_WARNING_PUSH QT_WARNING_DISABLE_DEPRECATED |
4230 | QVariant v; |
4231 | v.setValue(*this); |
4232 | return v; |
4233 | QT_WARNING_POP |
4234 | } |
4235 | |
4236 | // ### Qt 5: make non-const |
4237 | /*! |
4238 | Attempts to find a match in \a str from position \a offset (0 by |
4239 | default). If \a offset is -1, the search starts at the last |
4240 | character; if -2, at the next to last character; etc. |
4241 | |
4242 | Returns the position of the first match, or -1 if there was no |
4243 | match. |
4244 | |
4245 | The \a caretMode parameter can be used to instruct whether \b{^} |
4246 | should match at index 0 or at \a offset. |
4247 | |
4248 | You might prefer to use QString::indexOf(), QString::contains(), |
4249 | or even QStringList::filter(). To replace matches use |
4250 | QString::replace(). |
4251 | |
4252 | Example: |
4253 | \snippet code/src_corelib_text_qregexp.cpp 13 |
4254 | |
4255 | Although const, this function sets matchedLength(), |
4256 | capturedTexts() and pos(). |
4257 | |
    If the QRegExp is a wildcard expression (see setPatternSyntax())
    and you want to test a string against the whole wildcard expression,
    use exactMatch() instead of this function.
4261 | |
4262 | \sa lastIndexIn(), exactMatch() |
4263 | */ |
4264 | |
4265 | int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const |
4266 | { |
4267 | prepareEngineForMatch(priv, str); |
4268 | if (offset < 0) |
4269 | offset += str.size(); |
4270 | priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: offset, |
4271 | minimal0: priv->minimal, oneTest: false, caretIndex: caretIndex(offset, caretMode)); |
4272 | return priv->matchState.captured[0]; |
4273 | } |
4274 | |
4275 | // ### Qt 5: make non-const |
4276 | /*! |
4277 | Attempts to find a match backwards in \a str from position \a |
4278 | offset. If \a offset is -1 (the default), the search starts at the |
4279 | last character; if -2, at the next to last character; etc. |
4280 | |
    Returns the position of the first match found when searching
    backwards, or -1 if there was no match.
4283 | |
4284 | The \a caretMode parameter can be used to instruct whether \b{^} |
4285 | should match at index 0 or at \a offset. |
4286 | |
4287 | Although const, this function sets matchedLength(), |
4288 | capturedTexts() and pos(). |
4289 | |
4290 | \warning Searching backwards is much slower than searching |
4291 | forwards. |
4292 | |
4293 | \sa indexIn(), exactMatch() |
4294 | */ |
4295 | |
4296 | int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const |
4297 | { |
4298 | prepareEngineForMatch(priv, str); |
4299 | if (offset < 0) |
4300 | offset += str.size(); |
4301 | if (offset < 0 || offset > str.size()) { |
4302 | memset(s: priv->matchState.captured, c: -1, n: priv->matchState.capturedSize*sizeof(int)); |
4303 | return -1; |
4304 | } |
4305 | |
4306 | while (offset >= 0) { |
4307 | priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: offset, |
4308 | minimal0: priv->minimal, oneTest: true, caretIndex: caretIndex(offset, caretMode)); |
4309 | if (priv->matchState.captured[0] == offset) |
4310 | return offset; |
4311 | --offset; |
4312 | } |
4313 | return -1; |
4314 | } |
4315 | |
4316 | /*! |
4317 | Returns the length of the last matched string, or -1 if there was |
4318 | no match. |
4319 | |
4320 | \sa exactMatch(), indexIn(), lastIndexIn() |
4321 | */ |
4322 | int QRegExp::matchedLength() const |
4323 | { |
4324 | return priv->matchState.captured[1]; |
4325 | } |
4326 | |
4327 | |
4328 | /*! |
4329 | Replaces every occurrence of this regular expression in |
4330 | \a str with \a after and returns the result. |
4331 | |
4332 | For regular expressions containing \l{capturing parentheses}, |
4333 | occurrences of \b{\\1}, \b{\\2}, ..., in \a after are replaced |
4334 | with \c {rx}.cap(1), cap(2), ... |
4335 | |
4336 | \sa indexIn(), lastIndexIn(), QRegExp::cap() |
4337 | */ |
4338 | QString QRegExp::replaceIn(const QString &str, const QString &after) const |
4339 | { |
4340 | struct QStringCapture |
4341 | { |
4342 | int pos; |
4343 | int len; |
4344 | int no; |
4345 | }; |
4346 | |
4347 | QRegExp rx2(*this); |
4348 | |
4349 | if (str.isEmpty() && rx2.indexIn(str) == -1) |
4350 | return str; |
4351 | |
4352 | QString s(str); |
4353 | |
4354 | int index = 0; |
4355 | int numCaptures = rx2.captureCount(); |
4356 | int al = after.size(); |
4357 | QRegExp::CaretMode caretMode = QRegExp::CaretAtZero; |
4358 | |
4359 | if (numCaptures > 0) { |
4360 | const QChar *uc = after.unicode(); |
4361 | int numBackRefs = 0; |
4362 | |
4363 | for (int i = 0; i < al - 1; i++) { |
4364 | if (uc[i] == QLatin1Char('\\')) { |
4365 | int no = uc[i + 1].digitValue(); |
4366 | if (no > 0 && no <= numCaptures) |
4367 | numBackRefs++; |
4368 | } |
4369 | } |
4370 | |
4371 | /* |
4372 | This is the harder case where we have back-references. |
4373 | */ |
4374 | if (numBackRefs > 0) { |
4375 | QVarLengthArray<QStringCapture, 16> captures(numBackRefs); |
4376 | int j = 0; |
4377 | |
4378 | for (int i = 0; i < al - 1; i++) { |
4379 | if (uc[i] == QLatin1Char('\\')) { |
4380 | int no = uc[i + 1].digitValue(); |
4381 | if (no > 0 && no <= numCaptures) { |
4382 | QStringCapture capture; |
4383 | capture.pos = i; |
4384 | capture.len = 2; |
4385 | |
4386 | if (i < al - 2) { |
4387 | int secondDigit = uc[i + 2].digitValue(); |
4388 | if (secondDigit != -1 && ((no * 10) + secondDigit) <= numCaptures) { |
4389 | no = (no * 10) + secondDigit; |
4390 | ++capture.len; |
4391 | } |
4392 | } |
4393 | |
4394 | capture.no = no; |
4395 | captures[j++] = capture; |
4396 | } |
4397 | } |
4398 | } |
4399 | |
4400 | while (index <= s.size()) { |
4401 | index = rx2.indexIn(str: s, offset: index, caretMode); |
4402 | if (index == -1) |
4403 | break; |
4404 | |
4405 | QString after2(after); |
4406 | for (j = numBackRefs - 1; j >= 0; j--) { |
4407 | const QStringCapture &capture = captures[j]; |
4408 | after2.replace(i: capture.pos, len: capture.len, after: rx2.cap(nth: capture.no)); |
4409 | } |
4410 | |
4411 | s.replace(i: index, len: rx2.matchedLength(), after: after2); |
4412 | index += after2.size(); |
4413 | |
4414 | // avoid infinite loop on 0-length matches (e.g., QRegExp("[a-z]*")) |
4415 | if (rx2.matchedLength() == 0) |
4416 | ++index; |
4417 | |
4418 | caretMode = QRegExp::CaretWontMatch; |
4419 | } |
4420 | return s; |
4421 | } |
4422 | } |
4423 | |
4424 | /* |
4425 | This is the simple and optimized case where we don't have |
4426 | back-references. |
4427 | */ |
4428 | while (index != -1) { |
4429 | struct { |
4430 | int pos; |
4431 | int length; |
4432 | } replacements[2048]; |
4433 | |
4434 | int pos = 0; |
4435 | int adjust = 0; |
4436 | while (pos < 2047) { |
4437 | index = rx2.indexIn(str: s, offset: index, caretMode); |
4438 | if (index == -1) |
4439 | break; |
4440 | int ml = rx2.matchedLength(); |
4441 | replacements[pos].pos = index; |
4442 | replacements[pos++].length = ml; |
4443 | index += ml; |
4444 | adjust += al - ml; |
4445 | // avoid infinite loop |
4446 | if (!ml) |
4447 | index++; |
4448 | } |
4449 | if (!pos) |
4450 | break; |
4451 | replacements[pos].pos = s.size(); |
4452 | int newlen = s.size() + adjust; |
4453 | |
4454 | // to continue searching at the right position after we did |
4455 | // the first round of replacements |
4456 | if (index != -1) |
4457 | index += adjust; |
4458 | QString newstring; |
4459 | newstring.reserve(asize: newlen + 1); |
4460 | QChar *newuc = newstring.data(); |
4461 | QChar *uc = newuc; |
4462 | int copystart = 0; |
4463 | int i = 0; |
4464 | while (i < pos) { |
4465 | int copyend = replacements[i].pos; |
4466 | int size = copyend - copystart; |
4467 | memcpy(dest: static_cast<void*>(uc), src: static_cast<const void *>(s.constData() + copystart), n: size * sizeof(QChar)); |
4468 | uc += size; |
4469 | memcpy(dest: static_cast<void *>(uc), src: static_cast<const void *>(after.constData()), n: al * sizeof(QChar)); |
4470 | uc += al; |
4471 | copystart = copyend + replacements[i].length; |
4472 | i++; |
4473 | } |
4474 | memcpy(dest: static_cast<void *>(uc), src: static_cast<const void *>(s.constData() + copystart), n: (s.size() - copystart) * sizeof(QChar)); |
4475 | newstring.resize(size: newlen); |
4476 | s = newstring; |
4477 | caretMode = QRegExp::CaretWontMatch; |
4478 | } |
4479 | return s; |
4480 | |
4481 | } |
4482 | |
4483 | |
4484 | /*! |
4485 | \fn QString QRegExp::removeIn(const QString &str) const |
4486 | |
4487 | Removes every occurrence of this regular expression in \a str, and |
4488 | returns the result. |
4489 | |
4490 | Does the same as replaceIn(str, QString()). |
4491 | |
4492 | \sa indexIn(), lastIndexIn(), replaceIn() |
4493 | */ |
4494 | |
4495 | |
4496 | /*! |
4497 | \fn QString QRegExp::countIn(const QString &str) const |
4498 | |
4499 | Returns the number of times this regular expression matches |
4500 | in \a str. |
4501 | |
4502 | \sa indexIn(), lastIndexIn(), replaceIn() |
4503 | */ |
4504 | |
4505 | int QRegExp::countIn(const QString &str) const |
4506 | { |
4507 | QRegExp rx2(*this); |
4508 | int count = 0; |
4509 | int index = -1; |
4510 | int len = str.size(); |
4511 | while (index < len - 1) { // count overlapping matches |
4512 | index = rx2.indexIn(str, offset: index + 1); |
4513 | if (index == -1) |
4514 | break; |
4515 | count++; |
4516 | } |
4517 | return count; |
4518 | } |
4519 | |
4520 | /*! |
4521 | Splits \a str into substrings wherever this regular expression |
4522 | matches, and returns the list of those strings. If this regular |
4523 | expression does not match anywhere in the string, split() returns a |
4524 | single-element list containing \a str. |
4525 | |
4526 | If \a behavior is set to Qt::KeepEmptyParts, empty fields are |
4527 | included in the resulting list. |
4528 | |
4529 | \sa QStringList::join(), QString::split() |
4530 | */ |
4531 | QStringList QRegExp::splitString(const QString &str, Qt::SplitBehavior behavior) const |
4532 | { |
4533 | QRegExp rx2(*this); |
4534 | QStringList list; |
4535 | int start = 0; |
4536 | int = 0; |
4537 | int end; |
4538 | while ((end = rx2.indexIn(str, offset: start + extra)) != -1) { |
4539 | int matchedLen = rx2.matchedLength(); |
4540 | if (start != end || behavior == Qt::KeepEmptyParts) |
4541 | list.append(t: str.mid(position: start, n: end - start)); |
4542 | start = end + matchedLen; |
4543 | extra = (matchedLen == 0) ? 1 : 0; |
4544 | } |
4545 | if (start != str.size() || behavior == Qt::KeepEmptyParts) |
4546 | list.append(t: str.mid(position: start, n: -1)); |
4547 | return list; |
4548 | } |
4549 | |
4550 | /*! |
4551 | Returns a list of all the strings that match this regular |
4552 | expression in \a stringList. |
4553 | */ |
4554 | QStringList QRegExp::filterList(const QStringList &stringList) const |
4555 | { |
4556 | QStringList res; |
4557 | for (const QString &s : stringList) { |
4558 | if (containedIn(str: s)) |
4559 | res << s; |
4560 | } |
4561 | return res; |
4562 | } |
4563 | |
4564 | /*! |
4565 | Replaces every occurrence of this regexp, in each of \a stringList's |
4566 | with \a after. Returns a reference to the string list. |
4567 | */ |
4568 | QStringList QRegExp::replaceIn(const QStringList &stringList, const QString &after) const |
4569 | { |
4570 | QStringList list; |
4571 | for (const QString &s : stringList) |
4572 | list << replaceIn(str: s, after); |
4573 | return list; |
4574 | } |
4575 | |
4576 | /*! |
4577 | Returns the index position of the first exact match of this regexp in |
4578 | \a list, searching forward from index position \a from. Returns |
4579 | -1 if no item matched. |
4580 | |
4581 | \sa lastIndexIn(), exactMatch() |
4582 | */ |
4583 | int QRegExp::indexIn(const QStringList &list, int from) const |
4584 | { |
4585 | QRegExp rx2(*this); |
4586 | if (from < 0) |
4587 | from = qMax(a: from + list.size(), b: 0); |
4588 | for (int i = from; i < list.size(); ++i) { |
4589 | if (rx2.exactMatch(str: list.at(i))) |
4590 | return i; |
4591 | } |
4592 | return -1; |
4593 | } |
4594 | |
4595 | /*! |
4596 | Returns the index position of the last exact match of this regexp in |
4597 | \a list, searching backward from index position \a from. If \a |
4598 | from is -1 (the default), the search starts at the last item. |
4599 | Returns -1 if no item matched. |
4600 | |
4601 | \sa QRegExp::exactMatch() |
4602 | */ |
4603 | int QRegExp::lastIndexIn(const QStringList &list, int from) const |
4604 | { |
4605 | QRegExp rx2(*this); |
4606 | if (from < 0) |
4607 | from += list.size(); |
4608 | else if (from >= list.size()) |
4609 | from = list.size() - 1; |
4610 | for (int i = from; i >= 0; --i) { |
4611 | if (rx2.exactMatch(str: list.at(i))) |
4612 | return i; |
4613 | } |
4614 | return -1; |
4615 | } |
4616 | |
4617 | #ifndef QT_NO_REGEXP_CAPTURE |
4618 | |
4619 | /*! |
4620 | \since 4.6 |
4621 | Returns the number of captures contained in the regular expression. |
4622 | */ |
4623 | int QRegExp::captureCount() const |
4624 | { |
4625 | prepareEngine(priv); |
4626 | return priv->eng->captureCount(); |
4627 | } |
4628 | |
4629 | /*! |
4630 | Returns a list of the captured text strings. |
4631 | |
4632 | The first string in the list is the entire matched string. Each |
4633 | subsequent list element contains a string that matched a |
4634 | (capturing) subexpression of the regexp. |
4635 | |
4636 | For example: |
4637 | \snippet code/src_corelib_text_qregexp.cpp 14 |
4638 | |
4639 | The above example also captures elements that may be present but |
4640 | which we have no interest in. This problem can be solved by using |
4641 | non-capturing parentheses: |
4642 | |
4643 | \snippet code/src_corelib_text_qregexp.cpp 15 |
4644 | |
4645 | Note that if you want to iterate over the list, you should iterate |
4646 | over a copy, e.g. |
4647 | \snippet code/src_corelib_text_qregexp.cpp 16 |
4648 | |
4649 | Some regexps can match an indeterminate number of times. For |
4650 | example if the input string is "Offsets: 12 14 99 231 7" and the |
4651 | regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of |
4652 | all the numbers matched. However, after calling |
4653 | \c{rx.indexIn(str)}, capturedTexts() will return the list ("12", |
4654 | "12"), i.e. the entire match was "12" and the first subexpression |
4655 | matched was "12". The correct approach is to use cap() in a |
4656 | \l{QRegExp#cap_in_a_loop}{loop}. |
4657 | |
4658 | The order of elements in the string list is as follows. The first |
4659 | element is the entire matching string. Each subsequent element |
4660 | corresponds to the next capturing open left parentheses. Thus |
4661 | capturedTexts()[1] is the text of the first capturing parentheses, |
4662 | capturedTexts()[2] is the text of the second and so on |
4663 | (corresponding to $1, $2, etc., in some other regexp languages). |
4664 | |
4665 | \sa cap(), pos() |
4666 | */ |
4667 | QStringList QRegExp::capturedTexts() const |
4668 | { |
4669 | if (priv->capturedCache.isEmpty()) { |
4670 | prepareEngine(priv); |
4671 | const int *captured = priv->matchState.captured; |
4672 | int n = priv->matchState.capturedSize; |
4673 | |
4674 | for (int i = 0; i < n; i += 2) { |
4675 | QString m; |
4676 | if (captured[i + 1] == 0) |
4677 | m = QLatin1String("" ); // ### Qt 5: don't distinguish between null and empty |
4678 | else if (captured[i] >= 0) |
4679 | m = priv->t.mid(position: captured[i], n: captured[i + 1]); |
4680 | priv->capturedCache.append(t: m); |
4681 | } |
4682 | priv->t.clear(); |
4683 | } |
4684 | return priv->capturedCache; |
4685 | } |
4686 | |
4687 | /*! |
4688 | \internal |
4689 | */ |
4690 | QStringList QRegExp::capturedTexts() |
4691 | { |
4692 | return const_cast<const QRegExp *>(this)->capturedTexts(); |
4693 | } |
4694 | |
4695 | /*! |
4696 | Returns the text captured by the \a nth subexpression. The entire |
4697 | match has index 0 and the parenthesized subexpressions have |
4698 | indexes starting from 1 (excluding non-capturing parentheses). |
4699 | |
4700 | \snippet code/src_corelib_text_qregexp.cpp 17 |
4701 | |
4702 | The order of elements matched by cap() is as follows. The first |
4703 | element, cap(0), is the entire matching string. Each subsequent |
4704 | element corresponds to the next capturing open left parentheses. |
4705 | Thus cap(1) is the text of the first capturing parentheses, cap(2) |
4706 | is the text of the second, and so on. |
4707 | |
4708 | \sa capturedTexts(), pos() |
4709 | */ |
4710 | QString QRegExp::cap(int nth) const |
4711 | { |
4712 | return capturedTexts().value(i: nth); |
4713 | } |
4714 | |
4715 | /*! |
4716 | \internal |
4717 | */ |
4718 | QString QRegExp::cap(int nth) |
4719 | { |
4720 | return const_cast<const QRegExp *>(this)->cap(nth); |
4721 | } |
4722 | |
4723 | /*! |
4724 | Returns the position of the \a nth captured text in the searched |
4725 | string. If \a nth is 0 (the default), pos() returns the position |
4726 | of the whole match. |
4727 | |
4728 | Example: |
4729 | \snippet code/src_corelib_text_qregexp.cpp 18 |
4730 | |
4731 | For zero-length matches, pos() always returns -1. (For example, if |
4732 | cap(4) would return an empty string, pos(4) returns -1.) This is |
4733 | a feature of the implementation. |
4734 | |
4735 | \sa cap(), capturedTexts() |
4736 | */ |
4737 | int QRegExp::pos(int nth) const |
4738 | { |
4739 | if (nth < 0 || nth >= priv->matchState.capturedSize / 2) |
4740 | return -1; |
4741 | else |
4742 | return priv->matchState.captured[2 * nth]; |
4743 | } |
4744 | |
4745 | /*! |
4746 | \internal |
4747 | */ |
4748 | int QRegExp::pos(int nth) |
4749 | { |
4750 | return const_cast<const QRegExp *>(this)->pos(nth); |
4751 | } |
4752 | |
4753 | /*! |
4754 | Returns a text string that explains why a regexp pattern is |
4755 | invalid the case being; otherwise returns "no error occurred". |
4756 | |
4757 | \sa isValid() |
4758 | */ |
4759 | QString QRegExp::errorString() const |
4760 | { |
4761 | if (isValid()) { |
4762 | return QString::fromLatin1(RXERR_OK); |
4763 | } else { |
4764 | return priv->eng->errorString(); |
4765 | } |
4766 | } |
4767 | |
4768 | /*! |
4769 | \internal |
4770 | */ |
4771 | QString QRegExp::errorString() |
4772 | { |
4773 | return const_cast<const QRegExp *>(this)->errorString(); |
4774 | } |
4775 | |
4776 | #endif |
4777 | |
4778 | /*! |
4779 | Returns the string \a str with every regexp special character |
4780 | escaped with a backslash. The special characters are $, (,), *, +, |
4781 | ., ?, [, \,], ^, {, | and }. |
4782 | |
4783 | Example: |
4784 | |
4785 | \snippet code/src_corelib_text_qregexp.cpp 19 |
4786 | |
4787 | This function is useful to construct regexp patterns dynamically: |
4788 | |
4789 | \snippet code/src_corelib_text_qregexp.cpp 20 |
4790 | |
4791 | \sa setPatternSyntax() |
4792 | */ |
4793 | QString QRegExp::escape(const QString &str) |
4794 | { |
4795 | QString quoted; |
4796 | const int count = str.size(); |
4797 | quoted.reserve(asize: count * 2); |
4798 | const QLatin1Char backslash('\\'); |
4799 | for (int i = 0; i < count; i++) { |
4800 | switch (str.at(i).toLatin1()) { |
4801 | case '$': |
4802 | case '(': |
4803 | case ')': |
4804 | case '*': |
4805 | case '+': |
4806 | case '.': |
4807 | case '?': |
4808 | case '[': |
4809 | case '\\': |
4810 | case ']': |
4811 | case '^': |
4812 | case '{': |
4813 | case '|': |
4814 | case '}': |
4815 | quoted.append(c: backslash); |
4816 | } |
4817 | quoted.append(c: str.at(i)); |
4818 | } |
4819 | return quoted; |
4820 | } |
4821 | |
4822 | |
4823 | #ifndef QT_NO_DATASTREAM |
4824 | /*! |
4825 | \relates QRegExp |
4826 | |
4827 | Writes the regular expression \a regExp to stream \a out. |
4828 | |
4829 | \sa {Serializing Qt Data Types} |
4830 | */ |
4831 | QDataStream &operator<<(QDataStream &out, const QRegExp ®Exp) |
4832 | { |
4833 | return out << regExp.pattern() << (quint8)regExp.caseSensitivity() |
4834 | << (quint8)regExp.patternSyntax() |
4835 | << (quint8)!!regExp.isMinimal(); |
4836 | } |
4837 | |
4838 | /*! |
4839 | \relates QRegExp |
4840 | |
4841 | Reads a regular expression from stream \a in into \a regExp. |
4842 | |
4843 | \sa {Serializing Qt Data Types} |
4844 | */ |
4845 | QDataStream &operator>>(QDataStream &in, QRegExp ®Exp) |
4846 | { |
4847 | QString pattern; |
4848 | quint8 cs; |
4849 | quint8 patternSyntax; |
4850 | quint8 isMinimal; |
4851 | |
4852 | in >> pattern >> cs >> patternSyntax >> isMinimal; |
4853 | |
4854 | QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs), |
4855 | QRegExp::PatternSyntax(patternSyntax)); |
4856 | |
4857 | newRegExp.setMinimal(isMinimal); |
4858 | regExp = newRegExp; |
4859 | return in; |
4860 | } |
4861 | #endif // QT_NO_DATASTREAM |
4862 | |
4863 | #ifndef QT_NO_DEBUG_STREAM |
4864 | QDebug operator<<(QDebug dbg, const QRegExp &r) |
4865 | { |
4866 | QDebugStateSaver saver(dbg); |
4867 | dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax() |
4868 | << ", pattern='" << r.pattern() << "')" ; |
4869 | return dbg; |
4870 | } |
4871 | #endif |
4872 | |
4873 | QT_END_NAMESPACE |
4874 | |