1// Copyright (C) 2016 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3// Qt-Security score:critical reason:data-parser
4
5#include "qregexp.h"
6
7#include "qalgorithms.h"
8#include "qbitarray.h"
9#include "qcache.h"
10#include "qdatastream.h"
11#include "qdebug.h"
12#include "qhashfunctions.h"
13#include "qlist.h"
14#include "qmap.h"
15#include "qmutex.h"
16#include "qstring.h"
17#include "qstringlist.h"
18#include "qstringmatcher.h"
19#include "private/qlocking_p.h"
20#include "qvarlengtharray.h"
21
22#include <limits.h>
23#include <algorithm>
24#include <optional>
25
26QT_BEGIN_NAMESPACE
27
28// error strings for the regexp parser
29#define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
30#define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
31#define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
32#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
33#define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
34#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
35#define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
36#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
37#define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
38#define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
39#define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
40#define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")
41
42/*!
43 \class QRegExp
44 \inmodule QtCore5Compat
45 \reentrant
46 \brief The QRegExp class provides pattern matching using regular expressions.
47
48 \ingroup tools
49 \ingroup shared
50
51 \keyword regular expression
52
 This class is deprecated in Qt 6. Please use QRegularExpression instead
 for all new code. For guidelines on porting old code from QRegExp to
 QRegularExpression, see \l{Porting to QRegularExpression}.
56
57 A regular expression, or "regexp", is a pattern for matching
58 substrings in a text. This is useful in many contexts, e.g.,
59
60 \table
61 \row \li Validation
62 \li A regexp can test whether a substring meets some criteria,
63 e.g. is an integer or contains no whitespace.
64 \row \li Searching
65 \li A regexp provides more powerful pattern matching than
66 simple substring matching, e.g., match one of the words
67 \e{mail}, \e{letter} or \e{correspondence}, but none of the
68 words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
69 \row \li Search and Replace
70 \li A regexp can replace all occurrences of a substring with a
71 different substring, e.g., replace all occurrences of \e{&}
72 with \e{\&amp;} except where the \e{&} is already followed by
73 an \e{amp;}.
74 \row \li String Splitting
75 \li A regexp can be used to identify where a string should be
76 split apart, e.g. splitting tab-delimited strings.
77 \endtable
78
 This document presents a brief introduction to regexps, a description
 of Qt's regexp language, some examples, and the function
 documentation itself. QRegExp is modeled on Perl's regexp
82 language. It fully supports Unicode. QRegExp can also be used in a
83 simpler, \e{wildcard mode} that is similar to the functionality
84 found in command shells. The syntax rules used by QRegExp can be
85 changed with setPatternSyntax(). In particular, the pattern syntax
86 can be set to QRegExp::FixedString, which means the pattern to be
87 matched is interpreted as a plain string, i.e., special characters
88 (e.g., backslash) are not escaped.
89
90 A good text on regexps is \e {Mastering Regular Expressions}
91 (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.
92
93 \note In Qt 5, the new QRegularExpression class provides a Perl
94 compatible implementation of regular expressions and is recommended
95 in place of QRegExp.
96
97 \section1 Introduction
98
99 Regexps are built up from expressions, quantifiers, and
100 assertions. The simplest expression is a character, e.g. \b{x}
101 or \b{5}. An expression can also be a set of characters
102 enclosed in square brackets. \b{[ABCD]} will match an \b{A}
103 or a \b{B} or a \b{C} or a \b{D}. We can write this same
104 expression as \b{[A-D]}, and an expression to match any
105 capital letter in the English alphabet is written as
106 \b{[A-Z]}.
107
108 A quantifier specifies the number of occurrences of an expression
109 that must be matched. \b{x{1,1}} means match one and only one
110 \b{x}. \b{x{1,5}} means match a sequence of \b{x}
111 characters that contains at least one \b{x} but no more than
112 five.
113
114 Note that in general regexps cannot be used to check for balanced
115 brackets or tags. For example, a regexp can be written to match an
116 opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
117 are not nested, but if the \c{<b>} tags are nested, that same
118 regexp will match an opening \c{<b>} tag with the wrong closing
119 \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the
120 first \c{<b>} would be matched with the first \c{</b>}, which is
121 not correct. However, it is possible to write a regexp that will
122 match nested brackets or tags correctly, but only if the number of
123 nesting levels is fixed and known. If the number of nesting levels
124 is not fixed and known, it is impossible to write a regexp that
125 will not fail.
126
127 Suppose we want a regexp to match integers in the range 0 to 99.
128 At least one digit is required, so we start with the expression
129 \b{[0-9]{1,1}}, which matches a single digit exactly once. This
130 regexp matches integers in the range 0 to 9. To match integers up
131 to 99, increase the maximum number of occurrences to 2, so the
132 regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the
133 original requirement to match integers from 0 to 99, but it will
134 also match integers that occur in the middle of strings. If we
135 want the matched integer to be the whole string, we must use the
136 anchor assertions, \b{^} (caret) and \b{$} (dollar). When
137 \b{^} is the first character in a regexp, it means the regexp
138 must match from the beginning of the string. When \b{$} is the
139 last character of the regexp, it means the regexp must match to
140 the end of the string. The regexp becomes \b{^[0-9]{1,2}$}.
141 Note that assertions, e.g. \b{^} and \b{$}, do not match
142 characters but locations in the string.
143
144 If you have seen regexps described elsewhere, they may have looked
145 different from the ones shown here. This is because some sets of
146 characters and some quantifiers are so common that they have been
147 given special symbols to represent them. \b{[0-9]} can be
148 replaced with the symbol \b{\\d}. The quantifier to match
149 exactly one occurrence, \b{{1,1}}, can be replaced with the
150 expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So
151 our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can
152 also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of
153 the string, match a digit, followed immediately by 0 or 1 digits}.
154 In practice, it would be written as \b{^\\d\\d?$}. The \b{?}
155 is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1
156 occurrences. \b{?} makes an expression optional. The regexp
157 \b{^\\d\\d?$} means \e{From the beginning of the string, match
158 one digit, followed immediately by 0 or 1 more digit, followed
159 immediately by end of string}.
160
161 To write a regexp that matches one of the words 'mail' \e or
162 'letter' \e or 'correspondence' but does not match words that
163 contain these words, e.g., 'email', 'mailman', 'mailer', and
164 'letterbox', start with a regexp that matches 'mail'. Expressed
165 fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
166 a character expression is automatically quantified by
167 \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an
168 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
169 we can use the vertical bar \b{|}, which means \b{or}, to
170 include the other two words, so our regexp for matching any of the
171 three words becomes \b{mail|letter|correspondence}. Match
172 'mail' \b{or} 'letter' \b{or} 'correspondence'. While this
173 regexp will match one of the three words we want to match, it will
174 also match words we don't want to match, e.g., 'email'. To
175 prevent the regexp from matching unwanted words, we must tell it
176 to begin and end the match at word boundaries. First we enclose
177 our regexp in parentheses, \b{(mail|letter|correspondence)}.
178 Parentheses group expressions together, and they identify a part
179 of the regexp that we wish to \l{capturing text}{capture}.
180 Enclosing the expression in parentheses allows us to use it as a
181 component in more complex regexps. It also allows us to examine
182 which of the three words was actually matched. To force the match
183 to begin and end on word boundaries, we enclose the regexp in
184 \b{\\b} \e{word boundary} assertions:
185 \b{\\b(mail|letter|correspondence)\\b}. Now the regexp means:
186 \e{Match a word boundary, followed by the regexp in parentheses,
187 followed by a word boundary}. The \b{\\b} assertion matches a
188 \e position in the regexp, not a \e character. A word boundary is
189 any non-word character, e.g., a space, newline, or the beginning
190 or ending of a string.
191
192 If we want to replace ampersand characters with the HTML entity
193 \b{\&amp;}, the regexp to match is simply \b{\&}. But this
194 regexp will also match ampersands that have already been converted
195 to HTML entities. We want to replace only ampersands that are not
196 already followed by \b{amp;}. For this, we need the negative
197 lookahead assertion, \b{(?!}__\b{)}. The regexp can then be
198 written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
199 \b{not} \e{followed by} \b{amp;}.
200
201 If we want to count all the occurrences of 'Eric' and 'Eirik' in a
202 string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and
203 \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
204 required to avoid matching words that contain either name,
205 e.g. 'Ericsson'. Note that the second regexp matches more
206 spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
207
208 Some of the examples discussed above are implemented in the
209 \l{#code-examples}{code examples} section.
210
211 \target characters-and-abbreviations-for-sets-of-characters
212 \section1 Characters and Abbreviations for Sets of Characters
213
214 \table
215 \header \li Element \li Meaning
216 \row \li \b{c}
217 \li A character represents itself unless it has a special
218 regexp meaning. e.g. \b{c} matches the character \e c.
219 \row \li \b{\\c}
220 \li A character that follows a backslash matches the character
221 itself, except as specified below. e.g., To match a literal
222 caret at the beginning of a string, write \b{\\^}.
223 \row \li \b{\\a}
224 \li Matches the ASCII bell (BEL, 0x07).
225 \row \li \b{\\f}
226 \li Matches the ASCII form feed (FF, 0x0C).
227 \row \li \b{\\n}
228 \li Matches the ASCII line feed (LF, 0x0A, Unix newline).
229 \row \li \b{\\r}
230 \li Matches the ASCII carriage return (CR, 0x0D).
231 \row \li \b{\\t}
232 \li Matches the ASCII horizontal tab (HT, 0x09).
233 \row \li \b{\\v}
234 \li Matches the ASCII vertical tab (VT, 0x0B).
235 \row \li \b{\\x\e{hhhh}}
236 \li Matches the Unicode character corresponding to the
237 hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
238 \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo})
239 \li matches the ASCII/Latin1 character for the octal number
240 \e{ooo} (between 0 and 0377).
241 \row \li \b{. (dot)}
242 \li Matches any character (including newline).
243 \row \li \b{\\d}
244 \li Matches a digit (QChar::isDigit()).
245 \row \li \b{\\D}
246 \li Matches a non-digit.
247 \row \li \b{\\s}
248 \li Matches a whitespace character (QChar::isSpace()).
249 \row \li \b{\\S}
250 \li Matches a non-whitespace character.
251 \row \li \b{\\w}
252 \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
253 \row \li \b{\\W}
254 \li Matches a non-word character.
255 \row \li \b{\\\e{n}}
256 \li The \e{n}-th backreference, e.g. \\1, \\2, etc.
257 \endtable
258
259 \b{Note:} The C++ compiler transforms backslashes in strings.
260 To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}.
261 To match the backslash character itself, enter it four times, i.e.
262 \c{\\\\}.
263
264 \target sets-of-characters
265 \section1 Sets of Characters
266
267 Square brackets mean match any character contained in the square
268 brackets. The character set abbreviations described above can
269 appear in a character set in square brackets. Except for the
270 character set abbreviations and the following two exceptions,
271 characters do not have special meanings in square brackets.
272
273 \table
274 \row \li \b{^}
275
276 \li The caret negates the character set if it occurs as the
277 first character (i.e. immediately after the opening square
278 bracket). \b{[abc]} matches 'a' or 'b' or 'c', but
279 \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
280
281 \row \li \b{-}
282
283 \li The dash indicates a range of characters. \b{[W-Z]}
284 matches 'W' or 'X' or 'Y' or 'Z'.
285
286 \endtable
287
288 Using the predefined character set abbreviations is more portable
289 than using character ranges across platforms and languages. For
290 example, \b{[0-9]} matches a digit in Western alphabets but
291 \b{\\d} matches a digit in \e any alphabet.
292
293 Note: In other regexp documentation, sets of characters are often
294 called "character classes".
295
296 \target quantifiers
297 \section1 Quantifiers
298
299 By default, an expression is automatically quantified by
300 \b{{1,1}}, i.e. it should occur exactly once. In the following
301 list, \b{\e {E}} stands for expression. An expression is a
302 character, or an abbreviation for a set of characters, or a set of
303 characters in square brackets, or an expression in parentheses.
304
305 \table
306 \row \li \b{\e {E}?}
307
308 \li Matches zero or one occurrences of \e E. This quantifier
309 means \e{The previous expression is optional}, because it
310 will match whether or not the expression is found. \b{\e
311 {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?}
312 matches 'dent' or 'dents'.
313
314 \row \li \b{\e {E}+}
315
316 \li Matches one or more occurrences of \e E. \b{\e {E}+} is
317 the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0',
318 '00', '000', etc.
319
320 \row \li \b{\e {E}*}
321
322 \li Matches zero or more occurrences of \e E. It is the same
323 as \b{\e {E}{0,}}. The \b{*} quantifier is often used
324 in error where \b{+} should be used. For example, if
325 \b{\\s*$} is used in an expression to match strings that
326 end in whitespace, it will match every string because
327 \b{\\s*$} means \e{Match zero or more whitespaces followed
328 by end of string}. The correct regexp to match strings that
329 have at least one trailing whitespace character is
330 \b{\\s+$}.
331
332 \row \li \b{\e {E}{n}}
333
334 \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}}
335 is the same as repeating \e E \e n times. For example,
336 \b{x{5}} is the same as \b{xxxxx}. It is also the same
337 as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}.
338
339 \row \li \b{\e {E}{n,}}
340 \li Matches at least \e n occurrences of \e E.
341
342 \row \li \b{\e {E}{,m}}
343 \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}}
344 is the same as \b{\e {E}{0,m}}.
345
346 \row \li \b{\e {E}{n,m}}
347 \li Matches at least \e n and at most \e m occurrences of \e E.
348 \endtable
349
350 To apply a quantifier to more than just the preceding character,
351 use parentheses to group characters together in an expression. For
352 example, \b{tag+} matches a 't' followed by an 'a' followed by
353 at least one 'g', whereas \b{(tag)+} matches at least one
354 occurrence of 'tag'.
355
356 Note: Quantifiers are normally "greedy". They always match as much
357 text as they can. For example, \b{0+} matches the first zero it
358 finds and all the consecutive zeros after the first zero. Applied
359 to '20005', it matches '2\underline{000}5'. Quantifiers can be made
360 non-greedy, see setMinimal().
361
362 \target capturing parentheses
363 \target backreferences
364 \section1 Capturing Text
365
366 Parentheses allow us to group elements together so that we can
367 quantify and capture them. For example if we have the expression
368 \b{mail|letter|correspondence} that matches a string we know
369 that \e one of the words matched but not which one. Using
370 parentheses allows us to "capture" whatever is matched within
371 their bounds, so if we used \b{(mail|letter|correspondence)}
372 and matched this regexp against the string "I sent you some email"
373 we can use the cap() or capturedTexts() functions to extract the
374 matched characters, in this case 'mail'.
375
376 We can use captured text within the regexp itself. To refer to the
377 captured text we use \e backreferences which are indexed from 1,
378 the same as for cap(). For example we could search for duplicate
379 words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a
380 word boundary followed by one or more word characters followed by
381 one or more non-word characters followed by the same text as the
382 first parenthesized expression followed by a word boundary.
383
384 If we want to use parentheses purely for grouping and not for
385 capturing we can use the non-capturing syntax, e.g.
386 \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
387 end ')'. In this example we match either 'green' or 'blue' but we
388 do not capture the match so we only know whether or not we matched
389 but not which color we actually found. Using non-capturing
390 parentheses is more efficient than using capturing parentheses
391 since the regexp engine has to do less book-keeping.
392
393 Both capturing and non-capturing parentheses may be nested.
394
395 \target greedy quantifiers
396
397 For historical reasons, quantifiers (e.g. \b{*}) that apply to
398 capturing parentheses are more "greedy" than other quantifiers.
399 For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa".
400 This behavior is different from what other regexp engines do
401 (notably, Perl). To obtain a more intuitive capturing behavior,
402 specify QRegExp::RegExp2 to the QRegExp constructor or call
403 setPatternSyntax(QRegExp::RegExp2).
404
405 \target cap_in_a_loop
406
407 When the number of matches cannot be determined in advance, a
408 common idiom is to use cap() in a loop. For example:
409
410 \snippet code/src_corelib_text_qregexp.cpp 0
411
412 \target assertions
413 \section1 Assertions
414
415 Assertions make some statement about the text at the point where
416 they occur in the regexp but they do not match any characters. In
417 the following list \b{\e {E}} stands for any expression.
418
419 \table
420 \row \li \b{^}
421 \li The caret signifies the beginning of the string. If you
422 wish to match a literal \c{^} you must escape it by
423 writing \c{\\^}. For example, \b{^#include} will only
424 match strings which \e begin with the characters '#include'.
425 (When the caret is the first character of a character set it
426 has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.)
427
428 \row \li \b{$}
429 \li The dollar signifies the end of the string. For example
430 \b{\\d\\s*$} will match strings which end with a digit
431 optionally followed by whitespace. If you wish to match a
432 literal \c{$} you must escape it by writing
433 \c{\\$}.
434
435 \row \li \b{\\b}
436 \li A word boundary. For example the regexp
437 \b{\\bOK\\b} means match immediately after a word
438 boundary (e.g. start of string or whitespace) the letter 'O'
439 then the letter 'K' immediately before another word boundary
440 (e.g. end of string or whitespace). But note that the
441 assertion does not actually match any whitespace so if we
442 write \b{(\\bOK\\b)} and we have a match it will only
443 contain 'OK' even if the string is "It's \underline{OK} now".
444
445 \row \li \b{\\B}
446 \li A non-word boundary. This assertion is true wherever
447 \b{\\b} is false. For example if we searched for
448 \b{\\Bon\\B} in "Left on" the match would fail (space
449 and end of string aren't non-word boundaries), but it would
450 match in "t\underline{on}ne".
451
452 \row \li \b{(?=\e E)}
453 \li Positive lookahead. This assertion is true if the
454 expression matches at this point in the regexp. For example,
455 \b{const(?=\\s+char)} matches 'const' whenever it is
456 followed by 'char', as in 'static \underline{const} char *'.
457 (Compare with \b{const\\s+char}, which matches 'static
458 \underline{const char} *'.)
459
460 \row \li \b{(?!\e E)}
461 \li Negative lookahead. This assertion is true if the
462 expression does not match at this point in the regexp. For
463 example, \b{const(?!\\s+char)} matches 'const' \e except
464 when it is followed by 'char'.
465 \endtable
466
467 \target QRegExp wildcard matching
468 \section1 Wildcard Matching
469
470 Most command shells such as \e bash or \e cmd.exe support "file
471 globbing", the ability to identify a group of files by using
472 wildcards. The setPatternSyntax() function is used to switch
473 between regexp and wildcard mode. Wildcard matching is much
474 simpler than full regexps and has only four features:
475
476 \table
477 \row \li \b{c}
478 \li Any character represents itself apart from those mentioned
479 below. Thus \b{c} matches the character \e c.
480 \row \li \b{?}
481 \li Matches any single character. It is the same as
482 \b{.} in full regexps.
483 \row \li \b{*}
484 \li Matches zero or more of any characters. It is the
485 same as \b{.*} in full regexps.
486 \row \li \b{[...]}
487 \li Sets of characters can be represented in square brackets,
488 similar to full regexps. Within the character class, like
489 outside, backslash has no special meaning.
490 \endtable
491
492 In the mode Wildcard, the wildcard characters cannot be
493 escaped. In the mode WildcardUnix, the character '\\' escapes the
494 wildcard.
495
496 For example if we are in wildcard mode and have strings which
497 contain filenames we could identify HTML files with \b{*.html}.
498 This will match zero or more characters followed by a dot followed
499 by 'h', 't', 'm' and 'l'.
500
501 To test a string against a wildcard expression, use exactMatch().
502 For example:
503
504 \snippet code/src_corelib_text_qregexp.cpp 1
505
506 \target perl-users
507 \section1 Notes for Perl Users
508
509 Most of the character class abbreviations supported by Perl are
510 supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters}
511 {characters and abbreviations for sets of characters}.
512
513 In QRegExp, apart from within character classes, \c{^} always
514 signifies the start of the string, so carets must always be
515 escaped unless used for that purpose. In Perl the meaning of caret
516 varies automagically depending on where it occurs so escaping it
517 is rarely necessary. The same applies to \c{$} which in
518 QRegExp always signifies the end of the string.
519
520 QRegExp's quantifiers are the same as Perl's greedy quantifiers
521 (but see the \l{greedy quantifiers}{note above}). Non-greedy
522 matching cannot be applied to individual quantifiers, but can be
523 applied to all the quantifiers in the pattern. For example, to
524 match the Perl regexp \b{ro+?m} requires:
525
526 \snippet code/src_corelib_text_qregexp.cpp 2
527
528 The equivalent of Perl's \c{/i} option is
529 setCaseSensitivity(Qt::CaseInsensitive).
530
531 Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
532
533 In QRegExp \b{.} matches any character, therefore all QRegExp
534 regexps have the equivalent of Perl's \c{/s} option. QRegExp
535 does not have an equivalent to Perl's \c{/m} option, but this
536 can be emulated in various ways for example by splitting the input
537 into lines or by looping with a regexp that searches for newlines.
538
539 Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
540 assertions. The \\G assertion is not supported but can be emulated
541 in a loop.
542
543 Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
544 equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
545 ... correspond to cap(1) or capturedTexts()[1], cap(2) or
546 capturedTexts()[2], etc.
547
548 To substitute a pattern use QString::replace().
549
550 Perl's extended \c{/x} syntax is not supported, nor are
551 directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
552 the other hand, C++'s rules for literal strings can be used to
553 achieve the same:
554
555 \snippet code/src_corelib_text_qregexp.cpp 3
556
557 Both zero-width positive and zero-width negative lookahead
558 assertions (?=pattern) and (?!pattern) are supported with the same
559 syntax as Perl. Perl's lookbehind assertions, "independent"
560 subexpressions and conditional expressions are not supported.
561
562 Non-capturing parentheses are also supported, with the same
563 (?:pattern) syntax.
564
565 See QString::split() and QStringList::join() for equivalents
566 to Perl's split and join functions.
567
568 Note: because C++ transforms \\'s they must be written \e twice in
569 code, e.g. \b{\\b} must be written \b{\\\\b}.
570
571 \target code-examples
572 \section1 Code Examples
573
574 \snippet code/src_corelib_text_qregexp.cpp 4
575
576 The third string matches '\underline{6}'. This is a simple validation
577 regexp for integers in the range 0 to 99.
578
579 \snippet code/src_corelib_text_qregexp.cpp 5
580
581 The second string matches '\underline{This_is-OK}'. We've used the
582 character set abbreviation '\\S' (non-whitespace) and the anchors
583 to match strings which contain no whitespace.
584
585 In the following example we match strings containing 'mail' or
586 'letter' or 'correspondence' but only match whole words i.e. not
587 'email'
588
589 \snippet code/src_corelib_text_qregexp.cpp 6
590
591 The second string matches "Please write the \underline{letter}". The
592 word 'letter' is also captured (because of the parentheses). We
593 can see what text we've captured like this:
594
595 \snippet code/src_corelib_text_qregexp.cpp 7
596
597 This will capture the text from the first set of capturing
598 parentheses (counting capturing left parentheses from left to
599 right). The parentheses are counted from 1 since cap(0) is the
600 whole matched regexp (equivalent to '&' in most regexp engines).
601
602 \snippet code/src_corelib_text_qregexp.cpp 8
603
604 Here we've passed the QRegExp to QString's replace() function to
605 replace the matched text with new text.
606
607 \snippet code/src_corelib_text_qregexp.cpp 9
608
609 We've used the indexIn() function to repeatedly match the regexp in
610 the string. Note that instead of moving forward by one character
611 at a time \c pos++ we could have written \c {pos +=
612 rx.matchedLength()} to skip over the already matched string. The
613 count will equal 3, matching 'One \underline{Eric} another
614 \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
 doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
 by word boundaries.
617
618 One common use of regexps is to split lines of delimited data into
619 their component fields.
620
621 \snippet code/src_corelib_text_qregexp.cpp 10
622
623 In this example our input lines have the format company name, web
624 address and country. Unfortunately the regexp is rather long and
625 not very versatile -- the code will break if we add any more
626 fields. A simpler and better solution is to look for the
627 separator, '\\t' in this case, and take the surrounding text. The
628 QString::split() function can take a separator string or regexp
629 as an argument and split a string accordingly.
630
631 \snippet code/src_corelib_text_qregexp.cpp 11
632
633 Here field[0] is the company, field[1] the web address and so on.
634
635 To imitate the matching of a shell we can use wildcard mode.
636
637 \snippet code/src_corelib_text_qregexp.cpp 12
638
639 Wildcard matching can be convenient because of its simplicity, but
640 any wildcard regexp can be defined using full regexps, e.g.
641 \b{.*\\.html$}. Notice that we can't match both \c .html and \c
642 .htm files with a wildcard unless we use \b{*.htm*} which will
643 also match 'test.html.bak'. A full regexp gives us the precision
644 we need, \b{.*\\.html?$}.
645
646 QRegExp can match case insensitively using setCaseSensitivity(),
647 and can use non-greedy matching, see setMinimal(). By
648 default QRegExp uses full regexps but this can be changed with
649 setPatternSyntax(). Searching can be done forward with indexIn() or backward
650 with lastIndexIn(). Captured text can be accessed using
651 capturedTexts() which returns a string list of all captured
652 strings, or using cap() which returns the captured string for the
653 given index. The pos() function takes a match index and returns
654 the position in the string where the match was made (or -1 if
655 there was no match).
656
657 \sa QString, QStringList, QSortFilterProxyModel
658
659 \section1 Porting to QRegularExpression
660
661 \include corelib/port-from-qregexp.qdocinc porting-to-qregularexpression
662*/
663
664#if defined(Q_OS_VXWORKS) && defined(EOS)
665# undef EOS
666#endif
667
// BadChar() folds a character's UTF-16 code unit into one of NumBadChars
// buckets (code unit modulo NumBadChars).
constexpr int NumBadChars = 64;
#define BadChar(ch) ((ch).unicode() % NumBadChars)

// Sentinel values shared by the engine. The first three all equal INT_MAX.
// NOTE(review): the meanings below are inferred from the names only --
// confirm against the code that uses them.
constexpr int NoOccurrence = INT_MAX; // presumably "character never occurs"
constexpr int EmptyCapture = INT_MAX; // presumably "capture holds nothing"
constexpr int InftyLen = INT_MAX;     // presumably "unbounded length"
constexpr int InftyRep = 1025;        // presumably "unbounded repetition count"
constexpr int EOS = -1;               // presumably "end of string" marker
676
677static bool isWord(QChar ch)
678{
679 return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
680}
681
682/*
683 Merges two vectors of ints and puts the result into the first
684 one.
685*/
686static void mergeInto(QList<int> *a, const QList<int> &b)
687{
688 int asize = a->size();
689 int bsize = b.size();
690 if (asize == 0) {
691 *a = b;
692#ifndef QT_NO_REGEXP_OPTIM
693 } else if (bsize == 1 && a->at(i: asize - 1) < b.at(i: 0)) {
694 a->resize(size: asize + 1);
695 (*a)[asize] = b.at(i: 0);
696#endif
697 } else if (bsize >= 1) {
698 int csize = asize + bsize;
699 QList<int> c(csize);
700 int i = 0, j = 0, k = 0;
701 while (i < asize) {
702 if (j < bsize) {
703 if (a->at(i) == b.at(i: j)) {
704 ++i;
705 --csize;
706 } else if (a->at(i) < b.at(i: j)) {
707 c[k++] = a->at(i: i++);
708 } else {
709 c[k++] = b.at(i: j++);
710 }
711 } else {
712 memcpy(dest: c.data() + k, src: a->constData() + i, n: (asize - i) * sizeof(int));
713 break;
714 }
715 }
716 c.resize(size: csize);
717 if (j < bsize)
718 memcpy(dest: c.data() + k, src: b.constData() + j, n: (bsize - j) * sizeof(int));
719 *a = c;
720 }
721}
722
723#ifndef QT_NO_REGEXP_WILDCARD
724/*
725 Translates a wildcard pattern to an equivalent regular expression
726 pattern (e.g., *.cpp to .*\.cpp).
727
728 If enableEscaping is true, it is possible to escape the wildcard
729 characters with \
730*/
731static QString wc2rx(const QString &wc_str, const bool enableEscaping)
732{
733 const int wclen = wc_str.size();
734 QString rx;
735 int i = 0;
736 bool isEscaping = false; // the previous character is '\'
737 const QChar *wc = wc_str.unicode();
738
739 while (i < wclen) {
740 const QChar c = wc[i++];
741 switch (c.unicode()) {
742 case '\\':
743 if (enableEscaping) {
744 if (isEscaping) {
745 rx += QLatin1String("\\\\");
746 } // we insert the \\ later if necessary
747 if (i == wclen) { // the end
748 rx += QLatin1String("\\\\");
749 }
750 } else {
751 rx += QLatin1String("\\\\");
752 }
753 isEscaping = true;
754 break;
755 case '*':
756 if (isEscaping) {
757 rx += QLatin1String("\\*");
758 isEscaping = false;
759 } else {
760 rx += QLatin1String(".*");
761 }
762 break;
763 case '?':
764 if (isEscaping) {
765 rx += QLatin1String("\\?");
766 isEscaping = false;
767 } else {
768 rx += QLatin1Char('.');
769 }
770
771 break;
772 case '$':
773 case '(':
774 case ')':
775 case '+':
776 case '.':
777 case '^':
778 case '{':
779 case '|':
780 case '}':
781 if (isEscaping) {
782 isEscaping = false;
783 rx += QLatin1String("\\\\");
784 }
785 rx += QLatin1Char('\\');
786 rx += c;
787 break;
788 case '[':
789 if (isEscaping) {
790 isEscaping = false;
791 rx += QLatin1String("\\[");
792 } else {
793 rx += c;
794 if (wc[i] == QLatin1Char('^'))
795 rx += wc[i++];
796 if (i < wclen) {
797 if (wc[i] == QLatin1Char(']'))
798 rx += wc[i++];
799 while (i < wclen && wc[i] != QLatin1Char(']')) {
800 if (wc[i] == QLatin1Char('\\'))
801 rx += QLatin1Char('\\');
802 rx += wc[i++];
803 }
804 }
805 }
806 break;
807
808 case ']':
809 if (isEscaping){
810 isEscaping = false;
811 rx += QLatin1String("\\");
812 }
813 rx += c;
814 break;
815
816 default:
817 if (isEscaping){
818 isEscaping = false;
819 rx += QLatin1String("\\\\");
820 }
821 rx += c;
822 }
823 }
824 return rx;
825}
826#endif
827
828static int caretIndex(int offset, QRegExp::CaretMode caretMode)
829{
830 if (caretMode == QRegExp::CaretAtZero) {
831 return 0;
832 } else if (caretMode == QRegExp::CaretAtOffset) {
833 return offset;
834 } else { // QRegExp::CaretWontMatch
835 return -1;
836 }
837}
838
839/*
840 The QRegExpEngineKey struct uniquely identifies an engine.
841*/
842struct QRegExpEngineKey
843{
844 QString pattern;
845 QRegExp::PatternSyntax patternSyntax;
846 Qt::CaseSensitivity cs;
847
848 inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
849 Qt::CaseSensitivity cs)
850 : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
851
852 inline void clear() {
853 pattern.clear();
854 patternSyntax = QRegExp::RegExp;
855 cs = Qt::CaseSensitive;
856 }
857};
858
859static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
860{
861 return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
862 && key1.cs == key2.cs;
863}
864
865static size_t qHash(const QRegExpEngineKey &key, size_t seed = 0) noexcept
866{
867 return qHashMulti(seed, args: key.pattern, args: key.patternSyntax, args: key.cs);
868}
869
870class QRegExpEngine;
871
872/*
873 This is the engine state during matching.
874*/
875struct QRegExpMatchState
876{
877 const QChar *in; // a pointer to the input string data
878 int pos; // the current position in the string
879 int caretPos;
880 int len; // the length of the input string
881 bool minimal; // minimal matching?
882 int *bigArray; // big array holding the data for the next pointers
883 int *inNextStack; // is state is nextStack?
884 int *curStack; // stack of current states
885 int *nextStack; // stack of next states
886 int *curCapBegin; // start of current states' captures
887 int *nextCapBegin; // start of next states' captures
888 int *curCapEnd; // end of current states' captures
889 int *nextCapEnd; // end of next states' captures
890 int *tempCapBegin; // start of temporary captures
891 int *tempCapEnd; // end of temporary captures
892 int *capBegin; // start of captures for a next state
893 int *capEnd; // end of captures for a next state
894 int *slideTab; // bump-along slide table for bad-character heuristic
895 int *captured; // what match() returned last
896 int slideTabSize; // size of slide table
897 int capturedSize;
898#ifndef QT_NO_REGEXP_BACKREF
899 QList<QList<int>> sleeping; // list of back-reference sleepers
900#endif
901 int matchLen; // length of match
902 int oneTestMatchedLen; // length of partial match
903
904 const QRegExpEngine *eng;
905
906 inline QRegExpMatchState() : bigArray(nullptr), captured(nullptr) {}
907 inline ~QRegExpMatchState() { free(ptr: bigArray); }
908
909 void drain() { free(ptr: bigArray); bigArray = nullptr; captured = nullptr; } // to save memory
910 void prepareForMatch(QRegExpEngine *eng);
911 void match(const QChar *str, int len, int pos, bool minimal,
912 bool oneTest, int caretIndex);
913 bool matchHere();
914 bool testAnchor(int i, int a, const int *capBegin);
915};
916
917/*
918 The struct QRegExpAutomatonState represents one state in a modified NFA. The
919 input characters matched are stored in the state instead of on
920 the transitions, something possible for an automaton
921 constructed from a regular expression.
922*/
923struct QRegExpAutomatonState
924{
925#ifndef QT_NO_REGEXP_CAPTURE
926 int atom; // which atom does this state belong to?
927#endif
928 int match; // what does it match? (see CharClassBit and BackRefBit)
929 QList<int> outs; // out-transitions
930 QMap<int, int> reenter; // atoms reentered when transiting out
931 QMap<int, int> anchors; // anchors met when transiting out
932
933 inline QRegExpAutomatonState() { }
934#ifndef QT_NO_REGEXP_CAPTURE
935 inline QRegExpAutomatonState(int a, int m)
936 : atom(a), match(m) { }
937#else
938 inline QRegExpAutomatonState(int m)
939 : match(m) { }
940#endif
941};
942
943Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_RELOCATABLE_TYPE);
944
/*
    The struct QRegExpCharClassRange represents a range of characters (e.g.,
    [0-9] denotes range 48 to 57).

    The range is stored as a starting value plus a count rather than as
    a pair of end points; the trailing comments show the values for the
    [0-9] example.
*/
struct QRegExpCharClassRange
{
    ushort from; // first character of the range, e.g. 48 for [0-9]
    ushort len; // number of characters in the range, e.g. 10 for [0-9]
};

Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
956
#ifndef QT_NO_REGEXP_CAPTURE
/*
    The struct QRegExpAtom represents one node in the hierarchy of regular
    expression atoms.

    Atoms are kept in an array; each one records the index of its parent
    so that the nesting can be followed upwards.
*/
struct QRegExpAtom
{
    // Negative placeholder values that the capture field can hold
    // instead of a capture index.
    enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };

    int parent; // index of parent in array of atoms
    int capture; // index of capture, from 1 to ncap - 1, or one of the enum values above
};

Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
#endif
972
973struct QRegExpLookahead;
974
#ifndef QT_NO_REGEXP_ANCHOR_ALT
/*
    The struct QRegExpAnchorAlternation represents a pair of anchors with
    OR semantics.

    Each member is itself an anchor value: a set of anchor bits, or a
    reference to another alternation.
*/
struct QRegExpAnchorAlternation
{
    int a; // this anchor...
    int b; // ...or this one
};

Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
#endif
988
#ifndef QT_NO_REGEXP_CCLASS

// Bit mask with only bit x set.
#define FLAG(x) (1 << (x))
/*
    The class QRegExpCharClass represents a set of characters, such as can
    be found in regular expressions (e.g., [a-z] denotes the set
    {a, b, ..., z}).

    A set is described by a list of character ranges, a bit mask of
    categories, and a flag telling whether the set is negated.
*/
class QRegExpCharClass
{
public:
    QRegExpCharClass();

    void clear();
    // Returns whether the set is negated.
    bool negative() const { return n; }
    void setNegative(bool negative);
    void addCategories(uint cats);
    void addRange(ushort from, ushort to);
    // Adds one character, expressed as the range [ch, ch].
    void addSingleton(ushort ch) { addRange(from: ch, to: ch); }

    bool in(QChar ch) const;
#ifndef QT_NO_REGEXP_OPTIM
    // Returns the first-occurrence array maintained for this set.
    const QList<int> &firstOccurrence() const { return occ1; }
#endif

#if defined(QT_DEBUG)
    void dump() const;
#endif

private:
    QList<QRegExpCharClassRange> r; // character ranges
#ifndef QT_NO_REGEXP_OPTIM
    QList<int> occ1; // first-occurrence array
#endif
    uint c; // character classes
    bool n; // negative?
};
#else
/*
    Stand-in used when character classes are compiled out. It carries no
    set information; with the optimizations enabled it still provides a
    first-occurrence array so that callers of firstOccurrence() compile.
*/
struct QRegExpCharClass
{
    int dummy;

#ifndef QT_NO_REGEXP_OPTIM
    // Fills the first-occurrence array with NumBadChars zero entries.
    QRegExpCharClass() { occ1.fill(0, NumBadChars); }

    const QList<int> &firstOccurrence() const { return occ1; }
    QList<int> occ1;
#endif
};
#endif

Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_RELOCATABLE_TYPE);
1041
1042/*
1043 The QRegExpEngine class encapsulates a modified nondeterministic
1044 finite automaton (NFA).
1045*/
1046class QRegExpEngine
1047{
1048 Q_DISABLE_COPY_MOVE(QRegExpEngine)
1049public:
1050 QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1051 : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1052
1053 QRegExpEngine(const QRegExpEngineKey &key);
1054 ~QRegExpEngine();
1055
1056 bool isValid() const { return valid; }
1057 const QString &errorString() const { return yyError; }
1058 int captureCount() const { return officialncap; }
1059
1060 int createState(QChar ch);
1061 int createState(const QRegExpCharClass &cc);
1062#ifndef QT_NO_REGEXP_BACKREF
1063 int createState(int bref);
1064#endif
1065
1066 void addCatTransitions(const QList<int> &from, const QList<int> &to);
1067#ifndef QT_NO_REGEXP_CAPTURE
1068 void addPlusTransitions(const QList<int> &from, const QList<int> &to, int atom);
1069#endif
1070
1071#ifndef QT_NO_REGEXP_ANCHOR_ALT
1072 int anchorAlternation(int a, int b);
1073 int anchorConcatenation(int a, int b);
1074#else
1075 int anchorAlternation(int a, int b) { return a & b; }
1076 int anchorConcatenation(int a, int b) { return a | b; }
1077#endif
1078 void addAnchors(int from, int to, int a);
1079
1080#ifndef QT_NO_REGEXP_OPTIM
1081 void heuristicallyChooseHeuristic();
1082#endif
1083
1084#if defined(QT_DEBUG)
1085 void dump() const;
1086#endif
1087
1088 QAtomicInt ref;
1089
1090private:
1091 enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
1092 enum { InitialState = 0, FinalState = 1 };
1093
1094 void setup();
1095 int setupState(int match);
1096
1097 /*
1098 Let's hope that 13 lookaheads and 14 back-references are
1099 enough.
1100 */
1101 enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1102 enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
1103 Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
1104 Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1105 Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1106 Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1107
1108 Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
1109 ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
1110#ifndef QT_NO_REGEXP_CAPTURE
1111 int startAtom(bool officialCapture);
1112 void finishAtom(int atom, bool needCapture);
1113#endif
1114
1115#ifndef QT_NO_REGEXP_LOOKAHEAD
1116 int addLookahead(QRegExpEngine *eng, bool negative);
1117#endif
1118
1119#ifndef QT_NO_REGEXP_OPTIM
1120 bool goodStringMatch(QRegExpMatchState &matchState) const;
1121 bool badCharMatch(QRegExpMatchState &matchState) const;
1122#else
1123 bool bruteMatch(QRegExpMatchState &matchState) const;
1124#endif
1125
1126 QList<QRegExpAutomatonState> s; // array of states
1127#ifndef QT_NO_REGEXP_CAPTURE
1128 QList<QRegExpAtom> f; // atom hierarchy
1129 int nf; // number of atoms
1130 int cf; // current atom
1131 QList<int> captureForOfficialCapture;
1132#endif
1133 int officialncap; // number of captures, seen from the outside
1134 int ncap; // number of captures, seen from the inside
1135#ifndef QT_NO_REGEXP_CCLASS
1136 QList<QRegExpCharClass> cl; // array of character classes
1137#endif
1138#ifndef QT_NO_REGEXP_LOOKAHEAD
1139 QList<QRegExpLookahead *> ahead; // array of lookaheads
1140#endif
1141#ifndef QT_NO_REGEXP_ANCHOR_ALT
1142 QList<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1143#endif
1144#ifndef QT_NO_REGEXP_OPTIM
1145 bool caretAnchored; // does the regexp start with ^?
1146 bool trivial; // is the good-string all that needs to match?
1147#endif
1148 bool valid; // is the regular expression valid?
1149 Qt::CaseSensitivity cs; // case sensitive?
1150 bool greedyQuantifiers; // RegExp2?
1151 bool xmlSchemaExtensions;
1152#ifndef QT_NO_REGEXP_BACKREF
1153 int nbrefs; // number of back-references
1154#endif
1155
1156#ifndef QT_NO_REGEXP_OPTIM
1157 bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1158
1159 int goodEarlyStart; // the index where goodStr can first occur in a match
1160 int goodLateStart; // the index where goodStr can last occur in a match
1161 QString goodStr; // the string that any match has to contain
1162
1163 int minl; // the minimum length of a match
1164 QList<int> occ1; // first-occurrence array
1165#endif
1166
1167 /*
1168 The class Box is an abstraction for a regular expression
1169 fragment. It can also be seen as one node in the syntax tree of
1170 a regular expression with synthetized attributes.
1171
1172 Its interface is ugly for performance reasons.
1173 */
1174 class Box
1175 {
1176 public:
1177 Box(QRegExpEngine *engine);
1178 Box(const Box &b) { operator=(b); }
1179
1180 Box &operator=(const Box &b);
1181
1182 void clear() { operator=(b: Box(eng)); }
1183 void set(QChar ch);
1184 void set(const QRegExpCharClass &cc);
1185#ifndef QT_NO_REGEXP_BACKREF
1186 void set(int bref);
1187#endif
1188
1189 void cat(const Box &b);
1190 void orx(const Box &b);
1191 void plus(int atom);
1192 void opt();
1193 void catAnchor(int a);
1194#ifndef QT_NO_REGEXP_OPTIM
1195 void setupHeuristics();
1196#endif
1197
1198#if defined(QT_DEBUG)
1199 void dump() const;
1200#endif
1201
1202 private:
1203 void addAnchorsToEngine(const Box &to) const;
1204
1205 QRegExpEngine *eng; // the automaton under construction
1206 QList<int> ls; // the left states (firstpos)
1207 QList<int> rs; // the right states (lastpos)
1208 QMap<int, int> lanchors; // the left anchors
1209 QMap<int, int> ranchors; // the right anchors
1210 int skipanchors; // the anchors to match if the box is skipped
1211
1212#ifndef QT_NO_REGEXP_OPTIM
1213 int earlyStart; // the index where str can first occur
1214 int lateStart; // the index where str can last occur
1215 QString str; // a string that has to occur in any match
1216 QString leftStr; // a string occurring at the left of this box
1217 QString rightStr; // a string occurring at the right of this box
1218 int maxl; // the maximum length of this box (possibly InftyLen)
1219#endif
1220
1221 int minl; // the minimum length of this box
1222#ifndef QT_NO_REGEXP_OPTIM
1223 QList<int> occ1; // first-occurrence array
1224#endif
1225 };
1226
1227 friend class Box;
1228
1229 /*
1230 This is the lexical analyzer for regular expressions.
1231 */
1232 enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1233 Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1234 Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1235 int getChar();
1236 int getEscape();
1237#ifndef QT_NO_REGEXP_INTERVAL
1238 int getRep(int def);
1239#endif
1240#ifndef QT_NO_REGEXP_LOOKAHEAD
1241 void skipChars(int n);
1242#endif
1243 void error(const char *msg);
1244 void startTokenizer(const QChar *rx, int len);
1245 int getToken();
1246
1247 const QChar *yyIn; // a pointer to the input regular expression pattern
1248 int yyPos0; // the position of yyTok in the input pattern
1249 int yyPos; // the position of the next character to read
1250 int yyLen; // the length of yyIn
1251 int yyCh; // the last character read
1252 std::optional<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1253 int yyMinRep; // attribute for Tok_Quantifier
1254 int yyMaxRep; // ditto
1255 QString yyError; // syntax error or overflow during parsing?
1256
1257 /*
1258 This is the syntactic analyzer for regular expressions.
1259 */
1260 int parse(const QChar *rx, int len);
1261 void parseAtom(Box *box);
1262 void parseFactor(Box *box);
1263 void parseTerm(Box *box);
1264 void parseExpression(Box *box);
1265
1266 int yyTok; // the last token read
1267 bool yyMayCapture; // set this to false to disable capturing
1268
1269 friend struct QRegExpMatchState;
1270};
1271
1272#ifndef QT_NO_REGEXP_LOOKAHEAD
1273/*
1274 The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1275 (?=foo) and (?!bar)).
1276*/
1277struct QRegExpLookahead
1278{
1279 QRegExpEngine *eng; // NFA representing the embedded regular expression
1280 bool neg; // negative lookahead?
1281
1282 inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
1283 : eng(eng0), neg(neg0) { }
1284 inline ~QRegExpLookahead() { delete eng; }
1285};
1286#endif
1287
1288/*!
1289 \internal
1290 convert the pattern string to the RegExp syntax.
1291
1292 This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1293 */
1294Q_CORE5COMPAT_EXPORT QString qt_regexp_toCanonical(const QString &pattern,
1295 QRegExp::PatternSyntax patternSyntax)
1296{
1297 switch (patternSyntax) {
1298#ifndef QT_NO_REGEXP_WILDCARD
1299 case QRegExp::Wildcard:
1300 return wc2rx(wc_str: pattern, enableEscaping: false);
1301 case QRegExp::WildcardUnix:
1302 return wc2rx(wc_str: pattern, enableEscaping: true);
1303#endif
1304 case QRegExp::FixedString:
1305 return QRegExp::escape(str: pattern);
1306 case QRegExp::W3CXmlSchema11:
1307 default:
1308 return pattern;
1309 }
1310}
1311
1312QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1313 : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1314 xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1315{
1316 setup();
1317
1318 QString rx = qt_regexp_toCanonical(pattern: key.pattern, patternSyntax: key.patternSyntax);
1319
1320 valid = (parse(rx: rx.unicode(), len: rx.size()) == rx.size());
1321 if (!valid) {
1322#ifndef QT_NO_REGEXP_OPTIM
1323 trivial = false;
1324#endif
1325 error(RXERR_LEFTDELIM);
1326 }
1327}
1328
1329QRegExpEngine::~QRegExpEngine()
1330{
1331#ifndef QT_NO_REGEXP_LOOKAHEAD
1332 qDeleteAll(c: ahead);
1333#endif
1334}
1335
1336void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1337{
1338 /*
1339 We use one QList<int> for all the big data used a lot in
1340 matchHere() and friends.
1341 */
1342 int ns = eng->s.size(); // number of states
1343 int ncap = eng->ncap;
1344#ifndef QT_NO_REGEXP_OPTIM
1345 int newSlideTabSize = qMax(a: eng->minl + 1, b: 16);
1346#else
1347 int newSlideTabSize = 0;
1348#endif
1349 int numCaptures = eng->captureCount();
1350 int newCapturedSize = 2 + 2 * numCaptures;
1351 bigArray = q_check_ptr(p: (int *)realloc(ptr: bigArray, size: ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1352
1353 // set all internal variables only _after_ bigArray is realloc'ed
1354 // to prevent a broken regexp in oom case
1355
1356 slideTabSize = newSlideTabSize;
1357 capturedSize = newCapturedSize;
1358 inNextStack = bigArray;
1359 memset(s: inNextStack, c: -1, n: ns * sizeof(int));
1360 curStack = inNextStack + ns;
1361 nextStack = inNextStack + 2 * ns;
1362
1363 curCapBegin = inNextStack + 3 * ns;
1364 nextCapBegin = curCapBegin + ncap * ns;
1365 curCapEnd = curCapBegin + 2 * ncap * ns;
1366 nextCapEnd = curCapBegin + 3 * ncap * ns;
1367
1368 tempCapBegin = curCapBegin + 4 * ncap * ns;
1369 tempCapEnd = tempCapBegin + ncap;
1370 capBegin = tempCapBegin + 2 * ncap;
1371 capEnd = tempCapBegin + 3 * ncap;
1372
1373 slideTab = tempCapBegin + 4 * ncap;
1374 captured = slideTab + slideTabSize;
1375 memset(s: captured, c: -1, n: capturedSize*sizeof(int));
1376 this->eng = eng;
1377}
1378
/*
    Tries to match in str and returns an array of (begin, length) pairs
    for captured text. If there is no match, all pairs are (-1, -1).

    The result is not returned directly: it is written to the captured
    array. Entry 0 and 1 hold the position and length of the whole
    match; the following pairs describe the official captures.

    str0 and len0 describe the input, pos0 is the index at which to
    start, minimal0 selects minimal matching, oneTest restricts the
    attempt to position pos0 only, and caretIndex is the index at which
    a caret anchor may match.
*/
void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
                              bool minimal0, bool oneTest, int caretIndex)
{
    bool matched = false;
    QChar char_null; // stand-in input so that in is never null

#ifndef QT_NO_REGEXP_OPTIM
    if (eng->trivial && !oneTest) {
        // A trivial engine only has to find its good-string, so a plain
        // substring search replaces the automaton.
        // ### Qt6: qsizetype
        pos = int(QtPrivate::findString(haystack: QStringView(str0, len0), from: pos0, needle: QStringView(eng->goodStr.unicode(), eng->goodStr.size()), cs: eng->cs));
        matchLen = eng->goodStr.size();
        matched = (pos != -1);
    } else
#endif
    {
        in = str0;
        if (in == nullptr)
            in = &char_null;
        pos = pos0;
        caretPos = caretIndex;
        len = len0;
        minimal = minimal0;
        matchLen = 0;
        oneTestMatchedLen = 0;

        // An invalid engine or an out-of-range start never matches.
        if (eng->valid && pos >= 0 && pos <= len) {
#ifndef QT_NO_REGEXP_OPTIM
            if (oneTest) {
                matched = matchHere();
            } else {
                // Skip the search entirely when fewer than minl
                // characters remain after pos.
                if (pos <= len - eng->minl) {
                    if (eng->caretAnchored) {
                        matched = matchHere();
                    } else if (eng->useGoodStringHeuristic) {
                        matched = eng->goodStringMatch(matchState&: *this);
                    } else {
                        matched = eng->badCharMatch(matchState&: *this);
                    }
                }
            }
#else
            matched = oneTest ? matchHere() : eng->bruteMatch(*this);
#endif
        }
    }

    if (matched) {
        int *c = captured;
        *c++ = pos;
        *c++ = matchLen;

        // capturedSize is 2 + 2 * (number of captures), see prepareForMatch()
        int numCaptures = (capturedSize - 2) >> 1;
#ifndef QT_NO_REGEXP_CAPTURE
        for (int i = 0; i < numCaptures; ++i) {
            // translate the official capture number to the internal one
            int j = eng->captureForOfficialCapture.at(i);
            if (capBegin[j] != EmptyCapture) {
                int len = capEnd[j] - capBegin[j];
                // a capture of length zero (or less) is reported at index 0
                *c++ = (len > 0) ? pos + capBegin[j] : 0;
                *c++ = len;
            } else {
                *c++ = -1;
                *c++ = -1;
            }
        }
#endif
    } else {
        // we rely on 2's complement here
        memset(s: captured, c: -1, n: capturedSize * sizeof(int));
    }
}
1453
1454/*
1455 The three following functions add one state to the automaton and
1456 return the number of the state.
1457*/
1458
1459int QRegExpEngine::createState(QChar ch)
1460{
1461 return setupState(ch.unicode());
1462}
1463
1464int QRegExpEngine::createState(const QRegExpCharClass &cc)
1465{
1466#ifndef QT_NO_REGEXP_CCLASS
1467 int n = cl.size();
1468 cl += QRegExpCharClass(cc);
1469 return setupState(CharClassBit | n);
1470#else
1471 Q_UNUSED(cc);
1472 return setupState(CharClassBit);
1473#endif
1474}
1475
1476#ifndef QT_NO_REGEXP_BACKREF
1477int QRegExpEngine::createState(int bref)
1478{
1479 if (bref > nbrefs) {
1480 nbrefs = bref;
1481 if (nbrefs > MaxBackRefs) {
1482 error(RXERR_LIMIT);
1483 return 0;
1484 }
1485 }
1486 return setupState(BackRefBit | bref);
1487}
1488#endif
1489
1490/*
1491 The two following functions add a transition between all pairs of
1492 states (i, j) where i is found in from, and j is found in to.
1493
1494 Cat-transitions are distinguished from plus-transitions for
1495 capturing.
1496*/
1497
1498void QRegExpEngine::addCatTransitions(const QList<int> &from, const QList<int> &to)
1499{
1500 for (int i = 0; i < from.size(); i++)
1501 mergeInto(a: &s[from.at(i)].outs, b: to);
1502}
1503
1504#ifndef QT_NO_REGEXP_CAPTURE
1505void QRegExpEngine::addPlusTransitions(const QList<int> &from, const QList<int> &to, int atom)
1506{
1507 for (int i = 0; i < from.size(); i++) {
1508 QRegExpAutomatonState &st = s[from.at(i)];
1509 const QList<int> oldOuts = st.outs;
1510 mergeInto(a: &st.outs, b: to);
1511 if (f.at(i: atom).capture != QRegExpAtom::NoCapture) {
1512 for (int j = 0; j < to.size(); j++) {
1513 // ### st.reenter.contains(to.at(j)) check looks suspicious
1514 if (!st.reenter.contains(key: to.at(i: j)) &&
1515 !std::binary_search(first: oldOuts.constBegin(), last: oldOuts.constEnd(), val: to.at(i: j)))
1516 st.reenter.insert(key: to.at(i: j), value: atom);
1517 }
1518 }
1519 }
1520}
1521#endif
1522
1523#ifndef QT_NO_REGEXP_ANCHOR_ALT
1524/*
1525 Returns an anchor that means a OR b.
1526*/
1527int QRegExpEngine::anchorAlternation(int a, int b)
1528{
1529 if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
1530 return a & b;
1531
1532 int n = aa.size();
1533#ifndef QT_NO_REGEXP_OPTIM
1534 if (n > 0 && aa.at(i: n - 1).a == a && aa.at(i: n - 1).b == b)
1535 return Anchor_Alternation | (n - 1);
1536#endif
1537
1538 QRegExpAnchorAlternation element = {.a: a, .b: b};
1539 aa.append(t: element);
1540 return Anchor_Alternation | n;
1541}
1542
1543/*
1544 Returns an anchor that means a AND b.
1545*/
1546int QRegExpEngine::anchorConcatenation(int a, int b)
1547{
1548 if (((a | b) & Anchor_Alternation) == 0)
1549 return a | b;
1550 if ((b & Anchor_Alternation) != 0)
1551 qSwap(value1&: a, value2&: b);
1552
1553 int aprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).a, b);
1554 int bprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).b, b);
1555 return anchorAlternation(a: aprime, b: bprime);
1556}
1557#endif
1558
1559/*
1560 Adds anchor a on a transition caracterised by its from state and
1561 its to state.
1562*/
1563void QRegExpEngine::addAnchors(int from, int to, int a)
1564{
1565 QRegExpAutomatonState &st = s[from];
1566 if (st.anchors.contains(key: to))
1567 a = anchorAlternation(a: st.anchors.value(key: to), b: a);
1568 st.anchors.insert(key: to, value: a);
1569}
1570
1571#ifndef QT_NO_REGEXP_OPTIM
1572/*
1573 This function chooses between the good-string and the bad-character
1574 heuristics. It computes two scores and chooses the heuristic with
1575 the highest score.
1576
1577 Here are some common-sense constraints on the scores that should be
1578 respected if the formulas are ever modified: (1) If goodStr is
1579 empty, the good-string heuristic scores 0. (2) If the regular
1580 expression is trivial, the good-string heuristic should be used.
1581 (3) If the search is case insensitive, the good-string heuristic
1582 should be used, unless it scores 0. (Case insensitivity turns all
1583 entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1584 big, the good-string heuristic should score less.
1585*/
1586void QRegExpEngine::heuristicallyChooseHeuristic()
1587{
1588 if (minl == 0) {
1589 useGoodStringHeuristic = false;
1590 } else if (trivial) {
1591 useGoodStringHeuristic = true;
1592 } else {
1593 /*
1594 Magic formula: The good string has to constitute a good
1595 proportion of the minimum-length string, and appear at a
1596 more-or-less known index.
1597 */
1598 int goodStringScore = (64 * goodStr.size() / minl) -
1599 (goodLateStart - goodEarlyStart);
1600 /*
1601 Less magic formula: We pick some characters at random, and
1602 check whether they are good or bad.
1603 */
1604 int badCharScore = 0;
1605 int step = qMax(a: 1, b: NumBadChars / 32);
1606 for (int i = 1; i < NumBadChars; i += step) {
1607 if (occ1.at(i) == NoOccurrence)
1608 badCharScore += minl;
1609 else
1610 badCharScore += occ1.at(i);
1611 }
1612 badCharScore /= minl;
1613 useGoodStringHeuristic = (goodStringScore > badCharScore);
1614 }
1615}
1616#endif
1617
#if defined(QT_DEBUG)
/*
    Debug helper: prints the automaton with qDebug(). For every state
    it lists what the state matches and its out-transitions (with
    re-entered atoms and anchors), followed by the atom table and the
    anchor alternations.
*/
void QRegExpEngine::dump() const
{
    int i, j;
    qDebug("Case %ssensitive engine", cs ? "" : "in");
    qDebug(" States");
    for (i = 0; i < s.size(); i++) {
        qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
#ifndef QT_NO_REGEXP_CAPTURE
        if (nf > 0)
            qDebug(" in atom %d", s[i].atom);
#endif
        int m = s[i].match;
        if ((m & CharClassBit) != 0) {
            qDebug(" match character class %d", m ^ CharClassBit);
#ifndef QT_NO_REGEXP_CCLASS
            cl[m ^ CharClassBit].dump();
#else
            qDebug(" negative character class");
#endif
        } else if ((m & BackRefBit) != 0) {
            qDebug(" match back-reference %d", m ^ BackRefBit);
        } else if (m >= 0x20 && m <= 0x7e) {
            // printable ASCII: show the character as well
            qDebug(" match 0x%.4x (%c)", m, m);
        } else {
            qDebug(" match 0x%.4x", m);
        }
        for (j = 0; j < s[i].outs.size(); j++) {
            int next = s[i].outs[j];
            qDebug(" -> %d", next);
            if (s[i].reenter.contains(next))
                qDebug(" [reenter %d]", s[i].reenter[next]);
            if (s[i].anchors.value(next) != 0)
                qDebug(" [anchors 0x%.8x]", s[i].anchors[next]);
        }
    }
#ifndef QT_NO_REGEXP_CAPTURE
    if (nf > 0) {
        qDebug(" Atom Parent Capture");
        for (i = 0; i < nf; i++) {
            if (f[i].capture == QRegExpAtom::NoCapture) {
                qDebug(" %6d %6d nil", i, f[i].parent);
            } else {
                int cap = f[i].capture;
                bool official = captureForOfficialCapture.contains(cap);
                qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture,
                       official ? "official" : "");
            }
        }
    }
#endif
#ifndef QT_NO_REGEXP_ANCHOR_ALT
    // all three values are printed as eight hex digits
    for (i = 0; i < aa.size(); i++)
        qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.8x", i, aa[i].a, aa[i].b);
#endif
}
#endif
1675
1676void QRegExpEngine::setup()
1677{
1678 ref.storeRelaxed(newValue: 1);
1679#ifndef QT_NO_REGEXP_CAPTURE
1680 f.resize(size: 32);
1681 nf = 0;
1682 cf = -1;
1683#endif
1684 officialncap = 0;
1685 ncap = 0;
1686#ifndef QT_NO_REGEXP_OPTIM
1687 caretAnchored = true;
1688 trivial = true;
1689#endif
1690 valid = false;
1691#ifndef QT_NO_REGEXP_BACKREF
1692 nbrefs = 0;
1693#endif
1694#ifndef QT_NO_REGEXP_OPTIM
1695 useGoodStringHeuristic = true;
1696 minl = 0;
1697 occ1.fill(t: 0, newSize: NumBadChars);
1698#endif
1699}
1700
1701int QRegExpEngine::setupState(int match)
1702{
1703#ifndef QT_NO_REGEXP_CAPTURE
1704 s += QRegExpAutomatonState(cf, match);
1705#else
1706 s += QRegExpAutomatonState(match);
1707#endif
1708 return s.size() - 1;
1709}
1710
1711#ifndef QT_NO_REGEXP_CAPTURE
1712/*
1713 Functions startAtom() and finishAtom() should be called to delimit
1714 atoms. When a state is created, it is assigned to the current atom.
1715 The information is later used for capturing.
1716*/
1717int QRegExpEngine::startAtom(bool officialCapture)
1718{
1719 if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1720 f.resize(size: (nf + 1) << 1);
1721 f[nf].parent = cf;
1722 cf = nf++;
1723 f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1724 return cf;
1725}
1726
1727void QRegExpEngine::finishAtom(int atom, bool needCapture)
1728{
1729 if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1730 f[atom].capture = QRegExpAtom::UnofficialCapture;
1731 cf = f.at(i: atom).parent;
1732}
1733#endif
1734
1735#ifndef QT_NO_REGEXP_LOOKAHEAD
1736/*
1737 Creates a lookahead anchor.
1738*/
1739int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1740{
1741 int n = ahead.size();
1742 if (n == MaxLookaheads) {
1743 error(RXERR_LIMIT);
1744 return 0;
1745 }
1746 ahead += new QRegExpLookahead(eng, negative);
1747 return Anchor_FirstLookahead << n;
1748}
1749#endif
1750
#ifndef QT_NO_REGEXP_CAPTURE
/*
    We want the longest leftmost captures.

    Compares capture set 1 with capture set 2, capture by capture, and
    returns true if set 1 is the better one. The first capture that
    differs decides: an earlier begin wins, and for equal begins a
    later end wins. Identical sets yield false.
*/
static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
                            const int *end2)
{
    for (int i = 0; i < ncap; i++) {
        // it has to start early...
        const int beginDelta = begin2[i] - begin1[i];
        if (beginDelta != 0)
            return beginDelta > 0;

        // ...and end late
        const int endDelta = end1[i] - end2[i];
        if (endDelta != 0)
            return endDelta > 0;
    }
    return false;
}
#endif
1769
1770/*
1771 Returns \c true if anchor a matches at position pos + i in the input
1772 string, otherwise false.
1773*/
1774bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1775{
1776 int j;
1777
1778#ifndef QT_NO_REGEXP_ANCHOR_ALT
1779 if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1780 return testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1781 || testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1782#endif
1783
1784 if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1785 if (pos + i != caretPos)
1786 return false;
1787 }
1788 if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1789 if (pos + i != len)
1790 return false;
1791 }
1792#ifndef QT_NO_REGEXP_ESCAPE
1793 if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
1794 bool before = false;
1795 bool after = false;
1796 if (pos + i != 0)
1797 before = isWord(ch: in[pos + i - 1]);
1798 if (pos + i != len)
1799 after = isWord(ch: in[pos + i]);
1800 if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1801 return false;
1802 if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1803 return false;
1804 }
1805#endif
1806#ifndef QT_NO_REGEXP_LOOKAHEAD
1807 if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1808 const QList<QRegExpLookahead *> &ahead = eng->ahead;
1809 for (j = 0; j < ahead.size(); j++) {
1810 if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1811 QRegExpMatchState matchState;
1812 matchState.prepareForMatch(eng: ahead[j]->eng);
1813 matchState.match(str0: in + pos + i, len0: len - pos - i, pos0: 0,
1814 minimal0: true, oneTest: true, caretIndex: caretPos - pos - i);
1815 if ((matchState.captured[0] == 0) == ahead[j]->neg)
1816 return false;
1817 }
1818 }
1819 }
1820#endif
1821#ifndef QT_NO_REGEXP_CAPTURE
1822#ifndef QT_NO_REGEXP_BACKREF
1823 for (j = 0; j < eng->nbrefs; j++) {
1824 if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1825 int i = eng->captureForOfficialCapture.at(i: j);
1826 if (capBegin[i] != EmptyCapture)
1827 return false;
1828 }
1829 }
1830#endif
1831#endif
1832 return true;
1833}
1834
1835#ifndef QT_NO_REGEXP_OPTIM
1836/*
1837 The three following functions are what Jeffrey Friedl would call
1838 transmissions (or bump-alongs). Using one or the other should make
1839 no difference except in performance.
1840*/
1841
1842bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1843{
1844 int k = matchState.pos + goodEarlyStart;
1845 QStringMatcher matcher(goodStr.unicode(), goodStr.size(), cs);
1846 while ((k = matcher.indexIn(str: matchState.in, length: matchState.len, from: k)) != -1) {
1847 int from = k - goodLateStart;
1848 int to = k - goodEarlyStart;
1849 if (from > matchState.pos)
1850 matchState.pos = from;
1851
1852 while (matchState.pos <= to) {
1853 if (matchState.matchHere())
1854 return true;
1855 ++matchState.pos;
1856 }
1857 ++k;
1858 }
1859 return false;
1860}
1861
1862bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1863{
1864 int slideHead = 0;
1865 int slideNext = 0;
1866 int i;
1867 int lastPos = matchState.len - minl;
1868 memset(s: matchState.slideTab, c: 0, n: matchState.slideTabSize * sizeof(int));
1869
1870 /*
1871 Set up the slide table, used for the bad-character heuristic,
1872 using the table of first occurrence of each character.
1873 */
1874 for (i = 0; i < minl; i++) {
1875 int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1876 if (sk == NoOccurrence)
1877 sk = i + 1;
1878 if (sk > 0) {
1879 int k = i + 1 - sk;
1880 if (k < 0) {
1881 sk = i + 1;
1882 k = 0;
1883 }
1884 if (sk > matchState.slideTab[k])
1885 matchState.slideTab[k] = sk;
1886 }
1887 }
1888
1889 if (matchState.pos > lastPos)
1890 return false;
1891
1892 for (;;) {
1893 if (++slideNext >= matchState.slideTabSize)
1894 slideNext = 0;
1895 if (matchState.slideTab[slideHead] > 0) {
1896 if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1897 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1898 matchState.slideTab[slideHead] = 0;
1899 } else {
1900 if (matchState.matchHere())
1901 return true;
1902 }
1903
1904 if (matchState.pos == lastPos)
1905 break;
1906
1907 /*
1908 Update the slide table. This code has much in common with
1909 the initialization code.
1910 */
1911 int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1912 if (sk == NoOccurrence) {
1913 matchState.slideTab[slideNext] = minl;
1914 } else if (sk > 0) {
1915 int k = slideNext + minl - sk;
1916 if (k >= matchState.slideTabSize)
1917 k -= matchState.slideTabSize;
1918 if (sk > matchState.slideTab[k])
1919 matchState.slideTab[k] = sk;
1920 }
1921 slideHead = slideNext;
1922 ++matchState.pos;
1923 }
1924 return false;
1925}
1926#else
1927bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1928{
1929 while (matchState.pos <= matchState.len) {
1930 if (matchState.matchHere())
1931 return true;
1932 ++matchState.pos;
1933 }
1934 return false;
1935}
1936#endif
1937
1938/*
1939 Here's the core of the engine. It tries to do a match here and now.
1940*/
1941 bool QRegExpMatchState::matchHere()
1942 {
/*
  Overview: simulates the automaton eng->s on the input, starting at
  index pos and in state QRegExpEngine::InitialState. curStack holds
  the ncur states reached after consuming i characters; nextStack
  collects the nnext states reachable by consuming one more, with
  inNextStack[state] giving a state's slot in nextStack (or -1).
  Each slot owns ncap capture begin/end offsets in the
  cur/nextCapBegin and cur/nextCapEnd arrays.

  On return, matchLen is the last offset at which the final state was
  entered (or -1), oneTestMatchedLen is the last offset processed,
  and the result is true if matchLen >= 0.
*/
1943 int ncur = 1, nnext = 0;
1944 int i = 0, j, k, m;
1945 bool stop = false;
1946
1947 matchLen = -1;
1948 oneTestMatchedLen = -1;
1949 curStack[0] = QRegExpEngine::InitialState;
1950
1951 int ncap = eng->ncap;
1952 #ifndef QT_NO_REGEXP_CAPTURE
1953 if (ncap > 0) {
1954 for (j = 0; j < ncap; j++) {
1955 curCapBegin[j] = EmptyCapture;
1956 curCapEnd[j] = EmptyCapture;
1957 }
1958 }
1959 #endif
1960
/*
  One iteration per input offset i. The loop keeps running while
  there are active states (or, with back-references, sleeping ones),
  the offset is still within the input, and no stop was requested.
*/
1961 #ifndef QT_NO_REGEXP_BACKREF
1962 while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
1963 #else
1964 while (ncur > 0 && i <= len - pos && !stop)
1965 #endif
1966 {
/* ch is the code unit at pos + i, or 0 once the end of the input is reached. */
1967 int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
1968 for (j = 0; j < ncur; j++) {
1969 int cur = curStack[j];
1970 const QRegExpAutomatonState &scur = eng->s.at(i: cur);
1971 const QList<int> &outs = scur.outs;
1972 for (k = 0; k < outs.size(); k++) {
1973 int next = outs.at(i: k);
1974 const QRegExpAutomatonState &snext = eng->s.at(i: next);
1975 bool inside = true;
1976 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
1977 int needSomeSleep = 0;
1978 #endif
1979
1980 /*
1981 First, check if the anchors are anchored properly.
1982 */
1983 int a = scur.anchors.value(key: next);
1984 if (a != 0 && !testAnchor(i, a, capBegin: curCapBegin + j * ncap))
1985 inside = false;
1986
1987 /*
1988 If indeed they are, check if the input character is
1989 correct for this transition.
1990 */
1991 if (inside) {
1992 m = snext.match;
1993 if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
1994 if (eng->cs)
1995 inside = (m == ch);
1996 else
1997 inside = (QChar(m).toLower() == QChar(ch).toLower());
1998 } else if (next == QRegExpEngine::FinalState) {
1999 matchLen = i;
2000 stop = minimal;
2001 inside = true;
2002 } else if ((m & QRegExpEngine::CharClassBit) != 0) {
2003 #ifndef QT_NO_REGEXP_CCLASS
2004 const QRegExpCharClass &cc = eng->cl.at(i: m ^ QRegExpEngine::CharClassBit);
2005 if (eng->cs)
2006 inside = cc.in(ch: QChar(ch));
2007 else if (cc.negative())
2008 inside = cc.in(ch: QChar(ch).toLower()) &&
2009 cc.in(ch: QChar(ch).toUpper());
2010 else
2011 inside = cc.in(ch: QChar(ch).toLower()) ||
2012 cc.in(ch: QChar(ch).toUpper());
2013 #endif
2014 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2015 } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
/*
  Back-reference: ell indexes the referenced capture within the
  current state's slot. The first character is compared here; the
  remaining delta - 1 characters are compared below and, if they all
  agree, the target state is put to sleep for that many offsets.
*/
2016 int bref = m ^ QRegExpEngine::BackRefBit;
2017 int ell = j * ncap + eng->captureForOfficialCapture.at(i: bref - 1);
2018
2019 inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2020 if (inside) {
2021 if (eng->cs)
2022 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
2023 else
2024 inside = (in[pos + curCapBegin[ell]].toLower()
2025 == QChar(ch).toLower());
2026 }
2027
2028 if (inside) {
2029 int delta;
2030 if (curCapEnd[ell] == EmptyCapture)
2031 delta = i - curCapBegin[ell];
2032 else
2033 delta = curCapEnd[ell] - curCapBegin[ell];
2034
2035 inside = (delta <= len - (pos + i));
2036 if (inside && delta > 1) {
2037 int n = 1;
2038 if (eng->cs) {
2039 while (n < delta) {
2040 if (in[pos + curCapBegin[ell] + n]
2041 != in[pos + i + n])
2042 break;
2043 ++n;
2044 }
2045 } else {
2046 while (n < delta) {
2047 QChar a = in[pos + curCapBegin[ell] + n];
2048 QChar b = in[pos + i + n];
2049 if (a.toLower() != b.toLower())
2050 break;
2051 ++n;
2052 }
2053 }
2054 inside = (n == delta);
2055 if (inside)
2056 needSomeSleep = delta - 1;
2057 }
2058 }
2059 #endif
2060 }
2061 }
2062
2063 /*
2064 We must now update our data structures.
2065 */
2066 if (inside) {
2067 #ifndef QT_NO_REGEXP_CAPTURE
/* These locals shadow the capBegin/capEnd members within this block. */
2068 int *capBegin, *capEnd;
2069 #endif
2070 /*
2071 If the next state was not encountered yet, all
2072 is fine.
2073 */
2074 if ((m = inNextStack[next]) == -1) {
2075 m = nnext++;
2076 nextStack[m] = next;
2077 inNextStack[next] = m;
2078 #ifndef QT_NO_REGEXP_CAPTURE
2079 capBegin = nextCapBegin + m * ncap;
2080 capEnd = nextCapEnd + m * ncap;
2081
2082 /*
2083 Otherwise, we'll first maintain captures in
2084 temporary arrays, and decide at the end whether
2085 it's best to keep the previous capture zones or
2086 the new ones.
2087 */
2088 } else {
2089 capBegin = tempCapBegin;
2090 capEnd = tempCapEnd;
2091 #endif
2092 }
2093
2094 #ifndef QT_NO_REGEXP_CAPTURE
2095 /*
2096 Updating the capture zones is much of a task.
2097 */
2098 if (ncap > 0) {
2099 memcpy(dest: capBegin, src: curCapBegin + j * ncap, n: ncap * sizeof(int));
2100 memcpy(dest: capEnd, src: curCapEnd + j * ncap, n: ncap * sizeof(int));
2101 int c = scur.atom, n = snext.atom;
2102 int p = -1, q = -1;
2103 int cap;
2104
2105 /*
2106 Lemma 1. For any x in the range [0..nf), we
2107 have f[x].parent < x.
2108
2109 Proof. By looking at startAtom(), it is
2110 clear that cf < nf holds all the time, and
2111 thus that f[nf].parent < nf.
2112 */
2113
2114 /*
2115 If we are reentering an atom, we empty all
2116 capture zones inside it.
2117 */
2118 if ((q = scur.reenter.value(key: next)) != 0) {
2119 QBitArray b(eng->nf, false);
2120 b.setBit(i: q, val: true);
2121 for (int ell = q + 1; ell < eng->nf; ell++) {
2122 if (b.testBit(i: eng->f.at(i: ell).parent)) {
2123 b.setBit(i: ell, val: true);
2124 cap = eng->f.at(i: ell).capture;
2125 if (cap >= 0) {
2126 capBegin[cap] = EmptyCapture;
2127 capEnd[cap] = EmptyCapture;
2128 }
2129 }
2130 }
2131 p = eng->f.at(i: q).parent;
2132
2133 /*
2134 Otherwise, close the capture zones we are
2135 leaving. We are leaving f[c].capture,
2136 f[f[c].parent].capture,
2137 f[f[f[c].parent].parent].capture, ...,
2138 until f[x].capture, with x such that
2139 f[x].parent is the youngest common ancestor
2140 for c and n.
2141
2142 We go up along c's and n's ancestry until
2143 we find x.
2144 */
2145 } else {
2146 p = c;
2147 q = n;
2148 while (p != q) {
2149 if (p > q) {
2150 cap = eng->f.at(i: p).capture;
2151 if (cap >= 0) {
2152 if (capBegin[cap] == i) {
2153 capBegin[cap] = EmptyCapture;
2154 capEnd[cap] = EmptyCapture;
2155 } else {
2156 capEnd[cap] = i;
2157 }
2158 }
2159 p = eng->f.at(i: p).parent;
2160 } else {
2161 q = eng->f.at(i: q).parent;
2162 }
2163 }
2164 }
2165
2166 /*
2167 In any case, we now open the capture zones
2168 we are entering. We work upwards from n
2169 until we reach p (the parent of the atom we
2170 reenter or the youngest common ancestor).
2171 */
2172 while (n > p) {
2173 cap = eng->f.at(i: n).capture;
2174 if (cap >= 0) {
2175 capBegin[cap] = i;
2176 capEnd[cap] = EmptyCapture;
2177 }
2178 n = eng->f.at(i: n).parent;
2179 }
2180 /*
2181 If the next state was already in
2182 nextStack, we must choose carefully which
2183 capture zones we want to keep.
2184 */
2185 if (capBegin == tempCapBegin &&
2186 isBetterCapture(ncap, begin1: capBegin, end1: capEnd, begin2: nextCapBegin + m * ncap,
2187 end2: nextCapEnd + m * ncap)) {
2188 memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int));
2189 memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int));
2190 }
2191 }
2192 #ifndef QT_NO_REGEXP_BACKREF
2193 /*
2194 We are done with updating the capture zones.
2195 It's now time to put the next state to sleep,
2196 if it needs to, and to remove it from
2197 nextStack.
2198 */
2199 if (needSomeSleep > 0) {
/*
  Layout of a sleeper record: [0] the offset at which to wake up,
  [1] the state, then ncap capture begins followed by ncap capture
  ends.
*/
2200 QList<int> zzZ(2 + 2 * ncap);
2201 zzZ[0] = i + needSomeSleep;
2202 zzZ[1] = next;
2203 if (ncap > 0) {
2204 memcpy(dest: zzZ.data() + 2, src: capBegin, n: ncap * sizeof(int));
2205 memcpy(dest: zzZ.data() + 2 + ncap, src: capEnd, n: ncap * sizeof(int));
2206 }
/*
  NOTE(review): this pops the most recently pushed nextStack entry.
  That entry is `next` when it was pushed just above; if `next` was
  already present (the tempCapBegin path), the popped entry looks
  like it could be a different state -- confirm whether that case
  can occur.
*/
2207 inNextStack[nextStack[--nnext]] = -1;
2208 sleeping.append(t: zzZ);
2209 }
2210 #endif
2211 #endif
2212 }
2213 }
2214 }
2215 #ifndef QT_NO_REGEXP_CAPTURE
2216 /*
2217 If we reached the final state, hurray! Copy the captured
2218 zone.
2219 */
2220 if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2221 memcpy(dest: capBegin, src: nextCapBegin + m * ncap, n: ncap * sizeof(int));
2222 memcpy(dest: capEnd, src: nextCapEnd + m * ncap, n: ncap * sizeof(int));
2223 }
2224 #ifndef QT_NO_REGEXP_BACKREF
2225 /*
2226 It's time to wake up the sleepers.
2227 */
2228 j = 0;
2229 while (j < sleeping.size()) {
2230 if (sleeping.at(i: j)[0] == i) {
2231 const QList<int> &zzZ = sleeping.at(i: j);
2232 int next = zzZ[1];
2233 const int *capBegin = zzZ.data() + 2;
2234 const int *capEnd = zzZ.data() + 2 + ncap;
2235 bool copyOver = true;
2236
2237 if ((m = inNextStack[next]) == -1) {
2238 m = nnext++;
2239 nextStack[m] = next;
2240 inNextStack[next] = m;
2241 } else {
2242 copyOver = isBetterCapture(ncap, begin1: nextCapBegin + m * ncap, end1: nextCapEnd + m * ncap,
2243 begin2: capBegin, end2: capEnd);
2244 }
2245 if (copyOver) {
2246 memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int));
2247 memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int));
2248 }
2249
/* Removing entry j shifts the following ones down, so j is not advanced here. */
2250 sleeping.removeAt(i: j);
2251 } else {
2252 ++j;
2253 }
2254 }
2255 #endif
2256 #endif
/* Reset the membership table for the next offset. */
2257 for (j = 0; j < nnext; j++)
2258 inNextStack[nextStack[j]] = -1;
2259
2260 // avoid needless iteration that confuses oneTestMatchedLen
2261 if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2262 #ifndef QT_NO_REGEXP_BACKREF
2263 && sleeping.isEmpty()
2264 #endif
2265 )
2266 stop = true;
2267
/* The "next" set and its captures become the "current" ones. */
2268 qSwap(value1&: curStack, value2&: nextStack);
2269 #ifndef QT_NO_REGEXP_CAPTURE
2270 qSwap(value1&: curCapBegin, value2&: nextCapBegin);
2271 qSwap(value1&: curCapEnd, value2&: nextCapEnd);
2272 #endif
2273 ncur = nnext;
2274 nnext = 0;
2275 ++i;
2276 }
2277
2278 #ifndef QT_NO_REGEXP_BACKREF
2279 /*
2280 If minimal matching is enabled, we might have some sleepers
2281 left.
2282 */
2283 if (!sleeping.isEmpty())
2284 sleeping.clear();
2285 #endif
2286
/* i was incremented after the last processed offset, hence the - 1. */
2287 oneTestMatchedLen = i - 1;
2288 return (matchLen >= 0);
2289 }
2290
2291#ifndef QT_NO_REGEXP_CCLASS
2292
2293QRegExpCharClass::QRegExpCharClass()
2294 : c(0), n(false)
2295{
2296#ifndef QT_NO_REGEXP_OPTIM
2297 occ1.fill(t: NoOccurrence, newSize: NumBadChars);
2298#endif
2299}
2300
2301void QRegExpCharClass::clear()
2302{
2303 c = 0;
2304 r.clear();
2305 n = false;
2306}
2307
2308void QRegExpCharClass::setNegative(bool negative)
2309{
2310 n = negative;
2311#ifndef QT_NO_REGEXP_OPTIM
2312 occ1.fill(t: 0, newSize: NumBadChars);
2313#endif
2314}
2315
2316void QRegExpCharClass::addCategories(uint cats)
2317{
2318 static const int all_cats = FLAG(QChar::Mark_NonSpacing) |
2319 FLAG(QChar::Mark_SpacingCombining) |
2320 FLAG(QChar::Mark_Enclosing) |
2321 FLAG(QChar::Number_DecimalDigit) |
2322 FLAG(QChar::Number_Letter) |
2323 FLAG(QChar::Number_Other) |
2324 FLAG(QChar::Separator_Space) |
2325 FLAG(QChar::Separator_Line) |
2326 FLAG(QChar::Separator_Paragraph) |
2327 FLAG(QChar::Other_Control) |
2328 FLAG(QChar::Other_Format) |
2329 FLAG(QChar::Other_Surrogate) |
2330 FLAG(QChar::Other_PrivateUse) |
2331 FLAG(QChar::Other_NotAssigned) |
2332 FLAG(QChar::Letter_Uppercase) |
2333 FLAG(QChar::Letter_Lowercase) |
2334 FLAG(QChar::Letter_Titlecase) |
2335 FLAG(QChar::Letter_Modifier) |
2336 FLAG(QChar::Letter_Other) |
2337 FLAG(QChar::Punctuation_Connector) |
2338 FLAG(QChar::Punctuation_Dash) |
2339 FLAG(QChar::Punctuation_Open) |
2340 FLAG(QChar::Punctuation_Close) |
2341 FLAG(QChar::Punctuation_InitialQuote) |
2342 FLAG(QChar::Punctuation_FinalQuote) |
2343 FLAG(QChar::Punctuation_Other) |
2344 FLAG(QChar::Symbol_Math) |
2345 FLAG(QChar::Symbol_Currency) |
2346 FLAG(QChar::Symbol_Modifier) |
2347 FLAG(QChar::Symbol_Other);
2348 c |= (all_cats & cats);
2349#ifndef QT_NO_REGEXP_OPTIM
2350 occ1.fill(t: 0, newSize: NumBadChars);
2351#endif
2352}
2353
2354void QRegExpCharClass::addRange(ushort from, ushort to)
2355{
2356 if (from > to)
2357 qSwap(value1&: from, value2&: to);
2358 int m = r.size();
2359 r.resize(size: m + 1);
2360 r[m].from = from;
2361 r[m].len = to - from + 1;
2362
2363#ifndef QT_NO_REGEXP_OPTIM
2364 int i;
2365
2366 if (to - from < NumBadChars) {
2367 if (from % NumBadChars <= to % NumBadChars) {
2368 for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2369 occ1[i] = 0;
2370 } else {
2371 for (i = 0; i <= to % NumBadChars; i++)
2372 occ1[i] = 0;
2373 for (i = from % NumBadChars; i < NumBadChars; i++)
2374 occ1[i] = 0;
2375 }
2376 } else {
2377 occ1.fill(t: 0, newSize: NumBadChars);
2378 }
2379#endif
2380}
2381
2382bool QRegExpCharClass::in(QChar ch) const
2383{
2384#ifndef QT_NO_REGEXP_OPTIM
2385 if (occ1.at(BadChar(ch)) == NoOccurrence)
2386 return n;
2387#endif
2388
2389 if (c != 0 && (c & FLAG(ch.category())) != 0)
2390 return !n;
2391
2392 const int uc = ch.unicode();
2393 int size = r.size();
2394
2395 for (int i = 0; i < size; ++i) {
2396 const QRegExpCharClassRange &range = r.at(i);
2397 if (uint(uc - range.from) < uint(r.at(i).len))
2398 return !n;
2399 }
2400 return n;
2401}
2402
2403#if defined(QT_DEBUG)
2404void QRegExpCharClass::dump() const
2405{
2406 int i;
2407 qDebug(msg: " %stive character class", n ? "nega" : "posi");
2408#ifndef QT_NO_REGEXP_CCLASS
2409 if (c != 0)
2410 qDebug(msg: " categories 0x%.8x", c);
2411#endif
2412 for (i = 0; i < r.size(); i++)
2413 qDebug(msg: " 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2414}
2415#endif
2416#endif
2417
2418QRegExpEngine::Box::Box(QRegExpEngine *engine)
2419 : eng(engine), skipanchors(0)
2420#ifndef QT_NO_REGEXP_OPTIM
2421 , earlyStart(0), lateStart(0), maxl(0)
2422#endif
2423{
2424#ifndef QT_NO_REGEXP_OPTIM
2425 occ1.fill(t: NoOccurrence, newSize: NumBadChars);
2426#endif
2427 minl = 0;
2428}
2429
2430QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2431{
2432 eng = b.eng;
2433 ls = b.ls;
2434 rs = b.rs;
2435 lanchors = b.lanchors;
2436 ranchors = b.ranchors;
2437 skipanchors = b.skipanchors;
2438#ifndef QT_NO_REGEXP_OPTIM
2439 earlyStart = b.earlyStart;
2440 lateStart = b.lateStart;
2441 str = b.str;
2442 leftStr = b.leftStr;
2443 rightStr = b.rightStr;
2444 maxl = b.maxl;
2445 occ1 = b.occ1;
2446#endif
2447 minl = b.minl;
2448 return *this;
2449}
2450
2451void QRegExpEngine::Box::set(QChar ch)
2452{
2453 ls.resize(size: 1);
2454 ls[0] = eng->createState(ch);
2455 rs = ls;
2456#ifndef QT_NO_REGEXP_OPTIM
2457 str = ch;
2458 leftStr = ch;
2459 rightStr = ch;
2460 maxl = 1;
2461 occ1[BadChar(ch)] = 0;
2462#endif
2463 minl = 1;
2464}
2465
2466void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2467{
2468 ls.resize(size: 1);
2469 ls[0] = eng->createState(cc);
2470 rs = ls;
2471#ifndef QT_NO_REGEXP_OPTIM
2472 maxl = 1;
2473 occ1 = cc.firstOccurrence();
2474#endif
2475 minl = 1;
2476}
2477
2478#ifndef QT_NO_REGEXP_BACKREF
2479void QRegExpEngine::Box::set(int bref)
2480{
2481 ls.resize(size: 1);
2482 ls[0] = eng->createState(bref);
2483 rs = ls;
2484 if (bref >= 1 && bref <= MaxBackRefs)
2485 skipanchors = Anchor_BackRef0Empty << bref;
2486#ifndef QT_NO_REGEXP_OPTIM
2487 maxl = InftyLen;
2488#endif
2489 minl = 0;
2490}
2491#endif
2492
2493void QRegExpEngine::Box::cat(const Box &b)
2494{
2495 eng->addCatTransitions(from: rs, to: b.ls);
2496 addAnchorsToEngine(to: b);
2497 if (minl == 0) {
2498 lanchors.insert(map: b.lanchors);
2499 if (skipanchors != 0) {
2500 for (int i = 0; i < b.ls.size(); i++) {
2501 int a = eng->anchorConcatenation(a: lanchors.value(key: b.ls.at(i), defaultValue: 0), b: skipanchors);
2502 lanchors.insert(key: b.ls.at(i), value: a);
2503 }
2504 }
2505 mergeInto(a: &ls, b: b.ls);
2506 }
2507 if (b.minl == 0) {
2508 ranchors.insert(map: b.ranchors);
2509 if (b.skipanchors != 0) {
2510 for (int i = 0; i < rs.size(); i++) {
2511 int a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i), defaultValue: 0), b: b.skipanchors);
2512 ranchors.insert(key: rs.at(i), value: a);
2513 }
2514 }
2515 mergeInto(a: &rs, b: b.rs);
2516 } else {
2517 ranchors = b.ranchors;
2518 rs = b.rs;
2519 }
2520
2521#ifndef QT_NO_REGEXP_OPTIM
2522 if (maxl != InftyLen) {
2523 if (rightStr.size() + b.leftStr.size() >
2524 qMax(a: str.size(), b: b.str.size())) {
2525 earlyStart = minl - rightStr.size();
2526 lateStart = maxl - rightStr.size();
2527 str = rightStr + b.leftStr;
2528 } else if (b.str.size() > str.size()) {
2529 earlyStart = minl + b.earlyStart;
2530 lateStart = maxl + b.lateStart;
2531 str = b.str;
2532 }
2533 }
2534
2535 if (leftStr.size() == maxl)
2536 leftStr += b.leftStr;
2537
2538 if (b.rightStr.size() == b.maxl) {
2539 rightStr += b.rightStr;
2540 } else {
2541 rightStr = b.rightStr;
2542 }
2543
2544 if (maxl == InftyLen || b.maxl == InftyLen) {
2545 maxl = InftyLen;
2546 } else {
2547 maxl += b.maxl;
2548 }
2549
2550 for (int i = 0; i < NumBadChars; i++) {
2551 if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2552 occ1[i] = minl + b.occ1.at(i);
2553 }
2554#endif
2555
2556 minl += b.minl;
2557 if (minl == 0)
2558 skipanchors = eng->anchorConcatenation(a: skipanchors, b: b.skipanchors);
2559 else
2560 skipanchors = 0;
2561}
2562
2563void QRegExpEngine::Box::orx(const Box &b)
2564{
2565 mergeInto(a: &ls, b: b.ls);
2566 lanchors.insert(map: b.lanchors);
2567 mergeInto(a: &rs, b: b.rs);
2568 ranchors.insert(map: b.ranchors);
2569
2570 if (b.minl == 0) {
2571 if (minl == 0)
2572 skipanchors = eng->anchorAlternation(a: skipanchors, b: b.skipanchors);
2573 else
2574 skipanchors = b.skipanchors;
2575 }
2576
2577#ifndef QT_NO_REGEXP_OPTIM
2578 for (int i = 0; i < NumBadChars; i++) {
2579 if (occ1.at(i) > b.occ1.at(i))
2580 occ1[i] = b.occ1.at(i);
2581 }
2582 earlyStart = 0;
2583 lateStart = 0;
2584 str = QString();
2585 leftStr = QString();
2586 rightStr = QString();
2587 if (b.maxl > maxl)
2588 maxl = b.maxl;
2589#endif
2590 if (b.minl < minl)
2591 minl = b.minl;
2592}
2593
2594void QRegExpEngine::Box::plus(int atom)
2595{
2596#ifndef QT_NO_REGEXP_CAPTURE
2597 eng->addPlusTransitions(from: rs, to: ls, atom);
2598#else
2599 Q_UNUSED(atom);
2600 eng->addCatTransitions(rs, ls);
2601#endif
2602 addAnchorsToEngine(to: *this);
2603#ifndef QT_NO_REGEXP_OPTIM
2604 maxl = InftyLen;
2605#endif
2606}
2607
2608void QRegExpEngine::Box::opt()
2609{
2610#ifndef QT_NO_REGEXP_OPTIM
2611 earlyStart = 0;
2612 lateStart = 0;
2613 str = QString();
2614 leftStr = QString();
2615 rightStr = QString();
2616#endif
2617 skipanchors = 0;
2618 minl = 0;
2619}
2620
2621void QRegExpEngine::Box::catAnchor(int a)
2622{
2623 if (a != 0) {
2624 for (int i = 0; i < rs.size(); i++) {
2625 a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i), defaultValue: 0), b: a);
2626 ranchors.insert(key: rs.at(i), value: a);
2627 }
2628 if (minl == 0)
2629 skipanchors = eng->anchorConcatenation(a: skipanchors, b: a);
2630 }
2631}
2632
2633#ifndef QT_NO_REGEXP_OPTIM
2634void QRegExpEngine::Box::setupHeuristics()
2635{
2636 eng->goodEarlyStart = earlyStart;
2637 eng->goodLateStart = lateStart;
2638 eng->goodStr = eng->cs ? str : str.toLower();
2639
2640 eng->minl = minl;
2641 if (eng->cs) {
2642 /*
2643 A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2644 1 at this point. An entry of occ1 has to be at most minl or
2645 infinity for the rest of the algorithm to go well.
2646
2647 We waited until here before normalizing these cases (instead of
2648 doing it in Box::orx()) because sometimes things improve by
2649 themselves. Consider for example (112|1)34.
2650 */
2651 for (int i = 0; i < NumBadChars; i++) {
2652 if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2653 occ1[i] = minl;
2654 }
2655 eng->occ1 = occ1;
2656 } else {
2657 eng->occ1.fill(t: 0, newSize: NumBadChars);
2658 }
2659
2660 eng->heuristicallyChooseHeuristic();
2661}
2662#endif
2663
2664#if defined(QT_DEBUG)
2665void QRegExpEngine::Box::dump() const
2666{
2667 int i;
2668 qDebug(msg: "Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2669 qDebug(msg: " Left states:");
2670 for (i = 0; i < ls.size(); i++) {
2671 if (lanchors.value(key: ls[i], defaultValue: 0) == 0)
2672 qDebug(msg: " %d", ls[i]);
2673 else
2674 qDebug(msg: " %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2675 }
2676 qDebug(msg: " Right states:");
2677 for (i = 0; i < rs.size(); i++) {
2678 if (ranchors.value(key: rs[i], defaultValue: 0) == 0)
2679 qDebug(msg: " %d", rs[i]);
2680 else
2681 qDebug(msg: " %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2682 }
2683 qDebug(msg: " Skip anchors: 0x%.8x", skipanchors);
2684}
2685#endif
2686
2687void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2688{
2689 for (int i = 0; i < to.ls.size(); i++) {
2690 for (int j = 0; j < rs.size(); j++) {
2691 int a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i: j), defaultValue: 0),
2692 b: to.lanchors.value(key: to.ls.at(i), defaultValue: 0));
2693 eng->addAnchors(from: rs[j], to: to.ls[i], a);
2694 }
2695 }
2696}
2697
2698#ifndef QT_NO_REGEXP_CCLASS
// Lookup table for the block names of the XML Schema regexp extensions.
// Each entry maps a block name to the inclusive code point range
// [first, second]. The entries are sorted by name (in byte order) so that
// the table can be binary-searched.
struct CategoriesRangeMapEntry {
    const char name[40];
    uint first, second;
};

static const CategoriesRangeMapEntry categoriesRangeMap[] = {
    { "AegeanNumbers",                        0x10100,  0x1013F },
    { "AlphabeticPresentationForms",          0xFB00,   0xFB4F },
    { "AncientGreekMusicalNotation",          0x1D200,  0x1D24F },
    { "AncientGreekNumbers",                  0x10140,  0x1018F },
    { "Arabic",                               0x0600,   0x06FF },
    { "ArabicPresentationForms-A",            0xFB50,   0xFDFF },
    { "ArabicPresentationForms-B",            0xFE70,   0xFEFF },
    { "ArabicSupplement",                     0x0750,   0x077F },
    { "Armenian",                             0x0530,   0x058F },
    { "Arrows",                               0x2190,   0x21FF },
    { "BasicLatin",                           0x0000,   0x007F },
    { "Bengali",                              0x0980,   0x09FF },
    { "BlockElements",                        0x2580,   0x259F },
    { "Bopomofo",                             0x3100,   0x312F },
    { "BopomofoExtended",                     0x31A0,   0x31BF },
    { "BoxDrawing",                           0x2500,   0x257F },
    { "BraillePatterns",                      0x2800,   0x28FF },
    { "Buginese",                             0x1A00,   0x1A1F },
    { "Buhid",                                0x1740,   0x175F },
    { "ByzantineMusicalSymbols",              0x1D000,  0x1D0FF },
    { "CJKCompatibility",                     0x3300,   0x33FF },
    { "CJKCompatibilityForms",                0xFE30,   0xFE4F },
    { "CJKCompatibilityIdeographs",           0xF900,   0xFAFF },
    { "CJKCompatibilityIdeographsSupplement", 0x2F800,  0x2FA1F },
    { "CJKRadicalsSupplement",                0x2E80,   0x2EFF },
    { "CJKStrokes",                           0x31C0,   0x31EF },
    { "CJKSymbolsandPunctuation",             0x3000,   0x303F },
    { "CJKUnifiedIdeographs",                 0x4E00,   0x9FFF },
    { "CJKUnifiedIdeographsExtensionA",       0x3400,   0x4DB5 },
    { "CJKUnifiedIdeographsExtensionB",       0x20000,  0x2A6DF },
    { "Cherokee",                             0x13A0,   0x13FF },
    { "CombiningDiacriticalMarks",            0x0300,   0x036F },
    { "CombiningDiacriticalMarksSupplement",  0x1DC0,   0x1DFF },
    { "CombiningHalfMarks",                   0xFE20,   0xFE2F },
    { "CombiningMarksforSymbols",             0x20D0,   0x20FF },
    { "ControlPictures",                      0x2400,   0x243F },
    { "Coptic",                               0x2C80,   0x2CFF },
    { "CurrencySymbols",                      0x20A0,   0x20CF },
    { "CypriotSyllabary",                     0x10800,  0x1083F },
    { "Cyrillic",                             0x0400,   0x04FF },
    { "CyrillicSupplement",                   0x0500,   0x052F },
    { "Deseret",                              0x10400,  0x1044F },
    { "Devanagari",                           0x0900,   0x097F },
    { "Dingbats",                             0x2700,   0x27BF },
    { "EnclosedAlphanumerics",                0x2460,   0x24FF },
    { "EnclosedCJKLettersandMonths",          0x3200,   0x32FF },
    { "Ethiopic",                             0x1200,   0x137F },
    { "EthiopicExtended",                     0x2D80,   0x2DDF },
    { "EthiopicSupplement",                   0x1380,   0x139F },
    { "GeneralPunctuation",                   0x2000,   0x206F },
    { "GeometricShapes",                      0x25A0,   0x25FF },
    { "Georgian",                             0x10A0,   0x10FF },
    { "GeorgianSupplement",                   0x2D00,   0x2D2F },
    { "Glagolitic",                           0x2C00,   0x2C5F },
    { "Gothic",                               0x10330,  0x1034F },
    { "Greek",                                0x0370,   0x03FF },
    { "GreekExtended",                        0x1F00,   0x1FFF },
    { "Gujarati",                             0x0A80,   0x0AFF },
    { "Gurmukhi",                             0x0A00,   0x0A7F },
    { "HalfwidthandFullwidthForms",           0xFF00,   0xFFEF },
    { "HangulCompatibilityJamo",              0x3130,   0x318F },
    { "HangulJamo",                           0x1100,   0x11FF },
    { "HangulSyllables",                      0xAC00,   0xD7A3 },
    { "Hanunoo",                              0x1720,   0x173F },
    { "Hebrew",                               0x0590,   0x05FF },
    { "Hiragana",                             0x3040,   0x309F },
    { "IPAExtensions",                        0x0250,   0x02AF },
    { "IdeographicDescriptionCharacters",     0x2FF0,   0x2FFF },
    { "Kanbun",                               0x3190,   0x319F },
    { "KangxiRadicals",                       0x2F00,   0x2FDF },
    { "Kannada",                              0x0C80,   0x0CFF },
    { "Katakana",                             0x30A0,   0x30FF },
    { "KatakanaPhoneticExtensions",           0x31F0,   0x31FF },
    { "Kharoshthi",                           0x10A00,  0x10A5F },
    { "Khmer",                                0x1780,   0x17FF },
    { "KhmerSymbols",                         0x19E0,   0x19FF },
    { "Lao",                                  0x0E80,   0x0EFF },
    { "Latin-1Supplement",                    0x0080,   0x00FF },
    { "LatinExtended-A",                      0x0100,   0x017F },
    { "LatinExtended-B",                      0x0180,   0x024F },
    { "LatinExtendedAdditional",              0x1E00,   0x1EFF },
    { "LetterlikeSymbols",                    0x2100,   0x214F },
    { "Limbu",                                0x1900,   0x194F },
    { "LinearBIdeograms",                     0x10080,  0x100FF },
    { "LinearBSyllabary",                     0x10000,  0x1007F },
    { "Malayalam",                            0x0D00,   0x0D7F },
    { "MathematicalAlphanumericSymbols",      0x1D400,  0x1D7FF },
    { "MathematicalOperators",                0x2200,   0x22FF },
    { "MiscellaneousMathematicalSymbols-A",   0x27C0,   0x27EF },
    { "MiscellaneousMathematicalSymbols-B",   0x2980,   0x29FF },
    { "MiscellaneousSymbols",                 0x2600,   0x26FF },
    { "MiscellaneousSymbolsandArrows",        0x2B00,   0x2BFF },
    { "MiscellaneousTechnical",               0x2300,   0x23FF },
    { "ModifierToneLetters",                  0xA700,   0xA71F },
    { "Mongolian",                            0x1800,   0x18AF },
    { "MusicalSymbols",                       0x1D100,  0x1D1FF },
    { "Myanmar",                              0x1000,   0x109F },
    { "NewTaiLue",                            0x1980,   0x19DF },
    { "NumberForms",                          0x2150,   0x218F },
    { "Ogham",                                0x1680,   0x169F },
    { "OldItalic",                            0x10300,  0x1032F },
    { "OldPersian",                           0x103A0,  0x103DF },
    { "OpticalCharacterRecognition",          0x2440,   0x245F },
    { "Oriya",                                0x0B00,   0x0B7F },
    { "Osmanya",                              0x10480,  0x104AF },
    { "PhoneticExtensions",                   0x1D00,   0x1D7F },
    { "PhoneticExtensionsSupplement",         0x1D80,   0x1DBF },
    { "PrivateUse",                           0xE000,   0xF8FF },
    { "Runic",                                0x16A0,   0x16FF },
    { "Shavian",                              0x10450,  0x1047F },
    { "Sinhala",                              0x0D80,   0x0DFF },
    { "SmallFormVariants",                    0xFE50,   0xFE6F },
    { "SpacingModifierLetters",               0x02B0,   0x02FF },
    { "Specials",                             0xFFF0,   0xFFFF },
    { "SuperscriptsandSubscripts",            0x2070,   0x209F },
    { "SupplementalArrows-A",                 0x27F0,   0x27FF },
    { "SupplementalArrows-B",                 0x2900,   0x297F },
    { "SupplementalMathematicalOperators",    0x2A00,   0x2AFF },
    { "SupplementalPunctuation",              0x2E00,   0x2E7F },
    { "SupplementaryPrivateUseArea-A",        0xF0000,  0xFFFFF },
    { "SupplementaryPrivateUseArea-B",        0x100000, 0x10FFFF },
    { "SylotiNagri",                          0xA800,   0xA82F },
    { "Syriac",                               0x0700,   0x074F },
    { "Tagalog",                              0x1700,   0x171F },
    { "Tagbanwa",                             0x1760,   0x177F },
    { "Tags",                                 0xE0000,  0xE007F },
    { "TaiLe",                                0x1950,   0x197F },
    { "TaiXuanJingSymbols",                   0x1D300,  0x1D35F },
    { "Tamil",                                0x0B80,   0x0BFF },
    { "Telugu",                               0x0C00,   0x0C7F },
    { "Thaana",                               0x0780,   0x07BF },
    { "Thai",                                 0x0E00,   0x0E7F },
    { "Tibetan",                              0x0F00,   0x0FFF },
    { "Tifinagh",                             0x2D30,   0x2D7F },
    { "Ugaritic",                             0x10380,  0x1039F },
    { "UnifiedCanadianAboriginalSyllabics",   0x1400,   0x167F },
    { "VariationSelectors",                   0xFE00,   0xFE0F },
    { "VariationSelectorsSupplement",         0xE0100,  0xE01EF },
    { "VerticalForms",                        0xFE10,   0xFE1F },
    { "YiRadicals",                           0xA490,   0xA4CF },
    { "YiSyllables",                          0xA000,   0xA48F },
    { "YijingHexagramSymbols",                0x4DC0,   0x4DFF }
};
2848
2849inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2)
2850{ return qstrcmp(str1: entry1.name, str2: entry2.name) < 0; }
2851inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry)
2852{ return qstrcmp(str1: name, str2: entry.name) < 0; }
2853inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name)
2854{ return qstrcmp(str1: entry.name, str2: name) < 0; }
2855#endif // QT_NO_REGEXP_CCLASS
2856
2857int QRegExpEngine::getChar()
2858{
2859 return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2860}
2861
2862int QRegExpEngine::getEscape()
2863{
2864#ifndef QT_NO_REGEXP_ESCAPE
2865 const char tab[] = "afnrtv"; // no b, as \b means word boundary
2866 const char backTab[] = "\a\f\n\r\t\v";
2867 ushort low;
2868 int i;
2869#endif
2870 ushort val;
2871 int prevCh = yyCh;
2872
2873 if (prevCh == EOS) {
2874 error(RXERR_END);
2875 return Tok_Char | '\\';
2876 }
2877 yyCh = getChar();
2878#ifndef QT_NO_REGEXP_ESCAPE
2879 if ((prevCh & ~0xff) == 0) {
2880 const char *p = strchr(s: tab, c: prevCh);
2881 if (p != nullptr)
2882 return Tok_Char | backTab[p - tab];
2883 }
2884#endif
2885
2886 switch (prevCh) {
2887#ifndef QT_NO_REGEXP_ESCAPE
2888 case '0':
2889 val = 0;
2890 for (i = 0; i < 3; i++) {
2891 if (yyCh >= '0' && yyCh <= '7')
2892 val = (val << 3) | (yyCh - '0');
2893 else
2894 break;
2895 yyCh = getChar();
2896 }
2897 if ((val & ~0377) != 0)
2898 error(RXERR_OCTAL);
2899 return Tok_Char | val;
2900#endif
2901#ifndef QT_NO_REGEXP_ESCAPE
2902 case 'B':
2903 return Tok_NonWord;
2904#endif
2905#ifndef QT_NO_REGEXP_CCLASS
2906 case 'D':
2907 // see QChar::isDigit()
2908 yyCharClass->addCategories(cats: uint(-1) ^ FLAG(QChar::Number_DecimalDigit));
2909 return Tok_CharClass;
2910 case 'S':
2911 // see QChar::isSpace()
2912 yyCharClass->addCategories(cats: uint(-1) ^ (FLAG(QChar::Separator_Space) |
2913 FLAG(QChar::Separator_Line) |
2914 FLAG(QChar::Separator_Paragraph) |
2915 FLAG(QChar::Other_Control)));
2916 yyCharClass->addRange(from: 0x0000, to: 0x0008);
2917 yyCharClass->addRange(from: 0x000e, to: 0x001f);
2918 yyCharClass->addRange(from: 0x007f, to: 0x0084);
2919 yyCharClass->addRange(from: 0x0086, to: 0x009f);
2920 return Tok_CharClass;
2921 case 'W':
2922 // see QChar::isLetterOrNumber() and QChar::isMark()
2923 yyCharClass->addCategories(cats: uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) |
2924 FLAG(QChar::Mark_SpacingCombining) |
2925 FLAG(QChar::Mark_Enclosing) |
2926 FLAG(QChar::Number_DecimalDigit) |
2927 FLAG(QChar::Number_Letter) |
2928 FLAG(QChar::Number_Other) |
2929 FLAG(QChar::Letter_Uppercase) |
2930 FLAG(QChar::Letter_Lowercase) |
2931 FLAG(QChar::Letter_Titlecase) |
2932 FLAG(QChar::Letter_Modifier) |
2933 FLAG(QChar::Letter_Other) |
2934 FLAG(QChar::Punctuation_Connector)));
2935 yyCharClass->addRange(from: 0x203f, to: 0x2040);
2936 yyCharClass->addSingleton(ch: 0x2040);
2937 yyCharClass->addSingleton(ch: 0x2054);
2938 yyCharClass->addSingleton(ch: 0x30fb);
2939 yyCharClass->addRange(from: 0xfe33, to: 0xfe34);
2940 yyCharClass->addRange(from: 0xfe4d, to: 0xfe4f);
2941 yyCharClass->addSingleton(ch: 0xff3f);
2942 yyCharClass->addSingleton(ch: 0xff65);
2943 return Tok_CharClass;
2944#endif
2945#ifndef QT_NO_REGEXP_ESCAPE
2946 case 'b':
2947 return Tok_Word;
2948#endif
2949#ifndef QT_NO_REGEXP_CCLASS
2950 case 'd':
2951 // see QChar::isDigit()
2952 yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit));
2953 return Tok_CharClass;
2954 case 's':
2955 // see QChar::isSpace()
2956 yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
2957 FLAG(QChar::Separator_Line) |
2958 FLAG(QChar::Separator_Paragraph));
2959 yyCharClass->addRange(from: 0x0009, to: 0x000d);
2960 yyCharClass->addSingleton(ch: 0x0085);
2961 return Tok_CharClass;
2962 case 'w':
2963 // see QChar::isLetterOrNumber() and QChar::isMark()
2964 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
2965 FLAG(QChar::Mark_SpacingCombining) |
2966 FLAG(QChar::Mark_Enclosing) |
2967 FLAG(QChar::Number_DecimalDigit) |
2968 FLAG(QChar::Number_Letter) |
2969 FLAG(QChar::Number_Other) |
2970 FLAG(QChar::Letter_Uppercase) |
2971 FLAG(QChar::Letter_Lowercase) |
2972 FLAG(QChar::Letter_Titlecase) |
2973 FLAG(QChar::Letter_Modifier) |
2974 FLAG(QChar::Letter_Other));
2975 yyCharClass->addSingleton(ch: 0x005f); // '_'
2976 return Tok_CharClass;
2977 case 'I':
2978 if (!xmlSchemaExtensions)
2979 break;
2980 yyCharClass->setNegative(!yyCharClass->negative());
2981 Q_FALLTHROUGH();
2982 case 'i':
2983 if (xmlSchemaExtensions) {
2984 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
2985 FLAG(QChar::Mark_SpacingCombining) |
2986 FLAG(QChar::Mark_Enclosing) |
2987 FLAG(QChar::Number_DecimalDigit) |
2988 FLAG(QChar::Number_Letter) |
2989 FLAG(QChar::Number_Other) |
2990 FLAG(QChar::Letter_Uppercase) |
2991 FLAG(QChar::Letter_Lowercase) |
2992 FLAG(QChar::Letter_Titlecase) |
2993 FLAG(QChar::Letter_Modifier) |
2994 FLAG(QChar::Letter_Other));
2995 yyCharClass->addSingleton(ch: 0x003a); // ':'
2996 yyCharClass->addSingleton(ch: 0x005f); // '_'
2997 yyCharClass->addRange(from: 0x0041, to: 0x005a); // [A-Z]
2998 yyCharClass->addRange(from: 0x0061, to: 0x007a); // [a-z]
2999 yyCharClass->addRange(from: 0xc0, to: 0xd6);
3000 yyCharClass->addRange(from: 0xd8, to: 0xf6);
3001 yyCharClass->addRange(from: 0xf8, to: 0x2ff);
3002 yyCharClass->addRange(from: 0x370, to: 0x37d);
3003 yyCharClass->addRange(from: 0x37f, to: 0x1fff);
3004 yyCharClass->addRange(from: 0x200c, to: 0x200d);
3005 yyCharClass->addRange(from: 0x2070, to: 0x218f);
3006 yyCharClass->addRange(from: 0x2c00, to: 0x2fef);
3007 yyCharClass->addRange(from: 0x3001, to: 0xd7ff);
3008 yyCharClass->addRange(from: 0xf900, to: 0xfdcf);
3009 yyCharClass->addRange(from: 0xfdf0, to: 0xfffd);
3010 yyCharClass->addRange(from: (ushort)0x10000, to: (ushort)0xeffff);
3011 return Tok_CharClass;
3012 } else {
3013 break;
3014 }
3015 case 'C':
3016 if (!xmlSchemaExtensions)
3017 break;
3018 yyCharClass->setNegative(!yyCharClass->negative());
3019 Q_FALLTHROUGH();
3020 case 'c':
3021 if (xmlSchemaExtensions) {
3022 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3023 FLAG(QChar::Mark_SpacingCombining) |
3024 FLAG(QChar::Mark_Enclosing) |
3025 FLAG(QChar::Number_DecimalDigit) |
3026 FLAG(QChar::Number_Letter) |
3027 FLAG(QChar::Number_Other) |
3028 FLAG(QChar::Letter_Uppercase) |
3029 FLAG(QChar::Letter_Lowercase) |
3030 FLAG(QChar::Letter_Titlecase) |
3031 FLAG(QChar::Letter_Modifier) |
3032 FLAG(QChar::Letter_Other));
3033 yyCharClass->addSingleton(ch: 0x002d); // '-'
3034 yyCharClass->addSingleton(ch: 0x002e); // '.'
3035 yyCharClass->addSingleton(ch: 0x003a); // ':'
3036 yyCharClass->addSingleton(ch: 0x005f); // '_'
3037 yyCharClass->addSingleton(ch: 0xb7);
3038 yyCharClass->addRange(from: 0x0030, to: 0x0039); // [0-9]
3039 yyCharClass->addRange(from: 0x0041, to: 0x005a); // [A-Z]
3040 yyCharClass->addRange(from: 0x0061, to: 0x007a); // [a-z]
3041 yyCharClass->addRange(from: 0xc0, to: 0xd6);
3042 yyCharClass->addRange(from: 0xd8, to: 0xf6);
3043 yyCharClass->addRange(from: 0xf8, to: 0x2ff);
3044 yyCharClass->addRange(from: 0x370, to: 0x37d);
3045 yyCharClass->addRange(from: 0x37f, to: 0x1fff);
3046 yyCharClass->addRange(from: 0x200c, to: 0x200d);
3047 yyCharClass->addRange(from: 0x2070, to: 0x218f);
3048 yyCharClass->addRange(from: 0x2c00, to: 0x2fef);
3049 yyCharClass->addRange(from: 0x3001, to: 0xd7ff);
3050 yyCharClass->addRange(from: 0xf900, to: 0xfdcf);
3051 yyCharClass->addRange(from: 0xfdf0, to: 0xfffd);
3052 yyCharClass->addRange(from: (ushort)0x10000, to: (ushort)0xeffff);
3053 yyCharClass->addRange(from: 0x0300, to: 0x036f);
3054 yyCharClass->addRange(from: 0x203f, to: 0x2040);
3055 return Tok_CharClass;
3056 } else {
3057 break;
3058 }
3059 case 'P':
3060 if (!xmlSchemaExtensions)
3061 break;
3062 yyCharClass->setNegative(!yyCharClass->negative());
3063 Q_FALLTHROUGH();
3064 case 'p':
3065 if (xmlSchemaExtensions) {
3066 if (yyCh != '{') {
3067 error(RXERR_CHARCLASS);
3068 return Tok_CharClass;
3069 }
3070
3071 QByteArray category;
3072 yyCh = getChar();
3073 while (yyCh != '}') {
3074 if (yyCh == EOS) {
3075 error(RXERR_END);
3076 return Tok_CharClass;
3077 }
3078 category.append(c: yyCh);
3079 yyCh = getChar();
3080 }
3081 yyCh = getChar(); // skip closing '}'
3082
3083 int catlen = category.size();
3084 if (catlen == 1 || catlen == 2) {
3085 switch (category.at(i: 0)) {
3086 case 'M':
3087 if (catlen == 1) {
3088 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3089 FLAG(QChar::Mark_SpacingCombining) |
3090 FLAG(QChar::Mark_Enclosing));
3091 } else {
3092 switch (category.at(i: 1)) {
3093 case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn
3094 case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc
3095 case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me
3096 default: error(RXERR_CATEGORY); break;
3097 }
3098 }
3099 break;
3100 case 'N':
3101 if (catlen == 1) {
3102 yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) |
3103 FLAG(QChar::Number_Letter) |
3104 FLAG(QChar::Number_Other));
3105 } else {
3106 switch (category.at(i: 1)) {
3107 case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd
3108 case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl
3109 case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No
3110 default: error(RXERR_CATEGORY); break;
3111 }
3112 }
3113 break;
3114 case 'Z':
3115 if (catlen == 1) {
3116 yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
3117 FLAG(QChar::Separator_Line) |
3118 FLAG(QChar::Separator_Paragraph));
3119 } else {
3120 switch (category.at(i: 1)) {
3121 case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs
3122 case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl
3123 case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp
3124 default: error(RXERR_CATEGORY); break;
3125 }
3126 }
3127 break;
3128 case 'C':
3129 if (catlen == 1) {
3130 yyCharClass->addCategories(FLAG(QChar::Other_Control) |
3131 FLAG(QChar::Other_Format) |
3132 FLAG(QChar::Other_Surrogate) |
3133 FLAG(QChar::Other_PrivateUse) |
3134 FLAG(QChar::Other_NotAssigned));
3135 } else {
3136 switch (category.at(i: 1)) {
3137 case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc
3138 case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf
3139 case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs
3140 case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co
3141 case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn
3142 default: error(RXERR_CATEGORY); break;
3143 }
3144 }
3145 break;
3146 case 'L':
3147 if (catlen == 1) {
3148 yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) |
3149 FLAG(QChar::Letter_Lowercase) |
3150 FLAG(QChar::Letter_Titlecase) |
3151 FLAG(QChar::Letter_Modifier) |
3152 FLAG(QChar::Letter_Other));
3153 } else {
3154 switch (category.at(i: 1)) {
3155 case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu
3156 case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll
3157 case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt
3158 case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm
3159 case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo
3160 default: error(RXERR_CATEGORY); break;
3161 }
3162 }
3163 break;
3164 case 'P':
3165 if (catlen == 1) {
3166 yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) |
3167 FLAG(QChar::Punctuation_Dash) |
3168 FLAG(QChar::Punctuation_Open) |
3169 FLAG(QChar::Punctuation_Close) |
3170 FLAG(QChar::Punctuation_InitialQuote) |
3171 FLAG(QChar::Punctuation_FinalQuote) |
3172 FLAG(QChar::Punctuation_Other));
3173 } else {
3174 switch (category.at(i: 1)) {
3175 case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc
3176 case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd
3177 case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps
3178 case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe
3179 case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi
3180 case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf
3181 case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po
3182 default: error(RXERR_CATEGORY); break;
3183 }
3184 }
3185 break;
3186 case 'S':
3187 if (catlen == 1) {
3188 yyCharClass->addCategories(FLAG(QChar::Symbol_Math) |
3189 FLAG(QChar::Symbol_Currency) |
3190 FLAG(QChar::Symbol_Modifier) |
3191 FLAG(QChar::Symbol_Other));
3192 } else {
3193 switch (category.at(i: 1)) {
3194 case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm
3195 case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc
3196 case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk
3197 case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So
3198 default: error(RXERR_CATEGORY); break;
3199 }
3200 }
3201 break;
3202 default:
3203 error(RXERR_CATEGORY);
3204 break;
3205 }
3206 } else if (catlen > 2 && category.at(i: 0) == 'I' && category.at(i: 1) == 's') {
3207 static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]);
3208 const char * const categoryFamily = category.constData() + 2;
3209 const CategoriesRangeMapEntry *r = std::lower_bound(first: categoriesRangeMap, last: categoriesRangeMap + N, val: categoryFamily);
3210 if (r != categoriesRangeMap + N && qstrcmp(str1: r->name, str2: categoryFamily) == 0)
3211 yyCharClass->addRange(from: r->first, to: r->second);
3212 else
3213 error(RXERR_CATEGORY);
3214 } else {
3215 error(RXERR_CATEGORY);
3216 }
3217 return Tok_CharClass;
3218 } else {
3219 break;
3220 }
3221#endif
3222#ifndef QT_NO_REGEXP_ESCAPE
3223 case 'x':
3224 val = 0;
3225 for (i = 0; i < 4; i++) {
3226 low = QChar(yyCh).toLower().unicode();
3227 if (low >= '0' && low <= '9')
3228 val = (val << 4) | (low - '0');
3229 else if (low >= 'a' && low <= 'f')
3230 val = (val << 4) | (low - 'a' + 10);
3231 else
3232 break;
3233 yyCh = getChar();
3234 }
3235 return Tok_Char | val;
3236#endif
3237 default:
3238 break;
3239 }
3240 if (prevCh >= '1' && prevCh <= '9') {
3241#ifndef QT_NO_REGEXP_BACKREF
3242 val = prevCh - '0';
3243 while (yyCh >= '0' && yyCh <= '9') {
3244 val = (val * 10) + (yyCh - '0');
3245 yyCh = getChar();
3246 }
3247 return Tok_BackRef | val;
3248#else
3249 error(RXERR_DISABLED);
3250#endif
3251 }
3252 return Tok_Char | prevCh;
3253}
3254
3255#ifndef QT_NO_REGEXP_INTERVAL
3256int QRegExpEngine::getRep(int def)
3257{
3258 if (yyCh >= '0' && yyCh <= '9') {
3259 int rep = 0;
3260 do {
3261 rep = 10 * rep + yyCh - '0';
3262 if (rep >= InftyRep) {
3263 error(RXERR_REPETITION);
3264 rep = def;
3265 }
3266 yyCh = getChar();
3267 } while (yyCh >= '0' && yyCh <= '9');
3268 return rep;
3269 } else {
3270 return def;
3271 }
3272}
3273#endif
3274
3275#ifndef QT_NO_REGEXP_LOOKAHEAD
3276void QRegExpEngine::skipChars(int n)
3277{
3278 if (n > 0) {
3279 yyPos += n - 1;
3280 yyCh = getChar();
3281 }
3282}
3283#endif
3284
3285void QRegExpEngine::error(const char *msg)
3286{
3287 if (yyError.isEmpty())
3288 yyError = QLatin1String(msg);
3289}
3290
3291void QRegExpEngine::startTokenizer(const QChar *rx, int len)
3292{
3293 yyIn = rx;
3294 yyPos0 = 0;
3295 yyPos = 0;
3296 yyLen = len;
3297 yyCh = getChar();
3298 yyCharClass.emplace();
3299 yyMinRep = 0;
3300 yyMaxRep = 0;
3301 yyError = QString();
3302}
3303
3304int QRegExpEngine::getToken()
3305{
3306#ifndef QT_NO_REGEXP_CCLASS
3307 ushort pendingCh = 0;
3308 bool charPending;
3309 bool rangePending;
3310 int tok;
3311#endif
3312 int prevCh = yyCh;
3313
3314 yyPos0 = yyPos - 1;
3315#ifndef QT_NO_REGEXP_CCLASS
3316 yyCharClass->clear();
3317#endif
3318 yyMinRep = 0;
3319 yyMaxRep = 0;
3320 yyCh = getChar();
3321
3322 switch (prevCh) {
3323 case EOS:
3324 yyPos0 = yyPos;
3325 return Tok_Eos;
3326 case '$':
3327 return Tok_Dollar;
3328 case '(':
3329 if (yyCh == '?') {
3330 prevCh = getChar();
3331 yyCh = getChar();
3332 switch (prevCh) {
3333#ifndef QT_NO_REGEXP_LOOKAHEAD
3334 case '!':
3335 return Tok_NegLookahead;
3336 case '=':
3337 return Tok_PosLookahead;
3338#endif
3339 case ':':
3340 return Tok_MagicLeftParen;
3341 case '<':
3342 error(RXERR_LOOKBEHIND);
3343 return Tok_MagicLeftParen;
3344 default:
3345 error(RXERR_LOOKAHEAD);
3346 return Tok_MagicLeftParen;
3347 }
3348 } else {
3349 return Tok_LeftParen;
3350 }
3351 case ')':
3352 return Tok_RightParen;
3353 case '*':
3354 yyMinRep = 0;
3355 yyMaxRep = InftyRep;
3356 return Tok_Quantifier;
3357 case '+':
3358 yyMinRep = 1;
3359 yyMaxRep = InftyRep;
3360 return Tok_Quantifier;
3361 case '.':
3362#ifndef QT_NO_REGEXP_CCLASS
3363 yyCharClass->setNegative(true);
3364#endif
3365 return Tok_CharClass;
3366 case '?':
3367 yyMinRep = 0;
3368 yyMaxRep = 1;
3369 return Tok_Quantifier;
3370 case '[':
3371#ifndef QT_NO_REGEXP_CCLASS
3372 if (yyCh == '^') {
3373 yyCharClass->setNegative(true);
3374 yyCh = getChar();
3375 }
3376 charPending = false;
3377 rangePending = false;
3378 do {
3379 if (yyCh == '-' && charPending && !rangePending) {
3380 rangePending = true;
3381 yyCh = getChar();
3382 } else {
3383 if (charPending && !rangePending) {
3384 yyCharClass->addSingleton(ch: pendingCh);
3385 charPending = false;
3386 }
3387 if (yyCh == '\\') {
3388 yyCh = getChar();
3389 tok = getEscape();
3390 if (tok == Tok_Word)
3391 tok = '\b';
3392 } else {
3393 tok = Tok_Char | yyCh;
3394 yyCh = getChar();
3395 }
3396 if (tok == Tok_CharClass) {
3397 if (rangePending) {
3398 yyCharClass->addSingleton(ch: '-');
3399 yyCharClass->addSingleton(ch: pendingCh);
3400 charPending = false;
3401 rangePending = false;
3402 }
3403 } else if ((tok & Tok_Char) != 0) {
3404 if (rangePending) {
3405 yyCharClass->addRange(from: pendingCh, to: tok ^ Tok_Char);
3406 charPending = false;
3407 rangePending = false;
3408 } else {
3409 pendingCh = tok ^ Tok_Char;
3410 charPending = true;
3411 }
3412 } else {
3413 error(RXERR_CHARCLASS);
3414 }
3415 }
3416 } while (yyCh != ']' && yyCh != EOS);
3417 if (rangePending)
3418 yyCharClass->addSingleton(ch: '-');
3419 if (charPending)
3420 yyCharClass->addSingleton(ch: pendingCh);
3421 if (yyCh == EOS)
3422 error(RXERR_END);
3423 else
3424 yyCh = getChar();
3425 return Tok_CharClass;
3426#else
3427 error(RXERR_END);
3428 return Tok_Char | '[';
3429#endif
3430 case '\\':
3431 return getEscape();
3432 case ']':
3433 error(RXERR_LEFTDELIM);
3434 return Tok_Char | ']';
3435 case '^':
3436 return Tok_Caret;
3437 case '{':
3438#ifndef QT_NO_REGEXP_INTERVAL
3439 yyMinRep = getRep(def: 0);
3440 yyMaxRep = yyMinRep;
3441 if (yyCh == ',') {
3442 yyCh = getChar();
3443 yyMaxRep = getRep(def: InftyRep);
3444 }
3445 if (yyMaxRep < yyMinRep)
3446 error(RXERR_INTERVAL);
3447 if (yyCh != '}')
3448 error(RXERR_REPETITION);
3449 yyCh = getChar();
3450 return Tok_Quantifier;
3451#else
3452 error(RXERR_DISABLED);
3453 return Tok_Char | '{';
3454#endif
3455 case '|':
3456 return Tok_Bar;
3457 case '}':
3458 error(RXERR_LEFTDELIM);
3459 return Tok_Char | '}';
3460 default:
3461 return Tok_Char | prevCh;
3462 }
3463}
3464
3465int QRegExpEngine::parse(const QChar *pattern, int len)
3466{
3467 valid = true;
3468 startTokenizer(rx: pattern, len);
3469 yyTok = getToken();
3470#ifndef QT_NO_REGEXP_CAPTURE
3471 yyMayCapture = true;
3472#else
3473 yyMayCapture = false;
3474#endif
3475
3476#ifndef QT_NO_REGEXP_CAPTURE
3477 int atom = startAtom(officialCapture: false);
3478#endif
3479 QRegExpCharClass anything;
3480 Box box(this); // create InitialState
3481 box.set(anything);
3482 Box rightBox(this); // create FinalState
3483 rightBox.set(anything);
3484
3485 Box middleBox(this);
3486 parseExpression(box: &middleBox);
3487#ifndef QT_NO_REGEXP_CAPTURE
3488 finishAtom(atom, needCapture: false);
3489#endif
3490#ifndef QT_NO_REGEXP_OPTIM
3491 middleBox.setupHeuristics();
3492#endif
3493 box.cat(b: middleBox);
3494 box.cat(b: rightBox);
3495 yyCharClass.reset();
3496
3497#ifndef QT_NO_REGEXP_CAPTURE
3498 for (int i = 0; i < nf; ++i) {
3499 switch (f[i].capture) {
3500 case QRegExpAtom::NoCapture:
3501 break;
3502 case QRegExpAtom::OfficialCapture:
3503 f[i].capture = ncap;
3504 captureForOfficialCapture.append(t: ncap);
3505 ++ncap;
3506 ++officialncap;
3507 break;
3508 case QRegExpAtom::UnofficialCapture:
3509 f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3510 }
3511 }
3512
3513#ifndef QT_NO_REGEXP_BACKREF
3514#ifndef QT_NO_REGEXP_OPTIM
3515 if (officialncap == 0 && nbrefs == 0) {
3516 ncap = nf = 0;
3517 f.clear();
3518 }
3519#endif
3520 // handle the case where there's a \5 with no corresponding capture
3521 // (captureForOfficialCapture.size() != officialncap)
3522 for (int i = 0; i < nbrefs - officialncap; ++i) {
3523 captureForOfficialCapture.append(t: ncap);
3524 ++ncap;
3525 }
3526#endif
3527#endif
3528
3529 if (!yyError.isEmpty())
3530 return -1;
3531
3532#ifndef QT_NO_REGEXP_OPTIM
3533 const QRegExpAutomatonState &sinit = s.at(i: InitialState);
3534 caretAnchored = !sinit.anchors.isEmpty();
3535 if (caretAnchored) {
3536 const QMap<int, int> &anchors = sinit.anchors;
3537 QMap<int, int>::const_iterator a;
3538 for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3539 if (
3540#ifndef QT_NO_REGEXP_ANCHOR_ALT
3541 (*a & Anchor_Alternation) != 0 ||
3542#endif
3543 (*a & Anchor_Caret) == 0)
3544 {
3545 caretAnchored = false;
3546 break;
3547 }
3548 }
3549 }
3550#endif
3551
3552 // cleanup anchors
3553 int numStates = s.size();
3554 for (int i = 0; i < numStates; ++i) {
3555 QRegExpAutomatonState &state = s[i];
3556 if (!state.anchors.isEmpty()) {
3557 QMap<int, int>::iterator a = state.anchors.begin();
3558 while (a != state.anchors.end()) {
3559 if (a.value() == 0)
3560 a = state.anchors.erase(it: a);
3561 else
3562 ++a;
3563 }
3564 }
3565 }
3566
3567 return yyPos0;
3568}
3569
3570void QRegExpEngine::parseAtom(Box *box)
3571{
3572#ifndef QT_NO_REGEXP_LOOKAHEAD
3573 QRegExpEngine *eng = nullptr;
3574 bool neg;
3575 int len;
3576#endif
3577
3578 if ((yyTok & Tok_Char) != 0) {
3579 box->set(QChar(yyTok ^ Tok_Char));
3580 } else {
3581#ifndef QT_NO_REGEXP_OPTIM
3582 trivial = false;
3583#endif
3584 switch (yyTok) {
3585 case Tok_Dollar:
3586 box->catAnchor(a: Anchor_Dollar);
3587 break;
3588 case Tok_Caret:
3589 box->catAnchor(a: Anchor_Caret);
3590 break;
3591#ifndef QT_NO_REGEXP_LOOKAHEAD
3592 case Tok_PosLookahead:
3593 case Tok_NegLookahead:
3594 neg = (yyTok == Tok_NegLookahead);
3595 eng = new QRegExpEngine(cs, greedyQuantifiers);
3596 len = eng->parse(pattern: yyIn + yyPos - 1, len: yyLen - yyPos + 1);
3597 if (len >= 0)
3598 skipChars(n: len);
3599 else
3600 error(RXERR_LOOKAHEAD);
3601 box->catAnchor(a: addLookahead(eng, negative: neg));
3602 yyTok = getToken();
3603 if (yyTok != Tok_RightParen)
3604 error(RXERR_LOOKAHEAD);
3605 break;
3606#endif
3607#ifndef QT_NO_REGEXP_ESCAPE
3608 case Tok_Word:
3609 box->catAnchor(a: Anchor_Word);
3610 break;
3611 case Tok_NonWord:
3612 box->catAnchor(a: Anchor_NonWord);
3613 break;
3614#endif
3615 case Tok_LeftParen:
3616 case Tok_MagicLeftParen:
3617 yyTok = getToken();
3618 parseExpression(box);
3619 if (yyTok != Tok_RightParen)
3620 error(RXERR_END);
3621 break;
3622 case Tok_CharClass:
3623 box->set(*yyCharClass);
3624 break;
3625 case Tok_Quantifier:
3626 error(RXERR_REPETITION);
3627 break;
3628 default:
3629#ifndef QT_NO_REGEXP_BACKREF
3630 if ((yyTok & Tok_BackRef) != 0)
3631 box->set(yyTok ^ Tok_BackRef);
3632 else
3633#endif
3634 error(RXERR_DISABLED);
3635 }
3636 }
3637 yyTok = getToken();
3638}
3639
3640void QRegExpEngine::parseFactor(Box *box)
3641{
3642#ifndef QT_NO_REGEXP_CAPTURE
3643 int outerAtom = greedyQuantifiers ? startAtom(officialCapture: false) : -1;
3644 int innerAtom = startAtom(officialCapture: yyMayCapture && yyTok == Tok_LeftParen);
3645 bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3646#else
3647 const int innerAtom = -1;
3648#endif
3649
3650#ifndef QT_NO_REGEXP_INTERVAL
3651#define YYREDO() \
3652 yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3653 *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3654
3655 const QChar *in = yyIn;
3656 int pos0 = yyPos0;
3657 int pos = yyPos;
3658 int len = yyLen;
3659 int ch = yyCh;
3660 QRegExpCharClass charClass;
3661 if (yyTok == Tok_CharClass)
3662 charClass = *yyCharClass;
3663 int tok = yyTok;
3664 bool mayCapture = yyMayCapture;
3665#endif
3666
3667 parseAtom(box);
3668#ifndef QT_NO_REGEXP_CAPTURE
3669 finishAtom(atom: innerAtom, needCapture: magicLeftParen);
3670#endif
3671
3672 bool hasQuantifier = (yyTok == Tok_Quantifier);
3673 if (hasQuantifier) {
3674#ifndef QT_NO_REGEXP_OPTIM
3675 trivial = false;
3676#endif
3677 if (yyMaxRep == InftyRep) {
3678 box->plus(atom: innerAtom);
3679#ifndef QT_NO_REGEXP_INTERVAL
3680 } else if (yyMaxRep == 0) {
3681 box->clear();
3682#endif
3683 }
3684 if (yyMinRep == 0)
3685 box->opt();
3686
3687#ifndef QT_NO_REGEXP_INTERVAL
3688 yyMayCapture = false;
3689 int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3690 int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3691
3692 Box rightBox(this);
3693 int i;
3694
3695 for (i = 0; i < beta; i++) {
3696 YYREDO();
3697 Box leftBox(this);
3698 parseAtom(box: &leftBox);
3699 leftBox.cat(b: rightBox);
3700 leftBox.opt();
3701 rightBox = leftBox;
3702 }
3703 for (i = 0; i < alpha; i++) {
3704 YYREDO();
3705 Box leftBox(this);
3706 parseAtom(box: &leftBox);
3707 leftBox.cat(b: rightBox);
3708 rightBox = leftBox;
3709 }
3710 rightBox.cat(b: *box);
3711 *box = rightBox;
3712#endif
3713 yyTok = getToken();
3714#ifndef QT_NO_REGEXP_INTERVAL
3715 yyMayCapture = mayCapture;
3716#endif
3717 }
3718#undef YYREDO
3719#ifndef QT_NO_REGEXP_CAPTURE
3720 if (greedyQuantifiers)
3721 finishAtom(atom: outerAtom, needCapture: hasQuantifier);
3722#endif
3723}
3724
3725void QRegExpEngine::parseTerm(Box *box)
3726{
3727#ifndef QT_NO_REGEXP_OPTIM
3728 if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3729 parseFactor(box);
3730#endif
3731 while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3732 Box rightBox(this);
3733 parseFactor(box: &rightBox);
3734 box->cat(b: rightBox);
3735 }
3736}
3737
3738void QRegExpEngine::parseExpression(Box *box)
3739{
3740 parseTerm(box);
3741 while (yyTok == Tok_Bar) {
3742#ifndef QT_NO_REGEXP_OPTIM
3743 trivial = false;
3744#endif
3745 Box rightBox(this);
3746 yyTok = getToken();
3747 parseTerm(box: &rightBox);
3748 box->orx(b: rightBox);
3749 }
3750}
3751
3752/*
3753 The struct QRegExpPrivate contains the private data of a regular
3754 expression other than the automaton. It makes it possible for many
3755 QRegExp objects to use the same QRegExpEngine object with different
3756 QRegExpPrivate objects.
3757*/
3758struct QRegExpPrivate
3759{
3760 QRegExpEngine *eng;
3761 QRegExpEngineKey engineKey;
3762 bool minimal;
3763#ifndef QT_NO_REGEXP_CAPTURE
3764 QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3765 QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3766#endif
3767 QRegExpMatchState matchState;
3768
3769 inline QRegExpPrivate()
3770 : eng(nullptr), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
3771 inline QRegExpPrivate(const QRegExpEngineKey &key)
3772 : eng(nullptr), engineKey(key), minimal(false) {}
3773};
3774
3775#if !defined(QT_NO_REGEXP_OPTIM)
3776struct QRECache
3777{
3778 typedef QHash<QRegExpEngineKey, QRegExpEngine *> EngineCache;
3779 typedef QCache<QRegExpEngineKey, QRegExpEngine> UnusedEngineCache;
3780 EngineCache usedEngines;
3781 UnusedEngineCache unusedEngines;
3782};
3783Q_GLOBAL_STATIC(QRECache, engineCache)
3784static QBasicMutex engineCacheMutex;
3785#endif // QT_NO_REGEXP_OPTIM
3786
3787static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3788{
3789#if !defined(QT_NO_REGEXP_OPTIM)
3790 const auto locker = qt_scoped_lock(mutex&: engineCacheMutex);
3791 if (!eng->ref.deref()) {
3792 if (QRECache *c = engineCache()) {
3793 c->unusedEngines.insert(key, object: eng, cost: 4 + key.pattern.size() / 4);
3794 c->usedEngines.remove(key);
3795 } else {
3796 delete eng;
3797 }
3798 }
3799#else
3800 Q_UNUSED(key);
3801 if (!eng->ref.deref())
3802 delete eng;
3803#endif
3804}
3805
3806static void prepareEngine_helper(QRegExpPrivate *priv)
3807{
3808 Q_ASSERT(!priv->eng);
3809
3810#if !defined(QT_NO_REGEXP_OPTIM)
3811 const auto locker = qt_scoped_lock(mutex&: engineCacheMutex);
3812 if (QRECache *c = engineCache()) {
3813 priv->eng = c->unusedEngines.take(key: priv->engineKey);
3814 if (!priv->eng)
3815 priv->eng = c->usedEngines.value(key: priv->engineKey);
3816 if (!priv->eng)
3817 priv->eng = new QRegExpEngine(priv->engineKey);
3818 else
3819 priv->eng->ref.ref();
3820
3821 c->usedEngines.insert(key: priv->engineKey, value: priv->eng);
3822 return;
3823 }
3824#endif // QT_NO_REGEXP_OPTIM
3825
3826 priv->eng = new QRegExpEngine(priv->engineKey);
3827}
3828
3829inline static void prepareEngine(QRegExpPrivate *priv)
3830{
3831 if (priv->eng)
3832 return;
3833 prepareEngine_helper(priv);
3834 priv->matchState.prepareForMatch(eng: priv->eng);
3835}
3836
3837static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3838{
3839 prepareEngine(priv);
3840 priv->matchState.prepareForMatch(eng: priv->eng);
3841#ifndef QT_NO_REGEXP_CAPTURE
3842 priv->t = str;
3843 priv->capturedCache.clear();
3844#else
3845 Q_UNUSED(str);
3846#endif
3847}
3848
3849static void invalidateEngine(QRegExpPrivate *priv)
3850{
3851 if (priv->eng) {
3852 derefEngine(eng: priv->eng, key: priv->engineKey);
3853 priv->eng = nullptr;
3854 priv->matchState.drain();
3855 }
3856}
3857
3858/*!
3859 \enum QRegExp::CaretMode
3860
3861 The CaretMode enum defines the different meanings of the caret
3862 (\b{^}) in a regular expression. The possible values are:
3863
3864 \value CaretAtZero
3865 The caret corresponds to index 0 in the searched string.
3866
3867 \value CaretAtOffset
3868 The caret corresponds to the start offset of the search.
3869
3870 \value CaretWontMatch
3871 The caret never matches.
3872*/
3873
3874/*!
3875 \enum QRegExp::PatternSyntax
3876
3877 The syntax used to interpret the meaning of the pattern.
3878
3879 \value RegExp A rich Perl-like pattern matching syntax. This is
3880 the default.
3881
3882 \value RegExp2 Like RegExp, but with \l{greedy quantifiers}.
3883 (Introduced in Qt 4.2.)
3884
3885 \value Wildcard This provides a simple pattern matching syntax
3886 similar to that used by shells (command interpreters) for "file
3887 globbing". See \l{QRegExp wildcard matching}.
3888
3889 \value WildcardUnix This is similar to Wildcard but with the
3890 behavior of a Unix shell. The wildcard characters can be escaped
3891 with the character "\\".
3892
3893 \value FixedString The pattern is a fixed string. This is
3894 equivalent to using the RegExp pattern on a string in
3895 which all metacharacters are escaped using escape().
3896
3897 \value W3CXmlSchema11 The pattern is a regular expression as
3898 defined by the W3C XML Schema 1.1 specification.
3899
3900 \sa setPatternSyntax()
3901*/
3902
3903/*!
3904 Constructs an empty regexp.
3905
3906 \sa isValid(), errorString()
3907*/
3908QRegExp::QRegExp()
3909{
3910 priv = new QRegExpPrivate;
3911 prepareEngine(priv);
3912}
3913
3914/*!
3915 Constructs a regular expression object for the given \a pattern
3916 string. The pattern must be given using wildcard notation if \a
3917 syntax is \l Wildcard; the default is \l RegExp. The pattern is
3918 case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3919 greedy (maximal), but can be changed by calling
3920 setMinimal().
3921
3922 \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3923*/
3924QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3925{
3926 priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3927 prepareEngine(priv);
3928}
3929
3930/*!
3931 Constructs a regular expression as a copy of \a rx.
3932
3933 \sa operator=()
3934*/
3935QRegExp::QRegExp(const QRegExp &rx)
3936{
3937 priv = new QRegExpPrivate;
3938 operator=(rx);
3939}
3940
3941/*!
3942 Destroys the regular expression and cleans up its internal data.
3943*/
3944QRegExp::~QRegExp()
3945{
3946 invalidateEngine(priv);
3947 delete priv;
3948}
3949
3950/*!
3951 Copies the regular expression \a rx and returns a reference to the
3952 copy. The case sensitivity, wildcard, and minimal matching options
3953 are also copied.
3954*/
3955QRegExp &QRegExp::operator=(const QRegExp &rx)
3956{
3957 prepareEngine(priv: rx.priv); // to allow sharing
3958 QRegExpEngine *otherEng = rx.priv->eng;
3959 if (otherEng)
3960 otherEng->ref.ref();
3961 invalidateEngine(priv);
3962 priv->eng = otherEng;
3963 priv->engineKey = rx.priv->engineKey;
3964 priv->minimal = rx.priv->minimal;
3965#ifndef QT_NO_REGEXP_CAPTURE
3966 priv->t = rx.priv->t;
3967 priv->capturedCache = rx.priv->capturedCache;
3968#endif
3969 if (priv->eng)
3970 priv->matchState.prepareForMatch(eng: priv->eng);
3971 priv->matchState.captured = rx.priv->matchState.captured;
3972 return *this;
3973}
3974
3975/*!
3976 \fn QRegExp &QRegExp::operator=(QRegExp &&other)
3977
3978 Move-assigns \a other to this QRegExp instance.
3979
3980 \since 5.2
3981*/
3982
3983/*!
3984 \fn void QRegExp::swap(QRegExp &other)
3985 \since 4.8
3986
3987 Swaps regular expression \a other with this regular
3988 expression. This operation is very fast and never fails.
3989*/
3990
3991/*!
3992 Returns \c true if this regular expression is equal to \a rx;
3993 otherwise returns \c false.
3994
3995 Two QRegExp objects are equal if they have the same pattern
3996 strings and the same settings for case sensitivity, wildcard and
3997 minimal matching.
3998*/
3999bool QRegExp::operator==(const QRegExp &rx) const
4000{
4001 return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
4002}
4003
4004/*!
4005 \since 5.6
4006 \relates QRegExp
4007
4008 Returns the hash value for \a key, using
4009 \a seed to seed the calculation.
4010*/
4011size_t qHash(const QRegExp &key, size_t seed) noexcept
4012{
4013 return qHashMulti(seed, args: key.priv->engineKey, args: key.priv->minimal);
4014}
4015
4016/*!
4017 \fn bool QRegExp::operator!=(const QRegExp &rx) const
4018
4019 Returns \c true if this regular expression is not equal to \a rx;
4020 otherwise returns \c false.
4021
4022 \sa operator==()
4023*/
4024
4025/*!
4026 Returns \c true if the pattern string is empty; otherwise returns
4027 false.
4028
4029 If you call exactMatch() with an empty pattern on an empty string
4030 it will return true; otherwise it returns \c false since it operates
4031 over the whole string. If you call indexIn() with an empty pattern
4032 on \e any string it will return the start offset (0 by default)
4033 because the empty pattern matches the 'emptiness' at the start of
4034 the string. In this case the length of the match returned by
4035 matchedLength() will be 0.
4036
4037 See QString::isEmpty().
4038*/
4039
4040bool QRegExp::isEmpty() const
4041{
4042 return priv->engineKey.pattern.isEmpty();
4043}
4044
4045/*!
4046 Returns \c true if the regular expression is valid; otherwise returns
4047 false. An invalid regular expression never matches.
4048
4049 The pattern \b{[a-z} is an example of an invalid pattern, since
4050 it lacks a closing square bracket.
4051
4052 Note that the validity of a regexp may also depend on the setting
4053 of the wildcard flag, for example \b{*.html} is a valid
4054 wildcard regexp but an invalid full regexp.
4055
4056 \sa errorString()
4057*/
4058bool QRegExp::isValid() const
4059{
4060 if (priv->engineKey.pattern.isEmpty()) {
4061 return true;
4062 } else {
4063 prepareEngine(priv);
4064 return priv->eng->isValid();
4065 }
4066}
4067
4068/*!
4069 Returns the pattern string of the regular expression. The pattern
4070 has either regular expression syntax or wildcard syntax, depending
4071 on patternSyntax().
4072
4073 \sa patternSyntax(), caseSensitivity()
4074*/
4075QString QRegExp::pattern() const
4076{
4077 return priv->engineKey.pattern;
4078}
4079
4080/*!
4081 Sets the pattern string to \a pattern. The case sensitivity,
4082 wildcard, and minimal matching options are not changed.
4083
4084 \sa setPatternSyntax(), setCaseSensitivity()
4085*/
4086void QRegExp::setPattern(const QString &pattern)
4087{
4088 if (priv->engineKey.pattern != pattern) {
4089 invalidateEngine(priv);
4090 priv->engineKey.pattern = pattern;
4091 }
4092}
4093
4094/*!
4095 Returns Qt::CaseSensitive if the regexp is matched case
4096 sensitively; otherwise returns Qt::CaseInsensitive.
4097
4098 \sa patternSyntax(), pattern(), isMinimal()
4099*/
4100Qt::CaseSensitivity QRegExp::caseSensitivity() const
4101{
4102 return priv->engineKey.cs;
4103}
4104
4105/*!
4106 Sets case sensitive matching to \a cs.
4107
4108 If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches
4109 \c{readme.txt} but not \c{README.TXT}.
4110
4111 \sa setPatternSyntax(), setPattern(), setMinimal()
4112*/
4113void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
4114{
4115 if ((bool)cs != (bool)priv->engineKey.cs) {
4116 invalidateEngine(priv);
4117 priv->engineKey.cs = cs;
4118 }
4119}
4120
4121/*!
4122 Returns the syntax used by the regular expression. The default is
4123 QRegExp::RegExp.
4124
4125 \sa pattern(), caseSensitivity()
4126*/
4127QRegExp::PatternSyntax QRegExp::patternSyntax() const
4128{
4129 return priv->engineKey.patternSyntax;
4130}
4131
4132/*!
4133 Sets the syntax mode for the regular expression. The default is
4134 QRegExp::RegExp.
4135
4136 Setting \a syntax to QRegExp::Wildcard enables simple shell-like
4137 \l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the
4138 string \c{readme.txt} in wildcard mode, but does not match
4139 \c{readme}.
4140
4141 Setting \a syntax to QRegExp::FixedString means that the pattern
4142 is interpreted as a plain string. Special characters (e.g.,
4143 backslash) don't need to be escaped then.
4144
4145 \sa setPattern(), setCaseSensitivity(), escape()
4146*/
4147void QRegExp::setPatternSyntax(PatternSyntax syntax)
4148{
4149 if (syntax != priv->engineKey.patternSyntax) {
4150 invalidateEngine(priv);
4151 priv->engineKey.patternSyntax = syntax;
4152 }
4153}
4154
4155/*!
4156 Returns \c true if minimal (non-greedy) matching is enabled;
4157 otherwise returns \c false.
4158
4159 \sa caseSensitivity(), setMinimal()
4160*/
4161bool QRegExp::isMinimal() const
4162{
4163 return priv->minimal;
4164}
4165
4166/*!
4167 Enables or disables minimal matching. If \a minimal is false,
4168 matching is greedy (maximal) which is the default.
4169
4170 For example, suppose we have the input string "We must be
4171 <b>bold</b>, very <b>bold</b>!" and the pattern
4172 \b{<b>.*</b>}. With the default greedy (maximal) matching,
4173 the match is "We must be \underline{<b>bold</b>, very
4174 <b>bold</b>}!". But with minimal (non-greedy) matching, the
4175 first match is: "We must be \underline{<b>bold</b>}, very
4176 <b>bold</b>!" and the second match is "We must be <b>bold</b>,
4177 very \underline{<b>bold</b>}!". In practice we might use the pattern
4178 \b{<b>[^<]*\</b>} instead, although this will still fail for
4179 nested tags.
4180
4181 \sa setCaseSensitivity()
4182*/
4183void QRegExp::setMinimal(bool minimal)
4184{
4185 priv->minimal = minimal;
4186}
4187
4188// ### Qt 5: make non-const
4189/*!
4190 Returns \c true if \a str is matched exactly by this regular
4191 expression; otherwise returns \c false. You can determine how much of
4192 the string was matched by calling matchedLength().
4193
4194 For a given regexp string R, exactMatch("R") is the equivalent of
4195 indexIn("^R$") since exactMatch() effectively encloses the regexp
4196 in the start of string and end of string anchors, except that it
4197 sets matchedLength() differently.
4198
4199 For example, if the regular expression is \b{blue}, then
4200 exactMatch() returns \c true only for input \c blue. For inputs \c
4201 bluebell, \c blutak and \c lightblue, exactMatch() returns \c false
4202 and matchedLength() will return 4, 3 and 0 respectively.
4203
4204 Although const, this function sets matchedLength(),
4205 capturedTexts(), and pos().
4206
4207 \sa indexIn(), lastIndexIn()
4208*/
4209bool QRegExp::exactMatch(const QString &str) const
4210{
4211 prepareEngineForMatch(priv, str);
4212 priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: 0, minimal0: priv->minimal, oneTest: true, caretIndex: 0);
4213 if (priv->matchState.captured[1] == str.size()) {
4214 return true;
4215 } else {
4216 priv->matchState.captured[0] = 0;
4217 priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
4218 return false;
4219 }
4220}
4221
4222/*!
    Returns the regexp as a QVariant.
4224*/
4225QRegExp::operator QVariant() const
4226{
4227QT_WARNING_PUSH QT_WARNING_DISABLE_DEPRECATED
4228 QVariant v;
4229 v.setValue(*this);
4230 return v;
4231QT_WARNING_POP
4232}
4233
4234// ### Qt 5: make non-const
4235/*!
4236 Attempts to find a match in \a str from position \a offset (0 by
4237 default). If \a offset is -1, the search starts at the last
4238 character; if -2, at the next to last character; etc.
4239
4240 Returns the position of the first match, or -1 if there was no
4241 match.
4242
4243 The \a caretMode parameter can be used to instruct whether \b{^}
4244 should match at index 0 or at \a offset.
4245
4246 You might prefer to use QString::indexOf(), QString::contains(),
4247 or even QStringList::filter(). To replace matches use
4248 QString::replace().
4249
4250 Example:
4251 \snippet code/src_corelib_text_qregexp.cpp 13
4252
4253 Although const, this function sets matchedLength(),
4254 capturedTexts() and pos().
4255
    If the QRegExp is a wildcard expression (see setPatternSyntax())
    and you want to test a string against the whole wildcard expression,
    use exactMatch() instead of this function.
4259
4260 \sa lastIndexIn(), exactMatch()
4261*/
4262
4263int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4264{
4265 prepareEngineForMatch(priv, str);
4266 if (offset < 0)
4267 offset += str.size();
4268 priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: offset,
4269 minimal0: priv->minimal, oneTest: false, caretIndex: caretIndex(offset, caretMode));
4270 return priv->matchState.captured[0];
4271}
4272
4273// ### Qt 5: make non-const
4274/*!
4275 Attempts to find a match backwards in \a str from position \a
4276 offset. If \a offset is -1 (the default), the search starts at the
4277 last character; if -2, at the next to last character; etc.
4278
4279 Returns the position of the first match, or -1 if there was no
4280 match.
4281
4282 The \a caretMode parameter can be used to instruct whether \b{^}
4283 should match at index 0 or at \a offset.
4284
4285 Although const, this function sets matchedLength(),
4286 capturedTexts() and pos().
4287
4288 \warning Searching backwards is much slower than searching
4289 forwards.
4290
4291 \sa indexIn(), exactMatch()
4292*/
4293
4294int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4295{
4296 prepareEngineForMatch(priv, str);
4297 if (offset < 0)
4298 offset += str.size();
4299 if (offset < 0 || offset > str.size()) {
4300 memset(s: priv->matchState.captured, c: -1, n: priv->matchState.capturedSize*sizeof(int));
4301 return -1;
4302 }
4303
4304 while (offset >= 0) {
4305 priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: offset,
4306 minimal0: priv->minimal, oneTest: true, caretIndex: caretIndex(offset, caretMode));
4307 if (priv->matchState.captured[0] == offset)
4308 return offset;
4309 --offset;
4310 }
4311 return -1;
4312}
4313
4314/*!
4315 Returns the length of the last matched string, or -1 if there was
4316 no match.
4317
4318 \sa exactMatch(), indexIn(), lastIndexIn()
4319*/
4320int QRegExp::matchedLength() const
4321{
4322 return priv->matchState.captured[1];
4323}
4324
4325
4326/*!
4327 Replaces every occurrence of this regular expression in
4328 \a str with \a after and returns the result.
4329
4330 For regular expressions containing \l{capturing parentheses},
4331 occurrences of \b{\\1}, \b{\\2}, ..., in \a after are replaced
4332 with \c {rx}.cap(1), cap(2), ...
4333
4334 \sa indexIn(), lastIndexIn(), QRegExp::cap()
4335*/
4336QString QRegExp::replaceIn(const QString &str, const QString &after) const
4337{
4338 struct QStringCapture
4339 {
4340 int pos;
4341 int len;
4342 int no;
4343 };
4344
4345 QRegExp rx2(*this);
4346
4347 if (str.isEmpty() && rx2.indexIn(str) == -1)
4348 return str;
4349
4350 QString s(str);
4351
4352 int index = 0;
4353 int numCaptures = rx2.captureCount();
4354 int al = after.size();
4355 QRegExp::CaretMode caretMode = QRegExp::CaretAtZero;
4356
4357 if (numCaptures > 0) {
4358 const QChar *uc = after.unicode();
4359 int numBackRefs = 0;
4360
4361 for (int i = 0; i < al - 1; i++) {
4362 if (uc[i] == QLatin1Char('\\')) {
4363 int no = uc[i + 1].digitValue();
4364 if (no > 0 && no <= numCaptures)
4365 numBackRefs++;
4366 }
4367 }
4368
4369 /*
4370 This is the harder case where we have back-references.
4371 */
4372 if (numBackRefs > 0) {
4373 QVarLengthArray<QStringCapture, 16> captures(numBackRefs);
4374 int j = 0;
4375
4376 for (int i = 0; i < al - 1; i++) {
4377 if (uc[i] == QLatin1Char('\\')) {
4378 int no = uc[i + 1].digitValue();
4379 if (no > 0 && no <= numCaptures) {
4380 QStringCapture capture;
4381 capture.pos = i;
4382 capture.len = 2;
4383
4384 if (i < al - 2) {
4385 int secondDigit = uc[i + 2].digitValue();
4386 if (secondDigit != -1 && ((no * 10) + secondDigit) <= numCaptures) {
4387 no = (no * 10) + secondDigit;
4388 ++capture.len;
4389 }
4390 }
4391
4392 capture.no = no;
4393 captures[j++] = capture;
4394 }
4395 }
4396 }
4397
4398 while (index <= s.size()) {
4399 index = rx2.indexIn(str: s, offset: index, caretMode);
4400 if (index == -1)
4401 break;
4402
4403 QString after2(after);
4404 for (j = numBackRefs - 1; j >= 0; j--) {
4405 const QStringCapture &capture = captures[j];
4406 after2.replace(i: capture.pos, len: capture.len, after: rx2.cap(nth: capture.no));
4407 }
4408
4409 s.replace(i: index, len: rx2.matchedLength(), after: after2);
4410 index += after2.size();
4411
4412 // avoid infinite loop on 0-length matches (e.g., QRegExp("[a-z]*"))
4413 if (rx2.matchedLength() == 0)
4414 ++index;
4415
4416 caretMode = QRegExp::CaretWontMatch;
4417 }
4418 return s;
4419 }
4420 }
4421
4422 /*
4423 This is the simple and optimized case where we don't have
4424 back-references.
4425 */
4426 while (index != -1) {
4427 struct {
4428 int pos;
4429 int length;
4430 } replacements[2048];
4431
4432 int pos = 0;
4433 int adjust = 0;
4434 while (pos < 2047) {
4435 index = rx2.indexIn(str: s, offset: index, caretMode);
4436 if (index == -1)
4437 break;
4438 int ml = rx2.matchedLength();
4439 replacements[pos].pos = index;
4440 replacements[pos++].length = ml;
4441 index += ml;
4442 adjust += al - ml;
4443 // avoid infinite loop
4444 if (!ml)
4445 index++;
4446 }
4447 if (!pos)
4448 break;
4449 replacements[pos].pos = s.size();
4450 int newlen = s.size() + adjust;
4451
4452 // to continue searching at the right position after we did
4453 // the first round of replacements
4454 if (index != -1)
4455 index += adjust;
4456 QString newstring;
4457 newstring.reserve(asize: newlen + 1);
4458 QChar *newuc = newstring.data();
4459 QChar *uc = newuc;
4460 int copystart = 0;
4461 int i = 0;
4462 while (i < pos) {
4463 int copyend = replacements[i].pos;
4464 int size = copyend - copystart;
4465 memcpy(dest: static_cast<void*>(uc), src: static_cast<const void *>(s.constData() + copystart), n: size * sizeof(QChar));
4466 uc += size;
4467 memcpy(dest: static_cast<void *>(uc), src: static_cast<const void *>(after.constData()), n: al * sizeof(QChar));
4468 uc += al;
4469 copystart = copyend + replacements[i].length;
4470 i++;
4471 }
4472 memcpy(dest: static_cast<void *>(uc), src: static_cast<const void *>(s.constData() + copystart), n: (s.size() - copystart) * sizeof(QChar));
4473 newstring.resize(size: newlen);
4474 s = newstring;
4475 caretMode = QRegExp::CaretWontMatch;
4476 }
4477 return s;
4478
4479}
4480
4481
4482/*!
4483 \fn QString QRegExp::removeIn(const QString &str) const
4484
    Removes every occurrence of this regular expression in \a str, and
    returns the result.
4487
4488 Does the same as replaceIn(str, QString()).
4489
4490 \sa indexIn(), lastIndexIn(), replaceIn()
4491*/
4492
4493
4494/*!
    \fn int QRegExp::countIn(const QString &str) const
4496
4497 Returns the number of times this regular expression matches
4498 in \a str.
4499
4500 \sa indexIn(), lastIndexIn(), replaceIn()
4501*/
4502
4503int QRegExp::countIn(const QString &str) const
4504{
4505 QRegExp rx2(*this);
4506 int count = 0;
4507 int index = -1;
4508 int len = str.size();
4509 while (index < len - 1) { // count overlapping matches
4510 index = rx2.indexIn(str, offset: index + 1);
4511 if (index == -1)
4512 break;
4513 count++;
4514 }
4515 return count;
4516}
4517
4518/*!
4519 Splits \a str into substrings wherever this regular expression
4520 matches, and returns the list of those strings. If this regular
4521 expression does not match anywhere in the string, split() returns a
4522 single-element list containing \a str.
4523
4524 If \a behavior is set to Qt::KeepEmptyParts, empty fields are
4525 included in the resulting list.
4526
4527 \sa QStringList::join(), QString::split()
4528*/
4529QStringList QRegExp::splitString(const QString &str, Qt::SplitBehavior behavior) const
4530{
4531 QRegExp rx2(*this);
4532 QStringList list;
4533 int start = 0;
4534 int extra = 0;
4535 int end;
4536 while ((end = rx2.indexIn(str, offset: start + extra)) != -1) {
4537 int matchedLen = rx2.matchedLength();
4538 if (start != end || behavior == Qt::KeepEmptyParts)
4539 list.append(t: str.mid(position: start, n: end - start));
4540 start = end + matchedLen;
4541 extra = (matchedLen == 0) ? 1 : 0;
4542 }
4543 if (start != str.size() || behavior == Qt::KeepEmptyParts)
4544 list.append(t: str.mid(position: start, n: -1));
4545 return list;
4546}
4547
4548/*!
4549 Returns a list of all the strings that match this regular
4550 expression in \a stringList.
4551*/
4552QStringList QRegExp::filterList(const QStringList &stringList) const
4553{
4554 QStringList res;
4555 for (const QString &s : stringList) {
4556 if (containedIn(str: s))
4557 res << s;
4558 }
4559 return res;
4560}
4561
4562/*!
    Replaces every occurrence of this regexp in each of the strings of
    \a stringList with \a after. Returns the resulting string list.
4565*/
4566QStringList QRegExp::replaceIn(const QStringList &stringList, const QString &after) const
4567{
4568 QStringList list;
4569 for (const QString &s : stringList)
4570 list << replaceIn(str: s, after);
4571 return list;
4572}
4573
4574/*!
4575 Returns the index position of the first exact match of this regexp in
4576 \a list, searching forward from index position \a from. Returns
4577 -1 if no item matched.
4578
4579 \sa lastIndexIn(), exactMatch()
4580*/
4581int QRegExp::indexIn(const QStringList &list, int from) const
4582{
4583 QRegExp rx2(*this);
4584 if (from < 0)
4585 from = qMax(a: from + list.size(), b: 0);
4586 for (int i = from; i < list.size(); ++i) {
4587 if (rx2.exactMatch(str: list.at(i)))
4588 return i;
4589 }
4590 return -1;
4591}
4592
4593/*!
4594 Returns the index position of the last exact match of this regexp in
4595 \a list, searching backward from index position \a from. If \a
4596 from is -1 (the default), the search starts at the last item.
4597 Returns -1 if no item matched.
4598
4599 \sa QRegExp::exactMatch()
4600*/
4601int QRegExp::lastIndexIn(const QStringList &list, int from) const
4602{
4603 QRegExp rx2(*this);
4604 if (from < 0)
4605 from += list.size();
4606 else if (from >= list.size())
4607 from = list.size() - 1;
4608 for (int i = from; i >= 0; --i) {
4609 if (rx2.exactMatch(str: list.at(i)))
4610 return i;
4611 }
4612 return -1;
4613}
4614
4615#ifndef QT_NO_REGEXP_CAPTURE
4616
4617/*!
4618 \since 4.6
4619 Returns the number of captures contained in the regular expression.
4620 */
4621int QRegExp::captureCount() const
4622{
4623 prepareEngine(priv);
4624 return priv->eng->captureCount();
4625}
4626
4627/*!
4628 Returns a list of the captured text strings.
4629
4630 The first string in the list is the entire matched string. Each
4631 subsequent list element contains a string that matched a
4632 (capturing) subexpression of the regexp.
4633
4634 For example:
4635 \snippet code/src_corelib_text_qregexp.cpp 14
4636
4637 The above example also captures elements that may be present but
4638 which we have no interest in. This problem can be solved by using
4639 non-capturing parentheses:
4640
4641 \snippet code/src_corelib_text_qregexp.cpp 15
4642
4643 Note that if you want to iterate over the list, you should iterate
4644 over a copy, e.g.
4645 \snippet code/src_corelib_text_qregexp.cpp 16
4646
4647 Some regexps can match an indeterminate number of times. For
4648 example if the input string is "Offsets: 12 14 99 231 7" and the
4649 regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of
4650 all the numbers matched. However, after calling
4651 \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4652 "12"), i.e. the entire match was "12" and the first subexpression
4653 matched was "12". The correct approach is to use cap() in a
4654 \l{QRegExp#cap_in_a_loop}{loop}.
4655
4656 The order of elements in the string list is as follows. The first
4657 element is the entire matching string. Each subsequent element
4658 corresponds to the next capturing open left parentheses. Thus
4659 capturedTexts()[1] is the text of the first capturing parentheses,
4660 capturedTexts()[2] is the text of the second and so on
4661 (corresponding to $1, $2, etc., in some other regexp languages).
4662
4663 \sa cap(), pos()
4664*/
4665QStringList QRegExp::capturedTexts() const
4666{
4667 if (priv->capturedCache.isEmpty()) {
4668 prepareEngine(priv);
4669 const int *captured = priv->matchState.captured;
4670 int n = priv->matchState.capturedSize;
4671
4672 for (int i = 0; i < n; i += 2) {
4673 QString m;
4674 if (captured[i + 1] == 0)
4675 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
4676 else if (captured[i] >= 0)
4677 m = priv->t.mid(position: captured[i], n: captured[i + 1]);
4678 priv->capturedCache.append(t: m);
4679 }
4680 priv->t.clear();
4681 }
4682 return priv->capturedCache;
4683}
4684
4685/*!
4686 \internal
4687*/
4688QStringList QRegExp::capturedTexts()
4689{
4690 return const_cast<const QRegExp *>(this)->capturedTexts();
4691}
4692
4693/*!
4694 Returns the text captured by the \a nth subexpression. The entire
4695 match has index 0 and the parenthesized subexpressions have
4696 indexes starting from 1 (excluding non-capturing parentheses).
4697
4698 \snippet code/src_corelib_text_qregexp.cpp 17
4699
4700 The order of elements matched by cap() is as follows. The first
4701 element, cap(0), is the entire matching string. Each subsequent
4702 element corresponds to the next capturing open left parentheses.
4703 Thus cap(1) is the text of the first capturing parentheses, cap(2)
4704 is the text of the second, and so on.
4705
4706 \sa capturedTexts(), pos()
4707*/
4708QString QRegExp::cap(int nth) const
4709{
4710 return capturedTexts().value(i: nth);
4711}
4712
4713/*!
4714 \internal
4715*/
4716QString QRegExp::cap(int nth)
4717{
4718 return const_cast<const QRegExp *>(this)->cap(nth);
4719}
4720
4721/*!
4722 Returns the position of the \a nth captured text in the searched
4723 string. If \a nth is 0 (the default), pos() returns the position
4724 of the whole match.
4725
4726 Example:
4727 \snippet code/src_corelib_text_qregexp.cpp 18
4728
4729 For zero-length matches, pos() always returns -1. (For example, if
4730 cap(4) would return an empty string, pos(4) returns -1.) This is
4731 a feature of the implementation.
4732
4733 \sa cap(), capturedTexts()
4734*/
4735int QRegExp::pos(int nth) const
4736{
4737 if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
4738 return -1;
4739 else
4740 return priv->matchState.captured[2 * nth];
4741}
4742
4743/*!
4744 \internal
4745*/
4746int QRegExp::pos(int nth)
4747{
4748 return const_cast<const QRegExp *>(this)->pos(nth);
4749}
4750
4751/*!
    Returns a text string that explains why a regexp pattern is
    invalid, if that is the case; otherwise returns "no error occurred".
4754
4755 \sa isValid()
4756*/
4757QString QRegExp::errorString() const
4758{
4759 if (isValid()) {
4760 return QString::fromLatin1(RXERR_OK);
4761 } else {
4762 return priv->eng->errorString();
4763 }
4764}
4765
4766/*!
4767 \internal
4768*/
4769QString QRegExp::errorString()
4770{
4771 return const_cast<const QRegExp *>(this)->errorString();
4772}
4773
4774#endif
4775
4776/*!
4777 Returns the string \a str with every regexp special character
    escaped with a backslash. The special characters are $, (, ), *, +,
    ., ?, [, \, ], ^, {, | and }.
4780
4781 Example:
4782
4783 \snippet code/src_corelib_text_qregexp.cpp 19
4784
4785 This function is useful to construct regexp patterns dynamically:
4786
4787 \snippet code/src_corelib_text_qregexp.cpp 20
4788
4789 \sa setPatternSyntax()
4790*/
4791QString QRegExp::escape(const QString &str)
4792{
4793 QString quoted;
4794 const int count = str.size();
4795 quoted.reserve(asize: count * 2);
4796 const QLatin1Char backslash('\\');
4797 for (int i = 0; i < count; i++) {
4798 switch (str.at(i).toLatin1()) {
4799 case '$':
4800 case '(':
4801 case ')':
4802 case '*':
4803 case '+':
4804 case '.':
4805 case '?':
4806 case '[':
4807 case '\\':
4808 case ']':
4809 case '^':
4810 case '{':
4811 case '|':
4812 case '}':
4813 quoted.append(c: backslash);
4814 }
4815 quoted.append(c: str.at(i));
4816 }
4817 return quoted;
4818}
4819
4820
4821#ifndef QT_NO_DATASTREAM
4822/*!
4823 \relates QRegExp
4824
4825 Writes the regular expression \a regExp to stream \a out.
4826
4827 \sa {Serializing Qt Data Types}
4828*/
4829QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
4830{
4831 return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4832 << (quint8)regExp.patternSyntax()
4833 << (quint8)!!regExp.isMinimal();
4834}
4835
4836/*!
4837 \relates QRegExp
4838
4839 Reads a regular expression from stream \a in into \a regExp.
4840
4841 \sa {Serializing Qt Data Types}
4842*/
4843QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
4844{
4845 QString pattern;
4846 quint8 cs;
4847 quint8 patternSyntax;
4848 quint8 isMinimal;
4849
4850 in >> pattern >> cs >> patternSyntax >> isMinimal;
4851
4852 QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4853 QRegExp::PatternSyntax(patternSyntax));
4854
4855 newRegExp.setMinimal(isMinimal);
4856 regExp = newRegExp;
4857 return in;
4858}
4859#endif // QT_NO_DATASTREAM
4860
4861#ifndef QT_NO_DEBUG_STREAM
4862QDebug operator<<(QDebug dbg, const QRegExp &r)
4863{
4864 QDebugStateSaver saver(dbg);
4865 dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax()
4866 << ", pattern='"<< r.pattern() << "')";
4867 return dbg;
4868}
4869#endif
4870
4871QT_END_NAMESPACE
4872

source code of qt5compat/src/core5/text/qregexp.cpp