1// Copyright (C) 2016 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include "qregexp.h"
5
6#include "qalgorithms.h"
7#include "qbitarray.h"
8#include "qcache.h"
9#include "qdatastream.h"
10#include "qdebug.h"
11#include "qhashfunctions.h"
12#include "qlist.h"
13#include "qmap.h"
14#include "qmutex.h"
15#include "qstring.h"
16#include "qstringlist.h"
17#include "qstringmatcher.h"
18#include "private/qlocking_p.h"
19#include "qvarlengtharray.h"
20
21#include <limits.h>
22#include <algorithm>
23
24QT_BEGIN_NAMESPACE
25
// Error strings for the regexp parser.
//
// Every message is wrapped in QT_TRANSLATE_NOOP with the "QRegExp" context.
// NOTE(review): per the Qt documentation that macro only marks the literal
// for the translation tools; the text is presumably translated where it is
// displayed (not visible in this part of the file) -- confirm at the use site.
#define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
#define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
#define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
#define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
#define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
#define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
#define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
#define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
#define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")
39
40/*!
41 \class QRegExp
42 \inmodule QtCore5Compat
43 \reentrant
44 \brief The QRegExp class provides pattern matching using regular expressions.
45
46 \ingroup tools
47 \ingroup shared
48
49 \keyword regular expression
50
 This class is deprecated in Qt 6. Please use QRegularExpression instead
 for all new code. For guidelines on porting old code from QRegExp to
 QRegularExpression, see \l{Porting to QRegularExpression}.
54
55 A regular expression, or "regexp", is a pattern for matching
56 substrings in a text. This is useful in many contexts, e.g.,
57
58 \table
59 \row \li Validation
60 \li A regexp can test whether a substring meets some criteria,
61 e.g. is an integer or contains no whitespace.
62 \row \li Searching
63 \li A regexp provides more powerful pattern matching than
64 simple substring matching, e.g., match one of the words
65 \e{mail}, \e{letter} or \e{correspondence}, but none of the
66 words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
67 \row \li Search and Replace
68 \li A regexp can replace all occurrences of a substring with a
69 different substring, e.g., replace all occurrences of \e{&}
70 with \e{\&amp;} except where the \e{&} is already followed by
71 an \e{amp;}.
72 \row \li String Splitting
73 \li A regexp can be used to identify where a string should be
74 split apart, e.g. splitting tab-delimited strings.
75 \endtable
76
 This documentation presents a brief introduction to regexps, a
 description of Qt's regexp language, some examples, and the function
 documentation itself. QRegExp is modeled on Perl's regexp
80 language. It fully supports Unicode. QRegExp can also be used in a
81 simpler, \e{wildcard mode} that is similar to the functionality
82 found in command shells. The syntax rules used by QRegExp can be
83 changed with setPatternSyntax(). In particular, the pattern syntax
84 can be set to QRegExp::FixedString, which means the pattern to be
85 matched is interpreted as a plain string, i.e., special characters
86 (e.g., backslash) are not escaped.
87
88 A good text on regexps is \e {Mastering Regular Expressions}
89 (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.
90
91 \note In Qt 5, the new QRegularExpression class provides a Perl
92 compatible implementation of regular expressions and is recommended
93 in place of QRegExp.
94
95 \section1 Introduction
96
97 Regexps are built up from expressions, quantifiers, and
98 assertions. The simplest expression is a character, e.g. \b{x}
99 or \b{5}. An expression can also be a set of characters
100 enclosed in square brackets. \b{[ABCD]} will match an \b{A}
101 or a \b{B} or a \b{C} or a \b{D}. We can write this same
102 expression as \b{[A-D]}, and an expression to match any
103 capital letter in the English alphabet is written as
104 \b{[A-Z]}.
105
106 A quantifier specifies the number of occurrences of an expression
107 that must be matched. \b{x{1,1}} means match one and only one
108 \b{x}. \b{x{1,5}} means match a sequence of \b{x}
109 characters that contains at least one \b{x} but no more than
110 five.
111
112 Note that in general regexps cannot be used to check for balanced
113 brackets or tags. For example, a regexp can be written to match an
114 opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
115 are not nested, but if the \c{<b>} tags are nested, that same
116 regexp will match an opening \c{<b>} tag with the wrong closing
117 \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the
118 first \c{<b>} would be matched with the first \c{</b>}, which is
119 not correct. However, it is possible to write a regexp that will
120 match nested brackets or tags correctly, but only if the number of
121 nesting levels is fixed and known. If the number of nesting levels
122 is not fixed and known, it is impossible to write a regexp that
123 will not fail.
124
125 Suppose we want a regexp to match integers in the range 0 to 99.
126 At least one digit is required, so we start with the expression
127 \b{[0-9]{1,1}}, which matches a single digit exactly once. This
128 regexp matches integers in the range 0 to 9. To match integers up
129 to 99, increase the maximum number of occurrences to 2, so the
130 regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the
131 original requirement to match integers from 0 to 99, but it will
132 also match integers that occur in the middle of strings. If we
133 want the matched integer to be the whole string, we must use the
134 anchor assertions, \b{^} (caret) and \b{$} (dollar). When
135 \b{^} is the first character in a regexp, it means the regexp
136 must match from the beginning of the string. When \b{$} is the
137 last character of the regexp, it means the regexp must match to
138 the end of the string. The regexp becomes \b{^[0-9]{1,2}$}.
139 Note that assertions, e.g. \b{^} and \b{$}, do not match
140 characters but locations in the string.
141
142 If you have seen regexps described elsewhere, they may have looked
143 different from the ones shown here. This is because some sets of
144 characters and some quantifiers are so common that they have been
145 given special symbols to represent them. \b{[0-9]} can be
146 replaced with the symbol \b{\\d}. The quantifier to match
147 exactly one occurrence, \b{{1,1}}, can be replaced with the
148 expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So
149 our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can
150 also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of
151 the string, match a digit, followed immediately by 0 or 1 digits}.
152 In practice, it would be written as \b{^\\d\\d?$}. The \b{?}
153 is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1
154 occurrences. \b{?} makes an expression optional. The regexp
155 \b{^\\d\\d?$} means \e{From the beginning of the string, match
156 one digit, followed immediately by 0 or 1 more digit, followed
157 immediately by end of string}.
158
159 To write a regexp that matches one of the words 'mail' \e or
160 'letter' \e or 'correspondence' but does not match words that
161 contain these words, e.g., 'email', 'mailman', 'mailer', and
162 'letterbox', start with a regexp that matches 'mail'. Expressed
163 fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
164 a character expression is automatically quantified by
165 \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an
166 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
167 we can use the vertical bar \b{|}, which means \b{or}, to
168 include the other two words, so our regexp for matching any of the
169 three words becomes \b{mail|letter|correspondence}. Match
170 'mail' \b{or} 'letter' \b{or} 'correspondence'. While this
171 regexp will match one of the three words we want to match, it will
172 also match words we don't want to match, e.g., 'email'. To
173 prevent the regexp from matching unwanted words, we must tell it
174 to begin and end the match at word boundaries. First we enclose
175 our regexp in parentheses, \b{(mail|letter|correspondence)}.
176 Parentheses group expressions together, and they identify a part
177 of the regexp that we wish to \l{capturing text}{capture}.
178 Enclosing the expression in parentheses allows us to use it as a
179 component in more complex regexps. It also allows us to examine
180 which of the three words was actually matched. To force the match
181 to begin and end on word boundaries, we enclose the regexp in
182 \b{\\b} \e{word boundary} assertions:
183 \b{\\b(mail|letter|correspondence)\\b}. Now the regexp means:
184 \e{Match a word boundary, followed by the regexp in parentheses,
 followed by a word boundary}. The \b{\\b} assertion matches a
 \e position in the string, not a \e character. A word boundary is
 the position between a word character and a non-word character
 (e.g., a space or newline), or the beginning or end of a string.
189
190 If we want to replace ampersand characters with the HTML entity
191 \b{\&amp;}, the regexp to match is simply \b{\&}. But this
192 regexp will also match ampersands that have already been converted
193 to HTML entities. We want to replace only ampersands that are not
194 already followed by \b{amp;}. For this, we need the negative
195 lookahead assertion, \b{(?!}__\b{)}. The regexp can then be
196 written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
197 \b{not} \e{followed by} \b{amp;}.
198
199 If we want to count all the occurrences of 'Eric' and 'Eirik' in a
200 string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and
201 \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
202 required to avoid matching words that contain either name,
203 e.g. 'Ericsson'. Note that the second regexp matches more
204 spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
205
206 Some of the examples discussed above are implemented in the
207 \l{#code-examples}{code examples} section.
208
209 \target characters-and-abbreviations-for-sets-of-characters
210 \section1 Characters and Abbreviations for Sets of Characters
211
212 \table
213 \header \li Element \li Meaning
214 \row \li \b{c}
215 \li A character represents itself unless it has a special
216 regexp meaning. e.g. \b{c} matches the character \e c.
217 \row \li \b{\\c}
218 \li A character that follows a backslash matches the character
219 itself, except as specified below. e.g., To match a literal
220 caret at the beginning of a string, write \b{\\^}.
221 \row \li \b{\\a}
222 \li Matches the ASCII bell (BEL, 0x07).
223 \row \li \b{\\f}
224 \li Matches the ASCII form feed (FF, 0x0C).
225 \row \li \b{\\n}
226 \li Matches the ASCII line feed (LF, 0x0A, Unix newline).
227 \row \li \b{\\r}
228 \li Matches the ASCII carriage return (CR, 0x0D).
229 \row \li \b{\\t}
230 \li Matches the ASCII horizontal tab (HT, 0x09).
231 \row \li \b{\\v}
232 \li Matches the ASCII vertical tab (VT, 0x0B).
233 \row \li \b{\\x\e{hhhh}}
234 \li Matches the Unicode character corresponding to the
235 hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
236 \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo})
237 \li matches the ASCII/Latin1 character for the octal number
238 \e{ooo} (between 0 and 0377).
239 \row \li \b{. (dot)}
240 \li Matches any character (including newline).
241 \row \li \b{\\d}
242 \li Matches a digit (QChar::isDigit()).
243 \row \li \b{\\D}
244 \li Matches a non-digit.
245 \row \li \b{\\s}
246 \li Matches a whitespace character (QChar::isSpace()).
247 \row \li \b{\\S}
248 \li Matches a non-whitespace character.
249 \row \li \b{\\w}
250 \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
251 \row \li \b{\\W}
252 \li Matches a non-word character.
253 \row \li \b{\\\e{n}}
254 \li The \e{n}-th backreference, e.g. \\1, \\2, etc.
255 \endtable
256
257 \b{Note:} The C++ compiler transforms backslashes in strings.
258 To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}.
259 To match the backslash character itself, enter it four times, i.e.
260 \c{\\\\}.
261
262 \target sets-of-characters
263 \section1 Sets of Characters
264
265 Square brackets mean match any character contained in the square
266 brackets. The character set abbreviations described above can
267 appear in a character set in square brackets. Except for the
268 character set abbreviations and the following two exceptions,
269 characters do not have special meanings in square brackets.
270
271 \table
272 \row \li \b{^}
273
274 \li The caret negates the character set if it occurs as the
275 first character (i.e. immediately after the opening square
276 bracket). \b{[abc]} matches 'a' or 'b' or 'c', but
277 \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
278
279 \row \li \b{-}
280
281 \li The dash indicates a range of characters. \b{[W-Z]}
282 matches 'W' or 'X' or 'Y' or 'Z'.
283
284 \endtable
285
286 Using the predefined character set abbreviations is more portable
287 than using character ranges across platforms and languages. For
288 example, \b{[0-9]} matches a digit in Western alphabets but
289 \b{\\d} matches a digit in \e any alphabet.
290
291 Note: In other regexp documentation, sets of characters are often
292 called "character classes".
293
294 \target quantifiers
295 \section1 Quantifiers
296
297 By default, an expression is automatically quantified by
298 \b{{1,1}}, i.e. it should occur exactly once. In the following
299 list, \b{\e {E}} stands for expression. An expression is a
300 character, or an abbreviation for a set of characters, or a set of
301 characters in square brackets, or an expression in parentheses.
302
303 \table
304 \row \li \b{\e {E}?}
305
306 \li Matches zero or one occurrences of \e E. This quantifier
307 means \e{The previous expression is optional}, because it
308 will match whether or not the expression is found. \b{\e
309 {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?}
310 matches 'dent' or 'dents'.
311
312 \row \li \b{\e {E}+}
313
314 \li Matches one or more occurrences of \e E. \b{\e {E}+} is
315 the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0',
316 '00', '000', etc.
317
318 \row \li \b{\e {E}*}
319
320 \li Matches zero or more occurrences of \e E. It is the same
321 as \b{\e {E}{0,}}. The \b{*} quantifier is often used
322 in error where \b{+} should be used. For example, if
323 \b{\\s*$} is used in an expression to match strings that
324 end in whitespace, it will match every string because
325 \b{\\s*$} means \e{Match zero or more whitespaces followed
326 by end of string}. The correct regexp to match strings that
327 have at least one trailing whitespace character is
328 \b{\\s+$}.
329
330 \row \li \b{\e {E}{n}}
331
332 \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}}
333 is the same as repeating \e E \e n times. For example,
334 \b{x{5}} is the same as \b{xxxxx}. It is also the same
335 as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}.
336
337 \row \li \b{\e {E}{n,}}
338 \li Matches at least \e n occurrences of \e E.
339
340 \row \li \b{\e {E}{,m}}
341 \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}}
342 is the same as \b{\e {E}{0,m}}.
343
344 \row \li \b{\e {E}{n,m}}
345 \li Matches at least \e n and at most \e m occurrences of \e E.
346 \endtable
347
348 To apply a quantifier to more than just the preceding character,
349 use parentheses to group characters together in an expression. For
350 example, \b{tag+} matches a 't' followed by an 'a' followed by
351 at least one 'g', whereas \b{(tag)+} matches at least one
352 occurrence of 'tag'.
353
354 Note: Quantifiers are normally "greedy". They always match as much
355 text as they can. For example, \b{0+} matches the first zero it
356 finds and all the consecutive zeros after the first zero. Applied
357 to '20005', it matches '2\underline{000}5'. Quantifiers can be made
358 non-greedy, see setMinimal().
359
360 \target capturing parentheses
361 \target backreferences
362 \section1 Capturing Text
363
364 Parentheses allow us to group elements together so that we can
365 quantify and capture them. For example if we have the expression
366 \b{mail|letter|correspondence} that matches a string we know
367 that \e one of the words matched but not which one. Using
368 parentheses allows us to "capture" whatever is matched within
369 their bounds, so if we used \b{(mail|letter|correspondence)}
370 and matched this regexp against the string "I sent you some email"
371 we can use the cap() or capturedTexts() functions to extract the
372 matched characters, in this case 'mail'.
373
374 We can use captured text within the regexp itself. To refer to the
375 captured text we use \e backreferences which are indexed from 1,
376 the same as for cap(). For example we could search for duplicate
377 words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a
378 word boundary followed by one or more word characters followed by
379 one or more non-word characters followed by the same text as the
380 first parenthesized expression followed by a word boundary.
381
382 If we want to use parentheses purely for grouping and not for
383 capturing we can use the non-capturing syntax, e.g.
384 \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
385 end ')'. In this example we match either 'green' or 'blue' but we
386 do not capture the match so we only know whether or not we matched
387 but not which color we actually found. Using non-capturing
388 parentheses is more efficient than using capturing parentheses
389 since the regexp engine has to do less book-keeping.
390
391 Both capturing and non-capturing parentheses may be nested.
392
393 \target greedy quantifiers
394
395 For historical reasons, quantifiers (e.g. \b{*}) that apply to
396 capturing parentheses are more "greedy" than other quantifiers.
397 For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa".
398 This behavior is different from what other regexp engines do
399 (notably, Perl). To obtain a more intuitive capturing behavior,
400 specify QRegExp::RegExp2 to the QRegExp constructor or call
401 setPatternSyntax(QRegExp::RegExp2).
402
403 \target cap_in_a_loop
404
405 When the number of matches cannot be determined in advance, a
406 common idiom is to use cap() in a loop. For example:
407
408 \snippet code/src_corelib_text_qregexp.cpp 0
409
410 \target assertions
411 \section1 Assertions
412
413 Assertions make some statement about the text at the point where
414 they occur in the regexp but they do not match any characters. In
415 the following list \b{\e {E}} stands for any expression.
416
417 \table
418 \row \li \b{^}
419 \li The caret signifies the beginning of the string. If you
420 wish to match a literal \c{^} you must escape it by
421 writing \c{\\^}. For example, \b{^#include} will only
422 match strings which \e begin with the characters '#include'.
423 (When the caret is the first character of a character set it
424 has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.)
425
426 \row \li \b{$}
427 \li The dollar signifies the end of the string. For example
428 \b{\\d\\s*$} will match strings which end with a digit
429 optionally followed by whitespace. If you wish to match a
430 literal \c{$} you must escape it by writing
431 \c{\\$}.
432
433 \row \li \b{\\b}
434 \li A word boundary. For example the regexp
435 \b{\\bOK\\b} means match immediately after a word
436 boundary (e.g. start of string or whitespace) the letter 'O'
437 then the letter 'K' immediately before another word boundary
438 (e.g. end of string or whitespace). But note that the
439 assertion does not actually match any whitespace so if we
440 write \b{(\\bOK\\b)} and we have a match it will only
441 contain 'OK' even if the string is "It's \underline{OK} now".
442
443 \row \li \b{\\B}
444 \li A non-word boundary. This assertion is true wherever
445 \b{\\b} is false. For example if we searched for
446 \b{\\Bon\\B} in "Left on" the match would fail (space
447 and end of string aren't non-word boundaries), but it would
448 match in "t\underline{on}ne".
449
450 \row \li \b{(?=\e E)}
451 \li Positive lookahead. This assertion is true if the
452 expression matches at this point in the regexp. For example,
453 \b{const(?=\\s+char)} matches 'const' whenever it is
454 followed by 'char', as in 'static \underline{const} char *'.
455 (Compare with \b{const\\s+char}, which matches 'static
456 \underline{const char} *'.)
457
458 \row \li \b{(?!\e E)}
459 \li Negative lookahead. This assertion is true if the
460 expression does not match at this point in the regexp. For
461 example, \b{const(?!\\s+char)} matches 'const' \e except
462 when it is followed by 'char'.
463 \endtable
464
465 \target QRegExp wildcard matching
466 \section1 Wildcard Matching
467
468 Most command shells such as \e bash or \e cmd.exe support "file
469 globbing", the ability to identify a group of files by using
470 wildcards. The setPatternSyntax() function is used to switch
471 between regexp and wildcard mode. Wildcard matching is much
472 simpler than full regexps and has only four features:
473
474 \table
475 \row \li \b{c}
476 \li Any character represents itself apart from those mentioned
477 below. Thus \b{c} matches the character \e c.
478 \row \li \b{?}
479 \li Matches any single character. It is the same as
480 \b{.} in full regexps.
481 \row \li \b{*}
482 \li Matches zero or more of any characters. It is the
483 same as \b{.*} in full regexps.
484 \row \li \b{[...]}
485 \li Sets of characters can be represented in square brackets,
486 similar to full regexps. Within the character class, like
487 outside, backslash has no special meaning.
488 \endtable
489
490 In the mode Wildcard, the wildcard characters cannot be
491 escaped. In the mode WildcardUnix, the character '\\' escapes the
492 wildcard.
493
494 For example if we are in wildcard mode and have strings which
495 contain filenames we could identify HTML files with \b{*.html}.
496 This will match zero or more characters followed by a dot followed
497 by 'h', 't', 'm' and 'l'.
498
499 To test a string against a wildcard expression, use exactMatch().
500 For example:
501
502 \snippet code/src_corelib_text_qregexp.cpp 1
503
504 \target perl-users
505 \section1 Notes for Perl Users
506
507 Most of the character class abbreviations supported by Perl are
508 supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters}
509 {characters and abbreviations for sets of characters}.
510
511 In QRegExp, apart from within character classes, \c{^} always
512 signifies the start of the string, so carets must always be
513 escaped unless used for that purpose. In Perl the meaning of caret
514 varies automagically depending on where it occurs so escaping it
515 is rarely necessary. The same applies to \c{$} which in
516 QRegExp always signifies the end of the string.
517
518 QRegExp's quantifiers are the same as Perl's greedy quantifiers
519 (but see the \l{greedy quantifiers}{note above}). Non-greedy
520 matching cannot be applied to individual quantifiers, but can be
521 applied to all the quantifiers in the pattern. For example, to
522 match the Perl regexp \b{ro+?m} requires:
523
524 \snippet code/src_corelib_text_qregexp.cpp 2
525
526 The equivalent of Perl's \c{/i} option is
527 setCaseSensitivity(Qt::CaseInsensitive).
528
529 Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
530
531 In QRegExp \b{.} matches any character, therefore all QRegExp
532 regexps have the equivalent of Perl's \c{/s} option. QRegExp
533 does not have an equivalent to Perl's \c{/m} option, but this
534 can be emulated in various ways for example by splitting the input
535 into lines or by looping with a regexp that searches for newlines.
536
537 Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
538 assertions. The \\G assertion is not supported but can be emulated
539 in a loop.
540
541 Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
542 equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
543 ... correspond to cap(1) or capturedTexts()[1], cap(2) or
544 capturedTexts()[2], etc.
545
546 To substitute a pattern use QString::replace().
547
548 Perl's extended \c{/x} syntax is not supported, nor are
549 directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
550 the other hand, C++'s rules for literal strings can be used to
551 achieve the same:
552
553 \snippet code/src_corelib_text_qregexp.cpp 3
554
555 Both zero-width positive and zero-width negative lookahead
556 assertions (?=pattern) and (?!pattern) are supported with the same
557 syntax as Perl. Perl's lookbehind assertions, "independent"
558 subexpressions and conditional expressions are not supported.
559
560 Non-capturing parentheses are also supported, with the same
561 (?:pattern) syntax.
562
563 See QString::split() and QStringList::join() for equivalents
564 to Perl's split and join functions.
565
566 Note: because C++ transforms \\'s they must be written \e twice in
567 code, e.g. \b{\\b} must be written \b{\\\\b}.
568
569 \target code-examples
570 \section1 Code Examples
571
572 \snippet code/src_corelib_text_qregexp.cpp 4
573
574 The third string matches '\underline{6}'. This is a simple validation
575 regexp for integers in the range 0 to 99.
576
577 \snippet code/src_corelib_text_qregexp.cpp 5
578
579 The second string matches '\underline{This_is-OK}'. We've used the
580 character set abbreviation '\\S' (non-whitespace) and the anchors
581 to match strings which contain no whitespace.
582
583 In the following example we match strings containing 'mail' or
584 'letter' or 'correspondence' but only match whole words i.e. not
585 'email'
586
587 \snippet code/src_corelib_text_qregexp.cpp 6
588
589 The second string matches "Please write the \underline{letter}". The
590 word 'letter' is also captured (because of the parentheses). We
591 can see what text we've captured like this:
592
593 \snippet code/src_corelib_text_qregexp.cpp 7
594
595 This will capture the text from the first set of capturing
596 parentheses (counting capturing left parentheses from left to
597 right). The parentheses are counted from 1 since cap(0) is the
598 whole matched regexp (equivalent to '&' in most regexp engines).
599
600 \snippet code/src_corelib_text_qregexp.cpp 8
601
602 Here we've passed the QRegExp to QString's replace() function to
603 replace the matched text with new text.
604
605 \snippet code/src_corelib_text_qregexp.cpp 9
606
607 We've used the indexIn() function to repeatedly match the regexp in
608 the string. Note that instead of moving forward by one character
609 at a time \c pos++ we could have written \c {pos +=
610 rx.matchedLength()} to skip over the already matched string. The
611 count will equal 3, matching 'One \underline{Eric} another
612 \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
 doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
 by word boundaries.
615
616 One common use of regexps is to split lines of delimited data into
617 their component fields.
618
619 \snippet code/src_corelib_text_qregexp.cpp 10
620
621 In this example our input lines have the format company name, web
622 address and country. Unfortunately the regexp is rather long and
623 not very versatile -- the code will break if we add any more
624 fields. A simpler and better solution is to look for the
625 separator, '\\t' in this case, and take the surrounding text. The
626 QString::split() function can take a separator string or regexp
627 as an argument and split a string accordingly.
628
629 \snippet code/src_corelib_text_qregexp.cpp 11
630
631 Here field[0] is the company, field[1] the web address and so on.
632
633 To imitate the matching of a shell we can use wildcard mode.
634
635 \snippet code/src_corelib_text_qregexp.cpp 12
636
637 Wildcard matching can be convenient because of its simplicity, but
638 any wildcard regexp can be defined using full regexps, e.g.
639 \b{.*\\.html$}. Notice that we can't match both \c .html and \c
640 .htm files with a wildcard unless we use \b{*.htm*} which will
641 also match 'test.html.bak'. A full regexp gives us the precision
642 we need, \b{.*\\.html?$}.
643
644 QRegExp can match case insensitively using setCaseSensitivity(),
645 and can use non-greedy matching, see setMinimal(). By
646 default QRegExp uses full regexps but this can be changed with
647 setPatternSyntax(). Searching can be done forward with indexIn() or backward
648 with lastIndexIn(). Captured text can be accessed using
649 capturedTexts() which returns a string list of all captured
650 strings, or using cap() which returns the captured string for the
651 given index. The pos() function takes a match index and returns
652 the position in the string where the match was made (or -1 if
653 there was no match).
654
655 \sa QString, QStringList, QSortFilterProxyModel
656
657 \section1 Porting to QRegularExpression
658
659 \include corelib/port-from-qregexp.qdocinc porting-to-qregularexpression
660*/
661
// On VxWorks an EOS macro may already be defined when this point is reached;
// remove it so that it cannot clash with the EOS constant declared below.
#if defined(Q_OS_VXWORKS) && defined(EOS)
# undef EOS
#endif

// Modulus used by BadChar() below.
const int NumBadChars = 64;
// Reduces the value returned by ch.unicode() modulo NumBadChars.
#define BadChar(ch) ((ch).unicode() % NumBadChars)

// NOTE(review): the code that consumes the constants below is outside this
// part of the file; the remarks are inferred from the names only and need to
// be confirmed against the call sites.
const int NoOccurrence = INT_MAX; // presumably a "does not occur" marker
const int EmptyCapture = INT_MAX; // presumably marks a capture without text
const int InftyLen = INT_MAX; // presumably an unbounded length
const int InftyRep = 1025; // presumably a repetition count treated as unbounded
const int EOS = -1; // presumably an end-of-string marker
674
675static bool isWord(QChar ch)
676{
677 return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
678}
679
680/*
681 Merges two vectors of ints and puts the result into the first
682 one.
683*/
684static void mergeInto(QList<int> *a, const QList<int> &b)
685{
686 int asize = a->size();
687 int bsize = b.size();
688 if (asize == 0) {
689 *a = b;
690#ifndef QT_NO_REGEXP_OPTIM
691 } else if (bsize == 1 && a->at(i: asize - 1) < b.at(i: 0)) {
692 a->resize(size: asize + 1);
693 (*a)[asize] = b.at(i: 0);
694#endif
695 } else if (bsize >= 1) {
696 int csize = asize + bsize;
697 QList<int> c(csize);
698 int i = 0, j = 0, k = 0;
699 while (i < asize) {
700 if (j < bsize) {
701 if (a->at(i) == b.at(i: j)) {
702 ++i;
703 --csize;
704 } else if (a->at(i) < b.at(i: j)) {
705 c[k++] = a->at(i: i++);
706 } else {
707 c[k++] = b.at(i: j++);
708 }
709 } else {
710 memcpy(dest: c.data() + k, src: a->constData() + i, n: (asize - i) * sizeof(int));
711 break;
712 }
713 }
714 c.resize(size: csize);
715 if (j < bsize)
716 memcpy(dest: c.data() + k, src: b.constData() + j, n: (bsize - j) * sizeof(int));
717 *a = c;
718 }
719}
720
721#ifndef QT_NO_REGEXP_WILDCARD
722/*
723 Translates a wildcard pattern to an equivalent regular expression
724 pattern (e.g., *.cpp to .*\.cpp).
725
726 If enableEscaping is true, it is possible to escape the wildcard
727 characters with \
728*/
729static QString wc2rx(const QString &wc_str, const bool enableEscaping)
730{
731 const int wclen = wc_str.size();
732 QString rx;
733 int i = 0;
734 bool isEscaping = false; // the previous character is '\'
735 const QChar *wc = wc_str.unicode();
736
737 while (i < wclen) {
738 const QChar c = wc[i++];
739 switch (c.unicode()) {
740 case '\\':
741 if (enableEscaping) {
742 if (isEscaping) {
743 rx += QLatin1String("\\\\");
744 } // we insert the \\ later if necessary
745 if (i == wclen) { // the end
746 rx += QLatin1String("\\\\");
747 }
748 } else {
749 rx += QLatin1String("\\\\");
750 }
751 isEscaping = true;
752 break;
753 case '*':
754 if (isEscaping) {
755 rx += QLatin1String("\\*");
756 isEscaping = false;
757 } else {
758 rx += QLatin1String(".*");
759 }
760 break;
761 case '?':
762 if (isEscaping) {
763 rx += QLatin1String("\\?");
764 isEscaping = false;
765 } else {
766 rx += QLatin1Char('.');
767 }
768
769 break;
770 case '$':
771 case '(':
772 case ')':
773 case '+':
774 case '.':
775 case '^':
776 case '{':
777 case '|':
778 case '}':
779 if (isEscaping) {
780 isEscaping = false;
781 rx += QLatin1String("\\\\");
782 }
783 rx += QLatin1Char('\\');
784 rx += c;
785 break;
786 case '[':
787 if (isEscaping) {
788 isEscaping = false;
789 rx += QLatin1String("\\[");
790 } else {
791 rx += c;
792 if (wc[i] == QLatin1Char('^'))
793 rx += wc[i++];
794 if (i < wclen) {
795 if (wc[i] == QLatin1Char(']'))
796 rx += wc[i++];
797 while (i < wclen && wc[i] != QLatin1Char(']')) {
798 if (wc[i] == QLatin1Char('\\'))
799 rx += QLatin1Char('\\');
800 rx += wc[i++];
801 }
802 }
803 }
804 break;
805
806 case ']':
807 if (isEscaping){
808 isEscaping = false;
809 rx += QLatin1String("\\");
810 }
811 rx += c;
812 break;
813
814 default:
815 if (isEscaping){
816 isEscaping = false;
817 rx += QLatin1String("\\\\");
818 }
819 rx += c;
820 }
821 }
822 return rx;
823}
824#endif
825
826static int caretIndex(int offset, QRegExp::CaretMode caretMode)
827{
828 if (caretMode == QRegExp::CaretAtZero) {
829 return 0;
830 } else if (caretMode == QRegExp::CaretAtOffset) {
831 return offset;
832 } else { // QRegExp::CaretWontMatch
833 return -1;
834 }
835}
836
837/*
838 The QRegExpEngineKey struct uniquely identifies an engine.
839*/
840struct QRegExpEngineKey
841{
842 QString pattern;
843 QRegExp::PatternSyntax patternSyntax;
844 Qt::CaseSensitivity cs;
845
846 inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
847 Qt::CaseSensitivity cs)
848 : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
849
850 inline void clear() {
851 pattern.clear();
852 patternSyntax = QRegExp::RegExp;
853 cs = Qt::CaseSensitive;
854 }
855};
856
857static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
858{
859 return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
860 && key1.cs == key2.cs;
861}
862
863static size_t qHash(const QRegExpEngineKey &key, size_t seed = 0) noexcept
864{
865 return qHashMulti(seed, args: key.pattern, args: key.patternSyntax, args: key.cs);
866}
867
868class QRegExpEngine;
869
870/*
871 This is the engine state during matching.
872*/
873struct QRegExpMatchState
874{
875 const QChar *in; // a pointer to the input string data
876 int pos; // the current position in the string
877 int caretPos;
878 int len; // the length of the input string
879 bool minimal; // minimal matching?
880 int *bigArray; // big array holding the data for the next pointers
881 int *inNextStack; // is state is nextStack?
882 int *curStack; // stack of current states
883 int *nextStack; // stack of next states
884 int *curCapBegin; // start of current states' captures
885 int *nextCapBegin; // start of next states' captures
886 int *curCapEnd; // end of current states' captures
887 int *nextCapEnd; // end of next states' captures
888 int *tempCapBegin; // start of temporary captures
889 int *tempCapEnd; // end of temporary captures
890 int *capBegin; // start of captures for a next state
891 int *capEnd; // end of captures for a next state
892 int *slideTab; // bump-along slide table for bad-character heuristic
893 int *captured; // what match() returned last
894 int slideTabSize; // size of slide table
895 int capturedSize;
896#ifndef QT_NO_REGEXP_BACKREF
897 QList<QList<int>> sleeping; // list of back-reference sleepers
898#endif
899 int matchLen; // length of match
900 int oneTestMatchedLen; // length of partial match
901
902 const QRegExpEngine *eng;
903
904 inline QRegExpMatchState() : bigArray(nullptr), captured(nullptr) {}
905 inline ~QRegExpMatchState() { free(ptr: bigArray); }
906
907 void drain() { free(ptr: bigArray); bigArray = nullptr; captured = nullptr; } // to save memory
908 void prepareForMatch(QRegExpEngine *eng);
909 void match(const QChar *str, int len, int pos, bool minimal,
910 bool oneTest, int caretIndex);
911 bool matchHere();
912 bool testAnchor(int i, int a, const int *capBegin);
913};
914
915/*
916 The struct QRegExpAutomatonState represents one state in a modified NFA. The
917 input characters matched are stored in the state instead of on
918 the transitions, something possible for an automaton
919 constructed from a regular expression.
920*/
921struct QRegExpAutomatonState
922{
923#ifndef QT_NO_REGEXP_CAPTURE
924 int atom; // which atom does this state belong to?
925#endif
926 int match; // what does it match? (see CharClassBit and BackRefBit)
927 QList<int> outs; // out-transitions
928 QMap<int, int> reenter; // atoms reentered when transiting out
929 QMap<int, int> anchors; // anchors met when transiting out
930
931 inline QRegExpAutomatonState() { }
932#ifndef QT_NO_REGEXP_CAPTURE
933 inline QRegExpAutomatonState(int a, int m)
934 : atom(a), match(m) { }
935#else
936 inline QRegExpAutomatonState(int m)
937 : match(m) { }
938#endif
939};
940
941Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_RELOCATABLE_TYPE);
942
943/*
944 The struct QRegExpCharClassRange represents a range of characters (e.g.,
945 [0-9] denotes range 48 to 57).
946*/
947struct QRegExpCharClassRange
948{
949 ushort from; // 48
950 ushort len; // 10
951};
952
953Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
954
955#ifndef QT_NO_REGEXP_CAPTURE
956/*
957 The struct QRegExpAtom represents one node in the hierarchy of regular
958 expression atoms.
959*/
960struct QRegExpAtom
961{
962 enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };
963
964 int parent; // index of parent in array of atoms
965 int capture; // index of capture, from 1 to ncap - 1
966};
967
968Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
969#endif
970
971struct QRegExpLookahead;
972
973#ifndef QT_NO_REGEXP_ANCHOR_ALT
974/*
975 The struct QRegExpAnchorAlternation represents a pair of anchors with
976 OR semantics.
977*/
978struct QRegExpAnchorAlternation
979{
980 int a; // this anchor...
981 int b; // ...or this one
982};
983
984Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
985#endif
986
987#ifndef QT_NO_REGEXP_CCLASS
988
989#define FLAG(x) (1 << (x))
990/*
991 The class QRegExpCharClass represents a set of characters, such as can
992 be found in regular expressions (e.g., [a-z] denotes the set
993 {a, b, ..., z}).
994*/
995class QRegExpCharClass
996{
997public:
998 QRegExpCharClass();
999
1000 void clear();
1001 bool negative() const { return n; }
1002 void setNegative(bool negative);
1003 void addCategories(uint cats);
1004 void addRange(ushort from, ushort to);
1005 void addSingleton(ushort ch) { addRange(from: ch, to: ch); }
1006
1007 bool in(QChar ch) const;
1008#ifndef QT_NO_REGEXP_OPTIM
1009 const QList<int> &firstOccurrence() const { return occ1; }
1010#endif
1011
1012#if defined(QT_DEBUG)
1013 void dump() const;
1014#endif
1015
1016private:
1017 QList<QRegExpCharClassRange> r; // character ranges
1018#ifndef QT_NO_REGEXP_OPTIM
1019 QList<int> occ1; // first-occurrence array
1020#endif
1021 uint c; // character classes
1022 bool n; // negative?
1023};
1024#else
1025struct QRegExpCharClass
1026{
1027 int dummy;
1028
1029#ifndef QT_NO_REGEXP_OPTIM
1030 QRegExpCharClass() { occ1.fill(0, NumBadChars); }
1031
1032 const QList<int> &firstOccurrence() const { return occ1; }
1033 QList<int> occ1;
1034#endif
1035};
1036#endif
1037
1038Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_RELOCATABLE_TYPE);
1039
1040/*
1041 The QRegExpEngine class encapsulates a modified nondeterministic
1042 finite automaton (NFA).
1043*/
1044class QRegExpEngine
1045{
1046public:
1047 QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1048 : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1049
1050 QRegExpEngine(const QRegExpEngineKey &key);
1051 ~QRegExpEngine();
1052
1053 bool isValid() const { return valid; }
1054 const QString &errorString() const { return yyError; }
1055 int captureCount() const { return officialncap; }
1056
1057 int createState(QChar ch);
1058 int createState(const QRegExpCharClass &cc);
1059#ifndef QT_NO_REGEXP_BACKREF
1060 int createState(int bref);
1061#endif
1062
1063 void addCatTransitions(const QList<int> &from, const QList<int> &to);
1064#ifndef QT_NO_REGEXP_CAPTURE
1065 void addPlusTransitions(const QList<int> &from, const QList<int> &to, int atom);
1066#endif
1067
1068#ifndef QT_NO_REGEXP_ANCHOR_ALT
1069 int anchorAlternation(int a, int b);
1070 int anchorConcatenation(int a, int b);
1071#else
1072 int anchorAlternation(int a, int b) { return a & b; }
1073 int anchorConcatenation(int a, int b) { return a | b; }
1074#endif
1075 void addAnchors(int from, int to, int a);
1076
1077#ifndef QT_NO_REGEXP_OPTIM
1078 void heuristicallyChooseHeuristic();
1079#endif
1080
1081#if defined(QT_DEBUG)
1082 void dump() const;
1083#endif
1084
1085 QAtomicInt ref;
1086
1087private:
1088 enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
1089 enum { InitialState = 0, FinalState = 1 };
1090
1091 void setup();
1092 int setupState(int match);
1093
1094 /*
1095 Let's hope that 13 lookaheads and 14 back-references are
1096 enough.
1097 */
1098 enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1099 enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
1100 Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
1101 Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1102 Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1103 Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1104
1105 Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
1106 ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
1107#ifndef QT_NO_REGEXP_CAPTURE
1108 int startAtom(bool officialCapture);
1109 void finishAtom(int atom, bool needCapture);
1110#endif
1111
1112#ifndef QT_NO_REGEXP_LOOKAHEAD
1113 int addLookahead(QRegExpEngine *eng, bool negative);
1114#endif
1115
1116#ifndef QT_NO_REGEXP_OPTIM
1117 bool goodStringMatch(QRegExpMatchState &matchState) const;
1118 bool badCharMatch(QRegExpMatchState &matchState) const;
1119#else
1120 bool bruteMatch(QRegExpMatchState &matchState) const;
1121#endif
1122
1123 QList<QRegExpAutomatonState> s; // array of states
1124#ifndef QT_NO_REGEXP_CAPTURE
1125 QList<QRegExpAtom> f; // atom hierarchy
1126 int nf; // number of atoms
1127 int cf; // current atom
1128 QList<int> captureForOfficialCapture;
1129#endif
1130 int officialncap; // number of captures, seen from the outside
1131 int ncap; // number of captures, seen from the inside
1132#ifndef QT_NO_REGEXP_CCLASS
1133 QList<QRegExpCharClass> cl; // array of character classes
1134#endif
1135#ifndef QT_NO_REGEXP_LOOKAHEAD
1136 QList<QRegExpLookahead *> ahead; // array of lookaheads
1137#endif
1138#ifndef QT_NO_REGEXP_ANCHOR_ALT
1139 QList<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1140#endif
1141#ifndef QT_NO_REGEXP_OPTIM
1142 bool caretAnchored; // does the regexp start with ^?
1143 bool trivial; // is the good-string all that needs to match?
1144#endif
1145 bool valid; // is the regular expression valid?
1146 Qt::CaseSensitivity cs; // case sensitive?
1147 bool greedyQuantifiers; // RegExp2?
1148 bool xmlSchemaExtensions;
1149#ifndef QT_NO_REGEXP_BACKREF
1150 int nbrefs; // number of back-references
1151#endif
1152
1153#ifndef QT_NO_REGEXP_OPTIM
1154 bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1155
1156 int goodEarlyStart; // the index where goodStr can first occur in a match
1157 int goodLateStart; // the index where goodStr can last occur in a match
1158 QString goodStr; // the string that any match has to contain
1159
1160 int minl; // the minimum length of a match
1161 QList<int> occ1; // first-occurrence array
1162#endif
1163
1164 /*
1165 The class Box is an abstraction for a regular expression
1166 fragment. It can also be seen as one node in the syntax tree of
1167 a regular expression with synthetized attributes.
1168
1169 Its interface is ugly for performance reasons.
1170 */
1171 class Box
1172 {
1173 public:
1174 Box(QRegExpEngine *engine);
1175 Box(const Box &b) { operator=(b); }
1176
1177 Box &operator=(const Box &b);
1178
1179 void clear() { operator=(b: Box(eng)); }
1180 void set(QChar ch);
1181 void set(const QRegExpCharClass &cc);
1182#ifndef QT_NO_REGEXP_BACKREF
1183 void set(int bref);
1184#endif
1185
1186 void cat(const Box &b);
1187 void orx(const Box &b);
1188 void plus(int atom);
1189 void opt();
1190 void catAnchor(int a);
1191#ifndef QT_NO_REGEXP_OPTIM
1192 void setupHeuristics();
1193#endif
1194
1195#if defined(QT_DEBUG)
1196 void dump() const;
1197#endif
1198
1199 private:
1200 void addAnchorsToEngine(const Box &to) const;
1201
1202 QRegExpEngine *eng; // the automaton under construction
1203 QList<int> ls; // the left states (firstpos)
1204 QList<int> rs; // the right states (lastpos)
1205 QMap<int, int> lanchors; // the left anchors
1206 QMap<int, int> ranchors; // the right anchors
1207 int skipanchors; // the anchors to match if the box is skipped
1208
1209#ifndef QT_NO_REGEXP_OPTIM
1210 int earlyStart; // the index where str can first occur
1211 int lateStart; // the index where str can last occur
1212 QString str; // a string that has to occur in any match
1213 QString leftStr; // a string occurring at the left of this box
1214 QString rightStr; // a string occurring at the right of this box
1215 int maxl; // the maximum length of this box (possibly InftyLen)
1216#endif
1217
1218 int minl; // the minimum length of this box
1219#ifndef QT_NO_REGEXP_OPTIM
1220 QList<int> occ1; // first-occurrence array
1221#endif
1222 };
1223
1224 friend class Box;
1225
1226 /*
1227 This is the lexical analyzer for regular expressions.
1228 */
1229 enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1230 Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1231 Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1232 int getChar();
1233 int getEscape();
1234#ifndef QT_NO_REGEXP_INTERVAL
1235 int getRep(int def);
1236#endif
1237#ifndef QT_NO_REGEXP_LOOKAHEAD
1238 void skipChars(int n);
1239#endif
1240 void error(const char *msg);
1241 void startTokenizer(const QChar *rx, int len);
1242 int getToken();
1243
1244 const QChar *yyIn; // a pointer to the input regular expression pattern
1245 int yyPos0; // the position of yyTok in the input pattern
1246 int yyPos; // the position of the next character to read
1247 int yyLen; // the length of yyIn
1248 int yyCh; // the last character read
1249 QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1250 int yyMinRep; // attribute for Tok_Quantifier
1251 int yyMaxRep; // ditto
1252 QString yyError; // syntax error or overflow during parsing?
1253
1254 /*
1255 This is the syntactic analyzer for regular expressions.
1256 */
1257 int parse(const QChar *rx, int len);
1258 void parseAtom(Box *box);
1259 void parseFactor(Box *box);
1260 void parseTerm(Box *box);
1261 void parseExpression(Box *box);
1262
1263 int yyTok; // the last token read
1264 bool yyMayCapture; // set this to false to disable capturing
1265
1266 friend struct QRegExpMatchState;
1267};
1268
1269#ifndef QT_NO_REGEXP_LOOKAHEAD
1270/*
1271 The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1272 (?=foo) and (?!bar)).
1273*/
1274struct QRegExpLookahead
1275{
1276 QRegExpEngine *eng; // NFA representing the embedded regular expression
1277 bool neg; // negative lookahead?
1278
1279 inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
1280 : eng(eng0), neg(neg0) { }
1281 inline ~QRegExpLookahead() { delete eng; }
1282};
1283#endif
1284
1285/*!
1286 \internal
1287 convert the pattern string to the RegExp syntax.
1288
1289 This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1290 */
1291Q_CORE5COMPAT_EXPORT QString qt_regexp_toCanonical(const QString &pattern,
1292 QRegExp::PatternSyntax patternSyntax)
1293{
1294 switch (patternSyntax) {
1295#ifndef QT_NO_REGEXP_WILDCARD
1296 case QRegExp::Wildcard:
1297 return wc2rx(wc_str: pattern, enableEscaping: false);
1298 case QRegExp::WildcardUnix:
1299 return wc2rx(wc_str: pattern, enableEscaping: true);
1300#endif
1301 case QRegExp::FixedString:
1302 return QRegExp::escape(str: pattern);
1303 case QRegExp::W3CXmlSchema11:
1304 default:
1305 return pattern;
1306 }
1307}
1308
1309QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1310 : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1311 xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1312{
1313 setup();
1314
1315 QString rx = qt_regexp_toCanonical(pattern: key.pattern, patternSyntax: key.patternSyntax);
1316
1317 valid = (parse(rx: rx.unicode(), len: rx.size()) == rx.size());
1318 if (!valid) {
1319#ifndef QT_NO_REGEXP_OPTIM
1320 trivial = false;
1321#endif
1322 error(RXERR_LEFTDELIM);
1323 }
1324}
1325
1326QRegExpEngine::~QRegExpEngine()
1327{
1328#ifndef QT_NO_REGEXP_LOOKAHEAD
1329 qDeleteAll(c: ahead);
1330#endif
1331}
1332
1333void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1334{
1335 /*
1336 We use one QList<int> for all the big data used a lot in
1337 matchHere() and friends.
1338 */
1339 int ns = eng->s.size(); // number of states
1340 int ncap = eng->ncap;
1341#ifndef QT_NO_REGEXP_OPTIM
1342 int newSlideTabSize = qMax(a: eng->minl + 1, b: 16);
1343#else
1344 int newSlideTabSize = 0;
1345#endif
1346 int numCaptures = eng->captureCount();
1347 int newCapturedSize = 2 + 2 * numCaptures;
1348 bigArray = q_check_ptr(p: (int *)realloc(ptr: bigArray, size: ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1349
1350 // set all internal variables only _after_ bigArray is realloc'ed
1351 // to prevent a broken regexp in oom case
1352
1353 slideTabSize = newSlideTabSize;
1354 capturedSize = newCapturedSize;
1355 inNextStack = bigArray;
1356 memset(s: inNextStack, c: -1, n: ns * sizeof(int));
1357 curStack = inNextStack + ns;
1358 nextStack = inNextStack + 2 * ns;
1359
1360 curCapBegin = inNextStack + 3 * ns;
1361 nextCapBegin = curCapBegin + ncap * ns;
1362 curCapEnd = curCapBegin + 2 * ncap * ns;
1363 nextCapEnd = curCapBegin + 3 * ncap * ns;
1364
1365 tempCapBegin = curCapBegin + 4 * ncap * ns;
1366 tempCapEnd = tempCapBegin + ncap;
1367 capBegin = tempCapBegin + 2 * ncap;
1368 capEnd = tempCapBegin + 3 * ncap;
1369
1370 slideTab = tempCapBegin + 4 * ncap;
1371 captured = slideTab + slideTabSize;
1372 memset(s: captured, c: -1, n: capturedSize*sizeof(int));
1373 this->eng = eng;
1374}
1375
1376/*
1377 Tries to match in str and returns an array of (begin, length) pairs
1378 for captured text. If there is no match, all pairs are (-1, -1).
1379*/
1380void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
1381 bool minimal0, bool oneTest, int caretIndex)
1382{
1383 bool matched = false;
1384 QChar char_null;
1385
1386#ifndef QT_NO_REGEXP_OPTIM
1387 if (eng->trivial && !oneTest) {
1388 // ### Qt6: qsizetype
1389 pos = int(QtPrivate::findString(haystack: QStringView(str0, len0), from: pos0, needle: QStringView(eng->goodStr.unicode(), eng->goodStr.size()), cs: eng->cs));
1390 matchLen = eng->goodStr.size();
1391 matched = (pos != -1);
1392 } else
1393#endif
1394 {
1395 in = str0;
1396 if (in == nullptr)
1397 in = &char_null;
1398 pos = pos0;
1399 caretPos = caretIndex;
1400 len = len0;
1401 minimal = minimal0;
1402 matchLen = 0;
1403 oneTestMatchedLen = 0;
1404
1405 if (eng->valid && pos >= 0 && pos <= len) {
1406#ifndef QT_NO_REGEXP_OPTIM
1407 if (oneTest) {
1408 matched = matchHere();
1409 } else {
1410 if (pos <= len - eng->minl) {
1411 if (eng->caretAnchored) {
1412 matched = matchHere();
1413 } else if (eng->useGoodStringHeuristic) {
1414 matched = eng->goodStringMatch(matchState&: *this);
1415 } else {
1416 matched = eng->badCharMatch(matchState&: *this);
1417 }
1418 }
1419 }
1420#else
1421 matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1422#endif
1423 }
1424 }
1425
1426 if (matched) {
1427 int *c = captured;
1428 *c++ = pos;
1429 *c++ = matchLen;
1430
1431 int numCaptures = (capturedSize - 2) >> 1;
1432#ifndef QT_NO_REGEXP_CAPTURE
1433 for (int i = 0; i < numCaptures; ++i) {
1434 int j = eng->captureForOfficialCapture.at(i);
1435 if (capBegin[j] != EmptyCapture) {
1436 int len = capEnd[j] - capBegin[j];
1437 *c++ = (len > 0) ? pos + capBegin[j] : 0;
1438 *c++ = len;
1439 } else {
1440 *c++ = -1;
1441 *c++ = -1;
1442 }
1443 }
1444#endif
1445 } else {
1446 // we rely on 2's complement here
1447 memset(s: captured, c: -1, n: capturedSize * sizeof(int));
1448 }
1449}
1450
1451/*
1452 The three following functions add one state to the automaton and
1453 return the number of the state.
1454*/
1455
1456int QRegExpEngine::createState(QChar ch)
1457{
1458 return setupState(ch.unicode());
1459}
1460
1461int QRegExpEngine::createState(const QRegExpCharClass &cc)
1462{
1463#ifndef QT_NO_REGEXP_CCLASS
1464 int n = cl.size();
1465 cl += QRegExpCharClass(cc);
1466 return setupState(CharClassBit | n);
1467#else
1468 Q_UNUSED(cc);
1469 return setupState(CharClassBit);
1470#endif
1471}
1472
1473#ifndef QT_NO_REGEXP_BACKREF
1474int QRegExpEngine::createState(int bref)
1475{
1476 if (bref > nbrefs) {
1477 nbrefs = bref;
1478 if (nbrefs > MaxBackRefs) {
1479 error(RXERR_LIMIT);
1480 return 0;
1481 }
1482 }
1483 return setupState(BackRefBit | bref);
1484}
1485#endif
1486
1487/*
1488 The two following functions add a transition between all pairs of
1489 states (i, j) where i is found in from, and j is found in to.
1490
1491 Cat-transitions are distinguished from plus-transitions for
1492 capturing.
1493*/
1494
1495void QRegExpEngine::addCatTransitions(const QList<int> &from, const QList<int> &to)
1496{
1497 for (int i = 0; i < from.size(); i++)
1498 mergeInto(a: &s[from.at(i)].outs, b: to);
1499}
1500
1501#ifndef QT_NO_REGEXP_CAPTURE
1502void QRegExpEngine::addPlusTransitions(const QList<int> &from, const QList<int> &to, int atom)
1503{
1504 for (int i = 0; i < from.size(); i++) {
1505 QRegExpAutomatonState &st = s[from.at(i)];
1506 const QList<int> oldOuts = st.outs;
1507 mergeInto(a: &st.outs, b: to);
1508 if (f.at(i: atom).capture != QRegExpAtom::NoCapture) {
1509 for (int j = 0; j < to.size(); j++) {
1510 // ### st.reenter.contains(to.at(j)) check looks suspicious
1511 if (!st.reenter.contains(key: to.at(i: j)) &&
1512 !std::binary_search(first: oldOuts.constBegin(), last: oldOuts.constEnd(), val: to.at(i: j)))
1513 st.reenter.insert(key: to.at(i: j), value: atom);
1514 }
1515 }
1516 }
1517}
1518#endif
1519
1520#ifndef QT_NO_REGEXP_ANCHOR_ALT
1521/*
1522 Returns an anchor that means a OR b.
1523*/
1524int QRegExpEngine::anchorAlternation(int a, int b)
1525{
1526 if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
1527 return a & b;
1528
1529 int n = aa.size();
1530#ifndef QT_NO_REGEXP_OPTIM
1531 if (n > 0 && aa.at(i: n - 1).a == a && aa.at(i: n - 1).b == b)
1532 return Anchor_Alternation | (n - 1);
1533#endif
1534
1535 QRegExpAnchorAlternation element = {.a: a, .b: b};
1536 aa.append(t: element);
1537 return Anchor_Alternation | n;
1538}
1539
1540/*
1541 Returns an anchor that means a AND b.
1542*/
1543int QRegExpEngine::anchorConcatenation(int a, int b)
1544{
1545 if (((a | b) & Anchor_Alternation) == 0)
1546 return a | b;
1547 if ((b & Anchor_Alternation) != 0)
1548 qSwap(value1&: a, value2&: b);
1549
1550 int aprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).a, b);
1551 int bprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).b, b);
1552 return anchorAlternation(a: aprime, b: bprime);
1553}
1554#endif
1555
1556/*
1557 Adds anchor a on a transition caracterised by its from state and
1558 its to state.
1559*/
1560void QRegExpEngine::addAnchors(int from, int to, int a)
1561{
1562 QRegExpAutomatonState &st = s[from];
1563 if (st.anchors.contains(key: to))
1564 a = anchorAlternation(a: st.anchors.value(key: to), b: a);
1565 st.anchors.insert(key: to, value: a);
1566}
1567
1568#ifndef QT_NO_REGEXP_OPTIM
1569/*
1570 This function chooses between the good-string and the bad-character
1571 heuristics. It computes two scores and chooses the heuristic with
1572 the highest score.
1573
1574 Here are some common-sense constraints on the scores that should be
1575 respected if the formulas are ever modified: (1) If goodStr is
1576 empty, the good-string heuristic scores 0. (2) If the regular
1577 expression is trivial, the good-string heuristic should be used.
1578 (3) If the search is case insensitive, the good-string heuristic
1579 should be used, unless it scores 0. (Case insensitivity turns all
1580 entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1581 big, the good-string heuristic should score less.
1582*/
1583void QRegExpEngine::heuristicallyChooseHeuristic()
1584{
1585 if (minl == 0) {
1586 useGoodStringHeuristic = false;
1587 } else if (trivial) {
1588 useGoodStringHeuristic = true;
1589 } else {
1590 /*
1591 Magic formula: The good string has to constitute a good
1592 proportion of the minimum-length string, and appear at a
1593 more-or-less known index.
1594 */
1595 int goodStringScore = (64 * goodStr.size() / minl) -
1596 (goodLateStart - goodEarlyStart);
1597 /*
1598 Less magic formula: We pick some characters at random, and
1599 check whether they are good or bad.
1600 */
1601 int badCharScore = 0;
1602 int step = qMax(a: 1, b: NumBadChars / 32);
1603 for (int i = 1; i < NumBadChars; i += step) {
1604 if (occ1.at(i) == NoOccurrence)
1605 badCharScore += minl;
1606 else
1607 badCharScore += occ1.at(i);
1608 }
1609 badCharScore /= minl;
1610 useGoodStringHeuristic = (goodStringScore > badCharScore);
1611 }
1612}
1613#endif
1614
1615#if defined(QT_DEBUG)
1616void QRegExpEngine::dump() const
1617{
1618 int i, j;
1619 qDebug(msg: "Case %ssensitive engine", cs ? "" : "in");
1620 qDebug(msg: " States");
1621 for (i = 0; i < s.size(); i++) {
1622 qDebug(msg: " %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1623#ifndef QT_NO_REGEXP_CAPTURE
1624 if (nf > 0)
1625 qDebug(msg: " in atom %d", s[i].atom);
1626#endif
1627 int m = s[i].match;
1628 if ((m & CharClassBit) != 0) {
1629 qDebug(msg: " match character class %d", m ^ CharClassBit);
1630#ifndef QT_NO_REGEXP_CCLASS
1631 cl[m ^ CharClassBit].dump();
1632#else
1633 qDebug(" negative character class");
1634#endif
1635 } else if ((m & BackRefBit) != 0) {
1636 qDebug(msg: " match back-reference %d", m ^ BackRefBit);
1637 } else if (m >= 0x20 && m <= 0x7e) {
1638 qDebug(msg: " match 0x%.4x (%c)", m, m);
1639 } else {
1640 qDebug(msg: " match 0x%.4x", m);
1641 }
1642 for (j = 0; j < s[i].outs.size(); j++) {
1643 int next = s[i].outs[j];
1644 qDebug(msg: " -> %d", next);
1645 if (s[i].reenter.contains(key: next))
1646 qDebug(msg: " [reenter %d]", s[i].reenter[next]);
1647 if (s[i].anchors.value(key: next) != 0)
1648 qDebug(msg: " [anchors 0x%.8x]", s[i].anchors[next]);
1649 }
1650 }
1651#ifndef QT_NO_REGEXP_CAPTURE
1652 if (nf > 0) {
1653 qDebug(msg: " Atom Parent Capture");
1654 for (i = 0; i < nf; i++) {
1655 if (f[i].capture == QRegExpAtom::NoCapture) {
1656 qDebug(msg: " %6d %6d nil", i, f[i].parent);
1657 } else {
1658 int cap = f[i].capture;
1659 bool official = captureForOfficialCapture.contains(t: cap);
1660 qDebug(msg: " %6d %6d %6d %s", i, f[i].parent, f[i].capture,
1661 official ? "official" : "");
1662 }
1663 }
1664 }
1665#endif
1666#ifndef QT_NO_REGEXP_ANCHOR_ALT
1667 for (i = 0; i < aa.size(); i++)
1668 qDebug(msg: " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
1669#endif
1670}
1671#endif
1672
1673void QRegExpEngine::setup()
1674{
1675 ref.storeRelaxed(newValue: 1);
1676#ifndef QT_NO_REGEXP_CAPTURE
1677 f.resize(size: 32);
1678 nf = 0;
1679 cf = -1;
1680#endif
1681 officialncap = 0;
1682 ncap = 0;
1683#ifndef QT_NO_REGEXP_OPTIM
1684 caretAnchored = true;
1685 trivial = true;
1686#endif
1687 valid = false;
1688#ifndef QT_NO_REGEXP_BACKREF
1689 nbrefs = 0;
1690#endif
1691#ifndef QT_NO_REGEXP_OPTIM
1692 useGoodStringHeuristic = true;
1693 minl = 0;
1694 occ1.fill(t: 0, newSize: NumBadChars);
1695#endif
1696}
1697
1698int QRegExpEngine::setupState(int match)
1699{
1700#ifndef QT_NO_REGEXP_CAPTURE
1701 s += QRegExpAutomatonState(cf, match);
1702#else
1703 s += QRegExpAutomatonState(match);
1704#endif
1705 return s.size() - 1;
1706}
1707
1708#ifndef QT_NO_REGEXP_CAPTURE
1709/*
1710 Functions startAtom() and finishAtom() should be called to delimit
1711 atoms. When a state is created, it is assigned to the current atom.
1712 The information is later used for capturing.
1713*/
1714int QRegExpEngine::startAtom(bool officialCapture)
1715{
1716 if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1717 f.resize(size: (nf + 1) << 1);
1718 f[nf].parent = cf;
1719 cf = nf++;
1720 f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1721 return cf;
1722}
1723
1724void QRegExpEngine::finishAtom(int atom, bool needCapture)
1725{
1726 if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1727 f[atom].capture = QRegExpAtom::UnofficialCapture;
1728 cf = f.at(i: atom).parent;
1729}
1730#endif
1731
1732#ifndef QT_NO_REGEXP_LOOKAHEAD
1733/*
1734 Creates a lookahead anchor.
1735*/
1736int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1737{
1738 int n = ahead.size();
1739 if (n == MaxLookaheads) {
1740 error(RXERR_LIMIT);
1741 return 0;
1742 }
1743 ahead += new QRegExpLookahead(eng, negative);
1744 return Anchor_FirstLookahead << n;
1745}
1746#endif
1747
1748#ifndef QT_NO_REGEXP_CAPTURE
/*
    We want the longest leftmost captures.

    Returns true if capture set 1 (begin1/end1) beats capture set 2
    (begin2/end2). The first capture index at which the two sets differ
    decides: an earlier begin wins, and with equal begins a later end
    wins. Identical sets (or ncap == 0) yield false.
*/
static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
                            const int *end2)
{
    for (int idx = 0; idx < ncap; ++idx) {
        // it has to start early...
        const int startGain = begin2[idx] - begin1[idx];
        if (startGain != 0)
            return startGain > 0;

        // ...and end late
        const int endGain = end1[idx] - end2[idx];
        if (endGain != 0)
            return endGain > 0;
    }
    return false;
}
1765#endif
1766
1767/*
1768 Returns \c true if anchor a matches at position pos + i in the input
1769 string, otherwise false.
1770*/
1771bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1772{
1773 int j;
1774
1775#ifndef QT_NO_REGEXP_ANCHOR_ALT
1776 if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1777 return testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1778 || testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1779#endif
1780
1781 if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1782 if (pos + i != caretPos)
1783 return false;
1784 }
1785 if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1786 if (pos + i != len)
1787 return false;
1788 }
1789#ifndef QT_NO_REGEXP_ESCAPE
1790 if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
1791 bool before = false;
1792 bool after = false;
1793 if (pos + i != 0)
1794 before = isWord(ch: in[pos + i - 1]);
1795 if (pos + i != len)
1796 after = isWord(ch: in[pos + i]);
1797 if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1798 return false;
1799 if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1800 return false;
1801 }
1802#endif
1803#ifndef QT_NO_REGEXP_LOOKAHEAD
1804 if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1805 const QList<QRegExpLookahead *> &ahead = eng->ahead;
1806 for (j = 0; j < ahead.size(); j++) {
1807 if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1808 QRegExpMatchState matchState;
1809 matchState.prepareForMatch(eng: ahead[j]->eng);
1810 matchState.match(str0: in + pos + i, len0: len - pos - i, pos0: 0,
1811 minimal0: true, oneTest: true, caretIndex: caretPos - pos - i);
1812 if ((matchState.captured[0] == 0) == ahead[j]->neg)
1813 return false;
1814 }
1815 }
1816 }
1817#endif
1818#ifndef QT_NO_REGEXP_CAPTURE
1819#ifndef QT_NO_REGEXP_BACKREF
1820 for (j = 0; j < eng->nbrefs; j++) {
1821 if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1822 int i = eng->captureForOfficialCapture.at(i: j);
1823 if (capBegin[i] != EmptyCapture)
1824 return false;
1825 }
1826 }
1827#endif
1828#endif
1829 return true;
1830}
1831
1832#ifndef QT_NO_REGEXP_OPTIM
1833/*
1834 The three following functions are what Jeffrey Friedl would call
1835 transmissions (or bump-alongs). Using one or the other should make
1836 no difference except in performance.
1837*/
1838
1839bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1840{
1841 int k = matchState.pos + goodEarlyStart;
1842 QStringMatcher matcher(goodStr.unicode(), goodStr.size(), cs);
1843 while ((k = matcher.indexIn(str: matchState.in, length: matchState.len, from: k)) != -1) {
1844 int from = k - goodLateStart;
1845 int to = k - goodEarlyStart;
1846 if (from > matchState.pos)
1847 matchState.pos = from;
1848
1849 while (matchState.pos <= to) {
1850 if (matchState.matchHere())
1851 return true;
1852 ++matchState.pos;
1853 }
1854 ++k;
1855 }
1856 return false;
1857}
1858
1859bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1860{
1861 int slideHead = 0;
1862 int slideNext = 0;
1863 int i;
1864 int lastPos = matchState.len - minl;
1865 memset(s: matchState.slideTab, c: 0, n: matchState.slideTabSize * sizeof(int));
1866
1867 /*
1868 Set up the slide table, used for the bad-character heuristic,
1869 using the table of first occurrence of each character.
1870 */
1871 for (i = 0; i < minl; i++) {
1872 int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1873 if (sk == NoOccurrence)
1874 sk = i + 1;
1875 if (sk > 0) {
1876 int k = i + 1 - sk;
1877 if (k < 0) {
1878 sk = i + 1;
1879 k = 0;
1880 }
1881 if (sk > matchState.slideTab[k])
1882 matchState.slideTab[k] = sk;
1883 }
1884 }
1885
1886 if (matchState.pos > lastPos)
1887 return false;
1888
1889 for (;;) {
1890 if (++slideNext >= matchState.slideTabSize)
1891 slideNext = 0;
1892 if (matchState.slideTab[slideHead] > 0) {
1893 if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1894 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1895 matchState.slideTab[slideHead] = 0;
1896 } else {
1897 if (matchState.matchHere())
1898 return true;
1899 }
1900
1901 if (matchState.pos == lastPos)
1902 break;
1903
1904 /*
1905 Update the slide table. This code has much in common with
1906 the initialization code.
1907 */
1908 int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1909 if (sk == NoOccurrence) {
1910 matchState.slideTab[slideNext] = minl;
1911 } else if (sk > 0) {
1912 int k = slideNext + minl - sk;
1913 if (k >= matchState.slideTabSize)
1914 k -= matchState.slideTabSize;
1915 if (sk > matchState.slideTab[k])
1916 matchState.slideTab[k] = sk;
1917 }
1918 slideHead = slideNext;
1919 ++matchState.pos;
1920 }
1921 return false;
1922}
1923#else
1924bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1925{
1926 while (matchState.pos <= matchState.len) {
1927 if (matchState.matchHere())
1928 return true;
1929 ++matchState.pos;
1930 }
1931 return false;
1932}
1933#endif
1934
1935/*
1936 Here's the core of the engine. It tries to do a match here and now.
1937*/
/*
  Simulates the automaton on the input starting at offset pos.

  curStack holds the ncur states reached after i input characters.
  For each of them, every outgoing transition is tested (anchors
  first, then the input character, character class or back-reference)
  and the accepted target states are collected in nextStack. At the
  end of each step the two stacks and their capture arrays are
  swapped.

  matchLen is set to i each time a transition into FinalState is
  accepted and stays -1 if that never happens; the function returns
  matchLen >= 0. oneTestMatchedLen is set to i - 1 on exit.
*/
1938 bool QRegExpMatchState::matchHere()
1939 {
1940 int ncur = 1, nnext = 0;
1941 int i = 0, j, k, m;
1942 bool stop = false;
1943
1944 matchLen = -1;
1945 oneTestMatchedLen = -1;
1946 curStack[0] = QRegExpEngine::InitialState;
1947
1948 int ncap = eng->ncap;
1949 #ifndef QT_NO_REGEXP_CAPTURE
1950 if (ncap > 0) {
1951 for (j = 0; j < ncap; j++) {
1952 curCapBegin[j] = EmptyCapture;
1953 curCapEnd[j] = EmptyCapture;
1954 }
1955 }
1956 #endif
1957
1958 #ifndef QT_NO_REGEXP_BACKREF
1959 while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
1960 #else
1961 while (ncur > 0 && i <= len - pos && !stop)
1962 #endif
1963 {
// ch is 0 for the extra step taken once the end of the input is reached
1964 int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
1965 for (j = 0; j < ncur; j++) {
1966 int cur = curStack[j];
1967 const QRegExpAutomatonState &scur = eng->s.at(i: cur);
1968 const QList<int> &outs = scur.outs;
1969 for (k = 0; k < outs.size(); k++) {
1970 int next = outs.at(i: k);
1971 const QRegExpAutomatonState &snext = eng->s.at(i: next);
1972 bool inside = true;
1973 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
// number of further steps a matched back-reference keeps the state asleep
1974 int needSomeSleep = 0;
1975 #endif
1976
1977 /*
1978 First, check if the anchors are anchored properly.
1979 */
1980 int a = scur.anchors.value(key: next);
1981 if (a != 0 && !testAnchor(i, a, capBegin: curCapBegin + j * ncap))
1982 inside = false;
1983
1984 /*
1985 If indeed they are, check if the input character is
1986 correct for this transition.
1987 */
1988 if (inside) {
1989 m = snext.match;
1990 if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
1991 if (eng->cs)
1992 inside = (m == ch);
1993 else
1994 inside = (QChar(m).toLower() == QChar(ch).toLower());
1995 } else if (next == QRegExpEngine::FinalState) {
1996 matchLen = i;
// with minimal matching the first arrival at FinalState ends the search
1997 stop = minimal;
1998 inside = true;
1999 } else if ((m & QRegExpEngine::CharClassBit) != 0) {
2000 #ifndef QT_NO_REGEXP_CCLASS
2001 const QRegExpCharClass &cc = eng->cl.at(i: m ^ QRegExpEngine::CharClassBit);
2002 if (eng->cs)
2003 inside = cc.in(ch: QChar(ch));
2004 else if (cc.negative())
2005 inside = cc.in(ch: QChar(ch).toLower()) &&
2006 cc.in(ch: QChar(ch).toUpper());
2007 else
2008 inside = cc.in(ch: QChar(ch).toLower()) ||
2009 cc.in(ch: QChar(ch).toUpper());
2010 #endif
2011 #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2012 } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
2013 int bref = m ^ QRegExpEngine::BackRefBit;
2014 int ell = j * ncap + eng->captureForOfficialCapture.at(i: bref - 1);
2015
2016 inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2017 if (inside) {
2018 if (eng->cs)
2019 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
2020 else
2021 inside = (in[pos + curCapBegin[ell]].toLower()
2022 == QChar(ch).toLower());
2023 }
2024
2025 if (inside) {
// delta is the length of the captured text; a capture that is still open ends at i
2026 int delta;
2027 if (curCapEnd[ell] == EmptyCapture)
2028 delta = i - curCapBegin[ell];
2029 else
2030 delta = curCapEnd[ell] - curCapBegin[ell];
2031
2032 inside = (delta <= len - (pos + i));
2033 if (inside && delta > 1) {
2034 int n = 1;
2035 if (eng->cs) {
2036 while (n < delta) {
2037 if (in[pos + curCapBegin[ell] + n]
2038 != in[pos + i + n])
2039 break;
2040 ++n;
2041 }
2042 } else {
2043 while (n < delta) {
2044 QChar a = in[pos + curCapBegin[ell] + n];
2045 QChar b = in[pos + i + n];
2046 if (a.toLower() != b.toLower())
2047 break;
2048 ++n;
2049 }
2050 }
2051 inside = (n == delta);
2052 if (inside)
2053 needSomeSleep = delta - 1;
2054 }
2055 }
2056 #endif
2057 }
2058 }
2059
2060 /*
2061 We must now update our data structures.
2062 */
2063 if (inside) {
2064 #ifndef QT_NO_REGEXP_CAPTURE
2065 int *capBegin, *capEnd;
2066 #endif
2067 /*
2068 If the next state was not encountered yet, all
2069 is fine.
2070 */
2071 if ((m = inNextStack[next]) == -1) {
2072 m = nnext++;
2073 nextStack[m] = next;
2074 inNextStack[next] = m;
2075 #ifndef QT_NO_REGEXP_CAPTURE
2076 capBegin = nextCapBegin + m * ncap;
2077 capEnd = nextCapEnd + m * ncap;
2078
2079 /*
2080 Otherwise, we'll first maintain captures in
2081 temporary arrays, and decide at the end whether
2082 it's best to keep the previous capture zones or
2083 the new ones.
2084 */
2085 } else {
2086 capBegin = tempCapBegin;
2087 capEnd = tempCapEnd;
2088 #endif
2089 }
2090
2091 #ifndef QT_NO_REGEXP_CAPTURE
2092 /*
2093 Updating the capture zones is much of a task.
2094 */
2095 if (ncap > 0) {
2096 memcpy(dest: capBegin, src: curCapBegin + j * ncap, n: ncap * sizeof(int));
2097 memcpy(dest: capEnd, src: curCapEnd + j * ncap, n: ncap * sizeof(int));
2098 int c = scur.atom, n = snext.atom;
2099 int p = -1, q = -1;
2100 int cap;
2101
2102 /*
2103 Lemma 1. For any x in the range [0..nf), we
2104 have f[x].parent < x.
2105
2106 Proof. By looking at startAtom(), it is
2107 clear that cf < nf holds all the time, and
2108 thus that f[nf].parent < nf.
2109 */
2110
2111 /*
2112 If we are reentering an atom, we empty all
2113 capture zones inside it.
2114 */
2115 if ((q = scur.reenter.value(key: next)) != 0) {
2116 QBitArray b(eng->nf, false);
2117 b.setBit(i: q, val: true);
2118 for (int ell = q + 1; ell < eng->nf; ell++) {
2119 if (b.testBit(i: eng->f.at(i: ell).parent)) {
2120 b.setBit(i: ell, val: true);
2121 cap = eng->f.at(i: ell).capture;
2122 if (cap >= 0) {
2123 capBegin[cap] = EmptyCapture;
2124 capEnd[cap] = EmptyCapture;
2125 }
2126 }
2127 }
2128 p = eng->f.at(i: q).parent;
2129
2130 /*
2131 Otherwise, close the capture zones we are
2132 leaving. We are leaving f[c].capture,
2133 f[f[c].parent].capture,
2134 f[f[f[c].parent].parent].capture, ...,
2135 until f[x].capture, with x such that
2136 f[x].parent is the youngest common ancestor
2137 for c and n.
2138
2139 We go up along c's and n's ancestry until
2140 we find x.
2141 */
2142 } else {
2143 p = c;
2144 q = n;
2145 while (p != q) {
2146 if (p > q) {
2147 cap = eng->f.at(i: p).capture;
2148 if (cap >= 0) {
2149 if (capBegin[cap] == i) {
2150 capBegin[cap] = EmptyCapture;
2151 capEnd[cap] = EmptyCapture;
2152 } else {
2153 capEnd[cap] = i;
2154 }
2155 }
2156 p = eng->f.at(i: p).parent;
2157 } else {
2158 q = eng->f.at(i: q).parent;
2159 }
2160 }
2161 }
2162
2163 /*
2164 In any case, we now open the capture zones
2165 we are entering. We work upwards from n
2166 until we reach p (the parent of the atom we
2167 reenter or the youngest common ancestor).
2168 */
2169 while (n > p) {
2170 cap = eng->f.at(i: n).capture;
2171 if (cap >= 0) {
2172 capBegin[cap] = i;
2173 capEnd[cap] = EmptyCapture;
2174 }
2175 n = eng->f.at(i: n).parent;
2176 }
2177 /*
2178 If the next state was already in
2179 nextStack, we must choose carefully which
2180 capture zones we want to keep.
2181 */
2182 if (capBegin == tempCapBegin &&
2183 isBetterCapture(ncap, begin1: capBegin, end1: capEnd, begin2: nextCapBegin + m * ncap,
2184 end2: nextCapEnd + m * ncap)) {
2185 memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int));
2186 memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int));
2187 }
2188 }
2189 #ifndef QT_NO_REGEXP_BACKREF
2190 /*
2191 We are done with updating the capture zones.
2192 It's now time to put the next state to sleep,
2193 if it needs to, and to remove it from
2194 nextStack.
2195 */
2196 if (needSomeSleep > 0) {
// sleeper layout: [0] step at which to wake up, [1] state, then ncap capture begins, then ncap capture ends
2197 QList<int> zzZ(2 + 2 * ncap);
2198 zzZ[0] = i + needSomeSleep;
2199 zzZ[1] = next;
2200 if (ncap > 0) {
2201 memcpy(dest: zzZ.data() + 2, src: capBegin, n: ncap * sizeof(int));
2202 memcpy(dest: zzZ.data() + 2 + ncap, src: capEnd, n: ncap * sizeof(int));
2203 }
2204 inNextStack[nextStack[--nnext]] = -1;
2205 sleeping.append(t: zzZ);
2206 }
2207 #endif
2208 #endif
2209 }
2210 }
2211 }
2212 #ifndef QT_NO_REGEXP_CAPTURE
2213 /*
2214 If we reached the final state, hurray! Copy the captured
2215 zone.
2216 */
2217 if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2218 memcpy(dest: capBegin, src: nextCapBegin + m * ncap, n: ncap * sizeof(int));
2219 memcpy(dest: capEnd, src: nextCapEnd + m * ncap, n: ncap * sizeof(int));
2220 }
2221 #ifndef QT_NO_REGEXP_BACKREF
2222 /*
2223 It's time to wake up the sleepers.
2224 */
2225 j = 0;
2226 while (j < sleeping.size()) {
2227 if (sleeping.at(i: j)[0] == i) {
2228 const QList<int> &zzZ = sleeping.at(i: j);
2229 int next = zzZ[1];
2230 const int *capBegin = zzZ.data() + 2;
2231 const int *capEnd = zzZ.data() + 2 + ncap;
2232 bool copyOver = true;
2233
2234 if ((m = inNextStack[next]) == -1) {
2235 m = nnext++;
2236 nextStack[m] = next;
2237 inNextStack[next] = m;
2238 } else {
2239 copyOver = isBetterCapture(ncap, begin1: nextCapBegin + m * ncap, end1: nextCapEnd + m * ncap,
2240 begin2: capBegin, end2: capEnd);
2241 }
2242 if (copyOver) {
2243 memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int));
2244 memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int));
2245 }
2246
2247 sleeping.removeAt(i: j);
2248 } else {
2249 ++j;
2250 }
2251 }
2252 #endif
2253 #endif
// reset the membership table for the next step; nextStack itself is kept for the swap below
2254 for (j = 0; j < nnext; j++)
2255 inNextStack[nextStack[j]] = -1;
2256
2257 // avoid needless iteration that confuses oneTestMatchedLen
2258 if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2259 #ifndef QT_NO_REGEXP_BACKREF
2260 && sleeping.isEmpty()
2261 #endif
2262 )
2263 stop = true;
2264
2265 qSwap(value1&: curStack, value2&: nextStack);
2266 #ifndef QT_NO_REGEXP_CAPTURE
2267 qSwap(value1&: curCapBegin, value2&: nextCapBegin);
2268 qSwap(value1&: curCapEnd, value2&: nextCapEnd);
2269 #endif
2270 ncur = nnext;
2271 nnext = 0;
2272 ++i;
2273 }
2274
2275 #ifndef QT_NO_REGEXP_BACKREF
2276 /*
2277 If minimal matching is enabled, we might have some sleepers
2278 left.
2279 */
2280 if (!sleeping.isEmpty())
2281 sleeping.clear();
2282 #endif
2283
2284 oneTestMatchedLen = i - 1;
2285 return (matchLen >= 0);
2286 }
2287
2288#ifndef QT_NO_REGEXP_CCLASS
2289
2290QRegExpCharClass::QRegExpCharClass()
2291 : c(0), n(false)
2292{
2293#ifndef QT_NO_REGEXP_OPTIM
2294 occ1.fill(t: NoOccurrence, newSize: NumBadChars);
2295#endif
2296}
2297
2298void QRegExpCharClass::clear()
2299{
2300 c = 0;
2301 r.clear();
2302 n = false;
2303}
2304
2305void QRegExpCharClass::setNegative(bool negative)
2306{
2307 n = negative;
2308#ifndef QT_NO_REGEXP_OPTIM
2309 occ1.fill(t: 0, newSize: NumBadChars);
2310#endif
2311}
2312
2313void QRegExpCharClass::addCategories(uint cats)
2314{
2315 static const int all_cats = FLAG(QChar::Mark_NonSpacing) |
2316 FLAG(QChar::Mark_SpacingCombining) |
2317 FLAG(QChar::Mark_Enclosing) |
2318 FLAG(QChar::Number_DecimalDigit) |
2319 FLAG(QChar::Number_Letter) |
2320 FLAG(QChar::Number_Other) |
2321 FLAG(QChar::Separator_Space) |
2322 FLAG(QChar::Separator_Line) |
2323 FLAG(QChar::Separator_Paragraph) |
2324 FLAG(QChar::Other_Control) |
2325 FLAG(QChar::Other_Format) |
2326 FLAG(QChar::Other_Surrogate) |
2327 FLAG(QChar::Other_PrivateUse) |
2328 FLAG(QChar::Other_NotAssigned) |
2329 FLAG(QChar::Letter_Uppercase) |
2330 FLAG(QChar::Letter_Lowercase) |
2331 FLAG(QChar::Letter_Titlecase) |
2332 FLAG(QChar::Letter_Modifier) |
2333 FLAG(QChar::Letter_Other) |
2334 FLAG(QChar::Punctuation_Connector) |
2335 FLAG(QChar::Punctuation_Dash) |
2336 FLAG(QChar::Punctuation_Open) |
2337 FLAG(QChar::Punctuation_Close) |
2338 FLAG(QChar::Punctuation_InitialQuote) |
2339 FLAG(QChar::Punctuation_FinalQuote) |
2340 FLAG(QChar::Punctuation_Other) |
2341 FLAG(QChar::Symbol_Math) |
2342 FLAG(QChar::Symbol_Currency) |
2343 FLAG(QChar::Symbol_Modifier) |
2344 FLAG(QChar::Symbol_Other);
2345 c |= (all_cats & cats);
2346#ifndef QT_NO_REGEXP_OPTIM
2347 occ1.fill(t: 0, newSize: NumBadChars);
2348#endif
2349}
2350
2351void QRegExpCharClass::addRange(ushort from, ushort to)
2352{
2353 if (from > to)
2354 qSwap(value1&: from, value2&: to);
2355 int m = r.size();
2356 r.resize(size: m + 1);
2357 r[m].from = from;
2358 r[m].len = to - from + 1;
2359
2360#ifndef QT_NO_REGEXP_OPTIM
2361 int i;
2362
2363 if (to - from < NumBadChars) {
2364 if (from % NumBadChars <= to % NumBadChars) {
2365 for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2366 occ1[i] = 0;
2367 } else {
2368 for (i = 0; i <= to % NumBadChars; i++)
2369 occ1[i] = 0;
2370 for (i = from % NumBadChars; i < NumBadChars; i++)
2371 occ1[i] = 0;
2372 }
2373 } else {
2374 occ1.fill(t: 0, newSize: NumBadChars);
2375 }
2376#endif
2377}
2378
2379bool QRegExpCharClass::in(QChar ch) const
2380{
2381#ifndef QT_NO_REGEXP_OPTIM
2382 if (occ1.at(BadChar(ch)) == NoOccurrence)
2383 return n;
2384#endif
2385
2386 if (c != 0 && (c & FLAG(ch.category())) != 0)
2387 return !n;
2388
2389 const int uc = ch.unicode();
2390 int size = r.size();
2391
2392 for (int i = 0; i < size; ++i) {
2393 const QRegExpCharClassRange &range = r.at(i);
2394 if (uint(uc - range.from) < uint(r.at(i).len))
2395 return !n;
2396 }
2397 return n;
2398}
2399
2400#if defined(QT_DEBUG)
2401void QRegExpCharClass::dump() const
2402{
2403 int i;
2404 qDebug(msg: " %stive character class", n ? "nega" : "posi");
2405#ifndef QT_NO_REGEXP_CCLASS
2406 if (c != 0)
2407 qDebug(msg: " categories 0x%.8x", c);
2408#endif
2409 for (i = 0; i < r.size(); i++)
2410 qDebug(msg: " 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2411}
2412#endif
2413#endif
2414
2415QRegExpEngine::Box::Box(QRegExpEngine *engine)
2416 : eng(engine), skipanchors(0)
2417#ifndef QT_NO_REGEXP_OPTIM
2418 , earlyStart(0), lateStart(0), maxl(0)
2419#endif
2420{
2421#ifndef QT_NO_REGEXP_OPTIM
2422 occ1.fill(t: NoOccurrence, newSize: NumBadChars);
2423#endif
2424 minl = 0;
2425}
2426
2427QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2428{
2429 eng = b.eng;
2430 ls = b.ls;
2431 rs = b.rs;
2432 lanchors = b.lanchors;
2433 ranchors = b.ranchors;
2434 skipanchors = b.skipanchors;
2435#ifndef QT_NO_REGEXP_OPTIM
2436 earlyStart = b.earlyStart;
2437 lateStart = b.lateStart;
2438 str = b.str;
2439 leftStr = b.leftStr;
2440 rightStr = b.rightStr;
2441 maxl = b.maxl;
2442 occ1 = b.occ1;
2443#endif
2444 minl = b.minl;
2445 return *this;
2446}
2447
2448void QRegExpEngine::Box::set(QChar ch)
2449{
2450 ls.resize(size: 1);
2451 ls[0] = eng->createState(ch);
2452 rs = ls;
2453#ifndef QT_NO_REGEXP_OPTIM
2454 str = ch;
2455 leftStr = ch;
2456 rightStr = ch;
2457 maxl = 1;
2458 occ1[BadChar(ch)] = 0;
2459#endif
2460 minl = 1;
2461}
2462
2463void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2464{
2465 ls.resize(size: 1);
2466 ls[0] = eng->createState(cc);
2467 rs = ls;
2468#ifndef QT_NO_REGEXP_OPTIM
2469 maxl = 1;
2470 occ1 = cc.firstOccurrence();
2471#endif
2472 minl = 1;
2473}
2474
2475#ifndef QT_NO_REGEXP_BACKREF
2476void QRegExpEngine::Box::set(int bref)
2477{
2478 ls.resize(size: 1);
2479 ls[0] = eng->createState(bref);
2480 rs = ls;
2481 if (bref >= 1 && bref <= MaxBackRefs)
2482 skipanchors = Anchor_BackRef0Empty << bref;
2483#ifndef QT_NO_REGEXP_OPTIM
2484 maxl = InftyLen;
2485#endif
2486 minl = 0;
2487}
2488#endif
2489
2490void QRegExpEngine::Box::cat(const Box &b)
2491{
2492 eng->addCatTransitions(from: rs, to: b.ls);
2493 addAnchorsToEngine(to: b);
2494 if (minl == 0) {
2495 lanchors.insert(map: b.lanchors);
2496 if (skipanchors != 0) {
2497 for (int i = 0; i < b.ls.size(); i++) {
2498 int a = eng->anchorConcatenation(a: lanchors.value(key: b.ls.at(i), defaultValue: 0), b: skipanchors);
2499 lanchors.insert(key: b.ls.at(i), value: a);
2500 }
2501 }
2502 mergeInto(a: &ls, b: b.ls);
2503 }
2504 if (b.minl == 0) {
2505 ranchors.insert(map: b.ranchors);
2506 if (b.skipanchors != 0) {
2507 for (int i = 0; i < rs.size(); i++) {
2508 int a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i), defaultValue: 0), b: b.skipanchors);
2509 ranchors.insert(key: rs.at(i), value: a);
2510 }
2511 }
2512 mergeInto(a: &rs, b: b.rs);
2513 } else {
2514 ranchors = b.ranchors;
2515 rs = b.rs;
2516 }
2517
2518#ifndef QT_NO_REGEXP_OPTIM
2519 if (maxl != InftyLen) {
2520 if (rightStr.size() + b.leftStr.size() >
2521 qMax(a: str.size(), b: b.str.size())) {
2522 earlyStart = minl - rightStr.size();
2523 lateStart = maxl - rightStr.size();
2524 str = rightStr + b.leftStr;
2525 } else if (b.str.size() > str.size()) {
2526 earlyStart = minl + b.earlyStart;
2527 lateStart = maxl + b.lateStart;
2528 str = b.str;
2529 }
2530 }
2531
2532 if (leftStr.size() == maxl)
2533 leftStr += b.leftStr;
2534
2535 if (b.rightStr.size() == b.maxl) {
2536 rightStr += b.rightStr;
2537 } else {
2538 rightStr = b.rightStr;
2539 }
2540
2541 if (maxl == InftyLen || b.maxl == InftyLen) {
2542 maxl = InftyLen;
2543 } else {
2544 maxl += b.maxl;
2545 }
2546
2547 for (int i = 0; i < NumBadChars; i++) {
2548 if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2549 occ1[i] = minl + b.occ1.at(i);
2550 }
2551#endif
2552
2553 minl += b.minl;
2554 if (minl == 0)
2555 skipanchors = eng->anchorConcatenation(a: skipanchors, b: b.skipanchors);
2556 else
2557 skipanchors = 0;
2558}
2559
2560void QRegExpEngine::Box::orx(const Box &b)
2561{
2562 mergeInto(a: &ls, b: b.ls);
2563 lanchors.insert(map: b.lanchors);
2564 mergeInto(a: &rs, b: b.rs);
2565 ranchors.insert(map: b.ranchors);
2566
2567 if (b.minl == 0) {
2568 if (minl == 0)
2569 skipanchors = eng->anchorAlternation(a: skipanchors, b: b.skipanchors);
2570 else
2571 skipanchors = b.skipanchors;
2572 }
2573
2574#ifndef QT_NO_REGEXP_OPTIM
2575 for (int i = 0; i < NumBadChars; i++) {
2576 if (occ1.at(i) > b.occ1.at(i))
2577 occ1[i] = b.occ1.at(i);
2578 }
2579 earlyStart = 0;
2580 lateStart = 0;
2581 str = QString();
2582 leftStr = QString();
2583 rightStr = QString();
2584 if (b.maxl > maxl)
2585 maxl = b.maxl;
2586#endif
2587 if (b.minl < minl)
2588 minl = b.minl;
2589}
2590
2591void QRegExpEngine::Box::plus(int atom)
2592{
2593#ifndef QT_NO_REGEXP_CAPTURE
2594 eng->addPlusTransitions(from: rs, to: ls, atom);
2595#else
2596 Q_UNUSED(atom);
2597 eng->addCatTransitions(rs, ls);
2598#endif
2599 addAnchorsToEngine(to: *this);
2600#ifndef QT_NO_REGEXP_OPTIM
2601 maxl = InftyLen;
2602#endif
2603}
2604
2605void QRegExpEngine::Box::opt()
2606{
2607#ifndef QT_NO_REGEXP_OPTIM
2608 earlyStart = 0;
2609 lateStart = 0;
2610 str = QString();
2611 leftStr = QString();
2612 rightStr = QString();
2613#endif
2614 skipanchors = 0;
2615 minl = 0;
2616}
2617
2618void QRegExpEngine::Box::catAnchor(int a)
2619{
2620 if (a != 0) {
2621 for (int i = 0; i < rs.size(); i++) {
2622 a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i), defaultValue: 0), b: a);
2623 ranchors.insert(key: rs.at(i), value: a);
2624 }
2625 if (minl == 0)
2626 skipanchors = eng->anchorConcatenation(a: skipanchors, b: a);
2627 }
2628}
2629
2630#ifndef QT_NO_REGEXP_OPTIM
2631void QRegExpEngine::Box::setupHeuristics()
2632{
2633 eng->goodEarlyStart = earlyStart;
2634 eng->goodLateStart = lateStart;
2635 eng->goodStr = eng->cs ? str : str.toLower();
2636
2637 eng->minl = minl;
2638 if (eng->cs) {
2639 /*
2640 A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2641 1 at this point. An entry of occ1 has to be at most minl or
2642 infinity for the rest of the algorithm to go well.
2643
2644 We waited until here before normalizing these cases (instead of
2645 doing it in Box::orx()) because sometimes things improve by
2646 themselves. Consider for example (112|1)34.
2647 */
2648 for (int i = 0; i < NumBadChars; i++) {
2649 if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2650 occ1[i] = minl;
2651 }
2652 eng->occ1 = occ1;
2653 } else {
2654 eng->occ1.fill(t: 0, newSize: NumBadChars);
2655 }
2656
2657 eng->heuristicallyChooseHeuristic();
2658}
2659#endif
2660
2661#if defined(QT_DEBUG)
2662void QRegExpEngine::Box::dump() const
2663{
2664 int i;
2665 qDebug(msg: "Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2666 qDebug(msg: " Left states:");
2667 for (i = 0; i < ls.size(); i++) {
2668 if (lanchors.value(key: ls[i], defaultValue: 0) == 0)
2669 qDebug(msg: " %d", ls[i]);
2670 else
2671 qDebug(msg: " %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2672 }
2673 qDebug(msg: " Right states:");
2674 for (i = 0; i < rs.size(); i++) {
2675 if (ranchors.value(key: rs[i], defaultValue: 0) == 0)
2676 qDebug(msg: " %d", rs[i]);
2677 else
2678 qDebug(msg: " %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2679 }
2680 qDebug(msg: " Skip anchors: 0x%.8x", skipanchors);
2681}
2682#endif
2683
2684void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2685{
2686 for (int i = 0; i < to.ls.size(); i++) {
2687 for (int j = 0; j < rs.size(); j++) {
2688 int a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i: j), defaultValue: 0),
2689 b: to.lanchors.value(key: to.ls.at(i), defaultValue: 0));
2690 eng->addAnchors(from: rs[j], to: to.ls[i], a);
2691 }
2692 }
2693}
2694
2695#ifndef QT_NO_REGEXP_CCLASS
2696// fast lookup hash for xml schema extensions
2697// sorted by name for b-search
/*
  Table of named character blocks. Each entry pairs a name (stored in
  a fixed 40-byte buffer) with two code points, first and second --
  presumably the inclusive bounds of the block; confirm against the
  code that reads the table.

  The entries are in ascending byte order of name, so upper-case
  letters and '-' sort before lower-case letters ("CJK..." comes
  before "Cherokee"). New entries must keep that order.

  Several ranges lie above 0xFFFF.
*/
2698 static const struct CategoriesRangeMapEntry {
2699 const char name[40];
2700 uint first, second;
2701 } categoriesRangeMap[] = {
2702 { .name: "AegeanNumbers", .first: 0x10100, .second: 0x1013F },
2703 { .name: "AlphabeticPresentationForms", .first: 0xFB00, .second: 0xFB4F },
2704 { .name: "AncientGreekMusicalNotation", .first: 0x1D200, .second: 0x1D24F },
2705 { .name: "AncientGreekNumbers", .first: 0x10140, .second: 0x1018F },
2706 { .name: "Arabic", .first: 0x0600, .second: 0x06FF },
2707 { .name: "ArabicPresentationForms-A", .first: 0xFB50, .second: 0xFDFF },
2708 { .name: "ArabicPresentationForms-B", .first: 0xFE70, .second: 0xFEFF },
2709 { .name: "ArabicSupplement", .first: 0x0750, .second: 0x077F },
2710 { .name: "Armenian", .first: 0x0530, .second: 0x058F },
2711 { .name: "Arrows", .first: 0x2190, .second: 0x21FF },
2712 { .name: "BasicLatin", .first: 0x0000, .second: 0x007F },
2713 { .name: "Bengali", .first: 0x0980, .second: 0x09FF },
2714 { .name: "BlockElements", .first: 0x2580, .second: 0x259F },
2715 { .name: "Bopomofo", .first: 0x3100, .second: 0x312F },
2716 { .name: "BopomofoExtended", .first: 0x31A0, .second: 0x31BF },
2717 { .name: "BoxDrawing", .first: 0x2500, .second: 0x257F },
2718 { .name: "BraillePatterns", .first: 0x2800, .second: 0x28FF },
2719 { .name: "Buginese", .first: 0x1A00, .second: 0x1A1F },
2720 { .name: "Buhid", .first: 0x1740, .second: 0x175F },
2721 { .name: "ByzantineMusicalSymbols", .first: 0x1D000, .second: 0x1D0FF },
2722 { .name: "CJKCompatibility", .first: 0x3300, .second: 0x33FF },
2723 { .name: "CJKCompatibilityForms", .first: 0xFE30, .second: 0xFE4F },
2724 { .name: "CJKCompatibilityIdeographs", .first: 0xF900, .second: 0xFAFF },
2725 { .name: "CJKCompatibilityIdeographsSupplement", .first: 0x2F800, .second: 0x2FA1F },
2726 { .name: "CJKRadicalsSupplement", .first: 0x2E80, .second: 0x2EFF },
2727 { .name: "CJKStrokes", .first: 0x31C0, .second: 0x31EF },
2728 { .name: "CJKSymbolsandPunctuation", .first: 0x3000, .second: 0x303F },
2729 { .name: "CJKUnifiedIdeographs", .first: 0x4E00, .second: 0x9FFF },
2730 { .name: "CJKUnifiedIdeographsExtensionA", .first: 0x3400, .second: 0x4DB5 },
2731 { .name: "CJKUnifiedIdeographsExtensionB", .first: 0x20000, .second: 0x2A6DF },
2732 { .name: "Cherokee", .first: 0x13A0, .second: 0x13FF },
2733 { .name: "CombiningDiacriticalMarks", .first: 0x0300, .second: 0x036F },
2734 { .name: "CombiningDiacriticalMarksSupplement", .first: 0x1DC0, .second: 0x1DFF },
2735 { .name: "CombiningHalfMarks", .first: 0xFE20, .second: 0xFE2F },
2736 { .name: "CombiningMarksforSymbols", .first: 0x20D0, .second: 0x20FF },
2737 { .name: "ControlPictures", .first: 0x2400, .second: 0x243F },
2738 { .name: "Coptic", .first: 0x2C80, .second: 0x2CFF },
2739 { .name: "CurrencySymbols", .first: 0x20A0, .second: 0x20CF },
2740 { .name: "CypriotSyllabary", .first: 0x10800, .second: 0x1083F },
2741 { .name: "Cyrillic", .first: 0x0400, .second: 0x04FF },
2742 { .name: "CyrillicSupplement", .first: 0x0500, .second: 0x052F },
2743 { .name: "Deseret", .first: 0x10400, .second: 0x1044F },
2744 { .name: "Devanagari", .first: 0x0900, .second: 0x097F },
2745 { .name: "Dingbats", .first: 0x2700, .second: 0x27BF },
2746 { .name: "EnclosedAlphanumerics", .first: 0x2460, .second: 0x24FF },
2747 { .name: "EnclosedCJKLettersandMonths", .first: 0x3200, .second: 0x32FF },
2748 { .name: "Ethiopic", .first: 0x1200, .second: 0x137F },
2749 { .name: "EthiopicExtended", .first: 0x2D80, .second: 0x2DDF },
2750 { .name: "EthiopicSupplement", .first: 0x1380, .second: 0x139F },
2751 { .name: "GeneralPunctuation", .first: 0x2000, .second: 0x206F },
2752 { .name: "GeometricShapes", .first: 0x25A0, .second: 0x25FF },
2753 { .name: "Georgian", .first: 0x10A0, .second: 0x10FF },
2754 { .name: "GeorgianSupplement", .first: 0x2D00, .second: 0x2D2F },
2755 { .name: "Glagolitic", .first: 0x2C00, .second: 0x2C5F },
2756 { .name: "Gothic", .first: 0x10330, .second: 0x1034F },
2757 { .name: "Greek", .first: 0x0370, .second: 0x03FF },
2758 { .name: "GreekExtended", .first: 0x1F00, .second: 0x1FFF },
2759 { .name: "Gujarati", .first: 0x0A80, .second: 0x0AFF },
2760 { .name: "Gurmukhi", .first: 0x0A00, .second: 0x0A7F },
2761 { .name: "HalfwidthandFullwidthForms", .first: 0xFF00, .second: 0xFFEF },
2762 { .name: "HangulCompatibilityJamo", .first: 0x3130, .second: 0x318F },
2763 { .name: "HangulJamo", .first: 0x1100, .second: 0x11FF },
2764 { .name: "HangulSyllables", .first: 0xAC00, .second: 0xD7A3 },
2765 { .name: "Hanunoo", .first: 0x1720, .second: 0x173F },
2766 { .name: "Hebrew", .first: 0x0590, .second: 0x05FF },
2767 { .name: "Hiragana", .first: 0x3040, .second: 0x309F },
2768 { .name: "IPAExtensions", .first: 0x0250, .second: 0x02AF },
2769 { .name: "IdeographicDescriptionCharacters", .first: 0x2FF0, .second: 0x2FFF },
2770 { .name: "Kanbun", .first: 0x3190, .second: 0x319F },
2771 { .name: "KangxiRadicals", .first: 0x2F00, .second: 0x2FDF },
2772 { .name: "Kannada", .first: 0x0C80, .second: 0x0CFF },
2773 { .name: "Katakana", .first: 0x30A0, .second: 0x30FF },
2774 { .name: "KatakanaPhoneticExtensions", .first: 0x31F0, .second: 0x31FF },
2775 { .name: "Kharoshthi", .first: 0x10A00, .second: 0x10A5F },
2776 { .name: "Khmer", .first: 0x1780, .second: 0x17FF },
2777 { .name: "KhmerSymbols", .first: 0x19E0, .second: 0x19FF },
2778 { .name: "Lao", .first: 0x0E80, .second: 0x0EFF },
2779 { .name: "Latin-1Supplement", .first: 0x0080, .second: 0x00FF },
2780 { .name: "LatinExtended-A", .first: 0x0100, .second: 0x017F },
2781 { .name: "LatinExtended-B", .first: 0x0180, .second: 0x024F },
2782 { .name: "LatinExtendedAdditional", .first: 0x1E00, .second: 0x1EFF },
2783 { .name: "LetterlikeSymbols", .first: 0x2100, .second: 0x214F },
2784 { .name: "Limbu", .first: 0x1900, .second: 0x194F },
2785 { .name: "LinearBIdeograms", .first: 0x10080, .second: 0x100FF },
2786 { .name: "LinearBSyllabary", .first: 0x10000, .second: 0x1007F },
2787 { .name: "Malayalam", .first: 0x0D00, .second: 0x0D7F },
2788 { .name: "MathematicalAlphanumericSymbols", .first: 0x1D400, .second: 0x1D7FF },
2789 { .name: "MathematicalOperators", .first: 0x2200, .second: 0x22FF },
2790 { .name: "MiscellaneousMathematicalSymbols-A", .first: 0x27C0, .second: 0x27EF },
2791 { .name: "MiscellaneousMathematicalSymbols-B", .first: 0x2980, .second: 0x29FF },
2792 { .name: "MiscellaneousSymbols", .first: 0x2600, .second: 0x26FF },
2793 { .name: "MiscellaneousSymbolsandArrows", .first: 0x2B00, .second: 0x2BFF },
2794 { .name: "MiscellaneousTechnical", .first: 0x2300, .second: 0x23FF },
2795 { .name: "ModifierToneLetters", .first: 0xA700, .second: 0xA71F },
2796 { .name: "Mongolian", .first: 0x1800, .second: 0x18AF },
2797 { .name: "MusicalSymbols", .first: 0x1D100, .second: 0x1D1FF },
2798 { .name: "Myanmar", .first: 0x1000, .second: 0x109F },
2799 { .name: "NewTaiLue", .first: 0x1980, .second: 0x19DF },
2800 { .name: "NumberForms", .first: 0x2150, .second: 0x218F },
2801 { .name: "Ogham", .first: 0x1680, .second: 0x169F },
2802 { .name: "OldItalic", .first: 0x10300, .second: 0x1032F },
2803 { .name: "OldPersian", .first: 0x103A0, .second: 0x103DF },
2804 { .name: "OpticalCharacterRecognition", .first: 0x2440, .second: 0x245F },
2805 { .name: "Oriya", .first: 0x0B00, .second: 0x0B7F },
2806 { .name: "Osmanya", .first: 0x10480, .second: 0x104AF },
2807 { .name: "PhoneticExtensions", .first: 0x1D00, .second: 0x1D7F },
2808 { .name: "PhoneticExtensionsSupplement", .first: 0x1D80, .second: 0x1DBF },
2809 { .name: "PrivateUse", .first: 0xE000, .second: 0xF8FF },
2810 { .name: "Runic", .first: 0x16A0, .second: 0x16FF },
2811 { .name: "Shavian", .first: 0x10450, .second: 0x1047F },
2812 { .name: "Sinhala", .first: 0x0D80, .second: 0x0DFF },
2813 { .name: "SmallFormVariants", .first: 0xFE50, .second: 0xFE6F },
2814 { .name: "SpacingModifierLetters", .first: 0x02B0, .second: 0x02FF },
2815 { .name: "Specials", .first: 0xFFF0, .second: 0xFFFF },
2816 { .name: "SuperscriptsandSubscripts", .first: 0x2070, .second: 0x209F },
2817 { .name: "SupplementalArrows-A", .first: 0x27F0, .second: 0x27FF },
2818 { .name: "SupplementalArrows-B", .first: 0x2900, .second: 0x297F },
2819 { .name: "SupplementalMathematicalOperators", .first: 0x2A00, .second: 0x2AFF },
2820 { .name: "SupplementalPunctuation", .first: 0x2E00, .second: 0x2E7F },
2821 { .name: "SupplementaryPrivateUseArea-A", .first: 0xF0000, .second: 0xFFFFF },
2822 { .name: "SupplementaryPrivateUseArea-B", .first: 0x100000, .second: 0x10FFFF },
2823 { .name: "SylotiNagri", .first: 0xA800, .second: 0xA82F },
2824 { .name: "Syriac", .first: 0x0700, .second: 0x074F },
2825 { .name: "Tagalog", .first: 0x1700, .second: 0x171F },
2826 { .name: "Tagbanwa", .first: 0x1760, .second: 0x177F },
2827 { .name: "Tags", .first: 0xE0000, .second: 0xE007F },
2828 { .name: "TaiLe", .first: 0x1950, .second: 0x197F },
2829 { .name: "TaiXuanJingSymbols", .first: 0x1D300, .second: 0x1D35F },
2830 { .name: "Tamil", .first: 0x0B80, .second: 0x0BFF },
2831 { .name: "Telugu", .first: 0x0C00, .second: 0x0C7F },
2832 { .name: "Thaana", .first: 0x0780, .second: 0x07BF },
2833 { .name: "Thai", .first: 0x0E00, .second: 0x0E7F },
2834 { .name: "Tibetan", .first: 0x0F00, .second: 0x0FFF },
2835 { .name: "Tifinagh", .first: 0x2D30, .second: 0x2D7F },
2836 { .name: "Ugaritic", .first: 0x10380, .second: 0x1039F },
2837 { .name: "UnifiedCanadianAboriginalSyllabics", .first: 0x1400, .second: 0x167F },
2838 { .name: "VariationSelectors", .first: 0xFE00, .second: 0xFE0F },
2839 { .name: "VariationSelectorsSupplement", .first: 0xE0100, .second: 0xE01EF },
2840 { .name: "VerticalForms", .first: 0xFE10, .second: 0xFE1F },
2841 { .name: "YiRadicals", .first: 0xA490, .second: 0xA4CF },
2842 { .name: "YiSyllables", .first: 0xA000, .second: 0xA48F },
2843 { .name: "YijingHexagramSymbols", .first: 0x4DC0, .second: 0x4DFF }
2844 };
2845
2846inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2)
2847{ return qstrcmp(str1: entry1.name, str2: entry2.name) < 0; }
2848inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry)
2849{ return qstrcmp(str1: name, str2: entry.name) < 0; }
2850inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name)
2851{ return qstrcmp(str1: entry.name, str2: name) < 0; }
2852#endif // QT_NO_REGEXP_CCLASS
2853
2854int QRegExpEngine::getChar()
2855{
2856 return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2857}
2858
2859int QRegExpEngine::getEscape()
2860{
2861#ifndef QT_NO_REGEXP_ESCAPE
2862 const char tab[] = "afnrtv"; // no b, as \b means word boundary
2863 const char backTab[] = "\a\f\n\r\t\v";
2864 ushort low;
2865 int i;
2866#endif
2867 ushort val;
2868 int prevCh = yyCh;
2869
2870 if (prevCh == EOS) {
2871 error(RXERR_END);
2872 return Tok_Char | '\\';
2873 }
2874 yyCh = getChar();
2875#ifndef QT_NO_REGEXP_ESCAPE
2876 if ((prevCh & ~0xff) == 0) {
2877 const char *p = strchr(s: tab, c: prevCh);
2878 if (p != nullptr)
2879 return Tok_Char | backTab[p - tab];
2880 }
2881#endif
2882
2883 switch (prevCh) {
2884#ifndef QT_NO_REGEXP_ESCAPE
2885 case '0':
2886 val = 0;
2887 for (i = 0; i < 3; i++) {
2888 if (yyCh >= '0' && yyCh <= '7')
2889 val = (val << 3) | (yyCh - '0');
2890 else
2891 break;
2892 yyCh = getChar();
2893 }
2894 if ((val & ~0377) != 0)
2895 error(RXERR_OCTAL);
2896 return Tok_Char | val;
2897#endif
2898#ifndef QT_NO_REGEXP_ESCAPE
2899 case 'B':
2900 return Tok_NonWord;
2901#endif
2902#ifndef QT_NO_REGEXP_CCLASS
2903 case 'D':
2904 // see QChar::isDigit()
2905 yyCharClass->addCategories(cats: uint(-1) ^ FLAG(QChar::Number_DecimalDigit));
2906 return Tok_CharClass;
2907 case 'S':
2908 // see QChar::isSpace()
2909 yyCharClass->addCategories(cats: uint(-1) ^ (FLAG(QChar::Separator_Space) |
2910 FLAG(QChar::Separator_Line) |
2911 FLAG(QChar::Separator_Paragraph) |
2912 FLAG(QChar::Other_Control)));
2913 yyCharClass->addRange(from: 0x0000, to: 0x0008);
2914 yyCharClass->addRange(from: 0x000e, to: 0x001f);
2915 yyCharClass->addRange(from: 0x007f, to: 0x0084);
2916 yyCharClass->addRange(from: 0x0086, to: 0x009f);
2917 return Tok_CharClass;
2918 case 'W':
2919 // see QChar::isLetterOrNumber() and QChar::isMark()
2920 yyCharClass->addCategories(cats: uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) |
2921 FLAG(QChar::Mark_SpacingCombining) |
2922 FLAG(QChar::Mark_Enclosing) |
2923 FLAG(QChar::Number_DecimalDigit) |
2924 FLAG(QChar::Number_Letter) |
2925 FLAG(QChar::Number_Other) |
2926 FLAG(QChar::Letter_Uppercase) |
2927 FLAG(QChar::Letter_Lowercase) |
2928 FLAG(QChar::Letter_Titlecase) |
2929 FLAG(QChar::Letter_Modifier) |
2930 FLAG(QChar::Letter_Other) |
2931 FLAG(QChar::Punctuation_Connector)));
2932 yyCharClass->addRange(from: 0x203f, to: 0x2040);
2933 yyCharClass->addSingleton(ch: 0x2040);
2934 yyCharClass->addSingleton(ch: 0x2054);
2935 yyCharClass->addSingleton(ch: 0x30fb);
2936 yyCharClass->addRange(from: 0xfe33, to: 0xfe34);
2937 yyCharClass->addRange(from: 0xfe4d, to: 0xfe4f);
2938 yyCharClass->addSingleton(ch: 0xff3f);
2939 yyCharClass->addSingleton(ch: 0xff65);
2940 return Tok_CharClass;
2941#endif
2942#ifndef QT_NO_REGEXP_ESCAPE
2943 case 'b':
2944 return Tok_Word;
2945#endif
2946#ifndef QT_NO_REGEXP_CCLASS
2947 case 'd':
2948 // see QChar::isDigit()
2949 yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit));
2950 return Tok_CharClass;
2951 case 's':
2952 // see QChar::isSpace()
2953 yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
2954 FLAG(QChar::Separator_Line) |
2955 FLAG(QChar::Separator_Paragraph));
2956 yyCharClass->addRange(from: 0x0009, to: 0x000d);
2957 yyCharClass->addSingleton(ch: 0x0085);
2958 return Tok_CharClass;
2959 case 'w':
2960 // see QChar::isLetterOrNumber() and QChar::isMark()
2961 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
2962 FLAG(QChar::Mark_SpacingCombining) |
2963 FLAG(QChar::Mark_Enclosing) |
2964 FLAG(QChar::Number_DecimalDigit) |
2965 FLAG(QChar::Number_Letter) |
2966 FLAG(QChar::Number_Other) |
2967 FLAG(QChar::Letter_Uppercase) |
2968 FLAG(QChar::Letter_Lowercase) |
2969 FLAG(QChar::Letter_Titlecase) |
2970 FLAG(QChar::Letter_Modifier) |
2971 FLAG(QChar::Letter_Other));
2972 yyCharClass->addSingleton(ch: 0x005f); // '_'
2973 return Tok_CharClass;
2974 case 'I':
2975 if (!xmlSchemaExtensions)
2976 break;
2977 yyCharClass->setNegative(!yyCharClass->negative());
2978 Q_FALLTHROUGH();
2979 case 'i':
2980 if (xmlSchemaExtensions) {
2981 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
2982 FLAG(QChar::Mark_SpacingCombining) |
2983 FLAG(QChar::Mark_Enclosing) |
2984 FLAG(QChar::Number_DecimalDigit) |
2985 FLAG(QChar::Number_Letter) |
2986 FLAG(QChar::Number_Other) |
2987 FLAG(QChar::Letter_Uppercase) |
2988 FLAG(QChar::Letter_Lowercase) |
2989 FLAG(QChar::Letter_Titlecase) |
2990 FLAG(QChar::Letter_Modifier) |
2991 FLAG(QChar::Letter_Other));
2992 yyCharClass->addSingleton(ch: 0x003a); // ':'
2993 yyCharClass->addSingleton(ch: 0x005f); // '_'
2994 yyCharClass->addRange(from: 0x0041, to: 0x005a); // [A-Z]
2995 yyCharClass->addRange(from: 0x0061, to: 0x007a); // [a-z]
2996 yyCharClass->addRange(from: 0xc0, to: 0xd6);
2997 yyCharClass->addRange(from: 0xd8, to: 0xf6);
2998 yyCharClass->addRange(from: 0xf8, to: 0x2ff);
2999 yyCharClass->addRange(from: 0x370, to: 0x37d);
3000 yyCharClass->addRange(from: 0x37f, to: 0x1fff);
3001 yyCharClass->addRange(from: 0x200c, to: 0x200d);
3002 yyCharClass->addRange(from: 0x2070, to: 0x218f);
3003 yyCharClass->addRange(from: 0x2c00, to: 0x2fef);
3004 yyCharClass->addRange(from: 0x3001, to: 0xd7ff);
3005 yyCharClass->addRange(from: 0xf900, to: 0xfdcf);
3006 yyCharClass->addRange(from: 0xfdf0, to: 0xfffd);
3007 yyCharClass->addRange(from: (ushort)0x10000, to: (ushort)0xeffff);
3008 return Tok_CharClass;
3009 } else {
3010 break;
3011 }
3012 case 'C':
3013 if (!xmlSchemaExtensions)
3014 break;
3015 yyCharClass->setNegative(!yyCharClass->negative());
3016 Q_FALLTHROUGH();
3017 case 'c':
3018 if (xmlSchemaExtensions) {
3019 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3020 FLAG(QChar::Mark_SpacingCombining) |
3021 FLAG(QChar::Mark_Enclosing) |
3022 FLAG(QChar::Number_DecimalDigit) |
3023 FLAG(QChar::Number_Letter) |
3024 FLAG(QChar::Number_Other) |
3025 FLAG(QChar::Letter_Uppercase) |
3026 FLAG(QChar::Letter_Lowercase) |
3027 FLAG(QChar::Letter_Titlecase) |
3028 FLAG(QChar::Letter_Modifier) |
3029 FLAG(QChar::Letter_Other));
3030 yyCharClass->addSingleton(ch: 0x002d); // '-'
3031 yyCharClass->addSingleton(ch: 0x002e); // '.'
3032 yyCharClass->addSingleton(ch: 0x003a); // ':'
3033 yyCharClass->addSingleton(ch: 0x005f); // '_'
3034 yyCharClass->addSingleton(ch: 0xb7);
3035 yyCharClass->addRange(from: 0x0030, to: 0x0039); // [0-9]
3036 yyCharClass->addRange(from: 0x0041, to: 0x005a); // [A-Z]
3037 yyCharClass->addRange(from: 0x0061, to: 0x007a); // [a-z]
3038 yyCharClass->addRange(from: 0xc0, to: 0xd6);
3039 yyCharClass->addRange(from: 0xd8, to: 0xf6);
3040 yyCharClass->addRange(from: 0xf8, to: 0x2ff);
3041 yyCharClass->addRange(from: 0x370, to: 0x37d);
3042 yyCharClass->addRange(from: 0x37f, to: 0x1fff);
3043 yyCharClass->addRange(from: 0x200c, to: 0x200d);
3044 yyCharClass->addRange(from: 0x2070, to: 0x218f);
3045 yyCharClass->addRange(from: 0x2c00, to: 0x2fef);
3046 yyCharClass->addRange(from: 0x3001, to: 0xd7ff);
3047 yyCharClass->addRange(from: 0xf900, to: 0xfdcf);
3048 yyCharClass->addRange(from: 0xfdf0, to: 0xfffd);
3049 yyCharClass->addRange(from: (ushort)0x10000, to: (ushort)0xeffff);
3050 yyCharClass->addRange(from: 0x0300, to: 0x036f);
3051 yyCharClass->addRange(from: 0x203f, to: 0x2040);
3052 return Tok_CharClass;
3053 } else {
3054 break;
3055 }
3056 case 'P':
3057 if (!xmlSchemaExtensions)
3058 break;
3059 yyCharClass->setNegative(!yyCharClass->negative());
3060 Q_FALLTHROUGH();
3061 case 'p':
3062 if (xmlSchemaExtensions) {
3063 if (yyCh != '{') {
3064 error(RXERR_CHARCLASS);
3065 return Tok_CharClass;
3066 }
3067
3068 QByteArray category;
3069 yyCh = getChar();
3070 while (yyCh != '}') {
3071 if (yyCh == EOS) {
3072 error(RXERR_END);
3073 return Tok_CharClass;
3074 }
3075 category.append(c: yyCh);
3076 yyCh = getChar();
3077 }
3078 yyCh = getChar(); // skip closing '}'
3079
3080 int catlen = category.size();
3081 if (catlen == 1 || catlen == 2) {
3082 switch (category.at(i: 0)) {
3083 case 'M':
3084 if (catlen == 1) {
3085 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3086 FLAG(QChar::Mark_SpacingCombining) |
3087 FLAG(QChar::Mark_Enclosing));
3088 } else {
3089 switch (category.at(i: 1)) {
3090 case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn
3091 case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc
3092 case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me
3093 default: error(RXERR_CATEGORY); break;
3094 }
3095 }
3096 break;
3097 case 'N':
3098 if (catlen == 1) {
3099 yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) |
3100 FLAG(QChar::Number_Letter) |
3101 FLAG(QChar::Number_Other));
3102 } else {
3103 switch (category.at(i: 1)) {
3104 case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd
3105 case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl
3106 case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No
3107 default: error(RXERR_CATEGORY); break;
3108 }
3109 }
3110 break;
3111 case 'Z':
3112 if (catlen == 1) {
3113 yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
3114 FLAG(QChar::Separator_Line) |
3115 FLAG(QChar::Separator_Paragraph));
3116 } else {
3117 switch (category.at(i: 1)) {
3118 case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs
3119 case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl
3120 case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp
3121 default: error(RXERR_CATEGORY); break;
3122 }
3123 }
3124 break;
3125 case 'C':
3126 if (catlen == 1) {
3127 yyCharClass->addCategories(FLAG(QChar::Other_Control) |
3128 FLAG(QChar::Other_Format) |
3129 FLAG(QChar::Other_Surrogate) |
3130 FLAG(QChar::Other_PrivateUse) |
3131 FLAG(QChar::Other_NotAssigned));
3132 } else {
3133 switch (category.at(i: 1)) {
3134 case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc
3135 case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf
3136 case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs
3137 case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co
3138 case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn
3139 default: error(RXERR_CATEGORY); break;
3140 }
3141 }
3142 break;
3143 case 'L':
3144 if (catlen == 1) {
3145 yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) |
3146 FLAG(QChar::Letter_Lowercase) |
3147 FLAG(QChar::Letter_Titlecase) |
3148 FLAG(QChar::Letter_Modifier) |
3149 FLAG(QChar::Letter_Other));
3150 } else {
3151 switch (category.at(i: 1)) {
3152 case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu
3153 case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll
3154 case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt
3155 case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm
3156 case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo
3157 default: error(RXERR_CATEGORY); break;
3158 }
3159 }
3160 break;
3161 case 'P':
3162 if (catlen == 1) {
3163 yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) |
3164 FLAG(QChar::Punctuation_Dash) |
3165 FLAG(QChar::Punctuation_Open) |
3166 FLAG(QChar::Punctuation_Close) |
3167 FLAG(QChar::Punctuation_InitialQuote) |
3168 FLAG(QChar::Punctuation_FinalQuote) |
3169 FLAG(QChar::Punctuation_Other));
3170 } else {
3171 switch (category.at(i: 1)) {
3172 case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc
3173 case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd
3174 case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps
3175 case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe
3176 case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi
3177 case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf
3178 case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po
3179 default: error(RXERR_CATEGORY); break;
3180 }
3181 }
3182 break;
3183 case 'S':
3184 if (catlen == 1) {
3185 yyCharClass->addCategories(FLAG(QChar::Symbol_Math) |
3186 FLAG(QChar::Symbol_Currency) |
3187 FLAG(QChar::Symbol_Modifier) |
3188 FLAG(QChar::Symbol_Other));
3189 } else {
3190 switch (category.at(i: 1)) {
3191 case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm
3192 case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc
3193 case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk
3194 case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So
3195 default: error(RXERR_CATEGORY); break;
3196 }
3197 }
3198 break;
3199 default:
3200 error(RXERR_CATEGORY);
3201 break;
3202 }
3203 } else if (catlen > 2 && category.at(i: 0) == 'I' && category.at(i: 1) == 's') {
3204 static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]);
3205 const char * const categoryFamily = category.constData() + 2;
3206 const CategoriesRangeMapEntry *r = std::lower_bound(first: categoriesRangeMap, last: categoriesRangeMap + N, val: categoryFamily);
3207 if (r != categoriesRangeMap + N && qstrcmp(str1: r->name, str2: categoryFamily) == 0)
3208 yyCharClass->addRange(from: r->first, to: r->second);
3209 else
3210 error(RXERR_CATEGORY);
3211 } else {
3212 error(RXERR_CATEGORY);
3213 }
3214 return Tok_CharClass;
3215 } else {
3216 break;
3217 }
3218#endif
3219#ifndef QT_NO_REGEXP_ESCAPE
3220 case 'x':
3221 val = 0;
3222 for (i = 0; i < 4; i++) {
3223 low = QChar(yyCh).toLower().unicode();
3224 if (low >= '0' && low <= '9')
3225 val = (val << 4) | (low - '0');
3226 else if (low >= 'a' && low <= 'f')
3227 val = (val << 4) | (low - 'a' + 10);
3228 else
3229 break;
3230 yyCh = getChar();
3231 }
3232 return Tok_Char | val;
3233#endif
3234 default:
3235 break;
3236 }
3237 if (prevCh >= '1' && prevCh <= '9') {
3238#ifndef QT_NO_REGEXP_BACKREF
3239 val = prevCh - '0';
3240 while (yyCh >= '0' && yyCh <= '9') {
3241 val = (val * 10) + (yyCh - '0');
3242 yyCh = getChar();
3243 }
3244 return Tok_BackRef | val;
3245#else
3246 error(RXERR_DISABLED);
3247#endif
3248 }
3249 return Tok_Char | prevCh;
3250}
3251
3252#ifndef QT_NO_REGEXP_INTERVAL
3253int QRegExpEngine::getRep(int def)
3254{
3255 if (yyCh >= '0' && yyCh <= '9') {
3256 int rep = 0;
3257 do {
3258 rep = 10 * rep + yyCh - '0';
3259 if (rep >= InftyRep) {
3260 error(RXERR_REPETITION);
3261 rep = def;
3262 }
3263 yyCh = getChar();
3264 } while (yyCh >= '0' && yyCh <= '9');
3265 return rep;
3266 } else {
3267 return def;
3268 }
3269}
3270#endif
3271
3272#ifndef QT_NO_REGEXP_LOOKAHEAD
3273void QRegExpEngine::skipChars(int n)
3274{
3275 if (n > 0) {
3276 yyPos += n - 1;
3277 yyCh = getChar();
3278 }
3279}
3280#endif
3281
3282void QRegExpEngine::error(const char *msg)
3283{
3284 if (yyError.isEmpty())
3285 yyError = QLatin1String(msg);
3286}
3287
3288void QRegExpEngine::startTokenizer(const QChar *rx, int len)
3289{
3290 yyIn = rx;
3291 yyPos0 = 0;
3292 yyPos = 0;
3293 yyLen = len;
3294 yyCh = getChar();
3295 yyCharClass.reset(other: new QRegExpCharClass);
3296 yyMinRep = 0;
3297 yyMaxRep = 0;
3298 yyError = QString();
3299}
3300
3301int QRegExpEngine::getToken()
3302{
3303#ifndef QT_NO_REGEXP_CCLASS
3304 ushort pendingCh = 0;
3305 bool charPending;
3306 bool rangePending;
3307 int tok;
3308#endif
3309 int prevCh = yyCh;
3310
3311 yyPos0 = yyPos - 1;
3312#ifndef QT_NO_REGEXP_CCLASS
3313 yyCharClass->clear();
3314#endif
3315 yyMinRep = 0;
3316 yyMaxRep = 0;
3317 yyCh = getChar();
3318
3319 switch (prevCh) {
3320 case EOS:
3321 yyPos0 = yyPos;
3322 return Tok_Eos;
3323 case '$':
3324 return Tok_Dollar;
3325 case '(':
3326 if (yyCh == '?') {
3327 prevCh = getChar();
3328 yyCh = getChar();
3329 switch (prevCh) {
3330#ifndef QT_NO_REGEXP_LOOKAHEAD
3331 case '!':
3332 return Tok_NegLookahead;
3333 case '=':
3334 return Tok_PosLookahead;
3335#endif
3336 case ':':
3337 return Tok_MagicLeftParen;
3338 case '<':
3339 error(RXERR_LOOKBEHIND);
3340 return Tok_MagicLeftParen;
3341 default:
3342 error(RXERR_LOOKAHEAD);
3343 return Tok_MagicLeftParen;
3344 }
3345 } else {
3346 return Tok_LeftParen;
3347 }
3348 case ')':
3349 return Tok_RightParen;
3350 case '*':
3351 yyMinRep = 0;
3352 yyMaxRep = InftyRep;
3353 return Tok_Quantifier;
3354 case '+':
3355 yyMinRep = 1;
3356 yyMaxRep = InftyRep;
3357 return Tok_Quantifier;
3358 case '.':
3359#ifndef QT_NO_REGEXP_CCLASS
3360 yyCharClass->setNegative(true);
3361#endif
3362 return Tok_CharClass;
3363 case '?':
3364 yyMinRep = 0;
3365 yyMaxRep = 1;
3366 return Tok_Quantifier;
3367 case '[':
3368#ifndef QT_NO_REGEXP_CCLASS
3369 if (yyCh == '^') {
3370 yyCharClass->setNegative(true);
3371 yyCh = getChar();
3372 }
3373 charPending = false;
3374 rangePending = false;
3375 do {
3376 if (yyCh == '-' && charPending && !rangePending) {
3377 rangePending = true;
3378 yyCh = getChar();
3379 } else {
3380 if (charPending && !rangePending) {
3381 yyCharClass->addSingleton(ch: pendingCh);
3382 charPending = false;
3383 }
3384 if (yyCh == '\\') {
3385 yyCh = getChar();
3386 tok = getEscape();
3387 if (tok == Tok_Word)
3388 tok = '\b';
3389 } else {
3390 tok = Tok_Char | yyCh;
3391 yyCh = getChar();
3392 }
3393 if (tok == Tok_CharClass) {
3394 if (rangePending) {
3395 yyCharClass->addSingleton(ch: '-');
3396 yyCharClass->addSingleton(ch: pendingCh);
3397 charPending = false;
3398 rangePending = false;
3399 }
3400 } else if ((tok & Tok_Char) != 0) {
3401 if (rangePending) {
3402 yyCharClass->addRange(from: pendingCh, to: tok ^ Tok_Char);
3403 charPending = false;
3404 rangePending = false;
3405 } else {
3406 pendingCh = tok ^ Tok_Char;
3407 charPending = true;
3408 }
3409 } else {
3410 error(RXERR_CHARCLASS);
3411 }
3412 }
3413 } while (yyCh != ']' && yyCh != EOS);
3414 if (rangePending)
3415 yyCharClass->addSingleton(ch: '-');
3416 if (charPending)
3417 yyCharClass->addSingleton(ch: pendingCh);
3418 if (yyCh == EOS)
3419 error(RXERR_END);
3420 else
3421 yyCh = getChar();
3422 return Tok_CharClass;
3423#else
3424 error(RXERR_END);
3425 return Tok_Char | '[';
3426#endif
3427 case '\\':
3428 return getEscape();
3429 case ']':
3430 error(RXERR_LEFTDELIM);
3431 return Tok_Char | ']';
3432 case '^':
3433 return Tok_Caret;
3434 case '{':
3435#ifndef QT_NO_REGEXP_INTERVAL
3436 yyMinRep = getRep(def: 0);
3437 yyMaxRep = yyMinRep;
3438 if (yyCh == ',') {
3439 yyCh = getChar();
3440 yyMaxRep = getRep(def: InftyRep);
3441 }
3442 if (yyMaxRep < yyMinRep)
3443 error(RXERR_INTERVAL);
3444 if (yyCh != '}')
3445 error(RXERR_REPETITION);
3446 yyCh = getChar();
3447 return Tok_Quantifier;
3448#else
3449 error(RXERR_DISABLED);
3450 return Tok_Char | '{';
3451#endif
3452 case '|':
3453 return Tok_Bar;
3454 case '}':
3455 error(RXERR_LEFTDELIM);
3456 return Tok_Char | '}';
3457 default:
3458 return Tok_Char | prevCh;
3459 }
3460}
3461
3462int QRegExpEngine::parse(const QChar *pattern, int len)
3463{
3464 valid = true;
3465 startTokenizer(rx: pattern, len);
3466 yyTok = getToken();
3467#ifndef QT_NO_REGEXP_CAPTURE
3468 yyMayCapture = true;
3469#else
3470 yyMayCapture = false;
3471#endif
3472
3473#ifndef QT_NO_REGEXP_CAPTURE
3474 int atom = startAtom(officialCapture: false);
3475#endif
3476 QRegExpCharClass anything;
3477 Box box(this); // create InitialState
3478 box.set(anything);
3479 Box rightBox(this); // create FinalState
3480 rightBox.set(anything);
3481
3482 Box middleBox(this);
3483 parseExpression(box: &middleBox);
3484#ifndef QT_NO_REGEXP_CAPTURE
3485 finishAtom(atom, needCapture: false);
3486#endif
3487#ifndef QT_NO_REGEXP_OPTIM
3488 middleBox.setupHeuristics();
3489#endif
3490 box.cat(b: middleBox);
3491 box.cat(b: rightBox);
3492 yyCharClass.reset();
3493
3494#ifndef QT_NO_REGEXP_CAPTURE
3495 for (int i = 0; i < nf; ++i) {
3496 switch (f[i].capture) {
3497 case QRegExpAtom::NoCapture:
3498 break;
3499 case QRegExpAtom::OfficialCapture:
3500 f[i].capture = ncap;
3501 captureForOfficialCapture.append(t: ncap);
3502 ++ncap;
3503 ++officialncap;
3504 break;
3505 case QRegExpAtom::UnofficialCapture:
3506 f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3507 }
3508 }
3509
3510#ifndef QT_NO_REGEXP_BACKREF
3511#ifndef QT_NO_REGEXP_OPTIM
3512 if (officialncap == 0 && nbrefs == 0) {
3513 ncap = nf = 0;
3514 f.clear();
3515 }
3516#endif
3517 // handle the case where there's a \5 with no corresponding capture
3518 // (captureForOfficialCapture.size() != officialncap)
3519 for (int i = 0; i < nbrefs - officialncap; ++i) {
3520 captureForOfficialCapture.append(t: ncap);
3521 ++ncap;
3522 }
3523#endif
3524#endif
3525
3526 if (!yyError.isEmpty())
3527 return -1;
3528
3529#ifndef QT_NO_REGEXP_OPTIM
3530 const QRegExpAutomatonState &sinit = s.at(i: InitialState);
3531 caretAnchored = !sinit.anchors.isEmpty();
3532 if (caretAnchored) {
3533 const QMap<int, int> &anchors = sinit.anchors;
3534 QMap<int, int>::const_iterator a;
3535 for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3536 if (
3537#ifndef QT_NO_REGEXP_ANCHOR_ALT
3538 (*a & Anchor_Alternation) != 0 ||
3539#endif
3540 (*a & Anchor_Caret) == 0)
3541 {
3542 caretAnchored = false;
3543 break;
3544 }
3545 }
3546 }
3547#endif
3548
3549 // cleanup anchors
3550 int numStates = s.size();
3551 for (int i = 0; i < numStates; ++i) {
3552 QRegExpAutomatonState &state = s[i];
3553 if (!state.anchors.isEmpty()) {
3554 QMap<int, int>::iterator a = state.anchors.begin();
3555 while (a != state.anchors.end()) {
3556 if (a.value() == 0)
3557 a = state.anchors.erase(it: a);
3558 else
3559 ++a;
3560 }
3561 }
3562 }
3563
3564 return yyPos0;
3565}
3566
3567void QRegExpEngine::parseAtom(Box *box)
3568{
3569#ifndef QT_NO_REGEXP_LOOKAHEAD
3570 QRegExpEngine *eng = nullptr;
3571 bool neg;
3572 int len;
3573#endif
3574
3575 if ((yyTok & Tok_Char) != 0) {
3576 box->set(QChar(yyTok ^ Tok_Char));
3577 } else {
3578#ifndef QT_NO_REGEXP_OPTIM
3579 trivial = false;
3580#endif
3581 switch (yyTok) {
3582 case Tok_Dollar:
3583 box->catAnchor(a: Anchor_Dollar);
3584 break;
3585 case Tok_Caret:
3586 box->catAnchor(a: Anchor_Caret);
3587 break;
3588#ifndef QT_NO_REGEXP_LOOKAHEAD
3589 case Tok_PosLookahead:
3590 case Tok_NegLookahead:
3591 neg = (yyTok == Tok_NegLookahead);
3592 eng = new QRegExpEngine(cs, greedyQuantifiers);
3593 len = eng->parse(pattern: yyIn + yyPos - 1, len: yyLen - yyPos + 1);
3594 if (len >= 0)
3595 skipChars(n: len);
3596 else
3597 error(RXERR_LOOKAHEAD);
3598 box->catAnchor(a: addLookahead(eng, negative: neg));
3599 yyTok = getToken();
3600 if (yyTok != Tok_RightParen)
3601 error(RXERR_LOOKAHEAD);
3602 break;
3603#endif
3604#ifndef QT_NO_REGEXP_ESCAPE
3605 case Tok_Word:
3606 box->catAnchor(a: Anchor_Word);
3607 break;
3608 case Tok_NonWord:
3609 box->catAnchor(a: Anchor_NonWord);
3610 break;
3611#endif
3612 case Tok_LeftParen:
3613 case Tok_MagicLeftParen:
3614 yyTok = getToken();
3615 parseExpression(box);
3616 if (yyTok != Tok_RightParen)
3617 error(RXERR_END);
3618 break;
3619 case Tok_CharClass:
3620 box->set(*yyCharClass);
3621 break;
3622 case Tok_Quantifier:
3623 error(RXERR_REPETITION);
3624 break;
3625 default:
3626#ifndef QT_NO_REGEXP_BACKREF
3627 if ((yyTok & Tok_BackRef) != 0)
3628 box->set(yyTok ^ Tok_BackRef);
3629 else
3630#endif
3631 error(RXERR_DISABLED);
3632 }
3633 }
3634 yyTok = getToken();
3635}
3636
3637void QRegExpEngine::parseFactor(Box *box)
3638{
3639#ifndef QT_NO_REGEXP_CAPTURE
3640 int outerAtom = greedyQuantifiers ? startAtom(officialCapture: false) : -1;
3641 int innerAtom = startAtom(officialCapture: yyMayCapture && yyTok == Tok_LeftParen);
3642 bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3643#else
3644 const int innerAtom = -1;
3645#endif
3646
3647#ifndef QT_NO_REGEXP_INTERVAL
3648#define YYREDO() \
3649 yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3650 *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3651
3652 const QChar *in = yyIn;
3653 int pos0 = yyPos0;
3654 int pos = yyPos;
3655 int len = yyLen;
3656 int ch = yyCh;
3657 QRegExpCharClass charClass;
3658 if (yyTok == Tok_CharClass)
3659 charClass = *yyCharClass;
3660 int tok = yyTok;
3661 bool mayCapture = yyMayCapture;
3662#endif
3663
3664 parseAtom(box);
3665#ifndef QT_NO_REGEXP_CAPTURE
3666 finishAtom(atom: innerAtom, needCapture: magicLeftParen);
3667#endif
3668
3669 bool hasQuantifier = (yyTok == Tok_Quantifier);
3670 if (hasQuantifier) {
3671#ifndef QT_NO_REGEXP_OPTIM
3672 trivial = false;
3673#endif
3674 if (yyMaxRep == InftyRep) {
3675 box->plus(atom: innerAtom);
3676#ifndef QT_NO_REGEXP_INTERVAL
3677 } else if (yyMaxRep == 0) {
3678 box->clear();
3679#endif
3680 }
3681 if (yyMinRep == 0)
3682 box->opt();
3683
3684#ifndef QT_NO_REGEXP_INTERVAL
3685 yyMayCapture = false;
3686 int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3687 int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3688
3689 Box rightBox(this);
3690 int i;
3691
3692 for (i = 0; i < beta; i++) {
3693 YYREDO();
3694 Box leftBox(this);
3695 parseAtom(box: &leftBox);
3696 leftBox.cat(b: rightBox);
3697 leftBox.opt();
3698 rightBox = leftBox;
3699 }
3700 for (i = 0; i < alpha; i++) {
3701 YYREDO();
3702 Box leftBox(this);
3703 parseAtom(box: &leftBox);
3704 leftBox.cat(b: rightBox);
3705 rightBox = leftBox;
3706 }
3707 rightBox.cat(b: *box);
3708 *box = rightBox;
3709#endif
3710 yyTok = getToken();
3711#ifndef QT_NO_REGEXP_INTERVAL
3712 yyMayCapture = mayCapture;
3713#endif
3714 }
3715#undef YYREDO
3716#ifndef QT_NO_REGEXP_CAPTURE
3717 if (greedyQuantifiers)
3718 finishAtom(atom: outerAtom, needCapture: hasQuantifier);
3719#endif
3720}
3721
3722void QRegExpEngine::parseTerm(Box *box)
3723{
3724#ifndef QT_NO_REGEXP_OPTIM
3725 if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3726 parseFactor(box);
3727#endif
3728 while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3729 Box rightBox(this);
3730 parseFactor(box: &rightBox);
3731 box->cat(b: rightBox);
3732 }
3733}
3734
3735void QRegExpEngine::parseExpression(Box *box)
3736{
3737 parseTerm(box);
3738 while (yyTok == Tok_Bar) {
3739#ifndef QT_NO_REGEXP_OPTIM
3740 trivial = false;
3741#endif
3742 Box rightBox(this);
3743 yyTok = getToken();
3744 parseTerm(box: &rightBox);
3745 box->orx(b: rightBox);
3746 }
3747}
3748
3749/*
3750 The struct QRegExpPrivate contains the private data of a regular
3751 expression other than the automaton. It makes it possible for many
3752 QRegExp objects to use the same QRegExpEngine object with different
3753 QRegExpPrivate objects.
3754*/
3755struct QRegExpPrivate
3756{
3757 QRegExpEngine *eng;
3758 QRegExpEngineKey engineKey;
3759 bool minimal;
3760#ifndef QT_NO_REGEXP_CAPTURE
3761 QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3762 QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3763#endif
3764 QRegExpMatchState matchState;
3765
3766 inline QRegExpPrivate()
3767 : eng(nullptr), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
3768 inline QRegExpPrivate(const QRegExpEngineKey &key)
3769 : eng(nullptr), engineKey(key), minimal(false) {}
3770};
3771
3772#if !defined(QT_NO_REGEXP_OPTIM)
3773struct QRECache
3774{
3775 typedef QHash<QRegExpEngineKey, QRegExpEngine *> EngineCache;
3776 typedef QCache<QRegExpEngineKey, QRegExpEngine> UnusedEngineCache;
3777 EngineCache usedEngines;
3778 UnusedEngineCache unusedEngines;
3779};
3780Q_GLOBAL_STATIC(QRECache, engineCache)
3781static QBasicMutex engineCacheMutex;
3782#endif // QT_NO_REGEXP_OPTIM
3783
3784static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3785{
3786#if !defined(QT_NO_REGEXP_OPTIM)
3787 const auto locker = qt_scoped_lock(mutex&: engineCacheMutex);
3788 if (!eng->ref.deref()) {
3789 if (QRECache *c = engineCache()) {
3790 c->unusedEngines.insert(key, object: eng, cost: 4 + key.pattern.size() / 4);
3791 c->usedEngines.remove(key);
3792 } else {
3793 delete eng;
3794 }
3795 }
3796#else
3797 Q_UNUSED(key);
3798 if (!eng->ref.deref())
3799 delete eng;
3800#endif
3801}
3802
3803static void prepareEngine_helper(QRegExpPrivate *priv)
3804{
3805 Q_ASSERT(!priv->eng);
3806
3807#if !defined(QT_NO_REGEXP_OPTIM)
3808 const auto locker = qt_scoped_lock(mutex&: engineCacheMutex);
3809 if (QRECache *c = engineCache()) {
3810 priv->eng = c->unusedEngines.take(key: priv->engineKey);
3811 if (!priv->eng)
3812 priv->eng = c->usedEngines.value(key: priv->engineKey);
3813 if (!priv->eng)
3814 priv->eng = new QRegExpEngine(priv->engineKey);
3815 else
3816 priv->eng->ref.ref();
3817
3818 c->usedEngines.insert(key: priv->engineKey, value: priv->eng);
3819 return;
3820 }
3821#endif // QT_NO_REGEXP_OPTIM
3822
3823 priv->eng = new QRegExpEngine(priv->engineKey);
3824}
3825
3826inline static void prepareEngine(QRegExpPrivate *priv)
3827{
3828 if (priv->eng)
3829 return;
3830 prepareEngine_helper(priv);
3831 priv->matchState.prepareForMatch(eng: priv->eng);
3832}
3833
3834static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3835{
3836 prepareEngine(priv);
3837 priv->matchState.prepareForMatch(eng: priv->eng);
3838#ifndef QT_NO_REGEXP_CAPTURE
3839 priv->t = str;
3840 priv->capturedCache.clear();
3841#else
3842 Q_UNUSED(str);
3843#endif
3844}
3845
3846static void invalidateEngine(QRegExpPrivate *priv)
3847{
3848 if (priv->eng) {
3849 derefEngine(eng: priv->eng, key: priv->engineKey);
3850 priv->eng = nullptr;
3851 priv->matchState.drain();
3852 }
3853}
3854
3855/*!
3856 \enum QRegExp::CaretMode
3857
3858 The CaretMode enum defines the different meanings of the caret
3859 (\b{^}) in a regular expression. The possible values are:
3860
3861 \value CaretAtZero
3862 The caret corresponds to index 0 in the searched string.
3863
3864 \value CaretAtOffset
3865 The caret corresponds to the start offset of the search.
3866
3867 \value CaretWontMatch
3868 The caret never matches.
3869*/
3870
3871/*!
3872 \enum QRegExp::PatternSyntax
3873
3874 The syntax used to interpret the meaning of the pattern.
3875
3876 \value RegExp A rich Perl-like pattern matching syntax. This is
3877 the default.
3878
3879 \value RegExp2 Like RegExp, but with \l{greedy quantifiers}.
3880 (Introduced in Qt 4.2.)
3881
3882 \value Wildcard This provides a simple pattern matching syntax
3883 similar to that used by shells (command interpreters) for "file
3884 globbing". See \l{QRegExp wildcard matching}.
3885
3886 \value WildcardUnix This is similar to Wildcard but with the
3887 behavior of a Unix shell. The wildcard characters can be escaped
3888 with the character "\\".
3889
3890 \value FixedString The pattern is a fixed string. This is
3891 equivalent to using the RegExp pattern on a string in
3892 which all metacharacters are escaped using escape().
3893
3894 \value W3CXmlSchema11 The pattern is a regular expression as
3895 defined by the W3C XML Schema 1.1 specification.
3896
3897 \sa setPatternSyntax()
3898*/
3899
3900/*!
3901 Constructs an empty regexp.
3902
3903 \sa isValid(), errorString()
3904*/
3905QRegExp::QRegExp()
3906{
3907 priv = new QRegExpPrivate;
3908 prepareEngine(priv);
3909}
3910
3911/*!
3912 Constructs a regular expression object for the given \a pattern
3913 string. The pattern must be given using wildcard notation if \a
3914 syntax is \l Wildcard; the default is \l RegExp. The pattern is
3915 case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3916 greedy (maximal), but can be changed by calling
3917 setMinimal().
3918
3919 \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3920*/
3921QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3922{
3923 priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3924 prepareEngine(priv);
3925}
3926
3927/*!
3928 Constructs a regular expression as a copy of \a rx.
3929
3930 \sa operator=()
3931*/
3932QRegExp::QRegExp(const QRegExp &rx)
3933{
3934 priv = new QRegExpPrivate;
3935 operator=(rx);
3936}
3937
3938/*!
3939 Destroys the regular expression and cleans up its internal data.
3940*/
3941QRegExp::~QRegExp()
3942{
3943 invalidateEngine(priv);
3944 delete priv;
3945}
3946
3947/*!
3948 Copies the regular expression \a rx and returns a reference to the
3949 copy. The case sensitivity, wildcard, and minimal matching options
3950 are also copied.
3951*/
3952QRegExp &QRegExp::operator=(const QRegExp &rx)
3953{
3954 prepareEngine(priv: rx.priv); // to allow sharing
3955 QRegExpEngine *otherEng = rx.priv->eng;
3956 if (otherEng)
3957 otherEng->ref.ref();
3958 invalidateEngine(priv);
3959 priv->eng = otherEng;
3960 priv->engineKey = rx.priv->engineKey;
3961 priv->minimal = rx.priv->minimal;
3962#ifndef QT_NO_REGEXP_CAPTURE
3963 priv->t = rx.priv->t;
3964 priv->capturedCache = rx.priv->capturedCache;
3965#endif
3966 if (priv->eng)
3967 priv->matchState.prepareForMatch(eng: priv->eng);
3968 priv->matchState.captured = rx.priv->matchState.captured;
3969 return *this;
3970}
3971
3972/*!
3973 \fn QRegExp &QRegExp::operator=(QRegExp &&other)
3974
3975 Move-assigns \a other to this QRegExp instance.
3976
3977 \since 5.2
3978*/
3979
3980/*!
3981 \fn void QRegExp::swap(QRegExp &other)
3982 \since 4.8
3983
3984 Swaps regular expression \a other with this regular
3985 expression. This operation is very fast and never fails.
3986*/
3987
3988/*!
3989 Returns \c true if this regular expression is equal to \a rx;
3990 otherwise returns \c false.
3991
3992 Two QRegExp objects are equal if they have the same pattern
3993 strings and the same settings for case sensitivity, wildcard and
3994 minimal matching.
3995*/
3996bool QRegExp::operator==(const QRegExp &rx) const
3997{
3998 return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
3999}
4000
4001/*!
4002 \since 5.6
4003 \relates QRegExp
4004
4005 Returns the hash value for \a key, using
4006 \a seed to seed the calculation.
4007*/
4008size_t qHash(const QRegExp &key, size_t seed) noexcept
4009{
4010 QtPrivate::QHashCombine hash;
4011 seed = hash(seed, key.priv->engineKey);
4012 seed = hash(seed, key.priv->minimal);
4013 return seed;
4014}
4015
4016/*!
4017 \fn bool QRegExp::operator!=(const QRegExp &rx) const
4018
4019 Returns \c true if this regular expression is not equal to \a rx;
4020 otherwise returns \c false.
4021
4022 \sa operator==()
4023*/
4024
4025/*!
4026 Returns \c true if the pattern string is empty; otherwise returns
4027 false.
4028
4029 If you call exactMatch() with an empty pattern on an empty string
4030 it will return true; otherwise it returns \c false since it operates
4031 over the whole string. If you call indexIn() with an empty pattern
4032 on \e any string it will return the start offset (0 by default)
4033 because the empty pattern matches the 'emptiness' at the start of
4034 the string. In this case the length of the match returned by
4035 matchedLength() will be 0.
4036
4037 See QString::isEmpty().
4038*/
4039
4040bool QRegExp::isEmpty() const
4041{
4042 return priv->engineKey.pattern.isEmpty();
4043}
4044
4045/*!
4046 Returns \c true if the regular expression is valid; otherwise returns
4047 false. An invalid regular expression never matches.
4048
4049 The pattern \b{[a-z} is an example of an invalid pattern, since
4050 it lacks a closing square bracket.
4051
4052 Note that the validity of a regexp may also depend on the setting
4053 of the wildcard flag, for example \b{*.html} is a valid
4054 wildcard regexp but an invalid full regexp.
4055
4056 \sa errorString()
4057*/
4058bool QRegExp::isValid() const
4059{
4060 if (priv->engineKey.pattern.isEmpty()) {
4061 return true;
4062 } else {
4063 prepareEngine(priv);
4064 return priv->eng->isValid();
4065 }
4066}
4067
4068/*!
4069 Returns the pattern string of the regular expression. The pattern
4070 has either regular expression syntax or wildcard syntax, depending
4071 on patternSyntax().
4072
4073 \sa patternSyntax(), caseSensitivity()
4074*/
4075QString QRegExp::pattern() const
4076{
4077 return priv->engineKey.pattern;
4078}
4079
4080/*!
4081 Sets the pattern string to \a pattern. The case sensitivity,
4082 wildcard, and minimal matching options are not changed.
4083
4084 \sa setPatternSyntax(), setCaseSensitivity()
4085*/
4086void QRegExp::setPattern(const QString &pattern)
4087{
4088 if (priv->engineKey.pattern != pattern) {
4089 invalidateEngine(priv);
4090 priv->engineKey.pattern = pattern;
4091 }
4092}
4093
4094/*!
4095 Returns Qt::CaseSensitive if the regexp is matched case
4096 sensitively; otherwise returns Qt::CaseInsensitive.
4097
4098 \sa patternSyntax(), pattern(), isMinimal()
4099*/
4100Qt::CaseSensitivity QRegExp::caseSensitivity() const
4101{
4102 return priv->engineKey.cs;
4103}
4104
4105/*!
4106 Sets case sensitive matching to \a cs.
4107
4108 If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches
4109 \c{readme.txt} but not \c{README.TXT}.
4110
4111 \sa setPatternSyntax(), setPattern(), setMinimal()
4112*/
4113void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
4114{
4115 if ((bool)cs != (bool)priv->engineKey.cs) {
4116 invalidateEngine(priv);
4117 priv->engineKey.cs = cs;
4118 }
4119}
4120
4121/*!
4122 Returns the syntax used by the regular expression. The default is
4123 QRegExp::RegExp.
4124
4125 \sa pattern(), caseSensitivity()
4126*/
4127QRegExp::PatternSyntax QRegExp::patternSyntax() const
4128{
4129 return priv->engineKey.patternSyntax;
4130}
4131
4132/*!
4133 Sets the syntax mode for the regular expression. The default is
4134 QRegExp::RegExp.
4135
4136 Setting \a syntax to QRegExp::Wildcard enables simple shell-like
4137 \l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the
4138 string \c{readme.txt} in wildcard mode, but does not match
4139 \c{readme}.
4140
4141 Setting \a syntax to QRegExp::FixedString means that the pattern
4142 is interpreted as a plain string. Special characters (e.g.,
4143 backslash) don't need to be escaped then.
4144
4145 \sa setPattern(), setCaseSensitivity(), escape()
4146*/
4147void QRegExp::setPatternSyntax(PatternSyntax syntax)
4148{
4149 if (syntax != priv->engineKey.patternSyntax) {
4150 invalidateEngine(priv);
4151 priv->engineKey.patternSyntax = syntax;
4152 }
4153}
4154
4155/*!
4156 Returns \c true if minimal (non-greedy) matching is enabled;
4157 otherwise returns \c false.
4158
4159 \sa caseSensitivity(), setMinimal()
4160*/
4161bool QRegExp::isMinimal() const
4162{
4163 return priv->minimal;
4164}
4165
4166/*!
4167 Enables or disables minimal matching. If \a minimal is false,
4168 matching is greedy (maximal) which is the default.
4169
4170 For example, suppose we have the input string "We must be
4171 <b>bold</b>, very <b>bold</b>!" and the pattern
4172 \b{<b>.*</b>}. With the default greedy (maximal) matching,
4173 the match is "We must be \underline{<b>bold</b>, very
4174 <b>bold</b>}!". But with minimal (non-greedy) matching, the
4175 first match is: "We must be \underline{<b>bold</b>}, very
4176 <b>bold</b>!" and the second match is "We must be <b>bold</b>,
4177 very \underline{<b>bold</b>}!". In practice we might use the pattern
4178 \b{<b>[^<]*\</b>} instead, although this will still fail for
4179 nested tags.
4180
4181 \sa setCaseSensitivity()
4182*/
4183void QRegExp::setMinimal(bool minimal)
4184{
4185 priv->minimal = minimal;
4186}
4187
4188// ### Qt 5: make non-const
4189/*!
4190 Returns \c true if \a str is matched exactly by this regular
4191 expression; otherwise returns \c false. You can determine how much of
4192 the string was matched by calling matchedLength().
4193
4194 For a given regexp string R, exactMatch("R") is the equivalent of
4195 indexIn("^R$") since exactMatch() effectively encloses the regexp
4196 in the start of string and end of string anchors, except that it
4197 sets matchedLength() differently.
4198
4199 For example, if the regular expression is \b{blue}, then
4200 exactMatch() returns \c true only for input \c blue. For inputs \c
4201 bluebell, \c blutak and \c lightblue, exactMatch() returns \c false
4202 and matchedLength() will return 4, 3 and 0 respectively.
4203
4204 Although const, this function sets matchedLength(),
4205 capturedTexts(), and pos().
4206
4207 \sa indexIn(), lastIndexIn()
4208*/
4209bool QRegExp::exactMatch(const QString &str) const
4210{
4211 prepareEngineForMatch(priv, str);
4212 priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: 0, minimal0: priv->minimal, oneTest: true, caretIndex: 0);
4213 if (priv->matchState.captured[1] == str.size()) {
4214 return true;
4215 } else {
4216 priv->matchState.captured[0] = 0;
4217 priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
4218 return false;
4219 }
4220}
4221
4222/*!
4223 Returns the regexp as a QVariant
4224*/
4225QRegExp::operator QVariant() const
4226{
4227QT_WARNING_PUSH QT_WARNING_DISABLE_DEPRECATED
4228 QVariant v;
4229 v.setValue(*this);
4230 return v;
4231QT_WARNING_POP
4232}
4233
4234// ### Qt 5: make non-const
4235/*!
4236 Attempts to find a match in \a str from position \a offset (0 by
4237 default). If \a offset is -1, the search starts at the last
4238 character; if -2, at the next to last character; etc.
4239
4240 Returns the position of the first match, or -1 if there was no
4241 match.
4242
4243 The \a caretMode parameter can be used to instruct whether \b{^}
4244 should match at index 0 or at \a offset.
4245
4246 You might prefer to use QString::indexOf(), QString::contains(),
4247 or even QStringList::filter(). To replace matches use
4248 QString::replace().
4249
4250 Example:
4251 \snippet code/src_corelib_text_qregexp.cpp 13
4252
4253 Although const, this function sets matchedLength(),
4254 capturedTexts() and pos().
4255
    If the QRegExp is a wildcard expression (see setPatternSyntax())
    and you want to test a string against the whole wildcard expression,
    use exactMatch() instead of this function.
4259
4260 \sa lastIndexIn(), exactMatch()
4261*/
4262
4263int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4264{
4265 prepareEngineForMatch(priv, str);
4266 if (offset < 0)
4267 offset += str.size();
4268 priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: offset,
4269 minimal0: priv->minimal, oneTest: false, caretIndex: caretIndex(offset, caretMode));
4270 return priv->matchState.captured[0];
4271}
4272
4273// ### Qt 5: make non-const
4274/*!
4275 Attempts to find a match backwards in \a str from position \a
4276 offset. If \a offset is -1 (the default), the search starts at the
4277 last character; if -2, at the next to last character; etc.
4278
4279 Returns the position of the first match, or -1 if there was no
4280 match.
4281
4282 The \a caretMode parameter can be used to instruct whether \b{^}
4283 should match at index 0 or at \a offset.
4284
4285 Although const, this function sets matchedLength(),
4286 capturedTexts() and pos().
4287
4288 \warning Searching backwards is much slower than searching
4289 forwards.
4290
4291 \sa indexIn(), exactMatch()
4292*/
4293
4294int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4295{
4296 prepareEngineForMatch(priv, str);
4297 if (offset < 0)
4298 offset += str.size();
4299 if (offset < 0 || offset > str.size()) {
4300 memset(s: priv->matchState.captured, c: -1, n: priv->matchState.capturedSize*sizeof(int));
4301 return -1;
4302 }
4303
4304 while (offset >= 0) {
4305 priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: offset,
4306 minimal0: priv->minimal, oneTest: true, caretIndex: caretIndex(offset, caretMode));
4307 if (priv->matchState.captured[0] == offset)
4308 return offset;
4309 --offset;
4310 }
4311 return -1;
4312}
4313
4314/*!
4315 Returns the length of the last matched string, or -1 if there was
4316 no match.
4317
4318 \sa exactMatch(), indexIn(), lastIndexIn()
4319*/
4320int QRegExp::matchedLength() const
4321{
4322 return priv->matchState.captured[1];
4323}
4324
4325
4326/*!
4327 Replaces every occurrence of this regular expression in
4328 \a str with \a after and returns the result.
4329
4330 For regular expressions containing \l{capturing parentheses},
4331 occurrences of \b{\\1}, \b{\\2}, ..., in \a after are replaced
4332 with \c {rx}.cap(1), cap(2), ...
4333
4334 \sa indexIn(), lastIndexIn(), QRegExp::cap()
4335*/
4336QString QRegExp::replaceIn(const QString &str, const QString &after) const
4337{
4338 struct QStringCapture
4339 {
4340 int pos;
4341 int len;
4342 int no;
4343 };
4344
4345 QRegExp rx2(*this);
4346
4347 if (str.isEmpty() && rx2.indexIn(str) == -1)
4348 return str;
4349
4350 QString s(str);
4351
4352 int index = 0;
4353 int numCaptures = rx2.captureCount();
4354 int al = after.size();
4355 QRegExp::CaretMode caretMode = QRegExp::CaretAtZero;
4356
4357 if (numCaptures > 0) {
4358 const QChar *uc = after.unicode();
4359 int numBackRefs = 0;
4360
4361 for (int i = 0; i < al - 1; i++) {
4362 if (uc[i] == QLatin1Char('\\')) {
4363 int no = uc[i + 1].digitValue();
4364 if (no > 0 && no <= numCaptures)
4365 numBackRefs++;
4366 }
4367 }
4368
4369 /*
4370 This is the harder case where we have back-references.
4371 */
4372 if (numBackRefs > 0) {
4373 QVarLengthArray<QStringCapture, 16> captures(numBackRefs);
4374 int j = 0;
4375
4376 for (int i = 0; i < al - 1; i++) {
4377 if (uc[i] == QLatin1Char('\\')) {
4378 int no = uc[i + 1].digitValue();
4379 if (no > 0 && no <= numCaptures) {
4380 QStringCapture capture;
4381 capture.pos = i;
4382 capture.len = 2;
4383
4384 if (i < al - 2) {
4385 int secondDigit = uc[i + 2].digitValue();
4386 if (secondDigit != -1 && ((no * 10) + secondDigit) <= numCaptures) {
4387 no = (no * 10) + secondDigit;
4388 ++capture.len;
4389 }
4390 }
4391
4392 capture.no = no;
4393 captures[j++] = capture;
4394 }
4395 }
4396 }
4397
4398 while (index <= s.size()) {
4399 index = rx2.indexIn(str: s, offset: index, caretMode);
4400 if (index == -1)
4401 break;
4402
4403 QString after2(after);
4404 for (j = numBackRefs - 1; j >= 0; j--) {
4405 const QStringCapture &capture = captures[j];
4406 after2.replace(i: capture.pos, len: capture.len, after: rx2.cap(nth: capture.no));
4407 }
4408
4409 s.replace(i: index, len: rx2.matchedLength(), after: after2);
4410 index += after2.size();
4411
4412 // avoid infinite loop on 0-length matches (e.g., QRegExp("[a-z]*"))
4413 if (rx2.matchedLength() == 0)
4414 ++index;
4415
4416 caretMode = QRegExp::CaretWontMatch;
4417 }
4418 return s;
4419 }
4420 }
4421
4422 /*
4423 This is the simple and optimized case where we don't have
4424 back-references.
4425 */
4426 while (index != -1) {
4427 struct {
4428 int pos;
4429 int length;
4430 } replacements[2048];
4431
4432 int pos = 0;
4433 int adjust = 0;
4434 while (pos < 2047) {
4435 index = rx2.indexIn(str: s, offset: index, caretMode);
4436 if (index == -1)
4437 break;
4438 int ml = rx2.matchedLength();
4439 replacements[pos].pos = index;
4440 replacements[pos++].length = ml;
4441 index += ml;
4442 adjust += al - ml;
4443 // avoid infinite loop
4444 if (!ml)
4445 index++;
4446 }
4447 if (!pos)
4448 break;
4449 replacements[pos].pos = s.size();
4450 int newlen = s.size() + adjust;
4451
4452 // to continue searching at the right position after we did
4453 // the first round of replacements
4454 if (index != -1)
4455 index += adjust;
4456 QString newstring;
4457 newstring.reserve(asize: newlen + 1);
4458 QChar *newuc = newstring.data();
4459 QChar *uc = newuc;
4460 int copystart = 0;
4461 int i = 0;
4462 while (i < pos) {
4463 int copyend = replacements[i].pos;
4464 int size = copyend - copystart;
4465 memcpy(dest: static_cast<void*>(uc), src: static_cast<const void *>(s.constData() + copystart), n: size * sizeof(QChar));
4466 uc += size;
4467 memcpy(dest: static_cast<void *>(uc), src: static_cast<const void *>(after.constData()), n: al * sizeof(QChar));
4468 uc += al;
4469 copystart = copyend + replacements[i].length;
4470 i++;
4471 }
4472 memcpy(dest: static_cast<void *>(uc), src: static_cast<const void *>(s.constData() + copystart), n: (s.size() - copystart) * sizeof(QChar));
4473 newstring.resize(size: newlen);
4474 s = newstring;
4475 caretMode = QRegExp::CaretWontMatch;
4476 }
4477 return s;
4478
4479}
4480
4481
4482/*!
4483 \fn QString QRegExp::removeIn(const QString &str) const
4484
    Removes every occurrence of this regular expression in \a str, and
    returns the result.
4487
4488 Does the same as replaceIn(str, QString()).
4489
4490 \sa indexIn(), lastIndexIn(), replaceIn()
4491*/
4492
4493
4494/*!
    \fn int QRegExp::countIn(const QString &str) const
4496
4497 Returns the number of times this regular expression matches
4498 in \a str.
4499
4500 \sa indexIn(), lastIndexIn(), replaceIn()
4501*/
4502
4503int QRegExp::countIn(const QString &str) const
4504{
4505 QRegExp rx2(*this);
4506 int count = 0;
4507 int index = -1;
4508 int len = str.size();
4509 while (index < len - 1) { // count overlapping matches
4510 index = rx2.indexIn(str, offset: index + 1);
4511 if (index == -1)
4512 break;
4513 count++;
4514 }
4515 return count;
4516}
4517
4518/*!
4519 Splits \a str into substrings wherever this regular expression
4520 matches, and returns the list of those strings. If this regular
    expression does not match anywhere in the string, splitString() returns a
4522 single-element list containing \a str.
4523
4524 If \a behavior is set to Qt::KeepEmptyParts, empty fields are
4525 included in the resulting list.
4526
4527 \sa QStringList::join(), QString::split()
4528*/
4529QStringList QRegExp::splitString(const QString &str, Qt::SplitBehavior behavior) const
4530{
4531 QRegExp rx2(*this);
4532 QStringList list;
4533 int start = 0;
4534 int extra = 0;
4535 int end;
4536 while ((end = rx2.indexIn(str, offset: start + extra)) != -1) {
4537 int matchedLen = rx2.matchedLength();
4538 if (start != end || behavior == Qt::KeepEmptyParts)
4539 list.append(t: str.mid(position: start, n: end - start));
4540 start = end + matchedLen;
4541 extra = (matchedLen == 0) ? 1 : 0;
4542 }
4543 if (start != str.size() || behavior == Qt::KeepEmptyParts)
4544 list.append(t: str.mid(position: start, n: -1));
4545 return list;
4546}
4547
4548/*!
4549 Returns a list of all the strings that match this regular
4550 expression in \a stringList.
4551*/
4552QStringList QRegExp::filterList(const QStringList &stringList) const
4553{
4554 QStringList res;
4555 for (const QString &s : stringList) {
4556 if (containedIn(str: s))
4557 res << s;
4558 }
4559 return res;
4560}
4561
4562/*!
    Replaces every occurrence of this regexp in each of \a stringList's
    strings with \a after, and returns the resulting list.
4565*/
4566QStringList QRegExp::replaceIn(const QStringList &stringList, const QString &after) const
4567{
4568 QStringList list;
4569 for (const QString &s : stringList)
4570 list << replaceIn(str: s, after);
4571 return list;
4572}
4573
4574/*!
4575 Returns the index position of the first exact match of this regexp in
4576 \a list, searching forward from index position \a from. Returns
4577 -1 if no item matched.
4578
4579 \sa lastIndexIn(), exactMatch()
4580*/
4581int QRegExp::indexIn(const QStringList &list, int from) const
4582{
4583 QRegExp rx2(*this);
4584 if (from < 0)
4585 from = qMax(a: from + list.size(), b: 0);
4586 for (int i = from; i < list.size(); ++i) {
4587 if (rx2.exactMatch(str: list.at(i)))
4588 return i;
4589 }
4590 return -1;
4591}
4592
4593/*!
4594 Returns the index position of the last exact match of this regexp in
4595 \a list, searching backward from index position \a from. If \a
4596 from is -1 (the default), the search starts at the last item.
4597 Returns -1 if no item matched.
4598
4599 \sa QRegExp::exactMatch()
4600*/
4601int QRegExp::lastIndexIn(const QStringList &list, int from) const
4602{
4603 QRegExp rx2(*this);
4604 if (from < 0)
4605 from += list.size();
4606 else if (from >= list.size())
4607 from = list.size() - 1;
4608 for (int i = from; i >= 0; --i) {
4609 if (rx2.exactMatch(str: list.at(i)))
4610 return i;
4611 }
4612 return -1;
4613}
4614
4615#ifndef QT_NO_REGEXP_CAPTURE
4616
4617/*!
4618 \since 4.6
4619 Returns the number of captures contained in the regular expression.
4620 */
4621int QRegExp::captureCount() const
4622{
4623 prepareEngine(priv);
4624 return priv->eng->captureCount();
4625}
4626
4627/*!
4628 Returns a list of the captured text strings.
4629
4630 The first string in the list is the entire matched string. Each
4631 subsequent list element contains a string that matched a
4632 (capturing) subexpression of the regexp.
4633
4634 For example:
4635 \snippet code/src_corelib_text_qregexp.cpp 14
4636
4637 The above example also captures elements that may be present but
4638 which we have no interest in. This problem can be solved by using
4639 non-capturing parentheses:
4640
4641 \snippet code/src_corelib_text_qregexp.cpp 15
4642
4643 Note that if you want to iterate over the list, you should iterate
4644 over a copy, e.g.
4645 \snippet code/src_corelib_text_qregexp.cpp 16
4646
4647 Some regexps can match an indeterminate number of times. For
4648 example if the input string is "Offsets: 12 14 99 231 7" and the
4649 regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of
4650 all the numbers matched. However, after calling
4651 \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4652 "12"), i.e. the entire match was "12" and the first subexpression
4653 matched was "12". The correct approach is to use cap() in a
4654 \l{QRegExp#cap_in_a_loop}{loop}.
4655
4656 The order of elements in the string list is as follows. The first
4657 element is the entire matching string. Each subsequent element
4658 corresponds to the next capturing open left parentheses. Thus
4659 capturedTexts()[1] is the text of the first capturing parentheses,
4660 capturedTexts()[2] is the text of the second and so on
4661 (corresponding to $1, $2, etc., in some other regexp languages).
4662
4663 \sa cap(), pos()
4664*/
4665QStringList QRegExp::capturedTexts() const
4666{
4667 if (priv->capturedCache.isEmpty()) {
4668 prepareEngine(priv);
4669 const int *captured = priv->matchState.captured;
4670 int n = priv->matchState.capturedSize;
4671
4672 for (int i = 0; i < n; i += 2) {
4673 QString m;
4674 if (captured[i + 1] == 0)
4675 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
4676 else if (captured[i] >= 0)
4677 m = priv->t.mid(position: captured[i], n: captured[i + 1]);
4678 priv->capturedCache.append(t: m);
4679 }
4680 priv->t.clear();
4681 }
4682 return priv->capturedCache;
4683}
4684
4685/*!
4686 \internal
4687*/
4688QStringList QRegExp::capturedTexts()
4689{
4690 return const_cast<const QRegExp *>(this)->capturedTexts();
4691}
4692
4693/*!
4694 Returns the text captured by the \a nth subexpression. The entire
4695 match has index 0 and the parenthesized subexpressions have
4696 indexes starting from 1 (excluding non-capturing parentheses).
4697
4698 \snippet code/src_corelib_text_qregexp.cpp 17
4699
4700 The order of elements matched by cap() is as follows. The first
4701 element, cap(0), is the entire matching string. Each subsequent
4702 element corresponds to the next capturing open left parentheses.
4703 Thus cap(1) is the text of the first capturing parentheses, cap(2)
4704 is the text of the second, and so on.
4705
4706 \sa capturedTexts(), pos()
4707*/
4708QString QRegExp::cap(int nth) const
4709{
4710 return capturedTexts().value(i: nth);
4711}
4712
4713/*!
4714 \internal
4715*/
4716QString QRegExp::cap(int nth)
4717{
4718 return const_cast<const QRegExp *>(this)->cap(nth);
4719}
4720
4721/*!
4722 Returns the position of the \a nth captured text in the searched
4723 string. If \a nth is 0 (the default), pos() returns the position
4724 of the whole match.
4725
4726 Example:
4727 \snippet code/src_corelib_text_qregexp.cpp 18
4728
4729 For zero-length matches, pos() always returns -1. (For example, if
4730 cap(4) would return an empty string, pos(4) returns -1.) This is
4731 a feature of the implementation.
4732
4733 \sa cap(), capturedTexts()
4734*/
4735int QRegExp::pos(int nth) const
4736{
4737 if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
4738 return -1;
4739 else
4740 return priv->matchState.captured[2 * nth];
4741}
4742
4743/*!
4744 \internal
4745*/
4746int QRegExp::pos(int nth)
4747{
4748 return const_cast<const QRegExp *>(this)->pos(nth);
4749}
4750
4751/*!
    Returns a text string that explains why a regexp pattern is
    invalid, if that is the case; otherwise returns "no error occurred".
4754
4755 \sa isValid()
4756*/
4757QString QRegExp::errorString() const
4758{
4759 if (isValid()) {
4760 return QString::fromLatin1(RXERR_OK);
4761 } else {
4762 return priv->eng->errorString();
4763 }
4764}
4765
4766/*!
4767 \internal
4768*/
4769QString QRegExp::errorString()
4770{
4771 return const_cast<const QRegExp *>(this)->errorString();
4772}
4773
4774#endif
4775
4776/*!
4777 Returns the string \a str with every regexp special character
4778 escaped with a backslash. The special characters are $, (,), *, +,
4779 ., ?, [, \,], ^, {, | and }.
4780
4781 Example:
4782
4783 \snippet code/src_corelib_text_qregexp.cpp 19
4784
4785 This function is useful to construct regexp patterns dynamically:
4786
4787 \snippet code/src_corelib_text_qregexp.cpp 20
4788
4789 \sa setPatternSyntax()
4790*/
4791QString QRegExp::escape(const QString &str)
4792{
4793 QString quoted;
4794 const int count = str.size();
4795 quoted.reserve(asize: count * 2);
4796 const QLatin1Char backslash('\\');
4797 for (int i = 0; i < count; i++) {
4798 switch (str.at(i).toLatin1()) {
4799 case '$':
4800 case '(':
4801 case ')':
4802 case '*':
4803 case '+':
4804 case '.':
4805 case '?':
4806 case '[':
4807 case '\\':
4808 case ']':
4809 case '^':
4810 case '{':
4811 case '|':
4812 case '}':
4813 quoted.append(c: backslash);
4814 }
4815 quoted.append(c: str.at(i));
4816 }
4817 return quoted;
4818}
4819
4820
4821#ifndef QT_NO_DATASTREAM
4822/*!
4823 \relates QRegExp
4824
4825 Writes the regular expression \a regExp to stream \a out.
4826
4827 \sa {Serializing Qt Data Types}
4828*/
4829QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
4830{
4831 return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4832 << (quint8)regExp.patternSyntax()
4833 << (quint8)!!regExp.isMinimal();
4834}
4835
4836/*!
4837 \relates QRegExp
4838
4839 Reads a regular expression from stream \a in into \a regExp.
4840
4841 \sa {Serializing Qt Data Types}
4842*/
4843QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
4844{
4845 QString pattern;
4846 quint8 cs;
4847 quint8 patternSyntax;
4848 quint8 isMinimal;
4849
4850 in >> pattern >> cs >> patternSyntax >> isMinimal;
4851
4852 QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4853 QRegExp::PatternSyntax(patternSyntax));
4854
4855 newRegExp.setMinimal(isMinimal);
4856 regExp = newRegExp;
4857 return in;
4858}
4859#endif // QT_NO_DATASTREAM
4860
4861#ifndef QT_NO_DEBUG_STREAM
4862QDebug operator<<(QDebug dbg, const QRegExp &r)
4863{
4864 QDebugStateSaver saver(dbg);
4865 dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax()
4866 << ", pattern='"<< r.pattern() << "')";
4867 return dbg;
4868}
4869#endif
4870
4871QT_END_NAMESPACE
4872

source code of qt5compat/src/core5/text/qregexp.cpp