1// Copyright (C) 2016 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include "qregexp.h"
5
6#include "qalgorithms.h"
7#include "qbitarray.h"
8#include "qcache.h"
9#include "qdatastream.h"
10#include "qdebug.h"
11#include "qhashfunctions.h"
12#include "qlist.h"
13#include "qmap.h"
14#include "qmutex.h"
15#include "qstring.h"
16#include "qstringlist.h"
17#include "qstringmatcher.h"
18#include "private/qlocking_p.h"
19#include "qvarlengtharray.h"
20
21#include <limits.h>
22#include <algorithm>
23
24QT_BEGIN_NAMESPACE
25
26// error strings for the regexp parser
27#define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
28#define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
29#define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
30#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
31#define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
32#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
33#define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
34#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
35#define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
36#define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
37#define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
38#define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")
39
40/*!
41 \class QRegExp
42 \inmodule QtCore5Compat
43 \reentrant
44 \brief The QRegExp class provides pattern matching using regular expressions.
45
46 \ingroup tools
47 \ingroup shared
48
49 \keyword regular expression
50
51 This class is deprecated in Qt 6. Please use QRegularExpression instead
52 for all new code. For guidelines on porting old code from QRegExp to
53 QRegularExpression, see \l{Porting to QRegularExpression}.
54
55 A regular expression, or "regexp", is a pattern for matching
56 substrings in a text. This is useful in many contexts, e.g.,
57
58 \table
59 \row \li Validation
60 \li A regexp can test whether a substring meets some criteria,
61 e.g. is an integer or contains no whitespace.
62 \row \li Searching
63 \li A regexp provides more powerful pattern matching than
64 simple substring matching, e.g., match one of the words
65 \e{mail}, \e{letter} or \e{correspondence}, but none of the
66 words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
67 \row \li Search and Replace
68 \li A regexp can replace all occurrences of a substring with a
69 different substring, e.g., replace all occurrences of \e{&}
70 with \e{\&amp;} except where the \e{&} is already followed by
71 an \e{amp;}.
72 \row \li String Splitting
73 \li A regexp can be used to identify where a string should be
74 split apart, e.g. splitting tab-delimited strings.
75 \endtable
76
77 What follows is a brief introduction to regexps, a description of
78 Qt's regexp language, some examples, and the function
79 documentation itself. QRegExp is modeled on Perl's regexp
80 language. It fully supports Unicode. QRegExp can also be used in a
81 simpler, \e{wildcard mode} that is similar to the functionality
82 found in command shells. The syntax rules used by QRegExp can be
83 changed with setPatternSyntax(). In particular, the pattern syntax
84 can be set to QRegExp::FixedString, which means the pattern to be
85 matched is interpreted as a plain string, i.e., special characters
86 (e.g., backslash) are not escaped.
87
88 A good text on regexps is \e {Mastering Regular Expressions}
89 (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.
90
91 \note In Qt 5, the new QRegularExpression class provides a Perl
92 compatible implementation of regular expressions and is recommended
93 in place of QRegExp.
94
95 \tableofcontents
96
97 \section1 Introduction
98
99 Regexps are built up from expressions, quantifiers, and
100 assertions. The simplest expression is a character, e.g. \b{x}
101 or \b{5}. An expression can also be a set of characters
102 enclosed in square brackets. \b{[ABCD]} will match an \b{A}
103 or a \b{B} or a \b{C} or a \b{D}. We can write this same
104 expression as \b{[A-D]}, and an expression to match any
105 capital letter in the English alphabet is written as
106 \b{[A-Z]}.
107
108 A quantifier specifies the number of occurrences of an expression
109 that must be matched. \b{x{1,1}} means match one and only one
110 \b{x}. \b{x{1,5}} means match a sequence of \b{x}
111 characters that contains at least one \b{x} but no more than
112 five.
113
114 Note that in general regexps cannot be used to check for balanced
115 brackets or tags. For example, a regexp can be written to match an
116 opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
117 are not nested, but if the \c{<b>} tags are nested, that same
118 regexp will match an opening \c{<b>} tag with the wrong closing
119 \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the
120 first \c{<b>} would be matched with the first \c{</b>}, which is
121 not correct. However, it is possible to write a regexp that will
122 match nested brackets or tags correctly, but only if the number of
123 nesting levels is fixed and known. If the number of nesting levels
124 is not fixed and known, it is impossible to write a regexp that
125 will not fail.
126
127 Suppose we want a regexp to match integers in the range 0 to 99.
128 At least one digit is required, so we start with the expression
129 \b{[0-9]{1,1}}, which matches a single digit exactly once. This
130 regexp matches integers in the range 0 to 9. To match integers up
131 to 99, increase the maximum number of occurrences to 2, so the
132 regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the
133 original requirement to match integers from 0 to 99, but it will
134 also match integers that occur in the middle of strings. If we
135 want the matched integer to be the whole string, we must use the
136 anchor assertions, \b{^} (caret) and \b{$} (dollar). When
137 \b{^} is the first character in a regexp, it means the regexp
138 must match from the beginning of the string. When \b{$} is the
139 last character of the regexp, it means the regexp must match to
140 the end of the string. The regexp becomes \b{^[0-9]{1,2}$}.
141 Note that assertions, e.g. \b{^} and \b{$}, do not match
142 characters but locations in the string.
143
144 If you have seen regexps described elsewhere, they may have looked
145 different from the ones shown here. This is because some sets of
146 characters and some quantifiers are so common that they have been
147 given special symbols to represent them. \b{[0-9]} can be
148 replaced with the symbol \b{\\d}. The quantifier to match
149 exactly one occurrence, \b{{1,1}}, can be replaced with the
150 expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So
151 our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can
152 also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of
153 the string, match a digit, followed immediately by 0 or 1 digits}.
154 In practice, it would be written as \b{^\\d\\d?$}. The \b{?}
155 is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1
156 occurrences. \b{?} makes an expression optional. The regexp
157 \b{^\\d\\d?$} means \e{From the beginning of the string, match
158 one digit, followed immediately by 0 or 1 more digit, followed
159 immediately by end of string}.
160
161 To write a regexp that matches one of the words 'mail' \e or
162 'letter' \e or 'correspondence' but does not match words that
163 contain these words, e.g., 'email', 'mailman', 'mailer', and
164 'letterbox', start with a regexp that matches 'mail'. Expressed
165 fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
166 a character expression is automatically quantified by
167 \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an
168 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
169 we can use the vertical bar \b{|}, which means \b{or}, to
170 include the other two words, so our regexp for matching any of the
171 three words becomes \b{mail|letter|correspondence}. Match
172 'mail' \b{or} 'letter' \b{or} 'correspondence'. While this
173 regexp will match one of the three words we want to match, it will
174 also match words we don't want to match, e.g., 'email'. To
175 prevent the regexp from matching unwanted words, we must tell it
176 to begin and end the match at word boundaries. First we enclose
177 our regexp in parentheses, \b{(mail|letter|correspondence)}.
178 Parentheses group expressions together, and they identify a part
179 of the regexp that we wish to \l{capturing text}{capture}.
180 Enclosing the expression in parentheses allows us to use it as a
181 component in more complex regexps. It also allows us to examine
182 which of the three words was actually matched. To force the match
183 to begin and end on word boundaries, we enclose the regexp in
184 \b{\\b} \e{word boundary} assertions:
185 \b{\\b(mail|letter|correspondence)\\b}. Now the regexp means:
186 \e{Match a word boundary, followed by the regexp in parentheses,
187 followed by a word boundary}. The \b{\\b} assertion matches a
188 \e position in the regexp, not a \e character. A word boundary is
189 any non-word character, e.g., a space, newline, or the beginning
190 or ending of a string.
191
192 If we want to replace ampersand characters with the HTML entity
193 \b{\&amp;}, the regexp to match is simply \b{\&}. But this
194 regexp will also match ampersands that have already been converted
195 to HTML entities. We want to replace only ampersands that are not
196 already followed by \b{amp;}. For this, we need the negative
197 lookahead assertion, \b{(?!}__\b{)}. The regexp can then be
198 written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
199 \b{not} \e{followed by} \b{amp;}.
200
201 If we want to count all the occurrences of 'Eric' and 'Eirik' in a
202 string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and
203 \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
204 required to avoid matching words that contain either name,
205 e.g. 'Ericsson'. Note that the second regexp matches more
206 spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
207
208 Some of the examples discussed above are implemented in the
209 \l{#code-examples}{code examples} section.
210
211 \target characters-and-abbreviations-for-sets-of-characters
212 \section1 Characters and Abbreviations for Sets of Characters
213
214 \table
215 \header \li Element \li Meaning
216 \row \li \b{c}
217 \li A character represents itself unless it has a special
218 regexp meaning. e.g. \b{c} matches the character \e c.
219 \row \li \b{\\c}
220 \li A character that follows a backslash matches the character
221 itself, except as specified below. e.g., To match a literal
222 caret at the beginning of a string, write \b{\\^}.
223 \row \li \b{\\a}
224 \li Matches the ASCII bell (BEL, 0x07).
225 \row \li \b{\\f}
226 \li Matches the ASCII form feed (FF, 0x0C).
227 \row \li \b{\\n}
228 \li Matches the ASCII line feed (LF, 0x0A, Unix newline).
229 \row \li \b{\\r}
230 \li Matches the ASCII carriage return (CR, 0x0D).
231 \row \li \b{\\t}
232 \li Matches the ASCII horizontal tab (HT, 0x09).
233 \row \li \b{\\v}
234 \li Matches the ASCII vertical tab (VT, 0x0B).
235 \row \li \b{\\x\e{hhhh}}
236 \li Matches the Unicode character corresponding to the
237 hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
238 \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo})
239 \li matches the ASCII/Latin1 character for the octal number
240 \e{ooo} (between 0 and 0377).
241 \row \li \b{. (dot)}
242 \li Matches any character (including newline).
243 \row \li \b{\\d}
244 \li Matches a digit (QChar::isDigit()).
245 \row \li \b{\\D}
246 \li Matches a non-digit.
247 \row \li \b{\\s}
248 \li Matches a whitespace character (QChar::isSpace()).
249 \row \li \b{\\S}
250 \li Matches a non-whitespace character.
251 \row \li \b{\\w}
252 \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
253 \row \li \b{\\W}
254 \li Matches a non-word character.
255 \row \li \b{\\\e{n}}
256 \li The \e{n}-th backreference, e.g. \\1, \\2, etc.
257 \endtable
258
259 \b{Note:} The C++ compiler transforms backslashes in strings.
260 To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}.
261 To match the backslash character itself, enter it four times, i.e.
262 \c{\\\\}.
263
264 \target sets-of-characters
265 \section1 Sets of Characters
266
267 Square brackets mean match any character contained in the square
268 brackets. The character set abbreviations described above can
269 appear in a character set in square brackets. Except for the
270 character set abbreviations and the following two exceptions,
271 characters do not have special meanings in square brackets.
272
273 \table
274 \row \li \b{^}
275
276 \li The caret negates the character set if it occurs as the
277 first character (i.e. immediately after the opening square
278 bracket). \b{[abc]} matches 'a' or 'b' or 'c', but
279 \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
280
281 \row \li \b{-}
282
283 \li The dash indicates a range of characters. \b{[W-Z]}
284 matches 'W' or 'X' or 'Y' or 'Z'.
285
286 \endtable
287
288 Using the predefined character set abbreviations is more portable
289 than using character ranges across platforms and languages. For
290 example, \b{[0-9]} matches a digit in Western alphabets but
291 \b{\\d} matches a digit in \e any alphabet.
292
293 Note: In other regexp documentation, sets of characters are often
294 called "character classes".
295
296 \target quantifiers
297 \section1 Quantifiers
298
299 By default, an expression is automatically quantified by
300 \b{{1,1}}, i.e. it should occur exactly once. In the following
301 list, \b{\e {E}} stands for expression. An expression is a
302 character, or an abbreviation for a set of characters, or a set of
303 characters in square brackets, or an expression in parentheses.
304
305 \table
306 \row \li \b{\e {E}?}
307
308 \li Matches zero or one occurrences of \e E. This quantifier
309 means \e{The previous expression is optional}, because it
310 will match whether or not the expression is found. \b{\e
311 {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?}
312 matches 'dent' or 'dents'.
313
314 \row \li \b{\e {E}+}
315
316 \li Matches one or more occurrences of \e E. \b{\e {E}+} is
317 the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0',
318 '00', '000', etc.
319
320 \row \li \b{\e {E}*}
321
322 \li Matches zero or more occurrences of \e E. It is the same
323 as \b{\e {E}{0,}}. The \b{*} quantifier is often used
324 in error where \b{+} should be used. For example, if
325 \b{\\s*$} is used in an expression to match strings that
326 end in whitespace, it will match every string because
327 \b{\\s*$} means \e{Match zero or more whitespaces followed
328 by end of string}. The correct regexp to match strings that
329 have at least one trailing whitespace character is
330 \b{\\s+$}.
331
332 \row \li \b{\e {E}{n}}
333
334 \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}}
335 is the same as repeating \e E \e n times. For example,
336 \b{x{5}} is the same as \b{xxxxx}. It is also the same
337 as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}.
338
339 \row \li \b{\e {E}{n,}}
340 \li Matches at least \e n occurrences of \e E.
341
342 \row \li \b{\e {E}{,m}}
343 \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}}
344 is the same as \b{\e {E}{0,m}}.
345
346 \row \li \b{\e {E}{n,m}}
347 \li Matches at least \e n and at most \e m occurrences of \e E.
348 \endtable
349
350 To apply a quantifier to more than just the preceding character,
351 use parentheses to group characters together in an expression. For
352 example, \b{tag+} matches a 't' followed by an 'a' followed by
353 at least one 'g', whereas \b{(tag)+} matches at least one
354 occurrence of 'tag'.
355
356 Note: Quantifiers are normally "greedy". They always match as much
357 text as they can. For example, \b{0+} matches the first zero it
358 finds and all the consecutive zeros after the first zero. Applied
359 to '20005', it matches '2\underline{000}5'. Quantifiers can be made
360 non-greedy, see setMinimal().
361
362 \target capturing parentheses
363 \target backreferences
364 \section1 Capturing Text
365
366 Parentheses allow us to group elements together so that we can
367 quantify and capture them. For example if we have the expression
368 \b{mail|letter|correspondence} that matches a string we know
369 that \e one of the words matched but not which one. Using
370 parentheses allows us to "capture" whatever is matched within
371 their bounds, so if we used \b{(mail|letter|correspondence)}
372 and matched this regexp against the string "I sent you some email"
373 we can use the cap() or capturedTexts() functions to extract the
374 matched characters, in this case 'mail'.
375
376 We can use captured text within the regexp itself. To refer to the
377 captured text we use \e backreferences which are indexed from 1,
378 the same as for cap(). For example we could search for duplicate
379 words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a
380 word boundary followed by one or more word characters followed by
381 one or more non-word characters followed by the same text as the
382 first parenthesized expression followed by a word boundary.
383
384 If we want to use parentheses purely for grouping and not for
385 capturing we can use the non-capturing syntax, e.g.
386 \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
387 end ')'. In this example we match either 'green' or 'blue' but we
388 do not capture the match so we only know whether or not we matched
389 but not which color we actually found. Using non-capturing
390 parentheses is more efficient than using capturing parentheses
391 since the regexp engine has to do less book-keeping.
392
393 Both capturing and non-capturing parentheses may be nested.
394
395 \target greedy quantifiers
396
397 For historical reasons, quantifiers (e.g. \b{*}) that apply to
398 capturing parentheses are more "greedy" than other quantifiers.
399 For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa".
400 This behavior is different from what other regexp engines do
401 (notably, Perl). To obtain a more intuitive capturing behavior,
402 specify QRegExp::RegExp2 to the QRegExp constructor or call
403 setPatternSyntax(QRegExp::RegExp2).
404
405 \target cap_in_a_loop
406
407 When the number of matches cannot be determined in advance, a
408 common idiom is to use cap() in a loop. For example:
409
410 \snippet code/src_corelib_text_qregexp.cpp 0
411
412 \target assertions
413 \section1 Assertions
414
415 Assertions make some statement about the text at the point where
416 they occur in the regexp but they do not match any characters. In
417 the following list \b{\e {E}} stands for any expression.
418
419 \table
420 \row \li \b{^}
421 \li The caret signifies the beginning of the string. If you
422 wish to match a literal \c{^} you must escape it by
423 writing \c{\\^}. For example, \b{^#include} will only
424 match strings which \e begin with the characters '#include'.
425 (When the caret is the first character of a character set it
426 has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.)
427
428 \row \li \b{$}
429 \li The dollar signifies the end of the string. For example
430 \b{\\d\\s*$} will match strings which end with a digit
431 optionally followed by whitespace. If you wish to match a
432 literal \c{$} you must escape it by writing
433 \c{\\$}.
434
435 \row \li \b{\\b}
436 \li A word boundary. For example the regexp
437 \b{\\bOK\\b} means match immediately after a word
438 boundary (e.g. start of string or whitespace) the letter 'O'
439 then the letter 'K' immediately before another word boundary
440 (e.g. end of string or whitespace). But note that the
441 assertion does not actually match any whitespace so if we
442 write \b{(\\bOK\\b)} and we have a match it will only
443 contain 'OK' even if the string is "It's \underline{OK} now".
444
445 \row \li \b{\\B}
446 \li A non-word boundary. This assertion is true wherever
447 \b{\\b} is false. For example if we searched for
448 \b{\\Bon\\B} in "Left on" the match would fail (space
449 and end of string aren't non-word boundaries), but it would
450 match in "t\underline{on}ne".
451
452 \row \li \b{(?=\e E)}
453 \li Positive lookahead. This assertion is true if the
454 expression matches at this point in the regexp. For example,
455 \b{const(?=\\s+char)} matches 'const' whenever it is
456 followed by 'char', as in 'static \underline{const} char *'.
457 (Compare with \b{const\\s+char}, which matches 'static
458 \underline{const char} *'.)
459
460 \row \li \b{(?!\e E)}
461 \li Negative lookahead. This assertion is true if the
462 expression does not match at this point in the regexp. For
463 example, \b{const(?!\\s+char)} matches 'const' \e except
464 when it is followed by 'char'.
465 \endtable
466
467 \target QRegExp wildcard matching
468 \section1 Wildcard Matching
469
470 Most command shells such as \e bash or \e cmd.exe support "file
471 globbing", the ability to identify a group of files by using
472 wildcards. The setPatternSyntax() function is used to switch
473 between regexp and wildcard mode. Wildcard matching is much
474 simpler than full regexps and has only four features:
475
476 \table
477 \row \li \b{c}
478 \li Any character represents itself apart from those mentioned
479 below. Thus \b{c} matches the character \e c.
480 \row \li \b{?}
481 \li Matches any single character. It is the same as
482 \b{.} in full regexps.
483 \row \li \b{*}
484 \li Matches zero or more of any characters. It is the
485 same as \b{.*} in full regexps.
486 \row \li \b{[...]}
487 \li Sets of characters can be represented in square brackets,
488 similar to full regexps. Within the character class, like
489 outside, backslash has no special meaning.
490 \endtable
491
492 In the mode Wildcard, the wildcard characters cannot be
493 escaped. In the mode WildcardUnix, the character '\\' escapes the
494 wildcard.
495
496 For example if we are in wildcard mode and have strings which
497 contain filenames we could identify HTML files with \b{*.html}.
498 This will match zero or more characters followed by a dot followed
499 by 'h', 't', 'm' and 'l'.
500
501 To test a string against a wildcard expression, use exactMatch().
502 For example:
503
504 \snippet code/src_corelib_text_qregexp.cpp 1
505
506 \target perl-users
507 \section1 Notes for Perl Users
508
509 Most of the character class abbreviations supported by Perl are
510 supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters}
511 {characters and abbreviations for sets of characters}.
512
513 In QRegExp, apart from within character classes, \c{^} always
514 signifies the start of the string, so carets must always be
515 escaped unless used for that purpose. In Perl the meaning of caret
516 varies automagically depending on where it occurs so escaping it
517 is rarely necessary. The same applies to \c{$} which in
518 QRegExp always signifies the end of the string.
519
520 QRegExp's quantifiers are the same as Perl's greedy quantifiers
521 (but see the \l{greedy quantifiers}{note above}). Non-greedy
522 matching cannot be applied to individual quantifiers, but can be
523 applied to all the quantifiers in the pattern. For example, to
524 match the Perl regexp \b{ro+?m} requires:
525
526 \snippet code/src_corelib_text_qregexp.cpp 2
527
528 The equivalent of Perl's \c{/i} option is
529 setCaseSensitivity(Qt::CaseInsensitive).
530
531 Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
532
533 In QRegExp \b{.} matches any character, therefore all QRegExp
534 regexps have the equivalent of Perl's \c{/s} option. QRegExp
535 does not have an equivalent to Perl's \c{/m} option, but this
536 can be emulated in various ways for example by splitting the input
537 into lines or by looping with a regexp that searches for newlines.
538
539 Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
540 assertions. The \\G assertion is not supported but can be emulated
541 in a loop.
542
543 Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
544 equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
545 ... correspond to cap(1) or capturedTexts()[1], cap(2) or
546 capturedTexts()[2], etc.
547
548 To substitute a pattern use QString::replace().
549
550 Perl's extended \c{/x} syntax is not supported, nor are
551 directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
552 the other hand, C++'s rules for literal strings can be used to
553 achieve the same:
554
555 \snippet code/src_corelib_text_qregexp.cpp 3
556
557 Both zero-width positive and zero-width negative lookahead
558 assertions (?=pattern) and (?!pattern) are supported with the same
559 syntax as Perl. Perl's lookbehind assertions, "independent"
560 subexpressions and conditional expressions are not supported.
561
562 Non-capturing parentheses are also supported, with the same
563 (?:pattern) syntax.
564
565 See QString::split() and QStringList::join() for equivalents
566 to Perl's split and join functions.
567
568 Note: because C++ transforms \\'s they must be written \e twice in
569 code, e.g. \b{\\b} must be written \b{\\\\b}.
570
571 \target code-examples
572 \section1 Code Examples
573
574 \snippet code/src_corelib_text_qregexp.cpp 4
575
576 The third string matches '\underline{6}'. This is a simple validation
577 regexp for integers in the range 0 to 99.
578
579 \snippet code/src_corelib_text_qregexp.cpp 5
580
581 The second string matches '\underline{This_is-OK}'. We've used the
582 character set abbreviation '\\S' (non-whitespace) and the anchors
583 to match strings which contain no whitespace.
584
585 In the following example we match strings containing 'mail' or
586 'letter' or 'correspondence' but only match whole words i.e. not
587 'email'
588
589 \snippet code/src_corelib_text_qregexp.cpp 6
590
591 The second string matches "Please write the \underline{letter}". The
592 word 'letter' is also captured (because of the parentheses). We
593 can see what text we've captured like this:
594
595 \snippet code/src_corelib_text_qregexp.cpp 7
596
597 This will capture the text from the first set of capturing
598 parentheses (counting capturing left parentheses from left to
599 right). The parentheses are counted from 1 since cap(0) is the
600 whole matched regexp (equivalent to '&' in most regexp engines).
601
602 \snippet code/src_corelib_text_qregexp.cpp 8
603
604 Here we've passed the QRegExp to QString's replace() function to
605 replace the matched text with new text.
606
607 \snippet code/src_corelib_text_qregexp.cpp 9
608
609 We've used the indexIn() function to repeatedly match the regexp in
610 the string. Note that instead of moving forward by one character
611 at a time \c pos++ we could have written \c {pos +=
612 rx.matchedLength()} to skip over the already matched string. The
613 count will equal 3, matching 'One \underline{Eric} another
614 \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
615 doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
616 by word boundaries.
617
618 One common use of regexps is to split lines of delimited data into
619 their component fields.
620
621 \snippet code/src_corelib_text_qregexp.cpp 10
622
623 In this example our input lines have the format company name, web
624 address and country. Unfortunately the regexp is rather long and
625 not very versatile -- the code will break if we add any more
626 fields. A simpler and better solution is to look for the
627 separator, '\\t' in this case, and take the surrounding text. The
628 QString::split() function can take a separator string or regexp
629 as an argument and split a string accordingly.
630
631 \snippet code/src_corelib_text_qregexp.cpp 11
632
633 Here field[0] is the company, field[1] the web address and so on.
634
635 To imitate the matching of a shell we can use wildcard mode.
636
637 \snippet code/src_corelib_text_qregexp.cpp 12
638
639 Wildcard matching can be convenient because of its simplicity, but
640 any wildcard regexp can be defined using full regexps, e.g.
641 \b{.*\\.html$}. Notice that we can't match both \c .html and \c
642 .htm files with a wildcard unless we use \b{*.htm*} which will
643 also match 'test.html.bak'. A full regexp gives us the precision
644 we need, \b{.*\\.html?$}.
645
646 QRegExp can match case insensitively using setCaseSensitivity(),
647 and can use non-greedy matching, see setMinimal(). By
648 default QRegExp uses full regexps but this can be changed with
649 setPatternSyntax(). Searching can be done forward with indexIn() or backward
650 with lastIndexIn(). Captured text can be accessed using
651 capturedTexts() which returns a string list of all captured
652 strings, or using cap() which returns the captured string for the
653 given index. The pos() function takes a match index and returns
654 the position in the string where the match was made (or -1 if
655 there was no match).
656
657 \sa QString, QStringList, QSortFilterProxyModel
658
659 \section1 Porting to QRegularExpression
660
661 \include corelib/port-from-qregexp.qdocinc porting-to-qregularexpression
662*/
663
664#if defined(Q_OS_VXWORKS) && defined(EOS)
665# undef EOS
666#endif
667
668const int NumBadChars = 64;
669#define BadChar(ch) ((ch).unicode() % NumBadChars)
670
671const int NoOccurrence = INT_MAX;
672const int EmptyCapture = INT_MAX;
673const int InftyLen = INT_MAX;
674const int InftyRep = 1025;
675const int EOS = -1;
676
677static bool isWord(QChar ch)
678{
679 return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
680}
681
682/*
683 Merges two vectors of ints and puts the result into the first
684 one.
685*/
686static void mergeInto(QList<int> *a, const QList<int> &b)
687{
688 int asize = a->size();
689 int bsize = b.size();
690 if (asize == 0) {
691 *a = b;
692#ifndef QT_NO_REGEXP_OPTIM
693 } else if (bsize == 1 && a->at(i: asize - 1) < b.at(i: 0)) {
694 a->resize(size: asize + 1);
695 (*a)[asize] = b.at(i: 0);
696#endif
697 } else if (bsize >= 1) {
698 int csize = asize + bsize;
699 QList<int> c(csize);
700 int i = 0, j = 0, k = 0;
701 while (i < asize) {
702 if (j < bsize) {
703 if (a->at(i) == b.at(i: j)) {
704 ++i;
705 --csize;
706 } else if (a->at(i) < b.at(i: j)) {
707 c[k++] = a->at(i: i++);
708 } else {
709 c[k++] = b.at(i: j++);
710 }
711 } else {
712 memcpy(dest: c.data() + k, src: a->constData() + i, n: (asize - i) * sizeof(int));
713 break;
714 }
715 }
716 c.resize(size: csize);
717 if (j < bsize)
718 memcpy(dest: c.data() + k, src: b.constData() + j, n: (bsize - j) * sizeof(int));
719 *a = c;
720 }
721}
722
723#ifndef QT_NO_REGEXP_WILDCARD
724/*
725 Translates a wildcard pattern to an equivalent regular expression
726 pattern (e.g., *.cpp to .*\.cpp).
727
728 If enableEscaping is true, it is possible to escape the wildcard
729 characters with \
730*/
731static QString wc2rx(const QString &wc_str, const bool enableEscaping)
732{
733 const int wclen = wc_str.size();
734 QString rx;
735 int i = 0;
736 bool isEscaping = false; // the previous character is '\'
737 const QChar *wc = wc_str.unicode();
738
739 while (i < wclen) {
740 const QChar c = wc[i++];
741 switch (c.unicode()) {
742 case '\\':
743 if (enableEscaping) {
744 if (isEscaping) {
745 rx += QLatin1String("\\\\");
746 } // we insert the \\ later if necessary
747 if (i == wclen) { // the end
748 rx += QLatin1String("\\\\");
749 }
750 } else {
751 rx += QLatin1String("\\\\");
752 }
753 isEscaping = true;
754 break;
755 case '*':
756 if (isEscaping) {
757 rx += QLatin1String("\\*");
758 isEscaping = false;
759 } else {
760 rx += QLatin1String(".*");
761 }
762 break;
763 case '?':
764 if (isEscaping) {
765 rx += QLatin1String("\\?");
766 isEscaping = false;
767 } else {
768 rx += QLatin1Char('.');
769 }
770
771 break;
772 case '$':
773 case '(':
774 case ')':
775 case '+':
776 case '.':
777 case '^':
778 case '{':
779 case '|':
780 case '}':
781 if (isEscaping) {
782 isEscaping = false;
783 rx += QLatin1String("\\\\");
784 }
785 rx += QLatin1Char('\\');
786 rx += c;
787 break;
788 case '[':
789 if (isEscaping) {
790 isEscaping = false;
791 rx += QLatin1String("\\[");
792 } else {
793 rx += c;
794 if (wc[i] == QLatin1Char('^'))
795 rx += wc[i++];
796 if (i < wclen) {
797 if (wc[i] == QLatin1Char(']'))
798 rx += wc[i++];
799 while (i < wclen && wc[i] != QLatin1Char(']')) {
800 if (wc[i] == QLatin1Char('\\'))
801 rx += QLatin1Char('\\');
802 rx += wc[i++];
803 }
804 }
805 }
806 break;
807
808 case ']':
809 if (isEscaping){
810 isEscaping = false;
811 rx += QLatin1String("\\");
812 }
813 rx += c;
814 break;
815
816 default:
817 if (isEscaping){
818 isEscaping = false;
819 rx += QLatin1String("\\\\");
820 }
821 rx += c;
822 }
823 }
824 return rx;
825}
826#endif
827
828static int caretIndex(int offset, QRegExp::CaretMode caretMode)
829{
830 if (caretMode == QRegExp::CaretAtZero) {
831 return 0;
832 } else if (caretMode == QRegExp::CaretAtOffset) {
833 return offset;
834 } else { // QRegExp::CaretWontMatch
835 return -1;
836 }
837}
838
839/*
840 The QRegExpEngineKey struct uniquely identifies an engine.
841*/
842struct QRegExpEngineKey
843{
844 QString pattern;
845 QRegExp::PatternSyntax patternSyntax;
846 Qt::CaseSensitivity cs;
847
848 inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
849 Qt::CaseSensitivity cs)
850 : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
851
852 inline void clear() {
853 pattern.clear();
854 patternSyntax = QRegExp::RegExp;
855 cs = Qt::CaseSensitive;
856 }
857};
858
859static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
860{
861 return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
862 && key1.cs == key2.cs;
863}
864
865static size_t qHash(const QRegExpEngineKey &key, size_t seed = 0) noexcept
866{
867 return qHashMulti(seed, args: key.pattern, args: key.patternSyntax, args: key.cs);
868}
869
870class QRegExpEngine;
871
872/*
873 This is the engine state during matching.
874*/
875struct QRegExpMatchState
876{
877 const QChar *in; // a pointer to the input string data
878 int pos; // the current position in the string
879 int caretPos;
880 int len; // the length of the input string
881 bool minimal; // minimal matching?
882 int *bigArray; // big array holding the data for the next pointers
883 int *inNextStack; // is state is nextStack?
884 int *curStack; // stack of current states
885 int *nextStack; // stack of next states
886 int *curCapBegin; // start of current states' captures
887 int *nextCapBegin; // start of next states' captures
888 int *curCapEnd; // end of current states' captures
889 int *nextCapEnd; // end of next states' captures
890 int *tempCapBegin; // start of temporary captures
891 int *tempCapEnd; // end of temporary captures
892 int *capBegin; // start of captures for a next state
893 int *capEnd; // end of captures for a next state
894 int *slideTab; // bump-along slide table for bad-character heuristic
895 int *captured; // what match() returned last
896 int slideTabSize; // size of slide table
897 int capturedSize;
898#ifndef QT_NO_REGEXP_BACKREF
899 QList<QList<int>> sleeping; // list of back-reference sleepers
900#endif
901 int matchLen; // length of match
902 int oneTestMatchedLen; // length of partial match
903
904 const QRegExpEngine *eng;
905
906 inline QRegExpMatchState() : bigArray(nullptr), captured(nullptr) {}
907 inline ~QRegExpMatchState() { free(ptr: bigArray); }
908
909 void drain() { free(ptr: bigArray); bigArray = nullptr; captured = nullptr; } // to save memory
910 void prepareForMatch(QRegExpEngine *eng);
911 void match(const QChar *str, int len, int pos, bool minimal,
912 bool oneTest, int caretIndex);
913 bool matchHere();
914 bool testAnchor(int i, int a, const int *capBegin);
915};
916
917/*
918 The struct QRegExpAutomatonState represents one state in a modified NFA. The
919 input characters matched are stored in the state instead of on
920 the transitions, something possible for an automaton
921 constructed from a regular expression.
922*/
923struct QRegExpAutomatonState
924{
925#ifndef QT_NO_REGEXP_CAPTURE
926 int atom; // which atom does this state belong to?
927#endif
928 int match; // what does it match? (see CharClassBit and BackRefBit)
929 QList<int> outs; // out-transitions
930 QMap<int, int> reenter; // atoms reentered when transiting out
931 QMap<int, int> anchors; // anchors met when transiting out
932
933 inline QRegExpAutomatonState() { }
934#ifndef QT_NO_REGEXP_CAPTURE
935 inline QRegExpAutomatonState(int a, int m)
936 : atom(a), match(m) { }
937#else
938 inline QRegExpAutomatonState(int m)
939 : match(m) { }
940#endif
941};
942
943Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_RELOCATABLE_TYPE);
944
945/*
946 The struct QRegExpCharClassRange represents a range of characters (e.g.,
947 [0-9] denotes range 48 to 57).
948*/
949struct QRegExpCharClassRange
950{
951 ushort from; // 48
952 ushort len; // 10
953};
954
955Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
956
957#ifndef QT_NO_REGEXP_CAPTURE
/*
  The struct QRegExpAtom represents one node in the hierarchy of regular
  expression atoms.
*/
struct QRegExpAtom
{
    // Special values stored in the capture member.
    enum {
        NoCapture = -1,
        OfficialCapture = -2,
        UnofficialCapture = -3
    };

    int parent;     // index of parent in array of atoms
    int capture;    // index of capture, from 1 to ncap - 1
};
969
970Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
971#endif
972
973struct QRegExpLookahead;
974
975#ifndef QT_NO_REGEXP_ANCHOR_ALT
/*
  The struct QRegExpAnchorAlternation represents a pair of anchors with
  OR semantics: the alternation holds when either a or b holds.
*/
struct QRegExpAnchorAlternation
{
    int a;  // this anchor...
    int b;  // ...or this one
};
985
986Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
987#endif
988
989#ifndef QT_NO_REGEXP_CCLASS
990
991#define FLAG(x) (1 << (x))
992/*
993 The class QRegExpCharClass represents a set of characters, such as can
994 be found in regular expressions (e.g., [a-z] denotes the set
995 {a, b, ..., z}).
996*/
997class QRegExpCharClass
998{
999public:
1000 QRegExpCharClass();
1001
1002 void clear();
1003 bool negative() const { return n; }
1004 void setNegative(bool negative);
1005 void addCategories(uint cats);
1006 void addRange(ushort from, ushort to);
1007 void addSingleton(ushort ch) { addRange(from: ch, to: ch); }
1008
1009 bool in(QChar ch) const;
1010#ifndef QT_NO_REGEXP_OPTIM
1011 const QList<int> &firstOccurrence() const { return occ1; }
1012#endif
1013
1014#if defined(QT_DEBUG)
1015 void dump() const;
1016#endif
1017
1018private:
1019 QList<QRegExpCharClassRange> r; // character ranges
1020#ifndef QT_NO_REGEXP_OPTIM
1021 QList<int> occ1; // first-occurrence array
1022#endif
1023 uint c; // character classes
1024 bool n; // negative?
1025};
1026#else
1027struct QRegExpCharClass
1028{
1029 int dummy;
1030
1031#ifndef QT_NO_REGEXP_OPTIM
1032 QRegExpCharClass() { occ1.fill(0, NumBadChars); }
1033
1034 const QList<int> &firstOccurrence() const { return occ1; }
1035 QList<int> occ1;
1036#endif
1037};
1038#endif
1039
1040Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_RELOCATABLE_TYPE);
1041
1042/*
1043 The QRegExpEngine class encapsulates a modified nondeterministic
1044 finite automaton (NFA).
1045*/
1046class QRegExpEngine
1047{
1048public:
1049 QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1050 : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1051
1052 QRegExpEngine(const QRegExpEngineKey &key);
1053 ~QRegExpEngine();
1054
1055 bool isValid() const { return valid; }
1056 const QString &errorString() const { return yyError; }
1057 int captureCount() const { return officialncap; }
1058
1059 int createState(QChar ch);
1060 int createState(const QRegExpCharClass &cc);
1061#ifndef QT_NO_REGEXP_BACKREF
1062 int createState(int bref);
1063#endif
1064
1065 void addCatTransitions(const QList<int> &from, const QList<int> &to);
1066#ifndef QT_NO_REGEXP_CAPTURE
1067 void addPlusTransitions(const QList<int> &from, const QList<int> &to, int atom);
1068#endif
1069
1070#ifndef QT_NO_REGEXP_ANCHOR_ALT
1071 int anchorAlternation(int a, int b);
1072 int anchorConcatenation(int a, int b);
1073#else
1074 int anchorAlternation(int a, int b) { return a & b; }
1075 int anchorConcatenation(int a, int b) { return a | b; }
1076#endif
1077 void addAnchors(int from, int to, int a);
1078
1079#ifndef QT_NO_REGEXP_OPTIM
1080 void heuristicallyChooseHeuristic();
1081#endif
1082
1083#if defined(QT_DEBUG)
1084 void dump() const;
1085#endif
1086
1087 QAtomicInt ref;
1088
1089private:
1090 enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
1091 enum { InitialState = 0, FinalState = 1 };
1092
1093 void setup();
1094 int setupState(int match);
1095
1096 /*
1097 Let's hope that 13 lookaheads and 14 back-references are
1098 enough.
1099 */
1100 enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1101 enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
1102 Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
1103 Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1104 Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1105 Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1106
1107 Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
1108 ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
1109#ifndef QT_NO_REGEXP_CAPTURE
1110 int startAtom(bool officialCapture);
1111 void finishAtom(int atom, bool needCapture);
1112#endif
1113
1114#ifndef QT_NO_REGEXP_LOOKAHEAD
1115 int addLookahead(QRegExpEngine *eng, bool negative);
1116#endif
1117
1118#ifndef QT_NO_REGEXP_OPTIM
1119 bool goodStringMatch(QRegExpMatchState &matchState) const;
1120 bool badCharMatch(QRegExpMatchState &matchState) const;
1121#else
1122 bool bruteMatch(QRegExpMatchState &matchState) const;
1123#endif
1124
1125 QList<QRegExpAutomatonState> s; // array of states
1126#ifndef QT_NO_REGEXP_CAPTURE
1127 QList<QRegExpAtom> f; // atom hierarchy
1128 int nf; // number of atoms
1129 int cf; // current atom
1130 QList<int> captureForOfficialCapture;
1131#endif
1132 int officialncap; // number of captures, seen from the outside
1133 int ncap; // number of captures, seen from the inside
1134#ifndef QT_NO_REGEXP_CCLASS
1135 QList<QRegExpCharClass> cl; // array of character classes
1136#endif
1137#ifndef QT_NO_REGEXP_LOOKAHEAD
1138 QList<QRegExpLookahead *> ahead; // array of lookaheads
1139#endif
1140#ifndef QT_NO_REGEXP_ANCHOR_ALT
1141 QList<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1142#endif
1143#ifndef QT_NO_REGEXP_OPTIM
1144 bool caretAnchored; // does the regexp start with ^?
1145 bool trivial; // is the good-string all that needs to match?
1146#endif
1147 bool valid; // is the regular expression valid?
1148 Qt::CaseSensitivity cs; // case sensitive?
1149 bool greedyQuantifiers; // RegExp2?
1150 bool xmlSchemaExtensions;
1151#ifndef QT_NO_REGEXP_BACKREF
1152 int nbrefs; // number of back-references
1153#endif
1154
1155#ifndef QT_NO_REGEXP_OPTIM
1156 bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1157
1158 int goodEarlyStart; // the index where goodStr can first occur in a match
1159 int goodLateStart; // the index where goodStr can last occur in a match
1160 QString goodStr; // the string that any match has to contain
1161
1162 int minl; // the minimum length of a match
1163 QList<int> occ1; // first-occurrence array
1164#endif
1165
1166 /*
1167 The class Box is an abstraction for a regular expression
1168 fragment. It can also be seen as one node in the syntax tree of
1169 a regular expression with synthetized attributes.
1170
1171 Its interface is ugly for performance reasons.
1172 */
1173 class Box
1174 {
1175 public:
1176 Box(QRegExpEngine *engine);
1177 Box(const Box &b) { operator=(b); }
1178
1179 Box &operator=(const Box &b);
1180
1181 void clear() { operator=(b: Box(eng)); }
1182 void set(QChar ch);
1183 void set(const QRegExpCharClass &cc);
1184#ifndef QT_NO_REGEXP_BACKREF
1185 void set(int bref);
1186#endif
1187
1188 void cat(const Box &b);
1189 void orx(const Box &b);
1190 void plus(int atom);
1191 void opt();
1192 void catAnchor(int a);
1193#ifndef QT_NO_REGEXP_OPTIM
1194 void setupHeuristics();
1195#endif
1196
1197#if defined(QT_DEBUG)
1198 void dump() const;
1199#endif
1200
1201 private:
1202 void addAnchorsToEngine(const Box &to) const;
1203
1204 QRegExpEngine *eng; // the automaton under construction
1205 QList<int> ls; // the left states (firstpos)
1206 QList<int> rs; // the right states (lastpos)
1207 QMap<int, int> lanchors; // the left anchors
1208 QMap<int, int> ranchors; // the right anchors
1209 int skipanchors; // the anchors to match if the box is skipped
1210
1211#ifndef QT_NO_REGEXP_OPTIM
1212 int earlyStart; // the index where str can first occur
1213 int lateStart; // the index where str can last occur
1214 QString str; // a string that has to occur in any match
1215 QString leftStr; // a string occurring at the left of this box
1216 QString rightStr; // a string occurring at the right of this box
1217 int maxl; // the maximum length of this box (possibly InftyLen)
1218#endif
1219
1220 int minl; // the minimum length of this box
1221#ifndef QT_NO_REGEXP_OPTIM
1222 QList<int> occ1; // first-occurrence array
1223#endif
1224 };
1225
1226 friend class Box;
1227
1228 /*
1229 This is the lexical analyzer for regular expressions.
1230 */
1231 enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1232 Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1233 Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1234 int getChar();
1235 int getEscape();
1236#ifndef QT_NO_REGEXP_INTERVAL
1237 int getRep(int def);
1238#endif
1239#ifndef QT_NO_REGEXP_LOOKAHEAD
1240 void skipChars(int n);
1241#endif
1242 void error(const char *msg);
1243 void startTokenizer(const QChar *rx, int len);
1244 int getToken();
1245
1246 const QChar *yyIn; // a pointer to the input regular expression pattern
1247 int yyPos0; // the position of yyTok in the input pattern
1248 int yyPos; // the position of the next character to read
1249 int yyLen; // the length of yyIn
1250 int yyCh; // the last character read
1251 QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1252 int yyMinRep; // attribute for Tok_Quantifier
1253 int yyMaxRep; // ditto
1254 QString yyError; // syntax error or overflow during parsing?
1255
1256 /*
1257 This is the syntactic analyzer for regular expressions.
1258 */
1259 int parse(const QChar *rx, int len);
1260 void parseAtom(Box *box);
1261 void parseFactor(Box *box);
1262 void parseTerm(Box *box);
1263 void parseExpression(Box *box);
1264
1265 int yyTok; // the last token read
1266 bool yyMayCapture; // set this to false to disable capturing
1267
1268 friend struct QRegExpMatchState;
1269};
1270
1271#ifndef QT_NO_REGEXP_LOOKAHEAD
1272/*
1273 The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1274 (?=foo) and (?!bar)).
1275*/
1276struct QRegExpLookahead
1277{
1278 QRegExpEngine *eng; // NFA representing the embedded regular expression
1279 bool neg; // negative lookahead?
1280
1281 inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
1282 : eng(eng0), neg(neg0) { }
1283 inline ~QRegExpLookahead() { delete eng; }
1284};
1285#endif
1286
1287/*!
1288 \internal
1289 convert the pattern string to the RegExp syntax.
1290
1291 This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1292 */
1293Q_CORE5COMPAT_EXPORT QString qt_regexp_toCanonical(const QString &pattern,
1294 QRegExp::PatternSyntax patternSyntax)
1295{
1296 switch (patternSyntax) {
1297#ifndef QT_NO_REGEXP_WILDCARD
1298 case QRegExp::Wildcard:
1299 return wc2rx(wc_str: pattern, enableEscaping: false);
1300 case QRegExp::WildcardUnix:
1301 return wc2rx(wc_str: pattern, enableEscaping: true);
1302#endif
1303 case QRegExp::FixedString:
1304 return QRegExp::escape(str: pattern);
1305 case QRegExp::W3CXmlSchema11:
1306 default:
1307 return pattern;
1308 }
1309}
1310
1311QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1312 : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1313 xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1314{
1315 setup();
1316
1317 QString rx = qt_regexp_toCanonical(pattern: key.pattern, patternSyntax: key.patternSyntax);
1318
1319 valid = (parse(rx: rx.unicode(), len: rx.size()) == rx.size());
1320 if (!valid) {
1321#ifndef QT_NO_REGEXP_OPTIM
1322 trivial = false;
1323#endif
1324 error(RXERR_LEFTDELIM);
1325 }
1326}
1327
1328QRegExpEngine::~QRegExpEngine()
1329{
1330#ifndef QT_NO_REGEXP_LOOKAHEAD
1331 qDeleteAll(c: ahead);
1332#endif
1333}
1334
1335void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1336{
1337 /*
1338 We use one QList<int> for all the big data used a lot in
1339 matchHere() and friends.
1340 */
1341 int ns = eng->s.size(); // number of states
1342 int ncap = eng->ncap;
1343#ifndef QT_NO_REGEXP_OPTIM
1344 int newSlideTabSize = qMax(a: eng->minl + 1, b: 16);
1345#else
1346 int newSlideTabSize = 0;
1347#endif
1348 int numCaptures = eng->captureCount();
1349 int newCapturedSize = 2 + 2 * numCaptures;
1350 bigArray = q_check_ptr(p: (int *)realloc(ptr: bigArray, size: ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1351
1352 // set all internal variables only _after_ bigArray is realloc'ed
1353 // to prevent a broken regexp in oom case
1354
1355 slideTabSize = newSlideTabSize;
1356 capturedSize = newCapturedSize;
1357 inNextStack = bigArray;
1358 memset(s: inNextStack, c: -1, n: ns * sizeof(int));
1359 curStack = inNextStack + ns;
1360 nextStack = inNextStack + 2 * ns;
1361
1362 curCapBegin = inNextStack + 3 * ns;
1363 nextCapBegin = curCapBegin + ncap * ns;
1364 curCapEnd = curCapBegin + 2 * ncap * ns;
1365 nextCapEnd = curCapBegin + 3 * ncap * ns;
1366
1367 tempCapBegin = curCapBegin + 4 * ncap * ns;
1368 tempCapEnd = tempCapBegin + ncap;
1369 capBegin = tempCapBegin + 2 * ncap;
1370 capEnd = tempCapBegin + 3 * ncap;
1371
1372 slideTab = tempCapBegin + 4 * ncap;
1373 captured = slideTab + slideTabSize;
1374 memset(s: captured, c: -1, n: capturedSize*sizeof(int));
1375 this->eng = eng;
1376}
1377
1378/*
1379 Tries to match in str and returns an array of (begin, length) pairs
1380 for captured text. If there is no match, all pairs are (-1, -1).
1381*/
1382void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
1383 bool minimal0, bool oneTest, int caretIndex)
1384{
1385 bool matched = false;
1386 QChar char_null;
1387
1388#ifndef QT_NO_REGEXP_OPTIM
1389 if (eng->trivial && !oneTest) {
1390 // ### Qt6: qsizetype
1391 pos = int(QtPrivate::findString(haystack: QStringView(str0, len0), from: pos0, needle: QStringView(eng->goodStr.unicode(), eng->goodStr.size()), cs: eng->cs));
1392 matchLen = eng->goodStr.size();
1393 matched = (pos != -1);
1394 } else
1395#endif
1396 {
1397 in = str0;
1398 if (in == nullptr)
1399 in = &char_null;
1400 pos = pos0;
1401 caretPos = caretIndex;
1402 len = len0;
1403 minimal = minimal0;
1404 matchLen = 0;
1405 oneTestMatchedLen = 0;
1406
1407 if (eng->valid && pos >= 0 && pos <= len) {
1408#ifndef QT_NO_REGEXP_OPTIM
1409 if (oneTest) {
1410 matched = matchHere();
1411 } else {
1412 if (pos <= len - eng->minl) {
1413 if (eng->caretAnchored) {
1414 matched = matchHere();
1415 } else if (eng->useGoodStringHeuristic) {
1416 matched = eng->goodStringMatch(matchState&: *this);
1417 } else {
1418 matched = eng->badCharMatch(matchState&: *this);
1419 }
1420 }
1421 }
1422#else
1423 matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1424#endif
1425 }
1426 }
1427
1428 if (matched) {
1429 int *c = captured;
1430 *c++ = pos;
1431 *c++ = matchLen;
1432
1433 int numCaptures = (capturedSize - 2) >> 1;
1434#ifndef QT_NO_REGEXP_CAPTURE
1435 for (int i = 0; i < numCaptures; ++i) {
1436 int j = eng->captureForOfficialCapture.at(i);
1437 if (capBegin[j] != EmptyCapture) {
1438 int len = capEnd[j] - capBegin[j];
1439 *c++ = (len > 0) ? pos + capBegin[j] : 0;
1440 *c++ = len;
1441 } else {
1442 *c++ = -1;
1443 *c++ = -1;
1444 }
1445 }
1446#endif
1447 } else {
1448 // we rely on 2's complement here
1449 memset(s: captured, c: -1, n: capturedSize * sizeof(int));
1450 }
1451}
1452
1453/*
1454 The three following functions add one state to the automaton and
1455 return the number of the state.
1456*/
1457
1458int QRegExpEngine::createState(QChar ch)
1459{
1460 return setupState(ch.unicode());
1461}
1462
1463int QRegExpEngine::createState(const QRegExpCharClass &cc)
1464{
1465#ifndef QT_NO_REGEXP_CCLASS
1466 int n = cl.size();
1467 cl += QRegExpCharClass(cc);
1468 return setupState(CharClassBit | n);
1469#else
1470 Q_UNUSED(cc);
1471 return setupState(CharClassBit);
1472#endif
1473}
1474
1475#ifndef QT_NO_REGEXP_BACKREF
1476int QRegExpEngine::createState(int bref)
1477{
1478 if (bref > nbrefs) {
1479 nbrefs = bref;
1480 if (nbrefs > MaxBackRefs) {
1481 error(RXERR_LIMIT);
1482 return 0;
1483 }
1484 }
1485 return setupState(BackRefBit | bref);
1486}
1487#endif
1488
1489/*
1490 The two following functions add a transition between all pairs of
1491 states (i, j) where i is found in from, and j is found in to.
1492
1493 Cat-transitions are distinguished from plus-transitions for
1494 capturing.
1495*/
1496
1497void QRegExpEngine::addCatTransitions(const QList<int> &from, const QList<int> &to)
1498{
1499 for (int i = 0; i < from.size(); i++)
1500 mergeInto(a: &s[from.at(i)].outs, b: to);
1501}
1502
1503#ifndef QT_NO_REGEXP_CAPTURE
1504void QRegExpEngine::addPlusTransitions(const QList<int> &from, const QList<int> &to, int atom)
1505{
1506 for (int i = 0; i < from.size(); i++) {
1507 QRegExpAutomatonState &st = s[from.at(i)];
1508 const QList<int> oldOuts = st.outs;
1509 mergeInto(a: &st.outs, b: to);
1510 if (f.at(i: atom).capture != QRegExpAtom::NoCapture) {
1511 for (int j = 0; j < to.size(); j++) {
1512 // ### st.reenter.contains(to.at(j)) check looks suspicious
1513 if (!st.reenter.contains(key: to.at(i: j)) &&
1514 !std::binary_search(first: oldOuts.constBegin(), last: oldOuts.constEnd(), val: to.at(i: j)))
1515 st.reenter.insert(key: to.at(i: j), value: atom);
1516 }
1517 }
1518 }
1519}
1520#endif
1521
1522#ifndef QT_NO_REGEXP_ANCHOR_ALT
1523/*
1524 Returns an anchor that means a OR b.
1525*/
1526int QRegExpEngine::anchorAlternation(int a, int b)
1527{
1528 if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
1529 return a & b;
1530
1531 int n = aa.size();
1532#ifndef QT_NO_REGEXP_OPTIM
1533 if (n > 0 && aa.at(i: n - 1).a == a && aa.at(i: n - 1).b == b)
1534 return Anchor_Alternation | (n - 1);
1535#endif
1536
1537 QRegExpAnchorAlternation element = {.a: a, .b: b};
1538 aa.append(t: element);
1539 return Anchor_Alternation | n;
1540}
1541
1542/*
1543 Returns an anchor that means a AND b.
1544*/
1545int QRegExpEngine::anchorConcatenation(int a, int b)
1546{
1547 if (((a | b) & Anchor_Alternation) == 0)
1548 return a | b;
1549 if ((b & Anchor_Alternation) != 0)
1550 qSwap(value1&: a, value2&: b);
1551
1552 int aprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).a, b);
1553 int bprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).b, b);
1554 return anchorAlternation(a: aprime, b: bprime);
1555}
1556#endif
1557
1558/*
1559 Adds anchor a on a transition caracterised by its from state and
1560 its to state.
1561*/
1562void QRegExpEngine::addAnchors(int from, int to, int a)
1563{
1564 QRegExpAutomatonState &st = s[from];
1565 if (st.anchors.contains(key: to))
1566 a = anchorAlternation(a: st.anchors.value(key: to), b: a);
1567 st.anchors.insert(key: to, value: a);
1568}
1569
1570#ifndef QT_NO_REGEXP_OPTIM
1571/*
1572 This function chooses between the good-string and the bad-character
1573 heuristics. It computes two scores and chooses the heuristic with
1574 the highest score.
1575
1576 Here are some common-sense constraints on the scores that should be
1577 respected if the formulas are ever modified: (1) If goodStr is
1578 empty, the good-string heuristic scores 0. (2) If the regular
1579 expression is trivial, the good-string heuristic should be used.
1580 (3) If the search is case insensitive, the good-string heuristic
1581 should be used, unless it scores 0. (Case insensitivity turns all
1582 entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1583 big, the good-string heuristic should score less.
1584*/
1585void QRegExpEngine::heuristicallyChooseHeuristic()
1586{
1587 if (minl == 0) {
1588 useGoodStringHeuristic = false;
1589 } else if (trivial) {
1590 useGoodStringHeuristic = true;
1591 } else {
1592 /*
1593 Magic formula: The good string has to constitute a good
1594 proportion of the minimum-length string, and appear at a
1595 more-or-less known index.
1596 */
1597 int goodStringScore = (64 * goodStr.size() / minl) -
1598 (goodLateStart - goodEarlyStart);
1599 /*
1600 Less magic formula: We pick some characters at random, and
1601 check whether they are good or bad.
1602 */
1603 int badCharScore = 0;
1604 int step = qMax(a: 1, b: NumBadChars / 32);
1605 for (int i = 1; i < NumBadChars; i += step) {
1606 if (occ1.at(i) == NoOccurrence)
1607 badCharScore += minl;
1608 else
1609 badCharScore += occ1.at(i);
1610 }
1611 badCharScore /= minl;
1612 useGoodStringHeuristic = (goodStringScore > badCharScore);
1613 }
1614}
1615#endif
1616
1617#if defined(QT_DEBUG)
1618void QRegExpEngine::dump() const
1619{
1620 int i, j;
1621 qDebug(msg: "Case %ssensitive engine", cs ? "" : "in");
1622 qDebug(msg: " States");
1623 for (i = 0; i < s.size(); i++) {
1624 qDebug(msg: " %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1625#ifndef QT_NO_REGEXP_CAPTURE
1626 if (nf > 0)
1627 qDebug(msg: " in atom %d", s[i].atom);
1628#endif
1629 int m = s[i].match;
1630 if ((m & CharClassBit) != 0) {
1631 qDebug(msg: " match character class %d", m ^ CharClassBit);
1632#ifndef QT_NO_REGEXP_CCLASS
1633 cl[m ^ CharClassBit].dump();
1634#else
1635 qDebug(" negative character class");
1636#endif
1637 } else if ((m & BackRefBit) != 0) {
1638 qDebug(msg: " match back-reference %d", m ^ BackRefBit);
1639 } else if (m >= 0x20 && m <= 0x7e) {
1640 qDebug(msg: " match 0x%.4x (%c)", m, m);
1641 } else {
1642 qDebug(msg: " match 0x%.4x", m);
1643 }
1644 for (j = 0; j < s[i].outs.size(); j++) {
1645 int next = s[i].outs[j];
1646 qDebug(msg: " -> %d", next);
1647 if (s[i].reenter.contains(key: next))
1648 qDebug(msg: " [reenter %d]", s[i].reenter[next]);
1649 if (s[i].anchors.value(key: next) != 0)
1650 qDebug(msg: " [anchors 0x%.8x]", s[i].anchors[next]);
1651 }
1652 }
1653#ifndef QT_NO_REGEXP_CAPTURE
1654 if (nf > 0) {
1655 qDebug(msg: " Atom Parent Capture");
1656 for (i = 0; i < nf; i++) {
1657 if (f[i].capture == QRegExpAtom::NoCapture) {
1658 qDebug(msg: " %6d %6d nil", i, f[i].parent);
1659 } else {
1660 int cap = f[i].capture;
1661 bool official = captureForOfficialCapture.contains(t: cap);
1662 qDebug(msg: " %6d %6d %6d %s", i, f[i].parent, f[i].capture,
1663 official ? "official" : "");
1664 }
1665 }
1666 }
1667#endif
1668#ifndef QT_NO_REGEXP_ANCHOR_ALT
1669 for (i = 0; i < aa.size(); i++)
1670 qDebug(msg: " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
1671#endif
1672}
1673#endif
1674
1675void QRegExpEngine::setup()
1676{
1677 ref.storeRelaxed(newValue: 1);
1678#ifndef QT_NO_REGEXP_CAPTURE
1679 f.resize(size: 32);
1680 nf = 0;
1681 cf = -1;
1682#endif
1683 officialncap = 0;
1684 ncap = 0;
1685#ifndef QT_NO_REGEXP_OPTIM
1686 caretAnchored = true;
1687 trivial = true;
1688#endif
1689 valid = false;
1690#ifndef QT_NO_REGEXP_BACKREF
1691 nbrefs = 0;
1692#endif
1693#ifndef QT_NO_REGEXP_OPTIM
1694 useGoodStringHeuristic = true;
1695 minl = 0;
1696 occ1.fill(t: 0, newSize: NumBadChars);
1697#endif
1698}
1699
1700int QRegExpEngine::setupState(int match)
1701{
1702#ifndef QT_NO_REGEXP_CAPTURE
1703 s += QRegExpAutomatonState(cf, match);
1704#else
1705 s += QRegExpAutomatonState(match);
1706#endif
1707 return s.size() - 1;
1708}
1709
1710#ifndef QT_NO_REGEXP_CAPTURE
1711/*
1712 Functions startAtom() and finishAtom() should be called to delimit
1713 atoms. When a state is created, it is assigned to the current atom.
1714 The information is later used for capturing.
1715*/
1716int QRegExpEngine::startAtom(bool officialCapture)
1717{
1718 if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1719 f.resize(size: (nf + 1) << 1);
1720 f[nf].parent = cf;
1721 cf = nf++;
1722 f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1723 return cf;
1724}
1725
1726void QRegExpEngine::finishAtom(int atom, bool needCapture)
1727{
1728 if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1729 f[atom].capture = QRegExpAtom::UnofficialCapture;
1730 cf = f.at(i: atom).parent;
1731}
1732#endif
1733
1734#ifndef QT_NO_REGEXP_LOOKAHEAD
1735/*
1736 Creates a lookahead anchor.
1737*/
1738int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1739{
1740 int n = ahead.size();
1741 if (n == MaxLookaheads) {
1742 error(RXERR_LIMIT);
1743 return 0;
1744 }
1745 ahead += new QRegExpLookahead(eng, negative);
1746 return Anchor_FirstLookahead << n;
1747}
1748#endif
1749
1750#ifndef QT_NO_REGEXP_CAPTURE
/*
  We want the longest leftmost captures.

  Returns true if capture set 1 is better than capture set 2: the
  first capture that differs decides, and it wins by starting earlier
  or, for equal starts, by ending later. Identical sets are not better.
*/
static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
                            const int *end2)
{
    for (int cap = 0; cap < ncap; ++cap) {
        // it has to start early...
        if (begin1[cap] != begin2[cap])
            return begin1[cap] < begin2[cap];
        // ...and end late
        if (end1[cap] != end2[cap])
            return end1[cap] > end2[cap];
    }
    return false;
}
1767#endif
1768
1769/*
1770 Returns \c true if anchor a matches at position pos + i in the input
1771 string, otherwise false.
1772*/
1773bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1774{
1775 int j;
1776
1777#ifndef QT_NO_REGEXP_ANCHOR_ALT
1778 if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1779 return testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1780 || testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1781#endif
1782
1783 if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1784 if (pos + i != caretPos)
1785 return false;
1786 }
1787 if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1788 if (pos + i != len)
1789 return false;
1790 }
1791#ifndef QT_NO_REGEXP_ESCAPE
1792 if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
1793 bool before = false;
1794 bool after = false;
1795 if (pos + i != 0)
1796 before = isWord(ch: in[pos + i - 1]);
1797 if (pos + i != len)
1798 after = isWord(ch: in[pos + i]);
1799 if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1800 return false;
1801 if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1802 return false;
1803 }
1804#endif
1805#ifndef QT_NO_REGEXP_LOOKAHEAD
1806 if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1807 const QList<QRegExpLookahead *> &ahead = eng->ahead;
1808 for (j = 0; j < ahead.size(); j++) {
1809 if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1810 QRegExpMatchState matchState;
1811 matchState.prepareForMatch(eng: ahead[j]->eng);
1812 matchState.match(str0: in + pos + i, len0: len - pos - i, pos0: 0,
1813 minimal0: true, oneTest: true, caretIndex: caretPos - pos - i);
1814 if ((matchState.captured[0] == 0) == ahead[j]->neg)
1815 return false;
1816 }
1817 }
1818 }
1819#endif
1820#ifndef QT_NO_REGEXP_CAPTURE
1821#ifndef QT_NO_REGEXP_BACKREF
1822 for (j = 0; j < eng->nbrefs; j++) {
1823 if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1824 int i = eng->captureForOfficialCapture.at(i: j);
1825 if (capBegin[i] != EmptyCapture)
1826 return false;
1827 }
1828 }
1829#endif
1830#endif
1831 return true;
1832}
1833
1834#ifndef QT_NO_REGEXP_OPTIM
1835/*
1836 The three following functions are what Jeffrey Friedl would call
1837 transmissions (or bump-alongs). Using one or the other should make
1838 no difference except in performance.
1839*/
1840
1841bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1842{
1843 int k = matchState.pos + goodEarlyStart;
1844 QStringMatcher matcher(goodStr.unicode(), goodStr.size(), cs);
1845 while ((k = matcher.indexIn(str: matchState.in, length: matchState.len, from: k)) != -1) {
1846 int from = k - goodLateStart;
1847 int to = k - goodEarlyStart;
1848 if (from > matchState.pos)
1849 matchState.pos = from;
1850
1851 while (matchState.pos <= to) {
1852 if (matchState.matchHere())
1853 return true;
1854 ++matchState.pos;
1855 }
1856 ++k;
1857 }
1858 return false;
1859}
1860
1861bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1862{
1863 int slideHead = 0;
1864 int slideNext = 0;
1865 int i;
1866 int lastPos = matchState.len - minl;
1867 memset(s: matchState.slideTab, c: 0, n: matchState.slideTabSize * sizeof(int));
1868
1869 /*
1870 Set up the slide table, used for the bad-character heuristic,
1871 using the table of first occurrence of each character.
1872 */
1873 for (i = 0; i < minl; i++) {
1874 int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1875 if (sk == NoOccurrence)
1876 sk = i + 1;
1877 if (sk > 0) {
1878 int k = i + 1 - sk;
1879 if (k < 0) {
1880 sk = i + 1;
1881 k = 0;
1882 }
1883 if (sk > matchState.slideTab[k])
1884 matchState.slideTab[k] = sk;
1885 }
1886 }
1887
1888 if (matchState.pos > lastPos)
1889 return false;
1890
1891 for (;;) {
1892 if (++slideNext >= matchState.slideTabSize)
1893 slideNext = 0;
1894 if (matchState.slideTab[slideHead] > 0) {
1895 if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1896 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1897 matchState.slideTab[slideHead] = 0;
1898 } else {
1899 if (matchState.matchHere())
1900 return true;
1901 }
1902
1903 if (matchState.pos == lastPos)
1904 break;
1905
1906 /*
1907 Update the slide table. This code has much in common with
1908 the initialization code.
1909 */
1910 int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1911 if (sk == NoOccurrence) {
1912 matchState.slideTab[slideNext] = minl;
1913 } else if (sk > 0) {
1914 int k = slideNext + minl - sk;
1915 if (k >= matchState.slideTabSize)
1916 k -= matchState.slideTabSize;
1917 if (sk > matchState.slideTab[k])
1918 matchState.slideTab[k] = sk;
1919 }
1920 slideHead = slideNext;
1921 ++matchState.pos;
1922 }
1923 return false;
1924}
1925#else
1926bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1927{
1928 while (matchState.pos <= matchState.len) {
1929 if (matchState.matchHere())
1930 return true;
1931 ++matchState.pos;
1932 }
1933 return false;
1934}
1935#endif
1936
1937/*
1938 Here's the core of the engine. It tries to do a match here and now.
1939*/
1940bool QRegExpMatchState::matchHere()
1941{
1942 int ncur = 1, nnext = 0;
1943 int i = 0, j, k, m;
1944 bool stop = false;
1945
1946 matchLen = -1;
1947 oneTestMatchedLen = -1;
1948 curStack[0] = QRegExpEngine::InitialState;
1949
1950 int ncap = eng->ncap;
1951#ifndef QT_NO_REGEXP_CAPTURE
1952 if (ncap > 0) {
1953 for (j = 0; j < ncap; j++) {
1954 curCapBegin[j] = EmptyCapture;
1955 curCapEnd[j] = EmptyCapture;
1956 }
1957 }
1958#endif
1959
1960#ifndef QT_NO_REGEXP_BACKREF
1961 while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
1962#else
1963 while (ncur > 0 && i <= len - pos && !stop)
1964#endif
1965 {
1966 int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
1967 for (j = 0; j < ncur; j++) {
1968 int cur = curStack[j];
1969 const QRegExpAutomatonState &scur = eng->s.at(i: cur);
1970 const QList<int> &outs = scur.outs;
1971 for (k = 0; k < outs.size(); k++) {
1972 int next = outs.at(i: k);
1973 const QRegExpAutomatonState &snext = eng->s.at(i: next);
1974 bool inside = true;
1975#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
1976 int needSomeSleep = 0;
1977#endif
1978
1979 /*
1980 First, check if the anchors are anchored properly.
1981 */
1982 int a = scur.anchors.value(key: next);
1983 if (a != 0 && !testAnchor(i, a, capBegin: curCapBegin + j * ncap))
1984 inside = false;
1985
1986 /*
1987 If indeed they are, check if the input character is
1988 correct for this transition.
1989 */
1990 if (inside) {
1991 m = snext.match;
1992 if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
1993 if (eng->cs)
1994 inside = (m == ch);
1995 else
1996 inside = (QChar(m).toLower() == QChar(ch).toLower());
1997 } else if (next == QRegExpEngine::FinalState) {
1998 matchLen = i;
1999 stop = minimal;
2000 inside = true;
2001 } else if ((m & QRegExpEngine::CharClassBit) != 0) {
2002#ifndef QT_NO_REGEXP_CCLASS
2003 const QRegExpCharClass &cc = eng->cl.at(i: m ^ QRegExpEngine::CharClassBit);
2004 if (eng->cs)
2005 inside = cc.in(ch: QChar(ch));
2006 else if (cc.negative())
2007 inside = cc.in(ch: QChar(ch).toLower()) &&
2008 cc.in(ch: QChar(ch).toUpper());
2009 else
2010 inside = cc.in(ch: QChar(ch).toLower()) ||
2011 cc.in(ch: QChar(ch).toUpper());
2012#endif
2013#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2014 } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
2015 int bref = m ^ QRegExpEngine::BackRefBit;
2016 int ell = j * ncap + eng->captureForOfficialCapture.at(i: bref - 1);
2017
2018 inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2019 if (inside) {
2020 if (eng->cs)
2021 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
2022 else
2023 inside = (in[pos + curCapBegin[ell]].toLower()
2024 == QChar(ch).toLower());
2025 }
2026
2027 if (inside) {
2028 int delta;
2029 if (curCapEnd[ell] == EmptyCapture)
2030 delta = i - curCapBegin[ell];
2031 else
2032 delta = curCapEnd[ell] - curCapBegin[ell];
2033
2034 inside = (delta <= len - (pos + i));
2035 if (inside && delta > 1) {
2036 int n = 1;
2037 if (eng->cs) {
2038 while (n < delta) {
2039 if (in[pos + curCapBegin[ell] + n]
2040 != in[pos + i + n])
2041 break;
2042 ++n;
2043 }
2044 } else {
2045 while (n < delta) {
2046 QChar a = in[pos + curCapBegin[ell] + n];
2047 QChar b = in[pos + i + n];
2048 if (a.toLower() != b.toLower())
2049 break;
2050 ++n;
2051 }
2052 }
2053 inside = (n == delta);
2054 if (inside)
2055 needSomeSleep = delta - 1;
2056 }
2057 }
2058#endif
2059 }
2060 }
2061
2062 /*
2063 We must now update our data structures.
2064 */
2065 if (inside) {
2066#ifndef QT_NO_REGEXP_CAPTURE
2067 int *capBegin, *capEnd;
2068#endif
2069 /*
2070 If the next state was not encountered yet, all
2071 is fine.
2072 */
2073 if ((m = inNextStack[next]) == -1) {
2074 m = nnext++;
2075 nextStack[m] = next;
2076 inNextStack[next] = m;
2077#ifndef QT_NO_REGEXP_CAPTURE
2078 capBegin = nextCapBegin + m * ncap;
2079 capEnd = nextCapEnd + m * ncap;
2080
2081 /*
2082 Otherwise, we'll first maintain captures in
2083 temporary arrays, and decide at the end whether
2084 it's best to keep the previous capture zones or
2085 the new ones.
2086 */
2087 } else {
2088 capBegin = tempCapBegin;
2089 capEnd = tempCapEnd;
2090#endif
2091 }
2092
2093#ifndef QT_NO_REGEXP_CAPTURE
2094 /*
2095 Updating the capture zones is much of a task.
2096 */
2097 if (ncap > 0) {
2098 memcpy(dest: capBegin, src: curCapBegin + j * ncap, n: ncap * sizeof(int));
2099 memcpy(dest: capEnd, src: curCapEnd + j * ncap, n: ncap * sizeof(int));
2100 int c = scur.atom, n = snext.atom;
2101 int p = -1, q = -1;
2102 int cap;
2103
2104 /*
2105 Lemma 1. For any x in the range [0..nf), we
2106 have f[x].parent < x.
2107
2108 Proof. By looking at startAtom(), it is
2109 clear that cf < nf holds all the time, and
2110 thus that f[nf].parent < nf.
2111 */
2112
2113 /*
2114 If we are reentering an atom, we empty all
2115 capture zones inside it.
2116 */
2117 if ((q = scur.reenter.value(key: next)) != 0) {
2118 QBitArray b(eng->nf, false);
2119 b.setBit(i: q, val: true);
2120 for (int ell = q + 1; ell < eng->nf; ell++) {
2121 if (b.testBit(i: eng->f.at(i: ell).parent)) {
2122 b.setBit(i: ell, val: true);
2123 cap = eng->f.at(i: ell).capture;
2124 if (cap >= 0) {
2125 capBegin[cap] = EmptyCapture;
2126 capEnd[cap] = EmptyCapture;
2127 }
2128 }
2129 }
2130 p = eng->f.at(i: q).parent;
2131
2132 /*
2133 Otherwise, close the capture zones we are
2134 leaving. We are leaving f[c].capture,
2135 f[f[c].parent].capture,
2136 f[f[f[c].parent].parent].capture, ...,
2137 until f[x].capture, with x such that
2138 f[x].parent is the youngest common ancestor
2139 for c and n.
2140
2141 We go up along c's and n's ancestry until
2142 we find x.
2143 */
2144 } else {
2145 p = c;
2146 q = n;
2147 while (p != q) {
2148 if (p > q) {
2149 cap = eng->f.at(i: p).capture;
2150 if (cap >= 0) {
2151 if (capBegin[cap] == i) {
2152 capBegin[cap] = EmptyCapture;
2153 capEnd[cap] = EmptyCapture;
2154 } else {
2155 capEnd[cap] = i;
2156 }
2157 }
2158 p = eng->f.at(i: p).parent;
2159 } else {
2160 q = eng->f.at(i: q).parent;
2161 }
2162 }
2163 }
2164
2165 /*
2166 In any case, we now open the capture zones
2167 we are entering. We work upwards from n
2168 until we reach p (the parent of the atom we
2169 reenter or the youngest common ancestor).
2170 */
2171 while (n > p) {
2172 cap = eng->f.at(i: n).capture;
2173 if (cap >= 0) {
2174 capBegin[cap] = i;
2175 capEnd[cap] = EmptyCapture;
2176 }
2177 n = eng->f.at(i: n).parent;
2178 }
2179 /*
2180 If the next state was already in
2181 nextStack, we must choose carefully which
2182 capture zones we want to keep.
2183 */
2184 if (capBegin == tempCapBegin &&
2185 isBetterCapture(ncap, begin1: capBegin, end1: capEnd, begin2: nextCapBegin + m * ncap,
2186 end2: nextCapEnd + m * ncap)) {
2187 memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int));
2188 memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int));
2189 }
2190 }
2191#ifndef QT_NO_REGEXP_BACKREF
2192 /*
2193 We are done with updating the capture zones.
2194 It's now time to put the next state to sleep,
2195 if it needs to, and to remove it from
2196 nextStack.
2197 */
2198 if (needSomeSleep > 0) {
2199 QList<int> zzZ(2 + 2 * ncap);
2200 zzZ[0] = i + needSomeSleep;
2201 zzZ[1] = next;
2202 if (ncap > 0) {
2203 memcpy(dest: zzZ.data() + 2, src: capBegin, n: ncap * sizeof(int));
2204 memcpy(dest: zzZ.data() + 2 + ncap, src: capEnd, n: ncap * sizeof(int));
2205 }
2206 inNextStack[nextStack[--nnext]] = -1;
2207 sleeping.append(t: zzZ);
2208 }
2209#endif
2210#endif
2211 }
2212 }
2213 }
2214#ifndef QT_NO_REGEXP_CAPTURE
2215 /*
2216 If we reached the final state, hurray! Copy the captured
2217 zone.
2218 */
2219 if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2220 memcpy(dest: capBegin, src: nextCapBegin + m * ncap, n: ncap * sizeof(int));
2221 memcpy(dest: capEnd, src: nextCapEnd + m * ncap, n: ncap * sizeof(int));
2222 }
2223#ifndef QT_NO_REGEXP_BACKREF
2224 /*
2225 It's time to wake up the sleepers.
2226 */
2227 j = 0;
2228 while (j < sleeping.size()) {
2229 if (sleeping.at(i: j)[0] == i) {
2230 const QList<int> &zzZ = sleeping.at(i: j);
2231 int next = zzZ[1];
2232 const int *capBegin = zzZ.data() + 2;
2233 const int *capEnd = zzZ.data() + 2 + ncap;
2234 bool copyOver = true;
2235
2236 if ((m = inNextStack[next]) == -1) {
2237 m = nnext++;
2238 nextStack[m] = next;
2239 inNextStack[next] = m;
2240 } else {
2241 copyOver = isBetterCapture(ncap, begin1: nextCapBegin + m * ncap, end1: nextCapEnd + m * ncap,
2242 begin2: capBegin, end2: capEnd);
2243 }
2244 if (copyOver) {
2245 memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int));
2246 memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int));
2247 }
2248
2249 sleeping.removeAt(i: j);
2250 } else {
2251 ++j;
2252 }
2253 }
2254#endif
2255#endif
2256 for (j = 0; j < nnext; j++)
2257 inNextStack[nextStack[j]] = -1;
2258
2259 // avoid needless iteration that confuses oneTestMatchedLen
2260 if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2261#ifndef QT_NO_REGEXP_BACKREF
2262 && sleeping.isEmpty()
2263#endif
2264 )
2265 stop = true;
2266
2267 qSwap(value1&: curStack, value2&: nextStack);
2268#ifndef QT_NO_REGEXP_CAPTURE
2269 qSwap(value1&: curCapBegin, value2&: nextCapBegin);
2270 qSwap(value1&: curCapEnd, value2&: nextCapEnd);
2271#endif
2272 ncur = nnext;
2273 nnext = 0;
2274 ++i;
2275 }
2276
2277#ifndef QT_NO_REGEXP_BACKREF
2278 /*
2279 If minimal matching is enabled, we might have some sleepers
2280 left.
2281 */
2282 if (!sleeping.isEmpty())
2283 sleeping.clear();
2284#endif
2285
2286 oneTestMatchedLen = i - 1;
2287 return (matchLen >= 0);
2288}
2289
2290#ifndef QT_NO_REGEXP_CCLASS
2291
2292QRegExpCharClass::QRegExpCharClass()
2293 : c(0), n(false)
2294{
2295#ifndef QT_NO_REGEXP_OPTIM
2296 occ1.fill(t: NoOccurrence, newSize: NumBadChars);
2297#endif
2298}
2299
2300void QRegExpCharClass::clear()
2301{
2302 c = 0;
2303 r.clear();
2304 n = false;
2305}
2306
2307void QRegExpCharClass::setNegative(bool negative)
2308{
2309 n = negative;
2310#ifndef QT_NO_REGEXP_OPTIM
2311 occ1.fill(t: 0, newSize: NumBadChars);
2312#endif
2313}
2314
2315void QRegExpCharClass::addCategories(uint cats)
2316{
2317 static const int all_cats = FLAG(QChar::Mark_NonSpacing) |
2318 FLAG(QChar::Mark_SpacingCombining) |
2319 FLAG(QChar::Mark_Enclosing) |
2320 FLAG(QChar::Number_DecimalDigit) |
2321 FLAG(QChar::Number_Letter) |
2322 FLAG(QChar::Number_Other) |
2323 FLAG(QChar::Separator_Space) |
2324 FLAG(QChar::Separator_Line) |
2325 FLAG(QChar::Separator_Paragraph) |
2326 FLAG(QChar::Other_Control) |
2327 FLAG(QChar::Other_Format) |
2328 FLAG(QChar::Other_Surrogate) |
2329 FLAG(QChar::Other_PrivateUse) |
2330 FLAG(QChar::Other_NotAssigned) |
2331 FLAG(QChar::Letter_Uppercase) |
2332 FLAG(QChar::Letter_Lowercase) |
2333 FLAG(QChar::Letter_Titlecase) |
2334 FLAG(QChar::Letter_Modifier) |
2335 FLAG(QChar::Letter_Other) |
2336 FLAG(QChar::Punctuation_Connector) |
2337 FLAG(QChar::Punctuation_Dash) |
2338 FLAG(QChar::Punctuation_Open) |
2339 FLAG(QChar::Punctuation_Close) |
2340 FLAG(QChar::Punctuation_InitialQuote) |
2341 FLAG(QChar::Punctuation_FinalQuote) |
2342 FLAG(QChar::Punctuation_Other) |
2343 FLAG(QChar::Symbol_Math) |
2344 FLAG(QChar::Symbol_Currency) |
2345 FLAG(QChar::Symbol_Modifier) |
2346 FLAG(QChar::Symbol_Other);
2347 c |= (all_cats & cats);
2348#ifndef QT_NO_REGEXP_OPTIM
2349 occ1.fill(t: 0, newSize: NumBadChars);
2350#endif
2351}
2352
2353void QRegExpCharClass::addRange(ushort from, ushort to)
2354{
2355 if (from > to)
2356 qSwap(value1&: from, value2&: to);
2357 int m = r.size();
2358 r.resize(size: m + 1);
2359 r[m].from = from;
2360 r[m].len = to - from + 1;
2361
2362#ifndef QT_NO_REGEXP_OPTIM
2363 int i;
2364
2365 if (to - from < NumBadChars) {
2366 if (from % NumBadChars <= to % NumBadChars) {
2367 for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2368 occ1[i] = 0;
2369 } else {
2370 for (i = 0; i <= to % NumBadChars; i++)
2371 occ1[i] = 0;
2372 for (i = from % NumBadChars; i < NumBadChars; i++)
2373 occ1[i] = 0;
2374 }
2375 } else {
2376 occ1.fill(t: 0, newSize: NumBadChars);
2377 }
2378#endif
2379}
2380
2381bool QRegExpCharClass::in(QChar ch) const
2382{
2383#ifndef QT_NO_REGEXP_OPTIM
2384 if (occ1.at(BadChar(ch)) == NoOccurrence)
2385 return n;
2386#endif
2387
2388 if (c != 0 && (c & FLAG(ch.category())) != 0)
2389 return !n;
2390
2391 const int uc = ch.unicode();
2392 int size = r.size();
2393
2394 for (int i = 0; i < size; ++i) {
2395 const QRegExpCharClassRange &range = r.at(i);
2396 if (uint(uc - range.from) < uint(r.at(i).len))
2397 return !n;
2398 }
2399 return n;
2400}
2401
2402#if defined(QT_DEBUG)
2403void QRegExpCharClass::dump() const
2404{
2405 int i;
2406 qDebug(msg: " %stive character class", n ? "nega" : "posi");
2407#ifndef QT_NO_REGEXP_CCLASS
2408 if (c != 0)
2409 qDebug(msg: " categories 0x%.8x", c);
2410#endif
2411 for (i = 0; i < r.size(); i++)
2412 qDebug(msg: " 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2413}
2414#endif
2415#endif
2416
2417QRegExpEngine::Box::Box(QRegExpEngine *engine)
2418 : eng(engine), skipanchors(0)
2419#ifndef QT_NO_REGEXP_OPTIM
2420 , earlyStart(0), lateStart(0), maxl(0)
2421#endif
2422{
2423#ifndef QT_NO_REGEXP_OPTIM
2424 occ1.fill(t: NoOccurrence, newSize: NumBadChars);
2425#endif
2426 minl = 0;
2427}
2428
2429QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2430{
2431 eng = b.eng;
2432 ls = b.ls;
2433 rs = b.rs;
2434 lanchors = b.lanchors;
2435 ranchors = b.ranchors;
2436 skipanchors = b.skipanchors;
2437#ifndef QT_NO_REGEXP_OPTIM
2438 earlyStart = b.earlyStart;
2439 lateStart = b.lateStart;
2440 str = b.str;
2441 leftStr = b.leftStr;
2442 rightStr = b.rightStr;
2443 maxl = b.maxl;
2444 occ1 = b.occ1;
2445#endif
2446 minl = b.minl;
2447 return *this;
2448}
2449
2450void QRegExpEngine::Box::set(QChar ch)
2451{
2452 ls.resize(size: 1);
2453 ls[0] = eng->createState(ch);
2454 rs = ls;
2455#ifndef QT_NO_REGEXP_OPTIM
2456 str = ch;
2457 leftStr = ch;
2458 rightStr = ch;
2459 maxl = 1;
2460 occ1[BadChar(ch)] = 0;
2461#endif
2462 minl = 1;
2463}
2464
2465void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2466{
2467 ls.resize(size: 1);
2468 ls[0] = eng->createState(cc);
2469 rs = ls;
2470#ifndef QT_NO_REGEXP_OPTIM
2471 maxl = 1;
2472 occ1 = cc.firstOccurrence();
2473#endif
2474 minl = 1;
2475}
2476
2477#ifndef QT_NO_REGEXP_BACKREF
2478void QRegExpEngine::Box::set(int bref)
2479{
2480 ls.resize(size: 1);
2481 ls[0] = eng->createState(bref);
2482 rs = ls;
2483 if (bref >= 1 && bref <= MaxBackRefs)
2484 skipanchors = Anchor_BackRef0Empty << bref;
2485#ifndef QT_NO_REGEXP_OPTIM
2486 maxl = InftyLen;
2487#endif
2488 minl = 0;
2489}
2490#endif
2491
2492void QRegExpEngine::Box::cat(const Box &b)
2493{
2494 eng->addCatTransitions(from: rs, to: b.ls);
2495 addAnchorsToEngine(to: b);
2496 if (minl == 0) {
2497 lanchors.insert(map: b.lanchors);
2498 if (skipanchors != 0) {
2499 for (int i = 0; i < b.ls.size(); i++) {
2500 int a = eng->anchorConcatenation(a: lanchors.value(key: b.ls.at(i), defaultValue: 0), b: skipanchors);
2501 lanchors.insert(key: b.ls.at(i), value: a);
2502 }
2503 }
2504 mergeInto(a: &ls, b: b.ls);
2505 }
2506 if (b.minl == 0) {
2507 ranchors.insert(map: b.ranchors);
2508 if (b.skipanchors != 0) {
2509 for (int i = 0; i < rs.size(); i++) {
2510 int a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i), defaultValue: 0), b: b.skipanchors);
2511 ranchors.insert(key: rs.at(i), value: a);
2512 }
2513 }
2514 mergeInto(a: &rs, b: b.rs);
2515 } else {
2516 ranchors = b.ranchors;
2517 rs = b.rs;
2518 }
2519
2520#ifndef QT_NO_REGEXP_OPTIM
2521 if (maxl != InftyLen) {
2522 if (rightStr.size() + b.leftStr.size() >
2523 qMax(a: str.size(), b: b.str.size())) {
2524 earlyStart = minl - rightStr.size();
2525 lateStart = maxl - rightStr.size();
2526 str = rightStr + b.leftStr;
2527 } else if (b.str.size() > str.size()) {
2528 earlyStart = minl + b.earlyStart;
2529 lateStart = maxl + b.lateStart;
2530 str = b.str;
2531 }
2532 }
2533
2534 if (leftStr.size() == maxl)
2535 leftStr += b.leftStr;
2536
2537 if (b.rightStr.size() == b.maxl) {
2538 rightStr += b.rightStr;
2539 } else {
2540 rightStr = b.rightStr;
2541 }
2542
2543 if (maxl == InftyLen || b.maxl == InftyLen) {
2544 maxl = InftyLen;
2545 } else {
2546 maxl += b.maxl;
2547 }
2548
2549 for (int i = 0; i < NumBadChars; i++) {
2550 if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2551 occ1[i] = minl + b.occ1.at(i);
2552 }
2553#endif
2554
2555 minl += b.minl;
2556 if (minl == 0)
2557 skipanchors = eng->anchorConcatenation(a: skipanchors, b: b.skipanchors);
2558 else
2559 skipanchors = 0;
2560}
2561
2562void QRegExpEngine::Box::orx(const Box &b)
2563{
2564 mergeInto(a: &ls, b: b.ls);
2565 lanchors.insert(map: b.lanchors);
2566 mergeInto(a: &rs, b: b.rs);
2567 ranchors.insert(map: b.ranchors);
2568
2569 if (b.minl == 0) {
2570 if (minl == 0)
2571 skipanchors = eng->anchorAlternation(a: skipanchors, b: b.skipanchors);
2572 else
2573 skipanchors = b.skipanchors;
2574 }
2575
2576#ifndef QT_NO_REGEXP_OPTIM
2577 for (int i = 0; i < NumBadChars; i++) {
2578 if (occ1.at(i) > b.occ1.at(i))
2579 occ1[i] = b.occ1.at(i);
2580 }
2581 earlyStart = 0;
2582 lateStart = 0;
2583 str = QString();
2584 leftStr = QString();
2585 rightStr = QString();
2586 if (b.maxl > maxl)
2587 maxl = b.maxl;
2588#endif
2589 if (b.minl < minl)
2590 minl = b.minl;
2591}
2592
2593void QRegExpEngine::Box::plus(int atom)
2594{
2595#ifndef QT_NO_REGEXP_CAPTURE
2596 eng->addPlusTransitions(from: rs, to: ls, atom);
2597#else
2598 Q_UNUSED(atom);
2599 eng->addCatTransitions(rs, ls);
2600#endif
2601 addAnchorsToEngine(to: *this);
2602#ifndef QT_NO_REGEXP_OPTIM
2603 maxl = InftyLen;
2604#endif
2605}
2606
2607void QRegExpEngine::Box::opt()
2608{
2609#ifndef QT_NO_REGEXP_OPTIM
2610 earlyStart = 0;
2611 lateStart = 0;
2612 str = QString();
2613 leftStr = QString();
2614 rightStr = QString();
2615#endif
2616 skipanchors = 0;
2617 minl = 0;
2618}
2619
2620void QRegExpEngine::Box::catAnchor(int a)
2621{
2622 if (a != 0) {
2623 for (int i = 0; i < rs.size(); i++) {
2624 a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i), defaultValue: 0), b: a);
2625 ranchors.insert(key: rs.at(i), value: a);
2626 }
2627 if (minl == 0)
2628 skipanchors = eng->anchorConcatenation(a: skipanchors, b: a);
2629 }
2630}
2631
2632#ifndef QT_NO_REGEXP_OPTIM
2633void QRegExpEngine::Box::setupHeuristics()
2634{
2635 eng->goodEarlyStart = earlyStart;
2636 eng->goodLateStart = lateStart;
2637 eng->goodStr = eng->cs ? str : str.toLower();
2638
2639 eng->minl = minl;
2640 if (eng->cs) {
2641 /*
2642 A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2643 1 at this point. An entry of occ1 has to be at most minl or
2644 infinity for the rest of the algorithm to go well.
2645
2646 We waited until here before normalizing these cases (instead of
2647 doing it in Box::orx()) because sometimes things improve by
2648 themselves. Consider for example (112|1)34.
2649 */
2650 for (int i = 0; i < NumBadChars; i++) {
2651 if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2652 occ1[i] = minl;
2653 }
2654 eng->occ1 = occ1;
2655 } else {
2656 eng->occ1.fill(t: 0, newSize: NumBadChars);
2657 }
2658
2659 eng->heuristicallyChooseHeuristic();
2660}
2661#endif
2662
2663#if defined(QT_DEBUG)
2664void QRegExpEngine::Box::dump() const
2665{
2666 int i;
2667 qDebug(msg: "Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2668 qDebug(msg: " Left states:");
2669 for (i = 0; i < ls.size(); i++) {
2670 if (lanchors.value(key: ls[i], defaultValue: 0) == 0)
2671 qDebug(msg: " %d", ls[i]);
2672 else
2673 qDebug(msg: " %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2674 }
2675 qDebug(msg: " Right states:");
2676 for (i = 0; i < rs.size(); i++) {
2677 if (ranchors.value(key: rs[i], defaultValue: 0) == 0)
2678 qDebug(msg: " %d", rs[i]);
2679 else
2680 qDebug(msg: " %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2681 }
2682 qDebug(msg: " Skip anchors: 0x%.8x", skipanchors);
2683}
2684#endif
2685
2686void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2687{
2688 for (int i = 0; i < to.ls.size(); i++) {
2689 for (int j = 0; j < rs.size(); j++) {
2690 int a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i: j), defaultValue: 0),
2691 b: to.lanchors.value(key: to.ls.at(i), defaultValue: 0));
2692 eng->addAnchors(from: rs[j], to: to.ls[i], a);
2693 }
2694 }
2695}
2696
2697#ifndef QT_NO_REGEXP_CCLASS
// Lookup table for the XML Schema extensions: maps a block name to the
// first and last code point of its range.
// The entries must stay sorted by name (byte-wise), because the table is
// meant to be binary-searched.
struct CategoriesRangeMapEntry {
    const char name[40];
    uint first, second;
};

static const CategoriesRangeMapEntry categoriesRangeMap[] = {
    { "AegeanNumbers",                        0x10100,  0x1013F },
    { "AlphabeticPresentationForms",          0xFB00,   0xFB4F },
    { "AncientGreekMusicalNotation",          0x1D200,  0x1D24F },
    { "AncientGreekNumbers",                  0x10140,  0x1018F },
    { "Arabic",                               0x0600,   0x06FF },
    { "ArabicPresentationForms-A",            0xFB50,   0xFDFF },
    { "ArabicPresentationForms-B",            0xFE70,   0xFEFF },
    { "ArabicSupplement",                     0x0750,   0x077F },
    { "Armenian",                             0x0530,   0x058F },
    { "Arrows",                               0x2190,   0x21FF },
    { "BasicLatin",                           0x0000,   0x007F },
    { "Bengali",                              0x0980,   0x09FF },
    { "BlockElements",                        0x2580,   0x259F },
    { "Bopomofo",                             0x3100,   0x312F },
    { "BopomofoExtended",                     0x31A0,   0x31BF },
    { "BoxDrawing",                           0x2500,   0x257F },
    { "BraillePatterns",                      0x2800,   0x28FF },
    { "Buginese",                             0x1A00,   0x1A1F },
    { "Buhid",                                0x1740,   0x175F },
    { "ByzantineMusicalSymbols",              0x1D000,  0x1D0FF },
    { "CJKCompatibility",                     0x3300,   0x33FF },
    { "CJKCompatibilityForms",                0xFE30,   0xFE4F },
    { "CJKCompatibilityIdeographs",           0xF900,   0xFAFF },
    { "CJKCompatibilityIdeographsSupplement", 0x2F800,  0x2FA1F },
    { "CJKRadicalsSupplement",                0x2E80,   0x2EFF },
    { "CJKStrokes",                           0x31C0,   0x31EF },
    { "CJKSymbolsandPunctuation",             0x3000,   0x303F },
    { "CJKUnifiedIdeographs",                 0x4E00,   0x9FFF },
    { "CJKUnifiedIdeographsExtensionA",       0x3400,   0x4DB5 },
    { "CJKUnifiedIdeographsExtensionB",       0x20000,  0x2A6DF },
    { "Cherokee",                             0x13A0,   0x13FF },
    { "CombiningDiacriticalMarks",            0x0300,   0x036F },
    { "CombiningDiacriticalMarksSupplement",  0x1DC0,   0x1DFF },
    { "CombiningHalfMarks",                   0xFE20,   0xFE2F },
    { "CombiningMarksforSymbols",             0x20D0,   0x20FF },
    { "ControlPictures",                      0x2400,   0x243F },
    { "Coptic",                               0x2C80,   0x2CFF },
    { "CurrencySymbols",                      0x20A0,   0x20CF },
    { "CypriotSyllabary",                     0x10800,  0x1083F },
    { "Cyrillic",                             0x0400,   0x04FF },
    { "CyrillicSupplement",                   0x0500,   0x052F },
    { "Deseret",                              0x10400,  0x1044F },
    { "Devanagari",                           0x0900,   0x097F },
    { "Dingbats",                             0x2700,   0x27BF },
    { "EnclosedAlphanumerics",                0x2460,   0x24FF },
    { "EnclosedCJKLettersandMonths",          0x3200,   0x32FF },
    { "Ethiopic",                             0x1200,   0x137F },
    { "EthiopicExtended",                     0x2D80,   0x2DDF },
    { "EthiopicSupplement",                   0x1380,   0x139F },
    { "GeneralPunctuation",                   0x2000,   0x206F },
    { "GeometricShapes",                      0x25A0,   0x25FF },
    { "Georgian",                             0x10A0,   0x10FF },
    { "GeorgianSupplement",                   0x2D00,   0x2D2F },
    { "Glagolitic",                           0x2C00,   0x2C5F },
    { "Gothic",                               0x10330,  0x1034F },
    { "Greek",                                0x0370,   0x03FF },
    { "GreekExtended",                        0x1F00,   0x1FFF },
    { "Gujarati",                             0x0A80,   0x0AFF },
    { "Gurmukhi",                             0x0A00,   0x0A7F },
    { "HalfwidthandFullwidthForms",           0xFF00,   0xFFEF },
    { "HangulCompatibilityJamo",              0x3130,   0x318F },
    { "HangulJamo",                           0x1100,   0x11FF },
    { "HangulSyllables",                      0xAC00,   0xD7A3 },
    { "Hanunoo",                              0x1720,   0x173F },
    { "Hebrew",                               0x0590,   0x05FF },
    { "Hiragana",                             0x3040,   0x309F },
    { "IPAExtensions",                        0x0250,   0x02AF },
    { "IdeographicDescriptionCharacters",     0x2FF0,   0x2FFF },
    { "Kanbun",                               0x3190,   0x319F },
    { "KangxiRadicals",                       0x2F00,   0x2FDF },
    { "Kannada",                              0x0C80,   0x0CFF },
    { "Katakana",                             0x30A0,   0x30FF },
    { "KatakanaPhoneticExtensions",           0x31F0,   0x31FF },
    { "Kharoshthi",                           0x10A00,  0x10A5F },
    { "Khmer",                                0x1780,   0x17FF },
    { "KhmerSymbols",                         0x19E0,   0x19FF },
    { "Lao",                                  0x0E80,   0x0EFF },
    { "Latin-1Supplement",                    0x0080,   0x00FF },
    { "LatinExtended-A",                      0x0100,   0x017F },
    { "LatinExtended-B",                      0x0180,   0x024F },
    { "LatinExtendedAdditional",              0x1E00,   0x1EFF },
    { "LetterlikeSymbols",                    0x2100,   0x214F },
    { "Limbu",                                0x1900,   0x194F },
    { "LinearBIdeograms",                     0x10080,  0x100FF },
    { "LinearBSyllabary",                     0x10000,  0x1007F },
    { "Malayalam",                            0x0D00,   0x0D7F },
    { "MathematicalAlphanumericSymbols",      0x1D400,  0x1D7FF },
    { "MathematicalOperators",                0x2200,   0x22FF },
    { "MiscellaneousMathematicalSymbols-A",   0x27C0,   0x27EF },
    { "MiscellaneousMathematicalSymbols-B",   0x2980,   0x29FF },
    { "MiscellaneousSymbols",                 0x2600,   0x26FF },
    { "MiscellaneousSymbolsandArrows",        0x2B00,   0x2BFF },
    { "MiscellaneousTechnical",               0x2300,   0x23FF },
    { "ModifierToneLetters",                  0xA700,   0xA71F },
    { "Mongolian",                            0x1800,   0x18AF },
    { "MusicalSymbols",                       0x1D100,  0x1D1FF },
    { "Myanmar",                              0x1000,   0x109F },
    { "NewTaiLue",                            0x1980,   0x19DF },
    { "NumberForms",                          0x2150,   0x218F },
    { "Ogham",                                0x1680,   0x169F },
    { "OldItalic",                            0x10300,  0x1032F },
    { "OldPersian",                           0x103A0,  0x103DF },
    { "OpticalCharacterRecognition",          0x2440,   0x245F },
    { "Oriya",                                0x0B00,   0x0B7F },
    { "Osmanya",                              0x10480,  0x104AF },
    { "PhoneticExtensions",                   0x1D00,   0x1D7F },
    { "PhoneticExtensionsSupplement",         0x1D80,   0x1DBF },
    { "PrivateUse",                           0xE000,   0xF8FF },
    { "Runic",                                0x16A0,   0x16FF },
    { "Shavian",                              0x10450,  0x1047F },
    { "Sinhala",                              0x0D80,   0x0DFF },
    { "SmallFormVariants",                    0xFE50,   0xFE6F },
    { "SpacingModifierLetters",               0x02B0,   0x02FF },
    { "Specials",                             0xFFF0,   0xFFFF },
    { "SuperscriptsandSubscripts",            0x2070,   0x209F },
    { "SupplementalArrows-A",                 0x27F0,   0x27FF },
    { "SupplementalArrows-B",                 0x2900,   0x297F },
    { "SupplementalMathematicalOperators",    0x2A00,   0x2AFF },
    { "SupplementalPunctuation",              0x2E00,   0x2E7F },
    { "SupplementaryPrivateUseArea-A",        0xF0000,  0xFFFFF },
    { "SupplementaryPrivateUseArea-B",        0x100000, 0x10FFFF },
    { "SylotiNagri",                          0xA800,   0xA82F },
    { "Syriac",                               0x0700,   0x074F },
    { "Tagalog",                              0x1700,   0x171F },
    { "Tagbanwa",                             0x1760,   0x177F },
    { "Tags",                                 0xE0000,  0xE007F },
    { "TaiLe",                                0x1950,   0x197F },
    { "TaiXuanJingSymbols",                   0x1D300,  0x1D35F },
    { "Tamil",                                0x0B80,   0x0BFF },
    { "Telugu",                               0x0C00,   0x0C7F },
    { "Thaana",                               0x0780,   0x07BF },
    { "Thai",                                 0x0E00,   0x0E7F },
    { "Tibetan",                              0x0F00,   0x0FFF },
    { "Tifinagh",                             0x2D30,   0x2D7F },
    { "Ugaritic",                             0x10380,  0x1039F },
    { "UnifiedCanadianAboriginalSyllabics",   0x1400,   0x167F },
    { "VariationSelectors",                   0xFE00,   0xFE0F },
    { "VariationSelectorsSupplement",         0xE0100,  0xE01EF },
    { "VerticalForms",                        0xFE10,   0xFE1F },
    { "YiRadicals",                           0xA490,   0xA4CF },
    { "YiSyllables",                          0xA000,   0xA48F },
    { "YijingHexagramSymbols",                0x4DC0,   0x4DFF }
};
2847
2848inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2)
2849{ return qstrcmp(str1: entry1.name, str2: entry2.name) < 0; }
2850inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry)
2851{ return qstrcmp(str1: name, str2: entry.name) < 0; }
2852inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name)
2853{ return qstrcmp(str1: entry.name, str2: name) < 0; }
2854#endif // QT_NO_REGEXP_CCLASS
2855
2856int QRegExpEngine::getChar()
2857{
2858 return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2859}
2860
2861int QRegExpEngine::getEscape()
2862{
2863#ifndef QT_NO_REGEXP_ESCAPE
2864 const char tab[] = "afnrtv"; // no b, as \b means word boundary
2865 const char backTab[] = "\a\f\n\r\t\v";
2866 ushort low;
2867 int i;
2868#endif
2869 ushort val;
2870 int prevCh = yyCh;
2871
2872 if (prevCh == EOS) {
2873 error(RXERR_END);
2874 return Tok_Char | '\\';
2875 }
2876 yyCh = getChar();
2877#ifndef QT_NO_REGEXP_ESCAPE
2878 if ((prevCh & ~0xff) == 0) {
2879 const char *p = strchr(s: tab, c: prevCh);
2880 if (p != nullptr)
2881 return Tok_Char | backTab[p - tab];
2882 }
2883#endif
2884
2885 switch (prevCh) {
2886#ifndef QT_NO_REGEXP_ESCAPE
2887 case '0':
2888 val = 0;
2889 for (i = 0; i < 3; i++) {
2890 if (yyCh >= '0' && yyCh <= '7')
2891 val = (val << 3) | (yyCh - '0');
2892 else
2893 break;
2894 yyCh = getChar();
2895 }
2896 if ((val & ~0377) != 0)
2897 error(RXERR_OCTAL);
2898 return Tok_Char | val;
2899#endif
2900#ifndef QT_NO_REGEXP_ESCAPE
2901 case 'B':
2902 return Tok_NonWord;
2903#endif
2904#ifndef QT_NO_REGEXP_CCLASS
2905 case 'D':
2906 // see QChar::isDigit()
2907 yyCharClass->addCategories(cats: uint(-1) ^ FLAG(QChar::Number_DecimalDigit));
2908 return Tok_CharClass;
2909 case 'S':
2910 // see QChar::isSpace()
2911 yyCharClass->addCategories(cats: uint(-1) ^ (FLAG(QChar::Separator_Space) |
2912 FLAG(QChar::Separator_Line) |
2913 FLAG(QChar::Separator_Paragraph) |
2914 FLAG(QChar::Other_Control)));
2915 yyCharClass->addRange(from: 0x0000, to: 0x0008);
2916 yyCharClass->addRange(from: 0x000e, to: 0x001f);
2917 yyCharClass->addRange(from: 0x007f, to: 0x0084);
2918 yyCharClass->addRange(from: 0x0086, to: 0x009f);
2919 return Tok_CharClass;
2920 case 'W':
2921 // see QChar::isLetterOrNumber() and QChar::isMark()
2922 yyCharClass->addCategories(cats: uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) |
2923 FLAG(QChar::Mark_SpacingCombining) |
2924 FLAG(QChar::Mark_Enclosing) |
2925 FLAG(QChar::Number_DecimalDigit) |
2926 FLAG(QChar::Number_Letter) |
2927 FLAG(QChar::Number_Other) |
2928 FLAG(QChar::Letter_Uppercase) |
2929 FLAG(QChar::Letter_Lowercase) |
2930 FLAG(QChar::Letter_Titlecase) |
2931 FLAG(QChar::Letter_Modifier) |
2932 FLAG(QChar::Letter_Other) |
2933 FLAG(QChar::Punctuation_Connector)));
2934 yyCharClass->addRange(from: 0x203f, to: 0x2040);
2935 yyCharClass->addSingleton(ch: 0x2040);
2936 yyCharClass->addSingleton(ch: 0x2054);
2937 yyCharClass->addSingleton(ch: 0x30fb);
2938 yyCharClass->addRange(from: 0xfe33, to: 0xfe34);
2939 yyCharClass->addRange(from: 0xfe4d, to: 0xfe4f);
2940 yyCharClass->addSingleton(ch: 0xff3f);
2941 yyCharClass->addSingleton(ch: 0xff65);
2942 return Tok_CharClass;
2943#endif
2944#ifndef QT_NO_REGEXP_ESCAPE
2945 case 'b':
2946 return Tok_Word;
2947#endif
2948#ifndef QT_NO_REGEXP_CCLASS
2949 case 'd':
2950 // see QChar::isDigit()
2951 yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit));
2952 return Tok_CharClass;
2953 case 's':
2954 // see QChar::isSpace()
2955 yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
2956 FLAG(QChar::Separator_Line) |
2957 FLAG(QChar::Separator_Paragraph));
2958 yyCharClass->addRange(from: 0x0009, to: 0x000d);
2959 yyCharClass->addSingleton(ch: 0x0085);
2960 return Tok_CharClass;
2961 case 'w':
2962 // see QChar::isLetterOrNumber() and QChar::isMark()
2963 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
2964 FLAG(QChar::Mark_SpacingCombining) |
2965 FLAG(QChar::Mark_Enclosing) |
2966 FLAG(QChar::Number_DecimalDigit) |
2967 FLAG(QChar::Number_Letter) |
2968 FLAG(QChar::Number_Other) |
2969 FLAG(QChar::Letter_Uppercase) |
2970 FLAG(QChar::Letter_Lowercase) |
2971 FLAG(QChar::Letter_Titlecase) |
2972 FLAG(QChar::Letter_Modifier) |
2973 FLAG(QChar::Letter_Other));
2974 yyCharClass->addSingleton(ch: 0x005f); // '_'
2975 return Tok_CharClass;
2976 case 'I':
2977 if (!xmlSchemaExtensions)
2978 break;
2979 yyCharClass->setNegative(!yyCharClass->negative());
2980 Q_FALLTHROUGH();
2981 case 'i':
2982 if (xmlSchemaExtensions) {
2983 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
2984 FLAG(QChar::Mark_SpacingCombining) |
2985 FLAG(QChar::Mark_Enclosing) |
2986 FLAG(QChar::Number_DecimalDigit) |
2987 FLAG(QChar::Number_Letter) |
2988 FLAG(QChar::Number_Other) |
2989 FLAG(QChar::Letter_Uppercase) |
2990 FLAG(QChar::Letter_Lowercase) |
2991 FLAG(QChar::Letter_Titlecase) |
2992 FLAG(QChar::Letter_Modifier) |
2993 FLAG(QChar::Letter_Other));
2994 yyCharClass->addSingleton(ch: 0x003a); // ':'
2995 yyCharClass->addSingleton(ch: 0x005f); // '_'
2996 yyCharClass->addRange(from: 0x0041, to: 0x005a); // [A-Z]
2997 yyCharClass->addRange(from: 0x0061, to: 0x007a); // [a-z]
2998 yyCharClass->addRange(from: 0xc0, to: 0xd6);
2999 yyCharClass->addRange(from: 0xd8, to: 0xf6);
3000 yyCharClass->addRange(from: 0xf8, to: 0x2ff);
3001 yyCharClass->addRange(from: 0x370, to: 0x37d);
3002 yyCharClass->addRange(from: 0x37f, to: 0x1fff);
3003 yyCharClass->addRange(from: 0x200c, to: 0x200d);
3004 yyCharClass->addRange(from: 0x2070, to: 0x218f);
3005 yyCharClass->addRange(from: 0x2c00, to: 0x2fef);
3006 yyCharClass->addRange(from: 0x3001, to: 0xd7ff);
3007 yyCharClass->addRange(from: 0xf900, to: 0xfdcf);
3008 yyCharClass->addRange(from: 0xfdf0, to: 0xfffd);
3009 yyCharClass->addRange(from: (ushort)0x10000, to: (ushort)0xeffff);
3010 return Tok_CharClass;
3011 } else {
3012 break;
3013 }
3014 case 'C':
3015 if (!xmlSchemaExtensions)
3016 break;
3017 yyCharClass->setNegative(!yyCharClass->negative());
3018 Q_FALLTHROUGH();
3019 case 'c':
3020 if (xmlSchemaExtensions) {
3021 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3022 FLAG(QChar::Mark_SpacingCombining) |
3023 FLAG(QChar::Mark_Enclosing) |
3024 FLAG(QChar::Number_DecimalDigit) |
3025 FLAG(QChar::Number_Letter) |
3026 FLAG(QChar::Number_Other) |
3027 FLAG(QChar::Letter_Uppercase) |
3028 FLAG(QChar::Letter_Lowercase) |
3029 FLAG(QChar::Letter_Titlecase) |
3030 FLAG(QChar::Letter_Modifier) |
3031 FLAG(QChar::Letter_Other));
3032 yyCharClass->addSingleton(ch: 0x002d); // '-'
3033 yyCharClass->addSingleton(ch: 0x002e); // '.'
3034 yyCharClass->addSingleton(ch: 0x003a); // ':'
3035 yyCharClass->addSingleton(ch: 0x005f); // '_'
3036 yyCharClass->addSingleton(ch: 0xb7);
3037 yyCharClass->addRange(from: 0x0030, to: 0x0039); // [0-9]
3038 yyCharClass->addRange(from: 0x0041, to: 0x005a); // [A-Z]
3039 yyCharClass->addRange(from: 0x0061, to: 0x007a); // [a-z]
3040 yyCharClass->addRange(from: 0xc0, to: 0xd6);
3041 yyCharClass->addRange(from: 0xd8, to: 0xf6);
3042 yyCharClass->addRange(from: 0xf8, to: 0x2ff);
3043 yyCharClass->addRange(from: 0x370, to: 0x37d);
3044 yyCharClass->addRange(from: 0x37f, to: 0x1fff);
3045 yyCharClass->addRange(from: 0x200c, to: 0x200d);
3046 yyCharClass->addRange(from: 0x2070, to: 0x218f);
3047 yyCharClass->addRange(from: 0x2c00, to: 0x2fef);
3048 yyCharClass->addRange(from: 0x3001, to: 0xd7ff);
3049 yyCharClass->addRange(from: 0xf900, to: 0xfdcf);
3050 yyCharClass->addRange(from: 0xfdf0, to: 0xfffd);
3051 yyCharClass->addRange(from: (ushort)0x10000, to: (ushort)0xeffff);
3052 yyCharClass->addRange(from: 0x0300, to: 0x036f);
3053 yyCharClass->addRange(from: 0x203f, to: 0x2040);
3054 return Tok_CharClass;
3055 } else {
3056 break;
3057 }
3058 case 'P':
3059 if (!xmlSchemaExtensions)
3060 break;
3061 yyCharClass->setNegative(!yyCharClass->negative());
3062 Q_FALLTHROUGH();
3063 case 'p':
3064 if (xmlSchemaExtensions) {
3065 if (yyCh != '{') {
3066 error(RXERR_CHARCLASS);
3067 return Tok_CharClass;
3068 }
3069
3070 QByteArray category;
3071 yyCh = getChar();
3072 while (yyCh != '}') {
3073 if (yyCh == EOS) {
3074 error(RXERR_END);
3075 return Tok_CharClass;
3076 }
3077 category.append(c: yyCh);
3078 yyCh = getChar();
3079 }
3080 yyCh = getChar(); // skip closing '}'
3081
3082 int catlen = category.size();
3083 if (catlen == 1 || catlen == 2) {
3084 switch (category.at(i: 0)) {
3085 case 'M':
3086 if (catlen == 1) {
3087 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3088 FLAG(QChar::Mark_SpacingCombining) |
3089 FLAG(QChar::Mark_Enclosing));
3090 } else {
3091 switch (category.at(i: 1)) {
3092 case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn
3093 case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc
3094 case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me
3095 default: error(RXERR_CATEGORY); break;
3096 }
3097 }
3098 break;
3099 case 'N':
3100 if (catlen == 1) {
3101 yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) |
3102 FLAG(QChar::Number_Letter) |
3103 FLAG(QChar::Number_Other));
3104 } else {
3105 switch (category.at(i: 1)) {
3106 case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd
3107 case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl
3108 case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No
3109 default: error(RXERR_CATEGORY); break;
3110 }
3111 }
3112 break;
3113 case 'Z':
3114 if (catlen == 1) {
3115 yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
3116 FLAG(QChar::Separator_Line) |
3117 FLAG(QChar::Separator_Paragraph));
3118 } else {
3119 switch (category.at(i: 1)) {
3120 case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs
3121 case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl
3122 case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp
3123 default: error(RXERR_CATEGORY); break;
3124 }
3125 }
3126 break;
3127 case 'C':
3128 if (catlen == 1) {
3129 yyCharClass->addCategories(FLAG(QChar::Other_Control) |
3130 FLAG(QChar::Other_Format) |
3131 FLAG(QChar::Other_Surrogate) |
3132 FLAG(QChar::Other_PrivateUse) |
3133 FLAG(QChar::Other_NotAssigned));
3134 } else {
3135 switch (category.at(i: 1)) {
3136 case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc
3137 case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf
3138 case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs
3139 case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co
3140 case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn
3141 default: error(RXERR_CATEGORY); break;
3142 }
3143 }
3144 break;
3145 case 'L':
3146 if (catlen == 1) {
3147 yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) |
3148 FLAG(QChar::Letter_Lowercase) |
3149 FLAG(QChar::Letter_Titlecase) |
3150 FLAG(QChar::Letter_Modifier) |
3151 FLAG(QChar::Letter_Other));
3152 } else {
3153 switch (category.at(i: 1)) {
3154 case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu
3155 case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll
3156 case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt
3157 case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm
3158 case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo
3159 default: error(RXERR_CATEGORY); break;
3160 }
3161 }
3162 break;
3163 case 'P':
3164 if (catlen == 1) {
3165 yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) |
3166 FLAG(QChar::Punctuation_Dash) |
3167 FLAG(QChar::Punctuation_Open) |
3168 FLAG(QChar::Punctuation_Close) |
3169 FLAG(QChar::Punctuation_InitialQuote) |
3170 FLAG(QChar::Punctuation_FinalQuote) |
3171 FLAG(QChar::Punctuation_Other));
3172 } else {
3173 switch (category.at(i: 1)) {
3174 case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc
3175 case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd
3176 case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps
3177 case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe
3178 case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi
3179 case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf
3180 case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po
3181 default: error(RXERR_CATEGORY); break;
3182 }
3183 }
3184 break;
3185 case 'S':
3186 if (catlen == 1) {
3187 yyCharClass->addCategories(FLAG(QChar::Symbol_Math) |
3188 FLAG(QChar::Symbol_Currency) |
3189 FLAG(QChar::Symbol_Modifier) |
3190 FLAG(QChar::Symbol_Other));
3191 } else {
3192 switch (category.at(i: 1)) {
3193 case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm
3194 case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc
3195 case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk
3196 case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So
3197 default: error(RXERR_CATEGORY); break;
3198 }
3199 }
3200 break;
3201 default:
3202 error(RXERR_CATEGORY);
3203 break;
3204 }
3205 } else if (catlen > 2 && category.at(i: 0) == 'I' && category.at(i: 1) == 's') {
3206 static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]);
3207 const char * const categoryFamily = category.constData() + 2;
3208 const CategoriesRangeMapEntry *r = std::lower_bound(first: categoriesRangeMap, last: categoriesRangeMap + N, val: categoryFamily);
3209 if (r != categoriesRangeMap + N && qstrcmp(str1: r->name, str2: categoryFamily) == 0)
3210 yyCharClass->addRange(from: r->first, to: r->second);
3211 else
3212 error(RXERR_CATEGORY);
3213 } else {
3214 error(RXERR_CATEGORY);
3215 }
3216 return Tok_CharClass;
3217 } else {
3218 break;
3219 }
3220#endif
3221#ifndef QT_NO_REGEXP_ESCAPE
3222 case 'x':
3223 val = 0;
3224 for (i = 0; i < 4; i++) {
3225 low = QChar(yyCh).toLower().unicode();
3226 if (low >= '0' && low <= '9')
3227 val = (val << 4) | (low - '0');
3228 else if (low >= 'a' && low <= 'f')
3229 val = (val << 4) | (low - 'a' + 10);
3230 else
3231 break;
3232 yyCh = getChar();
3233 }
3234 return Tok_Char | val;
3235#endif
3236 default:
3237 break;
3238 }
3239 if (prevCh >= '1' && prevCh <= '9') {
3240#ifndef QT_NO_REGEXP_BACKREF
3241 val = prevCh - '0';
3242 while (yyCh >= '0' && yyCh <= '9') {
3243 val = (val * 10) + (yyCh - '0');
3244 yyCh = getChar();
3245 }
3246 return Tok_BackRef | val;
3247#else
3248 error(RXERR_DISABLED);
3249#endif
3250 }
3251 return Tok_Char | prevCh;
3252}
3253
3254#ifndef QT_NO_REGEXP_INTERVAL
3255int QRegExpEngine::getRep(int def)
3256{
3257 if (yyCh >= '0' && yyCh <= '9') {
3258 int rep = 0;
3259 do {
3260 rep = 10 * rep + yyCh - '0';
3261 if (rep >= InftyRep) {
3262 error(RXERR_REPETITION);
3263 rep = def;
3264 }
3265 yyCh = getChar();
3266 } while (yyCh >= '0' && yyCh <= '9');
3267 return rep;
3268 } else {
3269 return def;
3270 }
3271}
3272#endif
3273
3274#ifndef QT_NO_REGEXP_LOOKAHEAD
3275void QRegExpEngine::skipChars(int n)
3276{
3277 if (n > 0) {
3278 yyPos += n - 1;
3279 yyCh = getChar();
3280 }
3281}
3282#endif
3283
3284void QRegExpEngine::error(const char *msg)
3285{
3286 if (yyError.isEmpty())
3287 yyError = QLatin1String(msg);
3288}
3289
3290void QRegExpEngine::startTokenizer(const QChar *rx, int len)
3291{
3292 yyIn = rx;
3293 yyPos0 = 0;
3294 yyPos = 0;
3295 yyLen = len;
3296 yyCh = getChar();
3297 yyCharClass.reset(other: new QRegExpCharClass);
3298 yyMinRep = 0;
3299 yyMaxRep = 0;
3300 yyError = QString();
3301}
3302
3303int QRegExpEngine::getToken()
3304{
3305#ifndef QT_NO_REGEXP_CCLASS
3306 ushort pendingCh = 0;
3307 bool charPending;
3308 bool rangePending;
3309 int tok;
3310#endif
3311 int prevCh = yyCh;
3312
3313 yyPos0 = yyPos - 1;
3314#ifndef QT_NO_REGEXP_CCLASS
3315 yyCharClass->clear();
3316#endif
3317 yyMinRep = 0;
3318 yyMaxRep = 0;
3319 yyCh = getChar();
3320
3321 switch (prevCh) {
3322 case EOS:
3323 yyPos0 = yyPos;
3324 return Tok_Eos;
3325 case '$':
3326 return Tok_Dollar;
3327 case '(':
3328 if (yyCh == '?') {
3329 prevCh = getChar();
3330 yyCh = getChar();
3331 switch (prevCh) {
3332#ifndef QT_NO_REGEXP_LOOKAHEAD
3333 case '!':
3334 return Tok_NegLookahead;
3335 case '=':
3336 return Tok_PosLookahead;
3337#endif
3338 case ':':
3339 return Tok_MagicLeftParen;
3340 case '<':
3341 error(RXERR_LOOKBEHIND);
3342 return Tok_MagicLeftParen;
3343 default:
3344 error(RXERR_LOOKAHEAD);
3345 return Tok_MagicLeftParen;
3346 }
3347 } else {
3348 return Tok_LeftParen;
3349 }
3350 case ')':
3351 return Tok_RightParen;
3352 case '*':
3353 yyMinRep = 0;
3354 yyMaxRep = InftyRep;
3355 return Tok_Quantifier;
3356 case '+':
3357 yyMinRep = 1;
3358 yyMaxRep = InftyRep;
3359 return Tok_Quantifier;
3360 case '.':
3361#ifndef QT_NO_REGEXP_CCLASS
3362 yyCharClass->setNegative(true);
3363#endif
3364 return Tok_CharClass;
3365 case '?':
3366 yyMinRep = 0;
3367 yyMaxRep = 1;
3368 return Tok_Quantifier;
3369 case '[':
3370#ifndef QT_NO_REGEXP_CCLASS
3371 if (yyCh == '^') {
3372 yyCharClass->setNegative(true);
3373 yyCh = getChar();
3374 }
3375 charPending = false;
3376 rangePending = false;
3377 do {
3378 if (yyCh == '-' && charPending && !rangePending) {
3379 rangePending = true;
3380 yyCh = getChar();
3381 } else {
3382 if (charPending && !rangePending) {
3383 yyCharClass->addSingleton(ch: pendingCh);
3384 charPending = false;
3385 }
3386 if (yyCh == '\\') {
3387 yyCh = getChar();
3388 tok = getEscape();
3389 if (tok == Tok_Word)
3390 tok = '\b';
3391 } else {
3392 tok = Tok_Char | yyCh;
3393 yyCh = getChar();
3394 }
3395 if (tok == Tok_CharClass) {
3396 if (rangePending) {
3397 yyCharClass->addSingleton(ch: '-');
3398 yyCharClass->addSingleton(ch: pendingCh);
3399 charPending = false;
3400 rangePending = false;
3401 }
3402 } else if ((tok & Tok_Char) != 0) {
3403 if (rangePending) {
3404 yyCharClass->addRange(from: pendingCh, to: tok ^ Tok_Char);
3405 charPending = false;
3406 rangePending = false;
3407 } else {
3408 pendingCh = tok ^ Tok_Char;
3409 charPending = true;
3410 }
3411 } else {
3412 error(RXERR_CHARCLASS);
3413 }
3414 }
3415 } while (yyCh != ']' && yyCh != EOS);
3416 if (rangePending)
3417 yyCharClass->addSingleton(ch: '-');
3418 if (charPending)
3419 yyCharClass->addSingleton(ch: pendingCh);
3420 if (yyCh == EOS)
3421 error(RXERR_END);
3422 else
3423 yyCh = getChar();
3424 return Tok_CharClass;
3425#else
3426 error(RXERR_END);
3427 return Tok_Char | '[';
3428#endif
3429 case '\\':
3430 return getEscape();
3431 case ']':
3432 error(RXERR_LEFTDELIM);
3433 return Tok_Char | ']';
3434 case '^':
3435 return Tok_Caret;
3436 case '{':
3437#ifndef QT_NO_REGEXP_INTERVAL
3438 yyMinRep = getRep(def: 0);
3439 yyMaxRep = yyMinRep;
3440 if (yyCh == ',') {
3441 yyCh = getChar();
3442 yyMaxRep = getRep(def: InftyRep);
3443 }
3444 if (yyMaxRep < yyMinRep)
3445 error(RXERR_INTERVAL);
3446 if (yyCh != '}')
3447 error(RXERR_REPETITION);
3448 yyCh = getChar();
3449 return Tok_Quantifier;
3450#else
3451 error(RXERR_DISABLED);
3452 return Tok_Char | '{';
3453#endif
3454 case '|':
3455 return Tok_Bar;
3456 case '}':
3457 error(RXERR_LEFTDELIM);
3458 return Tok_Char | '}';
3459 default:
3460 return Tok_Char | prevCh;
3461 }
3462}
3463
3464int QRegExpEngine::parse(const QChar *pattern, int len)
3465{
3466 valid = true;
3467 startTokenizer(rx: pattern, len);
3468 yyTok = getToken();
3469#ifndef QT_NO_REGEXP_CAPTURE
3470 yyMayCapture = true;
3471#else
3472 yyMayCapture = false;
3473#endif
3474
3475#ifndef QT_NO_REGEXP_CAPTURE
3476 int atom = startAtom(officialCapture: false);
3477#endif
3478 QRegExpCharClass anything;
3479 Box box(this); // create InitialState
3480 box.set(anything);
3481 Box rightBox(this); // create FinalState
3482 rightBox.set(anything);
3483
3484 Box middleBox(this);
3485 parseExpression(box: &middleBox);
3486#ifndef QT_NO_REGEXP_CAPTURE
3487 finishAtom(atom, needCapture: false);
3488#endif
3489#ifndef QT_NO_REGEXP_OPTIM
3490 middleBox.setupHeuristics();
3491#endif
3492 box.cat(b: middleBox);
3493 box.cat(b: rightBox);
3494 yyCharClass.reset();
3495
3496#ifndef QT_NO_REGEXP_CAPTURE
3497 for (int i = 0; i < nf; ++i) {
3498 switch (f[i].capture) {
3499 case QRegExpAtom::NoCapture:
3500 break;
3501 case QRegExpAtom::OfficialCapture:
3502 f[i].capture = ncap;
3503 captureForOfficialCapture.append(t: ncap);
3504 ++ncap;
3505 ++officialncap;
3506 break;
3507 case QRegExpAtom::UnofficialCapture:
3508 f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3509 }
3510 }
3511
3512#ifndef QT_NO_REGEXP_BACKREF
3513#ifndef QT_NO_REGEXP_OPTIM
3514 if (officialncap == 0 && nbrefs == 0) {
3515 ncap = nf = 0;
3516 f.clear();
3517 }
3518#endif
3519 // handle the case where there's a \5 with no corresponding capture
3520 // (captureForOfficialCapture.size() != officialncap)
3521 for (int i = 0; i < nbrefs - officialncap; ++i) {
3522 captureForOfficialCapture.append(t: ncap);
3523 ++ncap;
3524 }
3525#endif
3526#endif
3527
3528 if (!yyError.isEmpty())
3529 return -1;
3530
3531#ifndef QT_NO_REGEXP_OPTIM
3532 const QRegExpAutomatonState &sinit = s.at(i: InitialState);
3533 caretAnchored = !sinit.anchors.isEmpty();
3534 if (caretAnchored) {
3535 const QMap<int, int> &anchors = sinit.anchors;
3536 QMap<int, int>::const_iterator a;
3537 for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3538 if (
3539#ifndef QT_NO_REGEXP_ANCHOR_ALT
3540 (*a & Anchor_Alternation) != 0 ||
3541#endif
3542 (*a & Anchor_Caret) == 0)
3543 {
3544 caretAnchored = false;
3545 break;
3546 }
3547 }
3548 }
3549#endif
3550
3551 // cleanup anchors
3552 int numStates = s.size();
3553 for (int i = 0; i < numStates; ++i) {
3554 QRegExpAutomatonState &state = s[i];
3555 if (!state.anchors.isEmpty()) {
3556 QMap<int, int>::iterator a = state.anchors.begin();
3557 while (a != state.anchors.end()) {
3558 if (a.value() == 0)
3559 a = state.anchors.erase(it: a);
3560 else
3561 ++a;
3562 }
3563 }
3564 }
3565
3566 return yyPos0;
3567}
3568
3569void QRegExpEngine::parseAtom(Box *box)
3570{
3571#ifndef QT_NO_REGEXP_LOOKAHEAD
3572 QRegExpEngine *eng = nullptr;
3573 bool neg;
3574 int len;
3575#endif
3576
3577 if ((yyTok & Tok_Char) != 0) {
3578 box->set(QChar(yyTok ^ Tok_Char));
3579 } else {
3580#ifndef QT_NO_REGEXP_OPTIM
3581 trivial = false;
3582#endif
3583 switch (yyTok) {
3584 case Tok_Dollar:
3585 box->catAnchor(a: Anchor_Dollar);
3586 break;
3587 case Tok_Caret:
3588 box->catAnchor(a: Anchor_Caret);
3589 break;
3590#ifndef QT_NO_REGEXP_LOOKAHEAD
3591 case Tok_PosLookahead:
3592 case Tok_NegLookahead:
3593 neg = (yyTok == Tok_NegLookahead);
3594 eng = new QRegExpEngine(cs, greedyQuantifiers);
3595 len = eng->parse(pattern: yyIn + yyPos - 1, len: yyLen - yyPos + 1);
3596 if (len >= 0)
3597 skipChars(n: len);
3598 else
3599 error(RXERR_LOOKAHEAD);
3600 box->catAnchor(a: addLookahead(eng, negative: neg));
3601 yyTok = getToken();
3602 if (yyTok != Tok_RightParen)
3603 error(RXERR_LOOKAHEAD);
3604 break;
3605#endif
3606#ifndef QT_NO_REGEXP_ESCAPE
3607 case Tok_Word:
3608 box->catAnchor(a: Anchor_Word);
3609 break;
3610 case Tok_NonWord:
3611 box->catAnchor(a: Anchor_NonWord);
3612 break;
3613#endif
3614 case Tok_LeftParen:
3615 case Tok_MagicLeftParen:
3616 yyTok = getToken();
3617 parseExpression(box);
3618 if (yyTok != Tok_RightParen)
3619 error(RXERR_END);
3620 break;
3621 case Tok_CharClass:
3622 box->set(*yyCharClass);
3623 break;
3624 case Tok_Quantifier:
3625 error(RXERR_REPETITION);
3626 break;
3627 default:
3628#ifndef QT_NO_REGEXP_BACKREF
3629 if ((yyTok & Tok_BackRef) != 0)
3630 box->set(yyTok ^ Tok_BackRef);
3631 else
3632#endif
3633 error(RXERR_DISABLED);
3634 }
3635 }
3636 yyTok = getToken();
3637}
3638
3639void QRegExpEngine::parseFactor(Box *box)
3640{
3641#ifndef QT_NO_REGEXP_CAPTURE
3642 int outerAtom = greedyQuantifiers ? startAtom(officialCapture: false) : -1;
3643 int innerAtom = startAtom(officialCapture: yyMayCapture && yyTok == Tok_LeftParen);
3644 bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3645#else
3646 const int innerAtom = -1;
3647#endif
3648
3649#ifndef QT_NO_REGEXP_INTERVAL
3650#define YYREDO() \
3651 yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3652 *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3653
3654 const QChar *in = yyIn;
3655 int pos0 = yyPos0;
3656 int pos = yyPos;
3657 int len = yyLen;
3658 int ch = yyCh;
3659 QRegExpCharClass charClass;
3660 if (yyTok == Tok_CharClass)
3661 charClass = *yyCharClass;
3662 int tok = yyTok;
3663 bool mayCapture = yyMayCapture;
3664#endif
3665
3666 parseAtom(box);
3667#ifndef QT_NO_REGEXP_CAPTURE
3668 finishAtom(atom: innerAtom, needCapture: magicLeftParen);
3669#endif
3670
3671 bool hasQuantifier = (yyTok == Tok_Quantifier);
3672 if (hasQuantifier) {
3673#ifndef QT_NO_REGEXP_OPTIM
3674 trivial = false;
3675#endif
3676 if (yyMaxRep == InftyRep) {
3677 box->plus(atom: innerAtom);
3678#ifndef QT_NO_REGEXP_INTERVAL
3679 } else if (yyMaxRep == 0) {
3680 box->clear();
3681#endif
3682 }
3683 if (yyMinRep == 0)
3684 box->opt();
3685
3686#ifndef QT_NO_REGEXP_INTERVAL
3687 yyMayCapture = false;
3688 int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3689 int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3690
3691 Box rightBox(this);
3692 int i;
3693
3694 for (i = 0; i < beta; i++) {
3695 YYREDO();
3696 Box leftBox(this);
3697 parseAtom(box: &leftBox);
3698 leftBox.cat(b: rightBox);
3699 leftBox.opt();
3700 rightBox = leftBox;
3701 }
3702 for (i = 0; i < alpha; i++) {
3703 YYREDO();
3704 Box leftBox(this);
3705 parseAtom(box: &leftBox);
3706 leftBox.cat(b: rightBox);
3707 rightBox = leftBox;
3708 }
3709 rightBox.cat(b: *box);
3710 *box = rightBox;
3711#endif
3712 yyTok = getToken();
3713#ifndef QT_NO_REGEXP_INTERVAL
3714 yyMayCapture = mayCapture;
3715#endif
3716 }
3717#undef YYREDO
3718#ifndef QT_NO_REGEXP_CAPTURE
3719 if (greedyQuantifiers)
3720 finishAtom(atom: outerAtom, needCapture: hasQuantifier);
3721#endif
3722}
3723
3724void QRegExpEngine::parseTerm(Box *box)
3725{
3726#ifndef QT_NO_REGEXP_OPTIM
3727 if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3728 parseFactor(box);
3729#endif
3730 while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3731 Box rightBox(this);
3732 parseFactor(box: &rightBox);
3733 box->cat(b: rightBox);
3734 }
3735}
3736
3737void QRegExpEngine::parseExpression(Box *box)
3738{
3739 parseTerm(box);
3740 while (yyTok == Tok_Bar) {
3741#ifndef QT_NO_REGEXP_OPTIM
3742 trivial = false;
3743#endif
3744 Box rightBox(this);
3745 yyTok = getToken();
3746 parseTerm(box: &rightBox);
3747 box->orx(b: rightBox);
3748 }
3749}
3750
3751/*
3752 The struct QRegExpPrivate contains the private data of a regular
3753 expression other than the automaton. It makes it possible for many
3754 QRegExp objects to use the same QRegExpEngine object with different
3755 QRegExpPrivate objects.
3756*/
3757struct QRegExpPrivate
3758{
3759 QRegExpEngine *eng;
3760 QRegExpEngineKey engineKey;
3761 bool minimal;
3762#ifndef QT_NO_REGEXP_CAPTURE
3763 QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3764 QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3765#endif
3766 QRegExpMatchState matchState;
3767
3768 inline QRegExpPrivate()
3769 : eng(nullptr), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
3770 inline QRegExpPrivate(const QRegExpEngineKey &key)
3771 : eng(nullptr), engineKey(key), minimal(false) {}
3772};
3773
3774#if !defined(QT_NO_REGEXP_OPTIM)
3775struct QRECache
3776{
3777 typedef QHash<QRegExpEngineKey, QRegExpEngine *> EngineCache;
3778 typedef QCache<QRegExpEngineKey, QRegExpEngine> UnusedEngineCache;
3779 EngineCache usedEngines;
3780 UnusedEngineCache unusedEngines;
3781};
3782Q_GLOBAL_STATIC(QRECache, engineCache)
3783static QBasicMutex engineCacheMutex;
3784#endif // QT_NO_REGEXP_OPTIM
3785
3786static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3787{
3788#if !defined(QT_NO_REGEXP_OPTIM)
3789 const auto locker = qt_scoped_lock(mutex&: engineCacheMutex);
3790 if (!eng->ref.deref()) {
3791 if (QRECache *c = engineCache()) {
3792 c->unusedEngines.insert(key, object: eng, cost: 4 + key.pattern.size() / 4);
3793 c->usedEngines.remove(key);
3794 } else {
3795 delete eng;
3796 }
3797 }
3798#else
3799 Q_UNUSED(key);
3800 if (!eng->ref.deref())
3801 delete eng;
3802#endif
3803}
3804
3805static void prepareEngine_helper(QRegExpPrivate *priv)
3806{
3807 Q_ASSERT(!priv->eng);
3808
3809#if !defined(QT_NO_REGEXP_OPTIM)
3810 const auto locker = qt_scoped_lock(mutex&: engineCacheMutex);
3811 if (QRECache *c = engineCache()) {
3812 priv->eng = c->unusedEngines.take(key: priv->engineKey);
3813 if (!priv->eng)
3814 priv->eng = c->usedEngines.value(key: priv->engineKey);
3815 if (!priv->eng)
3816 priv->eng = new QRegExpEngine(priv->engineKey);
3817 else
3818 priv->eng->ref.ref();
3819
3820 c->usedEngines.insert(key: priv->engineKey, value: priv->eng);
3821 return;
3822 }
3823#endif // QT_NO_REGEXP_OPTIM
3824
3825 priv->eng = new QRegExpEngine(priv->engineKey);
3826}
3827
3828inline static void prepareEngine(QRegExpPrivate *priv)
3829{
3830 if (priv->eng)
3831 return;
3832 prepareEngine_helper(priv);
3833 priv->matchState.prepareForMatch(eng: priv->eng);
3834}
3835
3836static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3837{
3838 prepareEngine(priv);
3839 priv->matchState.prepareForMatch(eng: priv->eng);
3840#ifndef QT_NO_REGEXP_CAPTURE
3841 priv->t = str;
3842 priv->capturedCache.clear();
3843#else
3844 Q_UNUSED(str);
3845#endif
3846}
3847
3848static void invalidateEngine(QRegExpPrivate *priv)
3849{
3850 if (priv->eng) {
3851 derefEngine(eng: priv->eng, key: priv->engineKey);
3852 priv->eng = nullptr;
3853 priv->matchState.drain();
3854 }
3855}
3856
3857/*!
3858 \enum QRegExp::CaretMode
3859
3860 The CaretMode enum defines the different meanings of the caret
3861 (\b{^}) in a regular expression. The possible values are:
3862
3863 \value CaretAtZero
3864 The caret corresponds to index 0 in the searched string.
3865
3866 \value CaretAtOffset
3867 The caret corresponds to the start offset of the search.
3868
3869 \value CaretWontMatch
3870 The caret never matches.
3871*/
3872
3873/*!
3874 \enum QRegExp::PatternSyntax
3875
3876 The syntax used to interpret the meaning of the pattern.
3877
3878 \value RegExp A rich Perl-like pattern matching syntax. This is
3879 the default.
3880
3881 \value RegExp2 Like RegExp, but with \l{greedy quantifiers}.
3882 (Introduced in Qt 4.2.)
3883
3884 \value Wildcard This provides a simple pattern matching syntax
3885 similar to that used by shells (command interpreters) for "file
3886 globbing". See \l{QRegExp wildcard matching}.
3887
3888 \value WildcardUnix This is similar to Wildcard but with the
3889 behavior of a Unix shell. The wildcard characters can be escaped
3890 with the character "\\".
3891
3892 \value FixedString The pattern is a fixed string. This is
3893 equivalent to using the RegExp pattern on a string in
3894 which all metacharacters are escaped using escape().
3895
3896 \value W3CXmlSchema11 The pattern is a regular expression as
3897 defined by the W3C XML Schema 1.1 specification.
3898
3899 \sa setPatternSyntax()
3900*/
3901
3902/*!
3903 Constructs an empty regexp.
3904
3905 \sa isValid(), errorString()
3906*/
3907QRegExp::QRegExp()
3908{
3909 priv = new QRegExpPrivate;
3910 prepareEngine(priv);
3911}
3912
3913/*!
3914 Constructs a regular expression object for the given \a pattern
3915 string. The pattern must be given using wildcard notation if \a
3916 syntax is \l Wildcard; the default is \l RegExp. The pattern is
3917 case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3918 greedy (maximal), but can be changed by calling
3919 setMinimal().
3920
3921 \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3922*/
3923QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3924{
3925 priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3926 prepareEngine(priv);
3927}
3928
3929/*!
3930 Constructs a regular expression as a copy of \a rx.
3931
3932 \sa operator=()
3933*/
3934QRegExp::QRegExp(const QRegExp &rx)
3935{
3936 priv = new QRegExpPrivate;
3937 operator=(rx);
3938}
3939
3940/*!
3941 Destroys the regular expression and cleans up its internal data.
3942*/
3943QRegExp::~QRegExp()
3944{
3945 invalidateEngine(priv);
3946 delete priv;
3947}
3948
3949/*!
3950 Copies the regular expression \a rx and returns a reference to the
3951 copy. The case sensitivity, wildcard, and minimal matching options
3952 are also copied.
3953*/
3954QRegExp &QRegExp::operator=(const QRegExp &rx)
3955{
3956 prepareEngine(priv: rx.priv); // to allow sharing
3957 QRegExpEngine *otherEng = rx.priv->eng;
3958 if (otherEng)
3959 otherEng->ref.ref();
3960 invalidateEngine(priv);
3961 priv->eng = otherEng;
3962 priv->engineKey = rx.priv->engineKey;
3963 priv->minimal = rx.priv->minimal;
3964#ifndef QT_NO_REGEXP_CAPTURE
3965 priv->t = rx.priv->t;
3966 priv->capturedCache = rx.priv->capturedCache;
3967#endif
3968 if (priv->eng)
3969 priv->matchState.prepareForMatch(eng: priv->eng);
3970 priv->matchState.captured = rx.priv->matchState.captured;
3971 return *this;
3972}
3973
3974/*!
3975 \fn QRegExp &QRegExp::operator=(QRegExp &&other)
3976
3977 Move-assigns \a other to this QRegExp instance.
3978
3979 \since 5.2
3980*/
3981
3982/*!
3983 \fn void QRegExp::swap(QRegExp &other)
3984 \since 4.8
3985
3986 Swaps regular expression \a other with this regular
3987 expression. This operation is very fast and never fails.
3988*/
3989
3990/*!
3991 Returns \c true if this regular expression is equal to \a rx;
3992 otherwise returns \c false.
3993
3994 Two QRegExp objects are equal if they have the same pattern
3995 strings and the same settings for case sensitivity, wildcard and
3996 minimal matching.
3997*/
3998bool QRegExp::operator==(const QRegExp &rx) const
3999{
4000 return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
4001}
4002
4003/*!
4004 \since 5.6
4005 \relates QRegExp
4006
4007 Returns the hash value for \a key, using
4008 \a seed to seed the calculation.
4009*/
4010size_t qHash(const QRegExp &key, size_t seed) noexcept
4011{
4012 QtPrivate::QHashCombine hash;
4013 seed = hash(seed, key.priv->engineKey);
4014 seed = hash(seed, key.priv->minimal);
4015 return seed;
4016}
4017
4018/*!
4019 \fn bool QRegExp::operator!=(const QRegExp &rx) const
4020
4021 Returns \c true if this regular expression is not equal to \a rx;
4022 otherwise returns \c false.
4023
4024 \sa operator==()
4025*/
4026
4027/*!
4028 Returns \c true if the pattern string is empty; otherwise returns
4029 false.
4030
4031 If you call exactMatch() with an empty pattern on an empty string
4032 it will return true; otherwise it returns \c false since it operates
4033 over the whole string. If you call indexIn() with an empty pattern
4034 on \e any string it will return the start offset (0 by default)
4035 because the empty pattern matches the 'emptiness' at the start of
4036 the string. In this case the length of the match returned by
4037 matchedLength() will be 0.
4038
4039 See QString::isEmpty().
4040*/
4041
4042bool QRegExp::isEmpty() const
4043{
4044 return priv->engineKey.pattern.isEmpty();
4045}
4046
4047/*!
4048 Returns \c true if the regular expression is valid; otherwise returns
4049 false. An invalid regular expression never matches.
4050
4051 The pattern \b{[a-z} is an example of an invalid pattern, since
4052 it lacks a closing square bracket.
4053
4054 Note that the validity of a regexp may also depend on the setting
4055 of the wildcard flag, for example \b{*.html} is a valid
4056 wildcard regexp but an invalid full regexp.
4057
4058 \sa errorString()
4059*/
4060bool QRegExp::isValid() const
4061{
4062 if (priv->engineKey.pattern.isEmpty()) {
4063 return true;
4064 } else {
4065 prepareEngine(priv);
4066 return priv->eng->isValid();
4067 }
4068}
4069
4070/*!
4071 Returns the pattern string of the regular expression. The pattern
4072 has either regular expression syntax or wildcard syntax, depending
4073 on patternSyntax().
4074
4075 \sa patternSyntax(), caseSensitivity()
4076*/
4077QString QRegExp::pattern() const
4078{
4079 return priv->engineKey.pattern;
4080}
4081
4082/*!
4083 Sets the pattern string to \a pattern. The case sensitivity,
4084 wildcard, and minimal matching options are not changed.
4085
4086 \sa setPatternSyntax(), setCaseSensitivity()
4087*/
4088void QRegExp::setPattern(const QString &pattern)
4089{
4090 if (priv->engineKey.pattern != pattern) {
4091 invalidateEngine(priv);
4092 priv->engineKey.pattern = pattern;
4093 }
4094}
4095
4096/*!
4097 Returns Qt::CaseSensitive if the regexp is matched case
4098 sensitively; otherwise returns Qt::CaseInsensitive.
4099
4100 \sa patternSyntax(), pattern(), isMinimal()
4101*/
4102Qt::CaseSensitivity QRegExp::caseSensitivity() const
4103{
4104 return priv->engineKey.cs;
4105}
4106
4107/*!
4108 Sets case sensitive matching to \a cs.
4109
4110 If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches
4111 \c{readme.txt} but not \c{README.TXT}.
4112
4113 \sa setPatternSyntax(), setPattern(), setMinimal()
4114*/
4115void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
4116{
4117 if ((bool)cs != (bool)priv->engineKey.cs) {
4118 invalidateEngine(priv);
4119 priv->engineKey.cs = cs;
4120 }
4121}
4122
4123/*!
4124 Returns the syntax used by the regular expression. The default is
4125 QRegExp::RegExp.
4126
4127 \sa pattern(), caseSensitivity()
4128*/
4129QRegExp::PatternSyntax QRegExp::patternSyntax() const
4130{
4131 return priv->engineKey.patternSyntax;
4132}
4133
4134/*!
4135 Sets the syntax mode for the regular expression. The default is
4136 QRegExp::RegExp.
4137
4138 Setting \a syntax to QRegExp::Wildcard enables simple shell-like
4139 \l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the
4140 string \c{readme.txt} in wildcard mode, but does not match
4141 \c{readme}.
4142
4143 Setting \a syntax to QRegExp::FixedString means that the pattern
4144 is interpreted as a plain string. Special characters (e.g.,
4145 backslash) don't need to be escaped then.
4146
4147 \sa setPattern(), setCaseSensitivity(), escape()
4148*/
4149void QRegExp::setPatternSyntax(PatternSyntax syntax)
4150{
4151 if (syntax != priv->engineKey.patternSyntax) {
4152 invalidateEngine(priv);
4153 priv->engineKey.patternSyntax = syntax;
4154 }
4155}
4156
4157/*!
4158 Returns \c true if minimal (non-greedy) matching is enabled;
4159 otherwise returns \c false.
4160
4161 \sa caseSensitivity(), setMinimal()
4162*/
4163bool QRegExp::isMinimal() const
4164{
4165 return priv->minimal;
4166}
4167
4168/*!
4169 Enables or disables minimal matching. If \a minimal is false,
4170 matching is greedy (maximal) which is the default.
4171
4172 For example, suppose we have the input string "We must be
4173 <b>bold</b>, very <b>bold</b>!" and the pattern
4174 \b{<b>.*</b>}. With the default greedy (maximal) matching,
4175 the match is "We must be \underline{<b>bold</b>, very
4176 <b>bold</b>}!". But with minimal (non-greedy) matching, the
4177 first match is: "We must be \underline{<b>bold</b>}, very
4178 <b>bold</b>!" and the second match is "We must be <b>bold</b>,
4179 very \underline{<b>bold</b>}!". In practice we might use the pattern
4180 \b{<b>[^<]*\</b>} instead, although this will still fail for
4181 nested tags.
4182
4183 \sa setCaseSensitivity()
4184*/
4185void QRegExp::setMinimal(bool minimal)
4186{
4187 priv->minimal = minimal;
4188}
4189
4190// ### Qt 5: make non-const
4191/*!
4192 Returns \c true if \a str is matched exactly by this regular
4193 expression; otherwise returns \c false. You can determine how much of
4194 the string was matched by calling matchedLength().
4195
4196 For a given regexp string R, exactMatch("R") is the equivalent of
4197 indexIn("^R$") since exactMatch() effectively encloses the regexp
4198 in the start of string and end of string anchors, except that it
4199 sets matchedLength() differently.
4200
4201 For example, if the regular expression is \b{blue}, then
4202 exactMatch() returns \c true only for input \c blue. For inputs \c
4203 bluebell, \c blutak and \c lightblue, exactMatch() returns \c false
4204 and matchedLength() will return 4, 3 and 0 respectively.
4205
4206 Although const, this function sets matchedLength(),
4207 capturedTexts(), and pos().
4208
4209 \sa indexIn(), lastIndexIn()
4210*/
4211bool QRegExp::exactMatch(const QString &str) const
4212{
4213 prepareEngineForMatch(priv, str);
4214 priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: 0, minimal0: priv->minimal, oneTest: true, caretIndex: 0);
4215 if (priv->matchState.captured[1] == str.size()) {
4216 return true;
4217 } else {
4218 priv->matchState.captured[0] = 0;
4219 priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
4220 return false;
4221 }
4222}
4223
4224/*!
4225 Returns the regexp as a QVariant
4226*/
4227QRegExp::operator QVariant() const
4228{
4229QT_WARNING_PUSH QT_WARNING_DISABLE_DEPRECATED
4230 QVariant v;
4231 v.setValue(*this);
4232 return v;
4233QT_WARNING_POP
4234}
4235
4236// ### Qt 5: make non-const
4237/*!
4238 Attempts to find a match in \a str from position \a offset (0 by
4239 default). If \a offset is -1, the search starts at the last
4240 character; if -2, at the next to last character; etc.
4241
4242 Returns the position of the first match, or -1 if there was no
4243 match.
4244
4245 The \a caretMode parameter can be used to instruct whether \b{^}
4246 should match at index 0 or at \a offset.
4247
4248 You might prefer to use QString::indexOf(), QString::contains(),
4249 or even QStringList::filter(). To replace matches use
4250 QString::replace().
4251
4252 Example:
4253 \snippet code/src_corelib_text_qregexp.cpp 13
4254
4255 Although const, this function sets matchedLength(),
4256 capturedTexts() and pos().
4257
4258 If the QRegExp is a wildcard expression (see setPatternSyntax())
    and you want to test a string against the whole wildcard expression,
4260 use exactMatch() instead of this function.
4261
4262 \sa lastIndexIn(), exactMatch()
4263*/
4264
4265int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4266{
4267 prepareEngineForMatch(priv, str);
4268 if (offset < 0)
4269 offset += str.size();
4270 priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: offset,
4271 minimal0: priv->minimal, oneTest: false, caretIndex: caretIndex(offset, caretMode));
4272 return priv->matchState.captured[0];
4273}
4274
4275// ### Qt 5: make non-const
4276/*!
4277 Attempts to find a match backwards in \a str from position \a
4278 offset. If \a offset is -1 (the default), the search starts at the
4279 last character; if -2, at the next to last character; etc.
4280
4281 Returns the position of the first match, or -1 if there was no
4282 match.
4283
4284 The \a caretMode parameter can be used to instruct whether \b{^}
4285 should match at index 0 or at \a offset.
4286
4287 Although const, this function sets matchedLength(),
4288 capturedTexts() and pos().
4289
4290 \warning Searching backwards is much slower than searching
4291 forwards.
4292
4293 \sa indexIn(), exactMatch()
4294*/
4295
4296int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4297{
4298 prepareEngineForMatch(priv, str);
4299 if (offset < 0)
4300 offset += str.size();
4301 if (offset < 0 || offset > str.size()) {
4302 memset(s: priv->matchState.captured, c: -1, n: priv->matchState.capturedSize*sizeof(int));
4303 return -1;
4304 }
4305
4306 while (offset >= 0) {
4307 priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: offset,
4308 minimal0: priv->minimal, oneTest: true, caretIndex: caretIndex(offset, caretMode));
4309 if (priv->matchState.captured[0] == offset)
4310 return offset;
4311 --offset;
4312 }
4313 return -1;
4314}
4315
4316/*!
4317 Returns the length of the last matched string, or -1 if there was
4318 no match.
4319
4320 \sa exactMatch(), indexIn(), lastIndexIn()
4321*/
4322int QRegExp::matchedLength() const
4323{
4324 return priv->matchState.captured[1];
4325}
4326
4327
4328/*!
4329 Replaces every occurrence of this regular expression in
4330 \a str with \a after and returns the result.
4331
4332 For regular expressions containing \l{capturing parentheses},
4333 occurrences of \b{\\1}, \b{\\2}, ..., in \a after are replaced
4334 with \c {rx}.cap(1), cap(2), ...
4335
4336 \sa indexIn(), lastIndexIn(), QRegExp::cap()
4337*/
4338QString QRegExp::replaceIn(const QString &str, const QString &after) const
4339{
4340 struct QStringCapture
4341 {
4342 int pos;
4343 int len;
4344 int no;
4345 };
4346
4347 QRegExp rx2(*this);
4348
4349 if (str.isEmpty() && rx2.indexIn(str) == -1)
4350 return str;
4351
4352 QString s(str);
4353
4354 int index = 0;
4355 int numCaptures = rx2.captureCount();
4356 int al = after.size();
4357 QRegExp::CaretMode caretMode = QRegExp::CaretAtZero;
4358
4359 if (numCaptures > 0) {
4360 const QChar *uc = after.unicode();
4361 int numBackRefs = 0;
4362
4363 for (int i = 0; i < al - 1; i++) {
4364 if (uc[i] == QLatin1Char('\\')) {
4365 int no = uc[i + 1].digitValue();
4366 if (no > 0 && no <= numCaptures)
4367 numBackRefs++;
4368 }
4369 }
4370
4371 /*
4372 This is the harder case where we have back-references.
4373 */
4374 if (numBackRefs > 0) {
4375 QVarLengthArray<QStringCapture, 16> captures(numBackRefs);
4376 int j = 0;
4377
4378 for (int i = 0; i < al - 1; i++) {
4379 if (uc[i] == QLatin1Char('\\')) {
4380 int no = uc[i + 1].digitValue();
4381 if (no > 0 && no <= numCaptures) {
4382 QStringCapture capture;
4383 capture.pos = i;
4384 capture.len = 2;
4385
4386 if (i < al - 2) {
4387 int secondDigit = uc[i + 2].digitValue();
4388 if (secondDigit != -1 && ((no * 10) + secondDigit) <= numCaptures) {
4389 no = (no * 10) + secondDigit;
4390 ++capture.len;
4391 }
4392 }
4393
4394 capture.no = no;
4395 captures[j++] = capture;
4396 }
4397 }
4398 }
4399
4400 while (index <= s.size()) {
4401 index = rx2.indexIn(str: s, offset: index, caretMode);
4402 if (index == -1)
4403 break;
4404
4405 QString after2(after);
4406 for (j = numBackRefs - 1; j >= 0; j--) {
4407 const QStringCapture &capture = captures[j];
4408 after2.replace(i: capture.pos, len: capture.len, after: rx2.cap(nth: capture.no));
4409 }
4410
4411 s.replace(i: index, len: rx2.matchedLength(), after: after2);
4412 index += after2.size();
4413
4414 // avoid infinite loop on 0-length matches (e.g., QRegExp("[a-z]*"))
4415 if (rx2.matchedLength() == 0)
4416 ++index;
4417
4418 caretMode = QRegExp::CaretWontMatch;
4419 }
4420 return s;
4421 }
4422 }
4423
4424 /*
4425 This is the simple and optimized case where we don't have
4426 back-references.
4427 */
4428 while (index != -1) {
4429 struct {
4430 int pos;
4431 int length;
4432 } replacements[2048];
4433
4434 int pos = 0;
4435 int adjust = 0;
4436 while (pos < 2047) {
4437 index = rx2.indexIn(str: s, offset: index, caretMode);
4438 if (index == -1)
4439 break;
4440 int ml = rx2.matchedLength();
4441 replacements[pos].pos = index;
4442 replacements[pos++].length = ml;
4443 index += ml;
4444 adjust += al - ml;
4445 // avoid infinite loop
4446 if (!ml)
4447 index++;
4448 }
4449 if (!pos)
4450 break;
4451 replacements[pos].pos = s.size();
4452 int newlen = s.size() + adjust;
4453
4454 // to continue searching at the right position after we did
4455 // the first round of replacements
4456 if (index != -1)
4457 index += adjust;
4458 QString newstring;
4459 newstring.reserve(asize: newlen + 1);
4460 QChar *newuc = newstring.data();
4461 QChar *uc = newuc;
4462 int copystart = 0;
4463 int i = 0;
4464 while (i < pos) {
4465 int copyend = replacements[i].pos;
4466 int size = copyend - copystart;
4467 memcpy(dest: static_cast<void*>(uc), src: static_cast<const void *>(s.constData() + copystart), n: size * sizeof(QChar));
4468 uc += size;
4469 memcpy(dest: static_cast<void *>(uc), src: static_cast<const void *>(after.constData()), n: al * sizeof(QChar));
4470 uc += al;
4471 copystart = copyend + replacements[i].length;
4472 i++;
4473 }
4474 memcpy(dest: static_cast<void *>(uc), src: static_cast<const void *>(s.constData() + copystart), n: (s.size() - copystart) * sizeof(QChar));
4475 newstring.resize(size: newlen);
4476 s = newstring;
4477 caretMode = QRegExp::CaretWontMatch;
4478 }
4479 return s;
4480
4481}
4482
4483
4484/*!
4485 \fn QString QRegExp::removeIn(const QString &str) const
4486
    Removes every occurrence of this regular expression in \a str, and
    returns the result.
4489
4490 Does the same as replaceIn(str, QString()).
4491
4492 \sa indexIn(), lastIndexIn(), replaceIn()
4493*/
4494
4495
4496/*!
    \fn int QRegExp::countIn(const QString &str) const
4498
4499 Returns the number of times this regular expression matches
4500 in \a str.
4501
4502 \sa indexIn(), lastIndexIn(), replaceIn()
4503*/
4504
4505int QRegExp::countIn(const QString &str) const
4506{
4507 QRegExp rx2(*this);
4508 int count = 0;
4509 int index = -1;
4510 int len = str.size();
4511 while (index < len - 1) { // count overlapping matches
4512 index = rx2.indexIn(str, offset: index + 1);
4513 if (index == -1)
4514 break;
4515 count++;
4516 }
4517 return count;
4518}
4519
4520/*!
4521 Splits \a str into substrings wherever this regular expression
4522 matches, and returns the list of those strings. If this regular
4523 expression does not match anywhere in the string, split() returns a
4524 single-element list containing \a str.
4525
4526 If \a behavior is set to Qt::KeepEmptyParts, empty fields are
4527 included in the resulting list.
4528
4529 \sa QStringList::join(), QString::split()
4530*/
4531QStringList QRegExp::splitString(const QString &str, Qt::SplitBehavior behavior) const
4532{
4533 QRegExp rx2(*this);
4534 QStringList list;
4535 int start = 0;
4536 int extra = 0;
4537 int end;
4538 while ((end = rx2.indexIn(str, offset: start + extra)) != -1) {
4539 int matchedLen = rx2.matchedLength();
4540 if (start != end || behavior == Qt::KeepEmptyParts)
4541 list.append(t: str.mid(position: start, n: end - start));
4542 start = end + matchedLen;
4543 extra = (matchedLen == 0) ? 1 : 0;
4544 }
4545 if (start != str.size() || behavior == Qt::KeepEmptyParts)
4546 list.append(t: str.mid(position: start, n: -1));
4547 return list;
4548}
4549
4550/*!
4551 Returns a list of all the strings that match this regular
4552 expression in \a stringList.
4553*/
4554QStringList QRegExp::filterList(const QStringList &stringList) const
4555{
4556 QStringList res;
4557 for (const QString &s : stringList) {
4558 if (containedIn(str: s))
4559 res << s;
4560 }
4561 return res;
4562}
4563
4564/*!
    Replaces every occurrence of this regexp in each string of
    \a stringList with \a after, and returns the resulting list.
4567*/
4568QStringList QRegExp::replaceIn(const QStringList &stringList, const QString &after) const
4569{
4570 QStringList list;
4571 for (const QString &s : stringList)
4572 list << replaceIn(str: s, after);
4573 return list;
4574}
4575
4576/*!
4577 Returns the index position of the first exact match of this regexp in
4578 \a list, searching forward from index position \a from. Returns
4579 -1 if no item matched.
4580
4581 \sa lastIndexIn(), exactMatch()
4582*/
4583int QRegExp::indexIn(const QStringList &list, int from) const
4584{
4585 QRegExp rx2(*this);
4586 if (from < 0)
4587 from = qMax(a: from + list.size(), b: 0);
4588 for (int i = from; i < list.size(); ++i) {
4589 if (rx2.exactMatch(str: list.at(i)))
4590 return i;
4591 }
4592 return -1;
4593}
4594
4595/*!
4596 Returns the index position of the last exact match of this regexp in
4597 \a list, searching backward from index position \a from. If \a
4598 from is -1 (the default), the search starts at the last item.
4599 Returns -1 if no item matched.
4600
4601 \sa QRegExp::exactMatch()
4602*/
4603int QRegExp::lastIndexIn(const QStringList &list, int from) const
4604{
4605 QRegExp rx2(*this);
4606 if (from < 0)
4607 from += list.size();
4608 else if (from >= list.size())
4609 from = list.size() - 1;
4610 for (int i = from; i >= 0; --i) {
4611 if (rx2.exactMatch(str: list.at(i)))
4612 return i;
4613 }
4614 return -1;
4615}
4616
4617#ifndef QT_NO_REGEXP_CAPTURE
4618
4619/*!
4620 \since 4.6
4621 Returns the number of captures contained in the regular expression.
4622 */
4623int QRegExp::captureCount() const
4624{
4625 prepareEngine(priv);
4626 return priv->eng->captureCount();
4627}
4628
4629/*!
4630 Returns a list of the captured text strings.
4631
4632 The first string in the list is the entire matched string. Each
4633 subsequent list element contains a string that matched a
4634 (capturing) subexpression of the regexp.
4635
4636 For example:
4637 \snippet code/src_corelib_text_qregexp.cpp 14
4638
4639 The above example also captures elements that may be present but
4640 which we have no interest in. This problem can be solved by using
4641 non-capturing parentheses:
4642
4643 \snippet code/src_corelib_text_qregexp.cpp 15
4644
4645 Note that if you want to iterate over the list, you should iterate
4646 over a copy, e.g.
4647 \snippet code/src_corelib_text_qregexp.cpp 16
4648
4649 Some regexps can match an indeterminate number of times. For
4650 example if the input string is "Offsets: 12 14 99 231 7" and the
4651 regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of
4652 all the numbers matched. However, after calling
4653 \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4654 "12"), i.e. the entire match was "12" and the first subexpression
4655 matched was "12". The correct approach is to use cap() in a
4656 \l{QRegExp#cap_in_a_loop}{loop}.
4657
4658 The order of elements in the string list is as follows. The first
4659 element is the entire matching string. Each subsequent element
4660 corresponds to the next capturing open left parentheses. Thus
4661 capturedTexts()[1] is the text of the first capturing parentheses,
4662 capturedTexts()[2] is the text of the second and so on
4663 (corresponding to $1, $2, etc., in some other regexp languages).
4664
4665 \sa cap(), pos()
4666*/
4667QStringList QRegExp::capturedTexts() const
4668{
4669 if (priv->capturedCache.isEmpty()) {
4670 prepareEngine(priv);
4671 const int *captured = priv->matchState.captured;
4672 int n = priv->matchState.capturedSize;
4673
4674 for (int i = 0; i < n; i += 2) {
4675 QString m;
4676 if (captured[i + 1] == 0)
4677 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
4678 else if (captured[i] >= 0)
4679 m = priv->t.mid(position: captured[i], n: captured[i + 1]);
4680 priv->capturedCache.append(t: m);
4681 }
4682 priv->t.clear();
4683 }
4684 return priv->capturedCache;
4685}
4686
4687/*!
4688 \internal
4689*/
4690QStringList QRegExp::capturedTexts()
4691{
4692 return const_cast<const QRegExp *>(this)->capturedTexts();
4693}
4694
4695/*!
4696 Returns the text captured by the \a nth subexpression. The entire
4697 match has index 0 and the parenthesized subexpressions have
4698 indexes starting from 1 (excluding non-capturing parentheses).
4699
4700 \snippet code/src_corelib_text_qregexp.cpp 17
4701
4702 The order of elements matched by cap() is as follows. The first
4703 element, cap(0), is the entire matching string. Each subsequent
4704 element corresponds to the next capturing open left parentheses.
4705 Thus cap(1) is the text of the first capturing parentheses, cap(2)
4706 is the text of the second, and so on.
4707
4708 \sa capturedTexts(), pos()
4709*/
4710QString QRegExp::cap(int nth) const
4711{
4712 return capturedTexts().value(i: nth);
4713}
4714
4715/*!
4716 \internal
4717*/
4718QString QRegExp::cap(int nth)
4719{
4720 return const_cast<const QRegExp *>(this)->cap(nth);
4721}
4722
4723/*!
4724 Returns the position of the \a nth captured text in the searched
4725 string. If \a nth is 0 (the default), pos() returns the position
4726 of the whole match.
4727
4728 Example:
4729 \snippet code/src_corelib_text_qregexp.cpp 18
4730
4731 For zero-length matches, pos() always returns -1. (For example, if
4732 cap(4) would return an empty string, pos(4) returns -1.) This is
4733 a feature of the implementation.
4734
4735 \sa cap(), capturedTexts()
4736*/
4737int QRegExp::pos(int nth) const
4738{
4739 if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
4740 return -1;
4741 else
4742 return priv->matchState.captured[2 * nth];
4743}
4744
4745/*!
4746 \internal
4747*/
4748int QRegExp::pos(int nth)
4749{
4750 return const_cast<const QRegExp *>(this)->pos(nth);
4751}
4752
4753/*!
    Returns a text string that explains why a regexp pattern is
    invalid, if that is the case; otherwise returns "no error occurred".
4756
4757 \sa isValid()
4758*/
4759QString QRegExp::errorString() const
4760{
4761 if (isValid()) {
4762 return QString::fromLatin1(RXERR_OK);
4763 } else {
4764 return priv->eng->errorString();
4765 }
4766}
4767
4768/*!
4769 \internal
4770*/
4771QString QRegExp::errorString()
4772{
4773 return const_cast<const QRegExp *>(this)->errorString();
4774}
4775
4776#endif
4777
4778/*!
4779 Returns the string \a str with every regexp special character
    escaped with a backslash. The special characters are $, (, ), *, +,
    ., ?, [, \, ], ^, {, | and }.
4782
4783 Example:
4784
4785 \snippet code/src_corelib_text_qregexp.cpp 19
4786
4787 This function is useful to construct regexp patterns dynamically:
4788
4789 \snippet code/src_corelib_text_qregexp.cpp 20
4790
4791 \sa setPatternSyntax()
4792*/
4793QString QRegExp::escape(const QString &str)
4794{
4795 QString quoted;
4796 const int count = str.size();
4797 quoted.reserve(asize: count * 2);
4798 const QLatin1Char backslash('\\');
4799 for (int i = 0; i < count; i++) {
4800 switch (str.at(i).toLatin1()) {
4801 case '$':
4802 case '(':
4803 case ')':
4804 case '*':
4805 case '+':
4806 case '.':
4807 case '?':
4808 case '[':
4809 case '\\':
4810 case ']':
4811 case '^':
4812 case '{':
4813 case '|':
4814 case '}':
4815 quoted.append(c: backslash);
4816 }
4817 quoted.append(c: str.at(i));
4818 }
4819 return quoted;
4820}
4821
4822
4823#ifndef QT_NO_DATASTREAM
4824/*!
4825 \relates QRegExp
4826
4827 Writes the regular expression \a regExp to stream \a out.
4828
4829 \sa {Serializing Qt Data Types}
4830*/
4831QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
4832{
4833 return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4834 << (quint8)regExp.patternSyntax()
4835 << (quint8)!!regExp.isMinimal();
4836}
4837
4838/*!
4839 \relates QRegExp
4840
4841 Reads a regular expression from stream \a in into \a regExp.
4842
4843 \sa {Serializing Qt Data Types}
4844*/
4845QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
4846{
4847 QString pattern;
4848 quint8 cs;
4849 quint8 patternSyntax;
4850 quint8 isMinimal;
4851
4852 in >> pattern >> cs >> patternSyntax >> isMinimal;
4853
4854 QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4855 QRegExp::PatternSyntax(patternSyntax));
4856
4857 newRegExp.setMinimal(isMinimal);
4858 regExp = newRegExp;
4859 return in;
4860}
4861#endif // QT_NO_DATASTREAM
4862
4863#ifndef QT_NO_DEBUG_STREAM
4864QDebug operator<<(QDebug dbg, const QRegExp &r)
4865{
4866 QDebugStateSaver saver(dbg);
4867 dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax()
4868 << ", pattern='"<< r.pattern() << "')";
4869 return dbg;
4870}
4871#endif
4872
4873QT_END_NAMESPACE
4874

// source code of qt5compat/src/core5/text/qregexp.cpp