1/****************************************************************************
2**
3** Copyright (C) 2016 The Qt Company Ltd.
4** Contact: https://www.qt.io/licensing/
5**
6** This file is part of the QtCore module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial License Usage
10** Licensees holding valid commercial Qt licenses may use this file in
11** accordance with the commercial license agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and The Qt Company. For licensing terms
14** and conditions see https://www.qt.io/terms-conditions. For further
15** information use the contact form at https://www.qt.io/contact-us.
16**
17** GNU Lesser General Public License Usage
18** Alternatively, this file may be used under the terms of the GNU Lesser
19** General Public License version 3 as published by the Free Software
20** Foundation and appearing in the file LICENSE.LGPL3 included in the
21** packaging of this file. Please review the following information to
22** ensure the GNU Lesser General Public License version 3 requirements
23** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24**
25** GNU General Public License Usage
26** Alternatively, this file may be used under the terms of the GNU
27** General Public License version 2.0 or (at your option) the GNU General
28** Public license version 3 or any later version approved by the KDE Free
29** Qt Foundation. The licenses are as published by the Free Software
30** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31** included in the packaging of this file. Please review the following
32** information to ensure the GNU General Public License requirements will
33** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34** https://www.gnu.org/licenses/gpl-3.0.html.
35**
36** $QT_END_LICENSE$
37**
38****************************************************************************/
39
40#include "qregexp.h"
41
42#include "qalgorithms.h"
43#include "qbitarray.h"
44#include "qcache.h"
45#include "qdatastream.h"
46#include "qdebug.h"
47#include "qhashfunctions.h"
48#include "qlist.h"
49#include "qmap.h"
50#include "qmutex.h"
51#include "qstring.h"
52#include "qstringlist.h"
53#include "qstringmatcher.h"
54#include "qvector.h"
55#include "private/qlocking_p.h"
56
57#include <limits.h>
58#include <algorithm>
59
60QT_BEGIN_NAMESPACE
61
62// error strings for the regexp parser
63#define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
64#define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
65#define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
66#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
67#define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
68#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
69#define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
70#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
71#define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
72#define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
73#define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
74#define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")
75
76/*!
77 \class QRegExp
78 \inmodule QtCore
79 \reentrant
80 \brief The QRegExp class provides pattern matching using regular expressions.
81
82 \ingroup tools
83 \ingroup shared
84
85 \keyword regular expression
86
87 A regular expression, or "regexp", is a pattern for matching
88 substrings in a text. This is useful in many contexts, e.g.,
89
90 \table
91 \row \li Validation
92 \li A regexp can test whether a substring meets some criteria,
93 e.g. is an integer or contains no whitespace.
94 \row \li Searching
95 \li A regexp provides more powerful pattern matching than
96 simple substring matching, e.g., match one of the words
97 \e{mail}, \e{letter} or \e{correspondence}, but none of the
98 words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
99 \row \li Search and Replace
100 \li A regexp can replace all occurrences of a substring with a
101 different substring, e.g., replace all occurrences of \e{&}
102 with \e{\&amp;} except where the \e{&} is already followed by
103 an \e{amp;}.
104 \row \li String Splitting
105 \li A regexp can be used to identify where a string should be
106 split apart, e.g. splitting tab-delimited strings.
107 \endtable
108
109 A brief introduction to regexps is presented, a description of
110 Qt's regexp language, some examples, and the function
111 documentation itself. QRegExp is modeled on Perl's regexp
112 language. It fully supports Unicode. QRegExp can also be used in a
113 simpler, \e{wildcard mode} that is similar to the functionality
114 found in command shells. The syntax rules used by QRegExp can be
115 changed with setPatternSyntax(). In particular, the pattern syntax
116 can be set to QRegExp::FixedString, which means the pattern to be
117 matched is interpreted as a plain string, i.e., special characters
118 (e.g., backslash) are not escaped.
119
120 A good text on regexps is \e {Mastering Regular Expressions}
121 (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.
122
123 \note In Qt 5, the new QRegularExpression class provides a Perl
124 compatible implementation of regular expressions and is recommended
125 in place of QRegExp.
126
127 \tableofcontents
128
129 \section1 Introduction
130
131 Regexps are built up from expressions, quantifiers, and
132 assertions. The simplest expression is a character, e.g. \b{x}
133 or \b{5}. An expression can also be a set of characters
134 enclosed in square brackets. \b{[ABCD]} will match an \b{A}
135 or a \b{B} or a \b{C} or a \b{D}. We can write this same
136 expression as \b{[A-D]}, and an expression to match any
137 capital letter in the English alphabet is written as
138 \b{[A-Z]}.
139
140 A quantifier specifies the number of occurrences of an expression
141 that must be matched. \b{x{1,1}} means match one and only one
142 \b{x}. \b{x{1,5}} means match a sequence of \b{x}
143 characters that contains at least one \b{x} but no more than
144 five.
145
146 Note that in general regexps cannot be used to check for balanced
147 brackets or tags. For example, a regexp can be written to match an
148 opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
149 are not nested, but if the \c{<b>} tags are nested, that same
150 regexp will match an opening \c{<b>} tag with the wrong closing
151 \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the
152 first \c{<b>} would be matched with the first \c{</b>}, which is
153 not correct. However, it is possible to write a regexp that will
154 match nested brackets or tags correctly, but only if the number of
155 nesting levels is fixed and known. If the number of nesting levels
156 is not fixed and known, it is impossible to write a regexp that
157 will not fail.
158
159 Suppose we want a regexp to match integers in the range 0 to 99.
160 At least one digit is required, so we start with the expression
161 \b{[0-9]{1,1}}, which matches a single digit exactly once. This
162 regexp matches integers in the range 0 to 9. To match integers up
163 to 99, increase the maximum number of occurrences to 2, so the
164 regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the
165 original requirement to match integers from 0 to 99, but it will
166 also match integers that occur in the middle of strings. If we
167 want the matched integer to be the whole string, we must use the
168 anchor assertions, \b{^} (caret) and \b{$} (dollar). When
169 \b{^} is the first character in a regexp, it means the regexp
170 must match from the beginning of the string. When \b{$} is the
171 last character of the regexp, it means the regexp must match to
172 the end of the string. The regexp becomes \b{^[0-9]{1,2}$}.
173 Note that assertions, e.g. \b{^} and \b{$}, do not match
174 characters but locations in the string.
175
176 If you have seen regexps described elsewhere, they may have looked
177 different from the ones shown here. This is because some sets of
178 characters and some quantifiers are so common that they have been
179 given special symbols to represent them. \b{[0-9]} can be
180 replaced with the symbol \b{\\d}. The quantifier to match
181 exactly one occurrence, \b{{1,1}}, can be replaced with the
182 expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So
183 our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can
184 also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of
185 the string, match a digit, followed immediately by 0 or 1 digits}.
186 In practice, it would be written as \b{^\\d\\d?$}. The \b{?}
187 is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1
188 occurrences. \b{?} makes an expression optional. The regexp
189 \b{^\\d\\d?$} means \e{From the beginning of the string, match
190 one digit, followed immediately by 0 or 1 more digit, followed
191 immediately by end of string}.
192
193 To write a regexp that matches one of the words 'mail' \e or
194 'letter' \e or 'correspondence' but does not match words that
195 contain these words, e.g., 'email', 'mailman', 'mailer', and
196 'letterbox', start with a regexp that matches 'mail'. Expressed
197 fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
198 a character expression is automatically quantified by
199 \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an
200 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
201 we can use the vertical bar \b{|}, which means \b{or}, to
202 include the other two words, so our regexp for matching any of the
203 three words becomes \b{mail|letter|correspondence}. Match
204 'mail' \b{or} 'letter' \b{or} 'correspondence'. While this
205 regexp will match one of the three words we want to match, it will
206 also match words we don't want to match, e.g., 'email'. To
207 prevent the regexp from matching unwanted words, we must tell it
208 to begin and end the match at word boundaries. First we enclose
209 our regexp in parentheses, \b{(mail|letter|correspondence)}.
210 Parentheses group expressions together, and they identify a part
211 of the regexp that we wish to \l{capturing text}{capture}.
212 Enclosing the expression in parentheses allows us to use it as a
213 component in more complex regexps. It also allows us to examine
214 which of the three words was actually matched. To force the match
215 to begin and end on word boundaries, we enclose the regexp in
216 \b{\\b} \e{word boundary} assertions:
217 \b{\\b(mail|letter|correspondence)\\b}. Now the regexp means:
218 \e{Match a word boundary, followed by the regexp in parentheses,
219 followed by a word boundary}. The \b{\\b} assertion matches a
220 \e position in the regexp, not a \e character. A word boundary is
221 any non-word character, e.g., a space, newline, or the beginning
222 or ending of a string.
223
224 If we want to replace ampersand characters with the HTML entity
225 \b{\&amp;}, the regexp to match is simply \b{\&}. But this
226 regexp will also match ampersands that have already been converted
227 to HTML entities. We want to replace only ampersands that are not
228 already followed by \b{amp;}. For this, we need the negative
229 lookahead assertion, \b{(?!}__\b{)}. The regexp can then be
230 written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
231 \b{not} \e{followed by} \b{amp;}.
232
233 If we want to count all the occurrences of 'Eric' and 'Eirik' in a
234 string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and
235 \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
236 required to avoid matching words that contain either name,
237 e.g. 'Ericsson'. Note that the second regexp matches more
238 spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
239
240 Some of the examples discussed above are implemented in the
241 \l{#code-examples}{code examples} section.
242
243 \target characters-and-abbreviations-for-sets-of-characters
244 \section1 Characters and Abbreviations for Sets of Characters
245
246 \table
247 \header \li Element \li Meaning
248 \row \li \b{c}
249 \li A character represents itself unless it has a special
250 regexp meaning. e.g. \b{c} matches the character \e c.
251 \row \li \b{\\c}
252 \li A character that follows a backslash matches the character
253 itself, except as specified below. e.g., To match a literal
254 caret at the beginning of a string, write \b{\\^}.
255 \row \li \b{\\a}
256 \li Matches the ASCII bell (BEL, 0x07).
257 \row \li \b{\\f}
258 \li Matches the ASCII form feed (FF, 0x0C).
259 \row \li \b{\\n}
260 \li Matches the ASCII line feed (LF, 0x0A, Unix newline).
261 \row \li \b{\\r}
262 \li Matches the ASCII carriage return (CR, 0x0D).
263 \row \li \b{\\t}
264 \li Matches the ASCII horizontal tab (HT, 0x09).
265 \row \li \b{\\v}
266 \li Matches the ASCII vertical tab (VT, 0x0B).
267 \row \li \b{\\x\e{hhhh}}
268 \li Matches the Unicode character corresponding to the
269 hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
270 \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo})
271 \li matches the ASCII/Latin1 character for the octal number
272 \e{ooo} (between 0 and 0377).
273 \row \li \b{. (dot)}
274 \li Matches any character (including newline).
275 \row \li \b{\\d}
276 \li Matches a digit (QChar::isDigit()).
277 \row \li \b{\\D}
278 \li Matches a non-digit.
279 \row \li \b{\\s}
280 \li Matches a whitespace character (QChar::isSpace()).
281 \row \li \b{\\S}
282 \li Matches a non-whitespace character.
283 \row \li \b{\\w}
284 \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
285 \row \li \b{\\W}
286 \li Matches a non-word character.
287 \row \li \b{\\\e{n}}
288 \li The \e{n}-th backreference, e.g. \\1, \\2, etc.
289 \endtable
290
291 \b{Note:} The C++ compiler transforms backslashes in strings.
292 To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}.
293 To match the backslash character itself, enter it four times, i.e.
294 \c{\\\\}.
295
296 \target sets-of-characters
297 \section1 Sets of Characters
298
299 Square brackets mean match any character contained in the square
300 brackets. The character set abbreviations described above can
301 appear in a character set in square brackets. Except for the
302 character set abbreviations and the following two exceptions,
303 characters do not have special meanings in square brackets.
304
305 \table
306 \row \li \b{^}
307
308 \li The caret negates the character set if it occurs as the
309 first character (i.e. immediately after the opening square
310 bracket). \b{[abc]} matches 'a' or 'b' or 'c', but
311 \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
312
313 \row \li \b{-}
314
315 \li The dash indicates a range of characters. \b{[W-Z]}
316 matches 'W' or 'X' or 'Y' or 'Z'.
317
318 \endtable
319
320 Using the predefined character set abbreviations is more portable
321 than using character ranges across platforms and languages. For
322 example, \b{[0-9]} matches a digit in Western alphabets but
323 \b{\\d} matches a digit in \e any alphabet.
324
325 Note: In other regexp documentation, sets of characters are often
326 called "character classes".
327
328 \target quantifiers
329 \section1 Quantifiers
330
331 By default, an expression is automatically quantified by
332 \b{{1,1}}, i.e. it should occur exactly once. In the following
333 list, \b{\e {E}} stands for expression. An expression is a
334 character, or an abbreviation for a set of characters, or a set of
335 characters in square brackets, or an expression in parentheses.
336
337 \table
338 \row \li \b{\e {E}?}
339
340 \li Matches zero or one occurrences of \e E. This quantifier
341 means \e{The previous expression is optional}, because it
342 will match whether or not the expression is found. \b{\e
343 {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?}
344 matches 'dent' or 'dents'.
345
346 \row \li \b{\e {E}+}
347
348 \li Matches one or more occurrences of \e E. \b{\e {E}+} is
349 the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0',
350 '00', '000', etc.
351
352 \row \li \b{\e {E}*}
353
354 \li Matches zero or more occurrences of \e E. It is the same
355 as \b{\e {E}{0,}}. The \b{*} quantifier is often used
356 in error where \b{+} should be used. For example, if
357 \b{\\s*$} is used in an expression to match strings that
358 end in whitespace, it will match every string because
359 \b{\\s*$} means \e{Match zero or more whitespaces followed
360 by end of string}. The correct regexp to match strings that
361 have at least one trailing whitespace character is
362 \b{\\s+$}.
363
364 \row \li \b{\e {E}{n}}
365
366 \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}}
367 is the same as repeating \e E \e n times. For example,
368 \b{x{5}} is the same as \b{xxxxx}. It is also the same
369 as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}.
370
371 \row \li \b{\e {E}{n,}}
372 \li Matches at least \e n occurrences of \e E.
373
374 \row \li \b{\e {E}{,m}}
375 \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}}
376 is the same as \b{\e {E}{0,m}}.
377
378 \row \li \b{\e {E}{n,m}}
379 \li Matches at least \e n and at most \e m occurrences of \e E.
380 \endtable
381
382 To apply a quantifier to more than just the preceding character,
383 use parentheses to group characters together in an expression. For
384 example, \b{tag+} matches a 't' followed by an 'a' followed by
385 at least one 'g', whereas \b{(tag)+} matches at least one
386 occurrence of 'tag'.
387
388 Note: Quantifiers are normally "greedy". They always match as much
389 text as they can. For example, \b{0+} matches the first zero it
390 finds and all the consecutive zeros after the first zero. Applied
391 to '20005', it matches '2\underline{000}5'. Quantifiers can be made
392 non-greedy, see setMinimal().
393
394 \target capturing parentheses
395 \target backreferences
396 \section1 Capturing Text
397
398 Parentheses allow us to group elements together so that we can
399 quantify and capture them. For example if we have the expression
400 \b{mail|letter|correspondence} that matches a string we know
401 that \e one of the words matched but not which one. Using
402 parentheses allows us to "capture" whatever is matched within
403 their bounds, so if we used \b{(mail|letter|correspondence)}
404 and matched this regexp against the string "I sent you some email"
405 we can use the cap() or capturedTexts() functions to extract the
406 matched characters, in this case 'mail'.
407
408 We can use captured text within the regexp itself. To refer to the
409 captured text we use \e backreferences which are indexed from 1,
410 the same as for cap(). For example we could search for duplicate
411 words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a
412 word boundary followed by one or more word characters followed by
413 one or more non-word characters followed by the same text as the
414 first parenthesized expression followed by a word boundary.
415
416 If we want to use parentheses purely for grouping and not for
417 capturing we can use the non-capturing syntax, e.g.
418 \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
419 end ')'. In this example we match either 'green' or 'blue' but we
420 do not capture the match so we only know whether or not we matched
421 but not which color we actually found. Using non-capturing
422 parentheses is more efficient than using capturing parentheses
423 since the regexp engine has to do less book-keeping.
424
425 Both capturing and non-capturing parentheses may be nested.
426
427 \target greedy quantifiers
428
429 For historical reasons, quantifiers (e.g. \b{*}) that apply to
430 capturing parentheses are more "greedy" than other quantifiers.
431 For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa".
432 This behavior is different from what other regexp engines do
433 (notably, Perl). To obtain a more intuitive capturing behavior,
434 specify QRegExp::RegExp2 to the QRegExp constructor or call
435 setPatternSyntax(QRegExp::RegExp2).
436
437 \target cap_in_a_loop
438
439 When the number of matches cannot be determined in advance, a
440 common idiom is to use cap() in a loop. For example:
441
442 \snippet code/src_corelib_tools_qregexp.cpp 0
443
444 \target assertions
445 \section1 Assertions
446
447 Assertions make some statement about the text at the point where
448 they occur in the regexp but they do not match any characters. In
449 the following list \b{\e {E}} stands for any expression.
450
451 \table
452 \row \li \b{^}
453 \li The caret signifies the beginning of the string. If you
454 wish to match a literal \c{^} you must escape it by
455 writing \c{\\^}. For example, \b{^#include} will only
456 match strings which \e begin with the characters '#include'.
457 (When the caret is the first character of a character set it
458 has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.)
459
460 \row \li \b{$}
461 \li The dollar signifies the end of the string. For example
462 \b{\\d\\s*$} will match strings which end with a digit
463 optionally followed by whitespace. If you wish to match a
464 literal \c{$} you must escape it by writing
465 \c{\\$}.
466
467 \row \li \b{\\b}
468 \li A word boundary. For example the regexp
469 \b{\\bOK\\b} means match immediately after a word
470 boundary (e.g. start of string or whitespace) the letter 'O'
471 then the letter 'K' immediately before another word boundary
472 (e.g. end of string or whitespace). But note that the
473 assertion does not actually match any whitespace so if we
474 write \b{(\\bOK\\b)} and we have a match it will only
475 contain 'OK' even if the string is "It's \underline{OK} now".
476
477 \row \li \b{\\B}
478 \li A non-word boundary. This assertion is true wherever
479 \b{\\b} is false. For example if we searched for
480 \b{\\Bon\\B} in "Left on" the match would fail (space
481 and end of string aren't non-word boundaries), but it would
482 match in "t\underline{on}ne".
483
484 \row \li \b{(?=\e E)}
485 \li Positive lookahead. This assertion is true if the
486 expression matches at this point in the regexp. For example,
487 \b{const(?=\\s+char)} matches 'const' whenever it is
488 followed by 'char', as in 'static \underline{const} char *'.
489 (Compare with \b{const\\s+char}, which matches 'static
490 \underline{const char} *'.)
491
492 \row \li \b{(?!\e E)}
493 \li Negative lookahead. This assertion is true if the
494 expression does not match at this point in the regexp. For
495 example, \b{const(?!\\s+char)} matches 'const' \e except
496 when it is followed by 'char'.
497 \endtable
498
499 \target QRegExp wildcard matching
500 \section1 Wildcard Matching
501
502 Most command shells such as \e bash or \e cmd.exe support "file
503 globbing", the ability to identify a group of files by using
504 wildcards. The setPatternSyntax() function is used to switch
505 between regexp and wildcard mode. Wildcard matching is much
506 simpler than full regexps and has only four features:
507
508 \table
509 \row \li \b{c}
510 \li Any character represents itself apart from those mentioned
511 below. Thus \b{c} matches the character \e c.
512 \row \li \b{?}
513 \li Matches any single character. It is the same as
514 \b{.} in full regexps.
515 \row \li \b{*}
516 \li Matches zero or more of any characters. It is the
517 same as \b{.*} in full regexps.
518 \row \li \b{[...]}
519 \li Sets of characters can be represented in square brackets,
520 similar to full regexps. Within the character class, like
521 outside, backslash has no special meaning.
522 \endtable
523
524 In the mode Wildcard, the wildcard characters cannot be
525 escaped. In the mode WildcardUnix, the character '\\' escapes the
526 wildcard.
527
528 For example if we are in wildcard mode and have strings which
529 contain filenames we could identify HTML files with \b{*.html}.
530 This will match zero or more characters followed by a dot followed
531 by 'h', 't', 'm' and 'l'.
532
533 To test a string against a wildcard expression, use exactMatch().
534 For example:
535
536 \snippet code/src_corelib_tools_qregexp.cpp 1
537
538 \target perl-users
539 \section1 Notes for Perl Users
540
541 Most of the character class abbreviations supported by Perl are
542 supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters}
543 {characters and abbreviations for sets of characters}.
544
545 In QRegExp, apart from within character classes, \c{^} always
546 signifies the start of the string, so carets must always be
547 escaped unless used for that purpose. In Perl the meaning of caret
548 varies automagically depending on where it occurs so escaping it
549 is rarely necessary. The same applies to \c{$} which in
550 QRegExp always signifies the end of the string.
551
552 QRegExp's quantifiers are the same as Perl's greedy quantifiers
553 (but see the \l{greedy quantifiers}{note above}). Non-greedy
554 matching cannot be applied to individual quantifiers, but can be
555 applied to all the quantifiers in the pattern. For example, to
556 match the Perl regexp \b{ro+?m} requires:
557
558 \snippet code/src_corelib_tools_qregexp.cpp 2
559
560 The equivalent of Perl's \c{/i} option is
561 setCaseSensitivity(Qt::CaseInsensitive).
562
563 Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
564
565 In QRegExp \b{.} matches any character, therefore all QRegExp
566 regexps have the equivalent of Perl's \c{/s} option. QRegExp
567 does not have an equivalent to Perl's \c{/m} option, but this
568 can be emulated in various ways for example by splitting the input
569 into lines or by looping with a regexp that searches for newlines.
570
571 Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
572 assertions. The \\G assertion is not supported but can be emulated
573 in a loop.
574
575 Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
576 equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
577 ... correspond to cap(1) or capturedTexts()[1], cap(2) or
578 capturedTexts()[2], etc.
579
580 To substitute a pattern use QString::replace().
581
582 Perl's extended \c{/x} syntax is not supported, nor are
583 directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
584 the other hand, C++'s rules for literal strings can be used to
585 achieve the same:
586
587 \snippet code/src_corelib_tools_qregexp.cpp 3
588
589 Both zero-width positive and zero-width negative lookahead
590 assertions (?=pattern) and (?!pattern) are supported with the same
591 syntax as Perl. Perl's lookbehind assertions, "independent"
592 subexpressions and conditional expressions are not supported.
593
594 Non-capturing parentheses are also supported, with the same
595 (?:pattern) syntax.
596
597 See QString::split() and QStringList::join() for equivalents
598 to Perl's split and join functions.
599
600 Note: because C++ transforms \\'s they must be written \e twice in
601 code, e.g. \b{\\b} must be written \b{\\\\b}.
602
603 \target code-examples
604 \section1 Code Examples
605
606 \snippet code/src_corelib_tools_qregexp.cpp 4
607
608 The third string matches '\underline{6}'. This is a simple validation
609 regexp for integers in the range 0 to 99.
610
611 \snippet code/src_corelib_tools_qregexp.cpp 5
612
613 The second string matches '\underline{This_is-OK}'. We've used the
614 character set abbreviation '\\S' (non-whitespace) and the anchors
615 to match strings which contain no whitespace.
616
617 In the following example we match strings containing 'mail' or
618 'letter' or 'correspondence' but only match whole words i.e. not
619 'email'
620
621 \snippet code/src_corelib_tools_qregexp.cpp 6
622
623 The second string matches "Please write the \underline{letter}". The
624 word 'letter' is also captured (because of the parentheses). We
625 can see what text we've captured like this:
626
627 \snippet code/src_corelib_tools_qregexp.cpp 7
628
629 This will capture the text from the first set of capturing
630 parentheses (counting capturing left parentheses from left to
631 right). The parentheses are counted from 1 since cap(0) is the
632 whole matched regexp (equivalent to '&' in most regexp engines).
633
634 \snippet code/src_corelib_tools_qregexp.cpp 8
635
636 Here we've passed the QRegExp to QString's replace() function to
637 replace the matched text with new text.
638
639 \snippet code/src_corelib_tools_qregexp.cpp 9
640
641 We've used the indexIn() function to repeatedly match the regexp in
642 the string. Note that instead of moving forward by one character
643 at a time \c pos++ we could have written \c {pos +=
644 rx.matchedLength()} to skip over the already matched string. The
645 count will equal 3, matching 'One \underline{Eric} another
646 \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
647 doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
648 by non-word boundaries.
649
650 One common use of regexps is to split lines of delimited data into
651 their component fields.
652
653 \snippet code/src_corelib_tools_qregexp.cpp 10
654
655 In this example our input lines have the format company name, web
656 address and country. Unfortunately the regexp is rather long and
657 not very versatile -- the code will break if we add any more
658 fields. A simpler and better solution is to look for the
659 separator, '\\t' in this case, and take the surrounding text. The
660 QString::split() function can take a separator string or regexp
661 as an argument and split a string accordingly.
662
663 \snippet code/src_corelib_tools_qregexp.cpp 11
664
665 Here field[0] is the company, field[1] the web address and so on.
666
667 To imitate the matching of a shell we can use wildcard mode.
668
669 \snippet code/src_corelib_tools_qregexp.cpp 12
670
671 Wildcard matching can be convenient because of its simplicity, but
672 any wildcard regexp can be defined using full regexps, e.g.
673 \b{.*\\.html$}. Notice that we can't match both \c .html and \c
674 .htm files with a wildcard unless we use \b{*.htm*} which will
675 also match 'test.html.bak'. A full regexp gives us the precision
676 we need, \b{.*\\.html?$}.
677
678 QRegExp can match case insensitively using setCaseSensitivity(),
679 and can use non-greedy matching, see setMinimal(). By
680 default QRegExp uses full regexps but this can be changed with
681 setPatternSyntax(). Searching can be done forward with indexIn() or backward
682 with lastIndexIn(). Captured text can be accessed using
683 capturedTexts() which returns a string list of all captured
684 strings, or using cap() which returns the captured string for the
685 given index. The pos() function takes a match index and returns
686 the position in the string where the match was made (or -1 if
687 there was no match).
688
689 \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
690 {tools/regexp}{Regular Expression Example}
691*/
692
693#if defined(Q_OS_VXWORKS) && defined(EOS)
694# undef EOS
695#endif
696
697const int NumBadChars = 64;
698#define BadChar(ch) ((ch).unicode() % NumBadChars)
699
700const int NoOccurrence = INT_MAX;
701const int EmptyCapture = INT_MAX;
702const int InftyLen = INT_MAX;
703const int InftyRep = 1025;
704const int EOS = -1;
705
706static bool isWord(QChar ch)
707{
708 return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
709}
710
711/*
712 Merges two vectors of ints and puts the result into the first
713 one.
714*/
715static void mergeInto(QVector<int> *a, const QVector<int> &b)
716{
717 int asize = a->size();
718 int bsize = b.size();
719 if (asize == 0) {
720 *a = b;
721#ifndef QT_NO_REGEXP_OPTIM
722 } else if (bsize == 1 && a->at(i: asize - 1) < b.at(i: 0)) {
723 a->resize(asize: asize + 1);
724 (*a)[asize] = b.at(i: 0);
725#endif
726 } else if (bsize >= 1) {
727 int csize = asize + bsize;
728 QVector<int> c(csize);
729 int i = 0, j = 0, k = 0;
730 while (i < asize) {
731 if (j < bsize) {
732 if (a->at(i) == b.at(i: j)) {
733 ++i;
734 --csize;
735 } else if (a->at(i) < b.at(i: j)) {
736 c[k++] = a->at(i: i++);
737 } else {
738 c[k++] = b.at(i: j++);
739 }
740 } else {
741 memcpy(dest: c.data() + k, src: a->constData() + i, n: (asize - i) * sizeof(int));
742 break;
743 }
744 }
745 c.resize(asize: csize);
746 if (j < bsize)
747 memcpy(dest: c.data() + k, src: b.constData() + j, n: (bsize - j) * sizeof(int));
748 *a = c;
749 }
750}
751
752#ifndef QT_NO_REGEXP_WILDCARD
753/*
754 Translates a wildcard pattern to an equivalent regular expression
755 pattern (e.g., *.cpp to .*\.cpp).
756
757 If enableEscaping is true, it is possible to escape the wildcard
758 characters with \
759*/
760static QString wc2rx(const QString &wc_str, const bool enableEscaping)
761{
762 const int wclen = wc_str.length();
763 QString rx;
764 int i = 0;
765 bool isEscaping = false; // the previous character is '\'
766 const QChar *wc = wc_str.unicode();
767
768 while (i < wclen) {
769 const QChar c = wc[i++];
770 switch (c.unicode()) {
771 case '\\':
772 if (enableEscaping) {
773 if (isEscaping) {
774 rx += QLatin1String("\\\\");
775 } // we insert the \\ later if necessary
776 if (i == wclen) { // the end
777 rx += QLatin1String("\\\\");
778 }
779 } else {
780 rx += QLatin1String("\\\\");
781 }
782 isEscaping = true;
783 break;
784 case '*':
785 if (isEscaping) {
786 rx += QLatin1String("\\*");
787 isEscaping = false;
788 } else {
789 rx += QLatin1String(".*");
790 }
791 break;
792 case '?':
793 if (isEscaping) {
794 rx += QLatin1String("\\?");
795 isEscaping = false;
796 } else {
797 rx += QLatin1Char('.');
798 }
799
800 break;
801 case '$':
802 case '(':
803 case ')':
804 case '+':
805 case '.':
806 case '^':
807 case '{':
808 case '|':
809 case '}':
810 if (isEscaping) {
811 isEscaping = false;
812 rx += QLatin1String("\\\\");
813 }
814 rx += QLatin1Char('\\');
815 rx += c;
816 break;
817 case '[':
818 if (isEscaping) {
819 isEscaping = false;
820 rx += QLatin1String("\\[");
821 } else {
822 rx += c;
823 if (wc[i] == QLatin1Char('^'))
824 rx += wc[i++];
825 if (i < wclen) {
826 if (wc[i] == QLatin1Char(']'))
827 rx += wc[i++];
828 while (i < wclen && wc[i] != QLatin1Char(']')) {
829 if (wc[i] == QLatin1Char('\\'))
830 rx += QLatin1Char('\\');
831 rx += wc[i++];
832 }
833 }
834 }
835 break;
836
837 case ']':
838 if(isEscaping){
839 isEscaping = false;
840 rx += QLatin1String("\\");
841 }
842 rx += c;
843 break;
844
845 default:
846 if(isEscaping){
847 isEscaping = false;
848 rx += QLatin1String("\\\\");
849 }
850 rx += c;
851 }
852 }
853 return rx;
854}
855#endif
856
857static int caretIndex(int offset, QRegExp::CaretMode caretMode)
858{
859 if (caretMode == QRegExp::CaretAtZero) {
860 return 0;
861 } else if (caretMode == QRegExp::CaretAtOffset) {
862 return offset;
863 } else { // QRegExp::CaretWontMatch
864 return -1;
865 }
866}
867
868/*
869 The QRegExpEngineKey struct uniquely identifies an engine.
870*/
871struct QRegExpEngineKey
872{
873 QString pattern;
874 QRegExp::PatternSyntax patternSyntax;
875 Qt::CaseSensitivity cs;
876
877 inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
878 Qt::CaseSensitivity cs)
879 : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}
880
881 inline void clear() {
882 pattern.clear();
883 patternSyntax = QRegExp::RegExp;
884 cs = Qt::CaseSensitive;
885 }
886};
887
888static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
889{
890 return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
891 && key1.cs == key2.cs;
892}
893
894static uint qHash(const QRegExpEngineKey &key, uint seed = 0) noexcept
895{
896 QtPrivate::QHashCombine hash;
897 seed = hash(seed, key.pattern);
898 seed = hash(seed, key.patternSyntax);
899 seed = hash(seed, key.cs);
900 return seed;
901}
902
903class QRegExpEngine;
904
905//Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
906
907/*
908 This is the engine state during matching.
909*/
910struct QRegExpMatchState
911{
912 const QChar *in; // a pointer to the input string data
913 int pos; // the current position in the string
914 int caretPos;
915 int len; // the length of the input string
916 bool minimal; // minimal matching?
917 int *bigArray; // big array holding the data for the next pointers
918 int *inNextStack; // is state is nextStack?
919 int *curStack; // stack of current states
920 int *nextStack; // stack of next states
921 int *curCapBegin; // start of current states' captures
922 int *nextCapBegin; // start of next states' captures
923 int *curCapEnd; // end of current states' captures
924 int *nextCapEnd; // end of next states' captures
925 int *tempCapBegin; // start of temporary captures
926 int *tempCapEnd; // end of temporary captures
927 int *capBegin; // start of captures for a next state
928 int *capEnd; // end of captures for a next state
929 int *slideTab; // bump-along slide table for bad-character heuristic
930 int *captured; // what match() returned last
931 int slideTabSize; // size of slide table
932 int capturedSize;
933#ifndef QT_NO_REGEXP_BACKREF
934 QList<QVector<int> > sleeping; // list of back-reference sleepers
935#endif
936 int matchLen; // length of match
937 int oneTestMatchedLen; // length of partial match
938
939 const QRegExpEngine *eng;
940
941 inline QRegExpMatchState() : bigArray(nullptr), captured(nullptr) {}
942 inline ~QRegExpMatchState() { free(ptr: bigArray); }
943
944 void drain() { free(ptr: bigArray); bigArray = nullptr; captured = nullptr; } // to save memory
945 void prepareForMatch(QRegExpEngine *eng);
946 void match(const QChar *str, int len, int pos, bool minimal,
947 bool oneTest, int caretIndex);
948 bool matchHere();
949 bool testAnchor(int i, int a, const int *capBegin);
950};
951
952/*
953 The struct QRegExpAutomatonState represents one state in a modified NFA. The
954 input characters matched are stored in the state instead of on
955 the transitions, something possible for an automaton
956 constructed from a regular expression.
957*/
958struct QRegExpAutomatonState
959{
960#ifndef QT_NO_REGEXP_CAPTURE
961 int atom; // which atom does this state belong to?
962#endif
963 int match; // what does it match? (see CharClassBit and BackRefBit)
964 QVector<int> outs; // out-transitions
965 QMap<int, int> reenter; // atoms reentered when transiting out
966 QMap<int, int> anchors; // anchors met when transiting out
967
968 inline QRegExpAutomatonState() { }
969#ifndef QT_NO_REGEXP_CAPTURE
970 inline QRegExpAutomatonState(int a, int m)
971 : atom(a), match(m) { }
972#else
973 inline QRegExpAutomatonState(int m)
974 : match(m) { }
975#endif
976};
977
978Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
979
980/*
981 The struct QRegExpCharClassRange represents a range of characters (e.g.,
982 [0-9] denotes range 48 to 57).
983*/
984struct QRegExpCharClassRange
985{
986 ushort from; // 48
987 ushort len; // 10
988};
989
990Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
991
992#ifndef QT_NO_REGEXP_CAPTURE
993/*
994 The struct QRegExpAtom represents one node in the hierarchy of regular
995 expression atoms.
996*/
997struct QRegExpAtom
998{
999 enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };
1000
1001 int parent; // index of parent in array of atoms
1002 int capture; // index of capture, from 1 to ncap - 1
1003};
1004
1005Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
1006#endif
1007
1008struct QRegExpLookahead;
1009
1010#ifndef QT_NO_REGEXP_ANCHOR_ALT
1011/*
1012 The struct QRegExpAnchorAlternation represents a pair of anchors with
1013 OR semantics.
1014*/
1015struct QRegExpAnchorAlternation
1016{
1017 int a; // this anchor...
1018 int b; // ...or this one
1019};
1020
1021Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
1022#endif
1023
1024#ifndef QT_NO_REGEXP_CCLASS
1025
1026#define FLAG(x) (1 << (x))
1027/*
1028 The class QRegExpCharClass represents a set of characters, such as can
1029 be found in regular expressions (e.g., [a-z] denotes the set
1030 {a, b, ..., z}).
1031*/
1032class QRegExpCharClass
1033{
1034public:
1035 QRegExpCharClass();
1036
1037 void clear();
1038 bool negative() const { return n; }
1039 void setNegative(bool negative);
1040 void addCategories(uint cats);
1041 void addRange(ushort from, ushort to);
1042 void addSingleton(ushort ch) { addRange(from: ch, to: ch); }
1043
1044 bool in(QChar ch) const;
1045#ifndef QT_NO_REGEXP_OPTIM
1046 const QVector<int> &firstOccurrence() const { return occ1; }
1047#endif
1048
1049#if defined(QT_DEBUG)
1050 void dump() const;
1051#endif
1052
1053private:
1054 QVector<QRegExpCharClassRange> r; // character ranges
1055#ifndef QT_NO_REGEXP_OPTIM
1056 QVector<int> occ1; // first-occurrence array
1057#endif
1058 uint c; // character classes
1059 bool n; // negative?
1060};
1061#else
1062struct QRegExpCharClass
1063{
1064 int dummy;
1065
1066#ifndef QT_NO_REGEXP_OPTIM
1067 QRegExpCharClass() { occ1.fill(0, NumBadChars); }
1068
1069 const QVector<int> &firstOccurrence() const { return occ1; }
1070 QVector<int> occ1;
1071#endif
1072};
1073#endif
1074
1075Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
1076
1077/*
1078 The QRegExpEngine class encapsulates a modified nondeterministic
1079 finite automaton (NFA).
1080*/
1081class QRegExpEngine
1082{
1083public:
1084 QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1085 : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1086
1087 QRegExpEngine(const QRegExpEngineKey &key);
1088 ~QRegExpEngine();
1089
1090 bool isValid() const { return valid; }
1091 const QString &errorString() const { return yyError; }
1092 int captureCount() const { return officialncap; }
1093
1094 int createState(QChar ch);
1095 int createState(const QRegExpCharClass &cc);
1096#ifndef QT_NO_REGEXP_BACKREF
1097 int createState(int bref);
1098#endif
1099
1100 void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
1101#ifndef QT_NO_REGEXP_CAPTURE
1102 void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
1103#endif
1104
1105#ifndef QT_NO_REGEXP_ANCHOR_ALT
1106 int anchorAlternation(int a, int b);
1107 int anchorConcatenation(int a, int b);
1108#else
1109 int anchorAlternation(int a, int b) { return a & b; }
1110 int anchorConcatenation(int a, int b) { return a | b; }
1111#endif
1112 void addAnchors(int from, int to, int a);
1113
1114#ifndef QT_NO_REGEXP_OPTIM
1115 void heuristicallyChooseHeuristic();
1116#endif
1117
1118#if defined(QT_DEBUG)
1119 void dump() const;
1120#endif
1121
1122 QAtomicInt ref;
1123
1124private:
1125 enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };
1126 enum { InitialState = 0, FinalState = 1 };
1127
1128 void setup();
1129 int setupState(int match);
1130
1131 /*
1132 Let's hope that 13 lookaheads and 14 back-references are
1133 enough.
1134 */
1135 enum { MaxLookaheads = 13, MaxBackRefs = 14 };
1136 enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,
1137 Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,
1138 Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1139 Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,
1140 Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1141
1142 Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^
1143 ((Anchor_FirstLookahead << MaxLookaheads) - 1) };
1144#ifndef QT_NO_REGEXP_CAPTURE
1145 int startAtom(bool officialCapture);
1146 void finishAtom(int atom, bool needCapture);
1147#endif
1148
1149#ifndef QT_NO_REGEXP_LOOKAHEAD
1150 int addLookahead(QRegExpEngine *eng, bool negative);
1151#endif
1152
1153#ifndef QT_NO_REGEXP_OPTIM
1154 bool goodStringMatch(QRegExpMatchState &matchState) const;
1155 bool badCharMatch(QRegExpMatchState &matchState) const;
1156#else
1157 bool bruteMatch(QRegExpMatchState &matchState) const;
1158#endif
1159
1160 QVector<QRegExpAutomatonState> s; // array of states
1161#ifndef QT_NO_REGEXP_CAPTURE
1162 QVector<QRegExpAtom> f; // atom hierarchy
1163 int nf; // number of atoms
1164 int cf; // current atom
1165 QVector<int> captureForOfficialCapture;
1166#endif
1167 int officialncap; // number of captures, seen from the outside
1168 int ncap; // number of captures, seen from the inside
1169#ifndef QT_NO_REGEXP_CCLASS
1170 QVector<QRegExpCharClass> cl; // array of character classes
1171#endif
1172#ifndef QT_NO_REGEXP_LOOKAHEAD
1173 QVector<QRegExpLookahead *> ahead; // array of lookaheads
1174#endif
1175#ifndef QT_NO_REGEXP_ANCHOR_ALT
1176 QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1177#endif
1178#ifndef QT_NO_REGEXP_OPTIM
1179 bool caretAnchored; // does the regexp start with ^?
1180 bool trivial; // is the good-string all that needs to match?
1181#endif
1182 bool valid; // is the regular expression valid?
1183 Qt::CaseSensitivity cs; // case sensitive?
1184 bool greedyQuantifiers; // RegExp2?
1185 bool xmlSchemaExtensions;
1186#ifndef QT_NO_REGEXP_BACKREF
1187 int nbrefs; // number of back-references
1188#endif
1189
1190#ifndef QT_NO_REGEXP_OPTIM
1191 bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1192
1193 int goodEarlyStart; // the index where goodStr can first occur in a match
1194 int goodLateStart; // the index where goodStr can last occur in a match
1195 QString goodStr; // the string that any match has to contain
1196
1197 int minl; // the minimum length of a match
1198 QVector<int> occ1; // first-occurrence array
1199#endif
1200
1201 /*
1202 The class Box is an abstraction for a regular expression
1203 fragment. It can also be seen as one node in the syntax tree of
1204 a regular expression with synthetized attributes.
1205
1206 Its interface is ugly for performance reasons.
1207 */
1208 class Box
1209 {
1210 public:
1211 Box(QRegExpEngine *engine);
1212 Box(const Box &b) { operator=(b); }
1213
1214 Box &operator=(const Box &b);
1215
1216 void clear() { operator=(b: Box(eng)); }
1217 void set(QChar ch);
1218 void set(const QRegExpCharClass &cc);
1219#ifndef QT_NO_REGEXP_BACKREF
1220 void set(int bref);
1221#endif
1222
1223 void cat(const Box &b);
1224 void orx(const Box &b);
1225 void plus(int atom);
1226 void opt();
1227 void catAnchor(int a);
1228#ifndef QT_NO_REGEXP_OPTIM
1229 void setupHeuristics();
1230#endif
1231
1232#if defined(QT_DEBUG)
1233 void dump() const;
1234#endif
1235
1236 private:
1237 void addAnchorsToEngine(const Box &to) const;
1238
1239 QRegExpEngine *eng; // the automaton under construction
1240 QVector<int> ls; // the left states (firstpos)
1241 QVector<int> rs; // the right states (lastpos)
1242 QMap<int, int> lanchors; // the left anchors
1243 QMap<int, int> ranchors; // the right anchors
1244 int skipanchors; // the anchors to match if the box is skipped
1245
1246#ifndef QT_NO_REGEXP_OPTIM
1247 int earlyStart; // the index where str can first occur
1248 int lateStart; // the index where str can last occur
1249 QString str; // a string that has to occur in any match
1250 QString leftStr; // a string occurring at the left of this box
1251 QString rightStr; // a string occurring at the right of this box
1252 int maxl; // the maximum length of this box (possibly InftyLen)
1253#endif
1254
1255 int minl; // the minimum length of this box
1256#ifndef QT_NO_REGEXP_OPTIM
1257 QVector<int> occ1; // first-occurrence array
1258#endif
1259 };
1260
1261 friend class Box;
1262
1263 /*
1264 This is the lexical analyzer for regular expressions.
1265 */
1266 enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1267 Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1268 Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };
1269 int getChar();
1270 int getEscape();
1271#ifndef QT_NO_REGEXP_INTERVAL
1272 int getRep(int def);
1273#endif
1274#ifndef QT_NO_REGEXP_LOOKAHEAD
1275 void skipChars(int n);
1276#endif
1277 void error(const char *msg);
1278 void startTokenizer(const QChar *rx, int len);
1279 int getToken();
1280
1281 const QChar *yyIn; // a pointer to the input regular expression pattern
1282 int yyPos0; // the position of yyTok in the input pattern
1283 int yyPos; // the position of the next character to read
1284 int yyLen; // the length of yyIn
1285 int yyCh; // the last character read
1286 QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1287 int yyMinRep; // attribute for Tok_Quantifier
1288 int yyMaxRep; // ditto
1289 QString yyError; // syntax error or overflow during parsing?
1290
1291 /*
1292 This is the syntactic analyzer for regular expressions.
1293 */
1294 int parse(const QChar *rx, int len);
1295 void parseAtom(Box *box);
1296 void parseFactor(Box *box);
1297 void parseTerm(Box *box);
1298 void parseExpression(Box *box);
1299
1300 int yyTok; // the last token read
1301 bool yyMayCapture; // set this to false to disable capturing
1302
1303 friend struct QRegExpMatchState;
1304};
1305
1306#ifndef QT_NO_REGEXP_LOOKAHEAD
1307/*
1308 The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1309 (?=foo) and (?!bar)).
1310*/
1311struct QRegExpLookahead
1312{
1313 QRegExpEngine *eng; // NFA representing the embedded regular expression
1314 bool neg; // negative lookahead?
1315
1316 inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
1317 : eng(eng0), neg(neg0) { }
1318 inline ~QRegExpLookahead() { delete eng; }
1319};
1320#endif
1321
1322/*!
1323 \internal
1324 convert the pattern string to the RegExp syntax.
1325
1326 This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1327 */
1328Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
1329{
1330 switch (patternSyntax) {
1331#ifndef QT_NO_REGEXP_WILDCARD
1332 case QRegExp::Wildcard:
1333 return wc2rx(wc_str: pattern, enableEscaping: false);
1334 case QRegExp::WildcardUnix:
1335 return wc2rx(wc_str: pattern, enableEscaping: true);
1336#endif
1337 case QRegExp::FixedString:
1338 return QRegExp::escape(str: pattern);
1339 case QRegExp::W3CXmlSchema11:
1340 default:
1341 return pattern;
1342 }
1343}
1344
1345QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1346 : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1347 xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1348{
1349 setup();
1350
1351 QString rx = qt_regexp_toCanonical(pattern: key.pattern, patternSyntax: key.patternSyntax);
1352
1353 valid = (parse(rx: rx.unicode(), len: rx.length()) == rx.length());
1354 if (!valid) {
1355#ifndef QT_NO_REGEXP_OPTIM
1356 trivial = false;
1357#endif
1358 error(RXERR_LEFTDELIM);
1359 }
1360}
1361
1362QRegExpEngine::~QRegExpEngine()
1363{
1364#ifndef QT_NO_REGEXP_LOOKAHEAD
1365 qDeleteAll(c: ahead);
1366#endif
1367}
1368
1369void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1370{
1371 /*
1372 We use one QVector<int> for all the big data used a lot in
1373 matchHere() and friends.
1374 */
1375 int ns = eng->s.size(); // number of states
1376 int ncap = eng->ncap;
1377#ifndef QT_NO_REGEXP_OPTIM
1378 int newSlideTabSize = qMax(a: eng->minl + 1, b: 16);
1379#else
1380 int newSlideTabSize = 0;
1381#endif
1382 int numCaptures = eng->captureCount();
1383 int newCapturedSize = 2 + 2 * numCaptures;
1384 bigArray = q_check_ptr(p: (int *)realloc(ptr: bigArray, size: ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1385
1386 // set all internal variables only _after_ bigArray is realloc'ed
1387 // to prevent a broken regexp in oom case
1388
1389 slideTabSize = newSlideTabSize;
1390 capturedSize = newCapturedSize;
1391 inNextStack = bigArray;
1392 memset(s: inNextStack, c: -1, n: ns * sizeof(int));
1393 curStack = inNextStack + ns;
1394 nextStack = inNextStack + 2 * ns;
1395
1396 curCapBegin = inNextStack + 3 * ns;
1397 nextCapBegin = curCapBegin + ncap * ns;
1398 curCapEnd = curCapBegin + 2 * ncap * ns;
1399 nextCapEnd = curCapBegin + 3 * ncap * ns;
1400
1401 tempCapBegin = curCapBegin + 4 * ncap * ns;
1402 tempCapEnd = tempCapBegin + ncap;
1403 capBegin = tempCapBegin + 2 * ncap;
1404 capEnd = tempCapBegin + 3 * ncap;
1405
1406 slideTab = tempCapBegin + 4 * ncap;
1407 captured = slideTab + slideTabSize;
1408 memset(s: captured, c: -1, n: capturedSize*sizeof(int));
1409 this->eng = eng;
1410}
1411
1412/*
1413 Tries to match in str and returns an array of (begin, length) pairs
1414 for captured text. If there is no match, all pairs are (-1, -1).
1415*/
1416void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,
1417 bool minimal0, bool oneTest, int caretIndex)
1418{
1419 bool matched = false;
1420 QChar char_null;
1421
1422#ifndef QT_NO_REGEXP_OPTIM
1423 if (eng->trivial && !oneTest) {
1424 // ### Qt6: qsizetype
1425 pos = int(QtPrivate::findString(haystack: QStringView(str0, len0), from: pos0, needle: QStringView(eng->goodStr.unicode(), eng->goodStr.length()), cs: eng->cs));
1426 matchLen = eng->goodStr.length();
1427 matched = (pos != -1);
1428 } else
1429#endif
1430 {
1431 in = str0;
1432 if (in == nullptr)
1433 in = &char_null;
1434 pos = pos0;
1435 caretPos = caretIndex;
1436 len = len0;
1437 minimal = minimal0;
1438 matchLen = 0;
1439 oneTestMatchedLen = 0;
1440
1441 if (eng->valid && pos >= 0 && pos <= len) {
1442#ifndef QT_NO_REGEXP_OPTIM
1443 if (oneTest) {
1444 matched = matchHere();
1445 } else {
1446 if (pos <= len - eng->minl) {
1447 if (eng->caretAnchored) {
1448 matched = matchHere();
1449 } else if (eng->useGoodStringHeuristic) {
1450 matched = eng->goodStringMatch(matchState&: *this);
1451 } else {
1452 matched = eng->badCharMatch(matchState&: *this);
1453 }
1454 }
1455 }
1456#else
1457 matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1458#endif
1459 }
1460 }
1461
1462 if (matched) {
1463 int *c = captured;
1464 *c++ = pos;
1465 *c++ = matchLen;
1466
1467 int numCaptures = (capturedSize - 2) >> 1;
1468#ifndef QT_NO_REGEXP_CAPTURE
1469 for (int i = 0; i < numCaptures; ++i) {
1470 int j = eng->captureForOfficialCapture.at(i);
1471 if (capBegin[j] != EmptyCapture) {
1472 int len = capEnd[j] - capBegin[j];
1473 *c++ = (len > 0) ? pos + capBegin[j] : 0;
1474 *c++ = len;
1475 } else {
1476 *c++ = -1;
1477 *c++ = -1;
1478 }
1479 }
1480#endif
1481 } else {
1482 // we rely on 2's complement here
1483 memset(s: captured, c: -1, n: capturedSize * sizeof(int));
1484 }
1485}
1486
1487/*
1488 The three following functions add one state to the automaton and
1489 return the number of the state.
1490*/
1491
1492int QRegExpEngine::createState(QChar ch)
1493{
1494 return setupState(ch.unicode());
1495}
1496
1497int QRegExpEngine::createState(const QRegExpCharClass &cc)
1498{
1499#ifndef QT_NO_REGEXP_CCLASS
1500 int n = cl.size();
1501 cl += QRegExpCharClass(cc);
1502 return setupState(CharClassBit | n);
1503#else
1504 Q_UNUSED(cc);
1505 return setupState(CharClassBit);
1506#endif
1507}
1508
1509#ifndef QT_NO_REGEXP_BACKREF
1510int QRegExpEngine::createState(int bref)
1511{
1512 if (bref > nbrefs) {
1513 nbrefs = bref;
1514 if (nbrefs > MaxBackRefs) {
1515 error(RXERR_LIMIT);
1516 return 0;
1517 }
1518 }
1519 return setupState(BackRefBit | bref);
1520}
1521#endif
1522
1523/*
1524 The two following functions add a transition between all pairs of
1525 states (i, j) where i is found in from, and j is found in to.
1526
1527 Cat-transitions are distinguished from plus-transitions for
1528 capturing.
1529*/
1530
1531void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
1532{
1533 for (int i = 0; i < from.size(); i++)
1534 mergeInto(a: &s[from.at(i)].outs, b: to);
1535}
1536
1537#ifndef QT_NO_REGEXP_CAPTURE
1538void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
1539{
1540 for (int i = 0; i < from.size(); i++) {
1541 QRegExpAutomatonState &st = s[from.at(i)];
1542 const QVector<int> oldOuts = st.outs;
1543 mergeInto(a: &st.outs, b: to);
1544 if (f.at(i: atom).capture != QRegExpAtom::NoCapture) {
1545 for (int j = 0; j < to.size(); j++) {
1546 // ### st.reenter.contains(to.at(j)) check looks suspicious
1547 if (!st.reenter.contains(akey: to.at(i: j)) &&
1548 !std::binary_search(first: oldOuts.constBegin(), last: oldOuts.constEnd(), val: to.at(i: j)))
1549 st.reenter.insert(akey: to.at(i: j), avalue: atom);
1550 }
1551 }
1552 }
1553}
1554#endif
1555
1556#ifndef QT_NO_REGEXP_ANCHOR_ALT
1557/*
1558 Returns an anchor that means a OR b.
1559*/
1560int QRegExpEngine::anchorAlternation(int a, int b)
1561{
1562 if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)
1563 return a & b;
1564
1565 int n = aa.size();
1566#ifndef QT_NO_REGEXP_OPTIM
1567 if (n > 0 && aa.at(i: n - 1).a == a && aa.at(i: n - 1).b == b)
1568 return Anchor_Alternation | (n - 1);
1569#endif
1570
1571 QRegExpAnchorAlternation element = {.a: a, .b: b};
1572 aa.append(t: element);
1573 return Anchor_Alternation | n;
1574}
1575
1576/*
1577 Returns an anchor that means a AND b.
1578*/
1579int QRegExpEngine::anchorConcatenation(int a, int b)
1580{
1581 if (((a | b) & Anchor_Alternation) == 0)
1582 return a | b;
1583 if ((b & Anchor_Alternation) != 0)
1584 qSwap(value1&: a, value2&: b);
1585
1586 int aprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).a, b);
1587 int bprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).b, b);
1588 return anchorAlternation(a: aprime, b: bprime);
1589}
1590#endif
1591
1592/*
1593 Adds anchor a on a transition caracterised by its from state and
1594 its to state.
1595*/
1596void QRegExpEngine::addAnchors(int from, int to, int a)
1597{
1598 QRegExpAutomatonState &st = s[from];
1599 if (st.anchors.contains(akey: to))
1600 a = anchorAlternation(a: st.anchors.value(akey: to), b: a);
1601 st.anchors.insert(akey: to, avalue: a);
1602}
1603
1604#ifndef QT_NO_REGEXP_OPTIM
1605/*
1606 This function chooses between the good-string and the bad-character
1607 heuristics. It computes two scores and chooses the heuristic with
1608 the highest score.
1609
1610 Here are some common-sense constraints on the scores that should be
1611 respected if the formulas are ever modified: (1) If goodStr is
1612 empty, the good-string heuristic scores 0. (2) If the regular
1613 expression is trivial, the good-string heuristic should be used.
1614 (3) If the search is case insensitive, the good-string heuristic
1615 should be used, unless it scores 0. (Case insensitivity turns all
1616 entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1617 big, the good-string heuristic should score less.
1618*/
1619void QRegExpEngine::heuristicallyChooseHeuristic()
1620{
1621 if (minl == 0) {
1622 useGoodStringHeuristic = false;
1623 } else if (trivial) {
1624 useGoodStringHeuristic = true;
1625 } else {
1626 /*
1627 Magic formula: The good string has to constitute a good
1628 proportion of the minimum-length string, and appear at a
1629 more-or-less known index.
1630 */
1631 int goodStringScore = (64 * goodStr.length() / minl) -
1632 (goodLateStart - goodEarlyStart);
1633 /*
1634 Less magic formula: We pick some characters at random, and
1635 check whether they are good or bad.
1636 */
1637 int badCharScore = 0;
1638 int step = qMax(a: 1, b: NumBadChars / 32);
1639 for (int i = 1; i < NumBadChars; i += step) {
1640 if (occ1.at(i) == NoOccurrence)
1641 badCharScore += minl;
1642 else
1643 badCharScore += occ1.at(i);
1644 }
1645 badCharScore /= minl;
1646 useGoodStringHeuristic = (goodStringScore > badCharScore);
1647 }
1648}
1649#endif
1650
1651#if defined(QT_DEBUG)
1652void QRegExpEngine::dump() const
1653{
1654 int i, j;
1655 qDebug(msg: "Case %ssensitive engine", cs ? "" : "in");
1656 qDebug(msg: " States");
1657 for (i = 0; i < s.size(); i++) {
1658 qDebug(msg: " %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1659#ifndef QT_NO_REGEXP_CAPTURE
1660 if (nf > 0)
1661 qDebug(msg: " in atom %d", s[i].atom);
1662#endif
1663 int m = s[i].match;
1664 if ((m & CharClassBit) != 0) {
1665 qDebug(msg: " match character class %d", m ^ CharClassBit);
1666#ifndef QT_NO_REGEXP_CCLASS
1667 cl[m ^ CharClassBit].dump();
1668#else
1669 qDebug(" negative character class");
1670#endif
1671 } else if ((m & BackRefBit) != 0) {
1672 qDebug(msg: " match back-reference %d", m ^ BackRefBit);
1673 } else if (m >= 0x20 && m <= 0x7e) {
1674 qDebug(msg: " match 0x%.4x (%c)", m, m);
1675 } else {
1676 qDebug(msg: " match 0x%.4x", m);
1677 }
1678 for (j = 0; j < s[i].outs.size(); j++) {
1679 int next = s[i].outs[j];
1680 qDebug(msg: " -> %d", next);
1681 if (s[i].reenter.contains(akey: next))
1682 qDebug(msg: " [reenter %d]", s[i].reenter[next]);
1683 if (s[i].anchors.value(akey: next) != 0)
1684 qDebug(msg: " [anchors 0x%.8x]", s[i].anchors[next]);
1685 }
1686 }
1687#ifndef QT_NO_REGEXP_CAPTURE
1688 if (nf > 0) {
1689 qDebug(msg: " Atom Parent Capture");
1690 for (i = 0; i < nf; i++) {
1691 if (f[i].capture == QRegExpAtom::NoCapture) {
1692 qDebug(msg: " %6d %6d nil", i, f[i].parent);
1693 } else {
1694 int cap = f[i].capture;
1695 bool official = captureForOfficialCapture.contains(t: cap);
1696 qDebug(msg: " %6d %6d %6d %s", i, f[i].parent, f[i].capture,
1697 official ? "official" : "");
1698 }
1699 }
1700 }
1701#endif
1702#ifndef QT_NO_REGEXP_ANCHOR_ALT
1703 for (i = 0; i < aa.size(); i++)
1704 qDebug(msg: " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);
1705#endif
1706}
1707#endif
1708
1709void QRegExpEngine::setup()
1710{
1711 ref.storeRelaxed(newValue: 1);
1712#ifndef QT_NO_REGEXP_CAPTURE
1713 f.resize(asize: 32);
1714 nf = 0;
1715 cf = -1;
1716#endif
1717 officialncap = 0;
1718 ncap = 0;
1719#ifndef QT_NO_REGEXP_OPTIM
1720 caretAnchored = true;
1721 trivial = true;
1722#endif
1723 valid = false;
1724#ifndef QT_NO_REGEXP_BACKREF
1725 nbrefs = 0;
1726#endif
1727#ifndef QT_NO_REGEXP_OPTIM
1728 useGoodStringHeuristic = true;
1729 minl = 0;
1730 occ1.fill(from: 0, asize: NumBadChars);
1731#endif
1732}
1733
1734int QRegExpEngine::setupState(int match)
1735{
1736#ifndef QT_NO_REGEXP_CAPTURE
1737 s += QRegExpAutomatonState(cf, match);
1738#else
1739 s += QRegExpAutomatonState(match);
1740#endif
1741 return s.size() - 1;
1742}
1743
1744#ifndef QT_NO_REGEXP_CAPTURE
1745/*
1746 Functions startAtom() and finishAtom() should be called to delimit
1747 atoms. When a state is created, it is assigned to the current atom.
1748 The information is later used for capturing.
1749*/
1750int QRegExpEngine::startAtom(bool officialCapture)
1751{
1752 if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())
1753 f.resize(asize: (nf + 1) << 1);
1754 f[nf].parent = cf;
1755 cf = nf++;
1756 f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1757 return cf;
1758}
1759
1760void QRegExpEngine::finishAtom(int atom, bool needCapture)
1761{
1762 if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)
1763 f[atom].capture = QRegExpAtom::UnofficialCapture;
1764 cf = f.at(i: atom).parent;
1765}
1766#endif
1767
1768#ifndef QT_NO_REGEXP_LOOKAHEAD
1769/*
1770 Creates a lookahead anchor.
1771*/
1772int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
1773{
1774 int n = ahead.size();
1775 if (n == MaxLookaheads) {
1776 error(RXERR_LIMIT);
1777 return 0;
1778 }
1779 ahead += new QRegExpLookahead(eng, negative);
1780 return Anchor_FirstLookahead << n;
1781}
1782#endif
1783
1784#ifndef QT_NO_REGEXP_CAPTURE
1785/*
1786 We want the longest leftmost captures.
1787*/
1788static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
1789 const int *end2)
1790{
1791 for (int i = 0; i < ncap; i++) {
1792 int delta = begin2[i] - begin1[i]; // it has to start early...
1793 if (delta == 0)
1794 delta = end1[i] - end2[i]; // ...and end late
1795
1796 if (delta != 0)
1797 return delta > 0;
1798 }
1799 return false;
1800}
1801#endif
1802
1803/*
1804 Returns \c true if anchor a matches at position pos + i in the input
1805 string, otherwise false.
1806*/
1807bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1808{
1809 int j;
1810
1811#ifndef QT_NO_REGEXP_ANCHOR_ALT
1812 if ((a & QRegExpEngine::Anchor_Alternation) != 0)
1813 return testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1814 || testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1815#endif
1816
1817 if ((a & QRegExpEngine::Anchor_Caret) != 0) {
1818 if (pos + i != caretPos)
1819 return false;
1820 }
1821 if ((a & QRegExpEngine::Anchor_Dollar) != 0) {
1822 if (pos + i != len)
1823 return false;
1824 }
1825#ifndef QT_NO_REGEXP_ESCAPE
1826 if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {
1827 bool before = false;
1828 bool after = false;
1829 if (pos + i != 0)
1830 before = isWord(ch: in[pos + i - 1]);
1831 if (pos + i != len)
1832 after = isWord(ch: in[pos + i]);
1833 if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))
1834 return false;
1835 if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))
1836 return false;
1837 }
1838#endif
1839#ifndef QT_NO_REGEXP_LOOKAHEAD
1840 if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {
1841 const QVector<QRegExpLookahead *> &ahead = eng->ahead;
1842 for (j = 0; j < ahead.size(); j++) {
1843 if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {
1844 QRegExpMatchState matchState;
1845 matchState.prepareForMatch(eng: ahead[j]->eng);
1846 matchState.match(str0: in + pos + i, len0: len - pos - i, pos0: 0,
1847 minimal0: true, oneTest: true, caretIndex: caretPos - pos - i);
1848 if ((matchState.captured[0] == 0) == ahead[j]->neg)
1849 return false;
1850 }
1851 }
1852 }
1853#endif
1854#ifndef QT_NO_REGEXP_CAPTURE
1855#ifndef QT_NO_REGEXP_BACKREF
1856 for (j = 0; j < eng->nbrefs; j++) {
1857 if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {
1858 int i = eng->captureForOfficialCapture.at(i: j);
1859 if (capBegin[i] != EmptyCapture)
1860 return false;
1861 }
1862 }
1863#endif
1864#endif
1865 return true;
1866}
1867
1868#ifndef QT_NO_REGEXP_OPTIM
1869/*
1870 The three following functions are what Jeffrey Friedl would call
1871 transmissions (or bump-alongs). Using one or the other should make
1872 no difference except in performance.
1873*/
1874
1875bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1876{
1877 int k = matchState.pos + goodEarlyStart;
1878 QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
1879 while ((k = matcher.indexIn(str: matchState.in, length: matchState.len, from: k)) != -1) {
1880 int from = k - goodLateStart;
1881 int to = k - goodEarlyStart;
1882 if (from > matchState.pos)
1883 matchState.pos = from;
1884
1885 while (matchState.pos <= to) {
1886 if (matchState.matchHere())
1887 return true;
1888 ++matchState.pos;
1889 }
1890 ++k;
1891 }
1892 return false;
1893}
1894
1895bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1896{
1897 int slideHead = 0;
1898 int slideNext = 0;
1899 int i;
1900 int lastPos = matchState.len - minl;
1901 memset(s: matchState.slideTab, c: 0, n: matchState.slideTabSize * sizeof(int));
1902
1903 /*
1904 Set up the slide table, used for the bad-character heuristic,
1905 using the table of first occurrence of each character.
1906 */
1907 for (i = 0; i < minl; i++) {
1908 int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
1909 if (sk == NoOccurrence)
1910 sk = i + 1;
1911 if (sk > 0) {
1912 int k = i + 1 - sk;
1913 if (k < 0) {
1914 sk = i + 1;
1915 k = 0;
1916 }
1917 if (sk > matchState.slideTab[k])
1918 matchState.slideTab[k] = sk;
1919 }
1920 }
1921
1922 if (matchState.pos > lastPos)
1923 return false;
1924
1925 for (;;) {
1926 if (++slideNext >= matchState.slideTabSize)
1927 slideNext = 0;
1928 if (matchState.slideTab[slideHead] > 0) {
1929 if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
1930 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
1931 matchState.slideTab[slideHead] = 0;
1932 } else {
1933 if (matchState.matchHere())
1934 return true;
1935 }
1936
1937 if (matchState.pos == lastPos)
1938 break;
1939
1940 /*
1941 Update the slide table. This code has much in common with
1942 the initialization code.
1943 */
1944 int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
1945 if (sk == NoOccurrence) {
1946 matchState.slideTab[slideNext] = minl;
1947 } else if (sk > 0) {
1948 int k = slideNext + minl - sk;
1949 if (k >= matchState.slideTabSize)
1950 k -= matchState.slideTabSize;
1951 if (sk > matchState.slideTab[k])
1952 matchState.slideTab[k] = sk;
1953 }
1954 slideHead = slideNext;
1955 ++matchState.pos;
1956 }
1957 return false;
1958}
1959#else
1960bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1961{
1962 while (matchState.pos <= matchState.len) {
1963 if (matchState.matchHere())
1964 return true;
1965 ++matchState.pos;
1966 }
1967 return false;
1968}
1969#endif
1970
1971/*
1972 Here's the core of the engine. It tries to do a match here and now.
1973*/
1974bool QRegExpMatchState::matchHere()
1975{
1976 int ncur = 1, nnext = 0;
1977 int i = 0, j, k, m;
1978 bool stop = false;
1979
1980 matchLen = -1;
1981 oneTestMatchedLen = -1;
1982 curStack[0] = QRegExpEngine::InitialState;
1983
1984 int ncap = eng->ncap;
1985#ifndef QT_NO_REGEXP_CAPTURE
1986 if (ncap > 0) {
1987 for (j = 0; j < ncap; j++) {
1988 curCapBegin[j] = EmptyCapture;
1989 curCapEnd[j] = EmptyCapture;
1990 }
1991 }
1992#endif
1993
1994#ifndef QT_NO_REGEXP_BACKREF
1995 while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
1996#else
1997 while (ncur > 0 && i <= len - pos && !stop)
1998#endif
1999 {
2000 int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
2001 for (j = 0; j < ncur; j++) {
2002 int cur = curStack[j];
2003 const QRegExpAutomatonState &scur = eng->s.at(i: cur);
2004 const QVector<int> &outs = scur.outs;
2005 for (k = 0; k < outs.size(); k++) {
2006 int next = outs.at(i: k);
2007 const QRegExpAutomatonState &snext = eng->s.at(i: next);
2008 bool inside = true;
2009#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2010 int needSomeSleep = 0;
2011#endif
2012
2013 /*
2014 First, check if the anchors are anchored properly.
2015 */
2016 int a = scur.anchors.value(akey: next);
2017 if (a != 0 && !testAnchor(i, a, capBegin: curCapBegin + j * ncap))
2018 inside = false;
2019
2020 /*
2021 If indeed they are, check if the input character is
2022 correct for this transition.
2023 */
2024 if (inside) {
2025 m = snext.match;
2026 if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
2027 if (eng->cs)
2028 inside = (m == ch);
2029 else
2030 inside = (QChar(m).toLower() == QChar(ch).toLower());
2031 } else if (next == QRegExpEngine::FinalState) {
2032 matchLen = i;
2033 stop = minimal;
2034 inside = true;
2035 } else if ((m & QRegExpEngine::CharClassBit) != 0) {
2036#ifndef QT_NO_REGEXP_CCLASS
2037 const QRegExpCharClass &cc = eng->cl.at(i: m ^ QRegExpEngine::CharClassBit);
2038 if (eng->cs)
2039 inside = cc.in(ch: QChar(ch));
2040 else if (cc.negative())
2041 inside = cc.in(ch: QChar(ch).toLower()) &&
2042 cc.in(ch: QChar(ch).toUpper());
2043 else
2044 inside = cc.in(ch: QChar(ch).toLower()) ||
2045 cc.in(ch: QChar(ch).toUpper());
2046#endif
2047#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2048 } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
2049 int bref = m ^ QRegExpEngine::BackRefBit;
2050 int ell = j * ncap + eng->captureForOfficialCapture.at(i: bref - 1);
2051
2052 inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2053 if (inside) {
2054 if (eng->cs)
2055 inside = (in[pos + curCapBegin[ell]] == QChar(ch));
2056 else
2057 inside = (in[pos + curCapBegin[ell]].toLower()
2058 == QChar(ch).toLower());
2059 }
2060
2061 if (inside) {
2062 int delta;
2063 if (curCapEnd[ell] == EmptyCapture)
2064 delta = i - curCapBegin[ell];
2065 else
2066 delta = curCapEnd[ell] - curCapBegin[ell];
2067
2068 inside = (delta <= len - (pos + i));
2069 if (inside && delta > 1) {
2070 int n = 1;
2071 if (eng->cs) {
2072 while (n < delta) {
2073 if (in[pos + curCapBegin[ell] + n]
2074 != in[pos + i + n])
2075 break;
2076 ++n;
2077 }
2078 } else {
2079 while (n < delta) {
2080 QChar a = in[pos + curCapBegin[ell] + n];
2081 QChar b = in[pos + i + n];
2082 if (a.toLower() != b.toLower())
2083 break;
2084 ++n;
2085 }
2086 }
2087 inside = (n == delta);
2088 if (inside)
2089 needSomeSleep = delta - 1;
2090 }
2091 }
2092#endif
2093 }
2094 }
2095
2096 /*
2097 We must now update our data structures.
2098 */
2099 if (inside) {
2100#ifndef QT_NO_REGEXP_CAPTURE
2101 int *capBegin, *capEnd;
2102#endif
2103 /*
2104 If the next state was not encountered yet, all
2105 is fine.
2106 */
2107 if ((m = inNextStack[next]) == -1) {
2108 m = nnext++;
2109 nextStack[m] = next;
2110 inNextStack[next] = m;
2111#ifndef QT_NO_REGEXP_CAPTURE
2112 capBegin = nextCapBegin + m * ncap;
2113 capEnd = nextCapEnd + m * ncap;
2114
2115 /*
2116 Otherwise, we'll first maintain captures in
2117 temporary arrays, and decide at the end whether
2118 it's best to keep the previous capture zones or
2119 the new ones.
2120 */
2121 } else {
2122 capBegin = tempCapBegin;
2123 capEnd = tempCapEnd;
2124#endif
2125 }
2126
2127#ifndef QT_NO_REGEXP_CAPTURE
2128 /*
2129 Updating the capture zones is much of a task.
2130 */
2131 if (ncap > 0) {
2132 memcpy(dest: capBegin, src: curCapBegin + j * ncap, n: ncap * sizeof(int));
2133 memcpy(dest: capEnd, src: curCapEnd + j * ncap, n: ncap * sizeof(int));
2134 int c = scur.atom, n = snext.atom;
2135 int p = -1, q = -1;
2136 int cap;
2137
2138 /*
2139 Lemma 1. For any x in the range [0..nf), we
2140 have f[x].parent < x.
2141
2142 Proof. By looking at startAtom(), it is
2143 clear that cf < nf holds all the time, and
2144 thus that f[nf].parent < nf.
2145 */
2146
2147 /*
2148 If we are reentering an atom, we empty all
2149 capture zones inside it.
2150 */
2151 if ((q = scur.reenter.value(akey: next)) != 0) {
2152 QBitArray b(eng->nf, false);
2153 b.setBit(i: q, val: true);
2154 for (int ell = q + 1; ell < eng->nf; ell++) {
2155 if (b.testBit(i: eng->f.at(i: ell).parent)) {
2156 b.setBit(i: ell, val: true);
2157 cap = eng->f.at(i: ell).capture;
2158 if (cap >= 0) {
2159 capBegin[cap] = EmptyCapture;
2160 capEnd[cap] = EmptyCapture;
2161 }
2162 }
2163 }
2164 p = eng->f.at(i: q).parent;
2165
2166 /*
2167 Otherwise, close the capture zones we are
2168 leaving. We are leaving f[c].capture,
2169 f[f[c].parent].capture,
2170 f[f[f[c].parent].parent].capture, ...,
2171 until f[x].capture, with x such that
2172 f[x].parent is the youngest common ancestor
2173 for c and n.
2174
2175 We go up along c's and n's ancestry until
2176 we find x.
2177 */
2178 } else {
2179 p = c;
2180 q = n;
2181 while (p != q) {
2182 if (p > q) {
2183 cap = eng->f.at(i: p).capture;
2184 if (cap >= 0) {
2185 if (capBegin[cap] == i) {
2186 capBegin[cap] = EmptyCapture;
2187 capEnd[cap] = EmptyCapture;
2188 } else {
2189 capEnd[cap] = i;
2190 }
2191 }
2192 p = eng->f.at(i: p).parent;
2193 } else {
2194 q = eng->f.at(i: q).parent;
2195 }
2196 }
2197 }
2198
2199 /*
2200 In any case, we now open the capture zones
2201 we are entering. We work upwards from n
2202 until we reach p (the parent of the atom we
2203 reenter or the youngest common ancestor).
2204 */
2205 while (n > p) {
2206 cap = eng->f.at(i: n).capture;
2207 if (cap >= 0) {
2208 capBegin[cap] = i;
2209 capEnd[cap] = EmptyCapture;
2210 }
2211 n = eng->f.at(i: n).parent;
2212 }
2213 /*
2214 If the next state was already in
2215 nextStack, we must choose carefully which
2216 capture zones we want to keep.
2217 */
2218 if (capBegin == tempCapBegin &&
2219 isBetterCapture(ncap, begin1: capBegin, end1: capEnd, begin2: nextCapBegin + m * ncap,
2220 end2: nextCapEnd + m * ncap)) {
2221 memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int));
2222 memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int));
2223 }
2224 }
2225#ifndef QT_NO_REGEXP_BACKREF
2226 /*
2227 We are done with updating the capture zones.
2228 It's now time to put the next state to sleep,
2229 if it needs to, and to remove it from
2230 nextStack.
2231 */
2232 if (needSomeSleep > 0) {
2233 QVector<int> zzZ(2 + 2 * ncap);
2234 zzZ[0] = i + needSomeSleep;
2235 zzZ[1] = next;
2236 if (ncap > 0) {
2237 memcpy(dest: zzZ.data() + 2, src: capBegin, n: ncap * sizeof(int));
2238 memcpy(dest: zzZ.data() + 2 + ncap, src: capEnd, n: ncap * sizeof(int));
2239 }
2240 inNextStack[nextStack[--nnext]] = -1;
2241 sleeping.append(t: zzZ);
2242 }
2243#endif
2244#endif
2245 }
2246 }
2247 }
2248#ifndef QT_NO_REGEXP_CAPTURE
2249 /*
2250 If we reached the final state, hurray! Copy the captured
2251 zone.
2252 */
2253 if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
2254 memcpy(dest: capBegin, src: nextCapBegin + m * ncap, n: ncap * sizeof(int));
2255 memcpy(dest: capEnd, src: nextCapEnd + m * ncap, n: ncap * sizeof(int));
2256 }
2257#ifndef QT_NO_REGEXP_BACKREF
2258 /*
2259 It's time to wake up the sleepers.
2260 */
2261 j = 0;
2262 while (j < sleeping.count()) {
2263 if (sleeping.at(i: j)[0] == i) {
2264 const QVector<int> &zzZ = sleeping.at(i: j);
2265 int next = zzZ[1];
2266 const int *capBegin = zzZ.data() + 2;
2267 const int *capEnd = zzZ.data() + 2 + ncap;
2268 bool copyOver = true;
2269
2270 if ((m = inNextStack[next]) == -1) {
2271 m = nnext++;
2272 nextStack[m] = next;
2273 inNextStack[next] = m;
2274 } else {
2275 copyOver = isBetterCapture(ncap, begin1: nextCapBegin + m * ncap, end1: nextCapEnd + m * ncap,
2276 begin2: capBegin, end2: capEnd);
2277 }
2278 if (copyOver) {
2279 memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int));
2280 memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int));
2281 }
2282
2283 sleeping.removeAt(i: j);
2284 } else {
2285 ++j;
2286 }
2287 }
2288#endif
2289#endif
2290 for (j = 0; j < nnext; j++)
2291 inNextStack[nextStack[j]] = -1;
2292
2293 // avoid needless iteration that confuses oneTestMatchedLen
2294 if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
2295#ifndef QT_NO_REGEXP_BACKREF
2296 && sleeping.isEmpty()
2297#endif
2298 )
2299 stop = true;
2300
2301 qSwap(value1&: curStack, value2&: nextStack);
2302#ifndef QT_NO_REGEXP_CAPTURE
2303 qSwap(value1&: curCapBegin, value2&: nextCapBegin);
2304 qSwap(value1&: curCapEnd, value2&: nextCapEnd);
2305#endif
2306 ncur = nnext;
2307 nnext = 0;
2308 ++i;
2309 }
2310
2311#ifndef QT_NO_REGEXP_BACKREF
2312 /*
2313 If minimal matching is enabled, we might have some sleepers
2314 left.
2315 */
2316 if (!sleeping.isEmpty())
2317 sleeping.clear();
2318#endif
2319
2320 oneTestMatchedLen = i - 1;
2321 return (matchLen >= 0);
2322}
2323
2324#ifndef QT_NO_REGEXP_CCLASS
2325
2326QRegExpCharClass::QRegExpCharClass()
2327 : c(0), n(false)
2328{
2329#ifndef QT_NO_REGEXP_OPTIM
2330 occ1.fill(from: NoOccurrence, asize: NumBadChars);
2331#endif
2332}
2333
2334void QRegExpCharClass::clear()
2335{
2336 c = 0;
2337 r.clear();
2338 n = false;
2339}
2340
2341void QRegExpCharClass::setNegative(bool negative)
2342{
2343 n = negative;
2344#ifndef QT_NO_REGEXP_OPTIM
2345 occ1.fill(from: 0, asize: NumBadChars);
2346#endif
2347}
2348
2349void QRegExpCharClass::addCategories(uint cats)
2350{
2351 static const int all_cats = FLAG(QChar::Mark_NonSpacing) |
2352 FLAG(QChar::Mark_SpacingCombining) |
2353 FLAG(QChar::Mark_Enclosing) |
2354 FLAG(QChar::Number_DecimalDigit) |
2355 FLAG(QChar::Number_Letter) |
2356 FLAG(QChar::Number_Other) |
2357 FLAG(QChar::Separator_Space) |
2358 FLAG(QChar::Separator_Line) |
2359 FLAG(QChar::Separator_Paragraph) |
2360 FLAG(QChar::Other_Control) |
2361 FLAG(QChar::Other_Format) |
2362 FLAG(QChar::Other_Surrogate) |
2363 FLAG(QChar::Other_PrivateUse) |
2364 FLAG(QChar::Other_NotAssigned) |
2365 FLAG(QChar::Letter_Uppercase) |
2366 FLAG(QChar::Letter_Lowercase) |
2367 FLAG(QChar::Letter_Titlecase) |
2368 FLAG(QChar::Letter_Modifier) |
2369 FLAG(QChar::Letter_Other) |
2370 FLAG(QChar::Punctuation_Connector) |
2371 FLAG(QChar::Punctuation_Dash) |
2372 FLAG(QChar::Punctuation_Open) |
2373 FLAG(QChar::Punctuation_Close) |
2374 FLAG(QChar::Punctuation_InitialQuote) |
2375 FLAG(QChar::Punctuation_FinalQuote) |
2376 FLAG(QChar::Punctuation_Other) |
2377 FLAG(QChar::Symbol_Math) |
2378 FLAG(QChar::Symbol_Currency) |
2379 FLAG(QChar::Symbol_Modifier) |
2380 FLAG(QChar::Symbol_Other);
2381 c |= (all_cats & cats);
2382#ifndef QT_NO_REGEXP_OPTIM
2383 occ1.fill(from: 0, asize: NumBadChars);
2384#endif
2385}
2386
2387void QRegExpCharClass::addRange(ushort from, ushort to)
2388{
2389 if (from > to)
2390 qSwap(value1&: from, value2&: to);
2391 int m = r.size();
2392 r.resize(asize: m + 1);
2393 r[m].from = from;
2394 r[m].len = to - from + 1;
2395
2396#ifndef QT_NO_REGEXP_OPTIM
2397 int i;
2398
2399 if (to - from < NumBadChars) {
2400 if (from % NumBadChars <= to % NumBadChars) {
2401 for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2402 occ1[i] = 0;
2403 } else {
2404 for (i = 0; i <= to % NumBadChars; i++)
2405 occ1[i] = 0;
2406 for (i = from % NumBadChars; i < NumBadChars; i++)
2407 occ1[i] = 0;
2408 }
2409 } else {
2410 occ1.fill(from: 0, asize: NumBadChars);
2411 }
2412#endif
2413}
2414
2415bool QRegExpCharClass::in(QChar ch) const
2416{
2417#ifndef QT_NO_REGEXP_OPTIM
2418 if (occ1.at(BadChar(ch)) == NoOccurrence)
2419 return n;
2420#endif
2421
2422 if (c != 0 && (c & FLAG(ch.category())) != 0)
2423 return !n;
2424
2425 const int uc = ch.unicode();
2426 int size = r.size();
2427
2428 for (int i = 0; i < size; ++i) {
2429 const QRegExpCharClassRange &range = r.at(i);
2430 if (uint(uc - range.from) < uint(r.at(i).len))
2431 return !n;
2432 }
2433 return n;
2434}
2435
2436#if defined(QT_DEBUG)
2437void QRegExpCharClass::dump() const
2438{
2439 int i;
2440 qDebug(msg: " %stive character class", n ? "nega" : "posi");
2441#ifndef QT_NO_REGEXP_CCLASS
2442 if (c != 0)
2443 qDebug(msg: " categories 0x%.8x", c);
2444#endif
2445 for (i = 0; i < r.size(); i++)
2446 qDebug(msg: " 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);
2447}
2448#endif
2449#endif
2450
2451QRegExpEngine::Box::Box(QRegExpEngine *engine)
2452 : eng(engine), skipanchors(0)
2453#ifndef QT_NO_REGEXP_OPTIM
2454 , earlyStart(0), lateStart(0), maxl(0)
2455#endif
2456{
2457#ifndef QT_NO_REGEXP_OPTIM
2458 occ1.fill(from: NoOccurrence, asize: NumBadChars);
2459#endif
2460 minl = 0;
2461}
2462
2463QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2464{
2465 eng = b.eng;
2466 ls = b.ls;
2467 rs = b.rs;
2468 lanchors = b.lanchors;
2469 ranchors = b.ranchors;
2470 skipanchors = b.skipanchors;
2471#ifndef QT_NO_REGEXP_OPTIM
2472 earlyStart = b.earlyStart;
2473 lateStart = b.lateStart;
2474 str = b.str;
2475 leftStr = b.leftStr;
2476 rightStr = b.rightStr;
2477 maxl = b.maxl;
2478 occ1 = b.occ1;
2479#endif
2480 minl = b.minl;
2481 return *this;
2482}
2483
2484void QRegExpEngine::Box::set(QChar ch)
2485{
2486 ls.resize(asize: 1);
2487 ls[0] = eng->createState(ch);
2488 rs = ls;
2489#ifndef QT_NO_REGEXP_OPTIM
2490 str = ch;
2491 leftStr = ch;
2492 rightStr = ch;
2493 maxl = 1;
2494 occ1[BadChar(ch)] = 0;
2495#endif
2496 minl = 1;
2497}
2498
2499void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2500{
2501 ls.resize(asize: 1);
2502 ls[0] = eng->createState(cc);
2503 rs = ls;
2504#ifndef QT_NO_REGEXP_OPTIM
2505 maxl = 1;
2506 occ1 = cc.firstOccurrence();
2507#endif
2508 minl = 1;
2509}
2510
2511#ifndef QT_NO_REGEXP_BACKREF
2512void QRegExpEngine::Box::set(int bref)
2513{
2514 ls.resize(asize: 1);
2515 ls[0] = eng->createState(bref);
2516 rs = ls;
2517 if (bref >= 1 && bref <= MaxBackRefs)
2518 skipanchors = Anchor_BackRef0Empty << bref;
2519#ifndef QT_NO_REGEXP_OPTIM
2520 maxl = InftyLen;
2521#endif
2522 minl = 0;
2523}
2524#endif
2525
2526void QRegExpEngine::Box::cat(const Box &b)
2527{
2528 eng->addCatTransitions(from: rs, to: b.ls);
2529 addAnchorsToEngine(to: b);
2530 if (minl == 0) {
2531 lanchors.insert(map: b.lanchors);
2532 if (skipanchors != 0) {
2533 for (int i = 0; i < b.ls.size(); i++) {
2534 int a = eng->anchorConcatenation(a: lanchors.value(akey: b.ls.at(i), adefaultValue: 0), b: skipanchors);
2535 lanchors.insert(akey: b.ls.at(i), avalue: a);
2536 }
2537 }
2538 mergeInto(a: &ls, b: b.ls);
2539 }
2540 if (b.minl == 0) {
2541 ranchors.insert(map: b.ranchors);
2542 if (b.skipanchors != 0) {
2543 for (int i = 0; i < rs.size(); i++) {
2544 int a = eng->anchorConcatenation(a: ranchors.value(akey: rs.at(i), adefaultValue: 0), b: b.skipanchors);
2545 ranchors.insert(akey: rs.at(i), avalue: a);
2546 }
2547 }
2548 mergeInto(a: &rs, b: b.rs);
2549 } else {
2550 ranchors = b.ranchors;
2551 rs = b.rs;
2552 }
2553
2554#ifndef QT_NO_REGEXP_OPTIM
2555 if (maxl != InftyLen) {
2556 if (rightStr.length() + b.leftStr.length() >
2557 qMax(a: str.length(), b: b.str.length())) {
2558 earlyStart = minl - rightStr.length();
2559 lateStart = maxl - rightStr.length();
2560 str = rightStr + b.leftStr;
2561 } else if (b.str.length() > str.length()) {
2562 earlyStart = minl + b.earlyStart;
2563 lateStart = maxl + b.lateStart;
2564 str = b.str;
2565 }
2566 }
2567
2568 if (leftStr.length() == maxl)
2569 leftStr += b.leftStr;
2570
2571 if (b.rightStr.length() == b.maxl) {
2572 rightStr += b.rightStr;
2573 } else {
2574 rightStr = b.rightStr;
2575 }
2576
2577 if (maxl == InftyLen || b.maxl == InftyLen) {
2578 maxl = InftyLen;
2579 } else {
2580 maxl += b.maxl;
2581 }
2582
2583 for (int i = 0; i < NumBadChars; i++) {
2584 if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2585 occ1[i] = minl + b.occ1.at(i);
2586 }
2587#endif
2588
2589 minl += b.minl;
2590 if (minl == 0)
2591 skipanchors = eng->anchorConcatenation(a: skipanchors, b: b.skipanchors);
2592 else
2593 skipanchors = 0;
2594}
2595
2596void QRegExpEngine::Box::orx(const Box &b)
2597{
2598 mergeInto(a: &ls, b: b.ls);
2599 lanchors.insert(map: b.lanchors);
2600 mergeInto(a: &rs, b: b.rs);
2601 ranchors.insert(map: b.ranchors);
2602
2603 if (b.minl == 0) {
2604 if (minl == 0)
2605 skipanchors = eng->anchorAlternation(a: skipanchors, b: b.skipanchors);
2606 else
2607 skipanchors = b.skipanchors;
2608 }
2609
2610#ifndef QT_NO_REGEXP_OPTIM
2611 for (int i = 0; i < NumBadChars; i++) {
2612 if (occ1.at(i) > b.occ1.at(i))
2613 occ1[i] = b.occ1.at(i);
2614 }
2615 earlyStart = 0;
2616 lateStart = 0;
2617 str = QString();
2618 leftStr = QString();
2619 rightStr = QString();
2620 if (b.maxl > maxl)
2621 maxl = b.maxl;
2622#endif
2623 if (b.minl < minl)
2624 minl = b.minl;
2625}
2626
2627void QRegExpEngine::Box::plus(int atom)
2628{
2629#ifndef QT_NO_REGEXP_CAPTURE
2630 eng->addPlusTransitions(from: rs, to: ls, atom);
2631#else
2632 Q_UNUSED(atom);
2633 eng->addCatTransitions(rs, ls);
2634#endif
2635 addAnchorsToEngine(to: *this);
2636#ifndef QT_NO_REGEXP_OPTIM
2637 maxl = InftyLen;
2638#endif
2639}
2640
2641void QRegExpEngine::Box::opt()
2642{
2643#ifndef QT_NO_REGEXP_OPTIM
2644 earlyStart = 0;
2645 lateStart = 0;
2646 str = QString();
2647 leftStr = QString();
2648 rightStr = QString();
2649#endif
2650 skipanchors = 0;
2651 minl = 0;
2652}
2653
2654void QRegExpEngine::Box::catAnchor(int a)
2655{
2656 if (a != 0) {
2657 for (int i = 0; i < rs.size(); i++) {
2658 a = eng->anchorConcatenation(a: ranchors.value(akey: rs.at(i), adefaultValue: 0), b: a);
2659 ranchors.insert(akey: rs.at(i), avalue: a);
2660 }
2661 if (minl == 0)
2662 skipanchors = eng->anchorConcatenation(a: skipanchors, b: a);
2663 }
2664}
2665
2666#ifndef QT_NO_REGEXP_OPTIM
2667void QRegExpEngine::Box::setupHeuristics()
2668{
2669 eng->goodEarlyStart = earlyStart;
2670 eng->goodLateStart = lateStart;
2671 eng->goodStr = eng->cs ? str : str.toLower();
2672
2673 eng->minl = minl;
2674 if (eng->cs) {
2675 /*
2676 A regular expression such as 112|1 has occ1['2'] = 2 and minl =
2677 1 at this point. An entry of occ1 has to be at most minl or
2678 infinity for the rest of the algorithm to go well.
2679
2680 We waited until here before normalizing these cases (instead of
2681 doing it in Box::orx()) because sometimes things improve by
2682 themselves. Consider for example (112|1)34.
2683 */
2684 for (int i = 0; i < NumBadChars; i++) {
2685 if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2686 occ1[i] = minl;
2687 }
2688 eng->occ1 = occ1;
2689 } else {
2690 eng->occ1.fill(from: 0, asize: NumBadChars);
2691 }
2692
2693 eng->heuristicallyChooseHeuristic();
2694}
2695#endif
2696
2697#if defined(QT_DEBUG)
2698void QRegExpEngine::Box::dump() const
2699{
2700 int i;
2701 qDebug(msg: "Box of at least %d character%s", minl, minl == 1 ? "" : "s");
2702 qDebug(msg: " Left states:");
2703 for (i = 0; i < ls.size(); i++) {
2704 if (lanchors.value(akey: ls[i], adefaultValue: 0) == 0)
2705 qDebug(msg: " %d", ls[i]);
2706 else
2707 qDebug(msg: " %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
2708 }
2709 qDebug(msg: " Right states:");
2710 for (i = 0; i < rs.size(); i++) {
2711 if (ranchors.value(akey: rs[i], adefaultValue: 0) == 0)
2712 qDebug(msg: " %d", rs[i]);
2713 else
2714 qDebug(msg: " %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
2715 }
2716 qDebug(msg: " Skip anchors: 0x%.8x", skipanchors);
2717}
2718#endif
2719
2720void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2721{
2722 for (int i = 0; i < to.ls.size(); i++) {
2723 for (int j = 0; j < rs.size(); j++) {
2724 int a = eng->anchorConcatenation(a: ranchors.value(akey: rs.at(i: j), adefaultValue: 0),
2725 b: to.lanchors.value(akey: to.ls.at(i), adefaultValue: 0));
2726 eng->addAnchors(from: rs[j], to: to.ls[i], a);
2727 }
2728 }
2729}
2730
2731#ifndef QT_NO_REGEXP_CCLASS
2732// fast lookup hash for xml schema extensions
2733// sorted by name for b-search
2734static const struct CategoriesRangeMapEntry {
2735 const char name[40];
2736 uint first, second;
2737} categoriesRangeMap[] = {
2738 { .name: "AegeanNumbers", .first: 0x10100, .second: 0x1013F },
2739 { .name: "AlphabeticPresentationForms", .first: 0xFB00, .second: 0xFB4F },
2740 { .name: "AncientGreekMusicalNotation", .first: 0x1D200, .second: 0x1D24F },
2741 { .name: "AncientGreekNumbers", .first: 0x10140, .second: 0x1018F },
2742 { .name: "Arabic", .first: 0x0600, .second: 0x06FF },
2743 { .name: "ArabicPresentationForms-A", .first: 0xFB50, .second: 0xFDFF },
2744 { .name: "ArabicPresentationForms-B", .first: 0xFE70, .second: 0xFEFF },
2745 { .name: "ArabicSupplement", .first: 0x0750, .second: 0x077F },
2746 { .name: "Armenian", .first: 0x0530, .second: 0x058F },
2747 { .name: "Arrows", .first: 0x2190, .second: 0x21FF },
2748 { .name: "BasicLatin", .first: 0x0000, .second: 0x007F },
2749 { .name: "Bengali", .first: 0x0980, .second: 0x09FF },
2750 { .name: "BlockElements", .first: 0x2580, .second: 0x259F },
2751 { .name: "Bopomofo", .first: 0x3100, .second: 0x312F },
2752 { .name: "BopomofoExtended", .first: 0x31A0, .second: 0x31BF },
2753 { .name: "BoxDrawing", .first: 0x2500, .second: 0x257F },
2754 { .name: "BraillePatterns", .first: 0x2800, .second: 0x28FF },
2755 { .name: "Buginese", .first: 0x1A00, .second: 0x1A1F },
2756 { .name: "Buhid", .first: 0x1740, .second: 0x175F },
2757 { .name: "ByzantineMusicalSymbols", .first: 0x1D000, .second: 0x1D0FF },
2758 { .name: "CJKCompatibility", .first: 0x3300, .second: 0x33FF },
2759 { .name: "CJKCompatibilityForms", .first: 0xFE30, .second: 0xFE4F },
2760 { .name: "CJKCompatibilityIdeographs", .first: 0xF900, .second: 0xFAFF },
2761 { .name: "CJKCompatibilityIdeographsSupplement", .first: 0x2F800, .second: 0x2FA1F },
2762 { .name: "CJKRadicalsSupplement", .first: 0x2E80, .second: 0x2EFF },
2763 { .name: "CJKStrokes", .first: 0x31C0, .second: 0x31EF },
2764 { .name: "CJKSymbolsandPunctuation", .first: 0x3000, .second: 0x303F },
2765 { .name: "CJKUnifiedIdeographs", .first: 0x4E00, .second: 0x9FFF },
2766 { .name: "CJKUnifiedIdeographsExtensionA", .first: 0x3400, .second: 0x4DB5 },
2767 { .name: "CJKUnifiedIdeographsExtensionB", .first: 0x20000, .second: 0x2A6DF },
2768 { .name: "Cherokee", .first: 0x13A0, .second: 0x13FF },
2769 { .name: "CombiningDiacriticalMarks", .first: 0x0300, .second: 0x036F },
2770 { .name: "CombiningDiacriticalMarksSupplement", .first: 0x1DC0, .second: 0x1DFF },
2771 { .name: "CombiningHalfMarks", .first: 0xFE20, .second: 0xFE2F },
2772 { .name: "CombiningMarksforSymbols", .first: 0x20D0, .second: 0x20FF },
2773 { .name: "ControlPictures", .first: 0x2400, .second: 0x243F },
2774 { .name: "Coptic", .first: 0x2C80, .second: 0x2CFF },
2775 { .name: "CurrencySymbols", .first: 0x20A0, .second: 0x20CF },
2776 { .name: "CypriotSyllabary", .first: 0x10800, .second: 0x1083F },
2777 { .name: "Cyrillic", .first: 0x0400, .second: 0x04FF },
2778 { .name: "CyrillicSupplement", .first: 0x0500, .second: 0x052F },
2779 { .name: "Deseret", .first: 0x10400, .second: 0x1044F },
2780 { .name: "Devanagari", .first: 0x0900, .second: 0x097F },
2781 { .name: "Dingbats", .first: 0x2700, .second: 0x27BF },
2782 { .name: "EnclosedAlphanumerics", .first: 0x2460, .second: 0x24FF },
2783 { .name: "EnclosedCJKLettersandMonths", .first: 0x3200, .second: 0x32FF },
2784 { .name: "Ethiopic", .first: 0x1200, .second: 0x137F },
2785 { .name: "EthiopicExtended", .first: 0x2D80, .second: 0x2DDF },
2786 { .name: "EthiopicSupplement", .first: 0x1380, .second: 0x139F },
2787 { .name: "GeneralPunctuation", .first: 0x2000, .second: 0x206F },
2788 { .name: "GeometricShapes", .first: 0x25A0, .second: 0x25FF },
2789 { .name: "Georgian", .first: 0x10A0, .second: 0x10FF },
2790 { .name: "GeorgianSupplement", .first: 0x2D00, .second: 0x2D2F },
2791 { .name: "Glagolitic", .first: 0x2C00, .second: 0x2C5F },
2792 { .name: "Gothic", .first: 0x10330, .second: 0x1034F },
2793 { .name: "Greek", .first: 0x0370, .second: 0x03FF },
2794 { .name: "GreekExtended", .first: 0x1F00, .second: 0x1FFF },
2795 { .name: "Gujarati", .first: 0x0A80, .second: 0x0AFF },
2796 { .name: "Gurmukhi", .first: 0x0A00, .second: 0x0A7F },
2797 { .name: "HalfwidthandFullwidthForms", .first: 0xFF00, .second: 0xFFEF },
2798 { .name: "HangulCompatibilityJamo", .first: 0x3130, .second: 0x318F },
2799 { .name: "HangulJamo", .first: 0x1100, .second: 0x11FF },
2800 { .name: "HangulSyllables", .first: 0xAC00, .second: 0xD7A3 },
2801 { .name: "Hanunoo", .first: 0x1720, .second: 0x173F },
2802 { .name: "Hebrew", .first: 0x0590, .second: 0x05FF },
2803 { .name: "Hiragana", .first: 0x3040, .second: 0x309F },
2804 { .name: "IPAExtensions", .first: 0x0250, .second: 0x02AF },
2805 { .name: "IdeographicDescriptionCharacters", .first: 0x2FF0, .second: 0x2FFF },
2806 { .name: "Kanbun", .first: 0x3190, .second: 0x319F },
2807 { .name: "KangxiRadicals", .first: 0x2F00, .second: 0x2FDF },
2808 { .name: "Kannada", .first: 0x0C80, .second: 0x0CFF },
2809 { .name: "Katakana", .first: 0x30A0, .second: 0x30FF },
2810 { .name: "KatakanaPhoneticExtensions", .first: 0x31F0, .second: 0x31FF },
2811 { .name: "Kharoshthi", .first: 0x10A00, .second: 0x10A5F },
2812 { .name: "Khmer", .first: 0x1780, .second: 0x17FF },
2813 { .name: "KhmerSymbols", .first: 0x19E0, .second: 0x19FF },
2814 { .name: "Lao", .first: 0x0E80, .second: 0x0EFF },
2815 { .name: "Latin-1Supplement", .first: 0x0080, .second: 0x00FF },
2816 { .name: "LatinExtended-A", .first: 0x0100, .second: 0x017F },
2817 { .name: "LatinExtended-B", .first: 0x0180, .second: 0x024F },
2818 { .name: "LatinExtendedAdditional", .first: 0x1E00, .second: 0x1EFF },
2819 { .name: "LetterlikeSymbols", .first: 0x2100, .second: 0x214F },
2820 { .name: "Limbu", .first: 0x1900, .second: 0x194F },
2821 { .name: "LinearBIdeograms", .first: 0x10080, .second: 0x100FF },
2822 { .name: "LinearBSyllabary", .first: 0x10000, .second: 0x1007F },
2823 { .name: "Malayalam", .first: 0x0D00, .second: 0x0D7F },
2824 { .name: "MathematicalAlphanumericSymbols", .first: 0x1D400, .second: 0x1D7FF },
2825 { .name: "MathematicalOperators", .first: 0x2200, .second: 0x22FF },
2826 { .name: "MiscellaneousMathematicalSymbols-A", .first: 0x27C0, .second: 0x27EF },
2827 { .name: "MiscellaneousMathematicalSymbols-B", .first: 0x2980, .second: 0x29FF },
2828 { .name: "MiscellaneousSymbols", .first: 0x2600, .second: 0x26FF },
2829 { .name: "MiscellaneousSymbolsandArrows", .first: 0x2B00, .second: 0x2BFF },
2830 { .name: "MiscellaneousTechnical", .first: 0x2300, .second: 0x23FF },
2831 { .name: "ModifierToneLetters", .first: 0xA700, .second: 0xA71F },
2832 { .name: "Mongolian", .first: 0x1800, .second: 0x18AF },
2833 { .name: "MusicalSymbols", .first: 0x1D100, .second: 0x1D1FF },
2834 { .name: "Myanmar", .first: 0x1000, .second: 0x109F },
2835 { .name: "NewTaiLue", .first: 0x1980, .second: 0x19DF },
2836 { .name: "NumberForms", .first: 0x2150, .second: 0x218F },
2837 { .name: "Ogham", .first: 0x1680, .second: 0x169F },
2838 { .name: "OldItalic", .first: 0x10300, .second: 0x1032F },
2839 { .name: "OldPersian", .first: 0x103A0, .second: 0x103DF },
2840 { .name: "OpticalCharacterRecognition", .first: 0x2440, .second: 0x245F },
2841 { .name: "Oriya", .first: 0x0B00, .second: 0x0B7F },
2842 { .name: "Osmanya", .first: 0x10480, .second: 0x104AF },
2843 { .name: "PhoneticExtensions", .first: 0x1D00, .second: 0x1D7F },
2844 { .name: "PhoneticExtensionsSupplement", .first: 0x1D80, .second: 0x1DBF },
2845 { .name: "PrivateUse", .first: 0xE000, .second: 0xF8FF },
2846 { .name: "Runic", .first: 0x16A0, .second: 0x16FF },
2847 { .name: "Shavian", .first: 0x10450, .second: 0x1047F },
2848 { .name: "Sinhala", .first: 0x0D80, .second: 0x0DFF },
2849 { .name: "SmallFormVariants", .first: 0xFE50, .second: 0xFE6F },
2850 { .name: "SpacingModifierLetters", .first: 0x02B0, .second: 0x02FF },
2851 { .name: "Specials", .first: 0xFFF0, .second: 0xFFFF },
2852 { .name: "SuperscriptsandSubscripts", .first: 0x2070, .second: 0x209F },
2853 { .name: "SupplementalArrows-A", .first: 0x27F0, .second: 0x27FF },
2854 { .name: "SupplementalArrows-B", .first: 0x2900, .second: 0x297F },
2855 { .name: "SupplementalMathematicalOperators", .first: 0x2A00, .second: 0x2AFF },
2856 { .name: "SupplementalPunctuation", .first: 0x2E00, .second: 0x2E7F },
2857 { .name: "SupplementaryPrivateUseArea-A", .first: 0xF0000, .second: 0xFFFFF },
2858 { .name: "SupplementaryPrivateUseArea-B", .first: 0x100000, .second: 0x10FFFF },
2859 { .name: "SylotiNagri", .first: 0xA800, .second: 0xA82F },
2860 { .name: "Syriac", .first: 0x0700, .second: 0x074F },
2861 { .name: "Tagalog", .first: 0x1700, .second: 0x171F },
2862 { .name: "Tagbanwa", .first: 0x1760, .second: 0x177F },
2863 { .name: "Tags", .first: 0xE0000, .second: 0xE007F },
2864 { .name: "TaiLe", .first: 0x1950, .second: 0x197F },
2865 { .name: "TaiXuanJingSymbols", .first: 0x1D300, .second: 0x1D35F },
2866 { .name: "Tamil", .first: 0x0B80, .second: 0x0BFF },
2867 { .name: "Telugu", .first: 0x0C00, .second: 0x0C7F },
2868 { .name: "Thaana", .first: 0x0780, .second: 0x07BF },
2869 { .name: "Thai", .first: 0x0E00, .second: 0x0E7F },
2870 { .name: "Tibetan", .first: 0x0F00, .second: 0x0FFF },
2871 { .name: "Tifinagh", .first: 0x2D30, .second: 0x2D7F },
2872 { .name: "Ugaritic", .first: 0x10380, .second: 0x1039F },
2873 { .name: "UnifiedCanadianAboriginalSyllabics", .first: 0x1400, .second: 0x167F },
2874 { .name: "VariationSelectors", .first: 0xFE00, .second: 0xFE0F },
2875 { .name: "VariationSelectorsSupplement", .first: 0xE0100, .second: 0xE01EF },
2876 { .name: "VerticalForms", .first: 0xFE10, .second: 0xFE1F },
2877 { .name: "YiRadicals", .first: 0xA490, .second: 0xA4CF },
2878 { .name: "YiSyllables", .first: 0xA000, .second: 0xA48F },
2879 { .name: "YijingHexagramSymbols", .first: 0x4DC0, .second: 0x4DFF }
2880};
2881
2882inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2)
2883{ return qstrcmp(str1: entry1.name, str2: entry2.name) < 0; }
2884inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry)
2885{ return qstrcmp(str1: name, str2: entry.name) < 0; }
2886inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name)
2887{ return qstrcmp(str1: entry.name, str2: name) < 0; }
2888#endif // QT_NO_REGEXP_CCLASS
2889
2890int QRegExpEngine::getChar()
2891{
2892 return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2893}
2894
2895int QRegExpEngine::getEscape()
2896{
2897#ifndef QT_NO_REGEXP_ESCAPE
2898 const char tab[] = "afnrtv"; // no b, as \b means word boundary
2899 const char backTab[] = "\a\f\n\r\t\v";
2900 ushort low;
2901 int i;
2902#endif
2903 ushort val;
2904 int prevCh = yyCh;
2905
2906 if (prevCh == EOS) {
2907 error(RXERR_END);
2908 return Tok_Char | '\\';
2909 }
2910 yyCh = getChar();
2911#ifndef QT_NO_REGEXP_ESCAPE
2912 if ((prevCh & ~0xff) == 0) {
2913 const char *p = strchr(s: tab, c: prevCh);
2914 if (p != nullptr)
2915 return Tok_Char | backTab[p - tab];
2916 }
2917#endif
2918
2919 switch (prevCh) {
2920#ifndef QT_NO_REGEXP_ESCAPE
2921 case '0':
2922 val = 0;
2923 for (i = 0; i < 3; i++) {
2924 if (yyCh >= '0' && yyCh <= '7')
2925 val = (val << 3) | (yyCh - '0');
2926 else
2927 break;
2928 yyCh = getChar();
2929 }
2930 if ((val & ~0377) != 0)
2931 error(RXERR_OCTAL);
2932 return Tok_Char | val;
2933#endif
2934#ifndef QT_NO_REGEXP_ESCAPE
2935 case 'B':
2936 return Tok_NonWord;
2937#endif
2938#ifndef QT_NO_REGEXP_CCLASS
2939 case 'D':
2940 // see QChar::isDigit()
2941 yyCharClass->addCategories(cats: uint(-1) ^ FLAG(QChar::Number_DecimalDigit));
2942 return Tok_CharClass;
2943 case 'S':
2944 // see QChar::isSpace()
2945 yyCharClass->addCategories(cats: uint(-1) ^ (FLAG(QChar::Separator_Space) |
2946 FLAG(QChar::Separator_Line) |
2947 FLAG(QChar::Separator_Paragraph) |
2948 FLAG(QChar::Other_Control)));
2949 yyCharClass->addRange(from: 0x0000, to: 0x0008);
2950 yyCharClass->addRange(from: 0x000e, to: 0x001f);
2951 yyCharClass->addRange(from: 0x007f, to: 0x0084);
2952 yyCharClass->addRange(from: 0x0086, to: 0x009f);
2953 return Tok_CharClass;
2954 case 'W':
2955 // see QChar::isLetterOrNumber() and QChar::isMark()
2956 yyCharClass->addCategories(cats: uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) |
2957 FLAG(QChar::Mark_SpacingCombining) |
2958 FLAG(QChar::Mark_Enclosing) |
2959 FLAG(QChar::Number_DecimalDigit) |
2960 FLAG(QChar::Number_Letter) |
2961 FLAG(QChar::Number_Other) |
2962 FLAG(QChar::Letter_Uppercase) |
2963 FLAG(QChar::Letter_Lowercase) |
2964 FLAG(QChar::Letter_Titlecase) |
2965 FLAG(QChar::Letter_Modifier) |
2966 FLAG(QChar::Letter_Other) |
2967 FLAG(QChar::Punctuation_Connector)));
2968 yyCharClass->addRange(from: 0x203f, to: 0x2040);
2969 yyCharClass->addSingleton(ch: 0x2040);
2970 yyCharClass->addSingleton(ch: 0x2054);
2971 yyCharClass->addSingleton(ch: 0x30fb);
2972 yyCharClass->addRange(from: 0xfe33, to: 0xfe34);
2973 yyCharClass->addRange(from: 0xfe4d, to: 0xfe4f);
2974 yyCharClass->addSingleton(ch: 0xff3f);
2975 yyCharClass->addSingleton(ch: 0xff65);
2976 return Tok_CharClass;
2977#endif
2978#ifndef QT_NO_REGEXP_ESCAPE
2979 case 'b':
2980 return Tok_Word;
2981#endif
2982#ifndef QT_NO_REGEXP_CCLASS
2983 case 'd':
2984 // see QChar::isDigit()
2985 yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit));
2986 return Tok_CharClass;
2987 case 's':
2988 // see QChar::isSpace()
2989 yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
2990 FLAG(QChar::Separator_Line) |
2991 FLAG(QChar::Separator_Paragraph));
2992 yyCharClass->addRange(from: 0x0009, to: 0x000d);
2993 yyCharClass->addSingleton(ch: 0x0085);
2994 return Tok_CharClass;
2995 case 'w':
2996 // see QChar::isLetterOrNumber() and QChar::isMark()
2997 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
2998 FLAG(QChar::Mark_SpacingCombining) |
2999 FLAG(QChar::Mark_Enclosing) |
3000 FLAG(QChar::Number_DecimalDigit) |
3001 FLAG(QChar::Number_Letter) |
3002 FLAG(QChar::Number_Other) |
3003 FLAG(QChar::Letter_Uppercase) |
3004 FLAG(QChar::Letter_Lowercase) |
3005 FLAG(QChar::Letter_Titlecase) |
3006 FLAG(QChar::Letter_Modifier) |
3007 FLAG(QChar::Letter_Other));
3008 yyCharClass->addSingleton(ch: 0x005f); // '_'
3009 return Tok_CharClass;
3010 case 'I':
3011 if (!xmlSchemaExtensions)
3012 break;
3013 yyCharClass->setNegative(!yyCharClass->negative());
3014 Q_FALLTHROUGH();
3015 case 'i':
3016 if (xmlSchemaExtensions) {
3017 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3018 FLAG(QChar::Mark_SpacingCombining) |
3019 FLAG(QChar::Mark_Enclosing) |
3020 FLAG(QChar::Number_DecimalDigit) |
3021 FLAG(QChar::Number_Letter) |
3022 FLAG(QChar::Number_Other) |
3023 FLAG(QChar::Letter_Uppercase) |
3024 FLAG(QChar::Letter_Lowercase) |
3025 FLAG(QChar::Letter_Titlecase) |
3026 FLAG(QChar::Letter_Modifier) |
3027 FLAG(QChar::Letter_Other));
3028 yyCharClass->addSingleton(ch: 0x003a); // ':'
3029 yyCharClass->addSingleton(ch: 0x005f); // '_'
3030 yyCharClass->addRange(from: 0x0041, to: 0x005a); // [A-Z]
3031 yyCharClass->addRange(from: 0x0061, to: 0x007a); // [a-z]
3032 yyCharClass->addRange(from: 0xc0, to: 0xd6);
3033 yyCharClass->addRange(from: 0xd8, to: 0xf6);
3034 yyCharClass->addRange(from: 0xf8, to: 0x2ff);
3035 yyCharClass->addRange(from: 0x370, to: 0x37d);
3036 yyCharClass->addRange(from: 0x37f, to: 0x1fff);
3037 yyCharClass->addRange(from: 0x200c, to: 0x200d);
3038 yyCharClass->addRange(from: 0x2070, to: 0x218f);
3039 yyCharClass->addRange(from: 0x2c00, to: 0x2fef);
3040 yyCharClass->addRange(from: 0x3001, to: 0xd7ff);
3041 yyCharClass->addRange(from: 0xf900, to: 0xfdcf);
3042 yyCharClass->addRange(from: 0xfdf0, to: 0xfffd);
3043 yyCharClass->addRange(from: (ushort)0x10000, to: (ushort)0xeffff);
3044 return Tok_CharClass;
3045 } else {
3046 break;
3047 }
3048 case 'C':
3049 if (!xmlSchemaExtensions)
3050 break;
3051 yyCharClass->setNegative(!yyCharClass->negative());
3052 Q_FALLTHROUGH();
3053 case 'c':
3054 if (xmlSchemaExtensions) {
3055 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3056 FLAG(QChar::Mark_SpacingCombining) |
3057 FLAG(QChar::Mark_Enclosing) |
3058 FLAG(QChar::Number_DecimalDigit) |
3059 FLAG(QChar::Number_Letter) |
3060 FLAG(QChar::Number_Other) |
3061 FLAG(QChar::Letter_Uppercase) |
3062 FLAG(QChar::Letter_Lowercase) |
3063 FLAG(QChar::Letter_Titlecase) |
3064 FLAG(QChar::Letter_Modifier) |
3065 FLAG(QChar::Letter_Other));
3066 yyCharClass->addSingleton(ch: 0x002d); // '-'
3067 yyCharClass->addSingleton(ch: 0x002e); // '.'
3068 yyCharClass->addSingleton(ch: 0x003a); // ':'
3069 yyCharClass->addSingleton(ch: 0x005f); // '_'
3070 yyCharClass->addSingleton(ch: 0xb7);
3071 yyCharClass->addRange(from: 0x0030, to: 0x0039); // [0-9]
3072 yyCharClass->addRange(from: 0x0041, to: 0x005a); // [A-Z]
3073 yyCharClass->addRange(from: 0x0061, to: 0x007a); // [a-z]
3074 yyCharClass->addRange(from: 0xc0, to: 0xd6);
3075 yyCharClass->addRange(from: 0xd8, to: 0xf6);
3076 yyCharClass->addRange(from: 0xf8, to: 0x2ff);
3077 yyCharClass->addRange(from: 0x370, to: 0x37d);
3078 yyCharClass->addRange(from: 0x37f, to: 0x1fff);
3079 yyCharClass->addRange(from: 0x200c, to: 0x200d);
3080 yyCharClass->addRange(from: 0x2070, to: 0x218f);
3081 yyCharClass->addRange(from: 0x2c00, to: 0x2fef);
3082 yyCharClass->addRange(from: 0x3001, to: 0xd7ff);
3083 yyCharClass->addRange(from: 0xf900, to: 0xfdcf);
3084 yyCharClass->addRange(from: 0xfdf0, to: 0xfffd);
3085 yyCharClass->addRange(from: (ushort)0x10000, to: (ushort)0xeffff);
3086 yyCharClass->addRange(from: 0x0300, to: 0x036f);
3087 yyCharClass->addRange(from: 0x203f, to: 0x2040);
3088 return Tok_CharClass;
3089 } else {
3090 break;
3091 }
3092 case 'P':
3093 if (!xmlSchemaExtensions)
3094 break;
3095 yyCharClass->setNegative(!yyCharClass->negative());
3096 Q_FALLTHROUGH();
3097 case 'p':
3098 if (xmlSchemaExtensions) {
3099 if (yyCh != '{') {
3100 error(RXERR_CHARCLASS);
3101 return Tok_CharClass;
3102 }
3103
3104 QByteArray category;
3105 yyCh = getChar();
3106 while (yyCh != '}') {
3107 if (yyCh == EOS) {
3108 error(RXERR_END);
3109 return Tok_CharClass;
3110 }
3111 category.append(c: yyCh);
3112 yyCh = getChar();
3113 }
3114 yyCh = getChar(); // skip closing '}'
3115
3116 int catlen = category.length();
3117 if (catlen == 1 || catlen == 2) {
3118 switch (category.at(i: 0)) {
3119 case 'M':
3120 if (catlen == 1) {
3121 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |
3122 FLAG(QChar::Mark_SpacingCombining) |
3123 FLAG(QChar::Mark_Enclosing));
3124 } else {
3125 switch (category.at(i: 1)) {
3126 case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn
3127 case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc
3128 case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me
3129 default: error(RXERR_CATEGORY); break;
3130 }
3131 }
3132 break;
3133 case 'N':
3134 if (catlen == 1) {
3135 yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) |
3136 FLAG(QChar::Number_Letter) |
3137 FLAG(QChar::Number_Other));
3138 } else {
3139 switch (category.at(i: 1)) {
3140 case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd
3141 case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl
3142 case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No
3143 default: error(RXERR_CATEGORY); break;
3144 }
3145 }
3146 break;
3147 case 'Z':
3148 if (catlen == 1) {
3149 yyCharClass->addCategories(FLAG(QChar::Separator_Space) |
3150 FLAG(QChar::Separator_Line) |
3151 FLAG(QChar::Separator_Paragraph));
3152 } else {
3153 switch (category.at(i: 1)) {
3154 case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs
3155 case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl
3156 case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp
3157 default: error(RXERR_CATEGORY); break;
3158 }
3159 }
3160 break;
3161 case 'C':
3162 if (catlen == 1) {
3163 yyCharClass->addCategories(FLAG(QChar::Other_Control) |
3164 FLAG(QChar::Other_Format) |
3165 FLAG(QChar::Other_Surrogate) |
3166 FLAG(QChar::Other_PrivateUse) |
3167 FLAG(QChar::Other_NotAssigned));
3168 } else {
3169 switch (category.at(i: 1)) {
3170 case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc
3171 case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf
3172 case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs
3173 case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co
3174 case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn
3175 default: error(RXERR_CATEGORY); break;
3176 }
3177 }
3178 break;
3179 case 'L':
3180 if (catlen == 1) {
3181 yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) |
3182 FLAG(QChar::Letter_Lowercase) |
3183 FLAG(QChar::Letter_Titlecase) |
3184 FLAG(QChar::Letter_Modifier) |
3185 FLAG(QChar::Letter_Other));
3186 } else {
3187 switch (category.at(i: 1)) {
3188 case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu
3189 case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll
3190 case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt
3191 case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm
3192 case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo
3193 default: error(RXERR_CATEGORY); break;
3194 }
3195 }
3196 break;
3197 case 'P':
3198 if (catlen == 1) {
3199 yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) |
3200 FLAG(QChar::Punctuation_Dash) |
3201 FLAG(QChar::Punctuation_Open) |
3202 FLAG(QChar::Punctuation_Close) |
3203 FLAG(QChar::Punctuation_InitialQuote) |
3204 FLAG(QChar::Punctuation_FinalQuote) |
3205 FLAG(QChar::Punctuation_Other));
3206 } else {
3207 switch (category.at(i: 1)) {
3208 case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc
3209 case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd
3210 case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps
3211 case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe
3212 case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi
3213 case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf
3214 case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po
3215 default: error(RXERR_CATEGORY); break;
3216 }
3217 }
3218 break;
3219 case 'S':
3220 if (catlen == 1) {
3221 yyCharClass->addCategories(FLAG(QChar::Symbol_Math) |
3222 FLAG(QChar::Symbol_Currency) |
3223 FLAG(QChar::Symbol_Modifier) |
3224 FLAG(QChar::Symbol_Other));
3225 } else {
3226 switch (category.at(i: 1)) {
3227 case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm
3228 case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc
3229 case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk
3230 case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So
3231 default: error(RXERR_CATEGORY); break;
3232 }
3233 }
3234 break;
3235 default:
3236 error(RXERR_CATEGORY);
3237 break;
3238 }
3239 } else if (catlen > 2 && category.at(i: 0) == 'I' && category.at(i: 1) == 's') {
3240 static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]);
3241 const char * const categoryFamily = category.constData() + 2;
3242 const CategoriesRangeMapEntry *r = std::lower_bound(first: categoriesRangeMap, last: categoriesRangeMap + N, val: categoryFamily);
3243 if (r != categoriesRangeMap + N && qstrcmp(str1: r->name, str2: categoryFamily) == 0)
3244 yyCharClass->addRange(from: r->first, to: r->second);
3245 else
3246 error(RXERR_CATEGORY);
3247 } else {
3248 error(RXERR_CATEGORY);
3249 }
3250 return Tok_CharClass;
3251 } else {
3252 break;
3253 }
3254#endif
3255#ifndef QT_NO_REGEXP_ESCAPE
3256 case 'x':
3257 val = 0;
3258 for (i = 0; i < 4; i++) {
3259 low = QChar(yyCh).toLower().unicode();
3260 if (low >= '0' && low <= '9')
3261 val = (val << 4) | (low - '0');
3262 else if (low >= 'a' && low <= 'f')
3263 val = (val << 4) | (low - 'a' + 10);
3264 else
3265 break;
3266 yyCh = getChar();
3267 }
3268 return Tok_Char | val;
3269#endif
3270 default:
3271 break;
3272 }
3273 if (prevCh >= '1' && prevCh <= '9') {
3274#ifndef QT_NO_REGEXP_BACKREF
3275 val = prevCh - '0';
3276 while (yyCh >= '0' && yyCh <= '9') {
3277 val = (val * 10) + (yyCh - '0');
3278 yyCh = getChar();
3279 }
3280 return Tok_BackRef | val;
3281#else
3282 error(RXERR_DISABLED);
3283#endif
3284 }
3285 return Tok_Char | prevCh;
3286}
3287
3288#ifndef QT_NO_REGEXP_INTERVAL
3289int QRegExpEngine::getRep(int def)
3290{
3291 if (yyCh >= '0' && yyCh <= '9') {
3292 int rep = 0;
3293 do {
3294 rep = 10 * rep + yyCh - '0';
3295 if (rep >= InftyRep) {
3296 error(RXERR_REPETITION);
3297 rep = def;
3298 }
3299 yyCh = getChar();
3300 } while (yyCh >= '0' && yyCh <= '9');
3301 return rep;
3302 } else {
3303 return def;
3304 }
3305}
3306#endif
3307
3308#ifndef QT_NO_REGEXP_LOOKAHEAD
3309void QRegExpEngine::skipChars(int n)
3310{
3311 if (n > 0) {
3312 yyPos += n - 1;
3313 yyCh = getChar();
3314 }
3315}
3316#endif
3317
3318void QRegExpEngine::error(const char *msg)
3319{
3320 if (yyError.isEmpty())
3321 yyError = QLatin1String(msg);
3322}
3323
3324void QRegExpEngine::startTokenizer(const QChar *rx, int len)
3325{
3326 yyIn = rx;
3327 yyPos0 = 0;
3328 yyPos = 0;
3329 yyLen = len;
3330 yyCh = getChar();
3331 yyCharClass.reset(other: new QRegExpCharClass);
3332 yyMinRep = 0;
3333 yyMaxRep = 0;
3334 yyError = QString();
3335}
3336
3337int QRegExpEngine::getToken()
3338{
3339#ifndef QT_NO_REGEXP_CCLASS
3340 ushort pendingCh = 0;
3341 bool charPending;
3342 bool rangePending;
3343 int tok;
3344#endif
3345 int prevCh = yyCh;
3346
3347 yyPos0 = yyPos - 1;
3348#ifndef QT_NO_REGEXP_CCLASS
3349 yyCharClass->clear();
3350#endif
3351 yyMinRep = 0;
3352 yyMaxRep = 0;
3353 yyCh = getChar();
3354
3355 switch (prevCh) {
3356 case EOS:
3357 yyPos0 = yyPos;
3358 return Tok_Eos;
3359 case '$':
3360 return Tok_Dollar;
3361 case '(':
3362 if (yyCh == '?') {
3363 prevCh = getChar();
3364 yyCh = getChar();
3365 switch (prevCh) {
3366#ifndef QT_NO_REGEXP_LOOKAHEAD
3367 case '!':
3368 return Tok_NegLookahead;
3369 case '=':
3370 return Tok_PosLookahead;
3371#endif
3372 case ':':
3373 return Tok_MagicLeftParen;
3374 case '<':
3375 error(RXERR_LOOKBEHIND);
3376 return Tok_MagicLeftParen;
3377 default:
3378 error(RXERR_LOOKAHEAD);
3379 return Tok_MagicLeftParen;
3380 }
3381 } else {
3382 return Tok_LeftParen;
3383 }
3384 case ')':
3385 return Tok_RightParen;
3386 case '*':
3387 yyMinRep = 0;
3388 yyMaxRep = InftyRep;
3389 return Tok_Quantifier;
3390 case '+':
3391 yyMinRep = 1;
3392 yyMaxRep = InftyRep;
3393 return Tok_Quantifier;
3394 case '.':
3395#ifndef QT_NO_REGEXP_CCLASS
3396 yyCharClass->setNegative(true);
3397#endif
3398 return Tok_CharClass;
3399 case '?':
3400 yyMinRep = 0;
3401 yyMaxRep = 1;
3402 return Tok_Quantifier;
3403 case '[':
3404#ifndef QT_NO_REGEXP_CCLASS
3405 if (yyCh == '^') {
3406 yyCharClass->setNegative(true);
3407 yyCh = getChar();
3408 }
3409 charPending = false;
3410 rangePending = false;
3411 do {
3412 if (yyCh == '-' && charPending && !rangePending) {
3413 rangePending = true;
3414 yyCh = getChar();
3415 } else {
3416 if (charPending && !rangePending) {
3417 yyCharClass->addSingleton(ch: pendingCh);
3418 charPending = false;
3419 }
3420 if (yyCh == '\\') {
3421 yyCh = getChar();
3422 tok = getEscape();
3423 if (tok == Tok_Word)
3424 tok = '\b';
3425 } else {
3426 tok = Tok_Char | yyCh;
3427 yyCh = getChar();
3428 }
3429 if (tok == Tok_CharClass) {
3430 if (rangePending) {
3431 yyCharClass->addSingleton(ch: '-');
3432 yyCharClass->addSingleton(ch: pendingCh);
3433 charPending = false;
3434 rangePending = false;
3435 }
3436 } else if ((tok & Tok_Char) != 0) {
3437 if (rangePending) {
3438 yyCharClass->addRange(from: pendingCh, to: tok ^ Tok_Char);
3439 charPending = false;
3440 rangePending = false;
3441 } else {
3442 pendingCh = tok ^ Tok_Char;
3443 charPending = true;
3444 }
3445 } else {
3446 error(RXERR_CHARCLASS);
3447 }
3448 }
3449 } while (yyCh != ']' && yyCh != EOS);
3450 if (rangePending)
3451 yyCharClass->addSingleton(ch: '-');
3452 if (charPending)
3453 yyCharClass->addSingleton(ch: pendingCh);
3454 if (yyCh == EOS)
3455 error(RXERR_END);
3456 else
3457 yyCh = getChar();
3458 return Tok_CharClass;
3459#else
3460 error(RXERR_END);
3461 return Tok_Char | '[';
3462#endif
3463 case '\\':
3464 return getEscape();
3465 case ']':
3466 error(RXERR_LEFTDELIM);
3467 return Tok_Char | ']';
3468 case '^':
3469 return Tok_Caret;
3470 case '{':
3471#ifndef QT_NO_REGEXP_INTERVAL
3472 yyMinRep = getRep(def: 0);
3473 yyMaxRep = yyMinRep;
3474 if (yyCh == ',') {
3475 yyCh = getChar();
3476 yyMaxRep = getRep(def: InftyRep);
3477 }
3478 if (yyMaxRep < yyMinRep)
3479 error(RXERR_INTERVAL);
3480 if (yyCh != '}')
3481 error(RXERR_REPETITION);
3482 yyCh = getChar();
3483 return Tok_Quantifier;
3484#else
3485 error(RXERR_DISABLED);
3486 return Tok_Char | '{';
3487#endif
3488 case '|':
3489 return Tok_Bar;
3490 case '}':
3491 error(RXERR_LEFTDELIM);
3492 return Tok_Char | '}';
3493 default:
3494 return Tok_Char | prevCh;
3495 }
3496}
3497
3498int QRegExpEngine::parse(const QChar *pattern, int len)
3499{
3500 valid = true;
3501 startTokenizer(rx: pattern, len);
3502 yyTok = getToken();
3503#ifndef QT_NO_REGEXP_CAPTURE
3504 yyMayCapture = true;
3505#else
3506 yyMayCapture = false;
3507#endif
3508
3509#ifndef QT_NO_REGEXP_CAPTURE
3510 int atom = startAtom(officialCapture: false);
3511#endif
3512 QRegExpCharClass anything;
3513 Box box(this); // create InitialState
3514 box.set(anything);
3515 Box rightBox(this); // create FinalState
3516 rightBox.set(anything);
3517
3518 Box middleBox(this);
3519 parseExpression(box: &middleBox);
3520#ifndef QT_NO_REGEXP_CAPTURE
3521 finishAtom(atom, needCapture: false);
3522#endif
3523#ifndef QT_NO_REGEXP_OPTIM
3524 middleBox.setupHeuristics();
3525#endif
3526 box.cat(b: middleBox);
3527 box.cat(b: rightBox);
3528 yyCharClass.reset();
3529
3530#ifndef QT_NO_REGEXP_CAPTURE
3531 for (int i = 0; i < nf; ++i) {
3532 switch (f[i].capture) {
3533 case QRegExpAtom::NoCapture:
3534 break;
3535 case QRegExpAtom::OfficialCapture:
3536 f[i].capture = ncap;
3537 captureForOfficialCapture.append(t: ncap);
3538 ++ncap;
3539 ++officialncap;
3540 break;
3541 case QRegExpAtom::UnofficialCapture:
3542 f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3543 }
3544 }
3545
3546#ifndef QT_NO_REGEXP_BACKREF
3547#ifndef QT_NO_REGEXP_OPTIM
3548 if (officialncap == 0 && nbrefs == 0) {
3549 ncap = nf = 0;
3550 f.clear();
3551 }
3552#endif
3553 // handle the case where there's a \5 with no corresponding capture
3554 // (captureForOfficialCapture.size() != officialncap)
3555 for (int i = 0; i < nbrefs - officialncap; ++i) {
3556 captureForOfficialCapture.append(t: ncap);
3557 ++ncap;
3558 }
3559#endif
3560#endif
3561
3562 if (!yyError.isEmpty())
3563 return -1;
3564
3565#ifndef QT_NO_REGEXP_OPTIM
3566 const QRegExpAutomatonState &sinit = s.at(i: InitialState);
3567 caretAnchored = !sinit.anchors.isEmpty();
3568 if (caretAnchored) {
3569 const QMap<int, int> &anchors = sinit.anchors;
3570 QMap<int, int>::const_iterator a;
3571 for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3572 if (
3573#ifndef QT_NO_REGEXP_ANCHOR_ALT
3574 (*a & Anchor_Alternation) != 0 ||
3575#endif
3576 (*a & Anchor_Caret) == 0)
3577 {
3578 caretAnchored = false;
3579 break;
3580 }
3581 }
3582 }
3583#endif
3584
3585 // cleanup anchors
3586 int numStates = s.count();
3587 for (int i = 0; i < numStates; ++i) {
3588 QRegExpAutomatonState &state = s[i];
3589 if (!state.anchors.isEmpty()) {
3590 QMap<int, int>::iterator a = state.anchors.begin();
3591 while (a != state.anchors.end()) {
3592 if (a.value() == 0)
3593 a = state.anchors.erase(it: a);
3594 else
3595 ++a;
3596 }
3597 }
3598 }
3599
3600 return yyPos0;
3601}
3602
3603void QRegExpEngine::parseAtom(Box *box)
3604{
3605#ifndef QT_NO_REGEXP_LOOKAHEAD
3606 QRegExpEngine *eng = nullptr;
3607 bool neg;
3608 int len;
3609#endif
3610
3611 if ((yyTok & Tok_Char) != 0) {
3612 box->set(QChar(yyTok ^ Tok_Char));
3613 } else {
3614#ifndef QT_NO_REGEXP_OPTIM
3615 trivial = false;
3616#endif
3617 switch (yyTok) {
3618 case Tok_Dollar:
3619 box->catAnchor(a: Anchor_Dollar);
3620 break;
3621 case Tok_Caret:
3622 box->catAnchor(a: Anchor_Caret);
3623 break;
3624#ifndef QT_NO_REGEXP_LOOKAHEAD
3625 case Tok_PosLookahead:
3626 case Tok_NegLookahead:
3627 neg = (yyTok == Tok_NegLookahead);
3628 eng = new QRegExpEngine(cs, greedyQuantifiers);
3629 len = eng->parse(pattern: yyIn + yyPos - 1, len: yyLen - yyPos + 1);
3630 if (len >= 0)
3631 skipChars(n: len);
3632 else
3633 error(RXERR_LOOKAHEAD);
3634 box->catAnchor(a: addLookahead(eng, negative: neg));
3635 yyTok = getToken();
3636 if (yyTok != Tok_RightParen)
3637 error(RXERR_LOOKAHEAD);
3638 break;
3639#endif
3640#ifndef QT_NO_REGEXP_ESCAPE
3641 case Tok_Word:
3642 box->catAnchor(a: Anchor_Word);
3643 break;
3644 case Tok_NonWord:
3645 box->catAnchor(a: Anchor_NonWord);
3646 break;
3647#endif
3648 case Tok_LeftParen:
3649 case Tok_MagicLeftParen:
3650 yyTok = getToken();
3651 parseExpression(box);
3652 if (yyTok != Tok_RightParen)
3653 error(RXERR_END);
3654 break;
3655 case Tok_CharClass:
3656 box->set(*yyCharClass);
3657 break;
3658 case Tok_Quantifier:
3659 error(RXERR_REPETITION);
3660 break;
3661 default:
3662#ifndef QT_NO_REGEXP_BACKREF
3663 if ((yyTok & Tok_BackRef) != 0)
3664 box->set(yyTok ^ Tok_BackRef);
3665 else
3666#endif
3667 error(RXERR_DISABLED);
3668 }
3669 }
3670 yyTok = getToken();
3671}
3672
3673void QRegExpEngine::parseFactor(Box *box)
3674{
3675#ifndef QT_NO_REGEXP_CAPTURE
3676 int outerAtom = greedyQuantifiers ? startAtom(officialCapture: false) : -1;
3677 int innerAtom = startAtom(officialCapture: yyMayCapture && yyTok == Tok_LeftParen);
3678 bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3679#else
3680 const int innerAtom = -1;
3681#endif
3682
3683#ifndef QT_NO_REGEXP_INTERVAL
3684#define YYREDO() \
3685 yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3686 *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3687
3688 const QChar *in = yyIn;
3689 int pos0 = yyPos0;
3690 int pos = yyPos;
3691 int len = yyLen;
3692 int ch = yyCh;
3693 QRegExpCharClass charClass;
3694 if (yyTok == Tok_CharClass)
3695 charClass = *yyCharClass;
3696 int tok = yyTok;
3697 bool mayCapture = yyMayCapture;
3698#endif
3699
3700 parseAtom(box);
3701#ifndef QT_NO_REGEXP_CAPTURE
3702 finishAtom(atom: innerAtom, needCapture: magicLeftParen);
3703#endif
3704
3705 bool hasQuantifier = (yyTok == Tok_Quantifier);
3706 if (hasQuantifier) {
3707#ifndef QT_NO_REGEXP_OPTIM
3708 trivial = false;
3709#endif
3710 if (yyMaxRep == InftyRep) {
3711 box->plus(atom: innerAtom);
3712#ifndef QT_NO_REGEXP_INTERVAL
3713 } else if (yyMaxRep == 0) {
3714 box->clear();
3715#endif
3716 }
3717 if (yyMinRep == 0)
3718 box->opt();
3719
3720#ifndef QT_NO_REGEXP_INTERVAL
3721 yyMayCapture = false;
3722 int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;
3723 int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);
3724
3725 Box rightBox(this);
3726 int i;
3727
3728 for (i = 0; i < beta; i++) {
3729 YYREDO();
3730 Box leftBox(this);
3731 parseAtom(box: &leftBox);
3732 leftBox.cat(b: rightBox);
3733 leftBox.opt();
3734 rightBox = leftBox;
3735 }
3736 for (i = 0; i < alpha; i++) {
3737 YYREDO();
3738 Box leftBox(this);
3739 parseAtom(box: &leftBox);
3740 leftBox.cat(b: rightBox);
3741 rightBox = leftBox;
3742 }
3743 rightBox.cat(b: *box);
3744 *box = rightBox;
3745#endif
3746 yyTok = getToken();
3747#ifndef QT_NO_REGEXP_INTERVAL
3748 yyMayCapture = mayCapture;
3749#endif
3750 }
3751#undef YYREDO
3752#ifndef QT_NO_REGEXP_CAPTURE
3753 if (greedyQuantifiers)
3754 finishAtom(atom: outerAtom, needCapture: hasQuantifier);
3755#endif
3756}
3757
3758void QRegExpEngine::parseTerm(Box *box)
3759{
3760#ifndef QT_NO_REGEXP_OPTIM
3761 if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3762 parseFactor(box);
3763#endif
3764 while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3765 Box rightBox(this);
3766 parseFactor(box: &rightBox);
3767 box->cat(b: rightBox);
3768 }
3769}
3770
3771void QRegExpEngine::parseExpression(Box *box)
3772{
3773 parseTerm(box);
3774 while (yyTok == Tok_Bar) {
3775#ifndef QT_NO_REGEXP_OPTIM
3776 trivial = false;
3777#endif
3778 Box rightBox(this);
3779 yyTok = getToken();
3780 parseTerm(box: &rightBox);
3781 box->orx(b: rightBox);
3782 }
3783}
3784
3785/*
3786 The struct QRegExpPrivate contains the private data of a regular
3787 expression other than the automaton. It makes it possible for many
3788 QRegExp objects to use the same QRegExpEngine object with different
3789 QRegExpPrivate objects.
3790*/
3791struct QRegExpPrivate
3792{
3793 QRegExpEngine *eng;
3794 QRegExpEngineKey engineKey;
3795 bool minimal;
3796#ifndef QT_NO_REGEXP_CAPTURE
3797 QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3798 QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3799#endif
3800 QRegExpMatchState matchState;
3801
3802 inline QRegExpPrivate()
3803 : eng(nullptr), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
3804 inline QRegExpPrivate(const QRegExpEngineKey &key)
3805 : eng(nullptr), engineKey(key), minimal(false) {}
3806};
3807
3808#if !defined(QT_NO_REGEXP_OPTIM)
3809struct QRECache
3810{
3811 typedef QHash<QRegExpEngineKey, QRegExpEngine *> EngineCache;
3812 typedef QCache<QRegExpEngineKey, QRegExpEngine> UnusedEngineCache;
3813 EngineCache usedEngines;
3814 UnusedEngineCache unusedEngines;
3815};
3816Q_GLOBAL_STATIC(QRECache, engineCache)
3817static QBasicMutex engineCacheMutex;
3818#endif // QT_NO_REGEXP_OPTIM
3819
3820static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
3821{
3822#if !defined(QT_NO_REGEXP_OPTIM)
3823 const auto locker = qt_scoped_lock(mutex&: engineCacheMutex);
3824 if (!eng->ref.deref()) {
3825 if (QRECache *c = engineCache()) {
3826 c->unusedEngines.insert(akey: key, aobject: eng, acost: 4 + key.pattern.length() / 4);
3827 c->usedEngines.remove(akey: key);
3828 } else {
3829 delete eng;
3830 }
3831 }
3832#else
3833 Q_UNUSED(key);
3834 if (!eng->ref.deref())
3835 delete eng;
3836#endif
3837}
3838
3839static void prepareEngine_helper(QRegExpPrivate *priv)
3840{
3841 Q_ASSERT(!priv->eng);
3842
3843#if !defined(QT_NO_REGEXP_OPTIM)
3844 const auto locker = qt_scoped_lock(mutex&: engineCacheMutex);
3845 if (QRECache *c = engineCache()) {
3846 priv->eng = c->unusedEngines.take(key: priv->engineKey);
3847 if (!priv->eng)
3848 priv->eng = c->usedEngines.value(akey: priv->engineKey);
3849 if (!priv->eng)
3850 priv->eng = new QRegExpEngine(priv->engineKey);
3851 else
3852 priv->eng->ref.ref();
3853
3854 c->usedEngines.insert(akey: priv->engineKey, avalue: priv->eng);
3855 return;
3856 }
3857#endif // QT_NO_REGEXP_OPTIM
3858
3859 priv->eng = new QRegExpEngine(priv->engineKey);
3860}
3861
3862inline static void prepareEngine(QRegExpPrivate *priv)
3863{
3864 if (priv->eng)
3865 return;
3866 prepareEngine_helper(priv);
3867 priv->matchState.prepareForMatch(eng: priv->eng);
3868}
3869
3870static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
3871{
3872 prepareEngine(priv);
3873 priv->matchState.prepareForMatch(eng: priv->eng);
3874#ifndef QT_NO_REGEXP_CAPTURE
3875 priv->t = str;
3876 priv->capturedCache.clear();
3877#else
3878 Q_UNUSED(str);
3879#endif
3880}
3881
3882static void invalidateEngine(QRegExpPrivate *priv)
3883{
3884 if (priv->eng) {
3885 derefEngine(eng: priv->eng, key: priv->engineKey);
3886 priv->eng = nullptr;
3887 priv->matchState.drain();
3888 }
3889}
3890
3891/*!
3892 \enum QRegExp::CaretMode
3893
3894 The CaretMode enum defines the different meanings of the caret
3895 (\b{^}) in a regular expression. The possible values are:
3896
3897 \value CaretAtZero
3898 The caret corresponds to index 0 in the searched string.
3899
3900 \value CaretAtOffset
3901 The caret corresponds to the start offset of the search.
3902
3903 \value CaretWontMatch
3904 The caret never matches.
3905*/
3906
3907/*!
3908 \enum QRegExp::PatternSyntax
3909
3910 The syntax used to interpret the meaning of the pattern.
3911
3912 \value RegExp A rich Perl-like pattern matching syntax. This is
3913 the default.
3914
3915 \value RegExp2 Like RegExp, but with \l{greedy quantifiers}.
3916 (Introduced in Qt 4.2.)
3917
3918 \value Wildcard This provides a simple pattern matching syntax
3919 similar to that used by shells (command interpreters) for "file
3920 globbing". See \l{QRegExp wildcard matching}.
3921
3922 \value WildcardUnix This is similar to Wildcard but with the
3923 behavior of a Unix shell. The wildcard characters can be escaped
3924 with the character "\\".
3925
3926 \value FixedString The pattern is a fixed string. This is
3927 equivalent to using the RegExp pattern on a string in
3928 which all metacharacters are escaped using escape().
3929
3930 \value W3CXmlSchema11 The pattern is a regular expression as
3931 defined by the W3C XML Schema 1.1 specification.
3932
3933 \sa setPatternSyntax()
3934*/
3935
3936/*!
3937 Constructs an empty regexp.
3938
3939 \sa isValid(), errorString()
3940*/
3941QRegExp::QRegExp()
3942{
3943 priv = new QRegExpPrivate;
3944 prepareEngine(priv);
3945}
3946
3947/*!
3948 Constructs a regular expression object for the given \a pattern
3949 string. The pattern must be given using wildcard notation if \a
3950 syntax is \l Wildcard; the default is \l RegExp. The pattern is
3951 case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3952 greedy (maximal), but can be changed by calling
3953 setMinimal().
3954
3955 \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3956*/
3957QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3958{
3959 priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));
3960 prepareEngine(priv);
3961}
3962
3963/*!
3964 Constructs a regular expression as a copy of \a rx.
3965
3966 \sa operator=()
3967*/
3968QRegExp::QRegExp(const QRegExp &rx)
3969{
3970 priv = new QRegExpPrivate;
3971 operator=(rx);
3972}
3973
3974/*!
3975 Destroys the regular expression and cleans up its internal data.
3976*/
3977QRegExp::~QRegExp()
3978{
3979 invalidateEngine(priv);
3980 delete priv;
3981}
3982
3983/*!
3984 Copies the regular expression \a rx and returns a reference to the
3985 copy. The case sensitivity, wildcard, and minimal matching options
3986 are also copied.
3987*/
3988QRegExp &QRegExp::operator=(const QRegExp &rx)
3989{
3990 prepareEngine(priv: rx.priv); // to allow sharing
3991 QRegExpEngine *otherEng = rx.priv->eng;
3992 if (otherEng)
3993 otherEng->ref.ref();
3994 invalidateEngine(priv);
3995 priv->eng = otherEng;
3996 priv->engineKey = rx.priv->engineKey;
3997 priv->minimal = rx.priv->minimal;
3998#ifndef QT_NO_REGEXP_CAPTURE
3999 priv->t = rx.priv->t;
4000 priv->capturedCache = rx.priv->capturedCache;
4001#endif
4002 if (priv->eng)
4003 priv->matchState.prepareForMatch(eng: priv->eng);
4004 priv->matchState.captured = rx.priv->matchState.captured;
4005 return *this;
4006}
4007
4008/*!
4009 \fn QRegExp &QRegExp::operator=(QRegExp &&other)
4010
4011 Move-assigns \a other to this QRegExp instance.
4012
4013 \since 5.2
4014*/
4015
4016/*!
4017 \fn void QRegExp::swap(QRegExp &other)
4018 \since 4.8
4019
4020 Swaps regular expression \a other with this regular
4021 expression. This operation is very fast and never fails.
4022*/
4023
4024/*!
4025 Returns \c true if this regular expression is equal to \a rx;
4026 otherwise returns \c false.
4027
4028 Two QRegExp objects are equal if they have the same pattern
4029 strings and the same settings for case sensitivity, wildcard and
4030 minimal matching.
4031*/
4032bool QRegExp::operator==(const QRegExp &rx) const
4033{
4034 return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
4035}
4036
4037/*!
4038 \since 5.6
4039 \relates QRegExp
4040
4041 Returns the hash value for \a key, using
4042 \a seed to seed the calculation.
4043*/
4044uint qHash(const QRegExp &key, uint seed) noexcept
4045{
4046 QtPrivate::QHashCombine hash;
4047 seed = hash(seed, key.priv->engineKey);
4048 seed = hash(seed, key.priv->minimal);
4049 return seed;
4050}
4051
4052/*!
4053 \fn bool QRegExp::operator!=(const QRegExp &rx) const
4054
4055 Returns \c true if this regular expression is not equal to \a rx;
4056 otherwise returns \c false.
4057
4058 \sa operator==()
4059*/
4060
4061/*!
4062 Returns \c true if the pattern string is empty; otherwise returns
4063 false.
4064
4065 If you call exactMatch() with an empty pattern on an empty string
4066 it will return true; otherwise it returns \c false since it operates
4067 over the whole string. If you call indexIn() with an empty pattern
4068 on \e any string it will return the start offset (0 by default)
4069 because the empty pattern matches the 'emptiness' at the start of
4070 the string. In this case the length of the match returned by
4071 matchedLength() will be 0.
4072
4073 See QString::isEmpty().
4074*/
4075
4076bool QRegExp::isEmpty() const
4077{
4078 return priv->engineKey.pattern.isEmpty();
4079}
4080
4081/*!
4082 Returns \c true if the regular expression is valid; otherwise returns
4083 false. An invalid regular expression never matches.
4084
4085 The pattern \b{[a-z} is an example of an invalid pattern, since
4086 it lacks a closing square bracket.
4087
4088 Note that the validity of a regexp may also depend on the setting
4089 of the wildcard flag, for example \b{*.html} is a valid
4090 wildcard regexp but an invalid full regexp.
4091
4092 \sa errorString()
4093*/
4094bool QRegExp::isValid() const
4095{
4096 if (priv->engineKey.pattern.isEmpty()) {
4097 return true;
4098 } else {
4099 prepareEngine(priv);
4100 return priv->eng->isValid();
4101 }
4102}
4103
4104/*!
4105 Returns the pattern string of the regular expression. The pattern
4106 has either regular expression syntax or wildcard syntax, depending
4107 on patternSyntax().
4108
4109 \sa patternSyntax(), caseSensitivity()
4110*/
4111QString QRegExp::pattern() const
4112{
4113 return priv->engineKey.pattern;
4114}
4115
4116/*!
4117 Sets the pattern string to \a pattern. The case sensitivity,
4118 wildcard, and minimal matching options are not changed.
4119
4120 \sa setPatternSyntax(), setCaseSensitivity()
4121*/
4122void QRegExp::setPattern(const QString &pattern)
4123{
4124 if (priv->engineKey.pattern != pattern) {
4125 invalidateEngine(priv);
4126 priv->engineKey.pattern = pattern;
4127 }
4128}
4129
4130/*!
4131 Returns Qt::CaseSensitive if the regexp is matched case
4132 sensitively; otherwise returns Qt::CaseInsensitive.
4133
4134 \sa patternSyntax(), pattern(), isMinimal()
4135*/
4136Qt::CaseSensitivity QRegExp::caseSensitivity() const
4137{
4138 return priv->engineKey.cs;
4139}
4140
4141/*!
4142 Sets case sensitive matching to \a cs.
4143
4144 If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches
4145 \c{readme.txt} but not \c{README.TXT}.
4146
4147 \sa setPatternSyntax(), setPattern(), setMinimal()
4148*/
4149void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
4150{
4151 if ((bool)cs != (bool)priv->engineKey.cs) {
4152 invalidateEngine(priv);
4153 priv->engineKey.cs = cs;
4154 }
4155}
4156
4157/*!
4158 Returns the syntax used by the regular expression. The default is
4159 QRegExp::RegExp.
4160
4161 \sa pattern(), caseSensitivity()
4162*/
4163QRegExp::PatternSyntax QRegExp::patternSyntax() const
4164{
4165 return priv->engineKey.patternSyntax;
4166}
4167
4168/*!
4169 Sets the syntax mode for the regular expression. The default is
4170 QRegExp::RegExp.
4171
4172 Setting \a syntax to QRegExp::Wildcard enables simple shell-like
4173 \l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the
4174 string \c{readme.txt} in wildcard mode, but does not match
4175 \c{readme}.
4176
4177 Setting \a syntax to QRegExp::FixedString means that the pattern
4178 is interpreted as a plain string. Special characters (e.g.,
4179 backslash) don't need to be escaped then.
4180
4181 \sa setPattern(), setCaseSensitivity(), escape()
4182*/
4183void QRegExp::setPatternSyntax(PatternSyntax syntax)
4184{
4185 if (syntax != priv->engineKey.patternSyntax) {
4186 invalidateEngine(priv);
4187 priv->engineKey.patternSyntax = syntax;
4188 }
4189}
4190
4191/*!
4192 Returns \c true if minimal (non-greedy) matching is enabled;
4193 otherwise returns \c false.
4194
4195 \sa caseSensitivity(), setMinimal()
4196*/
4197bool QRegExp::isMinimal() const
4198{
4199 return priv->minimal;
4200}
4201
4202/*!
4203 Enables or disables minimal matching. If \a minimal is false,
4204 matching is greedy (maximal) which is the default.
4205
4206 For example, suppose we have the input string "We must be
4207 <b>bold</b>, very <b>bold</b>!" and the pattern
4208 \b{<b>.*</b>}. With the default greedy (maximal) matching,
4209 the match is "We must be \underline{<b>bold</b>, very
4210 <b>bold</b>}!". But with minimal (non-greedy) matching, the
4211 first match is: "We must be \underline{<b>bold</b>}, very
4212 <b>bold</b>!" and the second match is "We must be <b>bold</b>,
4213 very \underline{<b>bold</b>}!". In practice we might use the pattern
4214 \b{<b>[^<]*\</b>} instead, although this will still fail for
4215 nested tags.
4216
4217 \sa setCaseSensitivity()
4218*/
4219void QRegExp::setMinimal(bool minimal)
4220{
4221 priv->minimal = minimal;
4222}
4223
4224// ### Qt 5: make non-const
4225/*!
4226 Returns \c true if \a str is matched exactly by this regular
4227 expression; otherwise returns \c false. You can determine how much of
4228 the string was matched by calling matchedLength().
4229
4230 For a given regexp string R, exactMatch("R") is the equivalent of
4231 indexIn("^R$") since exactMatch() effectively encloses the regexp
4232 in the start of string and end of string anchors, except that it
4233 sets matchedLength() differently.
4234
4235 For example, if the regular expression is \b{blue}, then
4236 exactMatch() returns \c true only for input \c blue. For inputs \c
4237 bluebell, \c blutak and \c lightblue, exactMatch() returns \c false
4238 and matchedLength() will return 4, 3 and 0 respectively.
4239
4240 Although const, this function sets matchedLength(),
4241 capturedTexts(), and pos().
4242
4243 \sa indexIn(), lastIndexIn()
4244*/
4245bool QRegExp::exactMatch(const QString &str) const
4246{
4247 prepareEngineForMatch(priv, str);
4248 priv->matchState.match(str0: str.unicode(), len0: str.length(), pos0: 0, minimal0: priv->minimal, oneTest: true, caretIndex: 0);
4249 if (priv->matchState.captured[1] == str.length()) {
4250 return true;
4251 } else {
4252 priv->matchState.captured[0] = 0;
4253 priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;
4254 return false;
4255 }
4256}
4257
4258// ### Qt 5: make non-const
4259/*!
4260 Attempts to find a match in \a str from position \a offset (0 by
4261 default). If \a offset is -1, the search starts at the last
4262 character; if -2, at the next to last character; etc.
4263
4264 Returns the position of the first match, or -1 if there was no
4265 match.
4266
4267 The \a caretMode parameter can be used to instruct whether \b{^}
4268 should match at index 0 or at \a offset.
4269
4270 You might prefer to use QString::indexOf(), QString::contains(),
4271 or even QStringList::filter(). To replace matches use
4272 QString::replace().
4273
4274 Example:
4275 \snippet code/src_corelib_tools_qregexp.cpp 13
4276
4277 Although const, this function sets matchedLength(),
4278 capturedTexts() and pos().
4279
4280 If the QRegExp is a wildcard expression (see setPatternSyntax())
4281 and want to test a string against the whole wildcard expression,
4282 use exactMatch() instead of this function.
4283
4284 \sa lastIndexIn(), exactMatch()
4285*/
4286
4287int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4288{
4289 prepareEngineForMatch(priv, str);
4290 if (offset < 0)
4291 offset += str.length();
4292 priv->matchState.match(str0: str.unicode(), len0: str.length(), pos0: offset,
4293 minimal0: priv->minimal, oneTest: false, caretIndex: caretIndex(offset, caretMode));
4294 return priv->matchState.captured[0];
4295}
4296
4297// ### Qt 5: make non-const
4298/*!
4299 Attempts to find a match backwards in \a str from position \a
4300 offset. If \a offset is -1 (the default), the search starts at the
4301 last character; if -2, at the next to last character; etc.
4302
4303 Returns the position of the first match, or -1 if there was no
4304 match.
4305
4306 The \a caretMode parameter can be used to instruct whether \b{^}
4307 should match at index 0 or at \a offset.
4308
4309 Although const, this function sets matchedLength(),
4310 capturedTexts() and pos().
4311
4312 \warning Searching backwards is much slower than searching
4313 forwards.
4314
4315 \sa indexIn(), exactMatch()
4316*/
4317
4318int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4319{
4320 prepareEngineForMatch(priv, str);
4321 if (offset < 0)
4322 offset += str.length();
4323 if (offset < 0 || offset > str.length()) {
4324 memset(s: priv->matchState.captured, c: -1, n: priv->matchState.capturedSize*sizeof(int));
4325 return -1;
4326 }
4327
4328 while (offset >= 0) {
4329 priv->matchState.match(str0: str.unicode(), len0: str.length(), pos0: offset,
4330 minimal0: priv->minimal, oneTest: true, caretIndex: caretIndex(offset, caretMode));
4331 if (priv->matchState.captured[0] == offset)
4332 return offset;
4333 --offset;
4334 }
4335 return -1;
4336}
4337
4338/*!
4339 Returns the length of the last matched string, or -1 if there was
4340 no match.
4341
4342 \sa exactMatch(), indexIn(), lastIndexIn()
4343*/
4344int QRegExp::matchedLength() const
4345{
4346 return priv->matchState.captured[1];
4347}
4348
4349#ifndef QT_NO_REGEXP_CAPTURE
4350
4351/*!
4352 \since 4.6
4353 Returns the number of captures contained in the regular expression.
4354 */
4355int QRegExp::captureCount() const
4356{
4357 prepareEngine(priv);
4358 return priv->eng->captureCount();
4359}
4360
4361/*!
4362 Returns a list of the captured text strings.
4363
4364 The first string in the list is the entire matched string. Each
4365 subsequent list element contains a string that matched a
4366 (capturing) subexpression of the regexp.
4367
4368 For example:
4369 \snippet code/src_corelib_tools_qregexp.cpp 14
4370
4371 The above example also captures elements that may be present but
4372 which we have no interest in. This problem can be solved by using
4373 non-capturing parentheses:
4374
4375 \snippet code/src_corelib_tools_qregexp.cpp 15
4376
4377 Note that if you want to iterate over the list, you should iterate
4378 over a copy, e.g.
4379 \snippet code/src_corelib_tools_qregexp.cpp 16
4380
4381 Some regexps can match an indeterminate number of times. For
4382 example if the input string is "Offsets: 12 14 99 231 7" and the
4383 regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of
4384 all the numbers matched. However, after calling
4385 \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4386 "12"), i.e. the entire match was "12" and the first subexpression
4387 matched was "12". The correct approach is to use cap() in a
4388 \l{QRegExp#cap_in_a_loop}{loop}.
4389
4390 The order of elements in the string list is as follows. The first
4391 element is the entire matching string. Each subsequent element
4392 corresponds to the next capturing open left parentheses. Thus
4393 capturedTexts()[1] is the text of the first capturing parentheses,
4394 capturedTexts()[2] is the text of the second and so on
4395 (corresponding to $1, $2, etc., in some other regexp languages).
4396
4397 \sa cap(), pos()
4398*/
4399QStringList QRegExp::capturedTexts() const
4400{
4401 if (priv->capturedCache.isEmpty()) {
4402 prepareEngine(priv);
4403 const int *captured = priv->matchState.captured;
4404 int n = priv->matchState.capturedSize;
4405
4406 for (int i = 0; i < n; i += 2) {
4407 QString m;
4408 if (captured[i + 1] == 0)
4409 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty
4410 else if (captured[i] >= 0)
4411 m = priv->t.mid(position: captured[i], n: captured[i + 1]);
4412 priv->capturedCache.append(t: m);
4413 }
4414 priv->t.clear();
4415 }
4416 return priv->capturedCache;
4417}
4418
4419/*!
4420 \internal
4421*/
4422QStringList QRegExp::capturedTexts()
4423{
4424 return const_cast<const QRegExp *>(this)->capturedTexts();
4425}
4426
4427/*!
4428 Returns the text captured by the \a nth subexpression. The entire
4429 match has index 0 and the parenthesized subexpressions have
4430 indexes starting from 1 (excluding non-capturing parentheses).
4431
4432 \snippet code/src_corelib_tools_qregexp.cpp 17
4433
4434 The order of elements matched by cap() is as follows. The first
4435 element, cap(0), is the entire matching string. Each subsequent
4436 element corresponds to the next capturing open left parentheses.
4437 Thus cap(1) is the text of the first capturing parentheses, cap(2)
4438 is the text of the second, and so on.
4439
4440 \sa capturedTexts(), pos()
4441*/
4442QString QRegExp::cap(int nth) const
4443{
4444 return capturedTexts().value(i: nth);
4445}
4446
4447/*!
4448 \internal
4449*/
4450QString QRegExp::cap(int nth)
4451{
4452 return const_cast<const QRegExp *>(this)->cap(nth);
4453}
4454
4455/*!
4456 Returns the position of the \a nth captured text in the searched
4457 string. If \a nth is 0 (the default), pos() returns the position
4458 of the whole match.
4459
4460 Example:
4461 \snippet code/src_corelib_tools_qregexp.cpp 18
4462
4463 For zero-length matches, pos() always returns -1. (For example, if
4464 cap(4) would return an empty string, pos(4) returns -1.) This is
4465 a feature of the implementation.
4466
4467 \sa cap(), capturedTexts()
4468*/
4469int QRegExp::pos(int nth) const
4470{
4471 if (nth < 0 || nth >= priv->matchState.capturedSize / 2)
4472 return -1;
4473 else
4474 return priv->matchState.captured[2 * nth];
4475}
4476
4477/*!
4478 \internal
4479*/
4480int QRegExp::pos(int nth)
4481{
4482 return const_cast<const QRegExp *>(this)->pos(nth);
4483}
4484
4485/*!
4486 Returns a text string that explains why a regexp pattern is
4487 invalid the case being; otherwise returns "no error occurred".
4488
4489 \sa isValid()
4490*/
4491QString QRegExp::errorString() const
4492{
4493 if (isValid()) {
4494 return QString::fromLatin1(RXERR_OK);
4495 } else {
4496 return priv->eng->errorString();
4497 }
4498}
4499
4500/*!
4501 \internal
4502*/
4503QString QRegExp::errorString()
4504{
4505 return const_cast<const QRegExp *>(this)->errorString();
4506}
4507#endif
4508
4509/*!
4510 Returns the string \a str with every regexp special character
4511 escaped with a backslash. The special characters are $, (,), *, +,
4512 ., ?, [, \,], ^, {, | and }.
4513
4514 Example:
4515
4516 \snippet code/src_corelib_tools_qregexp.cpp 19
4517
4518 This function is useful to construct regexp patterns dynamically:
4519
4520 \snippet code/src_corelib_tools_qregexp.cpp 20
4521
4522 \sa setPatternSyntax()
4523*/
4524QString QRegExp::escape(const QString &str)
4525{
4526 QString quoted;
4527 const int count = str.count();
4528 quoted.reserve(asize: count * 2);
4529 const QLatin1Char backslash('\\');
4530 for (int i = 0; i < count; i++) {
4531 switch (str.at(i).toLatin1()) {
4532 case '$':
4533 case '(':
4534 case ')':
4535 case '*':
4536 case '+':
4537 case '.':
4538 case '?':
4539 case '[':
4540 case '\\':
4541 case ']':
4542 case '^':
4543 case '{':
4544 case '|':
4545 case '}':
4546 quoted.append(c: backslash);
4547 }
4548 quoted.append(c: str.at(i));
4549 }
4550 return quoted;
4551}
4552
4553
4554#ifndef QT_NO_DATASTREAM
4555/*!
4556 \relates QRegExp
4557
4558 Writes the regular expression \a regExp to stream \a out.
4559
4560 \sa {Serializing Qt Data Types}
4561*/
4562QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
4563{
4564 return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4565 << (quint8)regExp.patternSyntax()
4566 << (quint8)!!regExp.isMinimal();
4567}
4568
4569/*!
4570 \relates QRegExp
4571
4572 Reads a regular expression from stream \a in into \a regExp.
4573
4574 \sa {Serializing Qt Data Types}
4575*/
4576QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
4577{
4578 QString pattern;
4579 quint8 cs;
4580 quint8 patternSyntax;
4581 quint8 isMinimal;
4582
4583 in >> pattern >> cs >> patternSyntax >> isMinimal;
4584
4585 QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4586 QRegExp::PatternSyntax(patternSyntax));
4587
4588 newRegExp.setMinimal(isMinimal);
4589 regExp = newRegExp;
4590 return in;
4591}
4592#endif // QT_NO_DATASTREAM
4593
4594#ifndef QT_NO_DEBUG_STREAM
4595QDebug operator<<(QDebug dbg, const QRegExp &r)
4596{
4597 QDebugStateSaver saver(dbg);
4598 dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax()
4599 << ", pattern='"<< r.pattern() << "')";
4600 return dbg;
4601}
4602#endif
4603
4604QT_END_NAMESPACE
4605

source code of qtbase/src/corelib/text/qregexp.cpp