qregexp.cpp source code [qtbase/src/corelib/text/qregexp.cpp]

1	/****************************************************************************
2	**
3	** Copyright (C) 2016 The Qt Company Ltd.
4	** Contact: https://www.qt.io/licensing/
5	**
6	** This file is part of the QtCore module of the Qt Toolkit.
7	**
8	** $QT_BEGIN_LICENSE:LGPL$
9	** Commercial License Usage
10	** Licensees holding valid commercial Qt licenses may use this file in
11	** accordance with the commercial license agreement provided with the
12	** Software or, alternatively, in accordance with the terms contained in
13	** a written agreement between you and The Qt Company. For licensing terms
14	** and conditions see https://www.qt.io/terms-conditions. For further
15	** information use the contact form at https://www.qt.io/contact-us.
16	**
17	** GNU Lesser General Public License Usage
18	** Alternatively, this file may be used under the terms of the GNU Lesser
19	** General Public License version 3 as published by the Free Software
20	** Foundation and appearing in the file LICENSE.LGPL3 included in the
21	** packaging of this file. Please review the following information to
22	** ensure the GNU Lesser General Public License version 3 requirements
23	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24	**
25	** GNU General Public License Usage
26	** Alternatively, this file may be used under the terms of the GNU
27	** General Public License version 2.0 or (at your option) the GNU General
28	** Public license version 3 or any later version approved by the KDE Free
29	** Qt Foundation. The licenses are as published by the Free Software
30	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31	** included in the packaging of this file. Please review the following
32	** information to ensure the GNU General Public License requirements will
33	** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34	** https://www.gnu.org/licenses/gpl-3.0.html.
35	**
36	** $QT_END_LICENSE$
37	**
38	****************************************************************************/
39
40	#include "qregexp.h"
41
42	#include "qalgorithms.h"
43	#include "qbitarray.h"
44	#include "qcache.h"
45	#include "qdatastream.h"
46	#include "qdebug.h"
47	#include "qhashfunctions.h"
48	#include "qlist.h"
49	#include "qmap.h"
50	#include "qmutex.h"
51	#include "qstring.h"
52	#include "qstringlist.h"
53	#include "qstringmatcher.h"
54	#include "qvector.h"
55	#include "private/qlocking_p.h"
56
57	#include <limits.h>
58	#include <algorithm>
59
60	QT_BEGIN_NAMESPACE
61
62	// Error strings for the regexp parser. Each RXERR_* message is wrapped in QT_TRANSLATE_NOOP with the "QRegExp" context (presumably so it can be translated when it is reported -- confirm against the code that uses these macros).
63	#define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
64	#define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
65	#define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
66	#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
67	#define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
68	#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
69	#define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
70	#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
71	#define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
72	#define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
73	#define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
74	#define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")
75
76	/*!
77	\class QRegExp
78	\inmodule QtCore
79	\reentrant
80	\brief The QRegExp class provides pattern matching using regular expressions.
81
82	\ingroup tools
83	\ingroup shared
84
85	\keyword regular expression
86
87	A regular expression, or "regexp", is a pattern for matching
88	substrings in a text. This is useful in many contexts, e.g.,
89
90	\table
91	\row \li Validation
92	\li A regexp can test whether a substring meets some criteria,
93	e.g. is an integer or contains no whitespace.
94	\row \li Searching
95	\li A regexp provides more powerful pattern matching than
96	simple substring matching, e.g., match one of the words
97	\e{mail}, \e{letter} or \e{correspondence}, but none of the
98	words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
99	\row \li Search and Replace
100	\li A regexp can replace all occurrences of a substring with a
101	different substring, e.g., replace all occurrences of \e{&}
102	with \e{\&amp;} except where the \e{&} is already followed by
103	an \e{amp;}.
104	\row \li String Splitting
105	\li A regexp can be used to identify where a string should be
106	split apart, e.g. splitting tab-delimited strings.
107	\endtable
108
109	A brief introduction to regexps is presented, a description of
110	Qt's regexp language, some examples, and the function
111	documentation itself. QRegExp is modeled on Perl's regexp
112	language. It fully supports Unicode. QRegExp can also be used in a
113	simpler, \e{wildcard mode} that is similar to the functionality
114	found in command shells. The syntax rules used by QRegExp can be
115	changed with setPatternSyntax(). In particular, the pattern syntax
116	can be set to QRegExp::FixedString, which means the pattern to be
117	matched is interpreted as a plain string, i.e., special characters
118	(e.g., backslash) are not escaped.
119
120	A good text on regexps is \e {Mastering Regular Expressions}
121	(Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.
122
123	\note In Qt 5, the new QRegularExpression class provides a Perl
124	compatible implementation of regular expressions and is recommended
125	in place of QRegExp.
126
127	\tableofcontents
128
129	\section1 Introduction
130
131	Regexps are built up from expressions, quantifiers, and
132	assertions. The simplest expression is a character, e.g. \b{x}
133	or \b{5}. An expression can also be a set of characters
134	enclosed in square brackets. \b{[ABCD]} will match an \b{A}
135	or a \b{B} or a \b{C} or a \b{D}. We can write this same
136	expression as \b{[A-D]}, and an expression to match any
137	capital letter in the English alphabet is written as
138	\b{[A-Z]}.
139
140	A quantifier specifies the number of occurrences of an expression
141	that must be matched. \b{x{1,1}} means match one and only one
142	\b{x}. \b{x{1,5}} means match a sequence of \b{x}
143	characters that contains at least one \b{x} but no more than
144	five.
145
146	Note that in general regexps cannot be used to check for balanced
147	brackets or tags. For example, a regexp can be written to match an
148	opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
149	are not nested, but if the \c{<b>} tags are nested, that same
150	regexp will match an opening \c{<b>} tag with the wrong closing
151	\c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the
152	first \c{<b>} would be matched with the first \c{</b>}, which is
153	not correct. However, it is possible to write a regexp that will
154	match nested brackets or tags correctly, but only if the number of
155	nesting levels is fixed and known. If the number of nesting levels
156	is not fixed and known, it is impossible to write a regexp that
157	will not fail.
158
159	Suppose we want a regexp to match integers in the range 0 to 99.
160	At least one digit is required, so we start with the expression
161	\b{[0-9]{1,1}}, which matches a single digit exactly once. This
162	regexp matches integers in the range 0 to 9. To match integers up
163	to 99, increase the maximum number of occurrences to 2, so the
164	regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the
165	original requirement to match integers from 0 to 99, but it will
166	also match integers that occur in the middle of strings. If we
167	want the matched integer to be the whole string, we must use the
168	anchor assertions, \b{^} (caret) and \b{$} (dollar). When
169	\b{^} is the first character in a regexp, it means the regexp
170	must match from the beginning of the string. When \b{$} is the
171	last character of the regexp, it means the regexp must match to
172	the end of the string. The regexp becomes \b{^[0-9]{1,2}$}.
173	Note that assertions, e.g. \b{^} and \b{$}, do not match
174	characters but locations in the string.
175
176	If you have seen regexps described elsewhere, they may have looked
177	different from the ones shown here. This is because some sets of
178	characters and some quantifiers are so common that they have been
179	given special symbols to represent them. \b{[0-9]} can be
180	replaced with the symbol \b{\\d}. The quantifier to match
181	exactly one occurrence, \b{{1,1}}, can be replaced with the
182	expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So
183	our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can
184	also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of
185	the string, match a digit, followed immediately by 0 or 1 digits}.
186	In practice, it would be written as \b{^\\d\\d?$}. The \b{?}
187	is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1
188	occurrences. \b{?} makes an expression optional. The regexp
189	\b{^\\d\\d?$} means \e{From the beginning of the string, match
190	one digit, followed immediately by 0 or 1 more digit, followed
191	immediately by end of string}.
192
193	To write a regexp that matches one of the words 'mail' \e or
194	'letter' \e or 'correspondence' but does not match words that
195	contain these words, e.g., 'email', 'mailman', 'mailer', and
196	'letterbox', start with a regexp that matches 'mail'. Expressed
197	fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
198	a character expression is automatically quantified by
199	\b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an
200	'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
201	we can use the vertical bar \b{|}, which means \b{or}, to
202	include the other two words, so our regexp for matching any of the
203	three words becomes \b{mail|letter|correspondence}. Match
204	'mail' \b{or} 'letter' \b{or} 'correspondence'. While this
205	regexp will match one of the three words we want to match, it will
206	also match words we don't want to match, e.g., 'email'. To
207	prevent the regexp from matching unwanted words, we must tell it
208	to begin and end the match at word boundaries. First we enclose
209	our regexp in parentheses, \b{(mail|letter|correspondence)}.
210	Parentheses group expressions together, and they identify a part
211	of the regexp that we wish to \l{capturing text}{capture}.
212	Enclosing the expression in parentheses allows us to use it as a
213	component in more complex regexps. It also allows us to examine
214	which of the three words was actually matched. To force the match
215	to begin and end on word boundaries, we enclose the regexp in
216	\b{\\b} \e{word boundary} assertions:
217	\b{\\b(mail|letter|correspondence)\\b}. Now the regexp means:
218	\e{Match a word boundary, followed by the regexp in parentheses,
219	followed by a word boundary}. The \b{\\b} assertion matches a
220	\e position in the regexp, not a \e character. A word boundary is
221	any non-word character, e.g., a space, newline, or the beginning
222	or ending of a string.
223
224	If we want to replace ampersand characters with the HTML entity
225	\b{\&amp;}, the regexp to match is simply \b{\&}. But this
226	regexp will also match ampersands that have already been converted
227	to HTML entities. We want to replace only ampersands that are not
228	already followed by \b{amp;}. For this, we need the negative
229	lookahead assertion, \b{(?!}__\b{)}. The regexp can then be
230	written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
231	\b{not} \e{followed by} \b{amp;}.
232
233	If we want to count all the occurrences of 'Eric' and 'Eirik' in a
234	string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and
235	\b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
236	required to avoid matching words that contain either name,
237	e.g. 'Ericsson'. Note that the second regexp matches more
238	spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
239
240	Some of the examples discussed above are implemented in the
241	\l{#code-examples}{code examples} section.
242
243	\target characters-and-abbreviations-for-sets-of-characters
244	\section1 Characters and Abbreviations for Sets of Characters
245
246	\table
247	\header \li Element \li Meaning
248	\row \li \b{c}
249	\li A character represents itself unless it has a special
250	regexp meaning. e.g. \b{c} matches the character \e c.
251	\row \li \b{\\c}
252	\li A character that follows a backslash matches the character
253	itself, except as specified below. e.g., To match a literal
254	caret at the beginning of a string, write \b{\\^}.
255	\row \li \b{\\a}
256	\li Matches the ASCII bell (BEL, 0x07).
257	\row \li \b{\\f}
258	\li Matches the ASCII form feed (FF, 0x0C).
259	\row \li \b{\\n}
260	\li Matches the ASCII line feed (LF, 0x0A, Unix newline).
261	\row \li \b{\\r}
262	\li Matches the ASCII carriage return (CR, 0x0D).
263	\row \li \b{\\t}
264	\li Matches the ASCII horizontal tab (HT, 0x09).
265	\row \li \b{\\v}
266	\li Matches the ASCII vertical tab (VT, 0x0B).
267	\row \li \b{\\x\e{hhhh}}
268	\li Matches the Unicode character corresponding to the
269	hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
270	\row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo})
271	\li matches the ASCII/Latin1 character for the octal number
272	\e{ooo} (between 0 and 0377).
273	\row \li \b{. (dot)}
274	\li Matches any character (including newline).
275	\row \li \b{\\d}
276	\li Matches a digit (QChar::isDigit()).
277	\row \li \b{\\D}
278	\li Matches a non-digit.
279	\row \li \b{\\s}
280	\li Matches a whitespace character (QChar::isSpace()).
281	\row \li \b{\\S}
282	\li Matches a non-whitespace character.
283	\row \li \b{\\w}
284	\li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
285	\row \li \b{\\W}
286	\li Matches a non-word character.
287	\row \li \b{\\\e{n}}
288	\li The \e{n}-th backreference, e.g. \\1, \\2, etc.
289	\endtable
290
291	\b{Note:} The C++ compiler transforms backslashes in strings.
292	To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}.
293	To match the backslash character itself, enter it four times, i.e.
294	\c{\\\\}.
295
296	\target sets-of-characters
297	\section1 Sets of Characters
298
299	Square brackets mean match any character contained in the square
300	brackets. The character set abbreviations described above can
301	appear in a character set in square brackets. Except for the
302	character set abbreviations and the following two exceptions,
303	characters do not have special meanings in square brackets.
304
305	\table
306	\row \li \b{^}
307
308	\li The caret negates the character set if it occurs as the
309	first character (i.e. immediately after the opening square
310	bracket). \b{[abc]} matches 'a' or 'b' or 'c', but
311	\b{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
312
313	\row \li \b{-}
314
315	\li The dash indicates a range of characters. \b{[W-Z]}
316	matches 'W' or 'X' or 'Y' or 'Z'.
317
318	\endtable
319
320	Using the predefined character set abbreviations is more portable
321	than using character ranges across platforms and languages. For
322	example, \b{[0-9]} matches a digit in Western alphabets but
323	\b{\\d} matches a digit in \e any alphabet.
324
325	Note: In other regexp documentation, sets of characters are often
326	called "character classes".
327
328	\target quantifiers
329	\section1 Quantifiers
330
331	By default, an expression is automatically quantified by
332	\b{{1,1}}, i.e. it should occur exactly once. In the following
333	list, \b{\e {E}} stands for expression. An expression is a
334	character, or an abbreviation for a set of characters, or a set of
335	characters in square brackets, or an expression in parentheses.
336
337	\table
338	\row \li \b{\e {E}?}
339
340	\li Matches zero or one occurrences of \e E. This quantifier
341	means \e{The previous expression is optional}, because it
342	will match whether or not the expression is found. \b{\e
343	{E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?}
344	matches 'dent' or 'dents'.
345
346	\row \li \b{\e {E}+}
347
348	\li Matches one or more occurrences of \e E. \b{\e {E}+} is
349	the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0',
350	'00', '000', etc.
351
352	\row \li \b{\e {E}*}
353
354	\li Matches zero or more occurrences of \e E. It is the same
355	as \b{\e {E}{0,}}. The \b{*} quantifier is often used
356	in error where \b{+} should be used. For example, if
357	\b{\\s*$} is used in an expression to match strings that
358	end in whitespace, it will match every string because
359	\b{\\s*$} means \e{Match zero or more whitespaces followed
360	by end of string}. The correct regexp to match strings that
361	have at least one trailing whitespace character is
362	\b{\\s+$}.
363
364	\row \li \b{\e {E}{n}}
365
366	\li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}}
367	is the same as repeating \e E \e n times. For example,
368	\b{x{5}} is the same as \b{xxxxx}. It is also the same
369	as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}.
370
371	\row \li \b{\e {E}{n,}}
372	\li Matches at least \e n occurrences of \e E.
373
374	\row \li \b{\e {E}{,m}}
375	\li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}}
376	is the same as \b{\e {E}{0,m}}.
377
378	\row \li \b{\e {E}{n,m}}
379	\li Matches at least \e n and at most \e m occurrences of \e E.
380	\endtable
381
382	To apply a quantifier to more than just the preceding character,
383	use parentheses to group characters together in an expression. For
384	example, \b{tag+} matches a 't' followed by an 'a' followed by
385	at least one 'g', whereas \b{(tag)+} matches at least one
386	occurrence of 'tag'.
387
388	Note: Quantifiers are normally "greedy". They always match as much
389	text as they can. For example, \b{0+} matches the first zero it
390	finds and all the consecutive zeros after the first zero. Applied
391	to '20005', it matches '2\underline{000}5'. Quantifiers can be made
392	non-greedy, see setMinimal().
393
394	\target capturing parentheses
395	\target backreferences
396	\section1 Capturing Text
397
398	Parentheses allow us to group elements together so that we can
399	quantify and capture them. For example if we have the expression
400	\b{mail|letter|correspondence} that matches a string we know
401	that \e one of the words matched but not which one. Using
402	parentheses allows us to "capture" whatever is matched within
403	their bounds, so if we used \b{(mail|letter|correspondence)}
404	and matched this regexp against the string "I sent you some email"
405	we can use the cap() or capturedTexts() functions to extract the
406	matched characters, in this case 'mail'.
407
408	We can use captured text within the regexp itself. To refer to the
409	captured text we use \e backreferences which are indexed from 1,
410	the same as for cap(). For example we could search for duplicate
411	words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a
412	word boundary followed by one or more word characters followed by
413	one or more non-word characters followed by the same text as the
414	first parenthesized expression followed by a word boundary.
415
416	If we want to use parentheses purely for grouping and not for
417	capturing we can use the non-capturing syntax, e.g.
418	\b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
419	end ')'. In this example we match either 'green' or 'blue' but we
420	do not capture the match so we only know whether or not we matched
421	but not which color we actually found. Using non-capturing
422	parentheses is more efficient than using capturing parentheses
423	since the regexp engine has to do less book-keeping.
424
425	Both capturing and non-capturing parentheses may be nested.
426
427	\target greedy quantifiers
428
429	For historical reasons, quantifiers (e.g. \b{*}) that apply to
430	capturing parentheses are more "greedy" than other quantifiers.
431	For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa".
432	This behavior is different from what other regexp engines do
433	(notably, Perl). To obtain a more intuitive capturing behavior,
434	specify QRegExp::RegExp2 to the QRegExp constructor or call
435	setPatternSyntax(QRegExp::RegExp2).
436
437	\target cap_in_a_loop
438
439	When the number of matches cannot be determined in advance, a
440	common idiom is to use cap() in a loop. For example:
441
442	\snippet code/src_corelib_tools_qregexp.cpp 0
443
444	\target assertions
445	\section1 Assertions
446
447	Assertions make some statement about the text at the point where
448	they occur in the regexp but they do not match any characters. In
449	the following list \b{\e {E}} stands for any expression.
450
451	\table
452	\row \li \b{^}
453	\li The caret signifies the beginning of the string. If you
454	wish to match a literal \c{^} you must escape it by
455	writing \c{\\^}. For example, \b{^#include} will only
456	match strings which \e begin with the characters '#include'.
457	(When the caret is the first character of a character set it
458	has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.)
459
460	\row \li \b{$}
461	\li The dollar signifies the end of the string. For example
462	\b{\\d\\s*$} will match strings which end with a digit
463	optionally followed by whitespace. If you wish to match a
464	literal \c{$} you must escape it by writing
465	\c{\\$}.
466
467	\row \li \b{\\b}
468	\li A word boundary. For example the regexp
469	\b{\\bOK\\b} means match immediately after a word
470	boundary (e.g. start of string or whitespace) the letter 'O'
471	then the letter 'K' immediately before another word boundary
472	(e.g. end of string or whitespace). But note that the
473	assertion does not actually match any whitespace so if we
474	write \b{(\\bOK\\b)} and we have a match it will only
475	contain 'OK' even if the string is "It's \underline{OK} now".
476
477	\row \li \b{\\B}
478	\li A non-word boundary. This assertion is true wherever
479	\b{\\b} is false. For example if we searched for
480	\b{\\Bon\\B} in "Left on" the match would fail (space
481	and end of string aren't non-word boundaries), but it would
482	match in "t\underline{on}ne".
483
484	\row \li \b{(?=\e E)}
485	\li Positive lookahead. This assertion is true if the
486	expression matches at this point in the regexp. For example,
487	\b{const(?=\\s+char)} matches 'const' whenever it is
488	followed by 'char', as in 'static \underline{const} char *'.
489	(Compare with \b{const\\s+char}, which matches 'static
490	\underline{const char} *'.)
491
492	\row \li \b{(?!\e E)}
493	\li Negative lookahead. This assertion is true if the
494	expression does not match at this point in the regexp. For
495	example, \b{const(?!\\s+char)} matches 'const' \e except
496	when it is followed by 'char'.
497	\endtable
498
499	\target QRegExp wildcard matching
500	\section1 Wildcard Matching
501
502	Most command shells such as \e bash or \e cmd.exe support "file
503	globbing", the ability to identify a group of files by using
504	wildcards. The setPatternSyntax() function is used to switch
505	between regexp and wildcard mode. Wildcard matching is much
506	simpler than full regexps and has only four features:
507
508	\table
509	\row \li \b{c}
510	\li Any character represents itself apart from those mentioned
511	below. Thus \b{c} matches the character \e c.
512	\row \li \b{?}
513	\li Matches any single character. It is the same as
514	\b{.} in full regexps.
515	\row \li \b{*}
516	\li Matches zero or more of any characters. It is the
517	same as \b{.*} in full regexps.
518	\row \li \b{[...]}
519	\li Sets of characters can be represented in square brackets,
520	similar to full regexps. Within the character class, like
521	outside, backslash has no special meaning.
522	\endtable
523
524	In the mode Wildcard, the wildcard characters cannot be
525	escaped. In the mode WildcardUnix, the character '\\' escapes the
526	wildcard.
527
528	For example if we are in wildcard mode and have strings which
529	contain filenames we could identify HTML files with \b{*.html}.
530	This will match zero or more characters followed by a dot followed
531	by 'h', 't', 'm' and 'l'.
532
533	To test a string against a wildcard expression, use exactMatch().
534	For example:
535
536	\snippet code/src_corelib_tools_qregexp.cpp 1
537
538	\target perl-users
539	\section1 Notes for Perl Users
540
541	Most of the character class abbreviations supported by Perl are
542	supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters}
543	{characters and abbreviations for sets of characters}.
544
545	In QRegExp, apart from within character classes, \c{^} always
546	signifies the start of the string, so carets must always be
547	escaped unless used for that purpose. In Perl the meaning of caret
548	varies automagically depending on where it occurs so escaping it
549	is rarely necessary. The same applies to \c{$} which in
550	QRegExp always signifies the end of the string.
551
552	QRegExp's quantifiers are the same as Perl's greedy quantifiers
553	(but see the \l{greedy quantifiers}{note above}). Non-greedy
554	matching cannot be applied to individual quantifiers, but can be
555	applied to all the quantifiers in the pattern. For example, to
556	match the Perl regexp \b{ro+?m} requires:
557
558	\snippet code/src_corelib_tools_qregexp.cpp 2
559
560	The equivalent of Perl's \c{/i} option is
561	setCaseSensitivity(Qt::CaseInsensitive).
562
563	Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
564
565	In QRegExp \b{.} matches any character, therefore all QRegExp
566	regexps have the equivalent of Perl's \c{/s} option. QRegExp
567	does not have an equivalent to Perl's \c{/m} option, but this
568	can be emulated in various ways for example by splitting the input
569	into lines or by looping with a regexp that searches for newlines.
570
571	Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
572	assertions. The \\G assertion is not supported but can be emulated
573	in a loop.
574
575	Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
576	equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
577	... correspond to cap(1) or capturedTexts()[1], cap(2) or
578	capturedTexts()[2], etc.
579
580	To substitute a pattern use QString::replace().
581
582	Perl's extended \c{/x} syntax is not supported, nor are
583	directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
584	the other hand, C++'s rules for literal strings can be used to
585	achieve the same:
586
587	\snippet code/src_corelib_tools_qregexp.cpp 3
588
589	Both zero-width positive and zero-width negative lookahead
590	assertions (?=pattern) and (?!pattern) are supported with the same
591	syntax as Perl. Perl's lookbehind assertions, "independent"
592	subexpressions and conditional expressions are not supported.
593
594	Non-capturing parentheses are also supported, with the same
595	(?:pattern) syntax.
596
597	See QString::split() and QStringList::join() for equivalents
598	to Perl's split and join functions.
599
600	Note: because C++ transforms \\'s they must be written \e twice in
601	code, e.g. \b{\\b} must be written \b{\\\\b}.
602
603	\target code-examples
604	\section1 Code Examples
605
606	\snippet code/src_corelib_tools_qregexp.cpp 4
607
608	The third string matches '\underline{6}'. This is a simple validation
609	regexp for integers in the range 0 to 99.
610
611	\snippet code/src_corelib_tools_qregexp.cpp 5
612
613	The second string matches '\underline{This_is-OK}'. We've used the
614	character set abbreviation '\\S' (non-whitespace) and the anchors
615	to match strings which contain no whitespace.
616
617	In the following example we match strings containing 'mail' or
618	'letter' or 'correspondence' but only match whole words i.e. not
619	'email'
620
621	\snippet code/src_corelib_tools_qregexp.cpp 6
622
623	The second string matches "Please write the \underline{letter}". The
624	word 'letter' is also captured (because of the parentheses). We
625	can see what text we've captured like this:
626
627	\snippet code/src_corelib_tools_qregexp.cpp 7
628
629	This will capture the text from the first set of capturing
630	parentheses (counting capturing left parentheses from left to
631	right). The parentheses are counted from 1 since cap(0) is the
632	whole matched regexp (equivalent to '&' in most regexp engines).
633
634	\snippet code/src_corelib_tools_qregexp.cpp 8
635
636	Here we've passed the QRegExp to QString's replace() function to
637	replace the matched text with new text.
638
639	\snippet code/src_corelib_tools_qregexp.cpp 9
640
641	We've used the indexIn() function to repeatedly match the regexp in
642	the string. Note that instead of moving forward by one character
643	at a time \c pos++ we could have written \c {pos +=
644	rx.matchedLength()} to skip over the already matched string. The
645	count will equal 3, matching 'One \underline{Eric} another
646	\underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
647	doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
648	by non-word boundaries.
649
650	One common use of regexps is to split lines of delimited data into
651	their component fields.
652
653	\snippet code/src_corelib_tools_qregexp.cpp 10
654
655	In this example our input lines have the format company name, web
656	address and country. Unfortunately the regexp is rather long and
657	not very versatile -- the code will break if we add any more
658	fields. A simpler and better solution is to look for the
659	separator, '\\t' in this case, and take the surrounding text. The
660	QString::split() function can take a separator string or regexp
661	as an argument and split a string accordingly.
662
663	\snippet code/src_corelib_tools_qregexp.cpp 11
664
665	Here field[0] is the company, field[1] the web address and so on.
666
667	To imitate the matching of a shell we can use wildcard mode.
668
669	\snippet code/src_corelib_tools_qregexp.cpp 12
670
671	Wildcard matching can be convenient because of its simplicity, but
672	any wildcard regexp can be defined using full regexps, e.g.
673	\b{.*\\.html$}. Notice that we can't match both \c .html and \c
674	.htm files with a wildcard unless we use \b{*.htm*} which will
675	also match 'test.html.bak'. A full regexp gives us the precision
676	we need, \b{.*\\.html?$}.
677
678	QRegExp can match case insensitively using setCaseSensitivity(),
679	and can use non-greedy matching, see setMinimal(). By
680	default QRegExp uses full regexps but this can be changed with
681	setPatternSyntax(). Searching can be done forward with indexIn() or backward
682	with lastIndexIn(). Captured text can be accessed using
683	capturedTexts() which returns a string list of all captured
684	strings, or using cap() which returns the captured string for the
685	given index. The pos() function takes a match index and returns
686	the position in the string where the match was made (or -1 if
687	there was no match).
688
689	\sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
690	{tools/regexp}{Regular Expression Example}
691	*/
692
// If a macro named EOS is already defined on VxWorks, remove it so that the
// EOS constant declared below can use that name.
#if defined(Q_OS_VXWORKS) && defined(EOS)
#  undef EOS
#endif

// BadChar() folds a character's code unit into the range [0, NumBadChars).
// NOTE(review): presumably indexes per-character "bad character" tables used
// elsewhere in this file -- confirm against the users of BadChar().
const int NumBadChars = 64;
#define BadChar(ch) ((ch).unicode() % NumBadChars)

// Sentinel values. Their names suggest "no occurrence", "empty capture",
// "unbounded length" and "unbounded repetition count"; the exact meaning is
// defined by the code that uses them (not visible here).
const int NoOccurrence = INT_MAX;
const int EmptyCapture = INT_MAX;
const int InftyLen = INT_MAX;
const int InftyRep = 1025;
// NOTE(review): presumably an "end of string" marker -- confirm at use sites.
const int EOS = -1;
705
706	static bool isWord(QChar ch)
707	{
708	return ch.isLetterOrNumber() \|\| ch.isMark() \|\| ch == QLatin1Char (`'_'`);
709	}
710
711	/*
712	Merges two vectors of ints and puts the result into the first
713	one.
714	*/
715	static void mergeInto(QVector<int> a, const* QVector<int> &b)
716	{
717	int asize = a->size();
718	int bsize = b.size();
719	if (asize == `0`) {
720	*a = b;
721	#ifndef QT_NO_REGEXP_OPTIM
722	} else if (bsize == `1` && a->at(i: asize - `1`) < b.at(i: `0`)) {
723	a->resize(asize: asize + `1`);
724	(*a)[asize] = b.at(i: `0`);
725	#endif
726	} else if (bsize >= `1`) {
727	int csize = asize + bsize;
728	QVector<int> c(csize);
729	int i = `0`, j = `0`, k = `0`;
730	while (i < asize) {
731	if (j < bsize) {
732	if (a->at(i) == b.at(i: j)) {
733	++i;
734	--csize;
735	} else if (a->at(i) < b.at(i: j)) {
736	c [k++] = a->at(i: i++);
737	} else {
738	c [k++] = b.at(i: j++);
739	}
740	} else {
741	memcpy(dest: c.data() + k, src: a->constData() + i, n: (asize - i) * sizeof(int));
742	break;
743	}
744	}
745	c.resize(asize: csize);
746	if (j < bsize)
747	memcpy(dest: c.data() + k, src: b.constData() + j, n: (bsize - j) * sizeof(int));
748	*a = c;
749	}
750	}
751
752	#ifndef QT_NO_REGEXP_WILDCARD
753	/*
754	Translates a wildcard pattern to an equivalent regular expression
755	pattern (e.g., .cpp to .\.cpp).
756
757	If enableEscaping is true, it is possible to escape the wildcard
758	characters with \
759	*/
760	static QString wc2rx(const QString &wc_str, const bool enableEscaping)
761	{
762	const int wclen = wc_str.length();
763	QString rx;
764	int i = `0`;
765	bool isEscaping = false; // the previous character is '\'
766	const QChar *wc = wc_str.unicode();
767
768	while (i < wclen) {
769	const QChar c = wc[i++];
770	switch (c.unicode()) {
771	case `'\\'`:
772	if (enableEscaping) {
773	if (isEscaping) {
774	rx += QLatin1String ("\\\\");
775	} // we insert the \\ later if necessary
776	if (i == wclen) { // the end
777	rx += QLatin1String ("\\\\");
778	}
779	} else {
780	rx += QLatin1String ("\\\\");
781	}
782	isEscaping = true;
783	break;
784	case `'*'`:
785	if (isEscaping) {
786	rx += QLatin1String ("\\*");
787	isEscaping = false;
788	} else {
789	rx += QLatin1String (".*");
790	}
791	break;
792	case `'?'`:
793	if (isEscaping) {
794	rx += QLatin1String ("\\?");
795	isEscaping = false;
796	} else {
797	rx += QLatin1Char (`'.'`);
798	}
799
800	break;
801	case `'$'`:
802	case `'('`:
803	case `')'`:
804	case `'+'`:
805	case `'.'`:
806	case `'^'`:
807	case `'{'`:
808	case `'\|'`:
809	case `'}'`:
810	if (isEscaping) {
811	isEscaping = false;
812	rx += QLatin1String ("\\\\");
813	}
814	rx += QLatin1Char (`'\\'`);
815	rx += c;
816	break;
817	case `'['`:
818	if (isEscaping) {
819	isEscaping = false;
820	rx += QLatin1String ("\\[");
821	} else {
822	rx += c;
823	if (wc[i] == QLatin1Char (`'^'`))
824	rx += wc[i++];
825	if (i < wclen) {
826	if (wc[i] == QLatin1Char (`']'`))
827	rx += wc[i++];
828	while (i < wclen && wc[i] != QLatin1Char (`']'`)) {
829	if (wc[i] == QLatin1Char (`'\\'`))
830	rx += QLatin1Char (`'\\'`);
831	rx += wc[i++];
832	}
833	}
834	}
835	break;
836
837	case `']'`:
838	if(isEscaping){
839	isEscaping = false;
840	rx += QLatin1String ("\\");
841	}
842	rx += c;
843	break;
844
845	default:
846	if(isEscaping){
847	isEscaping = false;
848	rx += QLatin1String ("\\\\");
849	}
850	rx += c;
851	}
852	}
853	return rx;
854	}
855	#endif
856
857	static int caretIndex(int offset, QRegExp::CaretMode caretMode)
858	{
859	if (caretMode == QRegExp::CaretAtZero) {
860	return `0`;
861	} else if (caretMode == QRegExp::CaretAtOffset) {
862	return offset;
863	} else { // QRegExp::CaretWontMatch
864	return -`1`;
865	}
866	}
867
868	/*
869	The QRegExpEngineKey struct uniquely identifies an engine.
870	*/
871	struct QRegExpEngineKey
872	{
873	QString pattern;
874	QRegExp::PatternSyntax patternSyntax;
875	Qt::CaseSensitivity cs;
876
877	inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
878	Qt::CaseSensitivity cs)
879	: pattern (pattern), patternSyntax(patternSyntax), cs(cs) {}
880
881	inline void clear() {
882	pattern.clear();
883	patternSyntax = QRegExp::RegExp;
884	cs = Qt::CaseSensitive;
885	}
886	};
887
888	static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
889	{
890	return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
891	&& key1.cs == key2.cs;
892	}
893
894	static uint qHash(const QRegExpEngineKey &key, uint seed = `0`) noexcept
895	{
896	QtPrivate::QHashCombine hash;
897	seed = hash (seed, key.pattern);
898	seed = hash (seed, key.patternSyntax);
899	seed = hash (seed, key.cs);
900	return seed;
901	}
902
903	class QRegExpEngine;
904
905	//Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);
906
907	/*
908	This is the engine state during matching.
909	*/
910	struct QRegExpMatchState
911	{
912	const QChar in; // a pointer to the input string data*
913	int pos; // the current position in the string
914	int caretPos;
915	int len; // the length of the input string
916	bool minimal; // minimal matching?
917	int bigArray; // big array holding the data for the next pointers*
918	int inNextStack; // is state is nextStack?*
919	int curStack; // stack of current states*
920	int nextStack; // stack of next states*
921	int curCapBegin; // start of current states' captures*
922	int nextCapBegin; // start of next states' captures*
923	int curCapEnd; // end of current states' captures*
924	int nextCapEnd; // end of next states' captures*
925	int tempCapBegin; // start of temporary captures*
926	int tempCapEnd; // end of temporary captures*
927	int capBegin; // start of captures for a next state*
928	int capEnd; // end of captures for a next state*
929	int slideTab; // bump-along slide table for bad-character heuristic*
930	int captured; // what match() returned last*
931	int slideTabSize; // size of slide table
932	int capturedSize;
933	#ifndef QT_NO_REGEXP_BACKREF
934	QList<QVector<int> > sleeping; // list of back-reference sleepers
935	#endif
936	int matchLen; // length of match
937	int oneTestMatchedLen; // length of partial match
938
939	const QRegExpEngine *eng;
940
941	inline QRegExpMatchState() : bigArray(nullptr), captured(nullptr) {}
942	inline ~QRegExpMatchState() { free(ptr: bigArray); }
943
944	void drain() { free(ptr: bigArray); bigArray = nullptr; captured = nullptr; } // to save memory
945	void prepareForMatch(QRegExpEngine *eng);
946	void match(const QChar str, int* len, int pos, bool minimal,
947	bool oneTest, int caretIndex);
948	bool matchHere();
949	bool testAnchor(int i, int a, const int *capBegin);
950	};
951
952	/*
953	The struct QRegExpAutomatonState represents one state in a modified NFA. The
954	input characters matched are stored in the state instead of on
955	the transitions, something possible for an automaton
956	constructed from a regular expression.
957	*/
958	struct QRegExpAutomatonState
959	{
960	#ifndef QT_NO_REGEXP_CAPTURE
961	int atom; // which atom does this state belong to?
962	#endif
963	int match; // what does it match? (see CharClassBit and BackRefBit)
964	QVector<int> outs; // out-transitions
965	QMap<int, int> reenter; // atoms reentered when transiting out
966	QMap<int, int> anchors; // anchors met when transiting out
967
968	inline QRegExpAutomatonState() { }
969	#ifndef QT_NO_REGEXP_CAPTURE
970	inline QRegExpAutomatonState(int a, int m)
971	: atom(a), match(m) { }
972	#else
973	inline QRegExpAutomatonState(int m)
974	: match(m) { }
975	#endif
976	};
977
978	Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
979
980	/*
981	The struct QRegExpCharClassRange represents a range of characters (e.g.,
982	[0-9] denotes range 48 to 57).
983	*/
984	struct QRegExpCharClassRange
985	{
986	ushort from; // 48
987	ushort len; // 10
988	};
989
990	Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
991
992	#ifndef QT_NO_REGEXP_CAPTURE
/*
  The struct QRegExpAtom represents one node in the hierarchy of regular
  expression atoms.
*/
struct QRegExpAtom
{
    // Negative markers stored in 'capture' in place of a capture index.
    enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };

    int parent; // index of parent in array of atoms
    int capture; // index of capture, from 1 to ncap - 1
};
1004
1005	Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
1006	#endif
1007
1008	struct QRegExpLookahead;
1009
1010	#ifndef QT_NO_REGEXP_ANCHOR_ALT
/*
  The struct QRegExpAnchorAlternation represents a pair of anchors with
  OR semantics.
*/
struct QRegExpAnchorAlternation
{
    int a; // this anchor...
    int b; // ...or this one
};
1020
1021	Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
1022	#endif
1023
1024	#ifndef QT_NO_REGEXP_CCLASS
1025
1026	#define FLAG(x) (1 << (x))
1027	/*
1028	The class QRegExpCharClass represents a set of characters, such as can
1029	be found in regular expressions (e.g., [a-z] denotes the set
1030	{a, b, ..., z}).
1031	*/
1032	class QRegExpCharClass
1033	{
1034	public:
1035	QRegExpCharClass();
1036
1037	void clear();
1038	bool negative() const { return n; }
1039	void setNegative(bool negative);
1040	void addCategories(uint cats);
1041	void addRange(ushort from, ushort to);
1042	void addSingleton(ushort ch) { addRange(from: ch, to: ch); }
1043
1044	bool in(QChar ch) const;
1045	#ifndef QT_NO_REGEXP_OPTIM
1046	const QVector<int> &firstOccurrence() const { return occ1; }
1047	#endif
1048
1049	#if defined(QT_DEBUG)
1050	void dump() const;
1051	#endif
1052
1053	private:
1054	QVector<QRegExpCharClassRange> r; // character ranges
1055	#ifndef QT_NO_REGEXP_OPTIM
1056	QVector<int> occ1; // first-occurrence array
1057	#endif
1058	uint c; // character classes
1059	bool n; // negative?
1060	};
1061	#else
1062	struct QRegExpCharClass
1063	{
1064	int dummy;
1065
1066	#ifndef QT_NO_REGEXP_OPTIM
1067	QRegExpCharClass() { occ1.fill(`0`, NumBadChars); }
1068
1069	const QVector<int> &firstOccurrence() const { return occ1; }
1070	QVector<int> occ1;
1071	#endif
1072	};
1073	#endif
1074
1075	Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
1076
1077	/*
1078	The QRegExpEngine class encapsulates a modified nondeterministic
1079	finite automaton (NFA).
1080	*/
1081	class QRegExpEngine
1082	{
1083	public:
1084	QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1085	: cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1086
1087	QRegExpEngine(const QRegExpEngineKey &key);
1088	~QRegExpEngine();
1089
1090	bool isValid() const { return valid; }
1091	const QString &errorString() const { return yyError; }
1092	int captureCount() const { return officialncap; }
1093
1094	int createState(QChar ch);
1095	int createState(const QRegExpCharClass &cc);
1096	#ifndef QT_NO_REGEXP_BACKREF
1097	int createState(int bref);
1098	#endif
1099
1100	void addCatTransitions(const QVector<int> &from, const QVector<int> &to);
1101	#ifndef QT_NO_REGEXP_CAPTURE
1102	void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);
1103	#endif
1104
1105	#ifndef QT_NO_REGEXP_ANCHOR_ALT
1106	int anchorAlternation(int a, int b);
1107	int anchorConcatenation(int a, int b);
1108	#else
1109	int anchorAlternation(int a, int b) { return a & b; }
1110	int anchorConcatenation(int a, int b) { return a \| b; }
1111	#endif
1112	void addAnchors(int from, int to, int a);
1113
1114	#ifndef QT_NO_REGEXP_OPTIM
1115	void heuristicallyChooseHeuristic();
1116	#endif
1117
1118	#if defined(QT_DEBUG)
1119	void dump() const;
1120	#endif
1121
1122	QAtomicInt ref;
1123
1124	private:
1125	enum { CharClassBit = `0x10000`, BackRefBit = `0x20000` };
1126	enum { InitialState = `0`, FinalState = `1` };
1127
1128	void setup();
1129	int setupState(int match);
1130
1131	/*
1132	Let's hope that 13 lookaheads and 14 back-references are
1133	enough.
1134	*/
1135	enum { MaxLookaheads = `13`, MaxBackRefs = `14` };
1136	enum { Anchor_Dollar = `0x00000001`, Anchor_Caret = `0x00000002`, Anchor_Word = `0x00000004`,
1137	Anchor_NonWord = `0x00000008`, Anchor_FirstLookahead = `0x00000010`,
1138	Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1139	Anchor_BackRef0Empty = Anchor_BackRef1Empty >> `1`,
1140	Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1141
1142	Anchor_LookaheadMask = (Anchor_FirstLookahead - `1`) ^
1143	((Anchor_FirstLookahead << MaxLookaheads) - `1`) };
1144	#ifndef QT_NO_REGEXP_CAPTURE
1145	int startAtom(bool officialCapture);
1146	void finishAtom(int atom, bool needCapture);
1147	#endif
1148
1149	#ifndef QT_NO_REGEXP_LOOKAHEAD
1150	int addLookahead(QRegExpEngine eng, bool* negative);
1151	#endif
1152
1153	#ifndef QT_NO_REGEXP_OPTIM
1154	bool goodStringMatch(QRegExpMatchState &matchState) const;
1155	bool badCharMatch(QRegExpMatchState &matchState) const;
1156	#else
1157	bool bruteMatch(QRegExpMatchState &matchState) const;
1158	#endif
1159
1160	QVector<QRegExpAutomatonState> s; // array of states
1161	#ifndef QT_NO_REGEXP_CAPTURE
1162	QVector<QRegExpAtom> f; // atom hierarchy
1163	int nf; // number of atoms
1164	int cf; // current atom
1165	QVector<int> captureForOfficialCapture;
1166	#endif
1167	int officialncap; // number of captures, seen from the outside
1168	int ncap; // number of captures, seen from the inside
1169	#ifndef QT_NO_REGEXP_CCLASS
1170	QVector<QRegExpCharClass> cl; // array of character classes
1171	#endif
1172	#ifndef QT_NO_REGEXP_LOOKAHEAD
1173	QVector<QRegExpLookahead > ahead; // array of lookaheads*
1174	#endif
1175	#ifndef QT_NO_REGEXP_ANCHOR_ALT
1176	QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1177	#endif
1178	#ifndef QT_NO_REGEXP_OPTIM
1179	bool caretAnchored; // does the regexp start with ^?
1180	bool trivial; // is the good-string all that needs to match?
1181	#endif
1182	bool valid; // is the regular expression valid?
1183	Qt::CaseSensitivity cs; // case sensitive?
1184	bool greedyQuantifiers; // RegExp2?
1185	bool xmlSchemaExtensions;
1186	#ifndef QT_NO_REGEXP_BACKREF
1187	int nbrefs; // number of back-references
1188	#endif
1189
1190	#ifndef QT_NO_REGEXP_OPTIM
1191	bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1192
1193	int goodEarlyStart; // the index where goodStr can first occur in a match
1194	int goodLateStart; // the index where goodStr can last occur in a match
1195	QString goodStr; // the string that any match has to contain
1196
1197	int minl; // the minimum length of a match
1198	QVector<int> occ1; // first-occurrence array
1199	#endif
1200
1201	/*
1202	The class Box is an abstraction for a regular expression
1203	fragment. It can also be seen as one node in the syntax tree of
1204	a regular expression with synthetized attributes.
1205
1206	Its interface is ugly for performance reasons.
1207	*/
1208	class Box
1209	{
1210	public:
1211	Box(QRegExpEngine *engine);
1212	Box(const Box &b) { operator=(b); }
1213
1214	Box &operator=(const Box &b);
1215
1216	void clear() { operator=(b: Box (eng)); }
1217	void set(QChar ch);
1218	void set(const QRegExpCharClass &cc);
1219	#ifndef QT_NO_REGEXP_BACKREF
1220	void set(int bref);
1221	#endif
1222
1223	void cat(const Box &b);
1224	void orx(const Box &b);
1225	void plus(int atom);
1226	void opt();
1227	void catAnchor(int a);
1228	#ifndef QT_NO_REGEXP_OPTIM
1229	void setupHeuristics();
1230	#endif
1231
1232	#if defined(QT_DEBUG)
1233	void dump() const;
1234	#endif
1235
1236	private:
1237	void addAnchorsToEngine(const Box &to) const;
1238
1239	QRegExpEngine eng; // the automaton under construction*
1240	QVector<int> ls; // the left states (firstpos)
1241	QVector<int> rs; // the right states (lastpos)
1242	QMap<int, int> lanchors; // the left anchors
1243	QMap<int, int> ranchors; // the right anchors
1244	int skipanchors; // the anchors to match if the box is skipped
1245
1246	#ifndef QT_NO_REGEXP_OPTIM
1247	int earlyStart; // the index where str can first occur
1248	int lateStart; // the index where str can last occur
1249	QString str; // a string that has to occur in any match
1250	QString leftStr; // a string occurring at the left of this box
1251	QString rightStr; // a string occurring at the right of this box
1252	int maxl; // the maximum length of this box (possibly InftyLen)
1253	#endif
1254
1255	int minl; // the minimum length of this box
1256	#ifndef QT_NO_REGEXP_OPTIM
1257	QVector<int> occ1; // first-occurrence array
1258	#endif
1259	};
1260
1261	friend class Box;
1262
1263	/*
1264	This is the lexical analyzer for regular expressions.
1265	*/
1266	enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1267	Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1268	Tok_Word, Tok_NonWord, Tok_Char = `0x10000`, Tok_BackRef = `0x20000` };
1269	int getChar();
1270	int getEscape();
1271	#ifndef QT_NO_REGEXP_INTERVAL
1272	int getRep(int def);
1273	#endif
1274	#ifndef QT_NO_REGEXP_LOOKAHEAD
1275	void skipChars(int n);
1276	#endif
1277	void error(const char *msg);
1278	void startTokenizer(const QChar rx, int* len);
1279	int getToken();
1280
1281	const QChar yyIn; // a pointer to the input regular expression pattern*
1282	int yyPos0; // the position of yyTok in the input pattern
1283	int yyPos; // the position of the next character to read
1284	int yyLen; // the length of yyIn
1285	int yyCh; // the last character read
1286	QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1287	int yyMinRep; // attribute for Tok_Quantifier
1288	int yyMaxRep; // ditto
1289	QString yyError; // syntax error or overflow during parsing?
1290
1291	/*
1292	This is the syntactic analyzer for regular expressions.
1293	*/
1294	int parse(const QChar rx, int* len);
1295	void parseAtom(Box *box);
1296	void parseFactor(Box *box);
1297	void parseTerm(Box *box);
1298	void parseExpression(Box *box);
1299
1300	int yyTok; // the last token read
1301	bool yyMayCapture; // set this to false to disable capturing
1302
1303	friend struct QRegExpMatchState;
1304	};
1305
1306	#ifndef QT_NO_REGEXP_LOOKAHEAD
1307	/*
1308	The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1309	(?=foo) and (?!bar)).
1310	*/
1311	struct QRegExpLookahead
1312	{
1313	QRegExpEngine eng; // NFA representing the embedded regular expression*
1314	bool neg; // negative lookahead?
1315
1316	inline QRegExpLookahead(QRegExpEngine eng0, bool* neg0)
1317	: eng(eng0), neg(neg0) { }
1318	inline ~QRegExpLookahead() { delete eng; }
1319	};
1320	#endif
1321
1322	/!*
1323	\internal
1324	convert the pattern string to the RegExp syntax.
1325
1326	This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1327	*/
1328	Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
1329	{
1330	switch (patternSyntax) {
1331	#ifndef QT_NO_REGEXP_WILDCARD
1332	case QRegExp::Wildcard:
1333	return wc2rx(wc_str: pattern, enableEscaping: false);
1334	case QRegExp::WildcardUnix:
1335	return wc2rx(wc_str: pattern, enableEscaping: true);
1336	#endif
1337	case QRegExp::FixedString:
1338	return QRegExp::escape(str: pattern);
1339	case QRegExp::W3CXmlSchema11:
1340	default:
1341	return pattern;
1342	}
1343	}
1344
1345	QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1346	: cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1347	xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1348	{
1349	setup();
1350
1351	QString rx = qt_regexp_toCanonical(pattern: key.pattern, patternSyntax: key.patternSyntax);
1352
1353	valid = (parse(rx: rx.unicode(), len: rx.length()) == rx.length());
1354	if (!valid) {
1355	#ifndef QT_NO_REGEXP_OPTIM
1356	trivial = false;
1357	#endif
1358	error(RXERR_LEFTDELIM);
1359	}
1360	}
1361
1362	QRegExpEngine::~QRegExpEngine()
1363	{
1364	#ifndef QT_NO_REGEXP_LOOKAHEAD
1365	qDeleteAll(c: ahead);
1366	#endif
1367	}
1368
1369	void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1370	{
1371	/*
1372	We use one QVector<int> for all the big data used a lot in
1373	matchHere() and friends.
1374	*/
1375	int ns = eng->s.size(); // number of states
1376	int ncap = eng->ncap;
1377	#ifndef QT_NO_REGEXP_OPTIM
1378	int newSlideTabSize = qMax(a: eng->minl + `1`, b: `16`);
1379	#else
1380	int newSlideTabSize = `0`;
1381	#endif
1382	int numCaptures = eng->captureCount();
1383	int newCapturedSize = `2` + `2` * numCaptures;
1384	bigArray = q_check_ptr(p: (int )realloc(ptr: bigArray, size: ((`3` + `4` ncap) * ns + `4` * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1385
1386	// set all internal variables only _after_ bigArray is realloc'ed
1387	// to prevent a broken regexp in oom case
1388
1389	slideTabSize = newSlideTabSize;
1390	capturedSize = newCapturedSize;
1391	inNextStack = bigArray;
1392	memset(s: inNextStack, c: -`1`, n: ns * sizeof(int));
1393	curStack = inNextStack + ns;
1394	nextStack = inNextStack + `2` * ns;
1395
1396	curCapBegin = inNextStack + `3` * ns;
1397	nextCapBegin = curCapBegin + ncap * ns;
1398	curCapEnd = curCapBegin + `2` * ncap * ns;
1399	nextCapEnd = curCapBegin + `3` * ncap * ns;
1400
1401	tempCapBegin = curCapBegin + `4` * ncap * ns;
1402	tempCapEnd = tempCapBegin + ncap;
1403	capBegin = tempCapBegin + `2` * ncap;
1404	capEnd = tempCapBegin + `3` * ncap;
1405
1406	slideTab = tempCapBegin + `4` * ncap;
1407	captured = slideTab + slideTabSize;
1408	memset(s: captured, c: -`1`, n: capturedSize*sizeof(int));
1409	this->eng = eng;
1410	}
1411
1412	/*
1413	Tries to match in str and returns an array of (begin, length) pairs
1414	for captured text. If there is no match, all pairs are (-1, -1).
1415	*/
1416	void QRegExpMatchState::match(const QChar str0, int* len0, int pos0,
1417	bool minimal0, bool oneTest, int caretIndex)
1418	{
1419	bool matched = false;
1420	QChar char_null;
1421
1422	#ifndef QT_NO_REGEXP_OPTIM
1423	if (eng->trivial && !oneTest) {
1424	// ### Qt6: qsizetype
1425	pos = int(QtPrivate::findString(haystack: QStringView (str0, len0), from: pos0, needle: QStringView (eng->goodStr.unicode(), eng->goodStr.length()), cs: eng->cs));
1426	matchLen = eng->goodStr.length();
1427	matched = (pos != -`1`);
1428	} else
1429	#endif
1430	{
1431	in = str0;
1432	if (in == nullptr)
1433	in = &char_null;
1434	pos = pos0;
1435	caretPos = caretIndex;
1436	len = len0;
1437	minimal = minimal0;
1438	matchLen = `0`;
1439	oneTestMatchedLen = `0`;
1440
1441	if (eng->valid && pos >= `0` && pos <= len) {
1442	#ifndef QT_NO_REGEXP_OPTIM
1443	if (oneTest) {
1444	matched = matchHere();
1445	} else {
1446	if (pos <= len - eng->minl) {
1447	if (eng->caretAnchored) {
1448	matched = matchHere();
1449	} else if (eng->useGoodStringHeuristic) {
1450	matched = eng->goodStringMatch(matchState&: *this);
1451	} else {
1452	matched = eng->badCharMatch(matchState&: *this);
1453	}
1454	}
1455	}
1456	#else
1457	matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1458	#endif
1459	}
1460	}
1461
1462	if (matched) {
1463	int *c = captured;
1464	*c++ = pos;
1465	*c++ = matchLen;
1466
1467	int numCaptures = (capturedSize - `2`) >> `1`;
1468	#ifndef QT_NO_REGEXP_CAPTURE
1469	for (int i = `0`; i < numCaptures; ++i) {
1470	int j = eng->captureForOfficialCapture.at(i);
1471	if (capBegin[j] != EmptyCapture) {
1472	int len = capEnd[j] - capBegin[j];
1473	*c++ = (len > `0`) ? pos + capBegin[j] : `0`;
1474	*c++ = len;
1475	} else {
1476	*c++ = -`1`;
1477	*c++ = -`1`;
1478	}
1479	}
1480	#endif
1481	} else {
1482	// we rely on 2's complement here
1483	memset(s: captured, c: -`1`, n: capturedSize * sizeof(int));
1484	}
1485	}
1486
1487	/*
1488	The three following functions add one state to the automaton and
1489	return the number of the state.
1490	*/
1491
1492	int QRegExpEngine::createState(QChar ch)
1493	{
1494	return setupState(ch.unicode());
1495	}
1496
1497	int QRegExpEngine::createState(const QRegExpCharClass &cc)
1498	{
1499	#ifndef QT_NO_REGEXP_CCLASS
1500	int n = cl.size();
1501	cl += QRegExpCharClass (cc);
1502	return setupState(CharClassBit \| n);
1503	#else
1504	Q_UNUSED(cc);
1505	return setupState(CharClassBit);
1506	#endif
1507	}
1508
1509	#ifndef QT_NO_REGEXP_BACKREF
1510	int QRegExpEngine::createState(int bref)
1511	{
1512	if (bref > nbrefs) {
1513	nbrefs = bref;
1514	if (nbrefs > MaxBackRefs) {
1515	error(RXERR_LIMIT);
1516	return `0`;
1517	}
1518	}
1519	return setupState(BackRefBit \| bref);
1520	}
1521	#endif
1522
1523	/*
1524	The two following functions add a transition between all pairs of
1525	states (i, j) where i is found in from, and j is found in to.
1526
1527	Cat-transitions are distinguished from plus-transitions for
1528	capturing.
1529	*/
1530
1531	void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)
1532	{
1533	for (int i = `0`; i < from.size(); i++)
1534	mergeInto(a: &s [from.at(i)].outs, b: to);
1535	}
1536
1537	#ifndef QT_NO_REGEXP_CAPTURE
1538	void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)
1539	{
1540	for (int i = `0`; i < from.size(); i++) {
1541	QRegExpAutomatonState &st = s [from.at(i)];
1542	const QVector<int> oldOuts = st.outs;
1543	mergeInto(a: &st.outs, b: to);
1544	if (f.at(i: atom).capture != QRegExpAtom::NoCapture) {
1545	for (int j = `0`; j < to.size(); j++) {
1546	// ### st.reenter.contains(to.at(j)) check looks suspicious
1547	if (!st.reenter.contains(akey: to.at(i: j)) &&
1548	!std::binary_search(first: oldOuts.constBegin(), last: oldOuts.constEnd(), val: to.at(i: j)))
1549	st.reenter.insert(akey: to.at(i: j), avalue: atom);
1550	}
1551	}
1552	}
1553	}
1554	#endif
1555
1556	#ifndef QT_NO_REGEXP_ANCHOR_ALT
1557	/*
1558	Returns an anchor that means a OR b.
1559	*/
1560	int QRegExpEngine::anchorAlternation(int a, int b)
1561	{
1562	if (((a & b) == a \|\| (a & b) == b) && ((a \| b) & Anchor_Alternation) == `0`)
1563	return a & b;
1564
1565	int n = aa.size();
1566	#ifndef QT_NO_REGEXP_OPTIM
1567	if (n > `0` && aa.at(i: n - `1`).a == a && aa.at(i: n - `1`).b == b)
1568	return Anchor_Alternation \| (n - `1`);
1569	#endif
1570
1571	QRegExpAnchorAlternation element = {.a: a, .b: b};
1572	aa.append(t: element);
1573	return Anchor_Alternation \| n;
1574	}
1575
1576	/*
1577	Returns an anchor that means a AND b.
1578	*/
1579	int QRegExpEngine::anchorConcatenation(int a, int b)
1580	{
1581	if (((a \| b) & Anchor_Alternation) == `0`)
1582	return a \| b;
1583	if ((b & Anchor_Alternation) != `0`)
1584	qSwap(value1&: a, value2&: b);
1585
1586	int aprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).a, b);
1587	int bprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).b, b);
1588	return anchorAlternation(a: aprime, b: bprime);
1589	}
1590	#endif
1591
1592	/*
1593	Adds anchor a on a transition caracterised by its from state and
1594	its to state.
1595	*/
1596	void QRegExpEngine::addAnchors(int from, int to, int a)
1597	{
1598	QRegExpAutomatonState &st = s [from];
1599	if (st.anchors.contains(akey: to))
1600	a = anchorAlternation(a: st.anchors.value(akey: to), b: a);
1601	st.anchors.insert(akey: to, avalue: a);
1602	}
1603
1604	#ifndef QT_NO_REGEXP_OPTIM
1605	/*
1606	This function chooses between the good-string and the bad-character
1607	heuristics. It computes two scores and chooses the heuristic with
1608	the highest score.
1609
1610	Here are some common-sense constraints on the scores that should be
1611	respected if the formulas are ever modified: (1) If goodStr is
1612	empty, the good-string heuristic scores 0. (2) If the regular
1613	expression is trivial, the good-string heuristic should be used.
1614	(3) If the search is case insensitive, the good-string heuristic
1615	should be used, unless it scores 0. (Case insensitivity turns all
1616	entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1617	big, the good-string heuristic should score less.
1618	*/
1619	void QRegExpEngine::heuristicallyChooseHeuristic()
1620	{
1621	if (minl == `0`) {
1622	useGoodStringHeuristic = false;
1623	} else if (trivial) {
1624	useGoodStringHeuristic = true;
1625	} else {
1626	/*
1627	Magic formula: The good string has to constitute a good
1628	proportion of the minimum-length string, and appear at a
1629	more-or-less known index.
1630	*/
1631	int goodStringScore = (`64` * goodStr.length() / minl) -
1632	(goodLateStart - goodEarlyStart);
1633	/*
1634	Less magic formula: We pick some characters at random, and
1635	check whether they are good or bad.
1636	*/
1637	int badCharScore = `0`;
1638	int step = qMax(a: `1`, b: NumBadChars / `32`);
1639	for (int i = `1`; i < NumBadChars; i += step) {
1640	if (occ1.at(i) == NoOccurrence)
1641	badCharScore += minl;
1642	else
1643	badCharScore += occ1.at(i);
1644	}
1645	badCharScore /= minl;
1646	useGoodStringHeuristic = (goodStringScore > badCharScore);
1647	}
1648	}
1649	#endif
1650
1651	#if defined(QT_DEBUG)
1652	void QRegExpEngine::dump() const
1653	{
1654	int i, j;
1655	qDebug(msg: "Case %ssensitive engine", cs ? "" : "in");
1656	qDebug(msg: " States");
1657	for (i = `0`; i < s.size(); i++) {
1658	qDebug(msg: " %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
1659	#ifndef QT_NO_REGEXP_CAPTURE
1660	if (nf > `0`)
1661	qDebug(msg: " in atom %d", s [i].atom);
1662	#endif
1663	int m = s [i].match;
1664	if ((m & CharClassBit) != `0`) {
1665	qDebug(msg: " match character class %d", m ^ CharClassBit);
1666	#ifndef QT_NO_REGEXP_CCLASS
1667	cl [m ^ CharClassBit].dump();
1668	#else
1669	qDebug(" negative character class");
1670	#endif
1671	} else if ((m & BackRefBit) != `0`) {
1672	qDebug(msg: " match back-reference %d", m ^ BackRefBit);
1673	} else if (m >= `0x20` && m <= `0x7e`) {
1674	qDebug(msg: " match 0x%.4x (%c)", m, m);
1675	} else {
1676	qDebug(msg: " match 0x%.4x", m);
1677	}
1678	for (j = `0`; j < s [i].outs.size(); j++) {
1679	int next = s [i].outs [j];
1680	qDebug(msg: " -> %d", next);
1681	if (s [i].reenter.contains(akey: next))
1682	qDebug(msg: " [reenter %d]", s [i].reenter [next]);
1683	if (s [i].anchors.value(akey: next) != `0`)
1684	qDebug(msg: " [anchors 0x%.8x]", s [i].anchors [next]);
1685	}
1686	}
1687	#ifndef QT_NO_REGEXP_CAPTURE
1688	if (nf > `0`) {
1689	qDebug(msg: " Atom Parent Capture");
1690	for (i = `0`; i < nf; i++) {
1691	if (f [i].capture == QRegExpAtom::NoCapture) {
1692	qDebug(msg: " %6d %6d nil", i, f [i].parent);
1693	} else {
1694	int cap = f [i].capture;
1695	bool official = captureForOfficialCapture.contains(t: cap);
1696	qDebug(msg: " %6d %6d %6d %s", i, f [i].parent, f [i].capture,
1697	official ? "official" : "");
1698	}
1699	}
1700	}
1701	#endif
1702	#ifndef QT_NO_REGEXP_ANCHOR_ALT
1703	for (i = `0`; i < aa.size(); i++)
1704	qDebug(msg: " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa [i].a, aa [i].b);
1705	#endif
1706	}
1707	#endif
1708
1709	void QRegExpEngine::setup()
1710	{
1711	ref.storeRelaxed(newValue: `1`);
1712	#ifndef QT_NO_REGEXP_CAPTURE
1713	f.resize(asize: `32`);
1714	nf = `0`;
1715	cf = -`1`;
1716	#endif
1717	officialncap = `0`;
1718	ncap = `0`;
1719	#ifndef QT_NO_REGEXP_OPTIM
1720	caretAnchored = true;
1721	trivial = true;
1722	#endif
1723	valid = false;
1724	#ifndef QT_NO_REGEXP_BACKREF
1725	nbrefs = `0`;
1726	#endif
1727	#ifndef QT_NO_REGEXP_OPTIM
1728	useGoodStringHeuristic = true;
1729	minl = `0`;
1730	occ1.fill(from: `0`, asize: NumBadChars);
1731	#endif
1732	}
1733
1734	int QRegExpEngine::setupState(int match)
1735	{
1736	#ifndef QT_NO_REGEXP_CAPTURE
1737	s += QRegExpAutomatonState (cf, match);
1738	#else
1739	s += QRegExpAutomatonState(match);
1740	#endif
1741	return s.size() - `1`;
1742	}
1743
1744	#ifndef QT_NO_REGEXP_CAPTURE
1745	/*
1746	Functions startAtom() and finishAtom() should be called to delimit
1747	atoms. When a state is created, it is assigned to the current atom.
1748	The information is later used for capturing.
1749	*/
1750	int QRegExpEngine::startAtom(bool officialCapture)
1751	{
1752	if ((nf & (nf + `1`)) == `0` && nf + `1` >= f.size())
1753	f.resize(asize: (nf + `1`) << `1`);
1754	f [nf].parent = cf;
1755	cf = nf++;
1756	f [cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1757	return cf;
1758	}
1759
1760	void QRegExpEngine::finishAtom(int atom, bool needCapture)
1761	{
1762	if (greedyQuantifiers && needCapture && f [atom].capture == QRegExpAtom::NoCapture)
1763	f [atom].capture = QRegExpAtom::UnofficialCapture;
1764	cf = f.at(i: atom).parent;
1765	}
1766	#endif
1767
1768	#ifndef QT_NO_REGEXP_LOOKAHEAD
1769	/*
1770	Creates a lookahead anchor.
1771	*/
1772	int QRegExpEngine::addLookahead(QRegExpEngine eng, bool* negative)
1773	{
1774	int n = ahead.size();
1775	if (n == MaxLookaheads) {
1776	error(RXERR_LIMIT);
1777	return `0`;
1778	}
1779	ahead += new QRegExpLookahead (eng, negative);
1780	return Anchor_FirstLookahead << n;
1781	}
1782	#endif
1783
1784	#ifndef QT_NO_REGEXP_CAPTURE
/*
  We want the longest leftmost captures.

  Compares two sets of \a ncap capture zones, given as parallel
  begin/end arrays, and returns \c true if set 1 is strictly better
  than set 2. Captures are compared in index order; the first capture
  that differs decides: an earlier begin wins, and for equal begins a
  later end wins. Returns \c false when all captures are identical
  (including when \a ncap is 0).
*/
static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
                            const int *end2)
{
    for (int i = 0; i < ncap; i++) {
        int delta = begin2[i] - begin1[i]; // it has to start early...
        if (delta == 0)
            delta = end1[i] - end2[i]; // ...and end late

        if (delta != 0)
            return delta > 0;
    }
    return false;
}
1801	#endif
1802
1803	/*
1804	Returns \c true if anchor a matches at position pos + i in the input
1805	string, otherwise false.
1806	*/
1807	bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1808	{
1809	int j;
1810
1811	#ifndef QT_NO_REGEXP_ANCHOR_ALT
1812	if ((a & QRegExpEngine::Anchor_Alternation) != `0`)
1813	return testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1814	\|\| testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1815	#endif
1816
1817	if ((a & QRegExpEngine::Anchor_Caret) != `0`) {
1818	if (pos + i != caretPos)
1819	return false;
1820	}
1821	if ((a & QRegExpEngine::Anchor_Dollar) != `0`) {
1822	if (pos + i != len)
1823	return false;
1824	}
1825	#ifndef QT_NO_REGEXP_ESCAPE
1826	if ((a & (QRegExpEngine::Anchor_Word \| QRegExpEngine::Anchor_NonWord)) != `0`) {
1827	bool before = false;
1828	bool after = false;
1829	if (pos + i != `0`)
1830	before = isWord(ch: in[pos + i - `1`]);
1831	if (pos + i != len)
1832	after = isWord(ch: in[pos + i]);
1833	if ((a & QRegExpEngine::Anchor_Word) != `0` && (before == after))
1834	return false;
1835	if ((a & QRegExpEngine::Anchor_NonWord) != `0` && (before != after))
1836	return false;
1837	}
1838	#endif
1839	#ifndef QT_NO_REGEXP_LOOKAHEAD
1840	if ((a & QRegExpEngine::Anchor_LookaheadMask) != `0`) {
1841	const QVector<QRegExpLookahead *> &ahead = eng->ahead;
1842	for (j = `0`; j < ahead.size(); j++) {
1843	if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != `0`) {
1844	QRegExpMatchState matchState;
1845	matchState.prepareForMatch(eng: ahead [j]->eng);
1846	matchState.match(str0: in + pos + i, len0: len - pos - i, pos0: `0`,
1847	minimal0: true, oneTest: true, caretIndex: caretPos - pos - i);
1848	if ((matchState.captured[`0`] == `0`) == ahead [j]->neg)
1849	return false;
1850	}
1851	}
1852	}
1853	#endif
1854	#ifndef QT_NO_REGEXP_CAPTURE
1855	#ifndef QT_NO_REGEXP_BACKREF
1856	for (j = `0`; j < eng->nbrefs; j++) {
1857	if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != `0`) {
1858	int i = eng->captureForOfficialCapture.at(i: j);
1859	if (capBegin[i] != EmptyCapture)
1860	return false;
1861	}
1862	}
1863	#endif
1864	#endif
1865	return true;
1866	}
1867
1868	#ifndef QT_NO_REGEXP_OPTIM
1869	/*
1870	The three following functions are what Jeffrey Friedl would call
1871	transmissions (or bump-alongs). Using one or the other should make
1872	no difference except in performance.
1873	*/
1874
1875	bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1876	{
1877	int k = matchState.pos + goodEarlyStart;
1878	QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
1879	while ((k = matcher.indexIn(str: matchState.in, length: matchState.len, from: k)) != -`1`) {
1880	int from = k - goodLateStart;
1881	int to = k - goodEarlyStart;
1882	if (from > matchState.pos)
1883	matchState.pos = from;
1884
1885	while (matchState.pos <= to) {
1886	if (matchState.matchHere())
1887	return true;
1888	++matchState.pos;
1889	}
1890	++k;
1891	}
1892	return false;
1893	}
1894
1895	bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1896	{
1897	int slideHead = `0`;
1898	int slideNext = `0`;
1899	int i;
1900	int lastPos = matchState.len - minl;
1901	memset(s: matchState.slideTab, c: `0`, n: matchState.slideTabSize * sizeof(int));
1902
1903	/*
1904	Set up the slide table, used for the bad-character heuristic,
1905	using the table of first occurrence of each character.
1906	*/
1907	for (i = `0`; i < minl; i++) {
1908	int sk = occ1 [BadChar(matchState.in[matchState.pos + i])];
1909	if (sk == NoOccurrence)
1910	sk = i + `1`;
1911	if (sk > `0`) {
1912	int k = i + `1` - sk;
1913	if (k < `0`) {
1914	sk = i + `1`;
1915	k = `0`;
1916	}
1917	if (sk > matchState.slideTab[k])
1918	matchState.slideTab[k] = sk;
1919	}
1920	}
1921
1922	if (matchState.pos > lastPos)
1923	return false;
1924
1925	for (;;) {
1926	if (++slideNext >= matchState.slideTabSize)
1927	slideNext = `0`;
1928	if (matchState.slideTab[slideHead] > `0`) {
1929	if (matchState.slideTab[slideHead] - `1` > matchState.slideTab[slideNext])
1930	matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - `1`;
1931	matchState.slideTab[slideHead] = `0`;
1932	} else {
1933	if (matchState.matchHere())
1934	return true;
1935	}
1936
1937	if (matchState.pos == lastPos)
1938	break;
1939
1940	/*
1941	Update the slide table. This code has much in common with
1942	the initialization code.
1943	*/
1944	int sk = occ1 [BadChar(matchState.in[matchState.pos + minl])];
1945	if (sk == NoOccurrence) {
1946	matchState.slideTab[slideNext] = minl;
1947	} else if (sk > `0`) {
1948	int k = slideNext + minl - sk;
1949	if (k >= matchState.slideTabSize)
1950	k -= matchState.slideTabSize;
1951	if (sk > matchState.slideTab[k])
1952	matchState.slideTab[k] = sk;
1953	}
1954	slideHead = slideNext;
1955	++matchState.pos;
1956	}
1957	return false;
1958	}
1959	#else
1960	bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1961	{
1962	while (matchState.pos <= matchState.len) {
1963	if (matchState.matchHere())
1964	return true;
1965	++matchState.pos;
1966	}
1967	return false;
1968	}
1969	#endif
1970
1971	/*
1972	Here's the core of the engine. It tries to do a match here and now.
1973	*/
/*
  Tries to match the engine's automaton against the input starting
  exactly at position pos. Returns true if the final state was reached
  at least once (matchLen >= 0).

  The set of active states is kept in curStack (ncur entries), starting
  with only QRegExpEngine::InitialState. In each iteration of the main
  loop, i is the number of characters consumed so far and ch is the
  character at pos + i (0 once the end of the input is reached). Every
  outgoing transition of every active state is tested (anchors first,
  then the character / character class / back-reference); the states
  reached are collected in nextStack, with inNextStack mapping a state
  to its slot there or -1. The two stacks are swapped at the end of
  each iteration.

  With capture support, each active state carries ncap capture
  begin/end offsets (curCapBegin / curCapEnd, ncap ints per state).
  When two paths reach the same next state, isBetterCapture() decides
  which capture set is kept. A back-reference spanning more than one
  character puts the target state into the sleeping list until the
  remaining characters have been consumed.

  On exit, matchLen is the value of i recorded the last time the final
  state was reached (or -1), and oneTestMatchedLen is i - 1 for the
  value of i at which the loop stopped. With minimal set, the loop
  stops at the first iteration in which the final state is reached.
*/
1974	bool QRegExpMatchState::matchHere()
1975	{
1976	int ncur = `1`, nnext = `0`;
1977	int i = `0`, j, k, m;
1978	bool stop = false;
1979
1980	matchLen = -`1`;
1981	oneTestMatchedLen = -`1`;
1982	curStack[`0`] = QRegExpEngine::InitialState;
1983
1984	int ncap = eng->ncap;
1985	#ifndef QT_NO_REGEXP_CAPTURE
1986	if (ncap > `0`) {
1987	for (j = `0`; j < ncap; j++) {
1988	curCapBegin[j] = EmptyCapture;
1989	curCapEnd[j] = EmptyCapture;
1990	}
1991	}
1992	#endif
1993
// Main loop: one iteration per input position. It keeps running while
// there are active (or, with back-references, sleeping) states, the
// end of the input has not been passed, and stop has not been set.
1994	#ifndef QT_NO_REGEXP_BACKREF
1995	while ((ncur > `0` \|\| !sleeping.isEmpty()) && i <= len - pos && !stop)
1996	#else
1997	while (ncur > `0` && i <= len - pos && !stop)
1998	#endif
1999	{
2000	int ch = (i < len - pos) ? in[pos + i].unicode() : `0`;
2001	for (j = `0`; j < ncur; j++) {
2002	int cur = curStack[j];
2003	const QRegExpAutomatonState &scur = eng->s.at(i: cur);
2004	const QVector<int> &outs = scur.outs;
2005	for (k = `0`; k < outs.size(); k++) {
2006	int next = outs.at(i: k);
2007	const QRegExpAutomatonState &snext = eng->s.at(i: next);
2008	bool inside = true;
2009	#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2010	int needSomeSleep = `0`;
2011	#endif
2012
2013	/*
2014	First, check if the anchors are anchored properly.
2015	*/
2016	int a = scur.anchors.value(akey: next);
2017	if (a != `0` && !testAnchor(i, a, capBegin: curCapBegin + j * ncap))
2018	inside = false;
2019
2020	/*
2021	If indeed they are, check if the input character is
2022	correct for this transition.
2023	*/
2024	if (inside) {
2025	m = snext.match;
2026	if ((m & (QRegExpEngine::CharClassBit \| QRegExpEngine::BackRefBit)) == `0`) {
2027	if (eng->cs)
2028	inside = (m == ch);
2029	else
2030	inside = (QChar (m).toLower() == QChar (ch).toLower());
2031	} else if (next == QRegExpEngine::FinalState) {
2032	matchLen = i;
2033	stop = minimal;
2034	inside = true;
2035	} else if ((m & QRegExpEngine::CharClassBit) != `0`) {
2036	#ifndef QT_NO_REGEXP_CCLASS
2037	const QRegExpCharClass &cc = eng->cl.at(i: m ^ QRegExpEngine::CharClassBit);
2038	if (eng->cs)
2039	inside = cc.in(ch: QChar (ch));
2040	else if (cc.negative())
2041	inside = cc.in(ch: QChar (ch).toLower()) &&
2042	cc.in(ch: QChar (ch).toUpper());
2043	else
2044	inside = cc.in(ch: QChar (ch).toLower()) \|\|
2045	cc.in(ch: QChar (ch).toUpper());
2046	#endif
2047	#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2048	} else { / ((m & QRegExpEngine::BackRefBit) != 0) /
2049	int bref = m ^ QRegExpEngine::BackRefBit;
2050	int ell = j * ncap + eng->captureForOfficialCapture.at(i: bref - `1`);
2051
2052	inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2053	if (inside) {
2054	if (eng->cs)
2055	inside = (in[pos + curCapBegin[ell]] == QChar (ch));
2056	else
2057	inside = (in[pos + curCapBegin[ell]].toLower()
2058	== QChar (ch).toLower());
2059	}
2060
2061	if (inside) {
2062	int delta;
2063	if (curCapEnd[ell] == EmptyCapture)
2064	delta = i - curCapBegin[ell];
2065	else
2066	delta = curCapEnd[ell] - curCapBegin[ell];
2067
2068	inside = (delta <= len - (pos + i));
2069	if (inside && delta > `1`) {
2070	int n = `1`;
2071	if (eng->cs) {
2072	while (n < delta) {
2073	if (in[pos + curCapBegin[ell] + n]
2074	!= in[pos + i + n])
2075	break;
2076	++n;
2077	}
2078	} else {
2079	while (n < delta) {
2080	QChar a = in[pos + curCapBegin[ell] + n];
2081	QChar b = in[pos + i + n];
2082	if (a.toLower() != b.toLower())
2083	break;
2084	++n;
2085	}
2086	}
2087	inside = (n == delta);
2088	if (inside)
2089	needSomeSleep = delta - `1`;
2090	}
2091	}
2092	#endif
2093	}
2094	}
2095
2096	/*
2097	We must now update our data structures.
2098	*/
2099	if (inside) {
2100	#ifndef QT_NO_REGEXP_CAPTURE
2101	int capBegin, capEnd;
2102	#endif
2103	/*
2104	If the next state was not encountered yet, all
2105	is fine.
2106	*/
2107	if ((m = inNextStack[next]) == -`1`) {
2108	m = nnext++;
2109	nextStack[m] = next;
2110	inNextStack[next] = m;
2111	#ifndef QT_NO_REGEXP_CAPTURE
2112	capBegin = nextCapBegin + m * ncap;
2113	capEnd = nextCapEnd + m * ncap;
2114
2115	/*
2116	Otherwise, we'll first maintain captures in
2117	temporary arrays, and decide at the end whether
2118	it's best to keep the previous capture zones or
2119	the new ones.
2120	*/
2121	} else {
2122	capBegin = tempCapBegin;
2123	capEnd = tempCapEnd;
2124	#endif
2125	}
2126
2127	#ifndef QT_NO_REGEXP_CAPTURE
2128	/*
2129	Updating the capture zones is much of a task.
2130	*/
2131	if (ncap > `0`) {
2132	memcpy(dest: capBegin, src: curCapBegin + j * ncap, n: ncap * sizeof(int));
2133	memcpy(dest: capEnd, src: curCapEnd + j * ncap, n: ncap * sizeof(int));
2134	int c = scur.atom, n = snext.atom;
2135	int p = -`1`, q = -`1`;
2136	int cap;
2137
2138	/*
2139	Lemma 1. For any x in the range [0..nf), we
2140	have f[x].parent < x.
2141
2142	Proof. By looking at startAtom(), it is
2143	clear that cf < nf holds all the time, and
2144	thus that f[nf].parent < nf.
2145	*/
2146
2147	/*
2148	If we are reentering an atom, we empty all
2149	capture zones inside it.
2150	*/
2151	if ((q = scur.reenter.value(akey: next)) != `0`) {
2152	QBitArray b(eng->nf, false);
2153	b.setBit(i: q, val: true);
2154	for (int ell = q + `1`; ell < eng->nf; ell++) {
2155	if (b.testBit(i: eng->f.at(i: ell).parent)) {
2156	b.setBit(i: ell, val: true);
2157	cap = eng->f.at(i: ell).capture;
2158	if (cap >= `0`) {
2159	capBegin[cap] = EmptyCapture;
2160	capEnd[cap] = EmptyCapture;
2161	}
2162	}
2163	}
2164	p = eng->f.at(i: q).parent;
2165
2166	/*
2167	Otherwise, close the capture zones we are
2168	leaving. We are leaving f[c].capture,
2169	f[f[c].parent].capture,
2170	f[f[f[c].parent].parent].capture, ...,
2171	until f[x].capture, with x such that
2172	f[x].parent is the youngest common ancestor
2173	for c and n.
2174
2175	We go up along c's and n's ancestry until
2176	we find x.
2177	*/
2178	} else {
2179	p = c;
2180	q = n;
2181	while (p != q) {
2182	if (p > q) {
2183	cap = eng->f.at(i: p).capture;
2184	if (cap >= `0`) {
2185	if (capBegin[cap] == i) {
2186	capBegin[cap] = EmptyCapture;
2187	capEnd[cap] = EmptyCapture;
2188	} else {
2189	capEnd[cap] = i;
2190	}
2191	}
2192	p = eng->f.at(i: p).parent;
2193	} else {
2194	q = eng->f.at(i: q).parent;
2195	}
2196	}
2197	}
2198
2199	/*
2200	In any case, we now open the capture zones
2201	we are entering. We work upwards from n
2202	until we reach p (the parent of the atom we
2203	reenter or the youngest common ancestor).
2204	*/
2205	while (n > p) {
2206	cap = eng->f.at(i: n).capture;
2207	if (cap >= `0`) {
2208	capBegin[cap] = i;
2209	capEnd[cap] = EmptyCapture;
2210	}
2211	n = eng->f.at(i: n).parent;
2212	}
2213	/*
2214	If the next state was already in
2215	nextStack, we must choose carefully which
2216	capture zones we want to keep.
2217	*/
2218	if (capBegin == tempCapBegin &&
2219	isBetterCapture(ncap, begin1: capBegin, end1: capEnd, begin2: nextCapBegin + m * ncap,
2220	end2: nextCapEnd + m * ncap)) {
2221	memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int));
2222	memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int));
2223	}
2224	}
2225	#ifndef QT_NO_REGEXP_BACKREF
2226	/*
2227	We are done with updating the capture zones.
2228	It's now time to put the next state to sleep,
2229	if it needs to, and to remove it from
2230	nextStack.
2231	*/
2232	if (needSomeSleep > `0`) {
2233	QVector<int> zzZ(`2` + `2` * ncap);
2234	zzZ [`0`] = i + needSomeSleep;
2235	zzZ [`1`] = next;
2236	if (ncap > `0`) {
2237	memcpy(dest: zzZ.data() + `2`, src: capBegin, n: ncap * sizeof(int));
2238	memcpy(dest: zzZ.data() + `2` + ncap, src: capEnd, n: ncap * sizeof(int));
2239	}
2240	inNextStack[nextStack[--nnext]] = -`1`;
2241	sleeping.append(t: zzZ);
2242	}
2243	#endif
2244	#endif
2245	}
2246	}
2247	}
2248	#ifndef QT_NO_REGEXP_CAPTURE
2249	/*
2250	If we reached the final state, hurray! Copy the captured
2251	zone.
2252	*/
2253	if (ncap > `0` && (m = inNextStack[QRegExpEngine::FinalState]) != -`1`) {
2254	memcpy(dest: capBegin, src: nextCapBegin + m * ncap, n: ncap * sizeof(int));
2255	memcpy(dest: capEnd, src: nextCapEnd + m * ncap, n: ncap * sizeof(int));
2256	}
2257	#ifndef QT_NO_REGEXP_BACKREF
2258	/*
2259	It's time to wake up the sleepers.
2260	*/
2261	j = `0`;
2262	while (j < sleeping.count()) {
2263	if (sleeping.at(i: j)[`0`] == i) {
2264	const QVector<int> &zzZ = sleeping.at(i: j);
2265	int next = zzZ [`1`];
2266	const int *capBegin = zzZ.data() + `2`;
2267	const int *capEnd = zzZ.data() + `2` + ncap;
2268	bool copyOver = true;
2269
2270	if ((m = inNextStack[next]) == -`1`) {
2271	m = nnext++;
2272	nextStack[m] = next;
2273	inNextStack[next] = m;
2274	} else {
2275	copyOver = isBetterCapture(ncap, begin1: nextCapBegin + m * ncap, end1: nextCapEnd + m * ncap,
2276	begin2: capBegin, end2: capEnd);
2277	}
2278	if (copyOver) {
2279	memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int));
2280	memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int));
2281	}
2282
2283	sleeping.removeAt(i: j);
2284	} else {
2285	++j;
2286	}
2287	}
2288	#endif
2289	#endif
// Reset the state -> slot map for the states collected in this round.
2290	for (j = `0`; j < nnext; j++)
2291	inNextStack[nextStack[j]] = -`1`;
2292
2293	// avoid needless iteration that confuses oneTestMatchedLen
2294	if (nnext == `1` && nextStack[`0`] == QRegExpEngine::FinalState
2295	#ifndef QT_NO_REGEXP_BACKREF
2296	&& sleeping.isEmpty()
2297	#endif
2298	)
2299	stop = true;
2300
// The states (and captures) just collected become the active set.
2301	qSwap(value1&: curStack, value2&: nextStack);
2302	#ifndef QT_NO_REGEXP_CAPTURE
2303	qSwap(value1&: curCapBegin, value2&: nextCapBegin);
2304	qSwap(value1&: curCapEnd, value2&: nextCapEnd);
2305	#endif
2306	ncur = nnext;
2307	nnext = `0`;
2308	++i;
2309	}
2310
2311	#ifndef QT_NO_REGEXP_BACKREF
2312	/*
2313	If minimal matching is enabled, we might have some sleepers
2314	left.
2315	*/
2316	if (!sleeping.isEmpty())
2317	sleeping.clear();
2318	#endif
2319
// i was incremented once more after the last processed position.
2320	oneTestMatchedLen = i - `1`;
2321	return (matchLen >= `0`);
2322	}
2323
2324	#ifndef QT_NO_REGEXP_CCLASS
2325
2326	QRegExpCharClass::QRegExpCharClass()
2327	: c(`0`), n(false)
2328	{
2329	#ifndef QT_NO_REGEXP_OPTIM
2330	occ1.fill(from: NoOccurrence, asize: NumBadChars);
2331	#endif
2332	}
2333
2334	void QRegExpCharClass::clear()
2335	{
2336	c = `0`;
2337	r.clear();
2338	n = false;
2339	}
2340
2341	void QRegExpCharClass::setNegative(bool negative)
2342	{
2343	n = negative;
2344	#ifndef QT_NO_REGEXP_OPTIM
2345	occ1.fill(from: `0`, asize: NumBadChars);
2346	#endif
2347	}
2348
2349	void QRegExpCharClass::addCategories(uint cats)
2350	{
2351	static const int all_cats = FLAG(QChar::Mark_NonSpacing) \|
2352	FLAG(QChar::Mark_SpacingCombining) \|
2353	FLAG(QChar::Mark_Enclosing) \|
2354	FLAG(QChar::Number_DecimalDigit) \|
2355	FLAG(QChar::Number_Letter) \|
2356	FLAG(QChar::Number_Other) \|
2357	FLAG(QChar::Separator_Space) \|
2358	FLAG(QChar::Separator_Line) \|
2359	FLAG(QChar::Separator_Paragraph) \|
2360	FLAG(QChar::Other_Control) \|
2361	FLAG(QChar::Other_Format) \|
2362	FLAG(QChar::Other_Surrogate) \|
2363	FLAG(QChar::Other_PrivateUse) \|
2364	FLAG(QChar::Other_NotAssigned) \|
2365	FLAG(QChar::Letter_Uppercase) \|
2366	FLAG(QChar::Letter_Lowercase) \|
2367	FLAG(QChar::Letter_Titlecase) \|
2368	FLAG(QChar::Letter_Modifier) \|
2369	FLAG(QChar::Letter_Other) \|
2370	FLAG(QChar::Punctuation_Connector) \|
2371	FLAG(QChar::Punctuation_Dash) \|
2372	FLAG(QChar::Punctuation_Open) \|
2373	FLAG(QChar::Punctuation_Close) \|
2374	FLAG(QChar::Punctuation_InitialQuote) \|
2375	FLAG(QChar::Punctuation_FinalQuote) \|
2376	FLAG(QChar::Punctuation_Other) \|
2377	FLAG(QChar::Symbol_Math) \|
2378	FLAG(QChar::Symbol_Currency) \|
2379	FLAG(QChar::Symbol_Modifier) \|
2380	FLAG(QChar::Symbol_Other);
2381	c \|= (all_cats & cats);
2382	#ifndef QT_NO_REGEXP_OPTIM
2383	occ1.fill(from: `0`, asize: NumBadChars);
2384	#endif
2385	}
2386
2387	void QRegExpCharClass::addRange(ushort from, ushort to)
2388	{
2389	if (from > to)
2390	qSwap(value1&: from, value2&: to);
2391	int m = r.size();
2392	r.resize(asize: m + `1`);
2393	r [m].from = from;
2394	r [m].len = to - from + `1`;
2395
2396	#ifndef QT_NO_REGEXP_OPTIM
2397	int i;
2398
2399	if (to - from < NumBadChars) {
2400	if (from % NumBadChars <= to % NumBadChars) {
2401	for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2402	occ1 [i] = `0`;
2403	} else {
2404	for (i = `0`; i <= to % NumBadChars; i++)
2405	occ1 [i] = `0`;
2406	for (i = from % NumBadChars; i < NumBadChars; i++)
2407	occ1 [i] = `0`;
2408	}
2409	} else {
2410	occ1.fill(from: `0`, asize: NumBadChars);
2411	}
2412	#endif
2413	}
2414
2415	bool QRegExpCharClass::in(QChar ch) const
2416	{
2417	#ifndef QT_NO_REGEXP_OPTIM
2418	if (occ1.at(BadChar(ch)) == NoOccurrence)
2419	return n;
2420	#endif
2421
2422	if (c != `0` && (c & FLAG(ch.category())) != `0`)
2423	return !n;
2424
2425	const int uc = ch.unicode();
2426	int size = r.size();
2427
2428	for (int i = `0`; i < size; ++i) {
2429	const QRegExpCharClassRange &range = r.at(i);
2430	if (uint(uc - range.from) < uint(r.at(i).len))
2431	return !n;
2432	}
2433	return n;
2434	}
2435
2436	#if defined(QT_DEBUG)
2437	void QRegExpCharClass::dump() const
2438	{
2439	int i;
2440	qDebug(msg: " %stive character class", n ? "nega" : "posi");
2441	#ifndef QT_NO_REGEXP_CCLASS
2442	if (c != `0`)
2443	qDebug(msg: " categories 0x%.8x", c);
2444	#endif
2445	for (i = `0`; i < r.size(); i++)
2446	qDebug(msg: " 0x%.4x through 0x%.4x", r [i].from, r [i].from + r [i].len - `1`);
2447	}
2448	#endif
2449	#endif
2450
2451	QRegExpEngine::Box::Box(QRegExpEngine *engine)
2452	: eng(engine), skipanchors(`0`)
2453	#ifndef QT_NO_REGEXP_OPTIM
2454	, earlyStart(`0`), lateStart(`0`), maxl(`0`)
2455	#endif
2456	{
2457	#ifndef QT_NO_REGEXP_OPTIM
2458	occ1.fill(from: NoOccurrence, asize: NumBadChars);
2459	#endif
2460	minl = `0`;
2461	}
2462
2463	QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2464	{
2465	eng = b.eng;
2466	ls = b.ls;
2467	rs = b.rs;
2468	lanchors = b.lanchors;
2469	ranchors = b.ranchors;
2470	skipanchors = b.skipanchors;
2471	#ifndef QT_NO_REGEXP_OPTIM
2472	earlyStart = b.earlyStart;
2473	lateStart = b.lateStart;
2474	str = b.str;
2475	leftStr = b.leftStr;
2476	rightStr = b.rightStr;
2477	maxl = b.maxl;
2478	occ1 = b.occ1;
2479	#endif
2480	minl = b.minl;
2481	return *this;
2482	}
2483
2484	void QRegExpEngine::Box::set(QChar ch)
2485	{
2486	ls.resize(asize: `1`);
2487	ls [`0`] = eng->createState(ch);
2488	rs = ls;
2489	#ifndef QT_NO_REGEXP_OPTIM
2490	str = ch;
2491	leftStr = ch;
2492	rightStr = ch;
2493	maxl = `1`;
2494	occ1 [BadChar(ch)] = `0`;
2495	#endif
2496	minl = `1`;
2497	}
2498
2499	void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2500	{
2501	ls.resize(asize: `1`);
2502	ls [`0`] = eng->createState(cc);
2503	rs = ls;
2504	#ifndef QT_NO_REGEXP_OPTIM
2505	maxl = `1`;
2506	occ1 = cc.firstOccurrence();
2507	#endif
2508	minl = `1`;
2509	}
2510
2511	#ifndef QT_NO_REGEXP_BACKREF
2512	void QRegExpEngine::Box::set(int bref)
2513	{
2514	ls.resize(asize: `1`);
2515	ls [`0`] = eng->createState(bref);
2516	rs = ls;
2517	if (bref >= `1` && bref <= MaxBackRefs)
2518	skipanchors = Anchor_BackRef0Empty << bref;
2519	#ifndef QT_NO_REGEXP_OPTIM
2520	maxl = InftyLen;
2521	#endif
2522	minl = `0`;
2523	}
2524	#endif
2525
2526	void QRegExpEngine::Box::cat(const Box &b)
2527	{
2528	eng->addCatTransitions(from: rs, to: b.ls);
2529	addAnchorsToEngine(to: b);
2530	if (minl == `0`) {
2531	lanchors.insert(map: b.lanchors);
2532	if (skipanchors != `0`) {
2533	for (int i = `0`; i < b.ls.size(); i++) {
2534	int a = eng->anchorConcatenation(a: lanchors.value(akey: b.ls.at(i), adefaultValue: `0`), b: skipanchors);
2535	lanchors.insert(akey: b.ls.at(i), avalue: a);
2536	}
2537	}
2538	mergeInto(a: &ls, b: b.ls);
2539	}
2540	if (b.minl == `0`) {
2541	ranchors.insert(map: b.ranchors);
2542	if (b.skipanchors != `0`) {
2543	for (int i = `0`; i < rs.size(); i++) {
2544	int a = eng->anchorConcatenation(a: ranchors.value(akey: rs.at(i), adefaultValue: `0`), b: b.skipanchors);
2545	ranchors.insert(akey: rs.at(i), avalue: a);
2546	}
2547	}
2548	mergeInto(a: &rs, b: b.rs);
2549	} else {
2550	ranchors = b.ranchors;
2551	rs = b.rs;
2552	}
2553
2554	#ifndef QT_NO_REGEXP_OPTIM
2555	if (maxl != InftyLen) {
2556	if (rightStr.length() + b.leftStr.length() >
2557	qMax(a: str.length(), b: b.str.length())) {
2558	earlyStart = minl - rightStr.length();
2559	lateStart = maxl - rightStr.length();
2560	str = rightStr + b.leftStr;
2561	} else if (b.str.length() > str.length()) {
2562	earlyStart = minl + b.earlyStart;
2563	lateStart = maxl + b.lateStart;
2564	str = b.str;
2565	}
2566	}
2567
2568	if (leftStr.length() == maxl)
2569	leftStr += b.leftStr;
2570
2571	if (b.rightStr.length() == b.maxl) {
2572	rightStr += b.rightStr;
2573	} else {
2574	rightStr = b.rightStr;
2575	}
2576
2577	if (maxl == InftyLen \|\| b.maxl == InftyLen) {
2578	maxl = InftyLen;
2579	} else {
2580	maxl += b.maxl;
2581	}
2582
2583	for (int i = `0`; i < NumBadChars; i++) {
2584	if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2585	occ1 [i] = minl + b.occ1.at(i);
2586	}
2587	#endif
2588
2589	minl += b.minl;
2590	if (minl == `0`)
2591	skipanchors = eng->anchorConcatenation(a: skipanchors, b: b.skipanchors);
2592	else
2593	skipanchors = `0`;
2594	}
2595
2596	void QRegExpEngine::Box::orx(const Box &b)
2597	{
2598	mergeInto(a: &ls, b: b.ls);
2599	lanchors.insert(map: b.lanchors);
2600	mergeInto(a: &rs, b: b.rs);
2601	ranchors.insert(map: b.ranchors);
2602
2603	if (b.minl == `0`) {
2604	if (minl == `0`)
2605	skipanchors = eng->anchorAlternation(a: skipanchors, b: b.skipanchors);
2606	else
2607	skipanchors = b.skipanchors;
2608	}
2609
2610	#ifndef QT_NO_REGEXP_OPTIM
2611	for (int i = `0`; i < NumBadChars; i++) {
2612	if (occ1.at(i) > b.occ1.at(i))
2613	occ1 [i] = b.occ1.at(i);
2614	}
2615	earlyStart = `0`;
2616	lateStart = `0`;
2617	str = QString ();
2618	leftStr = QString ();
2619	rightStr = QString ();
2620	if (b.maxl > maxl)
2621	maxl = b.maxl;
2622	#endif
2623	if (b.minl < minl)
2624	minl = b.minl;
2625	}
2626
2627	void QRegExpEngine::Box::plus(int atom)
2628	{
2629	#ifndef QT_NO_REGEXP_CAPTURE
2630	eng->addPlusTransitions(from: rs, to: ls, atom);
2631	#else
2632	Q_UNUSED(atom);
2633	eng->addCatTransitions(rs, ls);
2634	#endif
2635	addAnchorsToEngine(to: *this);
2636	#ifndef QT_NO_REGEXP_OPTIM
2637	maxl = InftyLen;
2638	#endif
2639	}
2640
2641	void QRegExpEngine::Box::opt()
2642	{
2643	#ifndef QT_NO_REGEXP_OPTIM
2644	earlyStart = `0`;
2645	lateStart = `0`;
2646	str = QString ();
2647	leftStr = QString ();
2648	rightStr = QString ();
2649	#endif
2650	skipanchors = `0`;
2651	minl = `0`;
2652	}
2653
2654	void QRegExpEngine::Box::catAnchor(int a)
2655	{
2656	if (a != `0`) {
2657	for (int i = `0`; i < rs.size(); i++) {
2658	a = eng->anchorConcatenation(a: ranchors.value(akey: rs.at(i), adefaultValue: `0`), b: a);
2659	ranchors.insert(akey: rs.at(i), avalue: a);
2660	}
2661	if (minl == `0`)
2662	skipanchors = eng->anchorConcatenation(a: skipanchors, b: a);
2663	}
2664	}
2665
2666	#ifndef QT_NO_REGEXP_OPTIM
2667	void QRegExpEngine::Box::setupHeuristics()
2668	{
2669	eng->goodEarlyStart = earlyStart;
2670	eng->goodLateStart = lateStart;
2671	eng->goodStr = eng->cs ? str : str.toLower();
2672
2673	eng->minl = minl;
2674	if (eng->cs) {
2675	/*
2676	A regular expression such as 112\|1 has occ1['2'] = 2 and minl =
2677	1 at this point. An entry of occ1 has to be at most minl or
2678	infinity for the rest of the algorithm to go well.
2679
2680	We waited until here before normalizing these cases (instead of
2681	doing it in Box::orx()) because sometimes things improve by
2682	themselves. Consider for example (112\|1)34.
2683	*/
2684	for (int i = `0`; i < NumBadChars; i++) {
2685	if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2686	occ1 [i] = minl;
2687	}
2688	eng->occ1 = occ1;
2689	} else {
2690	eng->occ1.fill(from: `0`, asize: NumBadChars);
2691	}
2692
2693	eng->heuristicallyChooseHeuristic();
2694	}
2695	#endif
2696
2697	#if defined(QT_DEBUG)
2698	void QRegExpEngine::Box::dump() const
2699	{
2700	int i;
2701	qDebug(msg: "Box of at least %d character%s", minl, minl == `1` ? "" : "s");
2702	qDebug(msg: " Left states:");
2703	for (i = `0`; i < ls.size(); i++) {
2704	if (lanchors.value(akey: ls [i], adefaultValue: `0`) == `0`)
2705	qDebug(msg: " %d", ls [i]);
2706	else
2707	qDebug(msg: " %d [anchors 0x%.8x]", ls [i], lanchors [ls [i]]);
2708	}
2709	qDebug(msg: " Right states:");
2710	for (i = `0`; i < rs.size(); i++) {
2711	if (ranchors.value(akey: rs [i], adefaultValue: `0`) == `0`)
2712	qDebug(msg: " %d", rs [i]);
2713	else
2714	qDebug(msg: " %d [anchors 0x%.8x]", rs [i], ranchors [rs [i]]);
2715	}
2716	qDebug(msg: " Skip anchors: 0x%.8x", skipanchors);
2717	}
2718	#endif
2719
2720	void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2721	{
2722	for (int i = `0`; i < to.ls.size(); i++) {
2723	for (int j = `0`; j < rs.size(); j++) {
2724	int a = eng->anchorConcatenation(a: ranchors.value(akey: rs.at(i: j), adefaultValue: `0`),
2725	b: to.lanchors.value(akey: to.ls.at(i), adefaultValue: `0`));
2726	eng->addAnchors(from: rs [j], to: to.ls [i], a);
2727	}
2728	}
2729	}
2730
2731	#ifndef QT_NO_REGEXP_CCLASS
2732	// fast lookup hash for xml schema extensions
2733	// sorted by name for b-search
2734	static const struct CategoriesRangeMapEntry {
2735	const char name[`40`];
2736	uint first, second;
2737	} categoriesRangeMap[] = {
2738	{ .name: "AegeanNumbers", .first: `0x10100`, .second: `0x1013F` },
2739	{ .name: "AlphabeticPresentationForms", .first: `0xFB00`, .second: `0xFB4F` },
2740	{ .name: "AncientGreekMusicalNotation", .first: `0x1D200`, .second: `0x1D24F` },
2741	{ .name: "AncientGreekNumbers", .first: `0x10140`, .second: `0x1018F` },
2742	{ .name: "Arabic", .first: `0x0600`, .second: `0x06FF` },
2743	{ .name: "ArabicPresentationForms-A", .first: `0xFB50`, .second: `0xFDFF` },
2744	{ .name: "ArabicPresentationForms-B", .first: `0xFE70`, .second: `0xFEFF` },
2745	{ .name: "ArabicSupplement", .first: `0x0750`, .second: `0x077F` },
2746	{ .name: "Armenian", .first: `0x0530`, .second: `0x058F` },
2747	{ .name: "Arrows", .first: `0x2190`, .second: `0x21FF` },
2748	{ .name: "BasicLatin", .first: `0x0000`, .second: `0x007F` },
2749	{ .name: "Bengali", .first: `0x0980`, .second: `0x09FF` },
2750	{ .name: "BlockElements", .first: `0x2580`, .second: `0x259F` },
2751	{ .name: "Bopomofo", .first: `0x3100`, .second: `0x312F` },
2752	{ .name: "BopomofoExtended", .first: `0x31A0`, .second: `0x31BF` },
2753	{ .name: "BoxDrawing", .first: `0x2500`, .second: `0x257F` },
2754	{ .name: "BraillePatterns", .first: `0x2800`, .second: `0x28FF` },
2755	{ .name: "Buginese", .first: `0x1A00`, .second: `0x1A1F` },
2756	{ .name: "Buhid", .first: `0x1740`, .second: `0x175F` },
2757	{ .name: "ByzantineMusicalSymbols", .first: `0x1D000`, .second: `0x1D0FF` },
2758	{ .name: "CJKCompatibility", .first: `0x3300`, .second: `0x33FF` },
2759	{ .name: "CJKCompatibilityForms", .first: `0xFE30`, .second: `0xFE4F` },
2760	{ .name: "CJKCompatibilityIdeographs", .first: `0xF900`, .second: `0xFAFF` },
2761	{ .name: "CJKCompatibilityIdeographsSupplement", .first: `0x2F800`, .second: `0x2FA1F` },
2762	{ .name: "CJKRadicalsSupplement", .first: `0x2E80`, .second: `0x2EFF` },
2763	{ .name: "CJKStrokes", .first: `0x31C0`, .second: `0x31EF` },
2764	{ .name: "CJKSymbolsandPunctuation", .first: `0x3000`, .second: `0x303F` },
2765	{ .name: "CJKUnifiedIdeographs", .first: `0x4E00`, .second: `0x9FFF` },
2766	{ .name: "CJKUnifiedIdeographsExtensionA", .first: `0x3400`, .second: `0x4DB5` },
2767	{ .name: "CJKUnifiedIdeographsExtensionB", .first: `0x20000`, .second: `0x2A6DF` },
2768	{ .name: "Cherokee", .first: `0x13A0`, .second: `0x13FF` },
2769	{ .name: "CombiningDiacriticalMarks", .first: `0x0300`, .second: `0x036F` },
2770	{ .name: "CombiningDiacriticalMarksSupplement", .first: `0x1DC0`, .second: `0x1DFF` },
2771	{ .name: "CombiningHalfMarks", .first: `0xFE20`, .second: `0xFE2F` },
2772	{ .name: "CombiningMarksforSymbols", .first: `0x20D0`, .second: `0x20FF` },
2773	{ .name: "ControlPictures", .first: `0x2400`, .second: `0x243F` },
2774	{ .name: "Coptic", .first: `0x2C80`, .second: `0x2CFF` },
2775	{ .name: "CurrencySymbols", .first: `0x20A0`, .second: `0x20CF` },
2776	{ .name: "CypriotSyllabary", .first: `0x10800`, .second: `0x1083F` },
2777	{ .name: "Cyrillic", .first: `0x0400`, .second: `0x04FF` },
2778	{ .name: "CyrillicSupplement", .first: `0x0500`, .second: `0x052F` },
2779	{ .name: "Deseret", .first: `0x10400`, .second: `0x1044F` },
2780	{ .name: "Devanagari", .first: `0x0900`, .second: `0x097F` },
2781	{ .name: "Dingbats", .first: `0x2700`, .second: `0x27BF` },
2782	{ .name: "EnclosedAlphanumerics", .first: `0x2460`, .second: `0x24FF` },
2783	{ .name: "EnclosedCJKLettersandMonths", .first: `0x3200`, .second: `0x32FF` },
2784	{ .name: "Ethiopic", .first: `0x1200`, .second: `0x137F` },
2785	{ .name: "EthiopicExtended", .first: `0x2D80`, .second: `0x2DDF` },
2786	{ .name: "EthiopicSupplement", .first: `0x1380`, .second: `0x139F` },
2787	{ .name: "GeneralPunctuation", .first: `0x2000`, .second: `0x206F` },
2788	{ .name: "GeometricShapes", .first: `0x25A0`, .second: `0x25FF` },
2789	{ .name: "Georgian", .first: `0x10A0`, .second: `0x10FF` },
2790	{ .name: "GeorgianSupplement", .first: `0x2D00`, .second: `0x2D2F` },
2791	{ .name: "Glagolitic", .first: `0x2C00`, .second: `0x2C5F` },
2792	{ .name: "Gothic", .first: `0x10330`, .second: `0x1034F` },
2793	{ .name: "Greek", .first: `0x0370`, .second: `0x03FF` },
2794	{ .name: "GreekExtended", .first: `0x1F00`, .second: `0x1FFF` },
2795	{ .name: "Gujarati", .first: `0x0A80`, .second: `0x0AFF` },
2796	{ .name: "Gurmukhi", .first: `0x0A00`, .second: `0x0A7F` },
2797	{ .name: "HalfwidthandFullwidthForms", .first: `0xFF00`, .second: `0xFFEF` },
2798	{ .name: "HangulCompatibilityJamo", .first: `0x3130`, .second: `0x318F` },
2799	{ .name: "HangulJamo", .first: `0x1100`, .second: `0x11FF` },
2800	{ .name: "HangulSyllables", .first: `0xAC00`, .second: `0xD7A3` },
2801	{ .name: "Hanunoo", .first: `0x1720`, .second: `0x173F` },
2802	{ .name: "Hebrew", .first: `0x0590`, .second: `0x05FF` },
2803	{ .name: "Hiragana", .first: `0x3040`, .second: `0x309F` },
2804	{ .name: "IPAExtensions", .first: `0x0250`, .second: `0x02AF` },
2805	{ .name: "IdeographicDescriptionCharacters", .first: `0x2FF0`, .second: `0x2FFF` },
2806	{ .name: "Kanbun", .first: `0x3190`, .second: `0x319F` },
2807	{ .name: "KangxiRadicals", .first: `0x2F00`, .second: `0x2FDF` },
2808	{ .name: "Kannada", .first: `0x0C80`, .second: `0x0CFF` },
2809	{ .name: "Katakana", .first: `0x30A0`, .second: `0x30FF` },
2810	{ .name: "KatakanaPhoneticExtensions", .first: `0x31F0`, .second: `0x31FF` },
2811	{ .name: "Kharoshthi", .first: `0x10A00`, .second: `0x10A5F` },
2812	{ .name: "Khmer", .first: `0x1780`, .second: `0x17FF` },
2813	{ .name: "KhmerSymbols", .first: `0x19E0`, .second: `0x19FF` },
2814	{ .name: "Lao", .first: `0x0E80`, .second: `0x0EFF` },
2815	{ .name: "Latin-1Supplement", .first: `0x0080`, .second: `0x00FF` },
2816	{ .name: "LatinExtended-A", .first: `0x0100`, .second: `0x017F` },
2817	{ .name: "LatinExtended-B", .first: `0x0180`, .second: `0x024F` },
2818	{ .name: "LatinExtendedAdditional", .first: `0x1E00`, .second: `0x1EFF` },
2819	{ .name: "LetterlikeSymbols", .first: `0x2100`, .second: `0x214F` },
2820	{ .name: "Limbu", .first: `0x1900`, .second: `0x194F` },
2821	{ .name: "LinearBIdeograms", .first: `0x10080`, .second: `0x100FF` },
2822	{ .name: "LinearBSyllabary", .first: `0x10000`, .second: `0x1007F` },
2823	{ .name: "Malayalam", .first: `0x0D00`, .second: `0x0D7F` },
2824	{ .name: "MathematicalAlphanumericSymbols", .first: `0x1D400`, .second: `0x1D7FF` },
2825	{ .name: "MathematicalOperators", .first: `0x2200`, .second: `0x22FF` },
2826	{ .name: "MiscellaneousMathematicalSymbols-A", .first: `0x27C0`, .second: `0x27EF` },
2827	{ .name: "MiscellaneousMathematicalSymbols-B", .first: `0x2980`, .second: `0x29FF` },
2828	{ .name: "MiscellaneousSymbols", .first: `0x2600`, .second: `0x26FF` },
2829	{ .name: "MiscellaneousSymbolsandArrows", .first: `0x2B00`, .second: `0x2BFF` },
2830	{ .name: "MiscellaneousTechnical", .first: `0x2300`, .second: `0x23FF` },
2831	{ .name: "ModifierToneLetters", .first: `0xA700`, .second: `0xA71F` },
2832	{ .name: "Mongolian", .first: `0x1800`, .second: `0x18AF` },
2833	{ .name: "MusicalSymbols", .first: `0x1D100`, .second: `0x1D1FF` },
2834	{ .name: "Myanmar", .first: `0x1000`, .second: `0x109F` },
2835	{ .name: "NewTaiLue", .first: `0x1980`, .second: `0x19DF` },
2836	{ .name: "NumberForms", .first: `0x2150`, .second: `0x218F` },
2837	{ .name: "Ogham", .first: `0x1680`, .second: `0x169F` },
2838	{ .name: "OldItalic", .first: `0x10300`, .second: `0x1032F` },
2839	{ .name: "OldPersian", .first: `0x103A0`, .second: `0x103DF` },
2840	{ .name: "OpticalCharacterRecognition", .first: `0x2440`, .second: `0x245F` },
2841	{ .name: "Oriya", .first: `0x0B00`, .second: `0x0B7F` },
2842	{ .name: "Osmanya", .first: `0x10480`, .second: `0x104AF` },
2843	{ .name: "PhoneticExtensions", .first: `0x1D00`, .second: `0x1D7F` },
2844	{ .name: "PhoneticExtensionsSupplement", .first: `0x1D80`, .second: `0x1DBF` },
2845	{ .name: "PrivateUse", .first: `0xE000`, .second: `0xF8FF` },
2846	{ .name: "Runic", .first: `0x16A0`, .second: `0x16FF` },
2847	{ .name: "Shavian", .first: `0x10450`, .second: `0x1047F` },
2848	{ .name: "Sinhala", .first: `0x0D80`, .second: `0x0DFF` },
2849	{ .name: "SmallFormVariants", .first: `0xFE50`, .second: `0xFE6F` },
2850	{ .name: "SpacingModifierLetters", .first: `0x02B0`, .second: `0x02FF` },
2851	{ .name: "Specials", .first: `0xFFF0`, .second: `0xFFFF` },
2852	{ .name: "SuperscriptsandSubscripts", .first: `0x2070`, .second: `0x209F` },
2853	{ .name: "SupplementalArrows-A", .first: `0x27F0`, .second: `0x27FF` },
2854	{ .name: "SupplementalArrows-B", .first: `0x2900`, .second: `0x297F` },
2855	{ .name: "SupplementalMathematicalOperators", .first: `0x2A00`, .second: `0x2AFF` },
2856	{ .name: "SupplementalPunctuation", .first: `0x2E00`, .second: `0x2E7F` },
2857	{ .name: "SupplementaryPrivateUseArea-A", .first: `0xF0000`, .second: `0xFFFFF` },
2858	{ .name: "SupplementaryPrivateUseArea-B", .first: `0x100000`, .second: `0x10FFFF` },
2859	{ .name: "SylotiNagri", .first: `0xA800`, .second: `0xA82F` },
2860	{ .name: "Syriac", .first: `0x0700`, .second: `0x074F` },
2861	{ .name: "Tagalog", .first: `0x1700`, .second: `0x171F` },
2862	{ .name: "Tagbanwa", .first: `0x1760`, .second: `0x177F` },
2863	{ .name: "Tags", .first: `0xE0000`, .second: `0xE007F` },
2864	{ .name: "TaiLe", .first: `0x1950`, .second: `0x197F` },
2865	{ .name: "TaiXuanJingSymbols", .first: `0x1D300`, .second: `0x1D35F` },
2866	{ .name: "Tamil", .first: `0x0B80`, .second: `0x0BFF` },
2867	{ .name: "Telugu", .first: `0x0C00`, .second: `0x0C7F` },
2868	{ .name: "Thaana", .first: `0x0780`, .second: `0x07BF` },
2869	{ .name: "Thai", .first: `0x0E00`, .second: `0x0E7F` },
2870	{ .name: "Tibetan", .first: `0x0F00`, .second: `0x0FFF` },
2871	{ .name: "Tifinagh", .first: `0x2D30`, .second: `0x2D7F` },
2872	{ .name: "Ugaritic", .first: `0x10380`, .second: `0x1039F` },
2873	{ .name: "UnifiedCanadianAboriginalSyllabics", .first: `0x1400`, .second: `0x167F` },
2874	{ .name: "VariationSelectors", .first: `0xFE00`, .second: `0xFE0F` },
2875	{ .name: "VariationSelectorsSupplement", .first: `0xE0100`, .second: `0xE01EF` },
2876	{ .name: "VerticalForms", .first: `0xFE10`, .second: `0xFE1F` },
2877	{ .name: "YiRadicals", .first: `0xA490`, .second: `0xA4CF` },
2878	{ .name: "YiSyllables", .first: `0xA000`, .second: `0xA48F` },
2879	{ .name: "YijingHexagramSymbols", .first: `0x4DC0`, .second: `0x4DFF` }
2880	};
2881
2882	inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2)
2883	{ return qstrcmp(str1: entry1.name, str2: entry2.name) < `0`; }
2884	inline bool operator<(const char name, const* CategoriesRangeMapEntry &entry)
2885	{ return qstrcmp(str1: name, str2: entry.name) < `0`; }
2886	inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name)
2887	{ return qstrcmp(str1: entry.name, str2: name) < `0`; }
2888	#endif // QT_NO_REGEXP_CCLASS
2889
2890	int QRegExpEngine::getChar()
2891	{
2892	return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2893	}
2894
2895	int QRegExpEngine::getEscape()
2896	{
2897	#ifndef QT_NO_REGEXP_ESCAPE
2898	const char tab[] = "afnrtv"; // no b, as \b means word boundary
2899	const char backTab[] = "\a\f\n\r\t\v";
2900	ushort low;
2901	int i;
2902	#endif
2903	ushort val;
2904	int prevCh = yyCh;
2905
2906	if (prevCh == EOS) {
2907	error(RXERR_END);
2908	return Tok_Char \| `'\\'`;
2909	}
2910	yyCh = getChar();
2911	#ifndef QT_NO_REGEXP_ESCAPE
2912	if ((prevCh & ~`0xff`) == `0`) {
2913	const char *p = strchr(s: tab, c: prevCh);
2914	if (p != nullptr)
2915	return Tok_Char \| backTab[p - tab];
2916	}
2917	#endif
2918
2919	switch (prevCh) {
2920	#ifndef QT_NO_REGEXP_ESCAPE
2921	case `'0'`:
2922	val = `0`;
2923	for (i = `0`; i < `3`; i++) {
2924	if (yyCh >= `'0'` && yyCh <= `'7'`)
2925	val = (val << `3`) \| (yyCh - `'0'`);
2926	else
2927	break;
2928	yyCh = getChar();
2929	}
2930	if ((val & ~`0377`) != `0`)
2931	error(RXERR_OCTAL);
2932	return Tok_Char \| val;
2933	#endif
2934	#ifndef QT_NO_REGEXP_ESCAPE
2935	case `'B'`:
2936	return Tok_NonWord;
2937	#endif
2938	#ifndef QT_NO_REGEXP_CCLASS
2939	case `'D'`:
2940	// see QChar::isDigit()
2941	yyCharClass ->addCategories(cats: uint(-`1`) ^ FLAG(QChar::Number_DecimalDigit));
2942	return Tok_CharClass;
2943	case `'S'`:
2944	// see QChar::isSpace()
2945	yyCharClass ->addCategories(cats: uint(-`1`) ^ (FLAG(QChar::Separator_Space) \|
2946	FLAG(QChar::Separator_Line) \|
2947	FLAG(QChar::Separator_Paragraph) \|
2948	FLAG(QChar::Other_Control)));
2949	yyCharClass ->addRange(from: `0x0000`, to: `0x0008`);
2950	yyCharClass ->addRange(from: `0x000e`, to: `0x001f`);
2951	yyCharClass ->addRange(from: `0x007f`, to: `0x0084`);
2952	yyCharClass ->addRange(from: `0x0086`, to: `0x009f`);
2953	return Tok_CharClass;
2954	case `'W'`:
2955	// see QChar::isLetterOrNumber() and QChar::isMark()
2956	yyCharClass ->addCategories(cats: uint(-`1`) ^ (FLAG(QChar::Mark_NonSpacing) \|
2957	FLAG(QChar::Mark_SpacingCombining) \|
2958	FLAG(QChar::Mark_Enclosing) \|
2959	FLAG(QChar::Number_DecimalDigit) \|
2960	FLAG(QChar::Number_Letter) \|
2961	FLAG(QChar::Number_Other) \|
2962	FLAG(QChar::Letter_Uppercase) \|
2963	FLAG(QChar::Letter_Lowercase) \|
2964	FLAG(QChar::Letter_Titlecase) \|
2965	FLAG(QChar::Letter_Modifier) \|
2966	FLAG(QChar::Letter_Other) \|
2967	FLAG(QChar::Punctuation_Connector)));
2968	yyCharClass ->addRange(from: `0x203f`, to: `0x2040`);
2969	yyCharClass ->addSingleton(ch: `0x2040`);
2970	yyCharClass ->addSingleton(ch: `0x2054`);
2971	yyCharClass ->addSingleton(ch: `0x30fb`);
2972	yyCharClass ->addRange(from: `0xfe33`, to: `0xfe34`);
2973	yyCharClass ->addRange(from: `0xfe4d`, to: `0xfe4f`);
2974	yyCharClass ->addSingleton(ch: `0xff3f`);
2975	yyCharClass ->addSingleton(ch: `0xff65`);
2976	return Tok_CharClass;
2977	#endif
2978	#ifndef QT_NO_REGEXP_ESCAPE
2979	case `'b'`:
2980	return Tok_Word;
2981	#endif
2982	#ifndef QT_NO_REGEXP_CCLASS
2983	case `'d'`:
2984	// see QChar::isDigit()
2985	yyCharClass ->addCategories(FLAG(QChar::Number_DecimalDigit));
2986	return Tok_CharClass;
2987	case `'s'`:
2988	// see QChar::isSpace()
2989	yyCharClass ->addCategories(FLAG(QChar::Separator_Space) \|
2990	FLAG(QChar::Separator_Line) \|
2991	FLAG(QChar::Separator_Paragraph));
2992	yyCharClass ->addRange(from: `0x0009`, to: `0x000d`);
2993	yyCharClass ->addSingleton(ch: `0x0085`);
2994	return Tok_CharClass;
2995	case `'w'`:
2996	// see QChar::isLetterOrNumber() and QChar::isMark()
2997	yyCharClass ->addCategories(FLAG(QChar::Mark_NonSpacing) \|
2998	FLAG(QChar::Mark_SpacingCombining) \|
2999	FLAG(QChar::Mark_Enclosing) \|
3000	FLAG(QChar::Number_DecimalDigit) \|
3001	FLAG(QChar::Number_Letter) \|
3002	FLAG(QChar::Number_Other) \|
3003	FLAG(QChar::Letter_Uppercase) \|
3004	FLAG(QChar::Letter_Lowercase) \|
3005	FLAG(QChar::Letter_Titlecase) \|
3006	FLAG(QChar::Letter_Modifier) \|
3007	FLAG(QChar::Letter_Other));
3008	yyCharClass ->addSingleton(ch: `0x005f`); // '_'
3009	return Tok_CharClass;
3010	case `'I'`:
3011	if (!xmlSchemaExtensions)
3012	break;
3013	yyCharClass ->setNegative(!yyCharClass ->negative());
3014	Q_FALLTHROUGH();
3015	case `'i'`:
3016	if (xmlSchemaExtensions) {
3017	yyCharClass ->addCategories(FLAG(QChar::Mark_NonSpacing) \|
3018	FLAG(QChar::Mark_SpacingCombining) \|
3019	FLAG(QChar::Mark_Enclosing) \|
3020	FLAG(QChar::Number_DecimalDigit) \|
3021	FLAG(QChar::Number_Letter) \|
3022	FLAG(QChar::Number_Other) \|
3023	FLAG(QChar::Letter_Uppercase) \|
3024	FLAG(QChar::Letter_Lowercase) \|
3025	FLAG(QChar::Letter_Titlecase) \|
3026	FLAG(QChar::Letter_Modifier) \|
3027	FLAG(QChar::Letter_Other));
3028	yyCharClass ->addSingleton(ch: `0x003a`); // ':'
3029	yyCharClass ->addSingleton(ch: `0x005f`); // '_'
3030	yyCharClass ->addRange(from: `0x0041`, to: `0x005a`); // [A-Z]
3031	yyCharClass ->addRange(from: `0x0061`, to: `0x007a`); // [a-z]
3032	yyCharClass ->addRange(from: `0xc0`, to: `0xd6`);
3033	yyCharClass ->addRange(from: `0xd8`, to: `0xf6`);
3034	yyCharClass ->addRange(from: `0xf8`, to: `0x2ff`);
3035	yyCharClass ->addRange(from: `0x370`, to: `0x37d`);
3036	yyCharClass ->addRange(from: `0x37f`, to: `0x1fff`);
3037	yyCharClass ->addRange(from: `0x200c`, to: `0x200d`);
3038	yyCharClass ->addRange(from: `0x2070`, to: `0x218f`);
3039	yyCharClass ->addRange(from: `0x2c00`, to: `0x2fef`);
3040	yyCharClass ->addRange(from: `0x3001`, to: `0xd7ff`);
3041	yyCharClass ->addRange(from: `0xf900`, to: `0xfdcf`);
3042	yyCharClass ->addRange(from: `0xfdf0`, to: `0xfffd`);
3043	yyCharClass ->addRange(from: (ushort)`0x10000`, to: (ushort)`0xeffff`);
3044	return Tok_CharClass;
3045	} else {
3046	break;
3047	}
3048	case `'C'`:
3049	if (!xmlSchemaExtensions)
3050	break;
3051	yyCharClass ->setNegative(!yyCharClass ->negative());
3052	Q_FALLTHROUGH();
3053	case `'c'`:
3054	if (xmlSchemaExtensions) {
3055	yyCharClass ->addCategories(FLAG(QChar::Mark_NonSpacing) \|
3056	FLAG(QChar::Mark_SpacingCombining) \|
3057	FLAG(QChar::Mark_Enclosing) \|
3058	FLAG(QChar::Number_DecimalDigit) \|
3059	FLAG(QChar::Number_Letter) \|
3060	FLAG(QChar::Number_Other) \|
3061	FLAG(QChar::Letter_Uppercase) \|
3062	FLAG(QChar::Letter_Lowercase) \|
3063	FLAG(QChar::Letter_Titlecase) \|
3064	FLAG(QChar::Letter_Modifier) \|
3065	FLAG(QChar::Letter_Other));
3066	yyCharClass ->addSingleton(ch: `0x002d`); // '-'
3067	yyCharClass ->addSingleton(ch: `0x002e`); // '.'
3068	yyCharClass ->addSingleton(ch: `0x003a`); // ':'
3069	yyCharClass ->addSingleton(ch: `0x005f`); // '_'
3070	yyCharClass ->addSingleton(ch: `0xb7`);
3071	yyCharClass ->addRange(from: `0x0030`, to: `0x0039`); // [0-9]
3072	yyCharClass ->addRange(from: `0x0041`, to: `0x005a`); // [A-Z]
3073	yyCharClass ->addRange(from: `0x0061`, to: `0x007a`); // [a-z]
3074	yyCharClass ->addRange(from: `0xc0`, to: `0xd6`);
3075	yyCharClass ->addRange(from: `0xd8`, to: `0xf6`);
3076	yyCharClass ->addRange(from: `0xf8`, to: `0x2ff`);
3077	yyCharClass ->addRange(from: `0x370`, to: `0x37d`);
3078	yyCharClass ->addRange(from: `0x37f`, to: `0x1fff`);
3079	yyCharClass ->addRange(from: `0x200c`, to: `0x200d`);
3080	yyCharClass ->addRange(from: `0x2070`, to: `0x218f`);
3081	yyCharClass ->addRange(from: `0x2c00`, to: `0x2fef`);
3082	yyCharClass ->addRange(from: `0x3001`, to: `0xd7ff`);
3083	yyCharClass ->addRange(from: `0xf900`, to: `0xfdcf`);
3084	yyCharClass ->addRange(from: `0xfdf0`, to: `0xfffd`);
3085	yyCharClass ->addRange(from: (ushort)`0x10000`, to: (ushort)`0xeffff`);
3086	yyCharClass ->addRange(from: `0x0300`, to: `0x036f`);
3087	yyCharClass ->addRange(from: `0x203f`, to: `0x2040`);
3088	return Tok_CharClass;
3089	} else {
3090	break;
3091	}
3092	case `'P'`:
3093	if (!xmlSchemaExtensions)
3094	break;
3095	yyCharClass ->setNegative(!yyCharClass ->negative());
3096	Q_FALLTHROUGH();
3097	case `'p'`:
3098	if (xmlSchemaExtensions) {
3099	if (yyCh != `'{'`) {
3100	error(RXERR_CHARCLASS);
3101	return Tok_CharClass;
3102	}
3103
3104	QByteArray category;
3105	yyCh = getChar();
3106	while (yyCh != `'}'`) {
3107	if (yyCh == EOS) {
3108	error(RXERR_END);
3109	return Tok_CharClass;
3110	}
3111	category.append(c: yyCh);
3112	yyCh = getChar();
3113	}
3114	yyCh = getChar(); // skip closing '}'
3115
3116	int catlen = category.length();
3117	if (catlen == `1` \|\| catlen == `2`) {
3118	switch (category.at(i: `0`)) {
3119	case `'M'`:
3120	if (catlen == `1`) {
3121	yyCharClass ->addCategories(FLAG(QChar::Mark_NonSpacing) \|
3122	FLAG(QChar::Mark_SpacingCombining) \|
3123	FLAG(QChar::Mark_Enclosing));
3124	} else {
3125	switch (category.at(i: `1`)) {
3126	case `'n'`: yyCharClass ->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn
3127	case `'c'`: yyCharClass ->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc
3128	case `'e'`: yyCharClass ->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me
3129	default: error(RXERR_CATEGORY); break;
3130	}
3131	}
3132	break;
3133	case `'N'`:
3134	if (catlen == `1`) {
3135	yyCharClass ->addCategories(FLAG(QChar::Number_DecimalDigit) \|
3136	FLAG(QChar::Number_Letter) \|
3137	FLAG(QChar::Number_Other));
3138	} else {
3139	switch (category.at(i: `1`)) {
3140	case `'d'`: yyCharClass ->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd
3141	case `'l'`: yyCharClass ->addCategories(FLAG(QChar::Number_Letter)); break; // Hl
3142	case `'o'`: yyCharClass ->addCategories(FLAG(QChar::Number_Other)); break; // No
3143	default: error(RXERR_CATEGORY); break;
3144	}
3145	}
3146	break;
3147	case `'Z'`:
3148	if (catlen == `1`) {
3149	yyCharClass ->addCategories(FLAG(QChar::Separator_Space) \|
3150	FLAG(QChar::Separator_Line) \|
3151	FLAG(QChar::Separator_Paragraph));
3152	} else {
3153	switch (category.at(i: `1`)) {
3154	case `'s'`: yyCharClass ->addCategories(FLAG(QChar::Separator_Space)); break; // Zs
3155	case `'l'`: yyCharClass ->addCategories(FLAG(QChar::Separator_Line)); break; // Zl
3156	case `'p'`: yyCharClass ->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp
3157	default: error(RXERR_CATEGORY); break;
3158	}
3159	}
3160	break;
3161	case `'C'`:
3162	if (catlen == `1`) {
3163	yyCharClass ->addCategories(FLAG(QChar::Other_Control) \|
3164	FLAG(QChar::Other_Format) \|
3165	FLAG(QChar::Other_Surrogate) \|
3166	FLAG(QChar::Other_PrivateUse) \|
3167	FLAG(QChar::Other_NotAssigned));
3168	} else {
3169	switch (category.at(i: `1`)) {
3170	case `'c'`: yyCharClass ->addCategories(FLAG(QChar::Other_Control)); break; // Cc
3171	case `'f'`: yyCharClass ->addCategories(FLAG(QChar::Other_Format)); break; // Cf
3172	case `'s'`: yyCharClass ->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs
3173	case `'o'`: yyCharClass ->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co
3174	case `'n'`: yyCharClass ->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn
3175	default: error(RXERR_CATEGORY); break;
3176	}
3177	}
3178	break;
3179	case `'L'`:
3180	if (catlen == `1`) {
3181	yyCharClass ->addCategories(FLAG(QChar::Letter_Uppercase) \|
3182	FLAG(QChar::Letter_Lowercase) \|
3183	FLAG(QChar::Letter_Titlecase) \|
3184	FLAG(QChar::Letter_Modifier) \|
3185	FLAG(QChar::Letter_Other));
3186	} else {
3187	switch (category.at(i: `1`)) {
3188	case `'u'`: yyCharClass ->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu
3189	case `'l'`: yyCharClass ->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll
3190	case `'t'`: yyCharClass ->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt
3191	case `'m'`: yyCharClass ->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm
3192	case `'o'`: yyCharClass ->addCategories(FLAG(QChar::Letter_Other)); break; // Lo
3193	default: error(RXERR_CATEGORY); break;
3194	}
3195	}
3196	break;
3197	case `'P'`:
3198	if (catlen == `1`) {
3199	yyCharClass ->addCategories(FLAG(QChar::Punctuation_Connector) \|
3200	FLAG(QChar::Punctuation_Dash) \|
3201	FLAG(QChar::Punctuation_Open) \|
3202	FLAG(QChar::Punctuation_Close) \|
3203	FLAG(QChar::Punctuation_InitialQuote) \|
3204	FLAG(QChar::Punctuation_FinalQuote) \|
3205	FLAG(QChar::Punctuation_Other));
3206	} else {
3207	switch (category.at(i: `1`)) {
3208	case `'c'`: yyCharClass ->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc
3209	case `'d'`: yyCharClass ->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd
3210	case `'s'`: yyCharClass ->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps
3211	case `'e'`: yyCharClass ->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe
3212	case `'i'`: yyCharClass ->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi
3213	case `'f'`: yyCharClass ->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf
3214	case `'o'`: yyCharClass ->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po
3215	default: error(RXERR_CATEGORY); break;
3216	}
3217	}
3218	break;
3219	case `'S'`:
3220	if (catlen == `1`) {
3221	yyCharClass ->addCategories(FLAG(QChar::Symbol_Math) \|
3222	FLAG(QChar::Symbol_Currency) \|
3223	FLAG(QChar::Symbol_Modifier) \|
3224	FLAG(QChar::Symbol_Other));
3225	} else {
3226	switch (category.at(i: `1`)) {
3227	case `'m'`: yyCharClass ->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm
3228	case `'c'`: yyCharClass ->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc
3229	case `'k'`: yyCharClass ->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk
3230	case `'o'`: yyCharClass ->addCategories(FLAG(QChar::Symbol_Other)); break; // So
3231	default: error(RXERR_CATEGORY); break;
3232	}
3233	}
3234	break;
3235	default:
3236	error(RXERR_CATEGORY);
3237	break;
3238	}
3239	} else if (catlen > `2` && category.at(i: `0`) == `'I'` && category.at(i: `1`) == `'s'`) {
3240	static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[`0`]);
3241	const char * const categoryFamily = category.constData() + `2`;
3242	const CategoriesRangeMapEntry *r = std::lower_bound(first: categoriesRangeMap, last: categoriesRangeMap + N, val: categoryFamily);
3243	if (r != categoriesRangeMap + N && qstrcmp(str1: r->name, str2: categoryFamily) == `0`)
3244	yyCharClass ->addRange(from: r->first, to: r->second);
3245	else
3246	error(RXERR_CATEGORY);
3247	} else {
3248	error(RXERR_CATEGORY);
3249	}
3250	return Tok_CharClass;
3251	} else {
3252	break;
3253	}
3254	#endif
3255	#ifndef QT_NO_REGEXP_ESCAPE
3256	case `'x'`:
3257	val = `0`;
3258	for (i = `0`; i < `4`; i++) {
3259	low = QChar (yyCh).toLower().unicode();
3260	if (low >= `'0'` && low <= `'9'`)
3261	val = (val << `4`) \| (low - `'0'`);
3262	else if (low >= `'a'` && low <= `'f'`)
3263	val = (val << `4`) \| (low - `'a'` + `10`);
3264	else
3265	break;
3266	yyCh = getChar();
3267	}
3268	return Tok_Char \| val;
3269	#endif
3270	default:
3271	break;
3272	}
3273	if (prevCh >= `'1'` && prevCh <= `'9'`) {
3274	#ifndef QT_NO_REGEXP_BACKREF
3275	val = prevCh - `'0'`;
3276	while (yyCh >= `'0'` && yyCh <= `'9'`) {
3277	val = (val * `10`) + (yyCh - `'0'`);
3278	yyCh = getChar();
3279	}
3280	return Tok_BackRef \| val;
3281	#else
3282	error(RXERR_DISABLED);
3283	#endif
3284	}
3285	return Tok_Char \| prevCh;
3286	}
3287
3288	#ifndef QT_NO_REGEXP_INTERVAL
3289	int QRegExpEngine::getRep(int def)
3290	{
3291	if (yyCh >= `'0'` && yyCh <= `'9'`) {
3292	int rep = `0`;
3293	do {
3294	rep = `10` * rep + yyCh - `'0'`;
3295	if (rep >= InftyRep) {
3296	error(RXERR_REPETITION);
3297	rep = def;
3298	}
3299	yyCh = getChar();
3300	} while (yyCh >= `'0'` && yyCh <= `'9'`);
3301	return rep;
3302	} else {
3303	return def;
3304	}
3305	}
3306	#endif
3307
3308	#ifndef QT_NO_REGEXP_LOOKAHEAD
3309	void QRegExpEngine::skipChars(int n)
3310	{
3311	if (n > `0`) {
3312	yyPos += n - `1`;
3313	yyCh = getChar();
3314	}
3315	}
3316	#endif
3317
3318	void QRegExpEngine::error(const char *msg)
3319	{
3320	if (yyError.isEmpty())
3321	yyError = QLatin1String (msg);
3322	}
3323
3324	void QRegExpEngine::startTokenizer(const QChar rx, int* len)
3325	{
3326	yyIn = rx;
3327	yyPos0 = `0`;
3328	yyPos = `0`;
3329	yyLen = len;
3330	yyCh = getChar();
3331	yyCharClass.reset(other: new QRegExpCharClass);
3332	yyMinRep = `0`;
3333	yyMaxRep = `0`;
3334	yyError = QString ();
3335	}
3336
3337	int QRegExpEngine::getToken()
3338	{
3339	#ifndef QT_NO_REGEXP_CCLASS
3340	ushort pendingCh = `0`;
3341	bool charPending;
3342	bool rangePending;
3343	int tok;
3344	#endif
3345	int prevCh = yyCh;
3346
3347	yyPos0 = yyPos - `1`;
3348	#ifndef QT_NO_REGEXP_CCLASS
3349	yyCharClass ->clear();
3350	#endif
3351	yyMinRep = `0`;
3352	yyMaxRep = `0`;
3353	yyCh = getChar();
3354
3355	switch (prevCh) {
3356	case EOS:
3357	yyPos0 = yyPos;
3358	return Tok_Eos;
3359	case `'$'`:
3360	return Tok_Dollar;
3361	case `'('`:
3362	if (yyCh == `'?'`) {
3363	prevCh = getChar();
3364	yyCh = getChar();
3365	switch (prevCh) {
3366	#ifndef QT_NO_REGEXP_LOOKAHEAD
3367	case `'!'`:
3368	return Tok_NegLookahead;
3369	case `'='`:
3370	return Tok_PosLookahead;
3371	#endif
3372	case `':'`:
3373	return Tok_MagicLeftParen;
3374	case `'<'`:
3375	error(RXERR_LOOKBEHIND);
3376	return Tok_MagicLeftParen;
3377	default:
3378	error(RXERR_LOOKAHEAD);
3379	return Tok_MagicLeftParen;
3380	}
3381	} else {
3382	return Tok_LeftParen;
3383	}
3384	case `')'`:
3385	return Tok_RightParen;
3386	case `'*'`:
3387	yyMinRep = `0`;
3388	yyMaxRep = InftyRep;
3389	return Tok_Quantifier;
3390	case `'+'`:
3391	yyMinRep = `1`;
3392	yyMaxRep = InftyRep;
3393	return Tok_Quantifier;
3394	case `'.'`:
3395	#ifndef QT_NO_REGEXP_CCLASS
3396	yyCharClass ->setNegative(true);
3397	#endif
3398	return Tok_CharClass;
3399	case `'?'`:
3400	yyMinRep = `0`;
3401	yyMaxRep = `1`;
3402	return Tok_Quantifier;
3403	case `'['`:
3404	#ifndef QT_NO_REGEXP_CCLASS
3405	if (yyCh == `'^'`) {
3406	yyCharClass ->setNegative(true);
3407	yyCh = getChar();
3408	}
3409	charPending = false;
3410	rangePending = false;
3411	do {
3412	if (yyCh == `'-'` && charPending && !rangePending) {
3413	rangePending = true;
3414	yyCh = getChar();
3415	} else {
3416	if (charPending && !rangePending) {
3417	yyCharClass ->addSingleton(ch: pendingCh);
3418	charPending = false;
3419	}
3420	if (yyCh == `'\\'`) {
3421	yyCh = getChar();
3422	tok = getEscape();
3423	if (tok == Tok_Word)
3424	tok = `'\b'`;
3425	} else {
3426	tok = Tok_Char \| yyCh;
3427	yyCh = getChar();
3428	}
3429	if (tok == Tok_CharClass) {
3430	if (rangePending) {
3431	yyCharClass ->addSingleton(ch: `'-'`);
3432	yyCharClass ->addSingleton(ch: pendingCh);
3433	charPending = false;
3434	rangePending = false;
3435	}
3436	} else if ((tok & Tok_Char) != `0`) {
3437	if (rangePending) {
3438	yyCharClass ->addRange(from: pendingCh, to: tok ^ Tok_Char);
3439	charPending = false;
3440	rangePending = false;
3441	} else {
3442	pendingCh = tok ^ Tok_Char;
3443	charPending = true;
3444	}
3445	} else {
3446	error(RXERR_CHARCLASS);
3447	}
3448	}
3449	} while (yyCh != `']'` && yyCh != EOS);
3450	if (rangePending)
3451	yyCharClass ->addSingleton(ch: `'-'`);
3452	if (charPending)
3453	yyCharClass ->addSingleton(ch: pendingCh);
3454	if (yyCh == EOS)
3455	error(RXERR_END);
3456	else
3457	yyCh = getChar();
3458	return Tok_CharClass;
3459	#else
3460	error(RXERR_END);
3461	return Tok_Char \| `'['`;
3462	#endif
3463	case `'\\'`:
3464	return getEscape();
3465	case `']'`:
3466	error(RXERR_LEFTDELIM);
3467	return Tok_Char \| `']'`;
3468	case `'^'`:
3469	return Tok_Caret;
3470	case `'{'`:
3471	#ifndef QT_NO_REGEXP_INTERVAL
3472	yyMinRep = getRep(def: `0`);
3473	yyMaxRep = yyMinRep;
3474	if (yyCh == `','`) {
3475	yyCh = getChar();
3476	yyMaxRep = getRep(def: InftyRep);
3477	}
3478	if (yyMaxRep < yyMinRep)
3479	error(RXERR_INTERVAL);
3480	if (yyCh != `'}'`)
3481	error(RXERR_REPETITION);
3482	yyCh = getChar();
3483	return Tok_Quantifier;
3484	#else
3485	error(RXERR_DISABLED);
3486	return Tok_Char \| `'{'`;
3487	#endif
3488	case `'\|'`:
3489	return Tok_Bar;
3490	case `'}'`:
3491	error(RXERR_LEFTDELIM);
3492	return Tok_Char \| `'}'`;
3493	default:
3494	return Tok_Char \| prevCh;
3495	}
3496	}
3497
3498	int QRegExpEngine::parse(const QChar pattern, int* len)
3499	{
3500	valid = true;
3501	startTokenizer(rx: pattern, len);
3502	yyTok = getToken();
3503	#ifndef QT_NO_REGEXP_CAPTURE
3504	yyMayCapture = true;
3505	#else
3506	yyMayCapture = false;
3507	#endif
3508
3509	#ifndef QT_NO_REGEXP_CAPTURE
3510	int atom = startAtom(officialCapture: false);
3511	#endif
3512	QRegExpCharClass anything;
3513	Box box(this); // create InitialState
3514	box.set(anything);
3515	Box rightBox(this); // create FinalState
3516	rightBox.set(anything);
3517
3518	Box middleBox(this);
3519	parseExpression(box: &middleBox);
3520	#ifndef QT_NO_REGEXP_CAPTURE
3521	finishAtom(atom, needCapture: false);
3522	#endif
3523	#ifndef QT_NO_REGEXP_OPTIM
3524	middleBox.setupHeuristics();
3525	#endif
3526	box.cat(b: middleBox);
3527	box.cat(b: rightBox);
3528	yyCharClass.reset();
3529
3530	#ifndef QT_NO_REGEXP_CAPTURE
3531	for (int i = `0`; i < nf; ++i) {
3532	switch (f [i].capture) {
3533	case QRegExpAtom::NoCapture:
3534	break;
3535	case QRegExpAtom::OfficialCapture:
3536	f [i].capture = ncap;
3537	captureForOfficialCapture.append(t: ncap);
3538	++ncap;
3539	++officialncap;
3540	break;
3541	case QRegExpAtom::UnofficialCapture:
3542	f [i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3543	}
3544	}
3545
3546	#ifndef QT_NO_REGEXP_BACKREF
3547	#ifndef QT_NO_REGEXP_OPTIM
3548	if (officialncap == `0` && nbrefs == `0`) {
3549	ncap = nf = `0`;
3550	f.clear();
3551	}
3552	#endif
3553	// handle the case where there's a \5 with no corresponding capture
3554	// (captureForOfficialCapture.size() != officialncap)
3555	for (int i = `0`; i < nbrefs - officialncap; ++i) {
3556	captureForOfficialCapture.append(t: ncap);
3557	++ncap;
3558	}
3559	#endif
3560	#endif
3561
3562	if (!yyError.isEmpty())
3563	return -`1`;
3564
3565	#ifndef QT_NO_REGEXP_OPTIM
3566	const QRegExpAutomatonState &sinit = s.at(i: InitialState);
3567	caretAnchored = !sinit.anchors.isEmpty();
3568	if (caretAnchored) {
3569	const QMap<int, int> &anchors = sinit.anchors;
3570	QMap<int, int>::const_iterator a;
3571	for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3572	if (
3573	#ifndef QT_NO_REGEXP_ANCHOR_ALT
3574	(*a & Anchor_Alternation) != `0` \|\|
3575	#endif
3576	(*a & Anchor_Caret) == `0`)
3577	{
3578	caretAnchored = false;
3579	break;
3580	}
3581	}
3582	}
3583	#endif
3584
3585	// cleanup anchors
3586	int numStates = s.count();
3587	for (int i = `0`; i < numStates; ++i) {
3588	QRegExpAutomatonState &state = s [i];
3589	if (!state.anchors.isEmpty()) {
3590	QMap<int, int>::iterator a = state.anchors.begin();
3591	while (a != state.anchors.end()) {
3592	if (a.value() == `0`)
3593	a = state.anchors.erase(it: a);
3594	else
3595	++a;
3596	}
3597	}
3598	}
3599
3600	return yyPos0;
3601	}
3602
3603	void QRegExpEngine::parseAtom(Box *box)
3604	{
3605	#ifndef QT_NO_REGEXP_LOOKAHEAD
3606	QRegExpEngine eng = nullptr*;
3607	bool neg;
3608	int len;
3609	#endif
3610
3611	if ((yyTok & Tok_Char) != `0`) {
3612	box->set(QChar (yyTok ^ Tok_Char));
3613	} else {
3614	#ifndef QT_NO_REGEXP_OPTIM
3615	trivial = false;
3616	#endif
3617	switch (yyTok) {
3618	case Tok_Dollar:
3619	box->catAnchor(a: Anchor_Dollar);
3620	break;
3621	case Tok_Caret:
3622	box->catAnchor(a: Anchor_Caret);
3623	break;
3624	#ifndef QT_NO_REGEXP_LOOKAHEAD
3625	case Tok_PosLookahead:
3626	case Tok_NegLookahead:
3627	neg = (yyTok == Tok_NegLookahead);
3628	eng = new QRegExpEngine (cs, greedyQuantifiers);
3629	len = eng->parse(pattern: yyIn + yyPos - `1`, len: yyLen - yyPos + `1`);
3630	if (len >= `0`)
3631	skipChars(n: len);
3632	else
3633	error(RXERR_LOOKAHEAD);
3634	box->catAnchor(a: addLookahead(eng, negative: neg));
3635	yyTok = getToken();
3636	if (yyTok != Tok_RightParen)
3637	error(RXERR_LOOKAHEAD);
3638	break;
3639	#endif
3640	#ifndef QT_NO_REGEXP_ESCAPE
3641	case Tok_Word:
3642	box->catAnchor(a: Anchor_Word);
3643	break;
3644	case Tok_NonWord:
3645	box->catAnchor(a: Anchor_NonWord);
3646	break;
3647	#endif
3648	case Tok_LeftParen:
3649	case Tok_MagicLeftParen:
3650	yyTok = getToken();
3651	parseExpression(box);
3652	if (yyTok != Tok_RightParen)
3653	error(RXERR_END);
3654	break;
3655	case Tok_CharClass:
3656	box->set(*yyCharClass);
3657	break;
3658	case Tok_Quantifier:
3659	error(RXERR_REPETITION);
3660	break;
3661	default:
3662	#ifndef QT_NO_REGEXP_BACKREF
3663	if ((yyTok & Tok_BackRef) != `0`)
3664	box->set(yyTok ^ Tok_BackRef);
3665	else
3666	#endif
3667	error(RXERR_DISABLED);
3668	}
3669	}
3670	yyTok = getToken();
3671	}
3672
3673	void QRegExpEngine::parseFactor(Box *box)
3674	{
3675	#ifndef QT_NO_REGEXP_CAPTURE
3676	int outerAtom = greedyQuantifiers ? startAtom(officialCapture: false) : -`1`;
3677	int innerAtom = startAtom(officialCapture: yyMayCapture && yyTok == Tok_LeftParen);
3678	bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3679	#else
3680	const int innerAtom = -`1`;
3681	#endif
3682
3683	#ifndef QT_NO_REGEXP_INTERVAL
3684	#define YYREDO() \
3685	yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3686	*yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3687
3688	const QChar *in = yyIn;
3689	int pos0 = yyPos0;
3690	int pos = yyPos;
3691	int len = yyLen;
3692	int ch = yyCh;
3693	QRegExpCharClass charClass;
3694	if (yyTok == Tok_CharClass)
3695	charClass = *yyCharClass;
3696	int tok = yyTok;
3697	bool mayCapture = yyMayCapture;
3698	#endif
3699
3700	parseAtom(box);
3701	#ifndef QT_NO_REGEXP_CAPTURE
3702	finishAtom(atom: innerAtom, needCapture: magicLeftParen);
3703	#endif
3704
3705	bool hasQuantifier = (yyTok == Tok_Quantifier);
3706	if (hasQuantifier) {
3707	#ifndef QT_NO_REGEXP_OPTIM
3708	trivial = false;
3709	#endif
3710	if (yyMaxRep == InftyRep) {
3711	box->plus(atom: innerAtom);
3712	#ifndef QT_NO_REGEXP_INTERVAL
3713	} else if (yyMaxRep == `0`) {
3714	box->clear();
3715	#endif
3716	}
3717	if (yyMinRep == `0`)
3718	box->opt();
3719
3720	#ifndef QT_NO_REGEXP_INTERVAL
3721	yyMayCapture = false;
3722	int alpha = (yyMinRep == `0`) ? `0` : yyMinRep - `1`;
3723	int beta = (yyMaxRep == InftyRep) ? `0` : yyMaxRep - (alpha + `1`);
3724
3725	Box rightBox(this);
3726	int i;
3727
3728	for (i = `0`; i < beta; i++) {
3729	YYREDO();
3730	Box leftBox(this);
3731	parseAtom(box: &leftBox);
3732	leftBox.cat(b: rightBox);
3733	leftBox.opt();
3734	rightBox = leftBox;
3735	}
3736	for (i = `0`; i < alpha; i++) {
3737	YYREDO();
3738	Box leftBox(this);
3739	parseAtom(box: &leftBox);
3740	leftBox.cat(b: rightBox);
3741	rightBox = leftBox;
3742	}
3743	rightBox.cat(b: *box);
3744	*box = rightBox;
3745	#endif
3746	yyTok = getToken();
3747	#ifndef QT_NO_REGEXP_INTERVAL
3748	yyMayCapture = mayCapture;
3749	#endif
3750	}
3751	#undef YYREDO
3752	#ifndef QT_NO_REGEXP_CAPTURE
3753	if (greedyQuantifiers)
3754	finishAtom(atom: outerAtom, needCapture: hasQuantifier);
3755	#endif
3756	}
3757
3758	void QRegExpEngine::parseTerm(Box *box)
3759	{
3760	#ifndef QT_NO_REGEXP_OPTIM
3761	if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3762	parseFactor(box);
3763	#endif
3764	while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3765	Box rightBox(this);
3766	parseFactor(box: &rightBox);
3767	box->cat(b: rightBox);
3768	}
3769	}
3770
3771	void QRegExpEngine::parseExpression(Box *box)
3772	{
3773	parseTerm(box);
3774	while (yyTok == Tok_Bar) {
3775	#ifndef QT_NO_REGEXP_OPTIM
3776	trivial = false;
3777	#endif
3778	Box rightBox(this);
3779	yyTok = getToken();
3780	parseTerm(box: &rightBox);
3781	box->orx(b: rightBox);
3782	}
3783	}
3784
3785	/*
3786	The struct QRegExpPrivate contains the private data of a regular
3787	expression other than the automaton. It makes it possible for many
3788	QRegExp objects to use the same QRegExpEngine object with different
3789	QRegExpPrivate objects.
3790	*/
3791	struct QRegExpPrivate
3792	{
3793	QRegExpEngine *eng;
3794	QRegExpEngineKey engineKey;
3795	bool minimal;
3796	#ifndef QT_NO_REGEXP_CAPTURE
3797	QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3798	QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3799	#endif
3800	QRegExpMatchState matchState;
3801
3802	inline QRegExpPrivate()
3803	: eng(nullptr), engineKey (QString (), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
3804	inline QRegExpPrivate(const QRegExpEngineKey &key)
3805	: eng(nullptr), engineKey (key), minimal(false) {}
3806	};
3807
3808	#if !defined(QT_NO_REGEXP_OPTIM)
3809	struct QRECache
3810	{
3811	typedef QHash<QRegExpEngineKey, QRegExpEngine *> EngineCache;
3812	typedef QCache<QRegExpEngineKey, QRegExpEngine> UnusedEngineCache;
3813	EngineCache usedEngines;
3814	UnusedEngineCache unusedEngines;
3815	};
3816	Q_GLOBAL_STATIC(QRECache, engineCache)
3817	static QBasicMutex engineCacheMutex;
3818	#endif // QT_NO_REGEXP_OPTIM
3819
3820	static void derefEngine(QRegExpEngine eng, const* QRegExpEngineKey &key)
3821	{
3822	#if !defined(QT_NO_REGEXP_OPTIM)
3823	const auto locker = qt_scoped_lock(mutex&: engineCacheMutex);
3824	if (!eng->ref.deref()) {
3825	if (QRECache *c = engineCache ()) {
3826	c->unusedEngines.insert(akey: key, aobject: eng, acost: `4` + key.pattern.length() / `4`);
3827	c->usedEngines.remove(akey: key);
3828	} else {
3829	delete eng;
3830	}
3831	}
3832	#else
3833	Q_UNUSED(key);
3834	if (!eng->ref.deref())
3835	delete eng;
3836	#endif
3837	}
3838
3839	static void prepareEngine_helper(QRegExpPrivate *priv)
3840	{
3841	Q_ASSERT(!priv->eng);
3842
3843	#if !defined(QT_NO_REGEXP_OPTIM)
3844	const auto locker = qt_scoped_lock(mutex&: engineCacheMutex);
3845	if (QRECache *c = engineCache ()) {
3846	priv->eng = c->unusedEngines.take(key: priv->engineKey);
3847	if (!priv->eng)
3848	priv->eng = c->usedEngines.value(akey: priv->engineKey);
3849	if (!priv->eng)
3850	priv->eng = new QRegExpEngine (priv->engineKey);
3851	else
3852	priv->eng->ref.ref();
3853
3854	c->usedEngines.insert(akey: priv->engineKey, avalue: priv->eng);
3855	return;
3856	}
3857	#endif // QT_NO_REGEXP_OPTIM
3858
3859	priv->eng = new QRegExpEngine (priv->engineKey);
3860	}
3861
3862	inline static void prepareEngine(QRegExpPrivate *priv)
3863	{
3864	if (priv->eng)
3865	return;
3866	prepareEngine_helper(priv);
3867	priv->matchState.prepareForMatch(eng: priv->eng);
3868	}
3869
3870	static void prepareEngineForMatch(QRegExpPrivate priv, const* QString &str)
3871	{
3872	prepareEngine(priv);
3873	priv->matchState.prepareForMatch(eng: priv->eng);
3874	#ifndef QT_NO_REGEXP_CAPTURE
3875	priv->t = str;
3876	priv->capturedCache.clear();
3877	#else
3878	Q_UNUSED(str);
3879	#endif
3880	}
3881
3882	static void invalidateEngine(QRegExpPrivate *priv)
3883	{
3884	if (priv->eng) {
3885	derefEngine(eng: priv->eng, key: priv->engineKey);
3886	priv->eng = nullptr;
3887	priv->matchState.drain();
3888	}
3889	}
3890
/*!
3892	\enum QRegExp::CaretMode
3893
3894	The CaretMode enum defines the different meanings of the caret
3895	(\b{^}) in a regular expression. The possible values are:
3896
3897	\value CaretAtZero
3898	The caret corresponds to index 0 in the searched string.
3899
3900	\value CaretAtOffset
3901	The caret corresponds to the start offset of the search.
3902
3903	\value CaretWontMatch
3904	The caret never matches.
3905	*/
3906
/*!
3908	\enum QRegExp::PatternSyntax
3909
3910	The syntax used to interpret the meaning of the pattern.
3911
3912	\value RegExp A rich Perl-like pattern matching syntax. This is
3913	the default.
3914
3915	\value RegExp2 Like RegExp, but with \l{greedy quantifiers}.
3916	(Introduced in Qt 4.2.)
3917
3918	\value Wildcard This provides a simple pattern matching syntax
3919	similar to that used by shells (command interpreters) for "file
3920	globbing". See \l{QRegExp wildcard matching}.
3921
3922	\value WildcardUnix This is similar to Wildcard but with the
3923	behavior of a Unix shell. The wildcard characters can be escaped
3924	with the character "\\".
3925
3926	\value FixedString The pattern is a fixed string. This is
3927	equivalent to using the RegExp pattern on a string in
3928	which all metacharacters are escaped using escape().
3929
3930	\value W3CXmlSchema11 The pattern is a regular expression as
3931	defined by the W3C XML Schema 1.1 specification.
3932
3933	\sa setPatternSyntax()
3934	*/
3935
/*!
3937	Constructs an empty regexp.
3938
3939	\sa isValid(), errorString()
3940	*/
3941	QRegExp::QRegExp()
3942	{
3943	priv = new QRegExpPrivate;
3944	prepareEngine(priv);
3945	}
3946
/*!
3948	Constructs a regular expression object for the given \a pattern
3949	string. The pattern must be given using wildcard notation if \a
3950	syntax is \l Wildcard; the default is \l RegExp. The pattern is
3951	case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3952	greedy (maximal), but can be changed by calling
3953	setMinimal().
3954
3955	\sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3956	*/
3957	QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3958	{
3959	priv = new QRegExpPrivate (QRegExpEngineKey (pattern, syntax, cs));
3960	prepareEngine(priv);
3961	}
3962
/*!
3964	Constructs a regular expression as a copy of \a rx.
3965
3966	\sa operator=()
3967	*/
3968	QRegExp::QRegExp(const QRegExp &rx)
3969	{
3970	priv = new QRegExpPrivate;
3971	operator=(rx);
3972	}
3973
/*!
3975	Destroys the regular expression and cleans up its internal data.
3976	*/
3977	QRegExp::~QRegExp()
3978	{
3979	invalidateEngine(priv);
3980	delete priv;
3981	}
3982
/*!
3984	Copies the regular expression \a rx and returns a reference to the
3985	copy. The case sensitivity, wildcard, and minimal matching options
3986	are also copied.
3987	*/
3988	QRegExp &QRegExp::operator=(const QRegExp &rx)
3989	{
3990	prepareEngine(priv: rx.priv); // to allow sharing
3991	QRegExpEngine *otherEng = rx.priv->eng;
3992	if (otherEng)
3993	otherEng->ref.ref();
3994	invalidateEngine(priv);
3995	priv->eng = otherEng;
3996	priv->engineKey = rx.priv->engineKey;
3997	priv->minimal = rx.priv->minimal;
3998	#ifndef QT_NO_REGEXP_CAPTURE
3999	priv->t = rx.priv->t;
4000	priv->capturedCache = rx.priv->capturedCache;
4001	#endif
4002	if (priv->eng)
4003	priv->matchState.prepareForMatch(eng: priv->eng);
4004	priv->matchState.captured = rx.priv->matchState.captured;
4005	return *this;
4006	}
4007
/*!
4009	\fn QRegExp &QRegExp::operator=(QRegExp &&other)
4010
4011	Move-assigns \a other to this QRegExp instance.
4012
4013	\since 5.2
4014	*/
4015
/*!
4017	\fn void QRegExp::swap(QRegExp &other)
4018	\since 4.8
4019
4020	Swaps regular expression \a other with this regular
4021	expression. This operation is very fast and never fails.
4022	*/
4023
/*!
4025	Returns \c true if this regular expression is equal to \a rx;
4026	otherwise returns \c false.
4027
4028	Two QRegExp objects are equal if they have the same pattern
4029	strings and the same settings for case sensitivity, wildcard and
4030	minimal matching.
4031	*/
4032	bool QRegExp::operator==(const QRegExp &rx) const
4033	{
4034	return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
4035	}
4036
/*!
4038	\since 5.6
4039	\relates QRegExp
4040
4041	Returns the hash value for \a key, using
4042	\a seed to seed the calculation.
4043	*/
4044	uint qHash(const QRegExp &key, uint seed) noexcept
4045	{
4046	QtPrivate::QHashCombine hash;
4047	seed = hash (seed, key.priv->engineKey);
4048	seed = hash (seed, key.priv->minimal);
4049	return seed;
4050	}
4051
/*!
4053	\fn bool QRegExp::operator!=(const QRegExp &rx) const
4054
4055	Returns \c true if this regular expression is not equal to \a rx;
4056	otherwise returns \c false.
4057
4058	\sa operator==()
4059	*/
4060
/*!
4062	Returns \c true if the pattern string is empty; otherwise returns
4063	false.
4064
4065	If you call exactMatch() with an empty pattern on an empty string
4066	it will return true; otherwise it returns \c false since it operates
4067	over the whole string. If you call indexIn() with an empty pattern
4068	on \e any string it will return the start offset (0 by default)
4069	because the empty pattern matches the 'emptiness' at the start of
4070	the string. In this case the length of the match returned by
4071	matchedLength() will be 0.
4072
4073	See QString::isEmpty().
4074	*/
4075
4076	bool QRegExp::isEmpty() const
4077	{
4078	return priv->engineKey.pattern.isEmpty();
4079	}
4080
/*!
4082	Returns \c true if the regular expression is valid; otherwise returns
4083	false. An invalid regular expression never matches.
4084
4085	The pattern \b{[a-z} is an example of an invalid pattern, since
4086	it lacks a closing square bracket.
4087
4088	Note that the validity of a regexp may also depend on the setting
of the wildcard flag, for example \b{*.html} is a valid
4090	wildcard regexp but an invalid full regexp.
4091
4092	\sa errorString()
4093	*/
4094	bool QRegExp::isValid() const
4095	{
4096	if (priv->engineKey.pattern.isEmpty()) {
4097	return true;
4098	} else {
4099	prepareEngine(priv);
4100	return priv->eng->isValid();
4101	}
4102	}
4103
/*!
4105	Returns the pattern string of the regular expression. The pattern
4106	has either regular expression syntax or wildcard syntax, depending
4107	on patternSyntax().
4108
4109	\sa patternSyntax(), caseSensitivity()
4110	*/
4111	QString QRegExp::pattern() const
4112	{
4113	return priv->engineKey.pattern;
4114	}
4115
/*!
4117	Sets the pattern string to \a pattern. The case sensitivity,
4118	wildcard, and minimal matching options are not changed.
4119
4120	\sa setPatternSyntax(), setCaseSensitivity()
4121	*/
4122	void QRegExp::setPattern(const QString &pattern)
4123	{
4124	if (priv->engineKey.pattern != pattern) {
4125	invalidateEngine(priv);
4126	priv->engineKey.pattern = pattern;
4127	}
4128	}
4129
/*!
4131	Returns Qt::CaseSensitive if the regexp is matched case
4132	sensitively; otherwise returns Qt::CaseInsensitive.
4133
4134	\sa patternSyntax(), pattern(), isMinimal()
4135	*/
4136	Qt::CaseSensitivity QRegExp::caseSensitivity() const
4137	{
4138	return priv->engineKey.cs;
4139	}
4140
/*!
4142	Sets case sensitive matching to \a cs.
4143
4144	If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches
4145	\c{readme.txt} but not \c{README.TXT}.
4146
4147	\sa setPatternSyntax(), setPattern(), setMinimal()
4148	*/
4149	void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
4150	{
4151	if ((bool)cs != (bool)priv->engineKey.cs) {
4152	invalidateEngine(priv);
4153	priv->engineKey.cs = cs;
4154	}
4155	}
4156
/*!
4158	Returns the syntax used by the regular expression. The default is
4159	QRegExp::RegExp.
4160
4161	\sa pattern(), caseSensitivity()
4162	*/
4163	QRegExp::PatternSyntax QRegExp::patternSyntax() const
4164	{
4165	return priv->engineKey.patternSyntax;
4166	}
4167
/*!
4169	Sets the syntax mode for the regular expression. The default is
4170	QRegExp::RegExp.
4171
4172	Setting \a syntax to QRegExp::Wildcard enables simple shell-like
\l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the
4174	string \c{readme.txt} in wildcard mode, but does not match
4175	\c{readme}.
4176
4177	Setting \a syntax to QRegExp::FixedString means that the pattern
4178	is interpreted as a plain string. Special characters (e.g.,
4179	backslash) don't need to be escaped then.
4180
4181	\sa setPattern(), setCaseSensitivity(), escape()
4182	*/
4183	void QRegExp::setPatternSyntax(PatternSyntax syntax)
4184	{
4185	if (syntax != priv->engineKey.patternSyntax) {
4186	invalidateEngine(priv);
4187	priv->engineKey.patternSyntax = syntax;
4188	}
4189	}
4190
/*!
4192	Returns \c true if minimal (non-greedy) matching is enabled;
4193	otherwise returns \c false.
4194
4195	\sa caseSensitivity(), setMinimal()
4196	*/
4197	bool QRegExp::isMinimal() const
4198	{
4199	return priv->minimal;
4200	}
4201
/*!
4203	Enables or disables minimal matching. If \a minimal is false,
4204	matching is greedy (maximal) which is the default.
4205
4206	For example, suppose we have the input string "We must be
4207	<b>bold</b>, very <b>bold</b>!" and the pattern
\b{<b>.*</b>}. With the default greedy (maximal) matching,
4209	the match is "We must be \underline{<b>bold</b>, very
4210	<b>bold</b>}!". But with minimal (non-greedy) matching, the
4211	first match is: "We must be \underline{<b>bold</b>}, very
4212	<b>bold</b>!" and the second match is "We must be <b>bold</b>,
4213	very \underline{<b>bold</b>}!". In practice we might use the pattern
\b{<b>[^<]*\</b>} instead, although this will still fail for
4215	nested tags.
4216
4217	\sa setCaseSensitivity()
4218	*/
4219	void QRegExp::setMinimal(bool minimal)
4220	{
4221	priv->minimal = minimal;
4222	}
4223
4224	// ### Qt 5: make non-const
/*!
4226	Returns \c true if \a str is matched exactly by this regular
4227	expression; otherwise returns \c false. You can determine how much of
4228	the string was matched by calling matchedLength().
4229
4230	For a given regexp string R, exactMatch("R") is the equivalent of
4231	indexIn("^R$") since exactMatch() effectively encloses the regexp
4232	in the start of string and end of string anchors, except that it
4233	sets matchedLength() differently.
4234
4235	For example, if the regular expression is \b{blue}, then
4236	exactMatch() returns \c true only for input \c blue. For inputs \c
4237	bluebell, \c blutak and \c lightblue, exactMatch() returns \c false
4238	and matchedLength() will return 4, 3 and 0 respectively.
4239
4240	Although const, this function sets matchedLength(),
4241	capturedTexts(), and pos().
4242
4243	\sa indexIn(), lastIndexIn()
4244	*/
4245	bool QRegExp::exactMatch(const QString &str) const
4246	{
4247	prepareEngineForMatch(priv, str);
4248	priv->matchState.match(str0: str.unicode(), len0: str.length(), pos0: `0`, minimal0: priv->minimal, oneTest: true, caretIndex: `0`);
4249	if (priv->matchState.captured[`1`] == str.length()) {
4250	return true;
4251	} else {
4252	priv->matchState.captured[`0`] = `0`;
4253	priv->matchState.captured[`1`] = priv->matchState.oneTestMatchedLen;
4254	return false;
4255	}
4256	}
4257
4258	// ### Qt 5: make non-const
/*!
4260	Attempts to find a match in \a str from position \a offset (0 by
4261	default). If \a offset is -1, the search starts at the last
4262	character; if -2, at the next to last character; etc.
4263
4264	Returns the position of the first match, or -1 if there was no
4265	match.
4266
4267	The \a caretMode parameter can be used to instruct whether \b{^}
4268	should match at index 0 or at \a offset.
4269
4270	You might prefer to use QString::indexOf(), QString::contains(),
4271	or even QStringList::filter(). To replace matches use
4272	QString::replace().
4273
4274	Example:
4275	\snippet code/src_corelib_tools_qregexp.cpp 13
4276
4277	Although const, this function sets matchedLength(),
4278	capturedTexts() and pos().
4279
4280	If the QRegExp is a wildcard expression (see setPatternSyntax())
and you want to test a string against the whole wildcard expression,
4282	use exactMatch() instead of this function.
4283
4284	\sa lastIndexIn(), exactMatch()
4285	*/
4286
4287	int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4288	{
4289	prepareEngineForMatch(priv, str);
4290	if (offset < `0`)
4291	offset += str.length();
4292	priv->matchState.match(str0: str.unicode(), len0: str.length(), pos0: offset,
4293	minimal0: priv->minimal, oneTest: false, caretIndex: caretIndex(offset, caretMode));
4294	return priv->matchState.captured[`0`];
4295	}
4296
4297	// ### Qt 5: make non-const
/*!
4299	Attempts to find a match backwards in \a str from position \a
4300	offset. If \a offset is -1 (the default), the search starts at the
4301	last character; if -2, at the next to last character; etc.
4302
4303	Returns the position of the first match, or -1 if there was no
4304	match.
4305
4306	The \a caretMode parameter can be used to instruct whether \b{^}
4307	should match at index 0 or at \a offset.
4308
4309	Although const, this function sets matchedLength(),
4310	capturedTexts() and pos().
4311
4312	\warning Searching backwards is much slower than searching
4313	forwards.
4314
4315	\sa indexIn(), exactMatch()
4316	*/
4317
4318	int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4319	{
4320	prepareEngineForMatch(priv, str);
4321	if (offset < `0`)
4322	offset += str.length();
4323	if (offset < `0` \|\| offset > str.length()) {
4324	memset(s: priv->matchState.captured, c: -`1`, n: priv->matchState.capturedSize*sizeof(int));
4325	return -`1`;
4326	}
4327
4328	while (offset >= `0`) {
4329	priv->matchState.match(str0: str.unicode(), len0: str.length(), pos0: offset,
4330	minimal0: priv->minimal, oneTest: true, caretIndex: caretIndex(offset, caretMode));
4331	if (priv->matchState.captured[`0`] == offset)
4332	return offset;
4333	--offset;
4334	}
4335	return -`1`;
4336	}
4337
/*!
4339	Returns the length of the last matched string, or -1 if there was
4340	no match.
4341
4342	\sa exactMatch(), indexIn(), lastIndexIn()
4343	*/
4344	int QRegExp::matchedLength() const
4345	{
4346	return priv->matchState.captured[`1`];
4347	}
4348
4349	#ifndef QT_NO_REGEXP_CAPTURE
4350
/*!
4352	\since 4.6
4353	Returns the number of captures contained in the regular expression.
4354	*/
4355	int QRegExp::captureCount() const
4356	{
4357	prepareEngine(priv);
4358	return priv->eng->captureCount();
4359	}
4360
/*!
4362	Returns a list of the captured text strings.
4363
4364	The first string in the list is the entire matched string. Each
4365	subsequent list element contains a string that matched a
4366	(capturing) subexpression of the regexp.
4367
4368	For example:
4369	\snippet code/src_corelib_tools_qregexp.cpp 14
4370
4371	The above example also captures elements that may be present but
4372	which we have no interest in. This problem can be solved by using
4373	non-capturing parentheses:
4374
4375	\snippet code/src_corelib_tools_qregexp.cpp 15
4376
4377	Note that if you want to iterate over the list, you should iterate
4378	over a copy, e.g.
4379	\snippet code/src_corelib_tools_qregexp.cpp 16
4380
4381	Some regexps can match an indeterminate number of times. For
4382	example if the input string is "Offsets: 12 14 99 231 7" and the
4383	regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of
4384	all the numbers matched. However, after calling
4385	\c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4386	"12"), i.e. the entire match was "12" and the first subexpression
4387	matched was "12". The correct approach is to use cap() in a
4388	\l{QRegExp#cap_in_a_loop}{loop}.
4389
4390	The order of elements in the string list is as follows. The first
4391	element is the entire matching string. Each subsequent element
4392	corresponds to the next capturing open left parentheses. Thus
4393	capturedTexts()[1] is the text of the first capturing parentheses,
4394	capturedTexts()[2] is the text of the second and so on
4395	(corresponding to $1, $2, etc., in some other regexp languages).
4396
4397	\sa cap(), pos()
4398	*/
4399	QStringList QRegExp::capturedTexts() const
4400	{
4401	if (priv->capturedCache.isEmpty()) {
4402	prepareEngine(priv);
4403	const int *captured = priv->matchState.captured;
4404	int n = priv->matchState.capturedSize;
4405
4406	for (int i = `0`; i < n; i += `2`) {
4407	QString m;
4408	if (captured[i + `1`] == `0`)
4409	m = QLatin1String (""); // ### Qt 5: don't distinguish between null and empty
4410	else if (captured[i] >= `0`)
4411	m = priv->t.mid(position: captured[i], n: captured[i + `1`]);
4412	priv->capturedCache.append(t: m);
4413	}
4414	priv->t.clear();
4415	}
4416	return priv->capturedCache;
4417	}
4418
/*!
4420	\internal
4421	*/
4422	QStringList QRegExp::capturedTexts()
4423	{
4424	return const_cast<const QRegExp >(this*)->capturedTexts();
4425	}
4426
/*!
4428	Returns the text captured by the \a nth subexpression. The entire
4429	match has index 0 and the parenthesized subexpressions have
4430	indexes starting from 1 (excluding non-capturing parentheses).
4431
4432	\snippet code/src_corelib_tools_qregexp.cpp 17
4433
4434	The order of elements matched by cap() is as follows. The first
4435	element, cap(0), is the entire matching string. Each subsequent
4436	element corresponds to the next capturing open left parentheses.
4437	Thus cap(1) is the text of the first capturing parentheses, cap(2)
4438	is the text of the second, and so on.
4439
4440	\sa capturedTexts(), pos()
4441	*/
4442	QString QRegExp::cap(int nth) const
4443	{
4444	return capturedTexts().value(i: nth);
4445	}
4446
/*!
4448	\internal
4449	*/
4450	QString QRegExp::cap(int nth)
4451	{
4452	return const_cast<const QRegExp >(this*)->cap(nth);
4453	}
4454
/*!
4456	Returns the position of the \a nth captured text in the searched
4457	string. If \a nth is 0 (the default), pos() returns the position
4458	of the whole match.
4459
4460	Example:
4461	\snippet code/src_corelib_tools_qregexp.cpp 18
4462
4463	For zero-length matches, pos() always returns -1. (For example, if
4464	cap(4) would return an empty string, pos(4) returns -1.) This is
4465	a feature of the implementation.
4466
4467	\sa cap(), capturedTexts()
4468	*/
4469	int QRegExp::pos(int nth) const
4470	{
4471	if (nth < `0` \|\| nth >= priv->matchState.capturedSize / `2`)
4472	return -`1`;
4473	else
4474	return priv->matchState.captured[`2` * nth];
4475	}
4476
/*!
4478	\internal
4479	*/
4480	int QRegExp::pos(int nth)
4481	{
4482	return const_cast<const QRegExp >(this*)->pos(nth);
4483	}
4484
/*!
Returns a text string that explains why a regexp pattern is
invalid, if that is the case; otherwise returns "no error occurred".
4488
4489	\sa isValid()
4490	*/
4491	QString QRegExp::errorString() const
4492	{
4493	if (isValid()) {
4494	return QString::fromLatin1(RXERR_OK);
4495	} else {
4496	return priv->eng->errorString();
4497	}
4498	}
4499
/*!
4501	\internal
4502	*/
4503	QString QRegExp::errorString()
4504	{
4505	return const_cast<const QRegExp >(this*)->errorString();
4506	}
4507	#endif
4508
/*!
Returns the string \a str with every regexp special character
escaped with a backslash. The special characters are $, (,), *, +,
., ?, [, \,], ^, {, | and }.
4513
4514	Example:
4515
4516	\snippet code/src_corelib_tools_qregexp.cpp 19
4517
4518	This function is useful to construct regexp patterns dynamically:
4519
4520	\snippet code/src_corelib_tools_qregexp.cpp 20
4521
4522	\sa setPatternSyntax()
4523	*/
4524	QString QRegExp::escape(const QString &str)
4525	{
4526	QString quoted;
4527	const int count = str.count();
4528	quoted.reserve(asize: count * `2`);
4529	const QLatin1Char backslash(`'\\'`);
4530	for (int i = `0`; i < count; i++) {
4531	switch (str.at(i).toLatin1()) {
4532	case `'$'`:
4533	case `'('`:
4534	case `')'`:
4535	case `'*'`:
4536	case `'+'`:
4537	case `'.'`:
4538	case `'?'`:
4539	case `'['`:
4540	case `'\\'`:
4541	case `']'`:
4542	case `'^'`:
4543	case `'{'`:
4544	case `'\|'`:
4545	case `'}'`:
4546	quoted.append(c: backslash);
4547	}
4548	quoted.append(c: str.at(i));
4549	}
4550	return quoted;
4551	}
4552
4553
4554	#ifndef QT_NO_DATASTREAM
/*!
4556	\relates QRegExp
4557
4558	Writes the regular expression \a regExp to stream \a out.
4559
4560	\sa {Serializing Qt Data Types}
4561	*/
4562	QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
4563	{
4564	return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4565	<< (quint8)regExp.patternSyntax()
4566	<< (quint8)!!regExp.isMinimal();
4567	}
4568
/*!
4570	\relates QRegExp
4571
4572	Reads a regular expression from stream \a in into \a regExp.
4573
4574	\sa {Serializing Qt Data Types}
4575	*/
4576	QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
4577	{
4578	QString pattern;
4579	quint8 cs;
4580	quint8 patternSyntax;
4581	quint8 isMinimal;
4582
4583	in >> pattern >> cs >> patternSyntax >> isMinimal;
4584
4585	QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4586	QRegExp::PatternSyntax(patternSyntax));
4587
4588	newRegExp.setMinimal(isMinimal);
4589	regExp = newRegExp;
4590	return in;
4591	}
4592	#endif // QT_NO_DATASTREAM
4593
4594	#ifndef QT_NO_DEBUG_STREAM
4595	QDebug operator<<(QDebug dbg, const QRegExp &r)
4596	{
4597	QDebugStateSaver saver(dbg);
4598	dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax()
4599	<< ", pattern='"<< r.pattern() << "')";
4600	return dbg;
4601	}
4602	#endif
4603
4604	QT_END_NAMESPACE
4605

source code of qtbase/src/corelib/text/qregexp.cpp