qregexp.cpp source code [qt5compat/src/core5/text/qregexp.cpp]

1	// Copyright (C) 2016 The Qt Company Ltd.
2	// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4	#include "qregexp.h"
5
6	#include "qalgorithms.h"
7	#include "qbitarray.h"
8	#include "qcache.h"
9	#include "qdatastream.h"
10	#include "qdebug.h"
11	#include "qhashfunctions.h"
12	#include "qlist.h"
13	#include "qmap.h"
14	#include "qmutex.h"
15	#include "qstring.h"
16	#include "qstringlist.h"
17	#include "qstringmatcher.h"
18	#include "private/qlocking_p.h"
19	#include "qvarlengtharray.h"
20
21	#include <limits.h>
22	#include <algorithm>
23
24	QT_BEGIN_NAMESPACE
25
26	// error strings for the regexp parser
27	#define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")
28	#define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")
29	#define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")
30	#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")
31	#define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")
32	#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")
33	#define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")
34	#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")
35	#define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")
36	#define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")
37	#define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")
38	#define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")
39
40	/*!
41	\class QRegExp
42	\inmodule QtCore5Compat
43	\reentrant
44	\brief The QRegExp class provides pattern matching using regular expressions.
45
46	\ingroup tools
47	\ingroup shared
48
49	\keyword regular expression
50
51	This class is deprecated in Qt 6. Please use QRegularExpression instead
52	for all new code. For guidelines on porting old code from QRegExp to
53	QRegularExpression, see {Porting to QRegularExpression}
54
55	A regular expression, or "regexp", is a pattern for matching
56	substrings in a text. This is useful in many contexts, e.g.,
57
58	\table
59	\row \li Validation
60	\li A regexp can test whether a substring meets some criteria,
61	e.g. is an integer or contains no whitespace.
62	\row \li Searching
63	\li A regexp provides more powerful pattern matching than
64	simple substring matching, e.g., match one of the words
65	\e{mail}, \e{letter} or \e{correspondence}, but none of the
66	words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.
67	\row \li Search and Replace
68	\li A regexp can replace all occurrences of a substring with a
69	different substring, e.g., replace all occurrences of \e{&}
70	with \e{\&amp;} except where the \e{&} is already followed by
71	an \e{amp;}.
72	\row \li String Splitting
73	\li A regexp can be used to identify where a string should be
74	split apart, e.g. splitting tab-delimited strings.
75	\endtable
76
77	A brief introduction to regexps is presented, a description of
78	Qt's regexp language, some examples, and the function
79	documentation itself. QRegExp is modeled on Perl's regexp
80	language. It fully supports Unicode. QRegExp can also be used in a
81	simpler, \e{wildcard mode} that is similar to the functionality
82	found in command shells. The syntax rules used by QRegExp can be
83	changed with setPatternSyntax(). In particular, the pattern syntax
84	can be set to QRegExp::FixedString, which means the pattern to be
85	matched is interpreted as a plain string, i.e., special characters
86	(e.g., backslash) are not escaped.
87
88	A good text on regexps is \e {Mastering Regular Expressions}
89	(Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.
90
91	\note In Qt 5, the new QRegularExpression class provides a Perl
92	compatible implementation of regular expressions and is recommended
93	in place of QRegExp.
94
95	\section1 Introduction
96
97	Regexps are built up from expressions, quantifiers, and
98	assertions. The simplest expression is a character, e.g. \b{x}
99	or \b{5}. An expression can also be a set of characters
100	enclosed in square brackets. \b{[ABCD]} will match an \b{A}
101	or a \b{B} or a \b{C} or a \b{D}. We can write this same
102	expression as \b{[A-D]}, and an expression to match any
103	capital letter in the English alphabet is written as
104	\b{[A-Z]}.
105
106	A quantifier specifies the number of occurrences of an expression
107	that must be matched. \b{x{1,1}} means match one and only one
108	\b{x}. \b{x{1,5}} means match a sequence of \b{x}
109	characters that contains at least one \b{x} but no more than
110	five.
111
112	Note that in general regexps cannot be used to check for balanced
113	brackets or tags. For example, a regexp can be written to match an
114	opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
115	are not nested, but if the \c{<b>} tags are nested, that same
116	regexp will match an opening \c{<b>} tag with the wrong closing
117	\c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the
118	first \c{<b>} would be matched with the first \c{</b>}, which is
119	not correct. However, it is possible to write a regexp that will
120	match nested brackets or tags correctly, but only if the number of
121	nesting levels is fixed and known. If the number of nesting levels
122	is not fixed and known, it is impossible to write a regexp that
123	will not fail.
124
125	Suppose we want a regexp to match integers in the range 0 to 99.
126	At least one digit is required, so we start with the expression
127	\b{[0-9]{1,1}}, which matches a single digit exactly once. This
128	regexp matches integers in the range 0 to 9. To match integers up
129	to 99, increase the maximum number of occurrences to 2, so the
130	regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the
131	original requirement to match integers from 0 to 99, but it will
132	also match integers that occur in the middle of strings. If we
133	want the matched integer to be the whole string, we must use the
134	anchor assertions, \b{^} (caret) and \b{$} (dollar). When
135	\b{^} is the first character in a regexp, it means the regexp
136	must match from the beginning of the string. When \b{$} is the
137	last character of the regexp, it means the regexp must match to
138	the end of the string. The regexp becomes \b{^[0-9]{1,2}$}.
139	Note that assertions, e.g. \b{^} and \b{$}, do not match
140	characters but locations in the string.
141
142	If you have seen regexps described elsewhere, they may have looked
143	different from the ones shown here. This is because some sets of
144	characters and some quantifiers are so common that they have been
145	given special symbols to represent them. \b{[0-9]} can be
146	replaced with the symbol \b{\\d}. The quantifier to match
147	exactly one occurrence, \b{{1,1}}, can be replaced with the
148	expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So
149	our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can
150	also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of
151	the string, match a digit, followed immediately by 0 or 1 digits}.
152	In practice, it would be written as \b{^\\d\\d?$}. The \b{?}
153	is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1
154	occurrences. \b{?} makes an expression optional. The regexp
155	\b{^\\d\\d?$} means \e{From the beginning of the string, match
156	one digit, followed immediately by 0 or 1 more digit, followed
157	immediately by end of string}.
158
159	To write a regexp that matches one of the words 'mail' \e or
160	'letter' \e or 'correspondence' but does not match words that
161	contain these words, e.g., 'email', 'mailman', 'mailer', and
162	'letterbox', start with a regexp that matches 'mail'. Expressed
163	fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because
164	a character expression is automatically quantified by
165	\b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an
166	'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now
167	we can use the vertical bar \b{|}, which means \b{or}, to
168	include the other two words, so our regexp for matching any of the
169	three words becomes \b{mail|letter|correspondence}. Match
170	'mail' \b{or} 'letter' \b{or} 'correspondence'. While this
171	regexp will match one of the three words we want to match, it will
172	also match words we don't want to match, e.g., 'email'. To
173	prevent the regexp from matching unwanted words, we must tell it
174	to begin and end the match at word boundaries. First we enclose
175	our regexp in parentheses, \b{(mail|letter|correspondence)}.
176	Parentheses group expressions together, and they identify a part
177	of the regexp that we wish to \l{capturing text}{capture}.
178	Enclosing the expression in parentheses allows us to use it as a
179	component in more complex regexps. It also allows us to examine
180	which of the three words was actually matched. To force the match
181	to begin and end on word boundaries, we enclose the regexp in
182	\b{\\b} \e{word boundary} assertions:
183	\b{\\b(mail|letter|correspondence)\\b}. Now the regexp means:
184	\e{Match a word boundary, followed by the regexp in parentheses,
185	followed by a word boundary}. The \b{\\b} assertion matches a
186	\e position in the regexp, not a \e character. A word boundary is
187	any non-word character, e.g., a space, newline, or the beginning
188	or ending of a string.
189
190	If we want to replace ampersand characters with the HTML entity
191	\b{\&amp;}, the regexp to match is simply \b{\&}. But this
192	regexp will also match ampersands that have already been converted
193	to HTML entities. We want to replace only ampersands that are not
194	already followed by \b{amp;}. For this, we need the negative
195	lookahead assertion, \b{(?!}__\b{)}. The regexp can then be
196	written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is}
197	\b{not} \e{followed by} \b{amp;}.
198
199	If we want to count all the occurrences of 'Eric' and 'Eirik' in a
200	string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and
201	\b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is
202	required to avoid matching words that contain either name,
203	e.g. 'Ericsson'. Note that the second regexp matches more
204	spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.
205
206	Some of the examples discussed above are implemented in the
207	\l{#code-examples}{code examples} section.
208
209	\target characters-and-abbreviations-for-sets-of-characters
210	\section1 Characters and Abbreviations for Sets of Characters
211
212	\table
213	\header \li Element \li Meaning
214	\row \li \b{c}
215	\li A character represents itself unless it has a special
216	regexp meaning. e.g. \b{c} matches the character \e c.
217	\row \li \b{\\c}
218	\li A character that follows a backslash matches the character
219	itself, except as specified below. e.g., To match a literal
220	caret at the beginning of a string, write \b{\\^}.
221	\row \li \b{\\a}
222	\li Matches the ASCII bell (BEL, 0x07).
223	\row \li \b{\\f}
224	\li Matches the ASCII form feed (FF, 0x0C).
225	\row \li \b{\\n}
226	\li Matches the ASCII line feed (LF, 0x0A, Unix newline).
227	\row \li \b{\\r}
228	\li Matches the ASCII carriage return (CR, 0x0D).
229	\row \li \b{\\t}
230	\li Matches the ASCII horizontal tab (HT, 0x09).
231	\row \li \b{\\v}
232	\li Matches the ASCII vertical tab (VT, 0x0B).
233	\row \li \b{\\x\e{hhhh}}
234	\li Matches the Unicode character corresponding to the
235	hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).
236	\row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo})
237	\li matches the ASCII/Latin1 character for the octal number
238	\e{ooo} (between 0 and 0377).
239	\row \li \b{. (dot)}
240	\li Matches any character (including newline).
241	\row \li \b{\\d}
242	\li Matches a digit (QChar::isDigit()).
243	\row \li \b{\\D}
244	\li Matches a non-digit.
245	\row \li \b{\\s}
246	\li Matches a whitespace character (QChar::isSpace()).
247	\row \li \b{\\S}
248	\li Matches a non-whitespace character.
249	\row \li \b{\\w}
250	\li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').
251	\row \li \b{\\W}
252	\li Matches a non-word character.
253	\row \li \b{\\\e{n}}
254	\li The \e{n}-th backreference, e.g. \\1, \\2, etc.
255	\endtable
256
257	\b{Note:} The C++ compiler transforms backslashes in strings.
258	To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}.
259	To match the backslash character itself, enter it four times, i.e.
260	\c{\\\\}.
261
262	\target sets-of-characters
263	\section1 Sets of Characters
264
265	Square brackets mean match any character contained in the square
266	brackets. The character set abbreviations described above can
267	appear in a character set in square brackets. Except for the
268	character set abbreviations and the following two exceptions,
269	characters do not have special meanings in square brackets.
270
271	\table
272	\row \li \b{^}
273
274	\li The caret negates the character set if it occurs as the
275	first character (i.e. immediately after the opening square
276	bracket). \b{[abc]} matches 'a' or 'b' or 'c', but
277	\b{[^abc]} matches anything \e but 'a' or 'b' or 'c'.
278
279	\row \li \b{-}
280
281	\li The dash indicates a range of characters. \b{[W-Z]}
282	matches 'W' or 'X' or 'Y' or 'Z'.
283
284	\endtable
285
286	Using the predefined character set abbreviations is more portable
287	than using character ranges across platforms and languages. For
288	example, \b{[0-9]} matches a digit in Western alphabets but
289	\b{\\d} matches a digit in \e any alphabet.
290
291	Note: In other regexp documentation, sets of characters are often
292	called "character classes".
293
294	\target quantifiers
295	\section1 Quantifiers
296
297	By default, an expression is automatically quantified by
298	\b{{1,1}}, i.e. it should occur exactly once. In the following
299	list, \b{\e {E}} stands for expression. An expression is a
300	character, or an abbreviation for a set of characters, or a set of
301	characters in square brackets, or an expression in parentheses.
302
303	\table
304	\row \li \b{\e {E}?}
305
306	\li Matches zero or one occurrences of \e E. This quantifier
307	means \e{The previous expression is optional}, because it
308	will match whether or not the expression is found. \b{\e
309	{E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?}
310	matches 'dent' or 'dents'.
311
312	\row \li \b{\e {E}+}
313
314	\li Matches one or more occurrences of \e E. \b{\e {E}+} is
315	the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0',
316	'00', '000', etc.
317
318	\row \li \b{\e {E}*}
319
320	\li Matches zero or more occurrences of \e E. It is the same
321	as \b{\e {E}{0,}}. The \b{*} quantifier is often used
322	in error where \b{+} should be used. For example, if
323	\b{\\s*$} is used in an expression to match strings that
324	end in whitespace, it will match every string because
325	\b{\\s*$} means \e{Match zero or more whitespaces followed
326	by end of string}. The correct regexp to match strings that
327	have at least one trailing whitespace character is
328	\b{\\s+$}.
329
330	\row \li \b{\e {E}{n}}
331
332	\li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}}
333	is the same as repeating \e E \e n times. For example,
334	\b{x{5}} is the same as \b{xxxxx}. It is also the same
335	as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}.
336
337	\row \li \b{\e {E}{n,}}
338	\li Matches at least \e n occurrences of \e E.
339
340	\row \li \b{\e {E}{,m}}
341	\li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}}
342	is the same as \b{\e {E}{0,m}}.
343
344	\row \li \b{\e {E}{n,m}}
345	\li Matches at least \e n and at most \e m occurrences of \e E.
346	\endtable
347
348	To apply a quantifier to more than just the preceding character,
349	use parentheses to group characters together in an expression. For
350	example, \b{tag+} matches a 't' followed by an 'a' followed by
351	at least one 'g', whereas \b{(tag)+} matches at least one
352	occurrence of 'tag'.
353
354	Note: Quantifiers are normally "greedy". They always match as much
355	text as they can. For example, \b{0+} matches the first zero it
356	finds and all the consecutive zeros after the first zero. Applied
357	to '20005', it matches '2\underline{000}5'. Quantifiers can be made
358	non-greedy, see setMinimal().
359
360	\target capturing parentheses
361	\target backreferences
362	\section1 Capturing Text
363
364	Parentheses allow us to group elements together so that we can
365	quantify and capture them. For example if we have the expression
366	\b{mail|letter|correspondence} that matches a string we know
367	that \e one of the words matched but not which one. Using
368	parentheses allows us to "capture" whatever is matched within
369	their bounds, so if we used \b{(mail|letter|correspondence)}
370	and matched this regexp against the string "I sent you some email"
371	we can use the cap() or capturedTexts() functions to extract the
372	matched characters, in this case 'mail'.
373
374	We can use captured text within the regexp itself. To refer to the
375	captured text we use \e backreferences which are indexed from 1,
376	the same as for cap(). For example we could search for duplicate
377	words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a
378	word boundary followed by one or more word characters followed by
379	one or more non-word characters followed by the same text as the
380	first parenthesized expression followed by a word boundary.
381
382	If we want to use parentheses purely for grouping and not for
383	capturing we can use the non-capturing syntax, e.g.
384	\b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and
385	end ')'. In this example we match either 'green' or 'blue' but we
386	do not capture the match so we only know whether or not we matched
387	but not which color we actually found. Using non-capturing
388	parentheses is more efficient than using capturing parentheses
389	since the regexp engine has to do less book-keeping.
390
391	Both capturing and non-capturing parentheses may be nested.
392
393	\target greedy quantifiers
394
395	For historical reasons, quantifiers (e.g. \b{*}) that apply to
396	capturing parentheses are more "greedy" than other quantifiers.
397	For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa".
398	This behavior is different from what other regexp engines do
399	(notably, Perl). To obtain a more intuitive capturing behavior,
400	specify QRegExp::RegExp2 to the QRegExp constructor or call
401	setPatternSyntax(QRegExp::RegExp2).
402
403	\target cap_in_a_loop
404
405	When the number of matches cannot be determined in advance, a
406	common idiom is to use cap() in a loop. For example:
407
408	\snippet code/src_corelib_text_qregexp.cpp 0
409
410	\target assertions
411	\section1 Assertions
412
413	Assertions make some statement about the text at the point where
414	they occur in the regexp but they do not match any characters. In
415	the following list \b{\e {E}} stands for any expression.
416
417	\table
418	\row \li \b{^}
419	\li The caret signifies the beginning of the string. If you
420	wish to match a literal \c{^} you must escape it by
421	writing \c{\\^}. For example, \b{^#include} will only
422	match strings which \e begin with the characters '#include'.
423	(When the caret is the first character of a character set it
424	has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.)
425
426	\row \li \b{$}
427	\li The dollar signifies the end of the string. For example
428	\b{\\d\\s*$} will match strings which end with a digit
429	optionally followed by whitespace. If you wish to match a
430	literal \c{$} you must escape it by writing
431	\c{\\$}.
432
433	\row \li \b{\\b}
434	\li A word boundary. For example the regexp
435	\b{\\bOK\\b} means match immediately after a word
436	boundary (e.g. start of string or whitespace) the letter 'O'
437	then the letter 'K' immediately before another word boundary
438	(e.g. end of string or whitespace). But note that the
439	assertion does not actually match any whitespace so if we
440	write \b{(\\bOK\\b)} and we have a match it will only
441	contain 'OK' even if the string is "It's \underline{OK} now".
442
443	\row \li \b{\\B}
444	\li A non-word boundary. This assertion is true wherever
445	\b{\\b} is false. For example if we searched for
446	\b{\\Bon\\B} in "Left on" the match would fail (space
447	and end of string aren't non-word boundaries), but it would
448	match in "t\underline{on}ne".
449
450	\row \li \b{(?=\e E)}
451	\li Positive lookahead. This assertion is true if the
452	expression matches at this point in the regexp. For example,
453	\b{const(?=\\s+char)} matches 'const' whenever it is
454	followed by 'char', as in 'static \underline{const} char *'.
455	(Compare with \b{const\\s+char}, which matches 'static
456	\underline{const char} *'.)
457
458	\row \li \b{(?!\e E)}
459	\li Negative lookahead. This assertion is true if the
460	expression does not match at this point in the regexp. For
461	example, \b{const(?!\\s+char)} matches 'const' \e except
462	when it is followed by 'char'.
463	\endtable
464
465	\target QRegExp wildcard matching
466	\section1 Wildcard Matching
467
468	Most command shells such as \e bash or \e cmd.exe support "file
469	globbing", the ability to identify a group of files by using
470	wildcards. The setPatternSyntax() function is used to switch
471	between regexp and wildcard mode. Wildcard matching is much
472	simpler than full regexps and has only four features:
473
474	\table
475	\row \li \b{c}
476	\li Any character represents itself apart from those mentioned
477	below. Thus \b{c} matches the character \e c.
478	\row \li \b{?}
479	\li Matches any single character. It is the same as
480	\b{.} in full regexps.
481	\row \li \b{*}
482	\li Matches zero or more of any characters. It is the
483	same as \b{.*} in full regexps.
484	\row \li \b{[...]}
485	\li Sets of characters can be represented in square brackets,
486	similar to full regexps. Within the character class, like
487	outside, backslash has no special meaning.
488	\endtable
489
490	In the mode Wildcard, the wildcard characters cannot be
491	escaped. In the mode WildcardUnix, the character '\\' escapes the
492	wildcard.
493
494	For example if we are in wildcard mode and have strings which
495	contain filenames we could identify HTML files with \b{*.html}.
496	This will match zero or more characters followed by a dot followed
497	by 'h', 't', 'm' and 'l'.
498
499	To test a string against a wildcard expression, use exactMatch().
500	For example:
501
502	\snippet code/src_corelib_text_qregexp.cpp 1
503
504	\target perl-users
505	\section1 Notes for Perl Users
506
507	Most of the character class abbreviations supported by Perl are
508	supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters}
509	{characters and abbreviations for sets of characters}.
510
511	In QRegExp, apart from within character classes, \c{^} always
512	signifies the start of the string, so carets must always be
513	escaped unless used for that purpose. In Perl the meaning of caret
514	varies automagically depending on where it occurs so escaping it
515	is rarely necessary. The same applies to \c{$} which in
516	QRegExp always signifies the end of the string.
517
518	QRegExp's quantifiers are the same as Perl's greedy quantifiers
519	(but see the \l{greedy quantifiers}{note above}). Non-greedy
520	matching cannot be applied to individual quantifiers, but can be
521	applied to all the quantifiers in the pattern. For example, to
522	match the Perl regexp \b{ro+?m} requires:
523
524	\snippet code/src_corelib_text_qregexp.cpp 2
525
526	The equivalent of Perl's \c{/i} option is
527	setCaseSensitivity(Qt::CaseInsensitive).
528
529	Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.
530
531	In QRegExp \b{.} matches any character, therefore all QRegExp
532	regexps have the equivalent of Perl's \c{/s} option. QRegExp
533	does not have an equivalent to Perl's \c{/m} option, but this
534	can be emulated in various ways for example by splitting the input
535	into lines or by looping with a regexp that searches for newlines.
536
537	Because QRegExp is string oriented, there are no \\A, \\Z, or \\z
538	assertions. The \\G assertion is not supported but can be emulated
539	in a loop.
540
541	Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp
542	equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,
543	... correspond to cap(1) or capturedTexts()[1], cap(2) or
544	capturedTexts()[2], etc.
545
546	To substitute a pattern use QString::replace().
547
548	Perl's extended \c{/x} syntax is not supported, nor are
549	directives, e.g. (?i), or regexp comments, e.g. (?#comment). On
550	the other hand, C++'s rules for literal strings can be used to
551	achieve the same:
552
553	\snippet code/src_corelib_text_qregexp.cpp 3
554
555	Both zero-width positive and zero-width negative lookahead
556	assertions (?=pattern) and (?!pattern) are supported with the same
557	syntax as Perl. Perl's lookbehind assertions, "independent"
558	subexpressions and conditional expressions are not supported.
559
560	Non-capturing parentheses are also supported, with the same
561	(?:pattern) syntax.
562
563	See QString::split() and QStringList::join() for equivalents
564	to Perl's split and join functions.
565
566	Note: because C++ transforms \\'s they must be written \e twice in
567	code, e.g. \b{\\b} must be written \b{\\\\b}.
568
569	\target code-examples
570	\section1 Code Examples
571
572	\snippet code/src_corelib_text_qregexp.cpp 4
573
574	The third string matches '\underline{6}'. This is a simple validation
575	regexp for integers in the range 0 to 99.
576
577	\snippet code/src_corelib_text_qregexp.cpp 5
578
579	The second string matches '\underline{This_is-OK}'. We've used the
580	character set abbreviation '\\S' (non-whitespace) and the anchors
581	to match strings which contain no whitespace.
582
583	In the following example we match strings containing 'mail' or
584	'letter' or 'correspondence' but only match whole words i.e. not
585	'email'
586
587	\snippet code/src_corelib_text_qregexp.cpp 6
588
589	The second string matches "Please write the \underline{letter}". The
590	word 'letter' is also captured (because of the parentheses). We
591	can see what text we've captured like this:
592
593	\snippet code/src_corelib_text_qregexp.cpp 7
594
595	This will capture the text from the first set of capturing
596	parentheses (counting capturing left parentheses from left to
597	right). The parentheses are counted from 1 since cap(0) is the
598	whole matched regexp (equivalent to '&' in most regexp engines).
599
600	\snippet code/src_corelib_text_qregexp.cpp 8
601
602	Here we've passed the QRegExp to QString's replace() function to
603	replace the matched text with new text.
604
605	\snippet code/src_corelib_text_qregexp.cpp 9
606
607	We've used the indexIn() function to repeatedly match the regexp in
608	the string. Note that instead of moving forward by one character
609	at a time \c pos++ we could have written \c {pos +=
610	rx.matchedLength()} to skip over the already matched string. The
611	count will equal 3, matching 'One \underline{Eric} another
612	\underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it
613	doesn't match 'Ericsson' or 'Eiriks' because they are not bounded
614	by non-word boundaries.
615
616	One common use of regexps is to split lines of delimited data into
617	their component fields.
618
619	\snippet code/src_corelib_text_qregexp.cpp 10
620
621	In this example our input lines have the format company name, web
622	address and country. Unfortunately the regexp is rather long and
623	not very versatile -- the code will break if we add any more
624	fields. A simpler and better solution is to look for the
625	separator, '\\t' in this case, and take the surrounding text. The
626	QString::split() function can take a separator string or regexp
627	as an argument and split a string accordingly.
628
629	\snippet code/src_corelib_text_qregexp.cpp 11
630
631	Here field[0] is the company, field[1] the web address and so on.
632
633	To imitate the matching of a shell we can use wildcard mode.
634
635	\snippet code/src_corelib_text_qregexp.cpp 12
636
637	Wildcard matching can be convenient because of its simplicity, but
638	any wildcard regexp can be defined using full regexps, e.g.
639	\b{.*\\.html$}. Notice that we can't match both \c .html and \c
640	.htm files with a wildcard unless we use \b{*.htm*} which will
641	also match 'test.html.bak'. A full regexp gives us the precision
642	we need, \b{.*\\.html?$}.
643
644	QRegExp can match case insensitively using setCaseSensitivity(),
645	and can use non-greedy matching, see setMinimal(). By
646	default QRegExp uses full regexps but this can be changed with
647	setPatternSyntax(). Searching can be done forward with indexIn() or backward
648	with lastIndexIn(). Captured text can be accessed using
649	capturedTexts() which returns a string list of all captured
650	strings, or using cap() which returns the captured string for the
651	given index. The pos() function takes a match index and returns
652	the position in the string where the match was made (or -1 if
653	there was no match).
654
655	\sa QString, QStringList, QSortFilterProxyModel
656
657	\section1 Porting to QRegularExpression
658
659	\include corelib/port-from-qregexp.qdocinc porting-to-qregularexpression
660	*/
661
// NOTE(review): presumably some VxWorks header defines an EOS macro; it is
// removed here so that it cannot clash with the EOS constant declared below.
#if defined(Q_OS_VXWORKS) && defined(EOS)
#  undef EOS
#endif

// BadChar() folds a character's UTF-16 code unit into the range
// [0, NumBadChars).
const int NumBadChars = 64;
#define BadChar(ch) ((ch).unicode() % NumBadChars)

// Sentinel values used by the engine (their uses lie outside this excerpt).
const int NoOccurrence = INT_MAX;
const int EmptyCapture = INT_MAX;
const int InftyLen = INT_MAX;
const int InftyRep = 1025;
const int EOS = -1;
674
675	static bool isWord(QChar ch)
676	{
677	return ch.isLetterOrNumber() \|\| ch.isMark() \|\| ch == QLatin1Char (`'_'`);
678	}
679
680	/*
681	Merges two vectors of ints and puts the result into the first
682	one.
683	*/
684	static void mergeInto(QList<int> a, const* QList<int> &b)
685	{
686	int asize = a->size();
687	int bsize = b.size();
688	if (asize == `0`) {
689	*a = b;
690	#ifndef QT_NO_REGEXP_OPTIM
691	} else if (bsize == `1` && a->at(i: asize - `1`) < b.at(i: `0`)) {
692	a->resize(size: asize + `1`);
693	(*a)[asize] = b.at(i: `0`);
694	#endif
695	} else if (bsize >= `1`) {
696	int csize = asize + bsize;
697	QList<int> c(csize);
698	int i = `0`, j = `0`, k = `0`;
699	while (i < asize) {
700	if (j < bsize) {
701	if (a->at(i) == b.at(i: j)) {
702	++i;
703	--csize;
704	} else if (a->at(i) < b.at(i: j)) {
705	c [k++] = a->at(i: i++);
706	} else {
707	c [k++] = b.at(i: j++);
708	}
709	} else {
710	memcpy(dest: c.data() + k, src: a->constData() + i, n: (asize - i) * sizeof(int));
711	break;
712	}
713	}
714	c.resize(size: csize);
715	if (j < bsize)
716	memcpy(dest: c.data() + k, src: b.constData() + j, n: (bsize - j) * sizeof(int));
717	*a = c;
718	}
719	}
720
721	#ifndef QT_NO_REGEXP_WILDCARD
722	/*
723	Translates a wildcard pattern to an equivalent regular expression
724	pattern (e.g., .cpp to .\.cpp).
725
726	If enableEscaping is true, it is possible to escape the wildcard
727	characters with \
728	*/
729	static QString wc2rx(const QString &wc_str, const bool enableEscaping)
730	{
731	const int wclen = wc_str.size();
732	QString rx;
733	int i = `0`;
734	bool isEscaping = false; // the previous character is '\'
735	const QChar *wc = wc_str.unicode();
736
737	while (i < wclen) {
738	const QChar c = wc[i++];
739	switch (c.unicode()) {
740	case `'\\'`:
741	if (enableEscaping) {
742	if (isEscaping) {
743	rx += QLatin1String ("\\\\");
744	} // we insert the \\ later if necessary
745	if (i == wclen) { // the end
746	rx += QLatin1String ("\\\\");
747	}
748	} else {
749	rx += QLatin1String ("\\\\");
750	}
751	isEscaping = true;
752	break;
753	case `'*'`:
754	if (isEscaping) {
755	rx += QLatin1String ("\\*");
756	isEscaping = false;
757	} else {
758	rx += QLatin1String (".*");
759	}
760	break;
761	case `'?'`:
762	if (isEscaping) {
763	rx += QLatin1String ("\\?");
764	isEscaping = false;
765	} else {
766	rx += QLatin1Char (`'.'`);
767	}
768
769	break;
770	case `'$'`:
771	case `'('`:
772	case `')'`:
773	case `'+'`:
774	case `'.'`:
775	case `'^'`:
776	case `'{'`:
777	case `'\|'`:
778	case `'}'`:
779	if (isEscaping) {
780	isEscaping = false;
781	rx += QLatin1String ("\\\\");
782	}
783	rx += QLatin1Char (`'\\'`);
784	rx += c;
785	break;
786	case `'['`:
787	if (isEscaping) {
788	isEscaping = false;
789	rx += QLatin1String ("\\[");
790	} else {
791	rx += c;
792	if (wc[i] == QLatin1Char (`'^'`))
793	rx += wc[i++];
794	if (i < wclen) {
795	if (wc[i] == QLatin1Char (`']'`))
796	rx += wc[i++];
797	while (i < wclen && wc[i] != QLatin1Char (`']'`)) {
798	if (wc[i] == QLatin1Char (`'\\'`))
799	rx += QLatin1Char (`'\\'`);
800	rx += wc[i++];
801	}
802	}
803	}
804	break;
805
806	case `']'`:
807	if (isEscaping){
808	isEscaping = false;
809	rx += QLatin1String ("\\");
810	}
811	rx += c;
812	break;
813
814	default:
815	if (isEscaping){
816	isEscaping = false;
817	rx += QLatin1String ("\\\\");
818	}
819	rx += c;
820	}
821	}
822	return rx;
823	}
824	#endif
825
826	static int caretIndex(int offset, QRegExp::CaretMode caretMode)
827	{
828	if (caretMode == QRegExp::CaretAtZero) {
829	return `0`;
830	} else if (caretMode == QRegExp::CaretAtOffset) {
831	return offset;
832	} else { // QRegExp::CaretWontMatch
833	return -`1`;
834	}
835	}
836
837	/*
838	The QRegExpEngineKey struct uniquely identifies an engine.
839	*/
840	struct QRegExpEngineKey
841	{
842	QString pattern;
843	QRegExp::PatternSyntax patternSyntax;
844	Qt::CaseSensitivity cs;
845
846	inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,
847	Qt::CaseSensitivity cs)
848	: pattern (pattern), patternSyntax(patternSyntax), cs(cs) {}
849
850	inline void clear() {
851	pattern.clear();
852	patternSyntax = QRegExp::RegExp;
853	cs = Qt::CaseSensitive;
854	}
855	};
856
857	static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)
858	{
859	return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax
860	&& key1.cs == key2.cs;
861	}
862
863	static size_t qHash(const QRegExpEngineKey &key, size_t seed = `0`) noexcept
864	{
865	return qHashMulti(seed, args: key.pattern, args: key.patternSyntax, args: key.cs);
866	}
867
868	class QRegExpEngine;
869
870	/*
871	This is the engine state during matching.
872	*/
873	struct QRegExpMatchState
874	{
875	const QChar in; // a pointer to the input string data*
876	int pos; // the current position in the string
877	int caretPos;
878	int len; // the length of the input string
879	bool minimal; // minimal matching?
880	int bigArray; // big array holding the data for the next pointers*
881	int inNextStack; // is state is nextStack?*
882	int curStack; // stack of current states*
883	int nextStack; // stack of next states*
884	int curCapBegin; // start of current states' captures*
885	int nextCapBegin; // start of next states' captures*
886	int curCapEnd; // end of current states' captures*
887	int nextCapEnd; // end of next states' captures*
888	int tempCapBegin; // start of temporary captures*
889	int tempCapEnd; // end of temporary captures*
890	int capBegin; // start of captures for a next state*
891	int capEnd; // end of captures for a next state*
892	int slideTab; // bump-along slide table for bad-character heuristic*
893	int captured; // what match() returned last*
894	int slideTabSize; // size of slide table
895	int capturedSize;
896	#ifndef QT_NO_REGEXP_BACKREF
897	QList<QList<int>> sleeping; // list of back-reference sleepers
898	#endif
899	int matchLen; // length of match
900	int oneTestMatchedLen; // length of partial match
901
902	const QRegExpEngine *eng;
903
904	inline QRegExpMatchState() : bigArray(nullptr), captured(nullptr) {}
905	inline ~QRegExpMatchState() { free(ptr: bigArray); }
906
907	void drain() { free(ptr: bigArray); bigArray = nullptr; captured = nullptr; } // to save memory
908	void prepareForMatch(QRegExpEngine *eng);
909	void match(const QChar str, int* len, int pos, bool minimal,
910	bool oneTest, int caretIndex);
911	bool matchHere();
912	bool testAnchor(int i, int a, const int *capBegin);
913	};
914
915	/*
916	The struct QRegExpAutomatonState represents one state in a modified NFA. The
917	input characters matched are stored in the state instead of on
918	the transitions, something possible for an automaton
919	constructed from a regular expression.
920	*/
921	struct QRegExpAutomatonState
922	{
923	#ifndef QT_NO_REGEXP_CAPTURE
924	int atom; // which atom does this state belong to?
925	#endif
926	int match; // what does it match? (see CharClassBit and BackRefBit)
927	QList<int> outs; // out-transitions
928	QMap<int, int> reenter; // atoms reentered when transiting out
929	QMap<int, int> anchors; // anchors met when transiting out
930
931	inline QRegExpAutomatonState() { }
932	#ifndef QT_NO_REGEXP_CAPTURE
933	inline QRegExpAutomatonState(int a, int m)
934	: atom(a), match(m) { }
935	#else
936	inline QRegExpAutomatonState(int m)
937	: match(m) { }
938	#endif
939	};
940
941	Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_RELOCATABLE_TYPE);
942
943	/*
944	The struct QRegExpCharClassRange represents a range of characters (e.g.,
945	[0-9] denotes range 48 to 57).
946	*/
947	struct QRegExpCharClassRange
948	{
949	ushort from; // 48
950	ushort len; // 10
951	};
952
953	Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
954
955	#ifndef QT_NO_REGEXP_CAPTURE
956	/*
957	The struct QRegExpAtom represents one node in the hierarchy of regular
958	expression atoms.
959	*/
960	struct QRegExpAtom
961	{
962	enum { NoCapture = -`1`, OfficialCapture = -`2`, UnofficialCapture = -`3` };
963
964	int parent; // index of parent in array of atoms
965	int capture; // index of capture, from 1 to ncap - 1
966	};
967
968	Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
969	#endif
970
971	struct QRegExpLookahead;
972
973	#ifndef QT_NO_REGEXP_ANCHOR_ALT
974	/*
975	The struct QRegExpAnchorAlternation represents a pair of anchors with
976	OR semantics.
977	*/
978	struct QRegExpAnchorAlternation
979	{
980	int a; // this anchor...
981	int b; // ...or this one
982	};
983
984	Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
985	#endif
986
987	#ifndef QT_NO_REGEXP_CCLASS
988
989	#define FLAG(x) (1 << (x))
990	/*
991	The class QRegExpCharClass represents a set of characters, such as can
992	be found in regular expressions (e.g., [a-z] denotes the set
993	{a, b, ..., z}).
994	*/
995	class QRegExpCharClass
996	{
997	public:
998	QRegExpCharClass();
999
1000	void clear();
1001	bool negative() const { return n; }
1002	void setNegative(bool negative);
1003	void addCategories(uint cats);
1004	void addRange(ushort from, ushort to);
1005	void addSingleton(ushort ch) { addRange(from: ch, to: ch); }
1006
1007	bool in(QChar ch) const;
1008	#ifndef QT_NO_REGEXP_OPTIM
1009	const QList<int> &firstOccurrence() const { return occ1; }
1010	#endif
1011
1012	#if defined(QT_DEBUG)
1013	void dump() const;
1014	#endif
1015
1016	private:
1017	QList<QRegExpCharClassRange> r; // character ranges
1018	#ifndef QT_NO_REGEXP_OPTIM
1019	QList<int> occ1; // first-occurrence array
1020	#endif
1021	uint c; // character classes
1022	bool n; // negative?
1023	};
1024	#else
1025	struct QRegExpCharClass
1026	{
1027	int dummy;
1028
1029	#ifndef QT_NO_REGEXP_OPTIM
1030	QRegExpCharClass() { occ1.fill(`0`, NumBadChars); }
1031
1032	const QList<int> &firstOccurrence() const { return occ1; }
1033	QList<int> occ1;
1034	#endif
1035	};
1036	#endif
1037
1038	Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_RELOCATABLE_TYPE);
1039
1040	/*
1041	The QRegExpEngine class encapsulates a modified nondeterministic
1042	finite automaton (NFA).
1043	*/
1044	class QRegExpEngine
1045	{
1046	public:
1047	QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)
1048	: cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }
1049
1050	QRegExpEngine(const QRegExpEngineKey &key);
1051	~QRegExpEngine();
1052
1053	bool isValid() const { return valid; }
1054	const QString &errorString() const { return yyError; }
1055	int captureCount() const { return officialncap; }
1056
1057	int createState(QChar ch);
1058	int createState(const QRegExpCharClass &cc);
1059	#ifndef QT_NO_REGEXP_BACKREF
1060	int createState(int bref);
1061	#endif
1062
1063	void addCatTransitions(const QList<int> &from, const QList<int> &to);
1064	#ifndef QT_NO_REGEXP_CAPTURE
1065	void addPlusTransitions(const QList<int> &from, const QList<int> &to, int atom);
1066	#endif
1067
1068	#ifndef QT_NO_REGEXP_ANCHOR_ALT
1069	int anchorAlternation(int a, int b);
1070	int anchorConcatenation(int a, int b);
1071	#else
1072	int anchorAlternation(int a, int b) { return a & b; }
1073	int anchorConcatenation(int a, int b) { return a \| b; }
1074	#endif
1075	void addAnchors(int from, int to, int a);
1076
1077	#ifndef QT_NO_REGEXP_OPTIM
1078	void heuristicallyChooseHeuristic();
1079	#endif
1080
1081	#if defined(QT_DEBUG)
1082	void dump() const;
1083	#endif
1084
1085	QAtomicInt ref;
1086
1087	private:
1088	enum { CharClassBit = `0x10000`, BackRefBit = `0x20000` };
1089	enum { InitialState = `0`, FinalState = `1` };
1090
1091	void setup();
1092	int setupState(int match);
1093
1094	/*
1095	Let's hope that 13 lookaheads and 14 back-references are
1096	enough.
1097	*/
1098	enum { MaxLookaheads = `13`, MaxBackRefs = `14` };
1099	enum { Anchor_Dollar = `0x00000001`, Anchor_Caret = `0x00000002`, Anchor_Word = `0x00000004`,
1100	Anchor_NonWord = `0x00000008`, Anchor_FirstLookahead = `0x00000010`,
1101	Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,
1102	Anchor_BackRef0Empty = Anchor_BackRef1Empty >> `1`,
1103	Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,
1104
1105	Anchor_LookaheadMask = (Anchor_FirstLookahead - `1`) ^
1106	((Anchor_FirstLookahead << MaxLookaheads) - `1`) };
1107	#ifndef QT_NO_REGEXP_CAPTURE
1108	int startAtom(bool officialCapture);
1109	void finishAtom(int atom, bool needCapture);
1110	#endif
1111
1112	#ifndef QT_NO_REGEXP_LOOKAHEAD
1113	int addLookahead(QRegExpEngine eng, bool* negative);
1114	#endif
1115
1116	#ifndef QT_NO_REGEXP_OPTIM
1117	bool goodStringMatch(QRegExpMatchState &matchState) const;
1118	bool badCharMatch(QRegExpMatchState &matchState) const;
1119	#else
1120	bool bruteMatch(QRegExpMatchState &matchState) const;
1121	#endif
1122
1123	QList<QRegExpAutomatonState> s; // array of states
1124	#ifndef QT_NO_REGEXP_CAPTURE
1125	QList<QRegExpAtom> f; // atom hierarchy
1126	int nf; // number of atoms
1127	int cf; // current atom
1128	QList<int> captureForOfficialCapture;
1129	#endif
1130	int officialncap; // number of captures, seen from the outside
1131	int ncap; // number of captures, seen from the inside
1132	#ifndef QT_NO_REGEXP_CCLASS
1133	QList<QRegExpCharClass> cl; // array of character classes
1134	#endif
1135	#ifndef QT_NO_REGEXP_LOOKAHEAD
1136	QList<QRegExpLookahead > ahead; // array of lookaheads*
1137	#endif
1138	#ifndef QT_NO_REGEXP_ANCHOR_ALT
1139	QList<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors
1140	#endif
1141	#ifndef QT_NO_REGEXP_OPTIM
1142	bool caretAnchored; // does the regexp start with ^?
1143	bool trivial; // is the good-string all that needs to match?
1144	#endif
1145	bool valid; // is the regular expression valid?
1146	Qt::CaseSensitivity cs; // case sensitive?
1147	bool greedyQuantifiers; // RegExp2?
1148	bool xmlSchemaExtensions;
1149	#ifndef QT_NO_REGEXP_BACKREF
1150	int nbrefs; // number of back-references
1151	#endif
1152
1153	#ifndef QT_NO_REGEXP_OPTIM
1154	bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch
1155
1156	int goodEarlyStart; // the index where goodStr can first occur in a match
1157	int goodLateStart; // the index where goodStr can last occur in a match
1158	QString goodStr; // the string that any match has to contain
1159
1160	int minl; // the minimum length of a match
1161	QList<int> occ1; // first-occurrence array
1162	#endif
1163
1164	/*
1165	The class Box is an abstraction for a regular expression
1166	fragment. It can also be seen as one node in the syntax tree of
1167	a regular expression with synthetized attributes.
1168
1169	Its interface is ugly for performance reasons.
1170	*/
1171	class Box
1172	{
1173	public:
1174	Box(QRegExpEngine *engine);
1175	Box(const Box &b) { operator=(b); }
1176
1177	Box &operator=(const Box &b);
1178
1179	void clear() { operator=(b: Box (eng)); }
1180	void set(QChar ch);
1181	void set(const QRegExpCharClass &cc);
1182	#ifndef QT_NO_REGEXP_BACKREF
1183	void set(int bref);
1184	#endif
1185
1186	void cat(const Box &b);
1187	void orx(const Box &b);
1188	void plus(int atom);
1189	void opt();
1190	void catAnchor(int a);
1191	#ifndef QT_NO_REGEXP_OPTIM
1192	void setupHeuristics();
1193	#endif
1194
1195	#if defined(QT_DEBUG)
1196	void dump() const;
1197	#endif
1198
1199	private:
1200	void addAnchorsToEngine(const Box &to) const;
1201
1202	QRegExpEngine eng; // the automaton under construction*
1203	QList<int> ls; // the left states (firstpos)
1204	QList<int> rs; // the right states (lastpos)
1205	QMap<int, int> lanchors; // the left anchors
1206	QMap<int, int> ranchors; // the right anchors
1207	int skipanchors; // the anchors to match if the box is skipped
1208
1209	#ifndef QT_NO_REGEXP_OPTIM
1210	int earlyStart; // the index where str can first occur
1211	int lateStart; // the index where str can last occur
1212	QString str; // a string that has to occur in any match
1213	QString leftStr; // a string occurring at the left of this box
1214	QString rightStr; // a string occurring at the right of this box
1215	int maxl; // the maximum length of this box (possibly InftyLen)
1216	#endif
1217
1218	int minl; // the minimum length of this box
1219	#ifndef QT_NO_REGEXP_OPTIM
1220	QList<int> occ1; // first-occurrence array
1221	#endif
1222	};
1223
1224	friend class Box;
1225
1226	/*
1227	This is the lexical analyzer for regular expressions.
1228	*/
1229	enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,
1230	Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,
1231	Tok_Word, Tok_NonWord, Tok_Char = `0x10000`, Tok_BackRef = `0x20000` };
1232	int getChar();
1233	int getEscape();
1234	#ifndef QT_NO_REGEXP_INTERVAL
1235	int getRep(int def);
1236	#endif
1237	#ifndef QT_NO_REGEXP_LOOKAHEAD
1238	void skipChars(int n);
1239	#endif
1240	void error(const char *msg);
1241	void startTokenizer(const QChar rx, int* len);
1242	int getToken();
1243
1244	const QChar yyIn; // a pointer to the input regular expression pattern*
1245	int yyPos0; // the position of yyTok in the input pattern
1246	int yyPos; // the position of the next character to read
1247	int yyLen; // the length of yyIn
1248	int yyCh; // the last character read
1249	QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens
1250	int yyMinRep; // attribute for Tok_Quantifier
1251	int yyMaxRep; // ditto
1252	QString yyError; // syntax error or overflow during parsing?
1253
1254	/*
1255	This is the syntactic analyzer for regular expressions.
1256	*/
1257	int parse(const QChar rx, int* len);
1258	void parseAtom(Box *box);
1259	void parseFactor(Box *box);
1260	void parseTerm(Box *box);
1261	void parseExpression(Box *box);
1262
1263	int yyTok; // the last token read
1264	bool yyMayCapture; // set this to false to disable capturing
1265
1266	friend struct QRegExpMatchState;
1267	};
1268
1269	#ifndef QT_NO_REGEXP_LOOKAHEAD
1270	/*
1271	The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
1272	(?=foo) and (?!bar)).
1273	*/
1274	struct QRegExpLookahead
1275	{
1276	QRegExpEngine eng; // NFA representing the embedded regular expression*
1277	bool neg; // negative lookahead?
1278
1279	inline QRegExpLookahead(QRegExpEngine eng0, bool* neg0)
1280	: eng(eng0), neg(neg0) { }
1281	inline ~QRegExpLookahead() { delete eng; }
1282	};
1283	#endif
1284
1285	/!*
1286	\internal
1287	convert the pattern string to the RegExp syntax.
1288
1289	This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan
1290	*/
1291	Q_CORE5COMPAT_EXPORT QString qt_regexp_toCanonical(const QString &pattern,
1292	QRegExp::PatternSyntax patternSyntax)
1293	{
1294	switch (patternSyntax) {
1295	#ifndef QT_NO_REGEXP_WILDCARD
1296	case QRegExp::Wildcard:
1297	return wc2rx(wc_str: pattern, enableEscaping: false);
1298	case QRegExp::WildcardUnix:
1299	return wc2rx(wc_str: pattern, enableEscaping: true);
1300	#endif
1301	case QRegExp::FixedString:
1302	return QRegExp::escape(str: pattern);
1303	case QRegExp::W3CXmlSchema11:
1304	default:
1305	return pattern;
1306	}
1307	}
1308
1309	QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
1310	: cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
1311	xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
1312	{
1313	setup();
1314
1315	QString rx = qt_regexp_toCanonical(pattern: key.pattern, patternSyntax: key.patternSyntax);
1316
1317	valid = (parse(rx: rx.unicode(), len: rx.size()) == rx.size());
1318	if (!valid) {
1319	#ifndef QT_NO_REGEXP_OPTIM
1320	trivial = false;
1321	#endif
1322	error(RXERR_LEFTDELIM);
1323	}
1324	}
1325
1326	QRegExpEngine::~QRegExpEngine()
1327	{
1328	#ifndef QT_NO_REGEXP_LOOKAHEAD
1329	qDeleteAll(c: ahead);
1330	#endif
1331	}
1332
1333	void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
1334	{
1335	/*
1336	We use one QList<int> for all the big data used a lot in
1337	matchHere() and friends.
1338	*/
1339	int ns = eng->s.size(); // number of states
1340	int ncap = eng->ncap;
1341	#ifndef QT_NO_REGEXP_OPTIM
1342	int newSlideTabSize = qMax(a: eng->minl + `1`, b: `16`);
1343	#else
1344	int newSlideTabSize = `0`;
1345	#endif
1346	int numCaptures = eng->captureCount();
1347	int newCapturedSize = `2` + `2` * numCaptures;
1348	bigArray = q_check_ptr(p: (int )realloc(ptr: bigArray, size: ((`3` + `4` ncap) * ns + `4` * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));
1349
1350	// set all internal variables only _after_ bigArray is realloc'ed
1351	// to prevent a broken regexp in oom case
1352
1353	slideTabSize = newSlideTabSize;
1354	capturedSize = newCapturedSize;
1355	inNextStack = bigArray;
1356	memset(s: inNextStack, c: -`1`, n: ns * sizeof(int));
1357	curStack = inNextStack + ns;
1358	nextStack = inNextStack + `2` * ns;
1359
1360	curCapBegin = inNextStack + `3` * ns;
1361	nextCapBegin = curCapBegin + ncap * ns;
1362	curCapEnd = curCapBegin + `2` * ncap * ns;
1363	nextCapEnd = curCapBegin + `3` * ncap * ns;
1364
1365	tempCapBegin = curCapBegin + `4` * ncap * ns;
1366	tempCapEnd = tempCapBegin + ncap;
1367	capBegin = tempCapBegin + `2` * ncap;
1368	capEnd = tempCapBegin + `3` * ncap;
1369
1370	slideTab = tempCapBegin + `4` * ncap;
1371	captured = slideTab + slideTabSize;
1372	memset(s: captured, c: -`1`, n: capturedSize*sizeof(int));
1373	this->eng = eng;
1374	}
1375
1376	/*
1377	Tries to match in str and returns an array of (begin, length) pairs
1378	for captured text. If there is no match, all pairs are (-1, -1).
1379	*/
1380	void QRegExpMatchState::match(const QChar str0, int* len0, int pos0,
1381	bool minimal0, bool oneTest, int caretIndex)
1382	{
1383	bool matched = false;
1384	QChar char_null;
1385
1386	#ifndef QT_NO_REGEXP_OPTIM
1387	if (eng->trivial && !oneTest) {
1388	// ### Qt6: qsizetype
1389	pos = int(QtPrivate::findString(haystack: QStringView (str0, len0), from: pos0, needle: QStringView (eng->goodStr.unicode(), eng->goodStr.size()), cs: eng->cs));
1390	matchLen = eng->goodStr.size();
1391	matched = (pos != -`1`);
1392	} else
1393	#endif
1394	{
1395	in = str0;
1396	if (in == nullptr)
1397	in = &char_null;
1398	pos = pos0;
1399	caretPos = caretIndex;
1400	len = len0;
1401	minimal = minimal0;
1402	matchLen = `0`;
1403	oneTestMatchedLen = `0`;
1404
1405	if (eng->valid && pos >= `0` && pos <= len) {
1406	#ifndef QT_NO_REGEXP_OPTIM
1407	if (oneTest) {
1408	matched = matchHere();
1409	} else {
1410	if (pos <= len - eng->minl) {
1411	if (eng->caretAnchored) {
1412	matched = matchHere();
1413	} else if (eng->useGoodStringHeuristic) {
1414	matched = eng->goodStringMatch(matchState&: *this);
1415	} else {
1416	matched = eng->badCharMatch(matchState&: *this);
1417	}
1418	}
1419	}
1420	#else
1421	matched = oneTest ? matchHere() : eng->bruteMatch(*this);
1422	#endif
1423	}
1424	}
1425
1426	if (matched) {
1427	int *c = captured;
1428	*c++ = pos;
1429	*c++ = matchLen;
1430
1431	int numCaptures = (capturedSize - `2`) >> `1`;
1432	#ifndef QT_NO_REGEXP_CAPTURE
1433	for (int i = `0`; i < numCaptures; ++i) {
1434	int j = eng->captureForOfficialCapture.at(i);
1435	if (capBegin[j] != EmptyCapture) {
1436	int len = capEnd[j] - capBegin[j];
1437	*c++ = (len > `0`) ? pos + capBegin[j] : `0`;
1438	*c++ = len;
1439	} else {
1440	*c++ = -`1`;
1441	*c++ = -`1`;
1442	}
1443	}
1444	#endif
1445	} else {
1446	// we rely on 2's complement here
1447	memset(s: captured, c: -`1`, n: capturedSize * sizeof(int));
1448	}
1449	}
1450
1451	/*
1452	The three following functions add one state to the automaton and
1453	return the number of the state.
1454	*/
1455
1456	int QRegExpEngine::createState(QChar ch)
1457	{
1458	return setupState(ch.unicode());
1459	}
1460
1461	int QRegExpEngine::createState(const QRegExpCharClass &cc)
1462	{
1463	#ifndef QT_NO_REGEXP_CCLASS
1464	int n = cl.size();
1465	cl += QRegExpCharClass (cc);
1466	return setupState(CharClassBit \| n);
1467	#else
1468	Q_UNUSED(cc);
1469	return setupState(CharClassBit);
1470	#endif
1471	}
1472
1473	#ifndef QT_NO_REGEXP_BACKREF
1474	int QRegExpEngine::createState(int bref)
1475	{
1476	if (bref > nbrefs) {
1477	nbrefs = bref;
1478	if (nbrefs > MaxBackRefs) {
1479	error(RXERR_LIMIT);
1480	return `0`;
1481	}
1482	}
1483	return setupState(BackRefBit \| bref);
1484	}
1485	#endif
1486
1487	/*
1488	The two following functions add a transition between all pairs of
1489	states (i, j) where i is found in from, and j is found in to.
1490
1491	Cat-transitions are distinguished from plus-transitions for
1492	capturing.
1493	*/
1494
1495	void QRegExpEngine::addCatTransitions(const QList<int> &from, const QList<int> &to)
1496	{
1497	for (int i = `0`; i < from.size(); i++)
1498	mergeInto(a: &s [from.at(i)].outs, b: to);
1499	}
1500
1501	#ifndef QT_NO_REGEXP_CAPTURE
1502	void QRegExpEngine::addPlusTransitions(const QList<int> &from, const QList<int> &to, int atom)
1503	{
1504	for (int i = `0`; i < from.size(); i++) {
1505	QRegExpAutomatonState &st = s [from.at(i)];
1506	const QList<int> oldOuts = st.outs;
1507	mergeInto(a: &st.outs, b: to);
1508	if (f.at(i: atom).capture != QRegExpAtom::NoCapture) {
1509	for (int j = `0`; j < to.size(); j++) {
1510	// ### st.reenter.contains(to.at(j)) check looks suspicious
1511	if (!st.reenter.contains(key: to.at(i: j)) &&
1512	!std::binary_search(first: oldOuts.constBegin(), last: oldOuts.constEnd(), val: to.at(i: j)))
1513	st.reenter.insert(key: to.at(i: j), value: atom);
1514	}
1515	}
1516	}
1517	}
1518	#endif
1519
1520	#ifndef QT_NO_REGEXP_ANCHOR_ALT
1521	/*
1522	Returns an anchor that means a OR b.
1523	*/
1524	int QRegExpEngine::anchorAlternation(int a, int b)
1525	{
1526	if (((a & b) == a \|\| (a & b) == b) && ((a \| b) & Anchor_Alternation) == `0`)
1527	return a & b;
1528
1529	int n = aa.size();
1530	#ifndef QT_NO_REGEXP_OPTIM
1531	if (n > `0` && aa.at(i: n - `1`).a == a && aa.at(i: n - `1`).b == b)
1532	return Anchor_Alternation \| (n - `1`);
1533	#endif
1534
1535	QRegExpAnchorAlternation element = {.a: a, .b: b};
1536	aa.append(t: element);
1537	return Anchor_Alternation \| n;
1538	}
1539
1540	/*
1541	Returns an anchor that means a AND b.
1542	*/
1543	int QRegExpEngine::anchorConcatenation(int a, int b)
1544	{
1545	if (((a \| b) & Anchor_Alternation) == `0`)
1546	return a \| b;
1547	if ((b & Anchor_Alternation) != `0`)
1548	qSwap(value1&: a, value2&: b);
1549
1550	int aprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).a, b);
1551	int bprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).b, b);
1552	return anchorAlternation(a: aprime, b: bprime);
1553	}
1554	#endif
1555
1556	/*
1557	Adds anchor a on a transition caracterised by its from state and
1558	its to state.
1559	*/
1560	void QRegExpEngine::addAnchors(int from, int to, int a)
1561	{
1562	QRegExpAutomatonState &st = s [from];
1563	if (st.anchors.contains(key: to))
1564	a = anchorAlternation(a: st.anchors.value(key: to), b: a);
1565	st.anchors.insert(key: to, value: a);
1566	}
1567
1568	#ifndef QT_NO_REGEXP_OPTIM
1569	/*
1570	This function chooses between the good-string and the bad-character
1571	heuristics. It computes two scores and chooses the heuristic with
1572	the highest score.
1573
1574	Here are some common-sense constraints on the scores that should be
1575	respected if the formulas are ever modified: (1) If goodStr is
1576	empty, the good-string heuristic scores 0. (2) If the regular
1577	expression is trivial, the good-string heuristic should be used.
1578	(3) If the search is case insensitive, the good-string heuristic
1579	should be used, unless it scores 0. (Case insensitivity turns all
1580	entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is
1581	big, the good-string heuristic should score less.
1582	*/
1583	void QRegExpEngine::heuristicallyChooseHeuristic()
1584	{
1585	if (minl == `0`) {
1586	useGoodStringHeuristic = false;
1587	} else if (trivial) {
1588	useGoodStringHeuristic = true;
1589	} else {
1590	/*
1591	Magic formula: The good string has to constitute a good
1592	proportion of the minimum-length string, and appear at a
1593	more-or-less known index.
1594	*/
1595	int goodStringScore = (`64` * goodStr.size() / minl) -
1596	(goodLateStart - goodEarlyStart);
1597	/*
1598	Less magic formula: We pick some characters at random, and
1599	check whether they are good or bad.
1600	*/
1601	int badCharScore = `0`;
1602	int step = qMax(a: `1`, b: NumBadChars / `32`);
1603	for (int i = `1`; i < NumBadChars; i += step) {
1604	if (occ1.at(i) == NoOccurrence)
1605	badCharScore += minl;
1606	else
1607	badCharScore += occ1.at(i);
1608	}
1609	badCharScore /= minl;
1610	useGoodStringHeuristic = (goodStringScore > badCharScore);
1611	}
1612	}
1613	#endif
1614
#if defined(QT_DEBUG)
/*
  Debug builds only: prints the engine with qDebug(), i.e. every
  state with what it matches and its out-transitions, then the atom
  table and the anchor alternations.
*/
void QRegExpEngine::dump() const
{
    int i, j;
    qDebug("Case %ssensitive engine", cs ? "" : "in");
    qDebug(" States");
    for (i = 0; i < s.size(); i++) {
        qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");
#ifndef QT_NO_REGEXP_CAPTURE
        if (nf > 0)
            qDebug(" in atom %d", s[i].atom);
#endif
        int m = s[i].match;
        if ((m & CharClassBit) != 0) {
            qDebug(" match character class %d", m ^ CharClassBit);
#ifndef QT_NO_REGEXP_CCLASS
            cl[m ^ CharClassBit].dump();
#else
            qDebug(" negative character class");
#endif
        } else if ((m & BackRefBit) != 0) {
            qDebug(" match back-reference %d", m ^ BackRefBit);
        } else if (m >= 0x20 && m <= 0x7e) {
            // printable ASCII: show the character as well
            qDebug(" match 0x%.4x (%c)", m, m);
        } else {
            qDebug(" match 0x%.4x", m);
        }
        for (j = 0; j < s[i].outs.size(); j++) {
            int next = s[i].outs[j];
            qDebug(" -> %d", next);
            if (s[i].reenter.contains(next))
                qDebug(" [reenter %d]", s[i].reenter[next]);
            if (s[i].anchors.value(next) != 0)
                qDebug(" [anchors 0x%.8x]", s[i].anchors[next]);
        }
    }
#ifndef QT_NO_REGEXP_CAPTURE
    if (nf > 0) {
        qDebug(" Atom Parent Capture");
        for (i = 0; i < nf; i++) {
            if (f[i].capture == QRegExpAtom::NoCapture) {
                qDebug(" %6d %6d nil", i, f[i].parent);
            } else {
                int cap = f[i].capture;
                bool official = captureForOfficialCapture.contains(cap);
                qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture,
                       official ? "official" : "");
            }
        }
    }
#endif
#ifndef QT_NO_REGEXP_ANCHOR_ALT
    // both anchors of a pair are printed with the same width
    for (i = 0; i < aa.size(); i++)
        qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.8x", i, aa[i].a, aa[i].b);
#endif
}
#endif
1672
1673	void QRegExpEngine::setup()
1674	{
1675	ref.storeRelaxed(newValue: `1`);
1676	#ifndef QT_NO_REGEXP_CAPTURE
1677	f.resize(size: `32`);
1678	nf = `0`;
1679	cf = -`1`;
1680	#endif
1681	officialncap = `0`;
1682	ncap = `0`;
1683	#ifndef QT_NO_REGEXP_OPTIM
1684	caretAnchored = true;
1685	trivial = true;
1686	#endif
1687	valid = false;
1688	#ifndef QT_NO_REGEXP_BACKREF
1689	nbrefs = `0`;
1690	#endif
1691	#ifndef QT_NO_REGEXP_OPTIM
1692	useGoodStringHeuristic = true;
1693	minl = `0`;
1694	occ1.fill(t: `0`, newSize: NumBadChars);
1695	#endif
1696	}
1697
1698	int QRegExpEngine::setupState(int match)
1699	{
1700	#ifndef QT_NO_REGEXP_CAPTURE
1701	s += QRegExpAutomatonState (cf, match);
1702	#else
1703	s += QRegExpAutomatonState(match);
1704	#endif
1705	return s.size() - `1`;
1706	}
1707
1708	#ifndef QT_NO_REGEXP_CAPTURE
1709	/*
1710	Functions startAtom() and finishAtom() should be called to delimit
1711	atoms. When a state is created, it is assigned to the current atom.
1712	The information is later used for capturing.
1713	*/
1714	int QRegExpEngine::startAtom(bool officialCapture)
1715	{
1716	if ((nf & (nf + `1`)) == `0` && nf + `1` >= f.size())
1717	f.resize(size: (nf + `1`) << `1`);
1718	f [nf].parent = cf;
1719	cf = nf++;
1720	f [cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;
1721	return cf;
1722	}
1723
1724	void QRegExpEngine::finishAtom(int atom, bool needCapture)
1725	{
1726	if (greedyQuantifiers && needCapture && f [atom].capture == QRegExpAtom::NoCapture)
1727	f [atom].capture = QRegExpAtom::UnofficialCapture;
1728	cf = f.at(i: atom).parent;
1729	}
1730	#endif
1731
1732	#ifndef QT_NO_REGEXP_LOOKAHEAD
1733	/*
1734	Creates a lookahead anchor.
1735	*/
1736	int QRegExpEngine::addLookahead(QRegExpEngine eng, bool* negative)
1737	{
1738	int n = ahead.size();
1739	if (n == MaxLookaheads) {
1740	error(RXERR_LIMIT);
1741	return `0`;
1742	}
1743	ahead += new QRegExpLookahead (eng, negative);
1744	return Anchor_FirstLookahead << n;
1745	}
1746	#endif
1747
#ifndef QT_NO_REGEXP_CAPTURE
/*
  We want the longest leftmost captures.

  Compares two sets of ncap captures, given as parallel arrays of
  begin and end positions, and returns true if set 1 is better than
  set 2. The first capture in which the two sets differ decides: an
  earlier begin wins, and for equal begins a later end wins. Returns
  false if the sets are identical.
*/
static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
                            const int *end2)
{
    for (int i = 0; i < ncap; i++) {
        int delta = begin2[i] - begin1[i]; // it has to start early...
        if (delta == 0)
            delta = end1[i] - end2[i]; // ...and end late

        if (delta != 0)
            return delta > 0;
    }
    return false;
}
#endif
1766
1767	/*
1768	Returns \c true if anchor a matches at position pos + i in the input
1769	string, otherwise false.
1770	*/
1771	bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)
1772	{
1773	int j;
1774
1775	#ifndef QT_NO_REGEXP_ANCHOR_ALT
1776	if ((a & QRegExpEngine::Anchor_Alternation) != `0`)
1777	return testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)
1778	\|\| testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);
1779	#endif
1780
1781	if ((a & QRegExpEngine::Anchor_Caret) != `0`) {
1782	if (pos + i != caretPos)
1783	return false;
1784	}
1785	if ((a & QRegExpEngine::Anchor_Dollar) != `0`) {
1786	if (pos + i != len)
1787	return false;
1788	}
1789	#ifndef QT_NO_REGEXP_ESCAPE
1790	if ((a & (QRegExpEngine::Anchor_Word \| QRegExpEngine::Anchor_NonWord)) != `0`) {
1791	bool before = false;
1792	bool after = false;
1793	if (pos + i != `0`)
1794	before = isWord(ch: in[pos + i - `1`]);
1795	if (pos + i != len)
1796	after = isWord(ch: in[pos + i]);
1797	if ((a & QRegExpEngine::Anchor_Word) != `0` && (before == after))
1798	return false;
1799	if ((a & QRegExpEngine::Anchor_NonWord) != `0` && (before != after))
1800	return false;
1801	}
1802	#endif
1803	#ifndef QT_NO_REGEXP_LOOKAHEAD
1804	if ((a & QRegExpEngine::Anchor_LookaheadMask) != `0`) {
1805	const QList<QRegExpLookahead *> &ahead = eng->ahead;
1806	for (j = `0`; j < ahead.size(); j++) {
1807	if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != `0`) {
1808	QRegExpMatchState matchState;
1809	matchState.prepareForMatch(eng: ahead [j]->eng);
1810	matchState.match(str0: in + pos + i, len0: len - pos - i, pos0: `0`,
1811	minimal0: true, oneTest: true, caretIndex: caretPos - pos - i);
1812	if ((matchState.captured[`0`] == `0`) == ahead [j]->neg)
1813	return false;
1814	}
1815	}
1816	}
1817	#endif
1818	#ifndef QT_NO_REGEXP_CAPTURE
1819	#ifndef QT_NO_REGEXP_BACKREF
1820	for (j = `0`; j < eng->nbrefs; j++) {
1821	if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != `0`) {
1822	int i = eng->captureForOfficialCapture.at(i: j);
1823	if (capBegin[i] != EmptyCapture)
1824	return false;
1825	}
1826	}
1827	#endif
1828	#endif
1829	return true;
1830	}
1831
1832	#ifndef QT_NO_REGEXP_OPTIM
1833	/*
1834	The three following functions are what Jeffrey Friedl would call
1835	transmissions (or bump-alongs). Using one or the other should make
1836	no difference except in performance.
1837	*/
1838
1839	bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
1840	{
1841	int k = matchState.pos + goodEarlyStart;
1842	QStringMatcher matcher(goodStr.unicode(), goodStr.size(), cs);
1843	while ((k = matcher.indexIn(str: matchState.in, length: matchState.len, from: k)) != -`1`) {
1844	int from = k - goodLateStart;
1845	int to = k - goodEarlyStart;
1846	if (from > matchState.pos)
1847	matchState.pos = from;
1848
1849	while (matchState.pos <= to) {
1850	if (matchState.matchHere())
1851	return true;
1852	++matchState.pos;
1853	}
1854	++k;
1855	}
1856	return false;
1857	}
1858
1859	bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
1860	{
1861	int slideHead = `0`;
1862	int slideNext = `0`;
1863	int i;
1864	int lastPos = matchState.len - minl;
1865	memset(s: matchState.slideTab, c: `0`, n: matchState.slideTabSize * sizeof(int));
1866
1867	/*
1868	Set up the slide table, used for the bad-character heuristic,
1869	using the table of first occurrence of each character.
1870	*/
1871	for (i = `0`; i < minl; i++) {
1872	int sk = occ1 [BadChar(matchState.in[matchState.pos + i])];
1873	if (sk == NoOccurrence)
1874	sk = i + `1`;
1875	if (sk > `0`) {
1876	int k = i + `1` - sk;
1877	if (k < `0`) {
1878	sk = i + `1`;
1879	k = `0`;
1880	}
1881	if (sk > matchState.slideTab[k])
1882	matchState.slideTab[k] = sk;
1883	}
1884	}
1885
1886	if (matchState.pos > lastPos)
1887	return false;
1888
1889	for (;;) {
1890	if (++slideNext >= matchState.slideTabSize)
1891	slideNext = `0`;
1892	if (matchState.slideTab[slideHead] > `0`) {
1893	if (matchState.slideTab[slideHead] - `1` > matchState.slideTab[slideNext])
1894	matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - `1`;
1895	matchState.slideTab[slideHead] = `0`;
1896	} else {
1897	if (matchState.matchHere())
1898	return true;
1899	}
1900
1901	if (matchState.pos == lastPos)
1902	break;
1903
1904	/*
1905	Update the slide table. This code has much in common with
1906	the initialization code.
1907	*/
1908	int sk = occ1 [BadChar(matchState.in[matchState.pos + minl])];
1909	if (sk == NoOccurrence) {
1910	matchState.slideTab[slideNext] = minl;
1911	} else if (sk > `0`) {
1912	int k = slideNext + minl - sk;
1913	if (k >= matchState.slideTabSize)
1914	k -= matchState.slideTabSize;
1915	if (sk > matchState.slideTab[k])
1916	matchState.slideTab[k] = sk;
1917	}
1918	slideHead = slideNext;
1919	++matchState.pos;
1920	}
1921	return false;
1922	}
1923	#else
1924	bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
1925	{
1926	while (matchState.pos <= matchState.len) {
1927	if (matchState.matchHere())
1928	return true;
1929	++matchState.pos;
1930	}
1931	return false;
1932	}
1933	#endif
1934
1935	/*
1936	Here's the core of the engine. It tries to do a match here and now.
1937	*/
1938	bool QRegExpMatchState::matchHere()
1939	{
1940	int ncur = `1`, nnext = `0`;
1941	int i = `0`, j, k, m;
1942	bool stop = false;
1943
1944	matchLen = -`1`;
1945	oneTestMatchedLen = -`1`;
1946	curStack[`0`] = QRegExpEngine::InitialState;
1947
1948	int ncap = eng->ncap;
1949	#ifndef QT_NO_REGEXP_CAPTURE
1950	if (ncap > `0`) {
1951	for (j = `0`; j < ncap; j++) {
1952	curCapBegin[j] = EmptyCapture;
1953	curCapEnd[j] = EmptyCapture;
1954	}
1955	}
1956	#endif
1957
1958	#ifndef QT_NO_REGEXP_BACKREF
1959	while ((ncur > `0` \|\| !sleeping.isEmpty()) && i <= len - pos && !stop)
1960	#else
1961	while (ncur > `0` && i <= len - pos && !stop)
1962	#endif
1963	{
1964	int ch = (i < len - pos) ? in[pos + i].unicode() : `0`;
1965	for (j = `0`; j < ncur; j++) {
1966	int cur = curStack[j];
1967	const QRegExpAutomatonState &scur = eng->s.at(i: cur);
1968	const QList<int> &outs = scur.outs;
1969	for (k = `0`; k < outs.size(); k++) {
1970	int next = outs.at(i: k);
1971	const QRegExpAutomatonState &snext = eng->s.at(i: next);
1972	bool inside = true;
1973	#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
1974	int needSomeSleep = `0`;
1975	#endif
1976
1977	/*
1978	First, check if the anchors are anchored properly.
1979	*/
1980	int a = scur.anchors.value(key: next);
1981	if (a != `0` && !testAnchor(i, a, capBegin: curCapBegin + j * ncap))
1982	inside = false;
1983
1984	/*
1985	If indeed they are, check if the input character is
1986	correct for this transition.
1987	*/
1988	if (inside) {
1989	m = snext.match;
1990	if ((m & (QRegExpEngine::CharClassBit \| QRegExpEngine::BackRefBit)) == `0`) {
1991	if (eng->cs)
1992	inside = (m == ch);
1993	else
1994	inside = (QChar (m).toLower() == QChar (ch).toLower());
1995	} else if (next == QRegExpEngine::FinalState) {
1996	matchLen = i;
1997	stop = minimal;
1998	inside = true;
1999	} else if ((m & QRegExpEngine::CharClassBit) != `0`) {
2000	#ifndef QT_NO_REGEXP_CCLASS
2001	const QRegExpCharClass &cc = eng->cl.at(i: m ^ QRegExpEngine::CharClassBit);
2002	if (eng->cs)
2003	inside = cc.in(ch: QChar (ch));
2004	else if (cc.negative())
2005	inside = cc.in(ch: QChar (ch).toLower()) &&
2006	cc.in(ch: QChar (ch).toUpper());
2007	else
2008	inside = cc.in(ch: QChar (ch).toLower()) \|\|
2009	cc.in(ch: QChar (ch).toUpper());
2010	#endif
2011	#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
2012	} else { / ((m & QRegExpEngine::BackRefBit) != 0) /
2013	int bref = m ^ QRegExpEngine::BackRefBit;
2014	int ell = j * ncap + eng->captureForOfficialCapture.at(i: bref - `1`);
2015
2016	inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
2017	if (inside) {
2018	if (eng->cs)
2019	inside = (in[pos + curCapBegin[ell]] == QChar (ch));
2020	else
2021	inside = (in[pos + curCapBegin[ell]].toLower()
2022	== QChar (ch).toLower());
2023	}
2024
2025	if (inside) {
2026	int delta;
2027	if (curCapEnd[ell] == EmptyCapture)
2028	delta = i - curCapBegin[ell];
2029	else
2030	delta = curCapEnd[ell] - curCapBegin[ell];
2031
2032	inside = (delta <= len - (pos + i));
2033	if (inside && delta > `1`) {
2034	int n = `1`;
2035	if (eng->cs) {
2036	while (n < delta) {
2037	if (in[pos + curCapBegin[ell] + n]
2038	!= in[pos + i + n])
2039	break;
2040	++n;
2041	}
2042	} else {
2043	while (n < delta) {
2044	QChar a = in[pos + curCapBegin[ell] + n];
2045	QChar b = in[pos + i + n];
2046	if (a.toLower() != b.toLower())
2047	break;
2048	++n;
2049	}
2050	}
2051	inside = (n == delta);
2052	if (inside)
2053	needSomeSleep = delta - `1`;
2054	}
2055	}
2056	#endif
2057	}
2058	}
2059
2060	/*
2061	We must now update our data structures.
2062	*/
2063	if (inside) {
2064	#ifndef QT_NO_REGEXP_CAPTURE
2065	int capBegin, capEnd;
2066	#endif
2067	/*
2068	If the next state was not encountered yet, all
2069	is fine.
2070	*/
2071	if ((m = inNextStack[next]) == -`1`) {
2072	m = nnext++;
2073	nextStack[m] = next;
2074	inNextStack[next] = m;
2075	#ifndef QT_NO_REGEXP_CAPTURE
2076	capBegin = nextCapBegin + m * ncap;
2077	capEnd = nextCapEnd + m * ncap;
2078
2079	/*
2080	Otherwise, we'll first maintain captures in
2081	temporary arrays, and decide at the end whether
2082	it's best to keep the previous capture zones or
2083	the new ones.
2084	*/
2085	} else {
2086	capBegin = tempCapBegin;
2087	capEnd = tempCapEnd;
2088	#endif
2089	}
2090
2091	#ifndef QT_NO_REGEXP_CAPTURE
2092	/*
2093	Updating the capture zones is much of a task.
2094	*/
2095	if (ncap > `0`) {
2096	memcpy(dest: capBegin, src: curCapBegin + j * ncap, n: ncap * sizeof(int));
2097	memcpy(dest: capEnd, src: curCapEnd + j * ncap, n: ncap * sizeof(int));
2098	int c = scur.atom, n = snext.atom;
2099	int p = -`1`, q = -`1`;
2100	int cap;
2101
2102	/*
2103	Lemma 1. For any x in the range [0..nf), we
2104	have f[x].parent < x.
2105
2106	Proof. By looking at startAtom(), it is
2107	clear that cf < nf holds all the time, and
2108	thus that f[nf].parent < nf.
2109	*/
2110
2111	/*
2112	If we are reentering an atom, we empty all
2113	capture zones inside it.
2114	*/
2115	if ((q = scur.reenter.value(key: next)) != `0`) {
2116	QBitArray b(eng->nf, false);
2117	b.setBit(i: q, val: true);
2118	for (int ell = q + `1`; ell < eng->nf; ell++) {
2119	if (b.testBit(i: eng->f.at(i: ell).parent)) {
2120	b.setBit(i: ell, val: true);
2121	cap = eng->f.at(i: ell).capture;
2122	if (cap >= `0`) {
2123	capBegin[cap] = EmptyCapture;
2124	capEnd[cap] = EmptyCapture;
2125	}
2126	}
2127	}
2128	p = eng->f.at(i: q).parent;
2129
2130	/*
2131	Otherwise, close the capture zones we are
2132	leaving. We are leaving f[c].capture,
2133	f[f[c].parent].capture,
2134	f[f[f[c].parent].parent].capture, ...,
2135	until f[x].capture, with x such that
2136	f[x].parent is the youngest common ancestor
2137	for c and n.
2138
2139	We go up along c's and n's ancestry until
2140	we find x.
2141	*/
2142	} else {
2143	p = c;
2144	q = n;
2145	while (p != q) {
2146	if (p > q) {
2147	cap = eng->f.at(i: p).capture;
2148	if (cap >= `0`) {
2149	if (capBegin[cap] == i) {
2150	capBegin[cap] = EmptyCapture;
2151	capEnd[cap] = EmptyCapture;
2152	} else {
2153	capEnd[cap] = i;
2154	}
2155	}
2156	p = eng->f.at(i: p).parent;
2157	} else {
2158	q = eng->f.at(i: q).parent;
2159	}
2160	}
2161	}
2162
2163	/*
2164	In any case, we now open the capture zones
2165	we are entering. We work upwards from n
2166	until we reach p (the parent of the atom we
2167	reenter or the youngest common ancestor).
2168	*/
2169	while (n > p) {
2170	cap = eng->f.at(i: n).capture;
2171	if (cap >= `0`) {
2172	capBegin[cap] = i;
2173	capEnd[cap] = EmptyCapture;
2174	}
2175	n = eng->f.at(i: n).parent;
2176	}
2177	/*
2178	If the next state was already in
2179	nextStack, we must choose carefully which
2180	capture zones we want to keep.
2181	*/
2182	if (capBegin == tempCapBegin &&
2183	isBetterCapture(ncap, begin1: capBegin, end1: capEnd, begin2: nextCapBegin + m * ncap,
2184	end2: nextCapEnd + m * ncap)) {
2185	memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int));
2186	memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int));
2187	}
2188	}
2189	#ifndef QT_NO_REGEXP_BACKREF
2190	/*
2191	We are done with updating the capture zones.
2192	It's now time to put the next state to sleep,
2193	if it needs to, and to remove it from
2194	nextStack.
2195	*/
2196	if (needSomeSleep > `0`) {
2197	QList<int> zzZ(`2` + `2` * ncap);
2198	zzZ [`0`] = i + needSomeSleep;
2199	zzZ [`1`] = next;
2200	if (ncap > `0`) {
2201	memcpy(dest: zzZ.data() + `2`, src: capBegin, n: ncap * sizeof(int));
2202	memcpy(dest: zzZ.data() + `2` + ncap, src: capEnd, n: ncap * sizeof(int));
2203	}
2204	inNextStack[nextStack[--nnext]] = -`1`;
2205	sleeping.append(t: zzZ);
2206	}
2207	#endif
2208	#endif
2209	}
2210	}
2211	}
2212	#ifndef QT_NO_REGEXP_CAPTURE
2213	/*
2214	If we reached the final state, hurray! Copy the captured
2215	zone.
2216	*/
2217	if (ncap > `0` && (m = inNextStack[QRegExpEngine::FinalState]) != -`1`) {
2218	memcpy(dest: capBegin, src: nextCapBegin + m * ncap, n: ncap * sizeof(int));
2219	memcpy(dest: capEnd, src: nextCapEnd + m * ncap, n: ncap * sizeof(int));
2220	}
2221	#ifndef QT_NO_REGEXP_BACKREF
2222	/*
2223	It's time to wake up the sleepers.
2224	*/
2225	j = `0`;
2226	while (j < sleeping.size()) {
2227	if (sleeping.at(i: j)[`0`] == i) {
2228	const QList<int> &zzZ = sleeping.at(i: j);
2229	int next = zzZ [`1`];
2230	const int *capBegin = zzZ.data() + `2`;
2231	const int *capEnd = zzZ.data() + `2` + ncap;
2232	bool copyOver = true;
2233
2234	if ((m = inNextStack[next]) == -`1`) {
2235	m = nnext++;
2236	nextStack[m] = next;
2237	inNextStack[next] = m;
2238	} else {
2239	copyOver = isBetterCapture(ncap, begin1: nextCapBegin + m * ncap, end1: nextCapEnd + m * ncap,
2240	begin2: capBegin, end2: capEnd);
2241	}
2242	if (copyOver) {
2243	memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int));
2244	memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int));
2245	}
2246
2247	sleeping.removeAt(i: j);
2248	} else {
2249	++j;
2250	}
2251	}
2252	#endif
2253	#endif
2254	for (j = `0`; j < nnext; j++)
2255	inNextStack[nextStack[j]] = -`1`;
2256
2257	// avoid needless iteration that confuses oneTestMatchedLen
2258	if (nnext == `1` && nextStack[`0`] == QRegExpEngine::FinalState
2259	#ifndef QT_NO_REGEXP_BACKREF
2260	&& sleeping.isEmpty()
2261	#endif
2262	)
2263	stop = true;
2264
2265	qSwap(value1&: curStack, value2&: nextStack);
2266	#ifndef QT_NO_REGEXP_CAPTURE
2267	qSwap(value1&: curCapBegin, value2&: nextCapBegin);
2268	qSwap(value1&: curCapEnd, value2&: nextCapEnd);
2269	#endif
2270	ncur = nnext;
2271	nnext = `0`;
2272	++i;
2273	}
2274
2275	#ifndef QT_NO_REGEXP_BACKREF
2276	/*
2277	If minimal matching is enabled, we might have some sleepers
2278	left.
2279	*/
2280	if (!sleeping.isEmpty())
2281	sleeping.clear();
2282	#endif
2283
2284	oneTestMatchedLen = i - `1`;
2285	return (matchLen >= `0`);
2286	}
2287
2288	#ifndef QT_NO_REGEXP_CCLASS
2289
2290	QRegExpCharClass::QRegExpCharClass()
2291	: c(`0`), n(false)
2292	{
2293	#ifndef QT_NO_REGEXP_OPTIM
2294	occ1.fill(t: NoOccurrence, newSize: NumBadChars);
2295	#endif
2296	}
2297
2298	void QRegExpCharClass::clear()
2299	{
2300	c = `0`;
2301	r.clear();
2302	n = false;
2303	}
2304
2305	void QRegExpCharClass::setNegative(bool negative)
2306	{
2307	n = negative;
2308	#ifndef QT_NO_REGEXP_OPTIM
2309	occ1.fill(t: `0`, newSize: NumBadChars);
2310	#endif
2311	}
2312
2313	void QRegExpCharClass::addCategories(uint cats)
2314	{
2315	static const int all_cats = FLAG(QChar::Mark_NonSpacing) \|
2316	FLAG(QChar::Mark_SpacingCombining) \|
2317	FLAG(QChar::Mark_Enclosing) \|
2318	FLAG(QChar::Number_DecimalDigit) \|
2319	FLAG(QChar::Number_Letter) \|
2320	FLAG(QChar::Number_Other) \|
2321	FLAG(QChar::Separator_Space) \|
2322	FLAG(QChar::Separator_Line) \|
2323	FLAG(QChar::Separator_Paragraph) \|
2324	FLAG(QChar::Other_Control) \|
2325	FLAG(QChar::Other_Format) \|
2326	FLAG(QChar::Other_Surrogate) \|
2327	FLAG(QChar::Other_PrivateUse) \|
2328	FLAG(QChar::Other_NotAssigned) \|
2329	FLAG(QChar::Letter_Uppercase) \|
2330	FLAG(QChar::Letter_Lowercase) \|
2331	FLAG(QChar::Letter_Titlecase) \|
2332	FLAG(QChar::Letter_Modifier) \|
2333	FLAG(QChar::Letter_Other) \|
2334	FLAG(QChar::Punctuation_Connector) \|
2335	FLAG(QChar::Punctuation_Dash) \|
2336	FLAG(QChar::Punctuation_Open) \|
2337	FLAG(QChar::Punctuation_Close) \|
2338	FLAG(QChar::Punctuation_InitialQuote) \|
2339	FLAG(QChar::Punctuation_FinalQuote) \|
2340	FLAG(QChar::Punctuation_Other) \|
2341	FLAG(QChar::Symbol_Math) \|
2342	FLAG(QChar::Symbol_Currency) \|
2343	FLAG(QChar::Symbol_Modifier) \|
2344	FLAG(QChar::Symbol_Other);
2345	c \|= (all_cats & cats);
2346	#ifndef QT_NO_REGEXP_OPTIM
2347	occ1.fill(t: `0`, newSize: NumBadChars);
2348	#endif
2349	}
2350
2351	void QRegExpCharClass::addRange(ushort from, ushort to)
2352	{
2353	if (from > to)
2354	qSwap(value1&: from, value2&: to);
2355	int m = r.size();
2356	r.resize(size: m + `1`);
2357	r [m].from = from;
2358	r [m].len = to - from + `1`;
2359
2360	#ifndef QT_NO_REGEXP_OPTIM
2361	int i;
2362
2363	if (to - from < NumBadChars) {
2364	if (from % NumBadChars <= to % NumBadChars) {
2365	for (i = from % NumBadChars; i <= to % NumBadChars; i++)
2366	occ1 [i] = `0`;
2367	} else {
2368	for (i = `0`; i <= to % NumBadChars; i++)
2369	occ1 [i] = `0`;
2370	for (i = from % NumBadChars; i < NumBadChars; i++)
2371	occ1 [i] = `0`;
2372	}
2373	} else {
2374	occ1.fill(t: `0`, newSize: NumBadChars);
2375	}
2376	#endif
2377	}
2378
2379	bool QRegExpCharClass::in(QChar ch) const
2380	{
2381	#ifndef QT_NO_REGEXP_OPTIM
2382	if (occ1.at(BadChar(ch)) == NoOccurrence)
2383	return n;
2384	#endif
2385
2386	if (c != `0` && (c & FLAG(ch.category())) != `0`)
2387	return !n;
2388
2389	const int uc = ch.unicode();
2390	int size = r.size();
2391
2392	for (int i = `0`; i < size; ++i) {
2393	const QRegExpCharClassRange &range = r.at(i);
2394	if (uint(uc - range.from) < uint(r.at(i).len))
2395	return !n;
2396	}
2397	return n;
2398	}
2399
2400	#if defined(QT_DEBUG)
2401	void QRegExpCharClass::dump() const
2402	{
2403	int i;
2404	qDebug(msg: " %stive character class", n ? "nega" : "posi");
2405	#ifndef QT_NO_REGEXP_CCLASS
2406	if (c != `0`)
2407	qDebug(msg: " categories 0x%.8x", c);
2408	#endif
2409	for (i = `0`; i < r.size(); i++)
2410	qDebug(msg: " 0x%.4x through 0x%.4x", r [i].from, r [i].from + r [i].len - `1`);
2411	}
2412	#endif
2413	#endif
2414
2415	QRegExpEngine::Box::Box(QRegExpEngine *engine)
2416	: eng(engine), skipanchors(`0`)
2417	#ifndef QT_NO_REGEXP_OPTIM
2418	, earlyStart(`0`), lateStart(`0`), maxl(`0`)
2419	#endif
2420	{
2421	#ifndef QT_NO_REGEXP_OPTIM
2422	occ1.fill(t: NoOccurrence, newSize: NumBadChars);
2423	#endif
2424	minl = `0`;
2425	}
2426
2427	QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
2428	{
2429	eng = b.eng;
2430	ls = b.ls;
2431	rs = b.rs;
2432	lanchors = b.lanchors;
2433	ranchors = b.ranchors;
2434	skipanchors = b.skipanchors;
2435	#ifndef QT_NO_REGEXP_OPTIM
2436	earlyStart = b.earlyStart;
2437	lateStart = b.lateStart;
2438	str = b.str;
2439	leftStr = b.leftStr;
2440	rightStr = b.rightStr;
2441	maxl = b.maxl;
2442	occ1 = b.occ1;
2443	#endif
2444	minl = b.minl;
2445	return *this;
2446	}
2447
2448	void QRegExpEngine::Box::set(QChar ch)
2449	{
2450	ls.resize(size: `1`);
2451	ls [`0`] = eng->createState(ch);
2452	rs = ls;
2453	#ifndef QT_NO_REGEXP_OPTIM
2454	str = ch;
2455	leftStr = ch;
2456	rightStr = ch;
2457	maxl = `1`;
2458	occ1 [BadChar(ch)] = `0`;
2459	#endif
2460	minl = `1`;
2461	}
2462
2463	void QRegExpEngine::Box::set(const QRegExpCharClass &cc)
2464	{
2465	ls.resize(size: `1`);
2466	ls [`0`] = eng->createState(cc);
2467	rs = ls;
2468	#ifndef QT_NO_REGEXP_OPTIM
2469	maxl = `1`;
2470	occ1 = cc.firstOccurrence();
2471	#endif
2472	minl = `1`;
2473	}
2474
2475	#ifndef QT_NO_REGEXP_BACKREF
2476	void QRegExpEngine::Box::set(int bref)
2477	{
2478	ls.resize(size: `1`);
2479	ls [`0`] = eng->createState(bref);
2480	rs = ls;
2481	if (bref >= `1` && bref <= MaxBackRefs)
2482	skipanchors = Anchor_BackRef0Empty << bref;
2483	#ifndef QT_NO_REGEXP_OPTIM
2484	maxl = InftyLen;
2485	#endif
2486	minl = `0`;
2487	}
2488	#endif
2489
2490	void QRegExpEngine::Box::cat(const Box &b)
2491	{
2492	eng->addCatTransitions(from: rs, to: b.ls);
2493	addAnchorsToEngine(to: b);
2494	if (minl == `0`) {
2495	lanchors.insert(map: b.lanchors);
2496	if (skipanchors != `0`) {
2497	for (int i = `0`; i < b.ls.size(); i++) {
2498	int a = eng->anchorConcatenation(a: lanchors.value(key: b.ls.at(i), defaultValue: `0`), b: skipanchors);
2499	lanchors.insert(key: b.ls.at(i), value: a);
2500	}
2501	}
2502	mergeInto(a: &ls, b: b.ls);
2503	}
2504	if (b.minl == `0`) {
2505	ranchors.insert(map: b.ranchors);
2506	if (b.skipanchors != `0`) {
2507	for (int i = `0`; i < rs.size(); i++) {
2508	int a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i), defaultValue: `0`), b: b.skipanchors);
2509	ranchors.insert(key: rs.at(i), value: a);
2510	}
2511	}
2512	mergeInto(a: &rs, b: b.rs);
2513	} else {
2514	ranchors = b.ranchors;
2515	rs = b.rs;
2516	}
2517
2518	#ifndef QT_NO_REGEXP_OPTIM
2519	if (maxl != InftyLen) {
2520	if (rightStr.size() + b.leftStr.size() >
2521	qMax(a: str.size(), b: b.str.size())) {
2522	earlyStart = minl - rightStr.size();
2523	lateStart = maxl - rightStr.size();
2524	str = rightStr + b.leftStr;
2525	} else if (b.str.size() > str.size()) {
2526	earlyStart = minl + b.earlyStart;
2527	lateStart = maxl + b.lateStart;
2528	str = b.str;
2529	}
2530	}
2531
2532	if (leftStr.size() == maxl)
2533	leftStr += b.leftStr;
2534
2535	if (b.rightStr.size() == b.maxl) {
2536	rightStr += b.rightStr;
2537	} else {
2538	rightStr = b.rightStr;
2539	}
2540
2541	if (maxl == InftyLen \|\| b.maxl == InftyLen) {
2542	maxl = InftyLen;
2543	} else {
2544	maxl += b.maxl;
2545	}
2546
2547	for (int i = `0`; i < NumBadChars; i++) {
2548	if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))
2549	occ1 [i] = minl + b.occ1.at(i);
2550	}
2551	#endif
2552
2553	minl += b.minl;
2554	if (minl == `0`)
2555	skipanchors = eng->anchorConcatenation(a: skipanchors, b: b.skipanchors);
2556	else
2557	skipanchors = `0`;
2558	}
2559
2560	void QRegExpEngine::Box::orx(const Box &b)
2561	{
2562	mergeInto(a: &ls, b: b.ls);
2563	lanchors.insert(map: b.lanchors);
2564	mergeInto(a: &rs, b: b.rs);
2565	ranchors.insert(map: b.ranchors);
2566
2567	if (b.minl == `0`) {
2568	if (minl == `0`)
2569	skipanchors = eng->anchorAlternation(a: skipanchors, b: b.skipanchors);
2570	else
2571	skipanchors = b.skipanchors;
2572	}
2573
2574	#ifndef QT_NO_REGEXP_OPTIM
2575	for (int i = `0`; i < NumBadChars; i++) {
2576	if (occ1.at(i) > b.occ1.at(i))
2577	occ1 [i] = b.occ1.at(i);
2578	}
2579	earlyStart = `0`;
2580	lateStart = `0`;
2581	str = QString ();
2582	leftStr = QString ();
2583	rightStr = QString ();
2584	if (b.maxl > maxl)
2585	maxl = b.maxl;
2586	#endif
2587	if (b.minl < minl)
2588	minl = b.minl;
2589	}
2590
2591	void QRegExpEngine::Box::plus(int atom)
2592	{
2593	#ifndef QT_NO_REGEXP_CAPTURE
2594	eng->addPlusTransitions(from: rs, to: ls, atom);
2595	#else
2596	Q_UNUSED(atom);
2597	eng->addCatTransitions(rs, ls);
2598	#endif
2599	addAnchorsToEngine(to: *this);
2600	#ifndef QT_NO_REGEXP_OPTIM
2601	maxl = InftyLen;
2602	#endif
2603	}
2604
2605	void QRegExpEngine::Box::opt()
2606	{
2607	#ifndef QT_NO_REGEXP_OPTIM
2608	earlyStart = `0`;
2609	lateStart = `0`;
2610	str = QString ();
2611	leftStr = QString ();
2612	rightStr = QString ();
2613	#endif
2614	skipanchors = `0`;
2615	minl = `0`;
2616	}
2617
2618	void QRegExpEngine::Box::catAnchor(int a)
2619	{
2620	if (a != `0`) {
2621	for (int i = `0`; i < rs.size(); i++) {
2622	a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i), defaultValue: `0`), b: a);
2623	ranchors.insert(key: rs.at(i), value: a);
2624	}
2625	if (minl == `0`)
2626	skipanchors = eng->anchorConcatenation(a: skipanchors, b: a);
2627	}
2628	}
2629
2630	#ifndef QT_NO_REGEXP_OPTIM
2631	void QRegExpEngine::Box::setupHeuristics()
2632	{
2633	eng->goodEarlyStart = earlyStart;
2634	eng->goodLateStart = lateStart;
2635	eng->goodStr = eng->cs ? str : str.toLower();
2636
2637	eng->minl = minl;
2638	if (eng->cs) {
2639	/*
2640	A regular expression such as 112\|1 has occ1['2'] = 2 and minl =
2641	1 at this point. An entry of occ1 has to be at most minl or
2642	infinity for the rest of the algorithm to go well.
2643
2644	We waited until here before normalizing these cases (instead of
2645	doing it in Box::orx()) because sometimes things improve by
2646	themselves. Consider for example (112\|1)34.
2647	*/
2648	for (int i = `0`; i < NumBadChars; i++) {
2649	if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)
2650	occ1 [i] = minl;
2651	}
2652	eng->occ1 = occ1;
2653	} else {
2654	eng->occ1.fill(t: `0`, newSize: NumBadChars);
2655	}
2656
2657	eng->heuristicallyChooseHeuristic();
2658	}
2659	#endif
2660
#if defined(QT_DEBUG)
/*
  Debugging aid: prints the minimum length, the left and right states
  (with their anchors, if any) and the skip anchors through qDebug().
*/
void QRegExpEngine::Box::dump() const
{
    int i;
    qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");
    qDebug(" Left states:");
    for (i = 0; i < ls.size(); i++) {
        if (lanchors.value(ls[i], 0) == 0)
            qDebug(" %d", ls[i]);
        else
            qDebug(" %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);
    }
    qDebug(" Right states:");
    for (i = 0; i < rs.size(); i++) {
        if (ranchors.value(rs[i], 0) == 0)
            qDebug(" %d", rs[i]);
        else
            qDebug(" %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);
    }
    qDebug(" Skip anchors: 0x%.8x", skipanchors);
}
#endif
2683
2684	void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const
2685	{
2686	for (int i = `0`; i < to.ls.size(); i++) {
2687	for (int j = `0`; j < rs.size(); j++) {
2688	int a = eng->anchorConcatenation(a: ranchors.value(key: rs.at(i: j), defaultValue: `0`),
2689	b: to.lanchors.value(key: to.ls.at(i), defaultValue: `0`));
2690	eng->addAnchors(from: rs [j], to: to.ls [i], a);
2691	}
2692	}
2693	}
2694
2695	#ifndef QT_NO_REGEXP_CCLASS
2696	// fast lookup hash for xml schema extensions
2697	// sorted by name for b-search
2698	static const struct CategoriesRangeMapEntry {
2699	const char name[`40`];
2700	uint first, second;
2701	} categoriesRangeMap[] = {
2702	{ .name: "AegeanNumbers", .first: `0x10100`, .second: `0x1013F` },
2703	{ .name: "AlphabeticPresentationForms", .first: `0xFB00`, .second: `0xFB4F` },
2704	{ .name: "AncientGreekMusicalNotation", .first: `0x1D200`, .second: `0x1D24F` },
2705	{ .name: "AncientGreekNumbers", .first: `0x10140`, .second: `0x1018F` },
2706	{ .name: "Arabic", .first: `0x0600`, .second: `0x06FF` },
2707	{ .name: "ArabicPresentationForms-A", .first: `0xFB50`, .second: `0xFDFF` },
2708	{ .name: "ArabicPresentationForms-B", .first: `0xFE70`, .second: `0xFEFF` },
2709	{ .name: "ArabicSupplement", .first: `0x0750`, .second: `0x077F` },
2710	{ .name: "Armenian", .first: `0x0530`, .second: `0x058F` },
2711	{ .name: "Arrows", .first: `0x2190`, .second: `0x21FF` },
2712	{ .name: "BasicLatin", .first: `0x0000`, .second: `0x007F` },
2713	{ .name: "Bengali", .first: `0x0980`, .second: `0x09FF` },
2714	{ .name: "BlockElements", .first: `0x2580`, .second: `0x259F` },
2715	{ .name: "Bopomofo", .first: `0x3100`, .second: `0x312F` },
2716	{ .name: "BopomofoExtended", .first: `0x31A0`, .second: `0x31BF` },
2717	{ .name: "BoxDrawing", .first: `0x2500`, .second: `0x257F` },
2718	{ .name: "BraillePatterns", .first: `0x2800`, .second: `0x28FF` },
2719	{ .name: "Buginese", .first: `0x1A00`, .second: `0x1A1F` },
2720	{ .name: "Buhid", .first: `0x1740`, .second: `0x175F` },
2721	{ .name: "ByzantineMusicalSymbols", .first: `0x1D000`, .second: `0x1D0FF` },
2722	{ .name: "CJKCompatibility", .first: `0x3300`, .second: `0x33FF` },
2723	{ .name: "CJKCompatibilityForms", .first: `0xFE30`, .second: `0xFE4F` },
2724	{ .name: "CJKCompatibilityIdeographs", .first: `0xF900`, .second: `0xFAFF` },
2725	{ .name: "CJKCompatibilityIdeographsSupplement", .first: `0x2F800`, .second: `0x2FA1F` },
2726	{ .name: "CJKRadicalsSupplement", .first: `0x2E80`, .second: `0x2EFF` },
2727	{ .name: "CJKStrokes", .first: `0x31C0`, .second: `0x31EF` },
2728	{ .name: "CJKSymbolsandPunctuation", .first: `0x3000`, .second: `0x303F` },
2729	{ .name: "CJKUnifiedIdeographs", .first: `0x4E00`, .second: `0x9FFF` },
2730	{ .name: "CJKUnifiedIdeographsExtensionA", .first: `0x3400`, .second: `0x4DB5` },
2731	{ .name: "CJKUnifiedIdeographsExtensionB", .first: `0x20000`, .second: `0x2A6DF` },
2732	{ .name: "Cherokee", .first: `0x13A0`, .second: `0x13FF` },
2733	{ .name: "CombiningDiacriticalMarks", .first: `0x0300`, .second: `0x036F` },
2734	{ .name: "CombiningDiacriticalMarksSupplement", .first: `0x1DC0`, .second: `0x1DFF` },
2735	{ .name: "CombiningHalfMarks", .first: `0xFE20`, .second: `0xFE2F` },
2736	{ .name: "CombiningMarksforSymbols", .first: `0x20D0`, .second: `0x20FF` },
2737	{ .name: "ControlPictures", .first: `0x2400`, .second: `0x243F` },
2738	{ .name: "Coptic", .first: `0x2C80`, .second: `0x2CFF` },
2739	{ .name: "CurrencySymbols", .first: `0x20A0`, .second: `0x20CF` },
2740	{ .name: "CypriotSyllabary", .first: `0x10800`, .second: `0x1083F` },
2741	{ .name: "Cyrillic", .first: `0x0400`, .second: `0x04FF` },
2742	{ .name: "CyrillicSupplement", .first: `0x0500`, .second: `0x052F` },
2743	{ .name: "Deseret", .first: `0x10400`, .second: `0x1044F` },
2744	{ .name: "Devanagari", .first: `0x0900`, .second: `0x097F` },
2745	{ .name: "Dingbats", .first: `0x2700`, .second: `0x27BF` },
2746	{ .name: "EnclosedAlphanumerics", .first: `0x2460`, .second: `0x24FF` },
2747	{ .name: "EnclosedCJKLettersandMonths", .first: `0x3200`, .second: `0x32FF` },
2748	{ .name: "Ethiopic", .first: `0x1200`, .second: `0x137F` },
2749	{ .name: "EthiopicExtended", .first: `0x2D80`, .second: `0x2DDF` },
2750	{ .name: "EthiopicSupplement", .first: `0x1380`, .second: `0x139F` },
2751	{ .name: "GeneralPunctuation", .first: `0x2000`, .second: `0x206F` },
2752	{ .name: "GeometricShapes", .first: `0x25A0`, .second: `0x25FF` },
2753	{ .name: "Georgian", .first: `0x10A0`, .second: `0x10FF` },
2754	{ .name: "GeorgianSupplement", .first: `0x2D00`, .second: `0x2D2F` },
2755	{ .name: "Glagolitic", .first: `0x2C00`, .second: `0x2C5F` },
2756	{ .name: "Gothic", .first: `0x10330`, .second: `0x1034F` },
2757	{ .name: "Greek", .first: `0x0370`, .second: `0x03FF` },
2758	{ .name: "GreekExtended", .first: `0x1F00`, .second: `0x1FFF` },
2759	{ .name: "Gujarati", .first: `0x0A80`, .second: `0x0AFF` },
2760	{ .name: "Gurmukhi", .first: `0x0A00`, .second: `0x0A7F` },
2761	{ .name: "HalfwidthandFullwidthForms", .first: `0xFF00`, .second: `0xFFEF` },
2762	{ .name: "HangulCompatibilityJamo", .first: `0x3130`, .second: `0x318F` },
2763	{ .name: "HangulJamo", .first: `0x1100`, .second: `0x11FF` },
2764	{ .name: "HangulSyllables", .first: `0xAC00`, .second: `0xD7A3` },
2765	{ .name: "Hanunoo", .first: `0x1720`, .second: `0x173F` },
2766	{ .name: "Hebrew", .first: `0x0590`, .second: `0x05FF` },
2767	{ .name: "Hiragana", .first: `0x3040`, .second: `0x309F` },
2768	{ .name: "IPAExtensions", .first: `0x0250`, .second: `0x02AF` },
2769	{ .name: "IdeographicDescriptionCharacters", .first: `0x2FF0`, .second: `0x2FFF` },
2770	{ .name: "Kanbun", .first: `0x3190`, .second: `0x319F` },
2771	{ .name: "KangxiRadicals", .first: `0x2F00`, .second: `0x2FDF` },
2772	{ .name: "Kannada", .first: `0x0C80`, .second: `0x0CFF` },
2773	{ .name: "Katakana", .first: `0x30A0`, .second: `0x30FF` },
2774	{ .name: "KatakanaPhoneticExtensions", .first: `0x31F0`, .second: `0x31FF` },
2775	{ .name: "Kharoshthi", .first: `0x10A00`, .second: `0x10A5F` },
2776	{ .name: "Khmer", .first: `0x1780`, .second: `0x17FF` },
2777	{ .name: "KhmerSymbols", .first: `0x19E0`, .second: `0x19FF` },
2778	{ .name: "Lao", .first: `0x0E80`, .second: `0x0EFF` },
2779	{ .name: "Latin-1Supplement", .first: `0x0080`, .second: `0x00FF` },
2780	{ .name: "LatinExtended-A", .first: `0x0100`, .second: `0x017F` },
2781	{ .name: "LatinExtended-B", .first: `0x0180`, .second: `0x024F` },
2782	{ .name: "LatinExtendedAdditional", .first: `0x1E00`, .second: `0x1EFF` },
2783	{ .name: "LetterlikeSymbols", .first: `0x2100`, .second: `0x214F` },
2784	{ .name: "Limbu", .first: `0x1900`, .second: `0x194F` },
2785	{ .name: "LinearBIdeograms", .first: `0x10080`, .second: `0x100FF` },
2786	{ .name: "LinearBSyllabary", .first: `0x10000`, .second: `0x1007F` },
2787	{ .name: "Malayalam", .first: `0x0D00`, .second: `0x0D7F` },
2788	{ .name: "MathematicalAlphanumericSymbols", .first: `0x1D400`, .second: `0x1D7FF` },
2789	{ .name: "MathematicalOperators", .first: `0x2200`, .second: `0x22FF` },
2790	{ .name: "MiscellaneousMathematicalSymbols-A", .first: `0x27C0`, .second: `0x27EF` },
2791	{ .name: "MiscellaneousMathematicalSymbols-B", .first: `0x2980`, .second: `0x29FF` },
2792	{ .name: "MiscellaneousSymbols", .first: `0x2600`, .second: `0x26FF` },
2793	{ .name: "MiscellaneousSymbolsandArrows", .first: `0x2B00`, .second: `0x2BFF` },
2794	{ .name: "MiscellaneousTechnical", .first: `0x2300`, .second: `0x23FF` },
2795	{ .name: "ModifierToneLetters", .first: `0xA700`, .second: `0xA71F` },
2796	{ .name: "Mongolian", .first: `0x1800`, .second: `0x18AF` },
2797	{ .name: "MusicalSymbols", .first: `0x1D100`, .second: `0x1D1FF` },
2798	{ .name: "Myanmar", .first: `0x1000`, .second: `0x109F` },
2799	{ .name: "NewTaiLue", .first: `0x1980`, .second: `0x19DF` },
2800	{ .name: "NumberForms", .first: `0x2150`, .second: `0x218F` },
2801	{ .name: "Ogham", .first: `0x1680`, .second: `0x169F` },
2802	{ .name: "OldItalic", .first: `0x10300`, .second: `0x1032F` },
2803	{ .name: "OldPersian", .first: `0x103A0`, .second: `0x103DF` },
2804	{ .name: "OpticalCharacterRecognition", .first: `0x2440`, .second: `0x245F` },
2805	{ .name: "Oriya", .first: `0x0B00`, .second: `0x0B7F` },
2806	{ .name: "Osmanya", .first: `0x10480`, .second: `0x104AF` },
2807	{ .name: "PhoneticExtensions", .first: `0x1D00`, .second: `0x1D7F` },
2808	{ .name: "PhoneticExtensionsSupplement", .first: `0x1D80`, .second: `0x1DBF` },
2809	{ .name: "PrivateUse", .first: `0xE000`, .second: `0xF8FF` },
2810	{ .name: "Runic", .first: `0x16A0`, .second: `0x16FF` },
2811	{ .name: "Shavian", .first: `0x10450`, .second: `0x1047F` },
2812	{ .name: "Sinhala", .first: `0x0D80`, .second: `0x0DFF` },
2813	{ .name: "SmallFormVariants", .first: `0xFE50`, .second: `0xFE6F` },
2814	{ .name: "SpacingModifierLetters", .first: `0x02B0`, .second: `0x02FF` },
2815	{ .name: "Specials", .first: `0xFFF0`, .second: `0xFFFF` },
2816	{ .name: "SuperscriptsandSubscripts", .first: `0x2070`, .second: `0x209F` },
2817	{ .name: "SupplementalArrows-A", .first: `0x27F0`, .second: `0x27FF` },
2818	{ .name: "SupplementalArrows-B", .first: `0x2900`, .second: `0x297F` },
2819	{ .name: "SupplementalMathematicalOperators", .first: `0x2A00`, .second: `0x2AFF` },
2820	{ .name: "SupplementalPunctuation", .first: `0x2E00`, .second: `0x2E7F` },
2821	{ .name: "SupplementaryPrivateUseArea-A", .first: `0xF0000`, .second: `0xFFFFF` },
2822	{ .name: "SupplementaryPrivateUseArea-B", .first: `0x100000`, .second: `0x10FFFF` },
2823	{ .name: "SylotiNagri", .first: `0xA800`, .second: `0xA82F` },
2824	{ .name: "Syriac", .first: `0x0700`, .second: `0x074F` },
2825	{ .name: "Tagalog", .first: `0x1700`, .second: `0x171F` },
2826	{ .name: "Tagbanwa", .first: `0x1760`, .second: `0x177F` },
2827	{ .name: "Tags", .first: `0xE0000`, .second: `0xE007F` },
2828	{ .name: "TaiLe", .first: `0x1950`, .second: `0x197F` },
2829	{ .name: "TaiXuanJingSymbols", .first: `0x1D300`, .second: `0x1D35F` },
2830	{ .name: "Tamil", .first: `0x0B80`, .second: `0x0BFF` },
2831	{ .name: "Telugu", .first: `0x0C00`, .second: `0x0C7F` },
2832	{ .name: "Thaana", .first: `0x0780`, .second: `0x07BF` },
2833	{ .name: "Thai", .first: `0x0E00`, .second: `0x0E7F` },
2834	{ .name: "Tibetan", .first: `0x0F00`, .second: `0x0FFF` },
2835	{ .name: "Tifinagh", .first: `0x2D30`, .second: `0x2D7F` },
2836	{ .name: "Ugaritic", .first: `0x10380`, .second: `0x1039F` },
2837	{ .name: "UnifiedCanadianAboriginalSyllabics", .first: `0x1400`, .second: `0x167F` },
2838	{ .name: "VariationSelectors", .first: `0xFE00`, .second: `0xFE0F` },
2839	{ .name: "VariationSelectorsSupplement", .first: `0xE0100`, .second: `0xE01EF` },
2840	{ .name: "VerticalForms", .first: `0xFE10`, .second: `0xFE1F` },
2841	{ .name: "YiRadicals", .first: `0xA490`, .second: `0xA4CF` },
2842	{ .name: "YiSyllables", .first: `0xA000`, .second: `0xA48F` },
2843	{ .name: "YijingHexagramSymbols", .first: `0x4DC0`, .second: `0x4DFF` }
2844	};
2845
2846	inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2)
2847	{ return qstrcmp(str1: entry1.name, str2: entry2.name) < `0`; }
2848	inline bool operator<(const char name, const* CategoriesRangeMapEntry &entry)
2849	{ return qstrcmp(str1: name, str2: entry.name) < `0`; }
2850	inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name)
2851	{ return qstrcmp(str1: entry.name, str2: name) < `0`; }
2852	#endif // QT_NO_REGEXP_CCLASS
2853
2854	int QRegExpEngine::getChar()
2855	{
2856	return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();
2857	}
2858
2859	int QRegExpEngine::getEscape()
2860	{
2861	#ifndef QT_NO_REGEXP_ESCAPE
2862	const char tab[] = "afnrtv"; // no b, as \b means word boundary
2863	const char backTab[] = "\a\f\n\r\t\v";
2864	ushort low;
2865	int i;
2866	#endif
2867	ushort val;
2868	int prevCh = yyCh;
2869
2870	if (prevCh == EOS) {
2871	error(RXERR_END);
2872	return Tok_Char \| `'\\'`;
2873	}
2874	yyCh = getChar();
2875	#ifndef QT_NO_REGEXP_ESCAPE
2876	if ((prevCh & ~`0xff`) == `0`) {
2877	const char *p = strchr(s: tab, c: prevCh);
2878	if (p != nullptr)
2879	return Tok_Char \| backTab[p - tab];
2880	}
2881	#endif
2882
2883	switch (prevCh) {
2884	#ifndef QT_NO_REGEXP_ESCAPE
2885	case `'0'`:
2886	val = `0`;
2887	for (i = `0`; i < `3`; i++) {
2888	if (yyCh >= `'0'` && yyCh <= `'7'`)
2889	val = (val << `3`) \| (yyCh - `'0'`);
2890	else
2891	break;
2892	yyCh = getChar();
2893	}
2894	if ((val & ~`0377`) != `0`)
2895	error(RXERR_OCTAL);
2896	return Tok_Char \| val;
2897	#endif
2898	#ifndef QT_NO_REGEXP_ESCAPE
2899	case `'B'`:
2900	return Tok_NonWord;
2901	#endif
2902	#ifndef QT_NO_REGEXP_CCLASS
2903	case `'D'`:
2904	// see QChar::isDigit()
2905	yyCharClass ->addCategories(cats: uint(-`1`) ^ FLAG(QChar::Number_DecimalDigit));
2906	return Tok_CharClass;
2907	case `'S'`:
2908	// see QChar::isSpace()
2909	yyCharClass ->addCategories(cats: uint(-`1`) ^ (FLAG(QChar::Separator_Space) \|
2910	FLAG(QChar::Separator_Line) \|
2911	FLAG(QChar::Separator_Paragraph) \|
2912	FLAG(QChar::Other_Control)));
2913	yyCharClass ->addRange(from: `0x0000`, to: `0x0008`);
2914	yyCharClass ->addRange(from: `0x000e`, to: `0x001f`);
2915	yyCharClass ->addRange(from: `0x007f`, to: `0x0084`);
2916	yyCharClass ->addRange(from: `0x0086`, to: `0x009f`);
2917	return Tok_CharClass;
2918	case `'W'`:
2919	// see QChar::isLetterOrNumber() and QChar::isMark()
2920	yyCharClass ->addCategories(cats: uint(-`1`) ^ (FLAG(QChar::Mark_NonSpacing) \|
2921	FLAG(QChar::Mark_SpacingCombining) \|
2922	FLAG(QChar::Mark_Enclosing) \|
2923	FLAG(QChar::Number_DecimalDigit) \|
2924	FLAG(QChar::Number_Letter) \|
2925	FLAG(QChar::Number_Other) \|
2926	FLAG(QChar::Letter_Uppercase) \|
2927	FLAG(QChar::Letter_Lowercase) \|
2928	FLAG(QChar::Letter_Titlecase) \|
2929	FLAG(QChar::Letter_Modifier) \|
2930	FLAG(QChar::Letter_Other) \|
2931	FLAG(QChar::Punctuation_Connector)));
2932	yyCharClass ->addRange(from: `0x203f`, to: `0x2040`);
2933	yyCharClass ->addSingleton(ch: `0x2040`);
2934	yyCharClass ->addSingleton(ch: `0x2054`);
2935	yyCharClass ->addSingleton(ch: `0x30fb`);
2936	yyCharClass ->addRange(from: `0xfe33`, to: `0xfe34`);
2937	yyCharClass ->addRange(from: `0xfe4d`, to: `0xfe4f`);
2938	yyCharClass ->addSingleton(ch: `0xff3f`);
2939	yyCharClass ->addSingleton(ch: `0xff65`);
2940	return Tok_CharClass;
2941	#endif
2942	#ifndef QT_NO_REGEXP_ESCAPE
2943	case `'b'`:
2944	return Tok_Word;
2945	#endif
2946	#ifndef QT_NO_REGEXP_CCLASS
2947	case `'d'`:
2948	// see QChar::isDigit()
2949	yyCharClass ->addCategories(FLAG(QChar::Number_DecimalDigit));
2950	return Tok_CharClass;
2951	case `'s'`:
2952	// see QChar::isSpace()
2953	yyCharClass ->addCategories(FLAG(QChar::Separator_Space) \|
2954	FLAG(QChar::Separator_Line) \|
2955	FLAG(QChar::Separator_Paragraph));
2956	yyCharClass ->addRange(from: `0x0009`, to: `0x000d`);
2957	yyCharClass ->addSingleton(ch: `0x0085`);
2958	return Tok_CharClass;
2959	case `'w'`:
2960	// see QChar::isLetterOrNumber() and QChar::isMark()
2961	yyCharClass ->addCategories(FLAG(QChar::Mark_NonSpacing) \|
2962	FLAG(QChar::Mark_SpacingCombining) \|
2963	FLAG(QChar::Mark_Enclosing) \|
2964	FLAG(QChar::Number_DecimalDigit) \|
2965	FLAG(QChar::Number_Letter) \|
2966	FLAG(QChar::Number_Other) \|
2967	FLAG(QChar::Letter_Uppercase) \|
2968	FLAG(QChar::Letter_Lowercase) \|
2969	FLAG(QChar::Letter_Titlecase) \|
2970	FLAG(QChar::Letter_Modifier) \|
2971	FLAG(QChar::Letter_Other));
2972	yyCharClass ->addSingleton(ch: `0x005f`); // '_'
2973	return Tok_CharClass;
2974	case `'I'`:
2975	if (!xmlSchemaExtensions)
2976	break;
2977	yyCharClass ->setNegative(!yyCharClass ->negative());
2978	Q_FALLTHROUGH();
2979	case `'i'`:
2980	if (xmlSchemaExtensions) {
2981	yyCharClass ->addCategories(FLAG(QChar::Mark_NonSpacing) \|
2982	FLAG(QChar::Mark_SpacingCombining) \|
2983	FLAG(QChar::Mark_Enclosing) \|
2984	FLAG(QChar::Number_DecimalDigit) \|
2985	FLAG(QChar::Number_Letter) \|
2986	FLAG(QChar::Number_Other) \|
2987	FLAG(QChar::Letter_Uppercase) \|
2988	FLAG(QChar::Letter_Lowercase) \|
2989	FLAG(QChar::Letter_Titlecase) \|
2990	FLAG(QChar::Letter_Modifier) \|
2991	FLAG(QChar::Letter_Other));
2992	yyCharClass ->addSingleton(ch: `0x003a`); // ':'
2993	yyCharClass ->addSingleton(ch: `0x005f`); // '_'
2994	yyCharClass ->addRange(from: `0x0041`, to: `0x005a`); // [A-Z]
2995	yyCharClass ->addRange(from: `0x0061`, to: `0x007a`); // [a-z]
2996	yyCharClass ->addRange(from: `0xc0`, to: `0xd6`);
2997	yyCharClass ->addRange(from: `0xd8`, to: `0xf6`);
2998	yyCharClass ->addRange(from: `0xf8`, to: `0x2ff`);
2999	yyCharClass ->addRange(from: `0x370`, to: `0x37d`);
3000	yyCharClass ->addRange(from: `0x37f`, to: `0x1fff`);
3001	yyCharClass ->addRange(from: `0x200c`, to: `0x200d`);
3002	yyCharClass ->addRange(from: `0x2070`, to: `0x218f`);
3003	yyCharClass ->addRange(from: `0x2c00`, to: `0x2fef`);
3004	yyCharClass ->addRange(from: `0x3001`, to: `0xd7ff`);
3005	yyCharClass ->addRange(from: `0xf900`, to: `0xfdcf`);
3006	yyCharClass ->addRange(from: `0xfdf0`, to: `0xfffd`);
3007	yyCharClass ->addRange(from: (ushort)`0x10000`, to: (ushort)`0xeffff`);
3008	return Tok_CharClass;
3009	} else {
3010	break;
3011	}
3012	case `'C'`:
3013	if (!xmlSchemaExtensions)
3014	break;
3015	yyCharClass ->setNegative(!yyCharClass ->negative());
3016	Q_FALLTHROUGH();
3017	case `'c'`:
3018	if (xmlSchemaExtensions) {
3019	yyCharClass ->addCategories(FLAG(QChar::Mark_NonSpacing) \|
3020	FLAG(QChar::Mark_SpacingCombining) \|
3021	FLAG(QChar::Mark_Enclosing) \|
3022	FLAG(QChar::Number_DecimalDigit) \|
3023	FLAG(QChar::Number_Letter) \|
3024	FLAG(QChar::Number_Other) \|
3025	FLAG(QChar::Letter_Uppercase) \|
3026	FLAG(QChar::Letter_Lowercase) \|
3027	FLAG(QChar::Letter_Titlecase) \|
3028	FLAG(QChar::Letter_Modifier) \|
3029	FLAG(QChar::Letter_Other));
3030	yyCharClass ->addSingleton(ch: `0x002d`); // '-'
3031	yyCharClass ->addSingleton(ch: `0x002e`); // '.'
3032	yyCharClass ->addSingleton(ch: `0x003a`); // ':'
3033	yyCharClass ->addSingleton(ch: `0x005f`); // '_'
3034	yyCharClass ->addSingleton(ch: `0xb7`);
3035	yyCharClass ->addRange(from: `0x0030`, to: `0x0039`); // [0-9]
3036	yyCharClass ->addRange(from: `0x0041`, to: `0x005a`); // [A-Z]
3037	yyCharClass ->addRange(from: `0x0061`, to: `0x007a`); // [a-z]
3038	yyCharClass ->addRange(from: `0xc0`, to: `0xd6`);
3039	yyCharClass ->addRange(from: `0xd8`, to: `0xf6`);
3040	yyCharClass ->addRange(from: `0xf8`, to: `0x2ff`);
3041	yyCharClass ->addRange(from: `0x370`, to: `0x37d`);
3042	yyCharClass ->addRange(from: `0x37f`, to: `0x1fff`);
3043	yyCharClass ->addRange(from: `0x200c`, to: `0x200d`);
3044	yyCharClass ->addRange(from: `0x2070`, to: `0x218f`);
3045	yyCharClass ->addRange(from: `0x2c00`, to: `0x2fef`);
3046	yyCharClass ->addRange(from: `0x3001`, to: `0xd7ff`);
3047	yyCharClass ->addRange(from: `0xf900`, to: `0xfdcf`);
3048	yyCharClass ->addRange(from: `0xfdf0`, to: `0xfffd`);
3049	yyCharClass ->addRange(from: (ushort)`0x10000`, to: (ushort)`0xeffff`);
3050	yyCharClass ->addRange(from: `0x0300`, to: `0x036f`);
3051	yyCharClass ->addRange(from: `0x203f`, to: `0x2040`);
3052	return Tok_CharClass;
3053	} else {
3054	break;
3055	}
3056	case `'P'`:
3057	if (!xmlSchemaExtensions)
3058	break;
3059	yyCharClass ->setNegative(!yyCharClass ->negative());
3060	Q_FALLTHROUGH();
3061	case `'p'`:
3062	if (xmlSchemaExtensions) {
3063	if (yyCh != `'{'`) {
3064	error(RXERR_CHARCLASS);
3065	return Tok_CharClass;
3066	}
3067
3068	QByteArray category;
3069	yyCh = getChar();
3070	while (yyCh != `'}'`) {
3071	if (yyCh == EOS) {
3072	error(RXERR_END);
3073	return Tok_CharClass;
3074	}
3075	category.append(c: yyCh);
3076	yyCh = getChar();
3077	}
3078	yyCh = getChar(); // skip closing '}'
3079
3080	int catlen = category.size();
3081	if (catlen == `1` \|\| catlen == `2`) {
3082	switch (category.at(i: `0`)) {
3083	case `'M'`:
3084	if (catlen == `1`) {
3085	yyCharClass ->addCategories(FLAG(QChar::Mark_NonSpacing) \|
3086	FLAG(QChar::Mark_SpacingCombining) \|
3087	FLAG(QChar::Mark_Enclosing));
3088	} else {
3089	switch (category.at(i: `1`)) {
3090	case `'n'`: yyCharClass ->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn
3091	case `'c'`: yyCharClass ->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc
3092	case `'e'`: yyCharClass ->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me
3093	default: error(RXERR_CATEGORY); break;
3094	}
3095	}
3096	break;
3097	case `'N'`:
3098	if (catlen == `1`) {
3099	yyCharClass ->addCategories(FLAG(QChar::Number_DecimalDigit) \|
3100	FLAG(QChar::Number_Letter) \|
3101	FLAG(QChar::Number_Other));
3102	} else {
3103	switch (category.at(i: `1`)) {
3104	case `'d'`: yyCharClass ->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd
3105	case `'l'`: yyCharClass ->addCategories(FLAG(QChar::Number_Letter)); break; // Hl
3106	case `'o'`: yyCharClass ->addCategories(FLAG(QChar::Number_Other)); break; // No
3107	default: error(RXERR_CATEGORY); break;
3108	}
3109	}
3110	break;
3111	case `'Z'`:
3112	if (catlen == `1`) {
3113	yyCharClass ->addCategories(FLAG(QChar::Separator_Space) \|
3114	FLAG(QChar::Separator_Line) \|
3115	FLAG(QChar::Separator_Paragraph));
3116	} else {
3117	switch (category.at(i: `1`)) {
3118	case `'s'`: yyCharClass ->addCategories(FLAG(QChar::Separator_Space)); break; // Zs
3119	case `'l'`: yyCharClass ->addCategories(FLAG(QChar::Separator_Line)); break; // Zl
3120	case `'p'`: yyCharClass ->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp
3121	default: error(RXERR_CATEGORY); break;
3122	}
3123	}
3124	break;
3125	case `'C'`:
3126	if (catlen == `1`) {
3127	yyCharClass ->addCategories(FLAG(QChar::Other_Control) \|
3128	FLAG(QChar::Other_Format) \|
3129	FLAG(QChar::Other_Surrogate) \|
3130	FLAG(QChar::Other_PrivateUse) \|
3131	FLAG(QChar::Other_NotAssigned));
3132	} else {
3133	switch (category.at(i: `1`)) {
3134	case `'c'`: yyCharClass ->addCategories(FLAG(QChar::Other_Control)); break; // Cc
3135	case `'f'`: yyCharClass ->addCategories(FLAG(QChar::Other_Format)); break; // Cf
3136	case `'s'`: yyCharClass ->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs
3137	case `'o'`: yyCharClass ->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co
3138	case `'n'`: yyCharClass ->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn
3139	default: error(RXERR_CATEGORY); break;
3140	}
3141	}
3142	break;
3143	case `'L'`:
3144	if (catlen == `1`) {
3145	yyCharClass ->addCategories(FLAG(QChar::Letter_Uppercase) \|
3146	FLAG(QChar::Letter_Lowercase) \|
3147	FLAG(QChar::Letter_Titlecase) \|
3148	FLAG(QChar::Letter_Modifier) \|
3149	FLAG(QChar::Letter_Other));
3150	} else {
3151	switch (category.at(i: `1`)) {
3152	case `'u'`: yyCharClass ->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu
3153	case `'l'`: yyCharClass ->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll
3154	case `'t'`: yyCharClass ->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt
3155	case `'m'`: yyCharClass ->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm
3156	case `'o'`: yyCharClass ->addCategories(FLAG(QChar::Letter_Other)); break; // Lo
3157	default: error(RXERR_CATEGORY); break;
3158	}
3159	}
3160	break;
3161	case `'P'`:
3162	if (catlen == `1`) {
3163	yyCharClass ->addCategories(FLAG(QChar::Punctuation_Connector) \|
3164	FLAG(QChar::Punctuation_Dash) \|
3165	FLAG(QChar::Punctuation_Open) \|
3166	FLAG(QChar::Punctuation_Close) \|
3167	FLAG(QChar::Punctuation_InitialQuote) \|
3168	FLAG(QChar::Punctuation_FinalQuote) \|
3169	FLAG(QChar::Punctuation_Other));
3170	} else {
3171	switch (category.at(i: `1`)) {
3172	case `'c'`: yyCharClass ->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc
3173	case `'d'`: yyCharClass ->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd
3174	case `'s'`: yyCharClass ->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps
3175	case `'e'`: yyCharClass ->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe
3176	case `'i'`: yyCharClass ->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi
3177	case `'f'`: yyCharClass ->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf
3178	case `'o'`: yyCharClass ->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po
3179	default: error(RXERR_CATEGORY); break;
3180	}
3181	}
3182	break;
3183	case `'S'`:
3184	if (catlen == `1`) {
3185	yyCharClass ->addCategories(FLAG(QChar::Symbol_Math) \|
3186	FLAG(QChar::Symbol_Currency) \|
3187	FLAG(QChar::Symbol_Modifier) \|
3188	FLAG(QChar::Symbol_Other));
3189	} else {
3190	switch (category.at(i: `1`)) {
3191	case `'m'`: yyCharClass ->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm
3192	case `'c'`: yyCharClass ->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc
3193	case `'k'`: yyCharClass ->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk
3194	case `'o'`: yyCharClass ->addCategories(FLAG(QChar::Symbol_Other)); break; // So
3195	default: error(RXERR_CATEGORY); break;
3196	}
3197	}
3198	break;
3199	default:
3200	error(RXERR_CATEGORY);
3201	break;
3202	}
3203	} else if (catlen > `2` && category.at(i: `0`) == `'I'` && category.at(i: `1`) == `'s'`) {
3204	static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[`0`]);
3205	const char * const categoryFamily = category.constData() + `2`;
3206	const CategoriesRangeMapEntry *r = std::lower_bound(first: categoriesRangeMap, last: categoriesRangeMap + N, val: categoryFamily);
3207	if (r != categoriesRangeMap + N && qstrcmp(str1: r->name, str2: categoryFamily) == `0`)
3208	yyCharClass ->addRange(from: r->first, to: r->second);
3209	else
3210	error(RXERR_CATEGORY);
3211	} else {
3212	error(RXERR_CATEGORY);
3213	}
3214	return Tok_CharClass;
3215	} else {
3216	break;
3217	}
3218	#endif
3219	#ifndef QT_NO_REGEXP_ESCAPE
3220	case `'x'`:
3221	val = `0`;
3222	for (i = `0`; i < `4`; i++) {
3223	low = QChar (yyCh).toLower().unicode();
3224	if (low >= `'0'` && low <= `'9'`)
3225	val = (val << `4`) \| (low - `'0'`);
3226	else if (low >= `'a'` && low <= `'f'`)
3227	val = (val << `4`) \| (low - `'a'` + `10`);
3228	else
3229	break;
3230	yyCh = getChar();
3231	}
3232	return Tok_Char \| val;
3233	#endif
3234	default:
3235	break;
3236	}
3237	if (prevCh >= `'1'` && prevCh <= `'9'`) {
3238	#ifndef QT_NO_REGEXP_BACKREF
3239	val = prevCh - `'0'`;
3240	while (yyCh >= `'0'` && yyCh <= `'9'`) {
3241	val = (val * `10`) + (yyCh - `'0'`);
3242	yyCh = getChar();
3243	}
3244	return Tok_BackRef \| val;
3245	#else
3246	error(RXERR_DISABLED);
3247	#endif
3248	}
3249	return Tok_Char \| prevCh;
3250	}
3251
3252	#ifndef QT_NO_REGEXP_INTERVAL
3253	int QRegExpEngine::getRep(int def)
3254	{
3255	if (yyCh >= `'0'` && yyCh <= `'9'`) {
3256	int rep = `0`;
3257	do {
3258	rep = `10` * rep + yyCh - `'0'`;
3259	if (rep >= InftyRep) {
3260	error(RXERR_REPETITION);
3261	rep = def;
3262	}
3263	yyCh = getChar();
3264	} while (yyCh >= `'0'` && yyCh <= `'9'`);
3265	return rep;
3266	} else {
3267	return def;
3268	}
3269	}
3270	#endif
3271
3272	#ifndef QT_NO_REGEXP_LOOKAHEAD
3273	void QRegExpEngine::skipChars(int n)
3274	{
3275	if (n > `0`) {
3276	yyPos += n - `1`;
3277	yyCh = getChar();
3278	}
3279	}
3280	#endif
3281
3282	void QRegExpEngine::error(const char *msg)
3283	{
3284	if (yyError.isEmpty())
3285	yyError = QLatin1String (msg);
3286	}
3287
3288	void QRegExpEngine::startTokenizer(const QChar rx, int* len)
3289	{
3290	yyIn = rx;
3291	yyPos0 = `0`;
3292	yyPos = `0`;
3293	yyLen = len;
3294	yyCh = getChar();
3295	yyCharClass.reset(other: new QRegExpCharClass);
3296	yyMinRep = `0`;
3297	yyMaxRep = `0`;
3298	yyError = QString ();
3299	}
3300
3301	int QRegExpEngine::getToken()
3302	{
3303	#ifndef QT_NO_REGEXP_CCLASS
3304	ushort pendingCh = `0`;
3305	bool charPending;
3306	bool rangePending;
3307	int tok;
3308	#endif
3309	int prevCh = yyCh;
3310
3311	yyPos0 = yyPos - `1`;
3312	#ifndef QT_NO_REGEXP_CCLASS
3313	yyCharClass ->clear();
3314	#endif
3315	yyMinRep = `0`;
3316	yyMaxRep = `0`;
3317	yyCh = getChar();
3318
3319	switch (prevCh) {
3320	case EOS:
3321	yyPos0 = yyPos;
3322	return Tok_Eos;
3323	case `'$'`:
3324	return Tok_Dollar;
3325	case `'('`:
3326	if (yyCh == `'?'`) {
3327	prevCh = getChar();
3328	yyCh = getChar();
3329	switch (prevCh) {
3330	#ifndef QT_NO_REGEXP_LOOKAHEAD
3331	case `'!'`:
3332	return Tok_NegLookahead;
3333	case `'='`:
3334	return Tok_PosLookahead;
3335	#endif
3336	case `':'`:
3337	return Tok_MagicLeftParen;
3338	case `'<'`:
3339	error(RXERR_LOOKBEHIND);
3340	return Tok_MagicLeftParen;
3341	default:
3342	error(RXERR_LOOKAHEAD);
3343	return Tok_MagicLeftParen;
3344	}
3345	} else {
3346	return Tok_LeftParen;
3347	}
3348	case `')'`:
3349	return Tok_RightParen;
3350	case `'*'`:
3351	yyMinRep = `0`;
3352	yyMaxRep = InftyRep;
3353	return Tok_Quantifier;
3354	case `'+'`:
3355	yyMinRep = `1`;
3356	yyMaxRep = InftyRep;
3357	return Tok_Quantifier;
3358	case `'.'`:
3359	#ifndef QT_NO_REGEXP_CCLASS
3360	yyCharClass ->setNegative(true);
3361	#endif
3362	return Tok_CharClass;
3363	case `'?'`:
3364	yyMinRep = `0`;
3365	yyMaxRep = `1`;
3366	return Tok_Quantifier;
3367	case `'['`:
3368	#ifndef QT_NO_REGEXP_CCLASS
3369	if (yyCh == `'^'`) {
3370	yyCharClass ->setNegative(true);
3371	yyCh = getChar();
3372	}
3373	charPending = false;
3374	rangePending = false;
3375	do {
3376	if (yyCh == `'-'` && charPending && !rangePending) {
3377	rangePending = true;
3378	yyCh = getChar();
3379	} else {
3380	if (charPending && !rangePending) {
3381	yyCharClass ->addSingleton(ch: pendingCh);
3382	charPending = false;
3383	}
3384	if (yyCh == `'\\'`) {
3385	yyCh = getChar();
3386	tok = getEscape();
3387	if (tok == Tok_Word)
3388	tok = `'\b'`;
3389	} else {
3390	tok = Tok_Char \| yyCh;
3391	yyCh = getChar();
3392	}
3393	if (tok == Tok_CharClass) {
3394	if (rangePending) {
3395	yyCharClass ->addSingleton(ch: `'-'`);
3396	yyCharClass ->addSingleton(ch: pendingCh);
3397	charPending = false;
3398	rangePending = false;
3399	}
3400	} else if ((tok & Tok_Char) != `0`) {
3401	if (rangePending) {
3402	yyCharClass ->addRange(from: pendingCh, to: tok ^ Tok_Char);
3403	charPending = false;
3404	rangePending = false;
3405	} else {
3406	pendingCh = tok ^ Tok_Char;
3407	charPending = true;
3408	}
3409	} else {
3410	error(RXERR_CHARCLASS);
3411	}
3412	}
3413	} while (yyCh != `']'` && yyCh != EOS);
3414	if (rangePending)
3415	yyCharClass ->addSingleton(ch: `'-'`);
3416	if (charPending)
3417	yyCharClass ->addSingleton(ch: pendingCh);
3418	if (yyCh == EOS)
3419	error(RXERR_END);
3420	else
3421	yyCh = getChar();
3422	return Tok_CharClass;
3423	#else
3424	error(RXERR_END);
3425	return Tok_Char \| `'['`;
3426	#endif
3427	case `'\\'`:
3428	return getEscape();
3429	case `']'`:
3430	error(RXERR_LEFTDELIM);
3431	return Tok_Char \| `']'`;
3432	case `'^'`:
3433	return Tok_Caret;
3434	case `'{'`:
3435	#ifndef QT_NO_REGEXP_INTERVAL
3436	yyMinRep = getRep(def: `0`);
3437	yyMaxRep = yyMinRep;
3438	if (yyCh == `','`) {
3439	yyCh = getChar();
3440	yyMaxRep = getRep(def: InftyRep);
3441	}
3442	if (yyMaxRep < yyMinRep)
3443	error(RXERR_INTERVAL);
3444	if (yyCh != `'}'`)
3445	error(RXERR_REPETITION);
3446	yyCh = getChar();
3447	return Tok_Quantifier;
3448	#else
3449	error(RXERR_DISABLED);
3450	return Tok_Char \| `'{'`;
3451	#endif
3452	case `'\|'`:
3453	return Tok_Bar;
3454	case `'}'`:
3455	error(RXERR_LEFTDELIM);
3456	return Tok_Char \| `'}'`;
3457	default:
3458	return Tok_Char \| prevCh;
3459	}
3460	}
3461
3462	int QRegExpEngine::parse(const QChar pattern, int* len)
3463	{
3464	valid = true;
3465	startTokenizer(rx: pattern, len);
3466	yyTok = getToken();
3467	#ifndef QT_NO_REGEXP_CAPTURE
3468	yyMayCapture = true;
3469	#else
3470	yyMayCapture = false;
3471	#endif
3472
3473	#ifndef QT_NO_REGEXP_CAPTURE
3474	int atom = startAtom(officialCapture: false);
3475	#endif
3476	QRegExpCharClass anything;
3477	Box box(this); // create InitialState
3478	box.set(anything);
3479	Box rightBox(this); // create FinalState
3480	rightBox.set(anything);
3481
3482	Box middleBox(this);
3483	parseExpression(box: &middleBox);
3484	#ifndef QT_NO_REGEXP_CAPTURE
3485	finishAtom(atom, needCapture: false);
3486	#endif
3487	#ifndef QT_NO_REGEXP_OPTIM
3488	middleBox.setupHeuristics();
3489	#endif
3490	box.cat(b: middleBox);
3491	box.cat(b: rightBox);
3492	yyCharClass.reset();
3493
3494	#ifndef QT_NO_REGEXP_CAPTURE
3495	for (int i = `0`; i < nf; ++i) {
3496	switch (f [i].capture) {
3497	case QRegExpAtom::NoCapture:
3498	break;
3499	case QRegExpAtom::OfficialCapture:
3500	f [i].capture = ncap;
3501	captureForOfficialCapture.append(t: ncap);
3502	++ncap;
3503	++officialncap;
3504	break;
3505	case QRegExpAtom::UnofficialCapture:
3506	f [i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;
3507	}
3508	}
3509
3510	#ifndef QT_NO_REGEXP_BACKREF
3511	#ifndef QT_NO_REGEXP_OPTIM
3512	if (officialncap == `0` && nbrefs == `0`) {
3513	ncap = nf = `0`;
3514	f.clear();
3515	}
3516	#endif
3517	// handle the case where there's a \5 with no corresponding capture
3518	// (captureForOfficialCapture.size() != officialncap)
3519	for (int i = `0`; i < nbrefs - officialncap; ++i) {
3520	captureForOfficialCapture.append(t: ncap);
3521	++ncap;
3522	}
3523	#endif
3524	#endif
3525
3526	if (!yyError.isEmpty())
3527	return -`1`;
3528
3529	#ifndef QT_NO_REGEXP_OPTIM
3530	const QRegExpAutomatonState &sinit = s.at(i: InitialState);
3531	caretAnchored = !sinit.anchors.isEmpty();
3532	if (caretAnchored) {
3533	const QMap<int, int> &anchors = sinit.anchors;
3534	QMap<int, int>::const_iterator a;
3535	for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {
3536	if (
3537	#ifndef QT_NO_REGEXP_ANCHOR_ALT
3538	(*a & Anchor_Alternation) != `0` \|\|
3539	#endif
3540	(*a & Anchor_Caret) == `0`)
3541	{
3542	caretAnchored = false;
3543	break;
3544	}
3545	}
3546	}
3547	#endif
3548
3549	// cleanup anchors
3550	int numStates = s.size();
3551	for (int i = `0`; i < numStates; ++i) {
3552	QRegExpAutomatonState &state = s [i];
3553	if (!state.anchors.isEmpty()) {
3554	QMap<int, int>::iterator a = state.anchors.begin();
3555	while (a != state.anchors.end()) {
3556	if (a.value() == `0`)
3557	a = state.anchors.erase(it: a);
3558	else
3559	++a;
3560	}
3561	}
3562	}
3563
3564	return yyPos0;
3565	}
3566
3567	void QRegExpEngine::parseAtom(Box *box)
3568	{
3569	#ifndef QT_NO_REGEXP_LOOKAHEAD
3570	QRegExpEngine eng = nullptr*;
3571	bool neg;
3572	int len;
3573	#endif
3574
3575	if ((yyTok & Tok_Char) != `0`) {
3576	box->set(QChar (yyTok ^ Tok_Char));
3577	} else {
3578	#ifndef QT_NO_REGEXP_OPTIM
3579	trivial = false;
3580	#endif
3581	switch (yyTok) {
3582	case Tok_Dollar:
3583	box->catAnchor(a: Anchor_Dollar);
3584	break;
3585	case Tok_Caret:
3586	box->catAnchor(a: Anchor_Caret);
3587	break;
3588	#ifndef QT_NO_REGEXP_LOOKAHEAD
3589	case Tok_PosLookahead:
3590	case Tok_NegLookahead:
3591	neg = (yyTok == Tok_NegLookahead);
3592	eng = new QRegExpEngine (cs, greedyQuantifiers);
3593	len = eng->parse(pattern: yyIn + yyPos - `1`, len: yyLen - yyPos + `1`);
3594	if (len >= `0`)
3595	skipChars(n: len);
3596	else
3597	error(RXERR_LOOKAHEAD);
3598	box->catAnchor(a: addLookahead(eng, negative: neg));
3599	yyTok = getToken();
3600	if (yyTok != Tok_RightParen)
3601	error(RXERR_LOOKAHEAD);
3602	break;
3603	#endif
3604	#ifndef QT_NO_REGEXP_ESCAPE
3605	case Tok_Word:
3606	box->catAnchor(a: Anchor_Word);
3607	break;
3608	case Tok_NonWord:
3609	box->catAnchor(a: Anchor_NonWord);
3610	break;
3611	#endif
3612	case Tok_LeftParen:
3613	case Tok_MagicLeftParen:
3614	yyTok = getToken();
3615	parseExpression(box);
3616	if (yyTok != Tok_RightParen)
3617	error(RXERR_END);
3618	break;
3619	case Tok_CharClass:
3620	box->set(*yyCharClass);
3621	break;
3622	case Tok_Quantifier:
3623	error(RXERR_REPETITION);
3624	break;
3625	default:
3626	#ifndef QT_NO_REGEXP_BACKREF
3627	if ((yyTok & Tok_BackRef) != `0`)
3628	box->set(yyTok ^ Tok_BackRef);
3629	else
3630	#endif
3631	error(RXERR_DISABLED);
3632	}
3633	}
3634	yyTok = getToken();
3635	}
3636
3637	void QRegExpEngine::parseFactor(Box *box)
3638	{
3639	#ifndef QT_NO_REGEXP_CAPTURE
3640	int outerAtom = greedyQuantifiers ? startAtom(officialCapture: false) : -`1`;
3641	int innerAtom = startAtom(officialCapture: yyMayCapture && yyTok == Tok_LeftParen);
3642	bool magicLeftParen = (yyTok == Tok_MagicLeftParen);
3643	#else
3644	const int innerAtom = -`1`;
3645	#endif
3646
3647	#ifndef QT_NO_REGEXP_INTERVAL
3648	#define YYREDO() \
3649	yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \
3650	*yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok
3651
3652	const QChar *in = yyIn;
3653	int pos0 = yyPos0;
3654	int pos = yyPos;
3655	int len = yyLen;
3656	int ch = yyCh;
3657	QRegExpCharClass charClass;
3658	if (yyTok == Tok_CharClass)
3659	charClass = *yyCharClass;
3660	int tok = yyTok;
3661	bool mayCapture = yyMayCapture;
3662	#endif
3663
3664	parseAtom(box);
3665	#ifndef QT_NO_REGEXP_CAPTURE
3666	finishAtom(atom: innerAtom, needCapture: magicLeftParen);
3667	#endif
3668
3669	bool hasQuantifier = (yyTok == Tok_Quantifier);
3670	if (hasQuantifier) {
3671	#ifndef QT_NO_REGEXP_OPTIM
3672	trivial = false;
3673	#endif
3674	if (yyMaxRep == InftyRep) {
3675	box->plus(atom: innerAtom);
3676	#ifndef QT_NO_REGEXP_INTERVAL
3677	} else if (yyMaxRep == `0`) {
3678	box->clear();
3679	#endif
3680	}
3681	if (yyMinRep == `0`)
3682	box->opt();
3683
3684	#ifndef QT_NO_REGEXP_INTERVAL
3685	yyMayCapture = false;
3686	int alpha = (yyMinRep == `0`) ? `0` : yyMinRep - `1`;
3687	int beta = (yyMaxRep == InftyRep) ? `0` : yyMaxRep - (alpha + `1`);
3688
3689	Box rightBox(this);
3690	int i;
3691
3692	for (i = `0`; i < beta; i++) {
3693	YYREDO();
3694	Box leftBox(this);
3695	parseAtom(box: &leftBox);
3696	leftBox.cat(b: rightBox);
3697	leftBox.opt();
3698	rightBox = leftBox;
3699	}
3700	for (i = `0`; i < alpha; i++) {
3701	YYREDO();
3702	Box leftBox(this);
3703	parseAtom(box: &leftBox);
3704	leftBox.cat(b: rightBox);
3705	rightBox = leftBox;
3706	}
3707	rightBox.cat(b: *box);
3708	*box = rightBox;
3709	#endif
3710	yyTok = getToken();
3711	#ifndef QT_NO_REGEXP_INTERVAL
3712	yyMayCapture = mayCapture;
3713	#endif
3714	}
3715	#undef YYREDO
3716	#ifndef QT_NO_REGEXP_CAPTURE
3717	if (greedyQuantifiers)
3718	finishAtom(atom: outerAtom, needCapture: hasQuantifier);
3719	#endif
3720	}
3721
3722	void QRegExpEngine::parseTerm(Box *box)
3723	{
3724	#ifndef QT_NO_REGEXP_OPTIM
3725	if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)
3726	parseFactor(box);
3727	#endif
3728	while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {
3729	Box rightBox(this);
3730	parseFactor(box: &rightBox);
3731	box->cat(b: rightBox);
3732	}
3733	}
3734
3735	void QRegExpEngine::parseExpression(Box *box)
3736	{
3737	parseTerm(box);
3738	while (yyTok == Tok_Bar) {
3739	#ifndef QT_NO_REGEXP_OPTIM
3740	trivial = false;
3741	#endif
3742	Box rightBox(this);
3743	yyTok = getToken();
3744	parseTerm(box: &rightBox);
3745	box->orx(b: rightBox);
3746	}
3747	}
3748
3749	/*
3750	The struct QRegExpPrivate contains the private data of a regular
3751	expression other than the automaton. It makes it possible for many
3752	QRegExp objects to use the same QRegExpEngine object with different
3753	QRegExpPrivate objects.
3754	*/
3755	struct QRegExpPrivate
3756	{
3757	QRegExpEngine *eng;
3758	QRegExpEngineKey engineKey;
3759	bool minimal;
3760	#ifndef QT_NO_REGEXP_CAPTURE
3761	QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()
3762	QStringList capturedCache; // what QRegExp::capturedTexts() returned last
3763	#endif
3764	QRegExpMatchState matchState;
3765
3766	inline QRegExpPrivate()
3767	: eng(nullptr), engineKey (QString (), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }
3768	inline QRegExpPrivate(const QRegExpEngineKey &key)
3769	: eng(nullptr), engineKey (key), minimal(false) {}
3770	};
3771
3772	#if !defined(QT_NO_REGEXP_OPTIM)
3773	struct QRECache
3774	{
3775	typedef QHash<QRegExpEngineKey, QRegExpEngine *> EngineCache;
3776	typedef QCache<QRegExpEngineKey, QRegExpEngine> UnusedEngineCache;
3777	EngineCache usedEngines;
3778	UnusedEngineCache unusedEngines;
3779	};
3780	Q_GLOBAL_STATIC(QRECache, engineCache)
3781	static QBasicMutex engineCacheMutex;
3782	#endif // QT_NO_REGEXP_OPTIM
3783
3784	static void derefEngine(QRegExpEngine eng, const* QRegExpEngineKey &key)
3785	{
3786	#if !defined(QT_NO_REGEXP_OPTIM)
3787	const auto locker = qt_scoped_lock(mutex&: engineCacheMutex);
3788	if (!eng->ref.deref()) {
3789	if (QRECache *c = engineCache ()) {
3790	c->unusedEngines.insert(key, object: eng, cost: `4` + key.pattern.size() / `4`);
3791	c->usedEngines.remove(key);
3792	} else {
3793	delete eng;
3794	}
3795	}
3796	#else
3797	Q_UNUSED(key);
3798	if (!eng->ref.deref())
3799	delete eng;
3800	#endif
3801	}
3802
3803	static void prepareEngine_helper(QRegExpPrivate *priv)
3804	{
3805	Q_ASSERT(!priv->eng);
3806
3807	#if !defined(QT_NO_REGEXP_OPTIM)
3808	const auto locker = qt_scoped_lock(mutex&: engineCacheMutex);
3809	if (QRECache *c = engineCache ()) {
3810	priv->eng = c->unusedEngines.take(key: priv->engineKey);
3811	if (!priv->eng)
3812	priv->eng = c->usedEngines.value(key: priv->engineKey);
3813	if (!priv->eng)
3814	priv->eng = new QRegExpEngine (priv->engineKey);
3815	else
3816	priv->eng->ref.ref();
3817
3818	c->usedEngines.insert(key: priv->engineKey, value: priv->eng);
3819	return;
3820	}
3821	#endif // QT_NO_REGEXP_OPTIM
3822
3823	priv->eng = new QRegExpEngine (priv->engineKey);
3824	}
3825
3826	inline static void prepareEngine(QRegExpPrivate *priv)
3827	{
3828	if (priv->eng)
3829	return;
3830	prepareEngine_helper(priv);
3831	priv->matchState.prepareForMatch(eng: priv->eng);
3832	}
3833
3834	static void prepareEngineForMatch(QRegExpPrivate priv, const* QString &str)
3835	{
3836	prepareEngine(priv);
3837	priv->matchState.prepareForMatch(eng: priv->eng);
3838	#ifndef QT_NO_REGEXP_CAPTURE
3839	priv->t = str;
3840	priv->capturedCache.clear();
3841	#else
3842	Q_UNUSED(str);
3843	#endif
3844	}
3845
3846	static void invalidateEngine(QRegExpPrivate *priv)
3847	{
3848	if (priv->eng) {
3849	derefEngine(eng: priv->eng, key: priv->engineKey);
3850	priv->eng = nullptr;
3851	priv->matchState.drain();
3852	}
3853	}
3854
3855	/!*
3856	\enum QRegExp::CaretMode
3857
3858	The CaretMode enum defines the different meanings of the caret
3859	(\b{^}) in a regular expression. The possible values are:
3860
3861	\value CaretAtZero
3862	The caret corresponds to index 0 in the searched string.
3863
3864	\value CaretAtOffset
3865	The caret corresponds to the start offset of the search.
3866
3867	\value CaretWontMatch
3868	The caret never matches.
3869	*/
3870
3871	/!*
3872	\enum QRegExp::PatternSyntax
3873
3874	The syntax used to interpret the meaning of the pattern.
3875
3876	\value RegExp A rich Perl-like pattern matching syntax. This is
3877	the default.
3878
3879	\value RegExp2 Like RegExp, but with \l{greedy quantifiers}.
3880	(Introduced in Qt 4.2.)
3881
3882	\value Wildcard This provides a simple pattern matching syntax
3883	similar to that used by shells (command interpreters) for "file
3884	globbing". See \l{QRegExp wildcard matching}.
3885
3886	\value WildcardUnix This is similar to Wildcard but with the
3887	behavior of a Unix shell. The wildcard characters can be escaped
3888	with the character "\\".
3889
3890	\value FixedString The pattern is a fixed string. This is
3891	equivalent to using the RegExp pattern on a string in
3892	which all metacharacters are escaped using escape().
3893
3894	\value W3CXmlSchema11 The pattern is a regular expression as
3895	defined by the W3C XML Schema 1.1 specification.
3896
3897	\sa setPatternSyntax()
3898	*/
3899
3900	/!*
3901	Constructs an empty regexp.
3902
3903	\sa isValid(), errorString()
3904	*/
3905	QRegExp::QRegExp()
3906	{
3907	priv = new QRegExpPrivate;
3908	prepareEngine(priv);
3909	}
3910
3911	/!*
3912	Constructs a regular expression object for the given \a pattern
3913	string. The pattern must be given using wildcard notation if \a
3914	syntax is \l Wildcard; the default is \l RegExp. The pattern is
3915	case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
3916	greedy (maximal), but can be changed by calling
3917	setMinimal().
3918
3919	\sa setPattern(), setCaseSensitivity(), setPatternSyntax()
3920	*/
3921	QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
3922	{
3923	priv = new QRegExpPrivate (QRegExpEngineKey (pattern, syntax, cs));
3924	prepareEngine(priv);
3925	}
3926
3927	/!*
3928	Constructs a regular expression as a copy of \a rx.
3929
3930	\sa operator=()
3931	*/
3932	QRegExp::QRegExp(const QRegExp &rx)
3933	{
3934	priv = new QRegExpPrivate;
3935	operator=(rx);
3936	}
3937
3938	/!*
3939	Destroys the regular expression and cleans up its internal data.
3940	*/
3941	QRegExp::~QRegExp()
3942	{
3943	invalidateEngine(priv);
3944	delete priv;
3945	}
3946
3947	/!*
3948	Copies the regular expression \a rx and returns a reference to the
3949	copy. The case sensitivity, wildcard, and minimal matching options
3950	are also copied.
3951	*/
3952	QRegExp &QRegExp::operator=(const QRegExp &rx)
3953	{
3954	prepareEngine(priv: rx.priv); // to allow sharing
3955	QRegExpEngine *otherEng = rx.priv->eng;
3956	if (otherEng)
3957	otherEng->ref.ref();
3958	invalidateEngine(priv);
3959	priv->eng = otherEng;
3960	priv->engineKey = rx.priv->engineKey;
3961	priv->minimal = rx.priv->minimal;
3962	#ifndef QT_NO_REGEXP_CAPTURE
3963	priv->t = rx.priv->t;
3964	priv->capturedCache = rx.priv->capturedCache;
3965	#endif
3966	if (priv->eng)
3967	priv->matchState.prepareForMatch(eng: priv->eng);
3968	priv->matchState.captured = rx.priv->matchState.captured;
3969	return *this;
3970	}
3971
3972	/!*
3973	\fn QRegExp &QRegExp::operator=(QRegExp &&other)
3974
3975	Move-assigns \a other to this QRegExp instance.
3976
3977	\since 5.2
3978	*/
3979
3980	/!*
3981	\fn void QRegExp::swap(QRegExp &other)
3982	\since 4.8
3983
3984	Swaps regular expression \a other with this regular
3985	expression. This operation is very fast and never fails.
3986	*/
3987
3988	/!*
3989	Returns \c true if this regular expression is equal to \a rx;
3990	otherwise returns \c false.
3991
3992	Two QRegExp objects are equal if they have the same pattern
3993	strings and the same settings for case sensitivity, wildcard and
3994	minimal matching.
3995	*/
3996	bool QRegExp::operator==(const QRegExp &rx) const
3997	{
3998	return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;
3999	}
4000
4001	/!*
4002	\since 5.6
4003	\relates QRegExp
4004
4005	Returns the hash value for \a key, using
4006	\a seed to seed the calculation.
4007	*/
4008	size_t qHash(const QRegExp &key, size_t seed) noexcept
4009	{
4010	QtPrivate::QHashCombine hash;
4011	seed = hash (seed, key.priv->engineKey);
4012	seed = hash (seed, key.priv->minimal);
4013	return seed;
4014	}
4015
4016	/!*
4017	\fn bool QRegExp::operator!=(const QRegExp &rx) const
4018
4019	Returns \c true if this regular expression is not equal to \a rx;
4020	otherwise returns \c false.
4021
4022	\sa operator==()
4023	*/
4024
4025	/!*
4026	Returns \c true if the pattern string is empty; otherwise returns
4027	false.
4028
4029	If you call exactMatch() with an empty pattern on an empty string
4030	it will return true; otherwise it returns \c false since it operates
4031	over the whole string. If you call indexIn() with an empty pattern
4032	on \e any string it will return the start offset (0 by default)
4033	because the empty pattern matches the 'emptiness' at the start of
4034	the string. In this case the length of the match returned by
4035	matchedLength() will be 0.
4036
4037	See QString::isEmpty().
4038	*/
4039
4040	bool QRegExp::isEmpty() const
4041	{
4042	return priv->engineKey.pattern.isEmpty();
4043	}
4044
/*!
    Returns \c true if the regular expression is valid; otherwise returns
    false. An invalid regular expression never matches.

    The pattern \b{[a-z} is an example of an invalid pattern, since
    it lacks a closing square bracket.

    Note that the validity of a regexp may also depend on the setting
    of the wildcard flag, for example \b{*.html} is a valid
    wildcard regexp but an invalid full regexp.

    \sa errorString()
*/
4058	bool QRegExp::isValid() const
4059	{
4060	if (priv->engineKey.pattern.isEmpty()) {
4061	return true;
4062	} else {
4063	prepareEngine(priv);
4064	return priv->eng->isValid();
4065	}
4066	}
4067
4068	/!*
4069	Returns the pattern string of the regular expression. The pattern
4070	has either regular expression syntax or wildcard syntax, depending
4071	on patternSyntax().
4072
4073	\sa patternSyntax(), caseSensitivity()
4074	*/
4075	QString QRegExp::pattern() const
4076	{
4077	return priv->engineKey.pattern;
4078	}
4079
4080	/!*
4081	Sets the pattern string to \a pattern. The case sensitivity,
4082	wildcard, and minimal matching options are not changed.
4083
4084	\sa setPatternSyntax(), setCaseSensitivity()
4085	*/
4086	void QRegExp::setPattern(const QString &pattern)
4087	{
4088	if (priv->engineKey.pattern != pattern) {
4089	invalidateEngine(priv);
4090	priv->engineKey.pattern = pattern;
4091	}
4092	}
4093
4094	/!*
4095	Returns Qt::CaseSensitive if the regexp is matched case
4096	sensitively; otherwise returns Qt::CaseInsensitive.
4097
4098	\sa patternSyntax(), pattern(), isMinimal()
4099	*/
4100	Qt::CaseSensitivity QRegExp::caseSensitivity() const
4101	{
4102	return priv->engineKey.cs;
4103	}
4104
4105	/!*
4106	Sets case sensitive matching to \a cs.
4107
4108	If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches
4109	\c{readme.txt} but not \c{README.TXT}.
4110
4111	\sa setPatternSyntax(), setPattern(), setMinimal()
4112	*/
4113	void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)
4114	{
4115	if ((bool)cs != (bool)priv->engineKey.cs) {
4116	invalidateEngine(priv);
4117	priv->engineKey.cs = cs;
4118	}
4119	}
4120
4121	/!*
4122	Returns the syntax used by the regular expression. The default is
4123	QRegExp::RegExp.
4124
4125	\sa pattern(), caseSensitivity()
4126	*/
4127	QRegExp::PatternSyntax QRegExp::patternSyntax() const
4128	{
4129	return priv->engineKey.patternSyntax;
4130	}
4131
/*!
    Sets the syntax mode for the regular expression. The default is
    QRegExp::RegExp.

    Setting \a syntax to QRegExp::Wildcard enables simple shell-like
    \l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the
    string \c{readme.txt} in wildcard mode, but does not match
    \c{readme}.

    Setting \a syntax to QRegExp::FixedString means that the pattern
    is interpreted as a plain string. Special characters (e.g.,
    backslash) don't need to be escaped then.

    \sa setPattern(), setCaseSensitivity(), escape()
*/
4147	void QRegExp::setPatternSyntax(PatternSyntax syntax)
4148	{
4149	if (syntax != priv->engineKey.patternSyntax) {
4150	invalidateEngine(priv);
4151	priv->engineKey.patternSyntax = syntax;
4152	}
4153	}
4154
4155	/!*
4156	Returns \c true if minimal (non-greedy) matching is enabled;
4157	otherwise returns \c false.
4158
4159	\sa caseSensitivity(), setMinimal()
4160	*/
4161	bool QRegExp::isMinimal() const
4162	{
4163	return priv->minimal;
4164	}
4165
/*!
    Enables or disables minimal matching. If \a minimal is false,
    matching is greedy (maximal) which is the default.

    For example, suppose we have the input string "We must be
    <b>bold</b>, very <b>bold</b>!" and the pattern
    \b{<b>.*</b>}. With the default greedy (maximal) matching,
    the match is "We must be \underline{<b>bold</b>, very
    <b>bold</b>}!". But with minimal (non-greedy) matching, the
    first match is: "We must be \underline{<b>bold</b>}, very
    <b>bold</b>!" and the second match is "We must be <b>bold</b>,
    very \underline{<b>bold</b>}!". In practice we might use the pattern
    \b{<b>[^<]*\</b>} instead, although this will still fail for
    nested tags.

    \sa setCaseSensitivity()
*/
4183	void QRegExp::setMinimal(bool minimal)
4184	{
4185	priv->minimal = minimal;
4186	}
4187
4188	// ### Qt 5: make non-const
4189	/!*
4190	Returns \c true if \a str is matched exactly by this regular
4191	expression; otherwise returns \c false. You can determine how much of
4192	the string was matched by calling matchedLength().
4193
4194	For a given regexp string R, exactMatch("R") is the equivalent of
4195	indexIn("^R$") since exactMatch() effectively encloses the regexp
4196	in the start of string and end of string anchors, except that it
4197	sets matchedLength() differently.
4198
4199	For example, if the regular expression is \b{blue}, then
4200	exactMatch() returns \c true only for input \c blue. For inputs \c
4201	bluebell, \c blutak and \c lightblue, exactMatch() returns \c false
4202	and matchedLength() will return 4, 3 and 0 respectively.
4203
4204	Although const, this function sets matchedLength(),
4205	capturedTexts(), and pos().
4206
4207	\sa indexIn(), lastIndexIn()
4208	*/
4209	bool QRegExp::exactMatch(const QString &str) const
4210	{
4211	prepareEngineForMatch(priv, str);
4212	priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: `0`, minimal0: priv->minimal, oneTest: true, caretIndex: `0`);
4213	if (priv->matchState.captured[`1`] == str.size()) {
4214	return true;
4215	} else {
4216	priv->matchState.captured[`0`] = `0`;
4217	priv->matchState.captured[`1`] = priv->matchState.oneTestMatchedLen;
4218	return false;
4219	}
4220	}
4221
4222	/!*
4223	Returns the regexp as a QVariant
4224	*/
4225	QRegExp::operator QVariant() const
4226	{
4227	QT_WARNING_PUSH QT_WARNING_DISABLE_DEPRECATED
4228	QVariant v;
4229	v.setValue(*this);
4230	return v;
4231	QT_WARNING_POP
4232	}
4233
4234	// ### Qt 5: make non-const
4235	/!*
4236	Attempts to find a match in \a str from position \a offset (0 by
4237	default). If \a offset is -1, the search starts at the last
4238	character; if -2, at the next to last character; etc.
4239
4240	Returns the position of the first match, or -1 if there was no
4241	match.
4242
4243	The \a caretMode parameter can be used to instruct whether \b{^}
4244	should match at index 0 or at \a offset.
4245
4246	You might prefer to use QString::indexOf(), QString::contains(),
4247	or even QStringList::filter(). To replace matches use
4248	QString::replace().
4249
4250	Example:
4251	\snippet code/src_corelib_text_qregexp.cpp 13
4252
4253	Although const, this function sets matchedLength(),
4254	capturedTexts() and pos().
4255
    If the QRegExp is a wildcard expression (see setPatternSyntax())
    and you want to test a string against the whole wildcard expression,
    use exactMatch() instead of this function.
4259
4260	\sa lastIndexIn(), exactMatch()
4261	*/
4262
4263	int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const
4264	{
4265	prepareEngineForMatch(priv, str);
4266	if (offset < `0`)
4267	offset += str.size();
4268	priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: offset,
4269	minimal0: priv->minimal, oneTest: false, caretIndex: caretIndex(offset, caretMode));
4270	return priv->matchState.captured[`0`];
4271	}
4272
4273	// ### Qt 5: make non-const
4274	/!*
4275	Attempts to find a match backwards in \a str from position \a
4276	offset. If \a offset is -1 (the default), the search starts at the
4277	last character; if -2, at the next to last character; etc.
4278
4279	Returns the position of the first match, or -1 if there was no
4280	match.
4281
4282	The \a caretMode parameter can be used to instruct whether \b{^}
4283	should match at index 0 or at \a offset.
4284
4285	Although const, this function sets matchedLength(),
4286	capturedTexts() and pos().
4287
4288	\warning Searching backwards is much slower than searching
4289	forwards.
4290
4291	\sa indexIn(), exactMatch()
4292	*/
4293
4294	int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const
4295	{
4296	prepareEngineForMatch(priv, str);
4297	if (offset < `0`)
4298	offset += str.size();
4299	if (offset < `0` \|\| offset > str.size()) {
4300	memset(s: priv->matchState.captured, c: -`1`, n: priv->matchState.capturedSize*sizeof(int));
4301	return -`1`;
4302	}
4303
4304	while (offset >= `0`) {
4305	priv->matchState.match(str0: str.unicode(), len0: str.size(), pos0: offset,
4306	minimal0: priv->minimal, oneTest: true, caretIndex: caretIndex(offset, caretMode));
4307	if (priv->matchState.captured[`0`] == offset)
4308	return offset;
4309	--offset;
4310	}
4311	return -`1`;
4312	}
4313
4314	/!*
4315	Returns the length of the last matched string, or -1 if there was
4316	no match.
4317
4318	\sa exactMatch(), indexIn(), lastIndexIn()
4319	*/
4320	int QRegExp::matchedLength() const
4321	{
4322	return priv->matchState.captured[`1`];
4323	}
4324
4325
4326	/!*
4327	Replaces every occurrence of this regular expression in
4328	\a str with \a after and returns the result.
4329
4330	For regular expressions containing \l{capturing parentheses},
4331	occurrences of \b{\\1}, \b{\\2}, ..., in \a after are replaced
4332	with \c {rx}.cap(1), cap(2), ...
4333
4334	\sa indexIn(), lastIndexIn(), QRegExp::cap()
4335	*/
4336	QString QRegExp::replaceIn(const QString &str, const QString &after) const
4337	{
4338	struct QStringCapture
4339	{
4340	int pos;
4341	int len;
4342	int no;
4343	};
4344
4345	QRegExp rx2(*this);
4346
4347	if (str.isEmpty() && rx2.indexIn(str) == -`1`)
4348	return str;
4349
4350	QString s(str);
4351
4352	int index = `0`;
4353	int numCaptures = rx2.captureCount();
4354	int al = after.size();
4355	QRegExp::CaretMode caretMode = QRegExp::CaretAtZero;
4356
4357	if (numCaptures > `0`) {
4358	const QChar *uc = after.unicode();
4359	int numBackRefs = `0`;
4360
4361	for (int i = `0`; i < al - `1`; i++) {
4362	if (uc[i] == QLatin1Char (`'\\'`)) {
4363	int no = uc[i + `1`].digitValue();
4364	if (no > `0` && no <= numCaptures)
4365	numBackRefs++;
4366	}
4367	}
4368
4369	/*
4370	This is the harder case where we have back-references.
4371	*/
4372	if (numBackRefs > `0`) {
4373	QVarLengthArray<QStringCapture, `16`> captures(numBackRefs);
4374	int j = `0`;
4375
4376	for (int i = `0`; i < al - `1`; i++) {
4377	if (uc[i] == QLatin1Char (`'\\'`)) {
4378	int no = uc[i + `1`].digitValue();
4379	if (no > `0` && no <= numCaptures) {
4380	QStringCapture capture;
4381	capture.pos = i;
4382	capture.len = `2`;
4383
4384	if (i < al - `2`) {
4385	int secondDigit = uc[i + `2`].digitValue();
4386	if (secondDigit != -`1` && ((no * `10`) + secondDigit) <= numCaptures) {
4387	no = (no * `10`) + secondDigit;
4388	++capture.len;
4389	}
4390	}
4391
4392	capture.no = no;
4393	captures [j++] = capture;
4394	}
4395	}
4396	}
4397
4398	while (index <= s.size()) {
4399	index = rx2.indexIn(str: s, offset: index, caretMode);
4400	if (index == -`1`)
4401	break;
4402
4403	QString after2(after);
4404	for (j = numBackRefs - `1`; j >= `0`; j--) {
4405	const QStringCapture &capture = captures [j];
4406	after2.replace(i: capture.pos, len: capture.len, after: rx2.cap(nth: capture.no));
4407	}
4408
4409	s.replace(i: index, len: rx2.matchedLength(), after: after2);
4410	index += after2.size();
4411
4412	// avoid infinite loop on 0-length matches (e.g., QRegExp("[a-z]"))*
4413	if (rx2.matchedLength() == `0`)
4414	++index;
4415
4416	caretMode = QRegExp::CaretWontMatch;
4417	}
4418	return s;
4419	}
4420	}
4421
4422	/*
4423	This is the simple and optimized case where we don't have
4424	back-references.
4425	*/
4426	while (index != -`1`) {
4427	struct {
4428	int pos;
4429	int length;
4430	} replacements[`2048`];
4431
4432	int pos = `0`;
4433	int adjust = `0`;
4434	while (pos < `2047`) {
4435	index = rx2.indexIn(str: s, offset: index, caretMode);
4436	if (index == -`1`)
4437	break;
4438	int ml = rx2.matchedLength();
4439	replacements[pos].pos = index;
4440	replacements[pos++].length = ml;
4441	index += ml;
4442	adjust += al - ml;
4443	// avoid infinite loop
4444	if (!ml)
4445	index++;
4446	}
4447	if (!pos)
4448	break;
4449	replacements[pos].pos = s.size();
4450	int newlen = s.size() + adjust;
4451
4452	// to continue searching at the right position after we did
4453	// the first round of replacements
4454	if (index != -`1`)
4455	index += adjust;
4456	QString newstring;
4457	newstring.reserve(asize: newlen + `1`);
4458	QChar *newuc = newstring.data();
4459	QChar *uc = newuc;
4460	int copystart = `0`;
4461	int i = `0`;
4462	while (i < pos) {
4463	int copyend = replacements[i].pos;
4464	int size = copyend - copystart;
4465	memcpy(dest: static_cast<void>(uc), src: static_cast<const* void >(s.constData() + copystart), n: size sizeof(QChar));
4466	uc += size;
4467	memcpy(dest: static_cast<void >(uc), src: static_cast<const* void >(after.constData()), n: al sizeof(QChar));
4468	uc += al;
4469	copystart = copyend + replacements[i].length;
4470	i++;
4471	}
4472	memcpy(dest: static_cast<void >(uc), src: static_cast<const* void >(s.constData() + copystart), n: (s.size() - copystart) sizeof(QChar));
4473	newstring.resize(size: newlen);
4474	s = newstring;
4475	caretMode = QRegExp::CaretWontMatch;
4476	}
4477	return s;
4478
4479	}
4480
4481
/*!
    \fn QString QRegExp::removeIn(const QString &str) const

    Removes every occurrence of this regular expression in \a str, and
    returns the result.

    Does the same as replaceIn(str, QString()).

    \sa indexIn(), lastIndexIn(), replaceIn()
*/
4492
4493
/*!
    \fn int QRegExp::countIn(const QString &str) const

    Returns the number of times this regular expression matches
    in \a str.

    \sa indexIn(), lastIndexIn(), replaceIn()
*/
4502
4503	int QRegExp::countIn(const QString &str) const
4504	{
4505	QRegExp rx2(*this);
4506	int count = `0`;
4507	int index = -`1`;
4508	int len = str.size();
4509	while (index < len - `1`) { // count overlapping matches
4510	index = rx2.indexIn(str, offset: index + `1`);
4511	if (index == -`1`)
4512	break;
4513	count++;
4514	}
4515	return count;
4516	}
4517
4518	/!*
4519	Splits \a str into substrings wherever this regular expression
4520	matches, and returns the list of those strings. If this regular
4521	expression does not match anywhere in the string, split() returns a
4522	single-element list containing \a str.
4523
4524	If \a behavior is set to Qt::KeepEmptyParts, empty fields are
4525	included in the resulting list.
4526
4527	\sa QStringList::join(), QString::split()
4528	*/
4529	QStringList QRegExp::splitString(const QString &str, Qt::SplitBehavior behavior) const
4530	{
4531	QRegExp rx2(*this);
4532	QStringList list;
4533	int start = `0`;
4534	int extra = `0`;
4535	int end;
4536	while ((end = rx2.indexIn(str, offset: start + extra)) != -`1`) {
4537	int matchedLen = rx2.matchedLength();
4538	if (start != end \|\| behavior == Qt::KeepEmptyParts)
4539	list.append(t: str.mid(position: start, n: end - start));
4540	start = end + matchedLen;
4541	extra = (matchedLen == `0`) ? `1` : `0`;
4542	}
4543	if (start != str.size() \|\| behavior == Qt::KeepEmptyParts)
4544	list.append(t: str.mid(position: start, n: -`1`));
4545	return list;
4546	}
4547
4548	/!*
4549	Returns a list of all the strings that match this regular
4550	expression in \a stringList.
4551	*/
4552	QStringList QRegExp::filterList(const QStringList &stringList) const
4553	{
4554	QStringList res;
4555	for (const QString &s : stringList) {
4556	if (containedIn(str: s))
4557	res << s;
4558	}
4559	return res;
4560	}
4561
/*!
    Replaces every occurrence of this regexp in each of the strings in
    \a stringList with \a after. Returns the resulting string list.
*/
4566	QStringList QRegExp::replaceIn(const QStringList &stringList, const QString &after) const
4567	{
4568	QStringList list;
4569	for (const QString &s : stringList)
4570	list << replaceIn(str: s, after);
4571	return list;
4572	}
4573
4574	/!*
4575	Returns the index position of the first exact match of this regexp in
4576	\a list, searching forward from index position \a from. Returns
4577	-1 if no item matched.
4578
4579	\sa lastIndexIn(), exactMatch()
4580	*/
4581	int QRegExp::indexIn(const QStringList &list, int from) const
4582	{
4583	QRegExp rx2(*this);
4584	if (from < `0`)
4585	from = qMax(a: from + list.size(), b: `0`);
4586	for (int i = from; i < list.size(); ++i) {
4587	if (rx2.exactMatch(str: list.at(i)))
4588	return i;
4589	}
4590	return -`1`;
4591	}
4592
4593	/!*
4594	Returns the index position of the last exact match of this regexp in
4595	\a list, searching backward from index position \a from. If \a
4596	from is -1 (the default), the search starts at the last item.
4597	Returns -1 if no item matched.
4598
4599	\sa QRegExp::exactMatch()
4600	*/
4601	int QRegExp::lastIndexIn(const QStringList &list, int from) const
4602	{
4603	QRegExp rx2(*this);
4604	if (from < `0`)
4605	from += list.size();
4606	else if (from >= list.size())
4607	from = list.size() - `1`;
4608	for (int i = from; i >= `0`; --i) {
4609	if (rx2.exactMatch(str: list.at(i)))
4610	return i;
4611	}
4612	return -`1`;
4613	}
4614
4615	#ifndef QT_NO_REGEXP_CAPTURE
4616
4617	/!*
4618	\since 4.6
4619	Returns the number of captures contained in the regular expression.
4620	*/
4621	int QRegExp::captureCount() const
4622	{
4623	prepareEngine(priv);
4624	return priv->eng->captureCount();
4625	}
4626
4627	/!*
4628	Returns a list of the captured text strings.
4629
4630	The first string in the list is the entire matched string. Each
4631	subsequent list element contains a string that matched a
4632	(capturing) subexpression of the regexp.
4633
4634	For example:
4635	\snippet code/src_corelib_text_qregexp.cpp 14
4636
4637	The above example also captures elements that may be present but
4638	which we have no interest in. This problem can be solved by using
4639	non-capturing parentheses:
4640
4641	\snippet code/src_corelib_text_qregexp.cpp 15
4642
4643	Note that if you want to iterate over the list, you should iterate
4644	over a copy, e.g.
4645	\snippet code/src_corelib_text_qregexp.cpp 16
4646
4647	Some regexps can match an indeterminate number of times. For
4648	example if the input string is "Offsets: 12 14 99 231 7" and the
4649	regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of
4650	all the numbers matched. However, after calling
4651	\c{rx.indexIn(str)}, capturedTexts() will return the list ("12",
4652	"12"), i.e. the entire match was "12" and the first subexpression
4653	matched was "12". The correct approach is to use cap() in a
4654	\l{QRegExp#cap_in_a_loop}{loop}.
4655
4656	The order of elements in the string list is as follows. The first
4657	element is the entire matching string. Each subsequent element
4658	corresponds to the next capturing open left parentheses. Thus
4659	capturedTexts()[1] is the text of the first capturing parentheses,
4660	capturedTexts()[2] is the text of the second and so on
4661	(corresponding to $1, $2, etc., in some other regexp languages).
4662
4663	\sa cap(), pos()
4664	*/
4665	QStringList QRegExp::capturedTexts() const
4666	{
4667	if (priv->capturedCache.isEmpty()) {
4668	prepareEngine(priv);
4669	const int *captured = priv->matchState.captured;
4670	int n = priv->matchState.capturedSize;
4671
4672	for (int i = `0`; i < n; i += `2`) {
4673	QString m;
4674	if (captured[i + `1`] == `0`)
4675	m = QLatin1String (""); // ### Qt 5: don't distinguish between null and empty
4676	else if (captured[i] >= `0`)
4677	m = priv->t.mid(position: captured[i], n: captured[i + `1`]);
4678	priv->capturedCache.append(t: m);
4679	}
4680	priv->t.clear();
4681	}
4682	return priv->capturedCache;
4683	}
4684
4685	/!*
4686	\internal
4687	*/
4688	QStringList QRegExp::capturedTexts()
4689	{
4690	return const_cast<const QRegExp >(this*)->capturedTexts();
4691	}
4692
4693	/!*
4694	Returns the text captured by the \a nth subexpression. The entire
4695	match has index 0 and the parenthesized subexpressions have
4696	indexes starting from 1 (excluding non-capturing parentheses).
4697
4698	\snippet code/src_corelib_text_qregexp.cpp 17
4699
4700	The order of elements matched by cap() is as follows. The first
4701	element, cap(0), is the entire matching string. Each subsequent
4702	element corresponds to the next capturing open left parentheses.
4703	Thus cap(1) is the text of the first capturing parentheses, cap(2)
4704	is the text of the second, and so on.
4705
4706	\sa capturedTexts(), pos()
4707	*/
4708	QString QRegExp::cap(int nth) const
4709	{
4710	return capturedTexts().value(i: nth);
4711	}
4712
4713	/!*
4714	\internal
4715	*/
4716	QString QRegExp::cap(int nth)
4717	{
4718	return const_cast<const QRegExp >(this*)->cap(nth);
4719	}
4720
4721	/!*
4722	Returns the position of the \a nth captured text in the searched
4723	string. If \a nth is 0 (the default), pos() returns the position
4724	of the whole match.
4725
4726	Example:
4727	\snippet code/src_corelib_text_qregexp.cpp 18
4728
4729	For zero-length matches, pos() always returns -1. (For example, if
4730	cap(4) would return an empty string, pos(4) returns -1.) This is
4731	a feature of the implementation.
4732
4733	\sa cap(), capturedTexts()
4734	*/
4735	int QRegExp::pos(int nth) const
4736	{
4737	if (nth < `0` \|\| nth >= priv->matchState.capturedSize / `2`)
4738	return -`1`;
4739	else
4740	return priv->matchState.captured[`2` * nth];
4741	}
4742
4743	/!*
4744	\internal
4745	*/
4746	int QRegExp::pos(int nth)
4747	{
4748	return const_cast<const QRegExp >(this*)->pos(nth);
4749	}
4750
/*!
    Returns a text string that explains why a regexp pattern is
    invalid, if that is the case; otherwise returns "no error occurred".

    \sa isValid()
*/
4757	QString QRegExp::errorString() const
4758	{
4759	if (isValid()) {
4760	return QString::fromLatin1(RXERR_OK);
4761	} else {
4762	return priv->eng->errorString();
4763	}
4764	}
4765
4766	/!*
4767	\internal
4768	*/
4769	QString QRegExp::errorString()
4770	{
4771	return const_cast<const QRegExp >(this*)->errorString();
4772	}
4773
4774	#endif
4775
/*!
    Returns the string \a str with every regexp special character
    escaped with a backslash. The special characters are $, (, ), *, +,
    ., ?, [, \\, ], ^, {, | and }.

    Example:

    \snippet code/src_corelib_text_qregexp.cpp 19

    This function is useful to construct regexp patterns dynamically:

    \snippet code/src_corelib_text_qregexp.cpp 20

    \sa setPatternSyntax()
*/
4791	QString QRegExp::escape(const QString &str)
4792	{
4793	QString quoted;
4794	const int count = str.size();
4795	quoted.reserve(asize: count * `2`);
4796	const QLatin1Char backslash(`'\\'`);
4797	for (int i = `0`; i < count; i++) {
4798	switch (str.at(i).toLatin1()) {
4799	case `'$'`:
4800	case `'('`:
4801	case `')'`:
4802	case `'*'`:
4803	case `'+'`:
4804	case `'.'`:
4805	case `'?'`:
4806	case `'['`:
4807	case `'\\'`:
4808	case `']'`:
4809	case `'^'`:
4810	case `'{'`:
4811	case `'\|'`:
4812	case `'}'`:
4813	quoted.append(c: backslash);
4814	}
4815	quoted.append(c: str.at(i));
4816	}
4817	return quoted;
4818	}
4819
4820
4821	#ifndef QT_NO_DATASTREAM
4822	/!*
4823	\relates QRegExp
4824
4825	Writes the regular expression \a regExp to stream \a out.
4826
4827	\sa {Serializing Qt Data Types}
4828	*/
4829	QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)
4830	{
4831	return out << regExp.pattern() << (quint8)regExp.caseSensitivity()
4832	<< (quint8)regExp.patternSyntax()
4833	<< (quint8)!!regExp.isMinimal();
4834	}
4835
4836	/!*
4837	\relates QRegExp
4838
4839	Reads a regular expression from stream \a in into \a regExp.
4840
4841	\sa {Serializing Qt Data Types}
4842	*/
4843	QDataStream &operator>>(QDataStream &in, QRegExp &regExp)
4844	{
4845	QString pattern;
4846	quint8 cs;
4847	quint8 patternSyntax;
4848	quint8 isMinimal;
4849
4850	in >> pattern >> cs >> patternSyntax >> isMinimal;
4851
4852	QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),
4853	QRegExp::PatternSyntax(patternSyntax));
4854
4855	newRegExp.setMinimal(isMinimal);
4856	regExp = newRegExp;
4857	return in;
4858	}
4859	#endif // QT_NO_DATASTREAM
4860
4861	#ifndef QT_NO_DEBUG_STREAM
4862	QDebug operator<<(QDebug dbg, const QRegExp &r)
4863	{
4864	QDebugStateSaver saver(dbg);
4865	dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax()
4866	<< ", pattern='"<< r.pattern() << "')";
4867	return dbg;
4868	}
4869	#endif
4870
4871	QT_END_NAMESPACE
4872

source code of qt5compat/src/core5/text/qregexp.cpp