1 | /**************************************************************************** |
2 | ** |
3 | ** Copyright (C) 2016 The Qt Company Ltd. |
4 | ** Contact: https://www.qt.io/licensing/ |
5 | ** |
6 | ** This file is part of the QtCore module of the Qt Toolkit. |
7 | ** |
8 | ** $QT_BEGIN_LICENSE:LGPL$ |
9 | ** Commercial License Usage |
10 | ** Licensees holding valid commercial Qt licenses may use this file in |
11 | ** accordance with the commercial license agreement provided with the |
12 | ** Software or, alternatively, in accordance with the terms contained in |
13 | ** a written agreement between you and The Qt Company. For licensing terms |
14 | ** and conditions see https://www.qt.io/terms-conditions. For further |
15 | ** information use the contact form at https://www.qt.io/contact-us. |
16 | ** |
17 | ** GNU Lesser General Public License Usage |
18 | ** Alternatively, this file may be used under the terms of the GNU Lesser |
19 | ** General Public License version 3 as published by the Free Software |
20 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the |
21 | ** packaging of this file. Please review the following information to |
22 | ** ensure the GNU Lesser General Public License version 3 requirements |
23 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. |
24 | ** |
25 | ** GNU General Public License Usage |
26 | ** Alternatively, this file may be used under the terms of the GNU |
27 | ** General Public License version 2.0 or (at your option) the GNU General |
28 | ** Public license version 3 or any later version approved by the KDE Free |
29 | ** Qt Foundation. The licenses are as published by the Free Software |
30 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 |
31 | ** included in the packaging of this file. Please review the following |
32 | ** information to ensure the GNU General Public License requirements will |
33 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and |
34 | ** https://www.gnu.org/licenses/gpl-3.0.html. |
35 | ** |
36 | ** $QT_END_LICENSE$ |
37 | ** |
38 | ****************************************************************************/ |
39 | |
40 | #include "qregexp.h" |
41 | |
42 | #include "qalgorithms.h" |
43 | #include "qbitarray.h" |
44 | #include "qcache.h" |
45 | #include "qdatastream.h" |
46 | #include "qdebug.h" |
47 | #include "qhashfunctions.h" |
48 | #include "qlist.h" |
49 | #include "qmap.h" |
50 | #include "qmutex.h" |
51 | #include "qstring.h" |
52 | #include "qstringlist.h" |
53 | #include "qstringmatcher.h" |
54 | #include "qvector.h" |
55 | #include "private/qlocking_p.h" |
56 | |
57 | #include <limits.h> |
58 | #include <algorithm> |
59 | |
60 | QT_BEGIN_NAMESPACE |
61 | |
62 | // Error strings for the regexp parser. Each literal is wrapped in QT_TRANSLATE_NOOP with the "QRegExp" context so it is marked for translation without being translated at this point. |
63 | #define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred") |
64 | #define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used") |
65 | #define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax") |
66 | #define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax") |
67 | #define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371") |
68 | #define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax") |
69 | #define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value") |
70 | #define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim") |
71 | #define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end") |
72 | #define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit") |
73 | #define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval") |
74 | #define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category") |
75 | |
76 | /*! |
77 | \class QRegExp |
78 | \inmodule QtCore |
79 | \reentrant |
80 | \brief The QRegExp class provides pattern matching using regular expressions. |
81 | |
82 | \ingroup tools |
83 | \ingroup shared |
84 | |
85 | \keyword regular expression |
86 | |
87 | A regular expression, or "regexp", is a pattern for matching |
88 | substrings in a text. This is useful in many contexts, e.g., |
89 | |
90 | \table |
91 | \row \li Validation |
92 | \li A regexp can test whether a substring meets some criteria, |
93 | e.g. is an integer or contains no whitespace. |
94 | \row \li Searching |
95 | \li A regexp provides more powerful pattern matching than |
96 | simple substring matching, e.g., match one of the words |
97 | \e{mail}, \e{letter} or \e{correspondence}, but none of the |
98 | words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc. |
99 | \row \li Search and Replace |
100 | \li A regexp can replace all occurrences of a substring with a |
101 | different substring, e.g., replace all occurrences of \e{&} |
102 | with \e{\&amp;} except where the \e{&} is already followed by |
103 | an \e{amp;}. |
104 | \row \li String Splitting |
105 | \li A regexp can be used to identify where a string should be |
106 | split apart, e.g. splitting tab-delimited strings. |
107 | \endtable |
108 | |
109 | This documentation presents a brief introduction to regexps, a |
110 | description of Qt's regexp language, some examples, and the function |
111 | documentation itself. QRegExp is modeled on Perl's regexp |
112 | language. It fully supports Unicode. QRegExp can also be used in a |
113 | simpler, \e{wildcard mode} that is similar to the functionality |
114 | found in command shells. The syntax rules used by QRegExp can be |
115 | changed with setPatternSyntax(). In particular, the pattern syntax |
116 | can be set to QRegExp::FixedString, which means the pattern to be |
117 | matched is interpreted as a plain string, i.e., special characters |
118 | (e.g., backslash) are not escaped. |
119 | |
120 | A good text on regexps is \e {Mastering Regular Expressions} |
121 | (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4. |
122 | |
123 | \note In Qt 5, the new QRegularExpression class provides a Perl |
124 | compatible implementation of regular expressions and is recommended |
125 | in place of QRegExp. |
126 | |
127 | \tableofcontents |
128 | |
129 | \section1 Introduction |
130 | |
131 | Regexps are built up from expressions, quantifiers, and |
132 | assertions. The simplest expression is a character, e.g. \b{x} |
133 | or \b{5}. An expression can also be a set of characters |
134 | enclosed in square brackets. \b{[ABCD]} will match an \b{A} |
135 | or a \b{B} or a \b{C} or a \b{D}. We can write this same |
136 | expression as \b{[A-D]}, and an expression to match any |
137 | capital letter in the English alphabet is written as |
138 | \b{[A-Z]}. |
139 | |
140 | A quantifier specifies the number of occurrences of an expression |
141 | that must be matched. \b{x{1,1}} means match one and only one |
142 | \b{x}. \b{x{1,5}} means match a sequence of \b{x} |
143 | characters that contains at least one \b{x} but no more than |
144 | five. |
145 | |
146 | Note that in general regexps cannot be used to check for balanced |
147 | brackets or tags. For example, a regexp can be written to match an |
148 | opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags |
149 | are not nested, but if the \c{<b>} tags are nested, that same |
150 | regexp will match an opening \c{<b>} tag with the wrong closing |
151 | \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the |
152 | first \c{<b>} would be matched with the first \c{</b>}, which is |
153 | not correct. However, it is possible to write a regexp that will |
154 | match nested brackets or tags correctly, but only if the number of |
155 | nesting levels is fixed and known. If the number of nesting levels |
156 | is not fixed and known, it is impossible to write a regexp that |
157 | will not fail. |
158 | |
159 | Suppose we want a regexp to match integers in the range 0 to 99. |
160 | At least one digit is required, so we start with the expression |
161 | \b{[0-9]{1,1}}, which matches a single digit exactly once. This |
162 | regexp matches integers in the range 0 to 9. To match integers up |
163 | to 99, increase the maximum number of occurrences to 2, so the |
164 | regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the |
165 | original requirement to match integers from 0 to 99, but it will |
166 | also match integers that occur in the middle of strings. If we |
167 | want the matched integer to be the whole string, we must use the |
168 | anchor assertions, \b{^} (caret) and \b{$} (dollar). When |
169 | \b{^} is the first character in a regexp, it means the regexp |
170 | must match from the beginning of the string. When \b{$} is the |
171 | last character of the regexp, it means the regexp must match to |
172 | the end of the string. The regexp becomes \b{^[0-9]{1,2}$}. |
173 | Note that assertions, e.g. \b{^} and \b{$}, do not match |
174 | characters but locations in the string. |
175 | |
176 | If you have seen regexps described elsewhere, they may have looked |
177 | different from the ones shown here. This is because some sets of |
178 | characters and some quantifiers are so common that they have been |
179 | given special symbols to represent them. \b{[0-9]} can be |
180 | replaced with the symbol \b{\\d}. The quantifier to match |
181 | exactly one occurrence, \b{{1,1}}, can be replaced with the |
182 | expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So |
183 | our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can |
184 | also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of |
185 | the string, match a digit, followed immediately by 0 or 1 digits}. |
186 | In practice, it would be written as \b{^\\d\\d?$}. The \b{?} |
187 | is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1 |
188 | occurrences. \b{?} makes an expression optional. The regexp |
189 | \b{^\\d\\d?$} means \e{From the beginning of the string, match |
190 | one digit, followed immediately by 0 or 1 more digit, followed |
191 | immediately by end of string}. |
192 | |
193 | To write a regexp that matches one of the words 'mail' \e or |
194 | 'letter' \e or 'correspondence' but does not match words that |
195 | contain these words, e.g., 'email', 'mailman', 'mailer', and |
196 | 'letterbox', start with a regexp that matches 'mail'. Expressed |
197 | fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because |
198 | a character expression is automatically quantified by |
199 | \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an |
200 | 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now |
201 | we can use the vertical bar \b{|}, which means \b{or}, to |
202 | include the other two words, so our regexp for matching any of the |
203 | three words becomes \b{mail|letter|correspondence}. Match |
204 | 'mail' \b{or} 'letter' \b{or} 'correspondence'. While this |
205 | regexp will match one of the three words we want to match, it will |
206 | also match words we don't want to match, e.g., 'email'. To |
207 | prevent the regexp from matching unwanted words, we must tell it |
208 | to begin and end the match at word boundaries. First we enclose |
209 | our regexp in parentheses, \b{(mail|letter|correspondence)}. |
210 | Parentheses group expressions together, and they identify a part |
211 | of the regexp that we wish to \l{capturing text}{capture}. |
212 | Enclosing the expression in parentheses allows us to use it as a |
213 | component in more complex regexps. It also allows us to examine |
214 | which of the three words was actually matched. To force the match |
215 | to begin and end on word boundaries, we enclose the regexp in |
216 | \b{\\b} \e{word boundary} assertions: |
217 | \b{\\b(mail|letter|correspondence)\\b}. Now the regexp means: |
218 | \e{Match a word boundary, followed by the regexp in parentheses, |
219 | followed by a word boundary}. The \b{\\b} assertion matches a |
220 | \e position in the string, not a \e character. A word boundary is |
221 | the position between a word character and either a non-word character |
222 | (e.g., a space or newline) or the beginning or end of the string. |
223 | |
224 | If we want to replace ampersand characters with the HTML entity |
225 | \b{\&amp;}, the regexp to match is simply \b{\&}. But this |
226 | regexp will also match ampersands that have already been converted |
227 | to HTML entities. We want to replace only ampersands that are not |
228 | already followed by \b{amp;}. For this, we need the negative |
229 | lookahead assertion, \b{(?!}__\b{)}. The regexp can then be |
230 | written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is} |
231 | \b{not} \e{followed by} \b{amp;}. |
232 | |
233 | If we want to count all the occurrences of 'Eric' and 'Eirik' in a |
234 | string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and |
235 | \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is |
236 | required to avoid matching words that contain either name, |
237 | e.g. 'Ericsson'. Note that the second regexp matches more |
238 | spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'. |
239 | |
240 | Some of the examples discussed above are implemented in the |
241 | \l{#code-examples}{code examples} section. |
242 | |
243 | \target characters-and-abbreviations-for-sets-of-characters |
244 | \section1 Characters and Abbreviations for Sets of Characters |
245 | |
246 | \table |
247 | \header \li Element \li Meaning |
248 | \row \li \b{c} |
249 | \li A character represents itself unless it has a special |
250 | regexp meaning. e.g. \b{c} matches the character \e c. |
251 | \row \li \b{\\c} |
252 | \li A character that follows a backslash matches the character |
253 | itself, except as specified below. e.g., To match a literal |
254 | caret at the beginning of a string, write \b{\\^}. |
255 | \row \li \b{\\a} |
256 | \li Matches the ASCII bell (BEL, 0x07). |
257 | \row \li \b{\\f} |
258 | \li Matches the ASCII form feed (FF, 0x0C). |
259 | \row \li \b{\\n} |
260 | \li Matches the ASCII line feed (LF, 0x0A, Unix newline). |
261 | \row \li \b{\\r} |
262 | \li Matches the ASCII carriage return (CR, 0x0D). |
263 | \row \li \b{\\t} |
264 | \li Matches the ASCII horizontal tab (HT, 0x09). |
265 | \row \li \b{\\v} |
266 | \li Matches the ASCII vertical tab (VT, 0x0B). |
267 | \row \li \b{\\x\e{hhhh}} |
268 | \li Matches the Unicode character corresponding to the |
269 | hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF). |
270 | \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo}) |
271 | \li matches the ASCII/Latin1 character for the octal number |
272 | \e{ooo} (between 0 and 0377). |
273 | \row \li \b{. (dot)} |
274 | \li Matches any character (including newline). |
275 | \row \li \b{\\d} |
276 | \li Matches a digit (QChar::isDigit()). |
277 | \row \li \b{\\D} |
278 | \li Matches a non-digit. |
279 | \row \li \b{\\s} |
280 | \li Matches a whitespace character (QChar::isSpace()). |
281 | \row \li \b{\\S} |
282 | \li Matches a non-whitespace character. |
283 | \row \li \b{\\w} |
284 | \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_'). |
285 | \row \li \b{\\W} |
286 | \li Matches a non-word character. |
287 | \row \li \b{\\\e{n}} |
288 | \li The \e{n}-th backreference, e.g. \\1, \\2, etc. |
289 | \endtable |
290 | |
291 | \b{Note:} The C++ compiler transforms backslashes in strings. |
292 | To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}. |
293 | To match the backslash character itself, enter it four times, i.e. |
294 | \c{\\\\}. |
295 | |
296 | \target sets-of-characters |
297 | \section1 Sets of Characters |
298 | |
299 | Square brackets mean match any character contained in the square |
300 | brackets. The character set abbreviations described above can |
301 | appear in a character set in square brackets. Except for the |
302 | character set abbreviations and the following two exceptions, |
303 | characters do not have special meanings in square brackets. |
304 | |
305 | \table |
306 | \row \li \b{^} |
307 | |
308 | \li The caret negates the character set if it occurs as the |
309 | first character (i.e. immediately after the opening square |
310 | bracket). \b{[abc]} matches 'a' or 'b' or 'c', but |
311 | \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'. |
312 | |
313 | \row \li \b{-} |
314 | |
315 | \li The dash indicates a range of characters. \b{[W-Z]} |
316 | matches 'W' or 'X' or 'Y' or 'Z'. |
317 | |
318 | \endtable |
319 | |
320 | Using the predefined character set abbreviations is more portable |
321 | than using character ranges across platforms and languages. For |
322 | example, \b{[0-9]} matches a digit in Western alphabets but |
323 | \b{\\d} matches a digit in \e any alphabet. |
324 | |
325 | Note: In other regexp documentation, sets of characters are often |
326 | called "character classes". |
327 | |
328 | \target quantifiers |
329 | \section1 Quantifiers |
330 | |
331 | By default, an expression is automatically quantified by |
332 | \b{{1,1}}, i.e. it should occur exactly once. In the following |
333 | list, \b{\e {E}} stands for expression. An expression is a |
334 | character, or an abbreviation for a set of characters, or a set of |
335 | characters in square brackets, or an expression in parentheses. |
336 | |
337 | \table |
338 | \row \li \b{\e {E}?} |
339 | |
340 | \li Matches zero or one occurrences of \e E. This quantifier |
341 | means \e{The previous expression is optional}, because it |
342 | will match whether or not the expression is found. \b{\e |
343 | {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?} |
344 | matches 'dent' or 'dents'. |
345 | |
346 | \row \li \b{\e {E}+} |
347 | |
348 | \li Matches one or more occurrences of \e E. \b{\e {E}+} is |
349 | the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0', |
350 | '00', '000', etc. |
351 | |
352 | \row \li \b{\e {E}*} |
353 | |
354 | \li Matches zero or more occurrences of \e E. It is the same |
355 | as \b{\e {E}{0,}}. The \b{*} quantifier is often used |
356 | in error where \b{+} should be used. For example, if |
357 | \b{\\s*$} is used in an expression to match strings that |
358 | end in whitespace, it will match every string because |
359 | \b{\\s*$} means \e{Match zero or more whitespaces followed |
360 | by end of string}. The correct regexp to match strings that |
361 | have at least one trailing whitespace character is |
362 | \b{\\s+$}. |
363 | |
364 | \row \li \b{\e {E}{n}} |
365 | |
366 | \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}} |
367 | is the same as repeating \e E \e n times. For example, |
368 | \b{x{5}} is the same as \b{xxxxx}. It is also the same |
369 | as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}. |
370 | |
371 | \row \li \b{\e {E}{n,}} |
372 | \li Matches at least \e n occurrences of \e E. |
373 | |
374 | \row \li \b{\e {E}{,m}} |
375 | \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}} |
376 | is the same as \b{\e {E}{0,m}}. |
377 | |
378 | \row \li \b{\e {E}{n,m}} |
379 | \li Matches at least \e n and at most \e m occurrences of \e E. |
380 | \endtable |
381 | |
382 | To apply a quantifier to more than just the preceding character, |
383 | use parentheses to group characters together in an expression. For |
384 | example, \b{tag+} matches a 't' followed by an 'a' followed by |
385 | at least one 'g', whereas \b{(tag)+} matches at least one |
386 | occurrence of 'tag'. |
387 | |
388 | Note: Quantifiers are normally "greedy". They always match as much |
389 | text as they can. For example, \b{0+} matches the first zero it |
390 | finds and all the consecutive zeros after the first zero. Applied |
391 | to '20005', it matches '2\underline{000}5'. Quantifiers can be made |
392 | non-greedy, see setMinimal(). |
393 | |
394 | \target capturing parentheses |
395 | \target backreferences |
396 | \section1 Capturing Text |
397 | |
398 | Parentheses allow us to group elements together so that we can |
399 | quantify and capture them. For example if we have the expression |
400 | \b{mail|letter|correspondence} that matches a string we know |
401 | that \e one of the words matched but not which one. Using |
402 | parentheses allows us to "capture" whatever is matched within |
403 | their bounds, so if we used \b{(mail|letter|correspondence)} |
404 | and matched this regexp against the string "I sent you some email" |
405 | we can use the cap() or capturedTexts() functions to extract the |
406 | matched characters, in this case 'mail'. |
407 | |
408 | We can use captured text within the regexp itself. To refer to the |
409 | captured text we use \e backreferences which are indexed from 1, |
410 | the same as for cap(). For example we could search for duplicate |
411 | words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a |
412 | word boundary followed by one or more word characters followed by |
413 | one or more non-word characters followed by the same text as the |
414 | first parenthesized expression followed by a word boundary. |
415 | |
416 | If we want to use parentheses purely for grouping and not for |
417 | capturing we can use the non-capturing syntax, e.g. |
418 | \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and |
419 | end ')'. In this example we match either 'green' or 'blue' but we |
420 | do not capture the match so we only know whether or not we matched |
421 | but not which color we actually found. Using non-capturing |
422 | parentheses is more efficient than using capturing parentheses |
423 | since the regexp engine has to do less book-keeping. |
424 | |
425 | Both capturing and non-capturing parentheses may be nested. |
426 | |
427 | \target greedy quantifiers |
428 | |
429 | For historical reasons, quantifiers (e.g. \b{*}) that apply to |
430 | capturing parentheses are more "greedy" than other quantifiers. |
431 | For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa". |
432 | This behavior is different from what other regexp engines do |
433 | (notably, Perl). To obtain a more intuitive capturing behavior, |
434 | specify QRegExp::RegExp2 to the QRegExp constructor or call |
435 | setPatternSyntax(QRegExp::RegExp2). |
436 | |
437 | \target cap_in_a_loop |
438 | |
439 | When the number of matches cannot be determined in advance, a |
440 | common idiom is to use cap() in a loop. For example: |
441 | |
442 | \snippet code/src_corelib_tools_qregexp.cpp 0 |
443 | |
444 | \target assertions |
445 | \section1 Assertions |
446 | |
447 | Assertions make some statement about the text at the point where |
448 | they occur in the regexp but they do not match any characters. In |
449 | the following list \b{\e {E}} stands for any expression. |
450 | |
451 | \table |
452 | \row \li \b{^} |
453 | \li The caret signifies the beginning of the string. If you |
454 | wish to match a literal \c{^} you must escape it by |
455 | writing \c{\\^}. For example, \b{^#include} will only |
456 | match strings which \e begin with the characters '#include'. |
457 | (When the caret is the first character of a character set it |
458 | has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.) |
459 | |
460 | \row \li \b{$} |
461 | \li The dollar signifies the end of the string. For example |
462 | \b{\\d\\s*$} will match strings which end with a digit |
463 | optionally followed by whitespace. If you wish to match a |
464 | literal \c{$} you must escape it by writing |
465 | \c{\\$}. |
466 | |
467 | \row \li \b{\\b} |
468 | \li A word boundary. For example the regexp |
469 | \b{\\bOK\\b} means match immediately after a word |
470 | boundary (e.g. start of string or whitespace) the letter 'O' |
471 | then the letter 'K' immediately before another word boundary |
472 | (e.g. end of string or whitespace). But note that the |
473 | assertion does not actually match any whitespace so if we |
474 | write \b{(\\bOK\\b)} and we have a match it will only |
475 | contain 'OK' even if the string is "It's \underline{OK} now". |
476 | |
477 | \row \li \b{\\B} |
478 | \li A non-word boundary. This assertion is true wherever |
479 | \b{\\b} is false. For example if we searched for |
480 | \b{\\Bon\\B} in "Left on" the match would fail (space |
481 | and end of string aren't non-word boundaries), but it would |
482 | match in "t\underline{on}ne". |
483 | |
484 | \row \li \b{(?=\e E)} |
485 | \li Positive lookahead. This assertion is true if the |
486 | expression matches at this point in the regexp. For example, |
487 | \b{const(?=\\s+char)} matches 'const' whenever it is |
488 | followed by 'char', as in 'static \underline{const} char *'. |
489 | (Compare with \b{const\\s+char}, which matches 'static |
490 | \underline{const char} *'.) |
491 | |
492 | \row \li \b{(?!\e E)} |
493 | \li Negative lookahead. This assertion is true if the |
494 | expression does not match at this point in the regexp. For |
495 | example, \b{const(?!\\s+char)} matches 'const' \e except |
496 | when it is followed by 'char'. |
497 | \endtable |
498 | |
499 | \target QRegExp wildcard matching |
500 | \section1 Wildcard Matching |
501 | |
502 | Most command shells such as \e bash or \e cmd.exe support "file |
503 | globbing", the ability to identify a group of files by using |
504 | wildcards. The setPatternSyntax() function is used to switch |
505 | between regexp and wildcard mode. Wildcard matching is much |
506 | simpler than full regexps and has only four features: |
507 | |
508 | \table |
509 | \row \li \b{c} |
510 | \li Any character represents itself apart from those mentioned |
511 | below. Thus \b{c} matches the character \e c. |
512 | \row \li \b{?} |
513 | \li Matches any single character. It is the same as |
514 | \b{.} in full regexps. |
515 | \row \li \b{*} |
516 | \li Matches zero or more of any characters. It is the |
517 | same as \b{.*} in full regexps. |
518 | \row \li \b{[...]} |
519 | \li Sets of characters can be represented in square brackets, |
520 | similar to full regexps. Within the character class, like |
521 | outside, backslash has no special meaning. |
522 | \endtable |
523 | |
524 | In the mode Wildcard, the wildcard characters cannot be |
525 | escaped. In the mode WildcardUnix, the character '\\' escapes the |
526 | wildcard. |
527 | |
528 | For example if we are in wildcard mode and have strings which |
529 | contain filenames we could identify HTML files with \b{*.html}. |
530 | This will match zero or more characters followed by a dot followed |
531 | by 'h', 't', 'm' and 'l'. |
532 | |
533 | To test a string against a wildcard expression, use exactMatch(). |
534 | For example: |
535 | |
536 | \snippet code/src_corelib_tools_qregexp.cpp 1 |
537 | |
538 | \target perl-users |
539 | \section1 Notes for Perl Users |
540 | |
541 | Most of the character class abbreviations supported by Perl are |
542 | supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters} |
543 | {characters and abbreviations for sets of characters}. |
544 | |
545 | In QRegExp, apart from within character classes, \c{^} always |
546 | signifies the start of the string, so carets must always be |
547 | escaped unless used for that purpose. In Perl the meaning of caret |
548 | varies automagically depending on where it occurs so escaping it |
549 | is rarely necessary. The same applies to \c{$} which in |
550 | QRegExp always signifies the end of the string. |
551 | |
552 | QRegExp's quantifiers are the same as Perl's greedy quantifiers |
553 | (but see the \l{greedy quantifiers}{note above}). Non-greedy |
554 | matching cannot be applied to individual quantifiers, but can be |
555 | applied to all the quantifiers in the pattern. For example, to |
556 | match the Perl regexp \b{ro+?m} requires: |
557 | |
558 | \snippet code/src_corelib_tools_qregexp.cpp 2 |
559 | |
560 | The equivalent of Perl's \c{/i} option is |
561 | setCaseSensitivity(Qt::CaseInsensitive). |
562 | |
563 | Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}. |
564 | |
565 | In QRegExp \b{.} matches any character, therefore all QRegExp |
566 | regexps have the equivalent of Perl's \c{/s} option. QRegExp |
567 | does not have an equivalent to Perl's \c{/m} option, but this |
568 | can be emulated in various ways for example by splitting the input |
569 | into lines or by looping with a regexp that searches for newlines. |
570 | |
571 | Because QRegExp is string oriented, there are no \\A, \\Z, or \\z |
572 | assertions. The \\G assertion is not supported but can be emulated |
573 | in a loop. |
574 | |
575 | Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp |
576 | equivalents for $`, $' or $+. Perl's capturing variables, $1, $2, |
577 | ... correspond to cap(1) or capturedTexts()[1], cap(2) or |
578 | capturedTexts()[2], etc. |
579 | |
580 | To substitute a pattern use QString::replace(). |
581 | |
582 | Perl's extended \c{/x} syntax is not supported, nor are |
583 | directives, e.g. (?i), or regexp comments, e.g. (?#comment). On |
584 | the other hand, C++'s rules for literal strings can be used to |
585 | achieve the same: |
586 | |
587 | \snippet code/src_corelib_tools_qregexp.cpp 3 |
588 | |
589 | Both zero-width positive and zero-width negative lookahead |
590 | assertions (?=pattern) and (?!pattern) are supported with the same |
591 | syntax as Perl. Perl's lookbehind assertions, "independent" |
592 | subexpressions and conditional expressions are not supported. |
593 | |
594 | Non-capturing parentheses are also supported, with the same |
595 | (?:pattern) syntax. |
596 | |
597 | See QString::split() and QStringList::join() for equivalents |
598 | to Perl's split and join functions. |
599 | |
600 | Note: because C++ transforms \\'s they must be written \e twice in |
601 | code, e.g. \b{\\b} must be written \b{\\\\b}. |
602 | |
603 | \target code-examples |
604 | \section1 Code Examples |
605 | |
606 | \snippet code/src_corelib_tools_qregexp.cpp 4 |
607 | |
608 | The third string matches '\underline{6}'. This is a simple validation |
609 | regexp for integers in the range 0 to 99. |
610 | |
611 | \snippet code/src_corelib_tools_qregexp.cpp 5 |
612 | |
613 | The second string matches '\underline{This_is-OK}'. We've used the |
614 | character set abbreviation '\\S' (non-whitespace) and the anchors |
615 | to match strings which contain no whitespace. |
616 | |
617 | In the following example we match strings containing 'mail' or |
618 | 'letter' or 'correspondence' but only match whole words i.e. not |
619 | 'email' |
620 | |
621 | \snippet code/src_corelib_tools_qregexp.cpp 6 |
622 | |
623 | The second string matches "Please write the \underline{letter}". The |
624 | word 'letter' is also captured (because of the parentheses). We |
625 | can see what text we've captured like this: |
626 | |
627 | \snippet code/src_corelib_tools_qregexp.cpp 7 |
628 | |
629 | This will capture the text from the first set of capturing |
630 | parentheses (counting capturing left parentheses from left to |
631 | right). The parentheses are counted from 1 since cap(0) is the |
632 | whole matched regexp (equivalent to '&' in most regexp engines). |
633 | |
634 | \snippet code/src_corelib_tools_qregexp.cpp 8 |
635 | |
636 | Here we've passed the QRegExp to QString's replace() function to |
637 | replace the matched text with new text. |
638 | |
639 | \snippet code/src_corelib_tools_qregexp.cpp 9 |
640 | |
641 | We've used the indexIn() function to repeatedly match the regexp in |
642 | the string. Note that instead of moving forward by one character |
643 | at a time \c pos++ we could have written \c {pos += |
644 | rx.matchedLength()} to skip over the already matched string. The |
645 | count will equal 3, matching 'One \underline{Eric} another |
646 | \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it |
647 | doesn't match 'Ericsson' or 'Eiriks' because they are not bounded |
648 | by word boundaries. |
649 | |
650 | One common use of regexps is to split lines of delimited data into |
651 | their component fields. |
652 | |
653 | \snippet code/src_corelib_tools_qregexp.cpp 10 |
654 | |
655 | In this example our input lines have the format company name, web |
656 | address and country. Unfortunately the regexp is rather long and |
657 | not very versatile -- the code will break if we add any more |
658 | fields. A simpler and better solution is to look for the |
659 | separator, '\\t' in this case, and take the surrounding text. The |
660 | QString::split() function can take a separator string or regexp |
661 | as an argument and split a string accordingly. |
662 | |
663 | \snippet code/src_corelib_tools_qregexp.cpp 11 |
664 | |
665 | Here field[0] is the company, field[1] the web address and so on. |
666 | |
667 | To imitate the matching of a shell we can use wildcard mode. |
668 | |
669 | \snippet code/src_corelib_tools_qregexp.cpp 12 |
670 | |
671 | Wildcard matching can be convenient because of its simplicity, but |
672 | any wildcard regexp can be defined using full regexps, e.g. |
673 | \b{.*\\.html$}. Notice that we can't match both \c .html and \c |
674 | .htm files with a wildcard unless we use \b{*.htm*} which will |
675 | also match 'test.html.bak'. A full regexp gives us the precision |
676 | we need, \b{.*\\.html?$}. |
677 | |
678 | QRegExp can match case insensitively using setCaseSensitivity(), |
679 | and can use non-greedy matching, see setMinimal(). By |
680 | default QRegExp uses full regexps but this can be changed with |
681 | setPatternSyntax(). Searching can be done forward with indexIn() or backward |
682 | with lastIndexIn(). Captured text can be accessed using |
683 | capturedTexts() which returns a string list of all captured |
684 | strings, or using cap() which returns the captured string for the |
685 | given index. The pos() function takes a match index and returns |
686 | the position in the string where the match was made (or -1 if |
687 | there was no match). |
688 | |
689 | \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel, |
690 | {tools/regexp}{Regular Expression Example} |
691 | */ |
692 | |
693 | #if defined(Q_OS_VXWORKS) && defined(EOS) |
694 | # undef EOS |
695 | #endif |
696 | |
// Number of buckets used by BadChar(); BadChar() folds a character's
// code unit into the range [0, NumBadChars).
constexpr int NumBadChars = 64;
#define BadChar(ch) ((ch).unicode() % NumBadChars)

// Sentinel values built on INT_MAX.
constexpr int NoOccurrence = INT_MAX;
constexpr int EmptyCapture = INT_MAX;
constexpr int InftyLen = INT_MAX;

// Remaining engine-wide constants.
constexpr int InftyRep = 1025;
constexpr int EOS = -1;
705 | |
706 | static bool isWord(QChar ch) |
707 | { |
708 | return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_'); |
709 | } |
710 | |
711 | /* |
712 | Merges two vectors of ints and puts the result into the first |
713 | one. |
714 | */ |
715 | static void mergeInto(QVector<int> *a, const QVector<int> &b) |
716 | { |
717 | int asize = a->size(); |
718 | int bsize = b.size(); |
719 | if (asize == 0) { |
720 | *a = b; |
721 | #ifndef QT_NO_REGEXP_OPTIM |
722 | } else if (bsize == 1 && a->at(i: asize - 1) < b.at(i: 0)) { |
723 | a->resize(asize: asize + 1); |
724 | (*a)[asize] = b.at(i: 0); |
725 | #endif |
726 | } else if (bsize >= 1) { |
727 | int csize = asize + bsize; |
728 | QVector<int> c(csize); |
729 | int i = 0, j = 0, k = 0; |
730 | while (i < asize) { |
731 | if (j < bsize) { |
732 | if (a->at(i) == b.at(i: j)) { |
733 | ++i; |
734 | --csize; |
735 | } else if (a->at(i) < b.at(i: j)) { |
736 | c[k++] = a->at(i: i++); |
737 | } else { |
738 | c[k++] = b.at(i: j++); |
739 | } |
740 | } else { |
741 | memcpy(dest: c.data() + k, src: a->constData() + i, n: (asize - i) * sizeof(int)); |
742 | break; |
743 | } |
744 | } |
745 | c.resize(asize: csize); |
746 | if (j < bsize) |
747 | memcpy(dest: c.data() + k, src: b.constData() + j, n: (bsize - j) * sizeof(int)); |
748 | *a = c; |
749 | } |
750 | } |
751 | |
752 | #ifndef QT_NO_REGEXP_WILDCARD |
753 | /* |
754 | Translates a wildcard pattern to an equivalent regular expression |
755 | pattern (e.g., *.cpp to .*\.cpp). |
756 | |
757 | If enableEscaping is true, it is possible to escape the wildcard |
758 | characters with \ |
759 | */ |
760 | static QString wc2rx(const QString &wc_str, const bool enableEscaping) |
761 | { |
762 | const int wclen = wc_str.length(); |
763 | QString rx; |
764 | int i = 0; |
765 | bool isEscaping = false; // the previous character is '\' |
766 | const QChar *wc = wc_str.unicode(); |
767 | |
768 | while (i < wclen) { |
769 | const QChar c = wc[i++]; |
770 | switch (c.unicode()) { |
771 | case '\\': |
772 | if (enableEscaping) { |
773 | if (isEscaping) { |
774 | rx += QLatin1String("\\\\" ); |
775 | } // we insert the \\ later if necessary |
776 | if (i == wclen) { // the end |
777 | rx += QLatin1String("\\\\" ); |
778 | } |
779 | } else { |
780 | rx += QLatin1String("\\\\" ); |
781 | } |
782 | isEscaping = true; |
783 | break; |
784 | case '*': |
785 | if (isEscaping) { |
786 | rx += QLatin1String("\\*" ); |
787 | isEscaping = false; |
788 | } else { |
789 | rx += QLatin1String(".*" ); |
790 | } |
791 | break; |
792 | case '?': |
793 | if (isEscaping) { |
794 | rx += QLatin1String("\\?" ); |
795 | isEscaping = false; |
796 | } else { |
797 | rx += QLatin1Char('.'); |
798 | } |
799 | |
800 | break; |
801 | case '$': |
802 | case '(': |
803 | case ')': |
804 | case '+': |
805 | case '.': |
806 | case '^': |
807 | case '{': |
808 | case '|': |
809 | case '}': |
810 | if (isEscaping) { |
811 | isEscaping = false; |
812 | rx += QLatin1String("\\\\" ); |
813 | } |
814 | rx += QLatin1Char('\\'); |
815 | rx += c; |
816 | break; |
817 | case '[': |
818 | if (isEscaping) { |
819 | isEscaping = false; |
820 | rx += QLatin1String("\\[" ); |
821 | } else { |
822 | rx += c; |
823 | if (wc[i] == QLatin1Char('^')) |
824 | rx += wc[i++]; |
825 | if (i < wclen) { |
826 | if (wc[i] == QLatin1Char(']')) |
827 | rx += wc[i++]; |
828 | while (i < wclen && wc[i] != QLatin1Char(']')) { |
829 | if (wc[i] == QLatin1Char('\\')) |
830 | rx += QLatin1Char('\\'); |
831 | rx += wc[i++]; |
832 | } |
833 | } |
834 | } |
835 | break; |
836 | |
837 | case ']': |
838 | if(isEscaping){ |
839 | isEscaping = false; |
840 | rx += QLatin1String("\\" ); |
841 | } |
842 | rx += c; |
843 | break; |
844 | |
845 | default: |
846 | if(isEscaping){ |
847 | isEscaping = false; |
848 | rx += QLatin1String("\\\\" ); |
849 | } |
850 | rx += c; |
851 | } |
852 | } |
853 | return rx; |
854 | } |
855 | #endif |
856 | |
857 | static int caretIndex(int offset, QRegExp::CaretMode caretMode) |
858 | { |
859 | if (caretMode == QRegExp::CaretAtZero) { |
860 | return 0; |
861 | } else if (caretMode == QRegExp::CaretAtOffset) { |
862 | return offset; |
863 | } else { // QRegExp::CaretWontMatch |
864 | return -1; |
865 | } |
866 | } |
867 | |
868 | /* |
869 | The QRegExpEngineKey struct uniquely identifies an engine. |
870 | */ |
871 | struct QRegExpEngineKey |
872 | { |
873 | QString pattern; |
874 | QRegExp::PatternSyntax patternSyntax; |
875 | Qt::CaseSensitivity cs; |
876 | |
877 | inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax, |
878 | Qt::CaseSensitivity cs) |
879 | : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {} |
880 | |
881 | inline void clear() { |
882 | pattern.clear(); |
883 | patternSyntax = QRegExp::RegExp; |
884 | cs = Qt::CaseSensitive; |
885 | } |
886 | }; |
887 | |
888 | static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2) |
889 | { |
890 | return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax |
891 | && key1.cs == key2.cs; |
892 | } |
893 | |
894 | static uint qHash(const QRegExpEngineKey &key, uint seed = 0) noexcept |
895 | { |
896 | QtPrivate::QHashCombine hash; |
897 | seed = hash(seed, key.pattern); |
898 | seed = hash(seed, key.patternSyntax); |
899 | seed = hash(seed, key.cs); |
900 | return seed; |
901 | } |
902 | |
903 | class QRegExpEngine; |
904 | |
905 | //Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE); |
906 | |
907 | /* |
908 | This is the engine state during matching. |
909 | */ |
910 | struct QRegExpMatchState |
911 | { |
912 | const QChar *in; // a pointer to the input string data |
913 | int pos; // the current position in the string |
914 | int caretPos; |
915 | int len; // the length of the input string |
916 | bool minimal; // minimal matching? |
917 | int *bigArray; // big array holding the data for the next pointers |
918 | int *inNextStack; // is state is nextStack? |
919 | int *curStack; // stack of current states |
920 | int *nextStack; // stack of next states |
921 | int *curCapBegin; // start of current states' captures |
922 | int *nextCapBegin; // start of next states' captures |
923 | int *curCapEnd; // end of current states' captures |
924 | int *nextCapEnd; // end of next states' captures |
925 | int *tempCapBegin; // start of temporary captures |
926 | int *tempCapEnd; // end of temporary captures |
927 | int *capBegin; // start of captures for a next state |
928 | int *capEnd; // end of captures for a next state |
929 | int *slideTab; // bump-along slide table for bad-character heuristic |
930 | int *captured; // what match() returned last |
931 | int slideTabSize; // size of slide table |
932 | int capturedSize; |
933 | #ifndef QT_NO_REGEXP_BACKREF |
934 | QList<QVector<int> > sleeping; // list of back-reference sleepers |
935 | #endif |
936 | int matchLen; // length of match |
937 | int oneTestMatchedLen; // length of partial match |
938 | |
939 | const QRegExpEngine *eng; |
940 | |
941 | inline QRegExpMatchState() : bigArray(nullptr), captured(nullptr) {} |
942 | inline ~QRegExpMatchState() { free(ptr: bigArray); } |
943 | |
944 | void drain() { free(ptr: bigArray); bigArray = nullptr; captured = nullptr; } // to save memory |
945 | void prepareForMatch(QRegExpEngine *eng); |
946 | void match(const QChar *str, int len, int pos, bool minimal, |
947 | bool oneTest, int caretIndex); |
948 | bool matchHere(); |
949 | bool testAnchor(int i, int a, const int *capBegin); |
950 | }; |
951 | |
952 | /* |
953 | The struct QRegExpAutomatonState represents one state in a modified NFA. The |
954 | input characters matched are stored in the state instead of on |
955 | the transitions, something possible for an automaton |
956 | constructed from a regular expression. |
957 | */ |
958 | struct QRegExpAutomatonState |
959 | { |
960 | #ifndef QT_NO_REGEXP_CAPTURE |
961 | int atom; // which atom does this state belong to? |
962 | #endif |
963 | int match; // what does it match? (see CharClassBit and BackRefBit) |
964 | QVector<int> outs; // out-transitions |
965 | QMap<int, int> reenter; // atoms reentered when transiting out |
966 | QMap<int, int> anchors; // anchors met when transiting out |
967 | |
968 | inline QRegExpAutomatonState() { } |
969 | #ifndef QT_NO_REGEXP_CAPTURE |
970 | inline QRegExpAutomatonState(int a, int m) |
971 | : atom(a), match(m) { } |
972 | #else |
973 | inline QRegExpAutomatonState(int m) |
974 | : match(m) { } |
975 | #endif |
976 | }; |
977 | |
978 | Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE); |
979 | |
980 | /* |
981 | The struct QRegExpCharClassRange represents a range of characters (e.g., |
982 | [0-9] denotes range 48 to 57). |
983 | */ |
984 | struct QRegExpCharClassRange |
985 | { |
986 | ushort from; // 48 |
987 | ushort len; // 10 |
988 | }; |
989 | |
990 | Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE); |
991 | |
992 | #ifndef QT_NO_REGEXP_CAPTURE |
993 | /* |
994 | The struct QRegExpAtom represents one node in the hierarchy of regular |
995 | expression atoms. |
996 | */ |
997 | struct QRegExpAtom |
998 | { |
999 | enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 }; |
1000 | |
1001 | int parent; // index of parent in array of atoms |
1002 | int capture; // index of capture, from 1 to ncap - 1 |
1003 | }; |
1004 | |
1005 | Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE); |
1006 | #endif |
1007 | |
1008 | struct QRegExpLookahead; |
1009 | |
1010 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1011 | /* |
1012 | The struct QRegExpAnchorAlternation represents a pair of anchors with |
1013 | OR semantics. |
1014 | */ |
1015 | struct QRegExpAnchorAlternation |
1016 | { |
1017 | int a; // this anchor... |
1018 | int b; // ...or this one |
1019 | }; |
1020 | |
1021 | Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE); |
1022 | #endif |
1023 | |
1024 | #ifndef QT_NO_REGEXP_CCLASS |
1025 | |
1026 | #define FLAG(x) (1 << (x)) |
1027 | /* |
1028 | The class QRegExpCharClass represents a set of characters, such as can |
1029 | be found in regular expressions (e.g., [a-z] denotes the set |
1030 | {a, b, ..., z}). |
1031 | */ |
1032 | class QRegExpCharClass |
1033 | { |
1034 | public: |
1035 | QRegExpCharClass(); |
1036 | |
1037 | void clear(); |
1038 | bool negative() const { return n; } |
1039 | void setNegative(bool negative); |
1040 | void addCategories(uint cats); |
1041 | void addRange(ushort from, ushort to); |
1042 | void addSingleton(ushort ch) { addRange(from: ch, to: ch); } |
1043 | |
1044 | bool in(QChar ch) const; |
1045 | #ifndef QT_NO_REGEXP_OPTIM |
1046 | const QVector<int> &firstOccurrence() const { return occ1; } |
1047 | #endif |
1048 | |
1049 | #if defined(QT_DEBUG) |
1050 | void dump() const; |
1051 | #endif |
1052 | |
1053 | private: |
1054 | QVector<QRegExpCharClassRange> r; // character ranges |
1055 | #ifndef QT_NO_REGEXP_OPTIM |
1056 | QVector<int> occ1; // first-occurrence array |
1057 | #endif |
1058 | uint c; // character classes |
1059 | bool n; // negative? |
1060 | }; |
1061 | #else |
1062 | struct QRegExpCharClass |
1063 | { |
1064 | int dummy; |
1065 | |
1066 | #ifndef QT_NO_REGEXP_OPTIM |
1067 | QRegExpCharClass() { occ1.fill(0, NumBadChars); } |
1068 | |
1069 | const QVector<int> &firstOccurrence() const { return occ1; } |
1070 | QVector<int> occ1; |
1071 | #endif |
1072 | }; |
1073 | #endif |
1074 | |
1075 | Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE); |
1076 | |
1077 | /* |
1078 | The QRegExpEngine class encapsulates a modified nondeterministic |
1079 | finite automaton (NFA). |
1080 | */ |
1081 | class QRegExpEngine |
1082 | { |
1083 | public: |
1084 | QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers) |
1085 | : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); } |
1086 | |
1087 | QRegExpEngine(const QRegExpEngineKey &key); |
1088 | ~QRegExpEngine(); |
1089 | |
1090 | bool isValid() const { return valid; } |
1091 | const QString &errorString() const { return yyError; } |
1092 | int captureCount() const { return officialncap; } |
1093 | |
1094 | int createState(QChar ch); |
1095 | int createState(const QRegExpCharClass &cc); |
1096 | #ifndef QT_NO_REGEXP_BACKREF |
1097 | int createState(int bref); |
1098 | #endif |
1099 | |
1100 | void addCatTransitions(const QVector<int> &from, const QVector<int> &to); |
1101 | #ifndef QT_NO_REGEXP_CAPTURE |
1102 | void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom); |
1103 | #endif |
1104 | |
1105 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1106 | int anchorAlternation(int a, int b); |
1107 | int anchorConcatenation(int a, int b); |
1108 | #else |
1109 | int anchorAlternation(int a, int b) { return a & b; } |
1110 | int anchorConcatenation(int a, int b) { return a | b; } |
1111 | #endif |
1112 | void addAnchors(int from, int to, int a); |
1113 | |
1114 | #ifndef QT_NO_REGEXP_OPTIM |
1115 | void heuristicallyChooseHeuristic(); |
1116 | #endif |
1117 | |
1118 | #if defined(QT_DEBUG) |
1119 | void dump() const; |
1120 | #endif |
1121 | |
1122 | QAtomicInt ref; |
1123 | |
1124 | private: |
1125 | enum { CharClassBit = 0x10000, BackRefBit = 0x20000 }; |
1126 | enum { InitialState = 0, FinalState = 1 }; |
1127 | |
1128 | void setup(); |
1129 | int setupState(int match); |
1130 | |
1131 | /* |
1132 | Let's hope that 13 lookaheads and 14 back-references are |
1133 | enough. |
1134 | */ |
1135 | enum { MaxLookaheads = 13, MaxBackRefs = 14 }; |
1136 | enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004, |
1137 | Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010, |
1138 | Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads, |
1139 | Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1, |
1140 | Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs, |
1141 | |
1142 | Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^ |
1143 | ((Anchor_FirstLookahead << MaxLookaheads) - 1) }; |
1144 | #ifndef QT_NO_REGEXP_CAPTURE |
1145 | int startAtom(bool officialCapture); |
1146 | void finishAtom(int atom, bool needCapture); |
1147 | #endif |
1148 | |
1149 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1150 | int addLookahead(QRegExpEngine *eng, bool negative); |
1151 | #endif |
1152 | |
1153 | #ifndef QT_NO_REGEXP_OPTIM |
1154 | bool goodStringMatch(QRegExpMatchState &matchState) const; |
1155 | bool badCharMatch(QRegExpMatchState &matchState) const; |
1156 | #else |
1157 | bool bruteMatch(QRegExpMatchState &matchState) const; |
1158 | #endif |
1159 | |
1160 | QVector<QRegExpAutomatonState> s; // array of states |
1161 | #ifndef QT_NO_REGEXP_CAPTURE |
1162 | QVector<QRegExpAtom> f; // atom hierarchy |
1163 | int nf; // number of atoms |
1164 | int cf; // current atom |
1165 | QVector<int> captureForOfficialCapture; |
1166 | #endif |
1167 | int officialncap; // number of captures, seen from the outside |
1168 | int ncap; // number of captures, seen from the inside |
1169 | #ifndef QT_NO_REGEXP_CCLASS |
1170 | QVector<QRegExpCharClass> cl; // array of character classes |
1171 | #endif |
1172 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1173 | QVector<QRegExpLookahead *> ahead; // array of lookaheads |
1174 | #endif |
1175 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1176 | QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors |
1177 | #endif |
1178 | #ifndef QT_NO_REGEXP_OPTIM |
1179 | bool caretAnchored; // does the regexp start with ^? |
1180 | bool trivial; // is the good-string all that needs to match? |
1181 | #endif |
1182 | bool valid; // is the regular expression valid? |
1183 | Qt::CaseSensitivity cs; // case sensitive? |
1184 | bool greedyQuantifiers; // RegExp2? |
1185 | bool xmlSchemaExtensions; |
1186 | #ifndef QT_NO_REGEXP_BACKREF |
1187 | int nbrefs; // number of back-references |
1188 | #endif |
1189 | |
1190 | #ifndef QT_NO_REGEXP_OPTIM |
1191 | bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch |
1192 | |
1193 | int goodEarlyStart; // the index where goodStr can first occur in a match |
1194 | int goodLateStart; // the index where goodStr can last occur in a match |
1195 | QString goodStr; // the string that any match has to contain |
1196 | |
1197 | int minl; // the minimum length of a match |
1198 | QVector<int> occ1; // first-occurrence array |
1199 | #endif |
1200 | |
1201 | /* |
1202 | The class Box is an abstraction for a regular expression |
1203 | fragment. It can also be seen as one node in the syntax tree of |
1204 | a regular expression with synthetized attributes. |
1205 | |
1206 | Its interface is ugly for performance reasons. |
1207 | */ |
1208 | class Box |
1209 | { |
1210 | public: |
1211 | Box(QRegExpEngine *engine); |
1212 | Box(const Box &b) { operator=(b); } |
1213 | |
1214 | Box &operator=(const Box &b); |
1215 | |
1216 | void clear() { operator=(b: Box(eng)); } |
1217 | void set(QChar ch); |
1218 | void set(const QRegExpCharClass &cc); |
1219 | #ifndef QT_NO_REGEXP_BACKREF |
1220 | void set(int bref); |
1221 | #endif |
1222 | |
1223 | void cat(const Box &b); |
1224 | void orx(const Box &b); |
1225 | void plus(int atom); |
1226 | void opt(); |
1227 | void catAnchor(int a); |
1228 | #ifndef QT_NO_REGEXP_OPTIM |
1229 | void setupHeuristics(); |
1230 | #endif |
1231 | |
1232 | #if defined(QT_DEBUG) |
1233 | void dump() const; |
1234 | #endif |
1235 | |
1236 | private: |
1237 | void addAnchorsToEngine(const Box &to) const; |
1238 | |
1239 | QRegExpEngine *eng; // the automaton under construction |
1240 | QVector<int> ls; // the left states (firstpos) |
1241 | QVector<int> rs; // the right states (lastpos) |
1242 | QMap<int, int> lanchors; // the left anchors |
1243 | QMap<int, int> ranchors; // the right anchors |
1244 | int skipanchors; // the anchors to match if the box is skipped |
1245 | |
1246 | #ifndef QT_NO_REGEXP_OPTIM |
1247 | int earlyStart; // the index where str can first occur |
1248 | int lateStart; // the index where str can last occur |
1249 | QString str; // a string that has to occur in any match |
1250 | QString leftStr; // a string occurring at the left of this box |
1251 | QString rightStr; // a string occurring at the right of this box |
1252 | int maxl; // the maximum length of this box (possibly InftyLen) |
1253 | #endif |
1254 | |
1255 | int minl; // the minimum length of this box |
1256 | #ifndef QT_NO_REGEXP_OPTIM |
1257 | QVector<int> occ1; // first-occurrence array |
1258 | #endif |
1259 | }; |
1260 | |
1261 | friend class Box; |
1262 | |
1263 | /* |
1264 | This is the lexical analyzer for regular expressions. |
1265 | */ |
1266 | enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead, |
1267 | Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar, |
1268 | Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 }; |
1269 | int getChar(); |
1270 | int getEscape(); |
1271 | #ifndef QT_NO_REGEXP_INTERVAL |
1272 | int getRep(int def); |
1273 | #endif |
1274 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1275 | void skipChars(int n); |
1276 | #endif |
1277 | void error(const char *msg); |
1278 | void startTokenizer(const QChar *rx, int len); |
1279 | int getToken(); |
1280 | |
1281 | const QChar *yyIn; // a pointer to the input regular expression pattern |
1282 | int yyPos0; // the position of yyTok in the input pattern |
1283 | int yyPos; // the position of the next character to read |
1284 | int yyLen; // the length of yyIn |
1285 | int yyCh; // the last character read |
1286 | QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens |
1287 | int yyMinRep; // attribute for Tok_Quantifier |
1288 | int yyMaxRep; // ditto |
1289 | QString yyError; // syntax error or overflow during parsing? |
1290 | |
1291 | /* |
1292 | This is the syntactic analyzer for regular expressions. |
1293 | */ |
1294 | int parse(const QChar *rx, int len); |
1295 | void parseAtom(Box *box); |
1296 | void parseFactor(Box *box); |
1297 | void parseTerm(Box *box); |
1298 | void parseExpression(Box *box); |
1299 | |
1300 | int yyTok; // the last token read |
1301 | bool yyMayCapture; // set this to false to disable capturing |
1302 | |
1303 | friend struct QRegExpMatchState; |
1304 | }; |
1305 | |
1306 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1307 | /* |
1308 | The struct QRegExpLookahead represents a lookahead a la Perl (e.g., |
1309 | (?=foo) and (?!bar)). |
1310 | */ |
1311 | struct QRegExpLookahead |
1312 | { |
1313 | QRegExpEngine *eng; // NFA representing the embedded regular expression |
1314 | bool neg; // negative lookahead? |
1315 | |
1316 | inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0) |
1317 | : eng(eng0), neg(neg0) { } |
1318 | inline ~QRegExpLookahead() { delete eng; } |
1319 | }; |
1320 | #endif |
1321 | |
1322 | /*! |
1323 | \internal |
1324 | convert the pattern string to the RegExp syntax. |
1325 | |
1326 | This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan |
1327 | */ |
1328 | Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax) |
1329 | { |
1330 | switch (patternSyntax) { |
1331 | #ifndef QT_NO_REGEXP_WILDCARD |
1332 | case QRegExp::Wildcard: |
1333 | return wc2rx(wc_str: pattern, enableEscaping: false); |
1334 | case QRegExp::WildcardUnix: |
1335 | return wc2rx(wc_str: pattern, enableEscaping: true); |
1336 | #endif |
1337 | case QRegExp::FixedString: |
1338 | return QRegExp::escape(str: pattern); |
1339 | case QRegExp::W3CXmlSchema11: |
1340 | default: |
1341 | return pattern; |
1342 | } |
1343 | } |
1344 | |
1345 | QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key) |
1346 | : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2), |
1347 | xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11) |
1348 | { |
1349 | setup(); |
1350 | |
1351 | QString rx = qt_regexp_toCanonical(pattern: key.pattern, patternSyntax: key.patternSyntax); |
1352 | |
1353 | valid = (parse(rx: rx.unicode(), len: rx.length()) == rx.length()); |
1354 | if (!valid) { |
1355 | #ifndef QT_NO_REGEXP_OPTIM |
1356 | trivial = false; |
1357 | #endif |
1358 | error(RXERR_LEFTDELIM); |
1359 | } |
1360 | } |
1361 | |
1362 | QRegExpEngine::~QRegExpEngine() |
1363 | { |
1364 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1365 | qDeleteAll(c: ahead); |
1366 | #endif |
1367 | } |
1368 | |
1369 | void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng) |
1370 | { |
1371 | /* |
1372 | We use one QVector<int> for all the big data used a lot in |
1373 | matchHere() and friends. |
1374 | */ |
1375 | int ns = eng->s.size(); // number of states |
1376 | int ncap = eng->ncap; |
1377 | #ifndef QT_NO_REGEXP_OPTIM |
1378 | int newSlideTabSize = qMax(a: eng->minl + 1, b: 16); |
1379 | #else |
1380 | int newSlideTabSize = 0; |
1381 | #endif |
1382 | int numCaptures = eng->captureCount(); |
1383 | int newCapturedSize = 2 + 2 * numCaptures; |
1384 | bigArray = q_check_ptr(p: (int *)realloc(ptr: bigArray, size: ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int))); |
1385 | |
1386 | // set all internal variables only _after_ bigArray is realloc'ed |
1387 | // to prevent a broken regexp in oom case |
1388 | |
1389 | slideTabSize = newSlideTabSize; |
1390 | capturedSize = newCapturedSize; |
1391 | inNextStack = bigArray; |
1392 | memset(s: inNextStack, c: -1, n: ns * sizeof(int)); |
1393 | curStack = inNextStack + ns; |
1394 | nextStack = inNextStack + 2 * ns; |
1395 | |
1396 | curCapBegin = inNextStack + 3 * ns; |
1397 | nextCapBegin = curCapBegin + ncap * ns; |
1398 | curCapEnd = curCapBegin + 2 * ncap * ns; |
1399 | nextCapEnd = curCapBegin + 3 * ncap * ns; |
1400 | |
1401 | tempCapBegin = curCapBegin + 4 * ncap * ns; |
1402 | tempCapEnd = tempCapBegin + ncap; |
1403 | capBegin = tempCapBegin + 2 * ncap; |
1404 | capEnd = tempCapBegin + 3 * ncap; |
1405 | |
1406 | slideTab = tempCapBegin + 4 * ncap; |
1407 | captured = slideTab + slideTabSize; |
1408 | memset(s: captured, c: -1, n: capturedSize*sizeof(int)); |
1409 | this->eng = eng; |
1410 | } |
1411 | |
1412 | /* |
1413 | Tries to match in str and returns an array of (begin, length) pairs |
1414 | for captured text. If there is no match, all pairs are (-1, -1). |
1415 | */ |
1416 | void QRegExpMatchState::match(const QChar *str0, int len0, int pos0, |
1417 | bool minimal0, bool oneTest, int caretIndex) |
1418 | { |
1419 | bool matched = false; |
1420 | QChar char_null; |
1421 | |
1422 | #ifndef QT_NO_REGEXP_OPTIM |
1423 | if (eng->trivial && !oneTest) { |
1424 | // ### Qt6: qsizetype |
1425 | pos = int(QtPrivate::findString(haystack: QStringView(str0, len0), from: pos0, needle: QStringView(eng->goodStr.unicode(), eng->goodStr.length()), cs: eng->cs)); |
1426 | matchLen = eng->goodStr.length(); |
1427 | matched = (pos != -1); |
1428 | } else |
1429 | #endif |
1430 | { |
1431 | in = str0; |
1432 | if (in == nullptr) |
1433 | in = &char_null; |
1434 | pos = pos0; |
1435 | caretPos = caretIndex; |
1436 | len = len0; |
1437 | minimal = minimal0; |
1438 | matchLen = 0; |
1439 | oneTestMatchedLen = 0; |
1440 | |
1441 | if (eng->valid && pos >= 0 && pos <= len) { |
1442 | #ifndef QT_NO_REGEXP_OPTIM |
1443 | if (oneTest) { |
1444 | matched = matchHere(); |
1445 | } else { |
1446 | if (pos <= len - eng->minl) { |
1447 | if (eng->caretAnchored) { |
1448 | matched = matchHere(); |
1449 | } else if (eng->useGoodStringHeuristic) { |
1450 | matched = eng->goodStringMatch(matchState&: *this); |
1451 | } else { |
1452 | matched = eng->badCharMatch(matchState&: *this); |
1453 | } |
1454 | } |
1455 | } |
1456 | #else |
1457 | matched = oneTest ? matchHere() : eng->bruteMatch(*this); |
1458 | #endif |
1459 | } |
1460 | } |
1461 | |
1462 | if (matched) { |
1463 | int *c = captured; |
1464 | *c++ = pos; |
1465 | *c++ = matchLen; |
1466 | |
1467 | int numCaptures = (capturedSize - 2) >> 1; |
1468 | #ifndef QT_NO_REGEXP_CAPTURE |
1469 | for (int i = 0; i < numCaptures; ++i) { |
1470 | int j = eng->captureForOfficialCapture.at(i); |
1471 | if (capBegin[j] != EmptyCapture) { |
1472 | int len = capEnd[j] - capBegin[j]; |
1473 | *c++ = (len > 0) ? pos + capBegin[j] : 0; |
1474 | *c++ = len; |
1475 | } else { |
1476 | *c++ = -1; |
1477 | *c++ = -1; |
1478 | } |
1479 | } |
1480 | #endif |
1481 | } else { |
1482 | // we rely on 2's complement here |
1483 | memset(s: captured, c: -1, n: capturedSize * sizeof(int)); |
1484 | } |
1485 | } |
1486 | |
1487 | /* |
1488 | The three following functions add one state to the automaton and |
1489 | return the number of the state. |
1490 | */ |
1491 | |
1492 | int QRegExpEngine::createState(QChar ch) |
1493 | { |
1494 | return setupState(ch.unicode()); |
1495 | } |
1496 | |
1497 | int QRegExpEngine::createState(const QRegExpCharClass &cc) |
1498 | { |
1499 | #ifndef QT_NO_REGEXP_CCLASS |
1500 | int n = cl.size(); |
1501 | cl += QRegExpCharClass(cc); |
1502 | return setupState(CharClassBit | n); |
1503 | #else |
1504 | Q_UNUSED(cc); |
1505 | return setupState(CharClassBit); |
1506 | #endif |
1507 | } |
1508 | |
1509 | #ifndef QT_NO_REGEXP_BACKREF |
1510 | int QRegExpEngine::createState(int bref) |
1511 | { |
1512 | if (bref > nbrefs) { |
1513 | nbrefs = bref; |
1514 | if (nbrefs > MaxBackRefs) { |
1515 | error(RXERR_LIMIT); |
1516 | return 0; |
1517 | } |
1518 | } |
1519 | return setupState(BackRefBit | bref); |
1520 | } |
1521 | #endif |
1522 | |
1523 | /* |
1524 | The two following functions add a transition between all pairs of |
1525 | states (i, j) where i is found in from, and j is found in to. |
1526 | |
1527 | Cat-transitions are distinguished from plus-transitions for |
1528 | capturing. |
1529 | */ |
1530 | |
1531 | void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to) |
1532 | { |
1533 | for (int i = 0; i < from.size(); i++) |
1534 | mergeInto(a: &s[from.at(i)].outs, b: to); |
1535 | } |
1536 | |
1537 | #ifndef QT_NO_REGEXP_CAPTURE |
1538 | void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom) |
1539 | { |
1540 | for (int i = 0; i < from.size(); i++) { |
1541 | QRegExpAutomatonState &st = s[from.at(i)]; |
1542 | const QVector<int> oldOuts = st.outs; |
1543 | mergeInto(a: &st.outs, b: to); |
1544 | if (f.at(i: atom).capture != QRegExpAtom::NoCapture) { |
1545 | for (int j = 0; j < to.size(); j++) { |
1546 | // ### st.reenter.contains(to.at(j)) check looks suspicious |
1547 | if (!st.reenter.contains(akey: to.at(i: j)) && |
1548 | !std::binary_search(first: oldOuts.constBegin(), last: oldOuts.constEnd(), val: to.at(i: j))) |
1549 | st.reenter.insert(akey: to.at(i: j), avalue: atom); |
1550 | } |
1551 | } |
1552 | } |
1553 | } |
1554 | #endif |
1555 | |
1556 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1557 | /* |
1558 | Returns an anchor that means a OR b. |
1559 | */ |
1560 | int QRegExpEngine::anchorAlternation(int a, int b) |
1561 | { |
1562 | if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0) |
1563 | return a & b; |
1564 | |
1565 | int n = aa.size(); |
1566 | #ifndef QT_NO_REGEXP_OPTIM |
1567 | if (n > 0 && aa.at(i: n - 1).a == a && aa.at(i: n - 1).b == b) |
1568 | return Anchor_Alternation | (n - 1); |
1569 | #endif |
1570 | |
1571 | QRegExpAnchorAlternation element = {.a: a, .b: b}; |
1572 | aa.append(t: element); |
1573 | return Anchor_Alternation | n; |
1574 | } |
1575 | |
1576 | /* |
1577 | Returns an anchor that means a AND b. |
1578 | */ |
1579 | int QRegExpEngine::anchorConcatenation(int a, int b) |
1580 | { |
1581 | if (((a | b) & Anchor_Alternation) == 0) |
1582 | return a | b; |
1583 | if ((b & Anchor_Alternation) != 0) |
1584 | qSwap(value1&: a, value2&: b); |
1585 | |
1586 | int aprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).a, b); |
1587 | int bprime = anchorConcatenation(a: aa.at(i: a ^ Anchor_Alternation).b, b); |
1588 | return anchorAlternation(a: aprime, b: bprime); |
1589 | } |
1590 | #endif |
1591 | |
1592 | /* |
1593 | Adds anchor a on a transition caracterised by its from state and |
1594 | its to state. |
1595 | */ |
1596 | void QRegExpEngine::addAnchors(int from, int to, int a) |
1597 | { |
1598 | QRegExpAutomatonState &st = s[from]; |
1599 | if (st.anchors.contains(akey: to)) |
1600 | a = anchorAlternation(a: st.anchors.value(akey: to), b: a); |
1601 | st.anchors.insert(akey: to, avalue: a); |
1602 | } |
1603 | |
1604 | #ifndef QT_NO_REGEXP_OPTIM |
1605 | /* |
1606 | This function chooses between the good-string and the bad-character |
1607 | heuristics. It computes two scores and chooses the heuristic with |
1608 | the highest score. |
1609 | |
1610 | Here are some common-sense constraints on the scores that should be |
1611 | respected if the formulas are ever modified: (1) If goodStr is |
1612 | empty, the good-string heuristic scores 0. (2) If the regular |
1613 | expression is trivial, the good-string heuristic should be used. |
1614 | (3) If the search is case insensitive, the good-string heuristic |
1615 | should be used, unless it scores 0. (Case insensitivity turns all |
1616 | entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is |
1617 | big, the good-string heuristic should score less. |
1618 | */ |
1619 | void QRegExpEngine::heuristicallyChooseHeuristic() |
1620 | { |
1621 | if (minl == 0) { |
1622 | useGoodStringHeuristic = false; |
1623 | } else if (trivial) { |
1624 | useGoodStringHeuristic = true; |
1625 | } else { |
1626 | /* |
1627 | Magic formula: The good string has to constitute a good |
1628 | proportion of the minimum-length string, and appear at a |
1629 | more-or-less known index. |
1630 | */ |
1631 | int goodStringScore = (64 * goodStr.length() / minl) - |
1632 | (goodLateStart - goodEarlyStart); |
1633 | /* |
1634 | Less magic formula: We pick some characters at random, and |
1635 | check whether they are good or bad. |
1636 | */ |
1637 | int badCharScore = 0; |
1638 | int step = qMax(a: 1, b: NumBadChars / 32); |
1639 | for (int i = 1; i < NumBadChars; i += step) { |
1640 | if (occ1.at(i) == NoOccurrence) |
1641 | badCharScore += minl; |
1642 | else |
1643 | badCharScore += occ1.at(i); |
1644 | } |
1645 | badCharScore /= minl; |
1646 | useGoodStringHeuristic = (goodStringScore > badCharScore); |
1647 | } |
1648 | } |
1649 | #endif |
1650 | |
1651 | #if defined(QT_DEBUG) |
1652 | void QRegExpEngine::dump() const |
1653 | { |
1654 | int i, j; |
1655 | qDebug(msg: "Case %ssensitive engine" , cs ? "" : "in" ); |
1656 | qDebug(msg: " States" ); |
1657 | for (i = 0; i < s.size(); i++) { |
1658 | qDebug(msg: " %d%s" , i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "" ); |
1659 | #ifndef QT_NO_REGEXP_CAPTURE |
1660 | if (nf > 0) |
1661 | qDebug(msg: " in atom %d" , s[i].atom); |
1662 | #endif |
1663 | int m = s[i].match; |
1664 | if ((m & CharClassBit) != 0) { |
1665 | qDebug(msg: " match character class %d" , m ^ CharClassBit); |
1666 | #ifndef QT_NO_REGEXP_CCLASS |
1667 | cl[m ^ CharClassBit].dump(); |
1668 | #else |
1669 | qDebug(" negative character class" ); |
1670 | #endif |
1671 | } else if ((m & BackRefBit) != 0) { |
1672 | qDebug(msg: " match back-reference %d" , m ^ BackRefBit); |
1673 | } else if (m >= 0x20 && m <= 0x7e) { |
1674 | qDebug(msg: " match 0x%.4x (%c)" , m, m); |
1675 | } else { |
1676 | qDebug(msg: " match 0x%.4x" , m); |
1677 | } |
1678 | for (j = 0; j < s[i].outs.size(); j++) { |
1679 | int next = s[i].outs[j]; |
1680 | qDebug(msg: " -> %d" , next); |
1681 | if (s[i].reenter.contains(akey: next)) |
1682 | qDebug(msg: " [reenter %d]" , s[i].reenter[next]); |
1683 | if (s[i].anchors.value(akey: next) != 0) |
1684 | qDebug(msg: " [anchors 0x%.8x]" , s[i].anchors[next]); |
1685 | } |
1686 | } |
1687 | #ifndef QT_NO_REGEXP_CAPTURE |
1688 | if (nf > 0) { |
1689 | qDebug(msg: " Atom Parent Capture" ); |
1690 | for (i = 0; i < nf; i++) { |
1691 | if (f[i].capture == QRegExpAtom::NoCapture) { |
1692 | qDebug(msg: " %6d %6d nil" , i, f[i].parent); |
1693 | } else { |
1694 | int cap = f[i].capture; |
1695 | bool official = captureForOfficialCapture.contains(t: cap); |
1696 | qDebug(msg: " %6d %6d %6d %s" , i, f[i].parent, f[i].capture, |
1697 | official ? "official" : "" ); |
1698 | } |
1699 | } |
1700 | } |
1701 | #endif |
1702 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1703 | for (i = 0; i < aa.size(); i++) |
1704 | qDebug(msg: " Anchor alternation 0x%.8x: 0x%.8x 0x%.9x" , i, aa[i].a, aa[i].b); |
1705 | #endif |
1706 | } |
1707 | #endif |
1708 | |
1709 | void QRegExpEngine::setup() |
1710 | { |
1711 | ref.storeRelaxed(newValue: 1); |
1712 | #ifndef QT_NO_REGEXP_CAPTURE |
1713 | f.resize(asize: 32); |
1714 | nf = 0; |
1715 | cf = -1; |
1716 | #endif |
1717 | officialncap = 0; |
1718 | ncap = 0; |
1719 | #ifndef QT_NO_REGEXP_OPTIM |
1720 | caretAnchored = true; |
1721 | trivial = true; |
1722 | #endif |
1723 | valid = false; |
1724 | #ifndef QT_NO_REGEXP_BACKREF |
1725 | nbrefs = 0; |
1726 | #endif |
1727 | #ifndef QT_NO_REGEXP_OPTIM |
1728 | useGoodStringHeuristic = true; |
1729 | minl = 0; |
1730 | occ1.fill(from: 0, asize: NumBadChars); |
1731 | #endif |
1732 | } |
1733 | |
1734 | int QRegExpEngine::setupState(int match) |
1735 | { |
1736 | #ifndef QT_NO_REGEXP_CAPTURE |
1737 | s += QRegExpAutomatonState(cf, match); |
1738 | #else |
1739 | s += QRegExpAutomatonState(match); |
1740 | #endif |
1741 | return s.size() - 1; |
1742 | } |
1743 | |
1744 | #ifndef QT_NO_REGEXP_CAPTURE |
1745 | /* |
1746 | Functions startAtom() and finishAtom() should be called to delimit |
1747 | atoms. When a state is created, it is assigned to the current atom. |
1748 | The information is later used for capturing. |
1749 | */ |
1750 | int QRegExpEngine::startAtom(bool officialCapture) |
1751 | { |
1752 | if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size()) |
1753 | f.resize(asize: (nf + 1) << 1); |
1754 | f[nf].parent = cf; |
1755 | cf = nf++; |
1756 | f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture; |
1757 | return cf; |
1758 | } |
1759 | |
1760 | void QRegExpEngine::finishAtom(int atom, bool needCapture) |
1761 | { |
1762 | if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture) |
1763 | f[atom].capture = QRegExpAtom::UnofficialCapture; |
1764 | cf = f.at(i: atom).parent; |
1765 | } |
1766 | #endif |
1767 | |
1768 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1769 | /* |
1770 | Creates a lookahead anchor. |
1771 | */ |
1772 | int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative) |
1773 | { |
1774 | int n = ahead.size(); |
1775 | if (n == MaxLookaheads) { |
1776 | error(RXERR_LIMIT); |
1777 | return 0; |
1778 | } |
1779 | ahead += new QRegExpLookahead(eng, negative); |
1780 | return Anchor_FirstLookahead << n; |
1781 | } |
1782 | #endif |
1783 | |
1784 | #ifndef QT_NO_REGEXP_CAPTURE |
/*
  We want the longest leftmost captures.

  Compares capture set 1 with capture set 2, zone by zone, and returns
  true if set 1 is better: at the first zone where the two differ, set
  1 must start earlier or, for an equal start, end later. Returns false
  when the sets are identical.
*/
static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
                            const int *end2)
{
    for (int zone = 0; zone < ncap; ++zone) {
        // It has to start early...
        const int startGain = begin2[zone] - begin1[zone];
        if (startGain != 0)
            return startGain > 0;

        // ...and end late.
        const int endGain = end1[zone] - end2[zone];
        if (endGain != 0)
            return endGain > 0;
    }
    return false;
}
1801 | #endif |
1802 | |
1803 | /* |
1804 | Returns \c true if anchor a matches at position pos + i in the input |
1805 | string, otherwise false. |
1806 | */ |
1807 | bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin) |
1808 | { |
1809 | int j; |
1810 | |
1811 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
1812 | if ((a & QRegExpEngine::Anchor_Alternation) != 0) |
1813 | return testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).a, capBegin) |
1814 | || testAnchor(i, a: eng->aa.at(i: a ^ QRegExpEngine::Anchor_Alternation).b, capBegin); |
1815 | #endif |
1816 | |
1817 | if ((a & QRegExpEngine::Anchor_Caret) != 0) { |
1818 | if (pos + i != caretPos) |
1819 | return false; |
1820 | } |
1821 | if ((a & QRegExpEngine::Anchor_Dollar) != 0) { |
1822 | if (pos + i != len) |
1823 | return false; |
1824 | } |
1825 | #ifndef QT_NO_REGEXP_ESCAPE |
1826 | if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) { |
1827 | bool before = false; |
1828 | bool after = false; |
1829 | if (pos + i != 0) |
1830 | before = isWord(ch: in[pos + i - 1]); |
1831 | if (pos + i != len) |
1832 | after = isWord(ch: in[pos + i]); |
1833 | if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after)) |
1834 | return false; |
1835 | if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after)) |
1836 | return false; |
1837 | } |
1838 | #endif |
1839 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
1840 | if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) { |
1841 | const QVector<QRegExpLookahead *> &ahead = eng->ahead; |
1842 | for (j = 0; j < ahead.size(); j++) { |
1843 | if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) { |
1844 | QRegExpMatchState matchState; |
1845 | matchState.prepareForMatch(eng: ahead[j]->eng); |
1846 | matchState.match(str0: in + pos + i, len0: len - pos - i, pos0: 0, |
1847 | minimal0: true, oneTest: true, caretIndex: caretPos - pos - i); |
1848 | if ((matchState.captured[0] == 0) == ahead[j]->neg) |
1849 | return false; |
1850 | } |
1851 | } |
1852 | } |
1853 | #endif |
1854 | #ifndef QT_NO_REGEXP_CAPTURE |
1855 | #ifndef QT_NO_REGEXP_BACKREF |
1856 | for (j = 0; j < eng->nbrefs; j++) { |
1857 | if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) { |
1858 | int i = eng->captureForOfficialCapture.at(i: j); |
1859 | if (capBegin[i] != EmptyCapture) |
1860 | return false; |
1861 | } |
1862 | } |
1863 | #endif |
1864 | #endif |
1865 | return true; |
1866 | } |
1867 | |
1868 | #ifndef QT_NO_REGEXP_OPTIM |
1869 | /* |
1870 | The three following functions are what Jeffrey Friedl would call |
1871 | transmissions (or bump-alongs). Using one or the other should make |
1872 | no difference except in performance. |
1873 | */ |
1874 | |
1875 | bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const |
1876 | { |
1877 | int k = matchState.pos + goodEarlyStart; |
1878 | QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs); |
1879 | while ((k = matcher.indexIn(str: matchState.in, length: matchState.len, from: k)) != -1) { |
1880 | int from = k - goodLateStart; |
1881 | int to = k - goodEarlyStart; |
1882 | if (from > matchState.pos) |
1883 | matchState.pos = from; |
1884 | |
1885 | while (matchState.pos <= to) { |
1886 | if (matchState.matchHere()) |
1887 | return true; |
1888 | ++matchState.pos; |
1889 | } |
1890 | ++k; |
1891 | } |
1892 | return false; |
1893 | } |
1894 | |
1895 | bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const |
1896 | { |
1897 | int slideHead = 0; |
1898 | int slideNext = 0; |
1899 | int i; |
1900 | int lastPos = matchState.len - minl; |
1901 | memset(s: matchState.slideTab, c: 0, n: matchState.slideTabSize * sizeof(int)); |
1902 | |
1903 | /* |
1904 | Set up the slide table, used for the bad-character heuristic, |
1905 | using the table of first occurrence of each character. |
1906 | */ |
1907 | for (i = 0; i < minl; i++) { |
1908 | int sk = occ1[BadChar(matchState.in[matchState.pos + i])]; |
1909 | if (sk == NoOccurrence) |
1910 | sk = i + 1; |
1911 | if (sk > 0) { |
1912 | int k = i + 1 - sk; |
1913 | if (k < 0) { |
1914 | sk = i + 1; |
1915 | k = 0; |
1916 | } |
1917 | if (sk > matchState.slideTab[k]) |
1918 | matchState.slideTab[k] = sk; |
1919 | } |
1920 | } |
1921 | |
1922 | if (matchState.pos > lastPos) |
1923 | return false; |
1924 | |
1925 | for (;;) { |
1926 | if (++slideNext >= matchState.slideTabSize) |
1927 | slideNext = 0; |
1928 | if (matchState.slideTab[slideHead] > 0) { |
1929 | if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext]) |
1930 | matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1; |
1931 | matchState.slideTab[slideHead] = 0; |
1932 | } else { |
1933 | if (matchState.matchHere()) |
1934 | return true; |
1935 | } |
1936 | |
1937 | if (matchState.pos == lastPos) |
1938 | break; |
1939 | |
1940 | /* |
1941 | Update the slide table. This code has much in common with |
1942 | the initialization code. |
1943 | */ |
1944 | int sk = occ1[BadChar(matchState.in[matchState.pos + minl])]; |
1945 | if (sk == NoOccurrence) { |
1946 | matchState.slideTab[slideNext] = minl; |
1947 | } else if (sk > 0) { |
1948 | int k = slideNext + minl - sk; |
1949 | if (k >= matchState.slideTabSize) |
1950 | k -= matchState.slideTabSize; |
1951 | if (sk > matchState.slideTab[k]) |
1952 | matchState.slideTab[k] = sk; |
1953 | } |
1954 | slideHead = slideNext; |
1955 | ++matchState.pos; |
1956 | } |
1957 | return false; |
1958 | } |
1959 | #else |
1960 | bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const |
1961 | { |
1962 | while (matchState.pos <= matchState.len) { |
1963 | if (matchState.matchHere()) |
1964 | return true; |
1965 | ++matchState.pos; |
1966 | } |
1967 | return false; |
1968 | } |
1969 | #endif |
1970 | |
1971 | /* |
1972 | Here's the core of the engine. It tries to do a match here and now. |
1973 | */ |
// Simulates the automaton on the input starting at offset pos. The set
// of active states starts as { InitialState }; each iteration of the
// outer loop handles the input character at offset pos + i and builds
// the set of states for the next offset.
//
// On exit, matchLen is the last offset i at which FinalState was
// reached (or -1 if it never was) and oneTestMatchedLen is i - 1.
// Returns true if matchLen >= 0.
1974 | bool QRegExpMatchState::matchHere() |
1975 | { |
1976 | int ncur = 1, nnext = 0; |
1977 | int i = 0, j, k, m; |
1978 | bool stop = false; |
1979 | |
1980 | matchLen = -1; |
1981 | oneTestMatchedLen = -1; |
1982 | curStack[0] = QRegExpEngine::InitialState; |
1983 | |
1984 | int ncap = eng->ncap; |
1985 | #ifndef QT_NO_REGEXP_CAPTURE |
// Capture zones are stored as rows of ncap ints, one row per entry of
// curStack/nextStack; row 0 belongs to the initial state and starts
// out with every zone empty.
1986 | if (ncap > 0) { |
1987 | for (j = 0; j < ncap; j++) { |
1988 | curCapBegin[j] = EmptyCapture; |
1989 | curCapEnd[j] = EmptyCapture; |
1990 | } |
1991 | } |
1992 | #endif |
1993 | |
1994 | #ifndef QT_NO_REGEXP_BACKREF |
1995 | while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop) |
1996 | #else |
1997 | while (ncur > 0 && i <= len - pos && !stop) |
1998 | #endif |
1999 | { |
// ch is the current input character, or 0 once the end of the input
// has been reached (the loop still runs once for that position).
2000 | int ch = (i < len - pos) ? in[pos + i].unicode() : 0; |
2001 | for (j = 0; j < ncur; j++) { |
2002 | int cur = curStack[j]; |
2003 | const QRegExpAutomatonState &scur = eng->s.at(i: cur); |
2004 | const QVector<int> &outs = scur.outs; |
2005 | for (k = 0; k < outs.size(); k++) { |
2006 | int next = outs.at(i: k); |
2007 | const QRegExpAutomatonState &snext = eng->s.at(i: next); |
// inside tells whether the transition cur -> next can be taken here.
2008 | bool inside = true; |
2009 | #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE) |
2010 | int needSomeSleep = 0; |
2011 | #endif |
2012 | |
2013 | /* |
2014 | First, check if the anchors are anchored properly. |
2015 | */ |
2016 | int a = scur.anchors.value(akey: next); |
2017 | if (a != 0 && !testAnchor(i, a, capBegin: curCapBegin + j * ncap)) |
2018 | inside = false; |
2019 | |
2020 | /* |
2021 | If indeed they are, check if the input character is |
2022 | correct for this transition. |
2023 | */ |
2024 | if (inside) { |
2025 | m = snext.match; |
2026 | if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) { |
2027 | if (eng->cs) |
2028 | inside = (m == ch); |
2029 | else |
2030 | inside = (QChar(m).toLower() == QChar(ch).toLower()); |
2031 | } else if (next == QRegExpEngine::FinalState) { |
// Reaching the final state records a match of length i; with minimal
// matching the outer loop stops after this iteration.
2032 | matchLen = i; |
2033 | stop = minimal; |
2034 | inside = true; |
2035 | } else if ((m & QRegExpEngine::CharClassBit) != 0) { |
2036 | #ifndef QT_NO_REGEXP_CCLASS |
2037 | const QRegExpCharClass &cc = eng->cl.at(i: m ^ QRegExpEngine::CharClassBit); |
2038 | if (eng->cs) |
2039 | inside = cc.in(ch: QChar(ch)); |
2040 | else if (cc.negative()) |
2041 | inside = cc.in(ch: QChar(ch).toLower()) && |
2042 | cc.in(ch: QChar(ch).toUpper()); |
2043 | else |
2044 | inside = cc.in(ch: QChar(ch).toLower()) || |
2045 | cc.in(ch: QChar(ch).toUpper()); |
2046 | #endif |
2047 | #if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE) |
2048 | } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */ |
2049 | int bref = m ^ QRegExpEngine::BackRefBit; |
// NOTE(review): captureForOfficialCapture.at(bref - 1) is evaluated
// before the "bref <= ncap" test below -- confirm that bref is always
// within range of that vector at this point.
2050 | int ell = j * ncap + eng->captureForOfficialCapture.at(i: bref - 1); |
2051 | |
2052 | inside = bref <= ncap && curCapBegin[ell] != EmptyCapture; |
2053 | if (inside) { |
2054 | if (eng->cs) |
2055 | inside = (in[pos + curCapBegin[ell]] == QChar(ch)); |
2056 | else |
2057 | inside = (in[pos + curCapBegin[ell]].toLower() |
2058 | == QChar(ch).toLower()); |
2059 | } |
2060 | |
2061 | if (inside) { |
// delta is the length of the captured text; a capture that is still
// open (no end yet) is taken to end at the current offset i.
2062 | int delta; |
2063 | if (curCapEnd[ell] == EmptyCapture) |
2064 | delta = i - curCapBegin[ell]; |
2065 | else |
2066 | delta = curCapEnd[ell] - curCapBegin[ell]; |
2067 | |
2068 | inside = (delta <= len - (pos + i)); |
2069 | if (inside && delta > 1) { |
2070 | int n = 1; |
2071 | if (eng->cs) { |
2072 | while (n < delta) { |
2073 | if (in[pos + curCapBegin[ell] + n] |
2074 | != in[pos + i + n]) |
2075 | break; |
2076 | ++n; |
2077 | } |
2078 | } else { |
2079 | while (n < delta) { |
2080 | QChar a = in[pos + curCapBegin[ell] + n]; |
2081 | QChar b = in[pos + i + n]; |
2082 | if (a.toLower() != b.toLower()) |
2083 | break; |
2084 | ++n; |
2085 | } |
2086 | } |
2087 | inside = (n == delta); |
// The whole back-reference was compared at once, so the next state
// only becomes active delta - 1 positions later.
2088 | if (inside) |
2089 | needSomeSleep = delta - 1; |
2090 | } |
2091 | } |
2092 | #endif |
2093 | } |
2094 | } |
2095 | |
2096 | /* |
2097 | We must now update our data structures. |
2098 | */ |
2099 | if (inside) { |
2100 | #ifndef QT_NO_REGEXP_CAPTURE |
2101 | int *capBegin, *capEnd; |
2102 | #endif |
2103 | /* |
2104 | If the next state was not encountered yet, all |
2105 | is fine. |
2106 | */ |
// inNextStack[state] is the index of state in nextStack, or -1 if the
// state has not been added for the next position yet. From here on m
// is that index for next.
2107 | if ((m = inNextStack[next]) == -1) { |
2108 | m = nnext++; |
2109 | nextStack[m] = next; |
2110 | inNextStack[next] = m; |
2111 | #ifndef QT_NO_REGEXP_CAPTURE |
2112 | capBegin = nextCapBegin + m * ncap; |
2113 | capEnd = nextCapEnd + m * ncap; |
2114 | |
2115 | /* |
2116 | Otherwise, we'll first maintain captures in |
2117 | temporary arrays, and decide at the end whether |
2118 | it's best to keep the previous capture zones or |
2119 | the new ones. |
2120 | */ |
2121 | } else { |
2122 | capBegin = tempCapBegin; |
2123 | capEnd = tempCapEnd; |
2124 | #endif |
2125 | } |
2126 | |
2127 | #ifndef QT_NO_REGEXP_CAPTURE |
2128 | /* |
2129 | Updating the capture zones is much of a task. |
2130 | */ |
2131 | if (ncap > 0) { |
// Start from the capture zones of the state we are coming from.
2132 | memcpy(dest: capBegin, src: curCapBegin + j * ncap, n: ncap * sizeof(int)); |
2133 | memcpy(dest: capEnd, src: curCapEnd + j * ncap, n: ncap * sizeof(int)); |
2134 | int c = scur.atom, n = snext.atom; |
2135 | int p = -1, q = -1; |
2136 | int cap; |
2137 | |
2138 | /* |
2139 | Lemma 1. For any x in the range [0..nf), we |
2140 | have f[x].parent < x. |
2141 | |
2142 | Proof. By looking at startAtom(), it is |
2143 | clear that cf < nf holds all the time, and |
2144 | thus that f[nf].parent < nf. |
2145 | */ |
2146 | |
2147 | /* |
2148 | If we are reentering an atom, we empty all |
2149 | capture zones inside it. |
2150 | */ |
2151 | if ((q = scur.reenter.value(akey: next)) != 0) { |
// b marks atom q and, thanks to Lemma 1 (parents have smaller indices
// than their children), every descendant of q found while scanning
// upwards from q + 1.
2152 | QBitArray b(eng->nf, false); |
2153 | b.setBit(i: q, val: true); |
2154 | for (int ell = q + 1; ell < eng->nf; ell++) { |
2155 | if (b.testBit(i: eng->f.at(i: ell).parent)) { |
2156 | b.setBit(i: ell, val: true); |
2157 | cap = eng->f.at(i: ell).capture; |
2158 | if (cap >= 0) { |
2159 | capBegin[cap] = EmptyCapture; |
2160 | capEnd[cap] = EmptyCapture; |
2161 | } |
2162 | } |
2163 | } |
2164 | p = eng->f.at(i: q).parent; |
2165 | |
2166 | /* |
2167 | Otherwise, close the capture zones we are |
2168 | leaving. We are leaving f[c].capture, |
2169 | f[f[c].parent].capture, |
2170 | f[f[f[c].parent].parent].capture, ..., |
2171 | until f[x].capture, with x such that |
2172 | f[x].parent is the youngest common ancestor |
2173 | for c and n. |
2174 | |
2175 | We go up along c's and n's ancestry until |
2176 | we find x. |
2177 | */ |
2178 | } else { |
2179 | p = c; |
2180 | q = n; |
2181 | while (p != q) { |
2182 | if (p > q) { |
2183 | cap = eng->f.at(i: p).capture; |
2184 | if (cap >= 0) { |
// A zone that would begin and end at the same offset is reset to
// empty instead of being closed.
2185 | if (capBegin[cap] == i) { |
2186 | capBegin[cap] = EmptyCapture; |
2187 | capEnd[cap] = EmptyCapture; |
2188 | } else { |
2189 | capEnd[cap] = i; |
2190 | } |
2191 | } |
2192 | p = eng->f.at(i: p).parent; |
2193 | } else { |
2194 | q = eng->f.at(i: q).parent; |
2195 | } |
2196 | } |
2197 | } |
2198 | |
2199 | /* |
2200 | In any case, we now open the capture zones |
2201 | we are entering. We work upwards from n |
2202 | until we reach p (the parent of the atom we |
2203 | reenter or the youngest common ancestor). |
2204 | */ |
2205 | while (n > p) { |
2206 | cap = eng->f.at(i: n).capture; |
2207 | if (cap >= 0) { |
2208 | capBegin[cap] = i; |
2209 | capEnd[cap] = EmptyCapture; |
2210 | } |
2211 | n = eng->f.at(i: n).parent; |
2212 | } |
2213 | /* |
2214 | If the next state was already in |
2215 | nextStack, we must choose carefully which |
2216 | capture zones we want to keep. |
2217 | */ |
2218 | if (capBegin == tempCapBegin && |
2219 | isBetterCapture(ncap, begin1: capBegin, end1: capEnd, begin2: nextCapBegin + m * ncap, |
2220 | end2: nextCapEnd + m * ncap)) { |
2221 | memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int)); |
2222 | memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int)); |
2223 | } |
2224 | } |
2225 | #ifndef QT_NO_REGEXP_BACKREF |
2226 | /* |
2227 | We are done with updating the capture zones. |
2228 | It's now time to put the next state to sleep, |
2229 | if it needs to, and to remove it from |
2230 | nextStack. |
2231 | */ |
2232 | if (needSomeSleep > 0) { |
// Layout of a sleeper: [0] wake-up offset, [1] state, followed by the
// ncap capture begins and then the ncap capture ends.
2233 | QVector<int> zzZ(2 + 2 * ncap); |
2234 | zzZ[0] = i + needSomeSleep; |
2235 | zzZ[1] = next; |
2236 | if (ncap > 0) { |
2237 | memcpy(dest: zzZ.data() + 2, src: capBegin, n: ncap * sizeof(int)); |
2238 | memcpy(dest: zzZ.data() + 2 + ncap, src: capEnd, n: ncap * sizeof(int)); |
2239 | } |
// This pops the last entry of nextStack. NOTE(review): presumably
// that entry is the one just pushed for next; verify the case where
// next was already present in nextStack.
2240 | inNextStack[nextStack[--nnext]] = -1; |
2241 | sleeping.append(t: zzZ); |
2242 | } |
2243 | #endif |
2244 | #endif |
2245 | } |
2246 | } |
2247 | } |
2248 | #ifndef QT_NO_REGEXP_CAPTURE |
2249 | /* |
2250 | If we reached the final state, hurray! Copy the captured |
2251 | zone. |
2252 | */ |
2253 | if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) { |
2254 | memcpy(dest: capBegin, src: nextCapBegin + m * ncap, n: ncap * sizeof(int)); |
2255 | memcpy(dest: capEnd, src: nextCapEnd + m * ncap, n: ncap * sizeof(int)); |
2256 | } |
2257 | #ifndef QT_NO_REGEXP_BACKREF |
2258 | /* |
2259 | It's time to wake up the sleepers. |
2260 | */ |
// Sleepers whose wake-up offset equals i are moved into nextStack
// (keeping the better capture zones if the state is already there)
// and removed from the sleeping list.
2261 | j = 0; |
2262 | while (j < sleeping.count()) { |
2263 | if (sleeping.at(i: j)[0] == i) { |
2264 | const QVector<int> &zzZ = sleeping.at(i: j); |
2265 | int next = zzZ[1]; |
2266 | const int *capBegin = zzZ.data() + 2; |
2267 | const int *capEnd = zzZ.data() + 2 + ncap; |
2268 | bool copyOver = true; |
2269 | |
2270 | if ((m = inNextStack[next]) == -1) { |
2271 | m = nnext++; |
2272 | nextStack[m] = next; |
2273 | inNextStack[next] = m; |
2274 | } else { |
2275 | copyOver = isBetterCapture(ncap, begin1: nextCapBegin + m * ncap, end1: nextCapEnd + m * ncap, |
2276 | begin2: capBegin, end2: capEnd); |
2277 | } |
2278 | if (copyOver) { |
2279 | memcpy(dest: nextCapBegin + m * ncap, src: capBegin, n: ncap * sizeof(int)); |
2280 | memcpy(dest: nextCapEnd + m * ncap, src: capEnd, n: ncap * sizeof(int)); |
2281 | } |
2282 | |
2283 | sleeping.removeAt(i: j); |
2284 | } else { |
2285 | ++j; |
2286 | } |
2287 | } |
2288 | #endif |
2289 | #endif |
// Reset the inNextStack entries of every state that was added during
// this iteration, so that the table is all -1 again.
2290 | for (j = 0; j < nnext; j++) |
2291 | inNextStack[nextStack[j]] = -1; |
2292 | |
2293 | // avoid needless iteration that confuses oneTestMatchedLen |
2294 | if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState |
2295 | #ifndef QT_NO_REGEXP_BACKREF |
2296 | && sleeping.isEmpty() |
2297 | #endif |
2298 | ) |
2299 | stop = true; |
2300 | |
// The "next" arrays become the "current" ones for the following
// input position.
2301 | qSwap(value1&: curStack, value2&: nextStack); |
2302 | #ifndef QT_NO_REGEXP_CAPTURE |
2303 | qSwap(value1&: curCapBegin, value2&: nextCapBegin); |
2304 | qSwap(value1&: curCapEnd, value2&: nextCapEnd); |
2305 | #endif |
2306 | ncur = nnext; |
2307 | nnext = 0; |
2308 | ++i; |
2309 | } |
2310 | |
2311 | #ifndef QT_NO_REGEXP_BACKREF |
2312 | /* |
2313 | If minimal matching is enabled, we might have some sleepers |
2314 | left. |
2315 | */ |
2316 | if (!sleeping.isEmpty()) |
2317 | sleeping.clear(); |
2318 | #endif |
2319 | |
// i was incremented once more after the last iteration, hence i - 1.
2320 | oneTestMatchedLen = i - 1; |
2321 | return (matchLen >= 0); |
2322 | } |
2323 | |
2324 | #ifndef QT_NO_REGEXP_CCLASS |
2325 | |
2326 | QRegExpCharClass::QRegExpCharClass() |
2327 | : c(0), n(false) |
2328 | { |
2329 | #ifndef QT_NO_REGEXP_OPTIM |
2330 | occ1.fill(from: NoOccurrence, asize: NumBadChars); |
2331 | #endif |
2332 | } |
2333 | |
2334 | void QRegExpCharClass::clear() |
2335 | { |
2336 | c = 0; |
2337 | r.clear(); |
2338 | n = false; |
2339 | } |
2340 | |
2341 | void QRegExpCharClass::setNegative(bool negative) |
2342 | { |
2343 | n = negative; |
2344 | #ifndef QT_NO_REGEXP_OPTIM |
2345 | occ1.fill(from: 0, asize: NumBadChars); |
2346 | #endif |
2347 | } |
2348 | |
2349 | void QRegExpCharClass::addCategories(uint cats) |
2350 | { |
2351 | static const int all_cats = FLAG(QChar::Mark_NonSpacing) | |
2352 | FLAG(QChar::Mark_SpacingCombining) | |
2353 | FLAG(QChar::Mark_Enclosing) | |
2354 | FLAG(QChar::Number_DecimalDigit) | |
2355 | FLAG(QChar::Number_Letter) | |
2356 | FLAG(QChar::Number_Other) | |
2357 | FLAG(QChar::Separator_Space) | |
2358 | FLAG(QChar::Separator_Line) | |
2359 | FLAG(QChar::Separator_Paragraph) | |
2360 | FLAG(QChar::Other_Control) | |
2361 | FLAG(QChar::Other_Format) | |
2362 | FLAG(QChar::Other_Surrogate) | |
2363 | FLAG(QChar::Other_PrivateUse) | |
2364 | FLAG(QChar::Other_NotAssigned) | |
2365 | FLAG(QChar::Letter_Uppercase) | |
2366 | FLAG(QChar::Letter_Lowercase) | |
2367 | FLAG(QChar::Letter_Titlecase) | |
2368 | FLAG(QChar::Letter_Modifier) | |
2369 | FLAG(QChar::Letter_Other) | |
2370 | FLAG(QChar::Punctuation_Connector) | |
2371 | FLAG(QChar::Punctuation_Dash) | |
2372 | FLAG(QChar::Punctuation_Open) | |
2373 | FLAG(QChar::Punctuation_Close) | |
2374 | FLAG(QChar::Punctuation_InitialQuote) | |
2375 | FLAG(QChar::Punctuation_FinalQuote) | |
2376 | FLAG(QChar::Punctuation_Other) | |
2377 | FLAG(QChar::Symbol_Math) | |
2378 | FLAG(QChar::Symbol_Currency) | |
2379 | FLAG(QChar::Symbol_Modifier) | |
2380 | FLAG(QChar::Symbol_Other); |
2381 | c |= (all_cats & cats); |
2382 | #ifndef QT_NO_REGEXP_OPTIM |
2383 | occ1.fill(from: 0, asize: NumBadChars); |
2384 | #endif |
2385 | } |
2386 | |
2387 | void QRegExpCharClass::addRange(ushort from, ushort to) |
2388 | { |
2389 | if (from > to) |
2390 | qSwap(value1&: from, value2&: to); |
2391 | int m = r.size(); |
2392 | r.resize(asize: m + 1); |
2393 | r[m].from = from; |
2394 | r[m].len = to - from + 1; |
2395 | |
2396 | #ifndef QT_NO_REGEXP_OPTIM |
2397 | int i; |
2398 | |
2399 | if (to - from < NumBadChars) { |
2400 | if (from % NumBadChars <= to % NumBadChars) { |
2401 | for (i = from % NumBadChars; i <= to % NumBadChars; i++) |
2402 | occ1[i] = 0; |
2403 | } else { |
2404 | for (i = 0; i <= to % NumBadChars; i++) |
2405 | occ1[i] = 0; |
2406 | for (i = from % NumBadChars; i < NumBadChars; i++) |
2407 | occ1[i] = 0; |
2408 | } |
2409 | } else { |
2410 | occ1.fill(from: 0, asize: NumBadChars); |
2411 | } |
2412 | #endif |
2413 | } |
2414 | |
2415 | bool QRegExpCharClass::in(QChar ch) const |
2416 | { |
2417 | #ifndef QT_NO_REGEXP_OPTIM |
2418 | if (occ1.at(BadChar(ch)) == NoOccurrence) |
2419 | return n; |
2420 | #endif |
2421 | |
2422 | if (c != 0 && (c & FLAG(ch.category())) != 0) |
2423 | return !n; |
2424 | |
2425 | const int uc = ch.unicode(); |
2426 | int size = r.size(); |
2427 | |
2428 | for (int i = 0; i < size; ++i) { |
2429 | const QRegExpCharClassRange &range = r.at(i); |
2430 | if (uint(uc - range.from) < uint(r.at(i).len)) |
2431 | return !n; |
2432 | } |
2433 | return n; |
2434 | } |
2435 | |
2436 | #if defined(QT_DEBUG) |
2437 | void QRegExpCharClass::dump() const |
2438 | { |
2439 | int i; |
2440 | qDebug(msg: " %stive character class" , n ? "nega" : "posi" ); |
2441 | #ifndef QT_NO_REGEXP_CCLASS |
2442 | if (c != 0) |
2443 | qDebug(msg: " categories 0x%.8x" , c); |
2444 | #endif |
2445 | for (i = 0; i < r.size(); i++) |
2446 | qDebug(msg: " 0x%.4x through 0x%.4x" , r[i].from, r[i].from + r[i].len - 1); |
2447 | } |
2448 | #endif |
2449 | #endif |
2450 | |
2451 | QRegExpEngine::Box::Box(QRegExpEngine *engine) |
2452 | : eng(engine), skipanchors(0) |
2453 | #ifndef QT_NO_REGEXP_OPTIM |
2454 | , earlyStart(0), lateStart(0), maxl(0) |
2455 | #endif |
2456 | { |
2457 | #ifndef QT_NO_REGEXP_OPTIM |
2458 | occ1.fill(from: NoOccurrence, asize: NumBadChars); |
2459 | #endif |
2460 | minl = 0; |
2461 | } |
2462 | |
2463 | QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b) |
2464 | { |
2465 | eng = b.eng; |
2466 | ls = b.ls; |
2467 | rs = b.rs; |
2468 | lanchors = b.lanchors; |
2469 | ranchors = b.ranchors; |
2470 | skipanchors = b.skipanchors; |
2471 | #ifndef QT_NO_REGEXP_OPTIM |
2472 | earlyStart = b.earlyStart; |
2473 | lateStart = b.lateStart; |
2474 | str = b.str; |
2475 | leftStr = b.leftStr; |
2476 | rightStr = b.rightStr; |
2477 | maxl = b.maxl; |
2478 | occ1 = b.occ1; |
2479 | #endif |
2480 | minl = b.minl; |
2481 | return *this; |
2482 | } |
2483 | |
2484 | void QRegExpEngine::Box::set(QChar ch) |
2485 | { |
2486 | ls.resize(asize: 1); |
2487 | ls[0] = eng->createState(ch); |
2488 | rs = ls; |
2489 | #ifndef QT_NO_REGEXP_OPTIM |
2490 | str = ch; |
2491 | leftStr = ch; |
2492 | rightStr = ch; |
2493 | maxl = 1; |
2494 | occ1[BadChar(ch)] = 0; |
2495 | #endif |
2496 | minl = 1; |
2497 | } |
2498 | |
2499 | void QRegExpEngine::Box::set(const QRegExpCharClass &cc) |
2500 | { |
2501 | ls.resize(asize: 1); |
2502 | ls[0] = eng->createState(cc); |
2503 | rs = ls; |
2504 | #ifndef QT_NO_REGEXP_OPTIM |
2505 | maxl = 1; |
2506 | occ1 = cc.firstOccurrence(); |
2507 | #endif |
2508 | minl = 1; |
2509 | } |
2510 | |
2511 | #ifndef QT_NO_REGEXP_BACKREF |
2512 | void QRegExpEngine::Box::set(int bref) |
2513 | { |
2514 | ls.resize(asize: 1); |
2515 | ls[0] = eng->createState(bref); |
2516 | rs = ls; |
2517 | if (bref >= 1 && bref <= MaxBackRefs) |
2518 | skipanchors = Anchor_BackRef0Empty << bref; |
2519 | #ifndef QT_NO_REGEXP_OPTIM |
2520 | maxl = InftyLen; |
2521 | #endif |
2522 | minl = 0; |
2523 | } |
2524 | #endif |
2525 | |
2526 | void QRegExpEngine::Box::cat(const Box &b) |
2527 | { |
2528 | eng->addCatTransitions(from: rs, to: b.ls); |
2529 | addAnchorsToEngine(to: b); |
2530 | if (minl == 0) { |
2531 | lanchors.insert(map: b.lanchors); |
2532 | if (skipanchors != 0) { |
2533 | for (int i = 0; i < b.ls.size(); i++) { |
2534 | int a = eng->anchorConcatenation(a: lanchors.value(akey: b.ls.at(i), adefaultValue: 0), b: skipanchors); |
2535 | lanchors.insert(akey: b.ls.at(i), avalue: a); |
2536 | } |
2537 | } |
2538 | mergeInto(a: &ls, b: b.ls); |
2539 | } |
2540 | if (b.minl == 0) { |
2541 | ranchors.insert(map: b.ranchors); |
2542 | if (b.skipanchors != 0) { |
2543 | for (int i = 0; i < rs.size(); i++) { |
2544 | int a = eng->anchorConcatenation(a: ranchors.value(akey: rs.at(i), adefaultValue: 0), b: b.skipanchors); |
2545 | ranchors.insert(akey: rs.at(i), avalue: a); |
2546 | } |
2547 | } |
2548 | mergeInto(a: &rs, b: b.rs); |
2549 | } else { |
2550 | ranchors = b.ranchors; |
2551 | rs = b.rs; |
2552 | } |
2553 | |
2554 | #ifndef QT_NO_REGEXP_OPTIM |
2555 | if (maxl != InftyLen) { |
2556 | if (rightStr.length() + b.leftStr.length() > |
2557 | qMax(a: str.length(), b: b.str.length())) { |
2558 | earlyStart = minl - rightStr.length(); |
2559 | lateStart = maxl - rightStr.length(); |
2560 | str = rightStr + b.leftStr; |
2561 | } else if (b.str.length() > str.length()) { |
2562 | earlyStart = minl + b.earlyStart; |
2563 | lateStart = maxl + b.lateStart; |
2564 | str = b.str; |
2565 | } |
2566 | } |
2567 | |
2568 | if (leftStr.length() == maxl) |
2569 | leftStr += b.leftStr; |
2570 | |
2571 | if (b.rightStr.length() == b.maxl) { |
2572 | rightStr += b.rightStr; |
2573 | } else { |
2574 | rightStr = b.rightStr; |
2575 | } |
2576 | |
2577 | if (maxl == InftyLen || b.maxl == InftyLen) { |
2578 | maxl = InftyLen; |
2579 | } else { |
2580 | maxl += b.maxl; |
2581 | } |
2582 | |
2583 | for (int i = 0; i < NumBadChars; i++) { |
2584 | if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i)) |
2585 | occ1[i] = minl + b.occ1.at(i); |
2586 | } |
2587 | #endif |
2588 | |
2589 | minl += b.minl; |
2590 | if (minl == 0) |
2591 | skipanchors = eng->anchorConcatenation(a: skipanchors, b: b.skipanchors); |
2592 | else |
2593 | skipanchors = 0; |
2594 | } |
2595 | |
2596 | void QRegExpEngine::Box::orx(const Box &b) |
2597 | { |
2598 | mergeInto(a: &ls, b: b.ls); |
2599 | lanchors.insert(map: b.lanchors); |
2600 | mergeInto(a: &rs, b: b.rs); |
2601 | ranchors.insert(map: b.ranchors); |
2602 | |
2603 | if (b.minl == 0) { |
2604 | if (minl == 0) |
2605 | skipanchors = eng->anchorAlternation(a: skipanchors, b: b.skipanchors); |
2606 | else |
2607 | skipanchors = b.skipanchors; |
2608 | } |
2609 | |
2610 | #ifndef QT_NO_REGEXP_OPTIM |
2611 | for (int i = 0; i < NumBadChars; i++) { |
2612 | if (occ1.at(i) > b.occ1.at(i)) |
2613 | occ1[i] = b.occ1.at(i); |
2614 | } |
2615 | earlyStart = 0; |
2616 | lateStart = 0; |
2617 | str = QString(); |
2618 | leftStr = QString(); |
2619 | rightStr = QString(); |
2620 | if (b.maxl > maxl) |
2621 | maxl = b.maxl; |
2622 | #endif |
2623 | if (b.minl < minl) |
2624 | minl = b.minl; |
2625 | } |
2626 | |
2627 | void QRegExpEngine::Box::plus(int atom) |
2628 | { |
2629 | #ifndef QT_NO_REGEXP_CAPTURE |
2630 | eng->addPlusTransitions(from: rs, to: ls, atom); |
2631 | #else |
2632 | Q_UNUSED(atom); |
2633 | eng->addCatTransitions(rs, ls); |
2634 | #endif |
2635 | addAnchorsToEngine(to: *this); |
2636 | #ifndef QT_NO_REGEXP_OPTIM |
2637 | maxl = InftyLen; |
2638 | #endif |
2639 | } |
2640 | |
2641 | void QRegExpEngine::Box::opt() |
2642 | { |
2643 | #ifndef QT_NO_REGEXP_OPTIM |
2644 | earlyStart = 0; |
2645 | lateStart = 0; |
2646 | str = QString(); |
2647 | leftStr = QString(); |
2648 | rightStr = QString(); |
2649 | #endif |
2650 | skipanchors = 0; |
2651 | minl = 0; |
2652 | } |
2653 | |
2654 | void QRegExpEngine::Box::catAnchor(int a) |
2655 | { |
2656 | if (a != 0) { |
2657 | for (int i = 0; i < rs.size(); i++) { |
2658 | a = eng->anchorConcatenation(a: ranchors.value(akey: rs.at(i), adefaultValue: 0), b: a); |
2659 | ranchors.insert(akey: rs.at(i), avalue: a); |
2660 | } |
2661 | if (minl == 0) |
2662 | skipanchors = eng->anchorConcatenation(a: skipanchors, b: a); |
2663 | } |
2664 | } |
2665 | |
2666 | #ifndef QT_NO_REGEXP_OPTIM |
2667 | void QRegExpEngine::Box::setupHeuristics() |
2668 | { |
2669 | eng->goodEarlyStart = earlyStart; |
2670 | eng->goodLateStart = lateStart; |
2671 | eng->goodStr = eng->cs ? str : str.toLower(); |
2672 | |
2673 | eng->minl = minl; |
2674 | if (eng->cs) { |
2675 | /* |
2676 | A regular expression such as 112|1 has occ1['2'] = 2 and minl = |
2677 | 1 at this point. An entry of occ1 has to be at most minl or |
2678 | infinity for the rest of the algorithm to go well. |
2679 | |
2680 | We waited until here before normalizing these cases (instead of |
2681 | doing it in Box::orx()) because sometimes things improve by |
2682 | themselves. Consider for example (112|1)34. |
2683 | */ |
2684 | for (int i = 0; i < NumBadChars; i++) { |
2685 | if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl) |
2686 | occ1[i] = minl; |
2687 | } |
2688 | eng->occ1 = occ1; |
2689 | } else { |
2690 | eng->occ1.fill(from: 0, asize: NumBadChars); |
2691 | } |
2692 | |
2693 | eng->heuristicallyChooseHeuristic(); |
2694 | } |
2695 | #endif |
2696 | |
2697 | #if defined(QT_DEBUG) |
2698 | void QRegExpEngine::Box::dump() const |
2699 | { |
2700 | int i; |
2701 | qDebug(msg: "Box of at least %d character%s" , minl, minl == 1 ? "" : "s" ); |
2702 | qDebug(msg: " Left states:" ); |
2703 | for (i = 0; i < ls.size(); i++) { |
2704 | if (lanchors.value(akey: ls[i], adefaultValue: 0) == 0) |
2705 | qDebug(msg: " %d" , ls[i]); |
2706 | else |
2707 | qDebug(msg: " %d [anchors 0x%.8x]" , ls[i], lanchors[ls[i]]); |
2708 | } |
2709 | qDebug(msg: " Right states:" ); |
2710 | for (i = 0; i < rs.size(); i++) { |
2711 | if (ranchors.value(akey: rs[i], adefaultValue: 0) == 0) |
2712 | qDebug(msg: " %d" , rs[i]); |
2713 | else |
2714 | qDebug(msg: " %d [anchors 0x%.8x]" , rs[i], ranchors[rs[i]]); |
2715 | } |
2716 | qDebug(msg: " Skip anchors: 0x%.8x" , skipanchors); |
2717 | } |
2718 | #endif |
2719 | |
2720 | void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const |
2721 | { |
2722 | for (int i = 0; i < to.ls.size(); i++) { |
2723 | for (int j = 0; j < rs.size(); j++) { |
2724 | int a = eng->anchorConcatenation(a: ranchors.value(akey: rs.at(i: j), adefaultValue: 0), |
2725 | b: to.lanchors.value(akey: to.ls.at(i), adefaultValue: 0)); |
2726 | eng->addAnchors(from: rs[j], to: to.ls[i], a); |
2727 | } |
2728 | } |
2729 | } |
2730 | |
2731 | #ifndef QT_NO_REGEXP_CCLASS |
// Lookup table for the XML Schema block escapes (\p{IsBlockName}).
// The entries are sorted by name (byte-wise) so that the table can be
// binary-searched; keep it sorted when adding entries.
struct CategoriesRangeMapEntry {
    const char name[40]; // block name, without the leading "Is"
    uint first;          // first code point of the block
    uint second;         // last code point of the block
};

static const CategoriesRangeMapEntry categoriesRangeMap[] = {
    { "AegeanNumbers",                        0x10100,  0x1013F },
    { "AlphabeticPresentationForms",          0xFB00,   0xFB4F },
    { "AncientGreekMusicalNotation",          0x1D200,  0x1D24F },
    { "AncientGreekNumbers",                  0x10140,  0x1018F },
    { "Arabic",                               0x0600,   0x06FF },
    { "ArabicPresentationForms-A",            0xFB50,   0xFDFF },
    { "ArabicPresentationForms-B",            0xFE70,   0xFEFF },
    { "ArabicSupplement",                     0x0750,   0x077F },
    { "Armenian",                             0x0530,   0x058F },
    { "Arrows",                               0x2190,   0x21FF },
    { "BasicLatin",                           0x0000,   0x007F },
    { "Bengali",                              0x0980,   0x09FF },
    { "BlockElements",                        0x2580,   0x259F },
    { "Bopomofo",                             0x3100,   0x312F },
    { "BopomofoExtended",                     0x31A0,   0x31BF },
    { "BoxDrawing",                           0x2500,   0x257F },
    { "BraillePatterns",                      0x2800,   0x28FF },
    { "Buginese",                             0x1A00,   0x1A1F },
    { "Buhid",                                0x1740,   0x175F },
    { "ByzantineMusicalSymbols",              0x1D000,  0x1D0FF },
    { "CJKCompatibility",                     0x3300,   0x33FF },
    { "CJKCompatibilityForms",                0xFE30,   0xFE4F },
    { "CJKCompatibilityIdeographs",           0xF900,   0xFAFF },
    { "CJKCompatibilityIdeographsSupplement", 0x2F800,  0x2FA1F },
    { "CJKRadicalsSupplement",                0x2E80,   0x2EFF },
    { "CJKStrokes",                           0x31C0,   0x31EF },
    { "CJKSymbolsandPunctuation",             0x3000,   0x303F },
    { "CJKUnifiedIdeographs",                 0x4E00,   0x9FFF },
    { "CJKUnifiedIdeographsExtensionA",       0x3400,   0x4DB5 },
    { "CJKUnifiedIdeographsExtensionB",       0x20000,  0x2A6DF },
    { "Cherokee",                             0x13A0,   0x13FF },
    { "CombiningDiacriticalMarks",            0x0300,   0x036F },
    { "CombiningDiacriticalMarksSupplement",  0x1DC0,   0x1DFF },
    { "CombiningHalfMarks",                   0xFE20,   0xFE2F },
    { "CombiningMarksforSymbols",             0x20D0,   0x20FF },
    { "ControlPictures",                      0x2400,   0x243F },
    { "Coptic",                               0x2C80,   0x2CFF },
    { "CurrencySymbols",                      0x20A0,   0x20CF },
    { "CypriotSyllabary",                     0x10800,  0x1083F },
    { "Cyrillic",                             0x0400,   0x04FF },
    { "CyrillicSupplement",                   0x0500,   0x052F },
    { "Deseret",                              0x10400,  0x1044F },
    { "Devanagari",                           0x0900,   0x097F },
    { "Dingbats",                             0x2700,   0x27BF },
    { "EnclosedAlphanumerics",                0x2460,   0x24FF },
    { "EnclosedCJKLettersandMonths",          0x3200,   0x32FF },
    { "Ethiopic",                             0x1200,   0x137F },
    { "EthiopicExtended",                     0x2D80,   0x2DDF },
    { "EthiopicSupplement",                   0x1380,   0x139F },
    { "GeneralPunctuation",                   0x2000,   0x206F },
    { "GeometricShapes",                      0x25A0,   0x25FF },
    { "Georgian",                             0x10A0,   0x10FF },
    { "GeorgianSupplement",                   0x2D00,   0x2D2F },
    { "Glagolitic",                           0x2C00,   0x2C5F },
    { "Gothic",                               0x10330,  0x1034F },
    { "Greek",                                0x0370,   0x03FF },
    { "GreekExtended",                        0x1F00,   0x1FFF },
    { "Gujarati",                             0x0A80,   0x0AFF },
    { "Gurmukhi",                             0x0A00,   0x0A7F },
    { "HalfwidthandFullwidthForms",           0xFF00,   0xFFEF },
    { "HangulCompatibilityJamo",              0x3130,   0x318F },
    { "HangulJamo",                           0x1100,   0x11FF },
    { "HangulSyllables",                      0xAC00,   0xD7A3 },
    { "Hanunoo",                              0x1720,   0x173F },
    { "Hebrew",                               0x0590,   0x05FF },
    { "Hiragana",                             0x3040,   0x309F },
    { "IPAExtensions",                        0x0250,   0x02AF },
    { "IdeographicDescriptionCharacters",     0x2FF0,   0x2FFF },
    { "Kanbun",                               0x3190,   0x319F },
    { "KangxiRadicals",                       0x2F00,   0x2FDF },
    { "Kannada",                              0x0C80,   0x0CFF },
    { "Katakana",                             0x30A0,   0x30FF },
    { "KatakanaPhoneticExtensions",           0x31F0,   0x31FF },
    { "Kharoshthi",                           0x10A00,  0x10A5F },
    { "Khmer",                                0x1780,   0x17FF },
    { "KhmerSymbols",                         0x19E0,   0x19FF },
    { "Lao",                                  0x0E80,   0x0EFF },
    { "Latin-1Supplement",                    0x0080,   0x00FF },
    { "LatinExtended-A",                      0x0100,   0x017F },
    { "LatinExtended-B",                      0x0180,   0x024F },
    { "LatinExtendedAdditional",              0x1E00,   0x1EFF },
    { "LetterlikeSymbols",                    0x2100,   0x214F },
    { "Limbu",                                0x1900,   0x194F },
    { "LinearBIdeograms",                     0x10080,  0x100FF },
    { "LinearBSyllabary",                     0x10000,  0x1007F },
    { "Malayalam",                            0x0D00,   0x0D7F },
    { "MathematicalAlphanumericSymbols",      0x1D400,  0x1D7FF },
    { "MathematicalOperators",                0x2200,   0x22FF },
    { "MiscellaneousMathematicalSymbols-A",   0x27C0,   0x27EF },
    { "MiscellaneousMathematicalSymbols-B",   0x2980,   0x29FF },
    { "MiscellaneousSymbols",                 0x2600,   0x26FF },
    { "MiscellaneousSymbolsandArrows",        0x2B00,   0x2BFF },
    { "MiscellaneousTechnical",               0x2300,   0x23FF },
    { "ModifierToneLetters",                  0xA700,   0xA71F },
    { "Mongolian",                            0x1800,   0x18AF },
    { "MusicalSymbols",                       0x1D100,  0x1D1FF },
    { "Myanmar",                              0x1000,   0x109F },
    { "NewTaiLue",                            0x1980,   0x19DF },
    { "NumberForms",                          0x2150,   0x218F },
    { "Ogham",                                0x1680,   0x169F },
    { "OldItalic",                            0x10300,  0x1032F },
    { "OldPersian",                           0x103A0,  0x103DF },
    { "OpticalCharacterRecognition",          0x2440,   0x245F },
    { "Oriya",                                0x0B00,   0x0B7F },
    { "Osmanya",                              0x10480,  0x104AF },
    { "PhoneticExtensions",                   0x1D00,   0x1D7F },
    { "PhoneticExtensionsSupplement",         0x1D80,   0x1DBF },
    { "PrivateUse",                           0xE000,   0xF8FF },
    { "Runic",                                0x16A0,   0x16FF },
    { "Shavian",                              0x10450,  0x1047F },
    { "Sinhala",                              0x0D80,   0x0DFF },
    { "SmallFormVariants",                    0xFE50,   0xFE6F },
    { "SpacingModifierLetters",               0x02B0,   0x02FF },
    { "Specials",                             0xFFF0,   0xFFFF },
    { "SuperscriptsandSubscripts",            0x2070,   0x209F },
    { "SupplementalArrows-A",                 0x27F0,   0x27FF },
    { "SupplementalArrows-B",                 0x2900,   0x297F },
    { "SupplementalMathematicalOperators",    0x2A00,   0x2AFF },
    { "SupplementalPunctuation",              0x2E00,   0x2E7F },
    { "SupplementaryPrivateUseArea-A",        0xF0000,  0xFFFFF },
    { "SupplementaryPrivateUseArea-B",        0x100000, 0x10FFFF },
    { "SylotiNagri",                          0xA800,   0xA82F },
    { "Syriac",                               0x0700,   0x074F },
    { "Tagalog",                              0x1700,   0x171F },
    { "Tagbanwa",                             0x1760,   0x177F },
    { "Tags",                                 0xE0000,  0xE007F },
    { "TaiLe",                                0x1950,   0x197F },
    { "TaiXuanJingSymbols",                   0x1D300,  0x1D35F },
    { "Tamil",                                0x0B80,   0x0BFF },
    { "Telugu",                               0x0C00,   0x0C7F },
    { "Thaana",                               0x0780,   0x07BF },
    { "Thai",                                 0x0E00,   0x0E7F },
    { "Tibetan",                              0x0F00,   0x0FFF },
    { "Tifinagh",                             0x2D30,   0x2D7F },
    { "Ugaritic",                             0x10380,  0x1039F },
    { "UnifiedCanadianAboriginalSyllabics",   0x1400,   0x167F },
    { "VariationSelectors",                   0xFE00,   0xFE0F },
    { "VariationSelectorsSupplement",         0xE0100,  0xE01EF },
    { "VerticalForms",                        0xFE10,   0xFE1F },
    { "YiRadicals",                           0xA490,   0xA4CF },
    { "YiSyllables",                          0xA000,   0xA48F },
    { "YijingHexagramSymbols",                0x4DC0,   0x4DFF }
};
2881 | |
2882 | inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2) |
2883 | { return qstrcmp(str1: entry1.name, str2: entry2.name) < 0; } |
2884 | inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry) |
2885 | { return qstrcmp(str1: name, str2: entry.name) < 0; } |
2886 | inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name) |
2887 | { return qstrcmp(str1: entry.name, str2: name) < 0; } |
2888 | #endif // QT_NO_REGEXP_CCLASS |
2889 | |
2890 | int QRegExpEngine::getChar() |
2891 | { |
2892 | return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode(); |
2893 | } |
2894 | |
2895 | int QRegExpEngine::getEscape() |
2896 | { |
2897 | #ifndef QT_NO_REGEXP_ESCAPE |
2898 | const char tab[] = "afnrtv" ; // no b, as \b means word boundary |
2899 | const char backTab[] = "\a\f\n\r\t\v" ; |
2900 | ushort low; |
2901 | int i; |
2902 | #endif |
2903 | ushort val; |
2904 | int prevCh = yyCh; |
2905 | |
2906 | if (prevCh == EOS) { |
2907 | error(RXERR_END); |
2908 | return Tok_Char | '\\'; |
2909 | } |
2910 | yyCh = getChar(); |
2911 | #ifndef QT_NO_REGEXP_ESCAPE |
2912 | if ((prevCh & ~0xff) == 0) { |
2913 | const char *p = strchr(s: tab, c: prevCh); |
2914 | if (p != nullptr) |
2915 | return Tok_Char | backTab[p - tab]; |
2916 | } |
2917 | #endif |
2918 | |
2919 | switch (prevCh) { |
2920 | #ifndef QT_NO_REGEXP_ESCAPE |
2921 | case '0': |
2922 | val = 0; |
2923 | for (i = 0; i < 3; i++) { |
2924 | if (yyCh >= '0' && yyCh <= '7') |
2925 | val = (val << 3) | (yyCh - '0'); |
2926 | else |
2927 | break; |
2928 | yyCh = getChar(); |
2929 | } |
2930 | if ((val & ~0377) != 0) |
2931 | error(RXERR_OCTAL); |
2932 | return Tok_Char | val; |
2933 | #endif |
2934 | #ifndef QT_NO_REGEXP_ESCAPE |
2935 | case 'B': |
2936 | return Tok_NonWord; |
2937 | #endif |
2938 | #ifndef QT_NO_REGEXP_CCLASS |
2939 | case 'D': |
2940 | // see QChar::isDigit() |
2941 | yyCharClass->addCategories(cats: uint(-1) ^ FLAG(QChar::Number_DecimalDigit)); |
2942 | return Tok_CharClass; |
2943 | case 'S': |
2944 | // see QChar::isSpace() |
2945 | yyCharClass->addCategories(cats: uint(-1) ^ (FLAG(QChar::Separator_Space) | |
2946 | FLAG(QChar::Separator_Line) | |
2947 | FLAG(QChar::Separator_Paragraph) | |
2948 | FLAG(QChar::Other_Control))); |
2949 | yyCharClass->addRange(from: 0x0000, to: 0x0008); |
2950 | yyCharClass->addRange(from: 0x000e, to: 0x001f); |
2951 | yyCharClass->addRange(from: 0x007f, to: 0x0084); |
2952 | yyCharClass->addRange(from: 0x0086, to: 0x009f); |
2953 | return Tok_CharClass; |
2954 | case 'W': |
2955 | // see QChar::isLetterOrNumber() and QChar::isMark() |
2956 | yyCharClass->addCategories(cats: uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) | |
2957 | FLAG(QChar::Mark_SpacingCombining) | |
2958 | FLAG(QChar::Mark_Enclosing) | |
2959 | FLAG(QChar::Number_DecimalDigit) | |
2960 | FLAG(QChar::Number_Letter) | |
2961 | FLAG(QChar::Number_Other) | |
2962 | FLAG(QChar::Letter_Uppercase) | |
2963 | FLAG(QChar::Letter_Lowercase) | |
2964 | FLAG(QChar::Letter_Titlecase) | |
2965 | FLAG(QChar::Letter_Modifier) | |
2966 | FLAG(QChar::Letter_Other) | |
2967 | FLAG(QChar::Punctuation_Connector))); |
2968 | yyCharClass->addRange(from: 0x203f, to: 0x2040); |
2969 | yyCharClass->addSingleton(ch: 0x2040); |
2970 | yyCharClass->addSingleton(ch: 0x2054); |
2971 | yyCharClass->addSingleton(ch: 0x30fb); |
2972 | yyCharClass->addRange(from: 0xfe33, to: 0xfe34); |
2973 | yyCharClass->addRange(from: 0xfe4d, to: 0xfe4f); |
2974 | yyCharClass->addSingleton(ch: 0xff3f); |
2975 | yyCharClass->addSingleton(ch: 0xff65); |
2976 | return Tok_CharClass; |
2977 | #endif |
2978 | #ifndef QT_NO_REGEXP_ESCAPE |
2979 | case 'b': |
2980 | return Tok_Word; |
2981 | #endif |
2982 | #ifndef QT_NO_REGEXP_CCLASS |
2983 | case 'd': |
2984 | // see QChar::isDigit() |
2985 | yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); |
2986 | return Tok_CharClass; |
2987 | case 's': |
2988 | // see QChar::isSpace() |
2989 | yyCharClass->addCategories(FLAG(QChar::Separator_Space) | |
2990 | FLAG(QChar::Separator_Line) | |
2991 | FLAG(QChar::Separator_Paragraph)); |
2992 | yyCharClass->addRange(from: 0x0009, to: 0x000d); |
2993 | yyCharClass->addSingleton(ch: 0x0085); |
2994 | return Tok_CharClass; |
2995 | case 'w': |
2996 | // see QChar::isLetterOrNumber() and QChar::isMark() |
2997 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | |
2998 | FLAG(QChar::Mark_SpacingCombining) | |
2999 | FLAG(QChar::Mark_Enclosing) | |
3000 | FLAG(QChar::Number_DecimalDigit) | |
3001 | FLAG(QChar::Number_Letter) | |
3002 | FLAG(QChar::Number_Other) | |
3003 | FLAG(QChar::Letter_Uppercase) | |
3004 | FLAG(QChar::Letter_Lowercase) | |
3005 | FLAG(QChar::Letter_Titlecase) | |
3006 | FLAG(QChar::Letter_Modifier) | |
3007 | FLAG(QChar::Letter_Other)); |
3008 | yyCharClass->addSingleton(ch: 0x005f); // '_' |
3009 | return Tok_CharClass; |
3010 | case 'I': |
3011 | if (!xmlSchemaExtensions) |
3012 | break; |
3013 | yyCharClass->setNegative(!yyCharClass->negative()); |
3014 | Q_FALLTHROUGH(); |
3015 | case 'i': |
3016 | if (xmlSchemaExtensions) { |
3017 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | |
3018 | FLAG(QChar::Mark_SpacingCombining) | |
3019 | FLAG(QChar::Mark_Enclosing) | |
3020 | FLAG(QChar::Number_DecimalDigit) | |
3021 | FLAG(QChar::Number_Letter) | |
3022 | FLAG(QChar::Number_Other) | |
3023 | FLAG(QChar::Letter_Uppercase) | |
3024 | FLAG(QChar::Letter_Lowercase) | |
3025 | FLAG(QChar::Letter_Titlecase) | |
3026 | FLAG(QChar::Letter_Modifier) | |
3027 | FLAG(QChar::Letter_Other)); |
3028 | yyCharClass->addSingleton(ch: 0x003a); // ':' |
3029 | yyCharClass->addSingleton(ch: 0x005f); // '_' |
3030 | yyCharClass->addRange(from: 0x0041, to: 0x005a); // [A-Z] |
3031 | yyCharClass->addRange(from: 0x0061, to: 0x007a); // [a-z] |
3032 | yyCharClass->addRange(from: 0xc0, to: 0xd6); |
3033 | yyCharClass->addRange(from: 0xd8, to: 0xf6); |
3034 | yyCharClass->addRange(from: 0xf8, to: 0x2ff); |
3035 | yyCharClass->addRange(from: 0x370, to: 0x37d); |
3036 | yyCharClass->addRange(from: 0x37f, to: 0x1fff); |
3037 | yyCharClass->addRange(from: 0x200c, to: 0x200d); |
3038 | yyCharClass->addRange(from: 0x2070, to: 0x218f); |
3039 | yyCharClass->addRange(from: 0x2c00, to: 0x2fef); |
3040 | yyCharClass->addRange(from: 0x3001, to: 0xd7ff); |
3041 | yyCharClass->addRange(from: 0xf900, to: 0xfdcf); |
3042 | yyCharClass->addRange(from: 0xfdf0, to: 0xfffd); |
3043 | yyCharClass->addRange(from: (ushort)0x10000, to: (ushort)0xeffff); |
3044 | return Tok_CharClass; |
3045 | } else { |
3046 | break; |
3047 | } |
3048 | case 'C': |
3049 | if (!xmlSchemaExtensions) |
3050 | break; |
3051 | yyCharClass->setNegative(!yyCharClass->negative()); |
3052 | Q_FALLTHROUGH(); |
3053 | case 'c': |
3054 | if (xmlSchemaExtensions) { |
3055 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | |
3056 | FLAG(QChar::Mark_SpacingCombining) | |
3057 | FLAG(QChar::Mark_Enclosing) | |
3058 | FLAG(QChar::Number_DecimalDigit) | |
3059 | FLAG(QChar::Number_Letter) | |
3060 | FLAG(QChar::Number_Other) | |
3061 | FLAG(QChar::Letter_Uppercase) | |
3062 | FLAG(QChar::Letter_Lowercase) | |
3063 | FLAG(QChar::Letter_Titlecase) | |
3064 | FLAG(QChar::Letter_Modifier) | |
3065 | FLAG(QChar::Letter_Other)); |
3066 | yyCharClass->addSingleton(ch: 0x002d); // '-' |
3067 | yyCharClass->addSingleton(ch: 0x002e); // '.' |
3068 | yyCharClass->addSingleton(ch: 0x003a); // ':' |
3069 | yyCharClass->addSingleton(ch: 0x005f); // '_' |
3070 | yyCharClass->addSingleton(ch: 0xb7); |
3071 | yyCharClass->addRange(from: 0x0030, to: 0x0039); // [0-9] |
3072 | yyCharClass->addRange(from: 0x0041, to: 0x005a); // [A-Z] |
3073 | yyCharClass->addRange(from: 0x0061, to: 0x007a); // [a-z] |
3074 | yyCharClass->addRange(from: 0xc0, to: 0xd6); |
3075 | yyCharClass->addRange(from: 0xd8, to: 0xf6); |
3076 | yyCharClass->addRange(from: 0xf8, to: 0x2ff); |
3077 | yyCharClass->addRange(from: 0x370, to: 0x37d); |
3078 | yyCharClass->addRange(from: 0x37f, to: 0x1fff); |
3079 | yyCharClass->addRange(from: 0x200c, to: 0x200d); |
3080 | yyCharClass->addRange(from: 0x2070, to: 0x218f); |
3081 | yyCharClass->addRange(from: 0x2c00, to: 0x2fef); |
3082 | yyCharClass->addRange(from: 0x3001, to: 0xd7ff); |
3083 | yyCharClass->addRange(from: 0xf900, to: 0xfdcf); |
3084 | yyCharClass->addRange(from: 0xfdf0, to: 0xfffd); |
3085 | yyCharClass->addRange(from: (ushort)0x10000, to: (ushort)0xeffff); |
3086 | yyCharClass->addRange(from: 0x0300, to: 0x036f); |
3087 | yyCharClass->addRange(from: 0x203f, to: 0x2040); |
3088 | return Tok_CharClass; |
3089 | } else { |
3090 | break; |
3091 | } |
3092 | case 'P': |
3093 | if (!xmlSchemaExtensions) |
3094 | break; |
3095 | yyCharClass->setNegative(!yyCharClass->negative()); |
3096 | Q_FALLTHROUGH(); |
3097 | case 'p': |
3098 | if (xmlSchemaExtensions) { |
3099 | if (yyCh != '{') { |
3100 | error(RXERR_CHARCLASS); |
3101 | return Tok_CharClass; |
3102 | } |
3103 | |
3104 | QByteArray category; |
3105 | yyCh = getChar(); |
3106 | while (yyCh != '}') { |
3107 | if (yyCh == EOS) { |
3108 | error(RXERR_END); |
3109 | return Tok_CharClass; |
3110 | } |
3111 | category.append(c: yyCh); |
3112 | yyCh = getChar(); |
3113 | } |
3114 | yyCh = getChar(); // skip closing '}' |
3115 | |
3116 | int catlen = category.length(); |
3117 | if (catlen == 1 || catlen == 2) { |
3118 | switch (category.at(i: 0)) { |
3119 | case 'M': |
3120 | if (catlen == 1) { |
3121 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | |
3122 | FLAG(QChar::Mark_SpacingCombining) | |
3123 | FLAG(QChar::Mark_Enclosing)); |
3124 | } else { |
3125 | switch (category.at(i: 1)) { |
3126 | case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn |
3127 | case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc |
3128 | case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me |
3129 | default: error(RXERR_CATEGORY); break; |
3130 | } |
3131 | } |
3132 | break; |
3133 | case 'N': |
3134 | if (catlen == 1) { |
3135 | yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) | |
3136 | FLAG(QChar::Number_Letter) | |
3137 | FLAG(QChar::Number_Other)); |
3138 | } else { |
3139 | switch (category.at(i: 1)) { |
3140 | case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd |
3141 | case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl |
3142 | case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No |
3143 | default: error(RXERR_CATEGORY); break; |
3144 | } |
3145 | } |
3146 | break; |
3147 | case 'Z': |
3148 | if (catlen == 1) { |
3149 | yyCharClass->addCategories(FLAG(QChar::Separator_Space) | |
3150 | FLAG(QChar::Separator_Line) | |
3151 | FLAG(QChar::Separator_Paragraph)); |
3152 | } else { |
3153 | switch (category.at(i: 1)) { |
3154 | case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs |
3155 | case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl |
3156 | case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp |
3157 | default: error(RXERR_CATEGORY); break; |
3158 | } |
3159 | } |
3160 | break; |
3161 | case 'C': |
3162 | if (catlen == 1) { |
3163 | yyCharClass->addCategories(FLAG(QChar::Other_Control) | |
3164 | FLAG(QChar::Other_Format) | |
3165 | FLAG(QChar::Other_Surrogate) | |
3166 | FLAG(QChar::Other_PrivateUse) | |
3167 | FLAG(QChar::Other_NotAssigned)); |
3168 | } else { |
3169 | switch (category.at(i: 1)) { |
3170 | case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc |
3171 | case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf |
3172 | case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs |
3173 | case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co |
3174 | case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn |
3175 | default: error(RXERR_CATEGORY); break; |
3176 | } |
3177 | } |
3178 | break; |
3179 | case 'L': |
3180 | if (catlen == 1) { |
3181 | yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) | |
3182 | FLAG(QChar::Letter_Lowercase) | |
3183 | FLAG(QChar::Letter_Titlecase) | |
3184 | FLAG(QChar::Letter_Modifier) | |
3185 | FLAG(QChar::Letter_Other)); |
3186 | } else { |
3187 | switch (category.at(i: 1)) { |
3188 | case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu |
3189 | case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll |
3190 | case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt |
3191 | case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm |
3192 | case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo |
3193 | default: error(RXERR_CATEGORY); break; |
3194 | } |
3195 | } |
3196 | break; |
3197 | case 'P': |
3198 | if (catlen == 1) { |
3199 | yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) | |
3200 | FLAG(QChar::Punctuation_Dash) | |
3201 | FLAG(QChar::Punctuation_Open) | |
3202 | FLAG(QChar::Punctuation_Close) | |
3203 | FLAG(QChar::Punctuation_InitialQuote) | |
3204 | FLAG(QChar::Punctuation_FinalQuote) | |
3205 | FLAG(QChar::Punctuation_Other)); |
3206 | } else { |
3207 | switch (category.at(i: 1)) { |
3208 | case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc |
3209 | case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd |
3210 | case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps |
3211 | case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe |
3212 | case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi |
3213 | case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf |
3214 | case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po |
3215 | default: error(RXERR_CATEGORY); break; |
3216 | } |
3217 | } |
3218 | break; |
3219 | case 'S': |
3220 | if (catlen == 1) { |
3221 | yyCharClass->addCategories(FLAG(QChar::Symbol_Math) | |
3222 | FLAG(QChar::Symbol_Currency) | |
3223 | FLAG(QChar::Symbol_Modifier) | |
3224 | FLAG(QChar::Symbol_Other)); |
3225 | } else { |
3226 | switch (category.at(i: 1)) { |
3227 | case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm |
3228 | case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc |
3229 | case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk |
3230 | case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So |
3231 | default: error(RXERR_CATEGORY); break; |
3232 | } |
3233 | } |
3234 | break; |
3235 | default: |
3236 | error(RXERR_CATEGORY); |
3237 | break; |
3238 | } |
3239 | } else if (catlen > 2 && category.at(i: 0) == 'I' && category.at(i: 1) == 's') { |
3240 | static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]); |
3241 | const char * const categoryFamily = category.constData() + 2; |
3242 | const CategoriesRangeMapEntry *r = std::lower_bound(first: categoriesRangeMap, last: categoriesRangeMap + N, val: categoryFamily); |
3243 | if (r != categoriesRangeMap + N && qstrcmp(str1: r->name, str2: categoryFamily) == 0) |
3244 | yyCharClass->addRange(from: r->first, to: r->second); |
3245 | else |
3246 | error(RXERR_CATEGORY); |
3247 | } else { |
3248 | error(RXERR_CATEGORY); |
3249 | } |
3250 | return Tok_CharClass; |
3251 | } else { |
3252 | break; |
3253 | } |
3254 | #endif |
3255 | #ifndef QT_NO_REGEXP_ESCAPE |
3256 | case 'x': |
3257 | val = 0; |
3258 | for (i = 0; i < 4; i++) { |
3259 | low = QChar(yyCh).toLower().unicode(); |
3260 | if (low >= '0' && low <= '9') |
3261 | val = (val << 4) | (low - '0'); |
3262 | else if (low >= 'a' && low <= 'f') |
3263 | val = (val << 4) | (low - 'a' + 10); |
3264 | else |
3265 | break; |
3266 | yyCh = getChar(); |
3267 | } |
3268 | return Tok_Char | val; |
3269 | #endif |
3270 | default: |
3271 | break; |
3272 | } |
3273 | if (prevCh >= '1' && prevCh <= '9') { |
3274 | #ifndef QT_NO_REGEXP_BACKREF |
3275 | val = prevCh - '0'; |
3276 | while (yyCh >= '0' && yyCh <= '9') { |
3277 | val = (val * 10) + (yyCh - '0'); |
3278 | yyCh = getChar(); |
3279 | } |
3280 | return Tok_BackRef | val; |
3281 | #else |
3282 | error(RXERR_DISABLED); |
3283 | #endif |
3284 | } |
3285 | return Tok_Char | prevCh; |
3286 | } |
3287 | |
3288 | #ifndef QT_NO_REGEXP_INTERVAL |
3289 | int QRegExpEngine::getRep(int def) |
3290 | { |
3291 | if (yyCh >= '0' && yyCh <= '9') { |
3292 | int rep = 0; |
3293 | do { |
3294 | rep = 10 * rep + yyCh - '0'; |
3295 | if (rep >= InftyRep) { |
3296 | error(RXERR_REPETITION); |
3297 | rep = def; |
3298 | } |
3299 | yyCh = getChar(); |
3300 | } while (yyCh >= '0' && yyCh <= '9'); |
3301 | return rep; |
3302 | } else { |
3303 | return def; |
3304 | } |
3305 | } |
3306 | #endif |
3307 | |
3308 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
3309 | void QRegExpEngine::skipChars(int n) |
3310 | { |
3311 | if (n > 0) { |
3312 | yyPos += n - 1; |
3313 | yyCh = getChar(); |
3314 | } |
3315 | } |
3316 | #endif |
3317 | |
3318 | void QRegExpEngine::error(const char *msg) |
3319 | { |
3320 | if (yyError.isEmpty()) |
3321 | yyError = QLatin1String(msg); |
3322 | } |
3323 | |
3324 | void QRegExpEngine::startTokenizer(const QChar *rx, int len) |
3325 | { |
3326 | yyIn = rx; |
3327 | yyPos0 = 0; |
3328 | yyPos = 0; |
3329 | yyLen = len; |
3330 | yyCh = getChar(); |
3331 | yyCharClass.reset(other: new QRegExpCharClass); |
3332 | yyMinRep = 0; |
3333 | yyMaxRep = 0; |
3334 | yyError = QString(); |
3335 | } |
3336 | |
3337 | int QRegExpEngine::getToken() |
3338 | { |
3339 | #ifndef QT_NO_REGEXP_CCLASS |
3340 | ushort pendingCh = 0; |
3341 | bool charPending; |
3342 | bool rangePending; |
3343 | int tok; |
3344 | #endif |
3345 | int prevCh = yyCh; |
3346 | |
3347 | yyPos0 = yyPos - 1; |
3348 | #ifndef QT_NO_REGEXP_CCLASS |
3349 | yyCharClass->clear(); |
3350 | #endif |
3351 | yyMinRep = 0; |
3352 | yyMaxRep = 0; |
3353 | yyCh = getChar(); |
3354 | |
3355 | switch (prevCh) { |
3356 | case EOS: |
3357 | yyPos0 = yyPos; |
3358 | return Tok_Eos; |
3359 | case '$': |
3360 | return Tok_Dollar; |
3361 | case '(': |
3362 | if (yyCh == '?') { |
3363 | prevCh = getChar(); |
3364 | yyCh = getChar(); |
3365 | switch (prevCh) { |
3366 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
3367 | case '!': |
3368 | return Tok_NegLookahead; |
3369 | case '=': |
3370 | return Tok_PosLookahead; |
3371 | #endif |
3372 | case ':': |
3373 | return Tok_MagicLeftParen; |
3374 | case '<': |
3375 | error(RXERR_LOOKBEHIND); |
3376 | return Tok_MagicLeftParen; |
3377 | default: |
3378 | error(RXERR_LOOKAHEAD); |
3379 | return Tok_MagicLeftParen; |
3380 | } |
3381 | } else { |
3382 | return Tok_LeftParen; |
3383 | } |
3384 | case ')': |
3385 | return Tok_RightParen; |
3386 | case '*': |
3387 | yyMinRep = 0; |
3388 | yyMaxRep = InftyRep; |
3389 | return Tok_Quantifier; |
3390 | case '+': |
3391 | yyMinRep = 1; |
3392 | yyMaxRep = InftyRep; |
3393 | return Tok_Quantifier; |
3394 | case '.': |
3395 | #ifndef QT_NO_REGEXP_CCLASS |
3396 | yyCharClass->setNegative(true); |
3397 | #endif |
3398 | return Tok_CharClass; |
3399 | case '?': |
3400 | yyMinRep = 0; |
3401 | yyMaxRep = 1; |
3402 | return Tok_Quantifier; |
3403 | case '[': |
3404 | #ifndef QT_NO_REGEXP_CCLASS |
3405 | if (yyCh == '^') { |
3406 | yyCharClass->setNegative(true); |
3407 | yyCh = getChar(); |
3408 | } |
3409 | charPending = false; |
3410 | rangePending = false; |
3411 | do { |
3412 | if (yyCh == '-' && charPending && !rangePending) { |
3413 | rangePending = true; |
3414 | yyCh = getChar(); |
3415 | } else { |
3416 | if (charPending && !rangePending) { |
3417 | yyCharClass->addSingleton(ch: pendingCh); |
3418 | charPending = false; |
3419 | } |
3420 | if (yyCh == '\\') { |
3421 | yyCh = getChar(); |
3422 | tok = getEscape(); |
3423 | if (tok == Tok_Word) |
3424 | tok = '\b'; |
3425 | } else { |
3426 | tok = Tok_Char | yyCh; |
3427 | yyCh = getChar(); |
3428 | } |
3429 | if (tok == Tok_CharClass) { |
3430 | if (rangePending) { |
3431 | yyCharClass->addSingleton(ch: '-'); |
3432 | yyCharClass->addSingleton(ch: pendingCh); |
3433 | charPending = false; |
3434 | rangePending = false; |
3435 | } |
3436 | } else if ((tok & Tok_Char) != 0) { |
3437 | if (rangePending) { |
3438 | yyCharClass->addRange(from: pendingCh, to: tok ^ Tok_Char); |
3439 | charPending = false; |
3440 | rangePending = false; |
3441 | } else { |
3442 | pendingCh = tok ^ Tok_Char; |
3443 | charPending = true; |
3444 | } |
3445 | } else { |
3446 | error(RXERR_CHARCLASS); |
3447 | } |
3448 | } |
3449 | } while (yyCh != ']' && yyCh != EOS); |
3450 | if (rangePending) |
3451 | yyCharClass->addSingleton(ch: '-'); |
3452 | if (charPending) |
3453 | yyCharClass->addSingleton(ch: pendingCh); |
3454 | if (yyCh == EOS) |
3455 | error(RXERR_END); |
3456 | else |
3457 | yyCh = getChar(); |
3458 | return Tok_CharClass; |
3459 | #else |
3460 | error(RXERR_END); |
3461 | return Tok_Char | '['; |
3462 | #endif |
3463 | case '\\': |
3464 | return getEscape(); |
3465 | case ']': |
3466 | error(RXERR_LEFTDELIM); |
3467 | return Tok_Char | ']'; |
3468 | case '^': |
3469 | return Tok_Caret; |
3470 | case '{': |
3471 | #ifndef QT_NO_REGEXP_INTERVAL |
3472 | yyMinRep = getRep(def: 0); |
3473 | yyMaxRep = yyMinRep; |
3474 | if (yyCh == ',') { |
3475 | yyCh = getChar(); |
3476 | yyMaxRep = getRep(def: InftyRep); |
3477 | } |
3478 | if (yyMaxRep < yyMinRep) |
3479 | error(RXERR_INTERVAL); |
3480 | if (yyCh != '}') |
3481 | error(RXERR_REPETITION); |
3482 | yyCh = getChar(); |
3483 | return Tok_Quantifier; |
3484 | #else |
3485 | error(RXERR_DISABLED); |
3486 | return Tok_Char | '{'; |
3487 | #endif |
3488 | case '|': |
3489 | return Tok_Bar; |
3490 | case '}': |
3491 | error(RXERR_LEFTDELIM); |
3492 | return Tok_Char | '}'; |
3493 | default: |
3494 | return Tok_Char | prevCh; |
3495 | } |
3496 | } |
3497 | |
3498 | int QRegExpEngine::parse(const QChar *pattern, int len) |
3499 | { |
3500 | valid = true; |
3501 | startTokenizer(rx: pattern, len); |
3502 | yyTok = getToken(); |
3503 | #ifndef QT_NO_REGEXP_CAPTURE |
3504 | yyMayCapture = true; |
3505 | #else |
3506 | yyMayCapture = false; |
3507 | #endif |
3508 | |
3509 | #ifndef QT_NO_REGEXP_CAPTURE |
3510 | int atom = startAtom(officialCapture: false); |
3511 | #endif |
3512 | QRegExpCharClass anything; |
3513 | Box box(this); // create InitialState |
3514 | box.set(anything); |
3515 | Box rightBox(this); // create FinalState |
3516 | rightBox.set(anything); |
3517 | |
3518 | Box middleBox(this); |
3519 | parseExpression(box: &middleBox); |
3520 | #ifndef QT_NO_REGEXP_CAPTURE |
3521 | finishAtom(atom, needCapture: false); |
3522 | #endif |
3523 | #ifndef QT_NO_REGEXP_OPTIM |
3524 | middleBox.setupHeuristics(); |
3525 | #endif |
3526 | box.cat(b: middleBox); |
3527 | box.cat(b: rightBox); |
3528 | yyCharClass.reset(); |
3529 | |
3530 | #ifndef QT_NO_REGEXP_CAPTURE |
3531 | for (int i = 0; i < nf; ++i) { |
3532 | switch (f[i].capture) { |
3533 | case QRegExpAtom::NoCapture: |
3534 | break; |
3535 | case QRegExpAtom::OfficialCapture: |
3536 | f[i].capture = ncap; |
3537 | captureForOfficialCapture.append(t: ncap); |
3538 | ++ncap; |
3539 | ++officialncap; |
3540 | break; |
3541 | case QRegExpAtom::UnofficialCapture: |
3542 | f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture; |
3543 | } |
3544 | } |
3545 | |
3546 | #ifndef QT_NO_REGEXP_BACKREF |
3547 | #ifndef QT_NO_REGEXP_OPTIM |
3548 | if (officialncap == 0 && nbrefs == 0) { |
3549 | ncap = nf = 0; |
3550 | f.clear(); |
3551 | } |
3552 | #endif |
3553 | // handle the case where there's a \5 with no corresponding capture |
3554 | // (captureForOfficialCapture.size() != officialncap) |
3555 | for (int i = 0; i < nbrefs - officialncap; ++i) { |
3556 | captureForOfficialCapture.append(t: ncap); |
3557 | ++ncap; |
3558 | } |
3559 | #endif |
3560 | #endif |
3561 | |
3562 | if (!yyError.isEmpty()) |
3563 | return -1; |
3564 | |
3565 | #ifndef QT_NO_REGEXP_OPTIM |
3566 | const QRegExpAutomatonState &sinit = s.at(i: InitialState); |
3567 | caretAnchored = !sinit.anchors.isEmpty(); |
3568 | if (caretAnchored) { |
3569 | const QMap<int, int> &anchors = sinit.anchors; |
3570 | QMap<int, int>::const_iterator a; |
3571 | for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) { |
3572 | if ( |
3573 | #ifndef QT_NO_REGEXP_ANCHOR_ALT |
3574 | (*a & Anchor_Alternation) != 0 || |
3575 | #endif |
3576 | (*a & Anchor_Caret) == 0) |
3577 | { |
3578 | caretAnchored = false; |
3579 | break; |
3580 | } |
3581 | } |
3582 | } |
3583 | #endif |
3584 | |
3585 | // cleanup anchors |
3586 | int numStates = s.count(); |
3587 | for (int i = 0; i < numStates; ++i) { |
3588 | QRegExpAutomatonState &state = s[i]; |
3589 | if (!state.anchors.isEmpty()) { |
3590 | QMap<int, int>::iterator a = state.anchors.begin(); |
3591 | while (a != state.anchors.end()) { |
3592 | if (a.value() == 0) |
3593 | a = state.anchors.erase(it: a); |
3594 | else |
3595 | ++a; |
3596 | } |
3597 | } |
3598 | } |
3599 | |
3600 | return yyPos0; |
3601 | } |
3602 | |
3603 | void QRegExpEngine::parseAtom(Box *box) |
3604 | { |
3605 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
3606 | QRegExpEngine *eng = nullptr; |
3607 | bool neg; |
3608 | int len; |
3609 | #endif |
3610 | |
3611 | if ((yyTok & Tok_Char) != 0) { |
3612 | box->set(QChar(yyTok ^ Tok_Char)); |
3613 | } else { |
3614 | #ifndef QT_NO_REGEXP_OPTIM |
3615 | trivial = false; |
3616 | #endif |
3617 | switch (yyTok) { |
3618 | case Tok_Dollar: |
3619 | box->catAnchor(a: Anchor_Dollar); |
3620 | break; |
3621 | case Tok_Caret: |
3622 | box->catAnchor(a: Anchor_Caret); |
3623 | break; |
3624 | #ifndef QT_NO_REGEXP_LOOKAHEAD |
3625 | case Tok_PosLookahead: |
3626 | case Tok_NegLookahead: |
3627 | neg = (yyTok == Tok_NegLookahead); |
3628 | eng = new QRegExpEngine(cs, greedyQuantifiers); |
3629 | len = eng->parse(pattern: yyIn + yyPos - 1, len: yyLen - yyPos + 1); |
3630 | if (len >= 0) |
3631 | skipChars(n: len); |
3632 | else |
3633 | error(RXERR_LOOKAHEAD); |
3634 | box->catAnchor(a: addLookahead(eng, negative: neg)); |
3635 | yyTok = getToken(); |
3636 | if (yyTok != Tok_RightParen) |
3637 | error(RXERR_LOOKAHEAD); |
3638 | break; |
3639 | #endif |
3640 | #ifndef QT_NO_REGEXP_ESCAPE |
3641 | case Tok_Word: |
3642 | box->catAnchor(a: Anchor_Word); |
3643 | break; |
3644 | case Tok_NonWord: |
3645 | box->catAnchor(a: Anchor_NonWord); |
3646 | break; |
3647 | #endif |
3648 | case Tok_LeftParen: |
3649 | case Tok_MagicLeftParen: |
3650 | yyTok = getToken(); |
3651 | parseExpression(box); |
3652 | if (yyTok != Tok_RightParen) |
3653 | error(RXERR_END); |
3654 | break; |
3655 | case Tok_CharClass: |
3656 | box->set(*yyCharClass); |
3657 | break; |
3658 | case Tok_Quantifier: |
3659 | error(RXERR_REPETITION); |
3660 | break; |
3661 | default: |
3662 | #ifndef QT_NO_REGEXP_BACKREF |
3663 | if ((yyTok & Tok_BackRef) != 0) |
3664 | box->set(yyTok ^ Tok_BackRef); |
3665 | else |
3666 | #endif |
3667 | error(RXERR_DISABLED); |
3668 | } |
3669 | } |
3670 | yyTok = getToken(); |
3671 | } |
3672 | |
3673 | void QRegExpEngine::parseFactor(Box *box) |
3674 | { |
3675 | #ifndef QT_NO_REGEXP_CAPTURE |
3676 | int outerAtom = greedyQuantifiers ? startAtom(officialCapture: false) : -1; |
3677 | int innerAtom = startAtom(officialCapture: yyMayCapture && yyTok == Tok_LeftParen); |
3678 | bool magicLeftParen = (yyTok == Tok_MagicLeftParen); |
3679 | #else |
3680 | const int innerAtom = -1; |
3681 | #endif |
3682 | |
3683 | #ifndef QT_NO_REGEXP_INTERVAL |
3684 | #define YYREDO() \ |
3685 | yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \ |
3686 | *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok |
3687 | |
3688 | const QChar *in = yyIn; |
3689 | int pos0 = yyPos0; |
3690 | int pos = yyPos; |
3691 | int len = yyLen; |
3692 | int ch = yyCh; |
3693 | QRegExpCharClass charClass; |
3694 | if (yyTok == Tok_CharClass) |
3695 | charClass = *yyCharClass; |
3696 | int tok = yyTok; |
3697 | bool mayCapture = yyMayCapture; |
3698 | #endif |
3699 | |
3700 | parseAtom(box); |
3701 | #ifndef QT_NO_REGEXP_CAPTURE |
3702 | finishAtom(atom: innerAtom, needCapture: magicLeftParen); |
3703 | #endif |
3704 | |
3705 | bool hasQuantifier = (yyTok == Tok_Quantifier); |
3706 | if (hasQuantifier) { |
3707 | #ifndef QT_NO_REGEXP_OPTIM |
3708 | trivial = false; |
3709 | #endif |
3710 | if (yyMaxRep == InftyRep) { |
3711 | box->plus(atom: innerAtom); |
3712 | #ifndef QT_NO_REGEXP_INTERVAL |
3713 | } else if (yyMaxRep == 0) { |
3714 | box->clear(); |
3715 | #endif |
3716 | } |
3717 | if (yyMinRep == 0) |
3718 | box->opt(); |
3719 | |
3720 | #ifndef QT_NO_REGEXP_INTERVAL |
3721 | yyMayCapture = false; |
3722 | int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1; |
3723 | int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1); |
3724 | |
3725 | Box rightBox(this); |
3726 | int i; |
3727 | |
3728 | for (i = 0; i < beta; i++) { |
3729 | YYREDO(); |
3730 | Box leftBox(this); |
3731 | parseAtom(box: &leftBox); |
3732 | leftBox.cat(b: rightBox); |
3733 | leftBox.opt(); |
3734 | rightBox = leftBox; |
3735 | } |
3736 | for (i = 0; i < alpha; i++) { |
3737 | YYREDO(); |
3738 | Box leftBox(this); |
3739 | parseAtom(box: &leftBox); |
3740 | leftBox.cat(b: rightBox); |
3741 | rightBox = leftBox; |
3742 | } |
3743 | rightBox.cat(b: *box); |
3744 | *box = rightBox; |
3745 | #endif |
3746 | yyTok = getToken(); |
3747 | #ifndef QT_NO_REGEXP_INTERVAL |
3748 | yyMayCapture = mayCapture; |
3749 | #endif |
3750 | } |
3751 | #undef YYREDO |
3752 | #ifndef QT_NO_REGEXP_CAPTURE |
3753 | if (greedyQuantifiers) |
3754 | finishAtom(atom: outerAtom, needCapture: hasQuantifier); |
3755 | #endif |
3756 | } |
3757 | |
3758 | void QRegExpEngine::parseTerm(Box *box) |
3759 | { |
3760 | #ifndef QT_NO_REGEXP_OPTIM |
3761 | if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) |
3762 | parseFactor(box); |
3763 | #endif |
3764 | while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) { |
3765 | Box rightBox(this); |
3766 | parseFactor(box: &rightBox); |
3767 | box->cat(b: rightBox); |
3768 | } |
3769 | } |
3770 | |
3771 | void QRegExpEngine::parseExpression(Box *box) |
3772 | { |
3773 | parseTerm(box); |
3774 | while (yyTok == Tok_Bar) { |
3775 | #ifndef QT_NO_REGEXP_OPTIM |
3776 | trivial = false; |
3777 | #endif |
3778 | Box rightBox(this); |
3779 | yyTok = getToken(); |
3780 | parseTerm(box: &rightBox); |
3781 | box->orx(b: rightBox); |
3782 | } |
3783 | } |
3784 | |
3785 | /* |
3786 | The struct QRegExpPrivate contains the private data of a regular |
3787 | expression other than the automaton. It makes it possible for many |
3788 | QRegExp objects to use the same QRegExpEngine object with different |
3789 | QRegExpPrivate objects. |
3790 | */ |
3791 | struct QRegExpPrivate |
3792 | { |
3793 | QRegExpEngine *eng; |
3794 | QRegExpEngineKey engineKey; |
3795 | bool minimal; |
3796 | #ifndef QT_NO_REGEXP_CAPTURE |
3797 | QString t; // last string passed to QRegExp::indexIn() or lastIndexIn() |
3798 | QStringList capturedCache; // what QRegExp::capturedTexts() returned last |
3799 | #endif |
3800 | QRegExpMatchState matchState; |
3801 | |
3802 | inline QRegExpPrivate() |
3803 | : eng(nullptr), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { } |
3804 | inline QRegExpPrivate(const QRegExpEngineKey &key) |
3805 | : eng(nullptr), engineKey(key), minimal(false) {} |
3806 | }; |
3807 | |
3808 | #if !defined(QT_NO_REGEXP_OPTIM) |
3809 | struct QRECache |
3810 | { |
3811 | typedef QHash<QRegExpEngineKey, QRegExpEngine *> EngineCache; |
3812 | typedef QCache<QRegExpEngineKey, QRegExpEngine> UnusedEngineCache; |
3813 | EngineCache usedEngines; |
3814 | UnusedEngineCache unusedEngines; |
3815 | }; |
3816 | Q_GLOBAL_STATIC(QRECache, engineCache) |
3817 | static QBasicMutex engineCacheMutex; |
3818 | #endif // QT_NO_REGEXP_OPTIM |
3819 | |
3820 | static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key) |
3821 | { |
3822 | #if !defined(QT_NO_REGEXP_OPTIM) |
3823 | const auto locker = qt_scoped_lock(mutex&: engineCacheMutex); |
3824 | if (!eng->ref.deref()) { |
3825 | if (QRECache *c = engineCache()) { |
3826 | c->unusedEngines.insert(akey: key, aobject: eng, acost: 4 + key.pattern.length() / 4); |
3827 | c->usedEngines.remove(akey: key); |
3828 | } else { |
3829 | delete eng; |
3830 | } |
3831 | } |
3832 | #else |
3833 | Q_UNUSED(key); |
3834 | if (!eng->ref.deref()) |
3835 | delete eng; |
3836 | #endif |
3837 | } |
3838 | |
3839 | static void prepareEngine_helper(QRegExpPrivate *priv) |
3840 | { |
3841 | Q_ASSERT(!priv->eng); |
3842 | |
3843 | #if !defined(QT_NO_REGEXP_OPTIM) |
3844 | const auto locker = qt_scoped_lock(mutex&: engineCacheMutex); |
3845 | if (QRECache *c = engineCache()) { |
3846 | priv->eng = c->unusedEngines.take(key: priv->engineKey); |
3847 | if (!priv->eng) |
3848 | priv->eng = c->usedEngines.value(akey: priv->engineKey); |
3849 | if (!priv->eng) |
3850 | priv->eng = new QRegExpEngine(priv->engineKey); |
3851 | else |
3852 | priv->eng->ref.ref(); |
3853 | |
3854 | c->usedEngines.insert(akey: priv->engineKey, avalue: priv->eng); |
3855 | return; |
3856 | } |
3857 | #endif // QT_NO_REGEXP_OPTIM |
3858 | |
3859 | priv->eng = new QRegExpEngine(priv->engineKey); |
3860 | } |
3861 | |
3862 | inline static void prepareEngine(QRegExpPrivate *priv) |
3863 | { |
3864 | if (priv->eng) |
3865 | return; |
3866 | prepareEngine_helper(priv); |
3867 | priv->matchState.prepareForMatch(eng: priv->eng); |
3868 | } |
3869 | |
3870 | static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str) |
3871 | { |
3872 | prepareEngine(priv); |
3873 | priv->matchState.prepareForMatch(eng: priv->eng); |
3874 | #ifndef QT_NO_REGEXP_CAPTURE |
3875 | priv->t = str; |
3876 | priv->capturedCache.clear(); |
3877 | #else |
3878 | Q_UNUSED(str); |
3879 | #endif |
3880 | } |
3881 | |
3882 | static void invalidateEngine(QRegExpPrivate *priv) |
3883 | { |
3884 | if (priv->eng) { |
3885 | derefEngine(eng: priv->eng, key: priv->engineKey); |
3886 | priv->eng = nullptr; |
3887 | priv->matchState.drain(); |
3888 | } |
3889 | } |
3890 | |
3891 | /*! |
3892 | \enum QRegExp::CaretMode |
3893 | |
3894 | The CaretMode enum defines the different meanings of the caret |
3895 | (\b{^}) in a regular expression. The possible values are: |
3896 | |
3897 | \value CaretAtZero |
3898 | The caret corresponds to index 0 in the searched string. |
3899 | |
3900 | \value CaretAtOffset |
3901 | The caret corresponds to the start offset of the search. |
3902 | |
3903 | \value CaretWontMatch |
3904 | The caret never matches. |
3905 | */ |
3906 | |
3907 | /*! |
3908 | \enum QRegExp::PatternSyntax |
3909 | |
3910 | The syntax used to interpret the meaning of the pattern. |
3911 | |
3912 | \value RegExp A rich Perl-like pattern matching syntax. This is |
3913 | the default. |
3914 | |
3915 | \value RegExp2 Like RegExp, but with \l{greedy quantifiers}. |
3916 | (Introduced in Qt 4.2.) |
3917 | |
3918 | \value Wildcard This provides a simple pattern matching syntax |
3919 | similar to that used by shells (command interpreters) for "file |
3920 | globbing". See \l{QRegExp wildcard matching}. |
3921 | |
3922 | \value WildcardUnix This is similar to Wildcard but with the |
3923 | behavior of a Unix shell. The wildcard characters can be escaped |
3924 | with the character "\\". |
3925 | |
3926 | \value FixedString The pattern is a fixed string. This is |
3927 | equivalent to using the RegExp pattern on a string in |
3928 | which all metacharacters are escaped using escape(). |
3929 | |
3930 | \value W3CXmlSchema11 The pattern is a regular expression as |
3931 | defined by the W3C XML Schema 1.1 specification. |
3932 | |
3933 | \sa setPatternSyntax() |
3934 | */ |
3935 | |
3936 | /*! |
3937 | Constructs an empty regexp. |
3938 | |
3939 | \sa isValid(), errorString() |
3940 | */ |
3941 | QRegExp::QRegExp() |
3942 | { |
3943 | priv = new QRegExpPrivate; |
3944 | prepareEngine(priv); |
3945 | } |
3946 | |
3947 | /*! |
3948 | Constructs a regular expression object for the given \a pattern |
3949 | string. The pattern must be given using wildcard notation if \a |
3950 | syntax is \l Wildcard; the default is \l RegExp. The pattern is |
3951 | case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is |
3952 | greedy (maximal), but can be changed by calling |
3953 | setMinimal(). |
3954 | |
3955 | \sa setPattern(), setCaseSensitivity(), setPatternSyntax() |
3956 | */ |
3957 | QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax) |
3958 | { |
3959 | priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs)); |
3960 | prepareEngine(priv); |
3961 | } |
3962 | |
3963 | /*! |
3964 | Constructs a regular expression as a copy of \a rx. |
3965 | |
3966 | \sa operator=() |
3967 | */ |
3968 | QRegExp::QRegExp(const QRegExp &rx) |
3969 | { |
3970 | priv = new QRegExpPrivate; |
3971 | operator=(rx); |
3972 | } |
3973 | |
3974 | /*! |
3975 | Destroys the regular expression and cleans up its internal data. |
3976 | */ |
QRegExp::~QRegExp()
{
    // Release the engine reference first; invalidateEngine() reads
    // priv->eng and priv->engineKey, so priv must still be alive.
    invalidateEngine(priv);
    delete priv;
}
3982 | |
3983 | /*! |
3984 | Copies the regular expression \a rx and returns a reference to the |
3985 | copy. The case sensitivity, wildcard, and minimal matching options |
3986 | are also copied. |
3987 | */ |
3988 | QRegExp &QRegExp::operator=(const QRegExp &rx) |
3989 | { |
3990 | prepareEngine(priv: rx.priv); // to allow sharing |
3991 | QRegExpEngine *otherEng = rx.priv->eng; |
3992 | if (otherEng) |
3993 | otherEng->ref.ref(); |
3994 | invalidateEngine(priv); |
3995 | priv->eng = otherEng; |
3996 | priv->engineKey = rx.priv->engineKey; |
3997 | priv->minimal = rx.priv->minimal; |
3998 | #ifndef QT_NO_REGEXP_CAPTURE |
3999 | priv->t = rx.priv->t; |
4000 | priv->capturedCache = rx.priv->capturedCache; |
4001 | #endif |
4002 | if (priv->eng) |
4003 | priv->matchState.prepareForMatch(eng: priv->eng); |
4004 | priv->matchState.captured = rx.priv->matchState.captured; |
4005 | return *this; |
4006 | } |
4007 | |
4008 | /*! |
4009 | \fn QRegExp &QRegExp::operator=(QRegExp &&other) |
4010 | |
4011 | Move-assigns \a other to this QRegExp instance. |
4012 | |
4013 | \since 5.2 |
4014 | */ |
4015 | |
4016 | /*! |
4017 | \fn void QRegExp::swap(QRegExp &other) |
4018 | \since 4.8 |
4019 | |
4020 | Swaps regular expression \a other with this regular |
4021 | expression. This operation is very fast and never fails. |
4022 | */ |
4023 | |
4024 | /*! |
4025 | Returns \c true if this regular expression is equal to \a rx; |
4026 | otherwise returns \c false. |
4027 | |
4028 | Two QRegExp objects are equal if they have the same pattern |
4029 | strings and the same settings for case sensitivity, wildcard and |
4030 | minimal matching. |
4031 | */ |
4032 | bool QRegExp::operator==(const QRegExp &rx) const |
4033 | { |
4034 | return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal; |
4035 | } |
4036 | |
4037 | /*! |
4038 | \since 5.6 |
4039 | \relates QRegExp |
4040 | |
4041 | Returns the hash value for \a key, using |
4042 | \a seed to seed the calculation. |
4043 | */ |
4044 | uint qHash(const QRegExp &key, uint seed) noexcept |
4045 | { |
4046 | QtPrivate::QHashCombine hash; |
4047 | seed = hash(seed, key.priv->engineKey); |
4048 | seed = hash(seed, key.priv->minimal); |
4049 | return seed; |
4050 | } |
4051 | |
4052 | /*! |
4053 | \fn bool QRegExp::operator!=(const QRegExp &rx) const |
4054 | |
4055 | Returns \c true if this regular expression is not equal to \a rx; |
4056 | otherwise returns \c false. |
4057 | |
4058 | \sa operator==() |
4059 | */ |
4060 | |
4061 | /*! |
4062 | Returns \c true if the pattern string is empty; otherwise returns |
4063 | false. |
4064 | |
4065 | If you call exactMatch() with an empty pattern on an empty string |
4066 | it will return true; otherwise it returns \c false since it operates |
4067 | over the whole string. If you call indexIn() with an empty pattern |
4068 | on \e any string it will return the start offset (0 by default) |
4069 | because the empty pattern matches the 'emptiness' at the start of |
4070 | the string. In this case the length of the match returned by |
4071 | matchedLength() will be 0. |
4072 | |
4073 | See QString::isEmpty(). |
4074 | */ |
4075 | |
4076 | bool QRegExp::isEmpty() const |
4077 | { |
4078 | return priv->engineKey.pattern.isEmpty(); |
4079 | } |
4080 | |
4081 | /*! |
4082 | Returns \c true if the regular expression is valid; otherwise returns |
4083 | false. An invalid regular expression never matches. |
4084 | |
4085 | The pattern \b{[a-z} is an example of an invalid pattern, since |
4086 | it lacks a closing square bracket. |
4087 | |
4088 | Note that the validity of a regexp may also depend on the setting |
4089 | of the wildcard flag, for example \b{*.html} is a valid |
4090 | wildcard regexp but an invalid full regexp. |
4091 | |
4092 | \sa errorString() |
4093 | */ |
4094 | bool QRegExp::isValid() const |
4095 | { |
4096 | if (priv->engineKey.pattern.isEmpty()) { |
4097 | return true; |
4098 | } else { |
4099 | prepareEngine(priv); |
4100 | return priv->eng->isValid(); |
4101 | } |
4102 | } |
4103 | |
4104 | /*! |
4105 | Returns the pattern string of the regular expression. The pattern |
4106 | has either regular expression syntax or wildcard syntax, depending |
4107 | on patternSyntax(). |
4108 | |
4109 | \sa patternSyntax(), caseSensitivity() |
4110 | */ |
QString QRegExp::pattern() const
{
    // The pattern text is stored in the engine key.
    return priv->engineKey.pattern;
}
4115 | |
4116 | /*! |
4117 | Sets the pattern string to \a pattern. The case sensitivity, |
4118 | wildcard, and minimal matching options are not changed. |
4119 | |
4120 | \sa setPatternSyntax(), setCaseSensitivity() |
4121 | */ |
4122 | void QRegExp::setPattern(const QString &pattern) |
4123 | { |
4124 | if (priv->engineKey.pattern != pattern) { |
4125 | invalidateEngine(priv); |
4126 | priv->engineKey.pattern = pattern; |
4127 | } |
4128 | } |
4129 | |
4130 | /*! |
4131 | Returns Qt::CaseSensitive if the regexp is matched case |
4132 | sensitively; otherwise returns Qt::CaseInsensitive. |
4133 | |
4134 | \sa patternSyntax(), pattern(), isMinimal() |
4135 | */ |
Qt::CaseSensitivity QRegExp::caseSensitivity() const
{
    // Case sensitivity is stored in the engine key.
    return priv->engineKey.cs;
}
4140 | |
4141 | /*! |
4142 | Sets case sensitive matching to \a cs. |
4143 | |
4144 | If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches |
4145 | \c{readme.txt} but not \c{README.TXT}. |
4146 | |
4147 | \sa setPatternSyntax(), setPattern(), setMinimal() |
4148 | */ |
4149 | void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs) |
4150 | { |
4151 | if ((bool)cs != (bool)priv->engineKey.cs) { |
4152 | invalidateEngine(priv); |
4153 | priv->engineKey.cs = cs; |
4154 | } |
4155 | } |
4156 | |
4157 | /*! |
4158 | Returns the syntax used by the regular expression. The default is |
4159 | QRegExp::RegExp. |
4160 | |
4161 | \sa pattern(), caseSensitivity() |
4162 | */ |
QRegExp::PatternSyntax QRegExp::patternSyntax() const
{
    // The syntax mode is stored in the engine key.
    return priv->engineKey.patternSyntax;
}
4167 | |
4168 | /*! |
4169 | Sets the syntax mode for the regular expression. The default is |
4170 | QRegExp::RegExp. |
4171 | |
4172 | Setting \a syntax to QRegExp::Wildcard enables simple shell-like |
4173 | \l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the |
4174 | string \c{readme.txt} in wildcard mode, but does not match |
4175 | \c{readme}. |
4176 | |
4177 | Setting \a syntax to QRegExp::FixedString means that the pattern |
4178 | is interpreted as a plain string. Special characters (e.g., |
4179 | backslash) don't need to be escaped then. |
4180 | |
4181 | \sa setPattern(), setCaseSensitivity(), escape() |
4182 | */ |
4183 | void QRegExp::setPatternSyntax(PatternSyntax syntax) |
4184 | { |
4185 | if (syntax != priv->engineKey.patternSyntax) { |
4186 | invalidateEngine(priv); |
4187 | priv->engineKey.patternSyntax = syntax; |
4188 | } |
4189 | } |
4190 | |
4191 | /*! |
4192 | Returns \c true if minimal (non-greedy) matching is enabled; |
4193 | otherwise returns \c false. |
4194 | |
4195 | \sa caseSensitivity(), setMinimal() |
4196 | */ |
bool QRegExp::isMinimal() const
{
    // The flag is stored in the private object, outside the engine key.
    return priv->minimal;
}
4201 | |
4202 | /*! |
4203 | Enables or disables minimal matching. If \a minimal is false, |
4204 | matching is greedy (maximal) which is the default. |
4205 | |
4206 | For example, suppose we have the input string "We must be |
4207 | <b>bold</b>, very <b>bold</b>!" and the pattern |
4208 | \b{<b>.*</b>}. With the default greedy (maximal) matching, |
4209 | the match is "We must be \underline{<b>bold</b>, very |
4210 | <b>bold</b>}!". But with minimal (non-greedy) matching, the |
4211 | first match is: "We must be \underline{<b>bold</b>}, very |
4212 | <b>bold</b>!" and the second match is "We must be <b>bold</b>, |
4213 | very \underline{<b>bold</b>}!". In practice we might use the pattern |
4214 | \b{<b>[^<]*\</b>} instead, although this will still fail for |
4215 | nested tags. |
4216 | |
4217 | \sa setCaseSensitivity() |
4218 | */ |
void QRegExp::setMinimal(bool minimal)
{
    // Unlike the other setters, this does not invalidate the engine:
    // the flag is not part of engineKey and is passed to match() per call.
    priv->minimal = minimal;
}
4223 | |
4224 | // ### Qt 5: make non-const |
4225 | /*! |
4226 | Returns \c true if \a str is matched exactly by this regular |
4227 | expression; otherwise returns \c false. You can determine how much of |
4228 | the string was matched by calling matchedLength(). |
4229 | |
4230 | For a given regexp string R, exactMatch("R") is the equivalent of |
4231 | indexIn("^R$") since exactMatch() effectively encloses the regexp |
4232 | in the start of string and end of string anchors, except that it |
4233 | sets matchedLength() differently. |
4234 | |
4235 | For example, if the regular expression is \b{blue}, then |
4236 | exactMatch() returns \c true only for input \c blue. For inputs \c |
4237 | bluebell, \c blutak and \c lightblue, exactMatch() returns \c false |
4238 | and matchedLength() will return 4, 3 and 0 respectively. |
4239 | |
4240 | Although const, this function sets matchedLength(), |
4241 | capturedTexts(), and pos(). |
4242 | |
4243 | \sa indexIn(), lastIndexIn() |
4244 | */ |
4245 | bool QRegExp::exactMatch(const QString &str) const |
4246 | { |
4247 | prepareEngineForMatch(priv, str); |
4248 | priv->matchState.match(str0: str.unicode(), len0: str.length(), pos0: 0, minimal0: priv->minimal, oneTest: true, caretIndex: 0); |
4249 | if (priv->matchState.captured[1] == str.length()) { |
4250 | return true; |
4251 | } else { |
4252 | priv->matchState.captured[0] = 0; |
4253 | priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen; |
4254 | return false; |
4255 | } |
4256 | } |
4257 | |
4258 | // ### Qt 5: make non-const |
4259 | /*! |
4260 | Attempts to find a match in \a str from position \a offset (0 by |
4261 | default). If \a offset is -1, the search starts at the last |
4262 | character; if -2, at the next to last character; etc. |
4263 | |
4264 | Returns the position of the first match, or -1 if there was no |
4265 | match. |
4266 | |
4267 | The \a caretMode parameter can be used to instruct whether \b{^} |
4268 | should match at index 0 or at \a offset. |
4269 | |
4270 | You might prefer to use QString::indexOf(), QString::contains(), |
4271 | or even QStringList::filter(). To replace matches use |
4272 | QString::replace(). |
4273 | |
4274 | Example: |
4275 | \snippet code/src_corelib_tools_qregexp.cpp 13 |
4276 | |
4277 | Although const, this function sets matchedLength(), |
4278 | capturedTexts() and pos(). |
4279 | |
    If the QRegExp is a wildcard expression (see setPatternSyntax())
    and you want to test a string against the whole wildcard expression,
    use exactMatch() instead of this function.
4283 | |
4284 | \sa lastIndexIn(), exactMatch() |
4285 | */ |
4286 | |
4287 | int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const |
4288 | { |
4289 | prepareEngineForMatch(priv, str); |
4290 | if (offset < 0) |
4291 | offset += str.length(); |
4292 | priv->matchState.match(str0: str.unicode(), len0: str.length(), pos0: offset, |
4293 | minimal0: priv->minimal, oneTest: false, caretIndex: caretIndex(offset, caretMode)); |
4294 | return priv->matchState.captured[0]; |
4295 | } |
4296 | |
4297 | // ### Qt 5: make non-const |
4298 | /*! |
4299 | Attempts to find a match backwards in \a str from position \a |
4300 | offset. If \a offset is -1 (the default), the search starts at the |
4301 | last character; if -2, at the next to last character; etc. |
4302 | |
4303 | Returns the position of the first match, or -1 if there was no |
4304 | match. |
4305 | |
4306 | The \a caretMode parameter can be used to instruct whether \b{^} |
4307 | should match at index 0 or at \a offset. |
4308 | |
4309 | Although const, this function sets matchedLength(), |
4310 | capturedTexts() and pos(). |
4311 | |
4312 | \warning Searching backwards is much slower than searching |
4313 | forwards. |
4314 | |
4315 | \sa indexIn(), exactMatch() |
4316 | */ |
4317 | |
4318 | int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const |
4319 | { |
4320 | prepareEngineForMatch(priv, str); |
4321 | if (offset < 0) |
4322 | offset += str.length(); |
4323 | if (offset < 0 || offset > str.length()) { |
4324 | memset(s: priv->matchState.captured, c: -1, n: priv->matchState.capturedSize*sizeof(int)); |
4325 | return -1; |
4326 | } |
4327 | |
4328 | while (offset >= 0) { |
4329 | priv->matchState.match(str0: str.unicode(), len0: str.length(), pos0: offset, |
4330 | minimal0: priv->minimal, oneTest: true, caretIndex: caretIndex(offset, caretMode)); |
4331 | if (priv->matchState.captured[0] == offset) |
4332 | return offset; |
4333 | --offset; |
4334 | } |
4335 | return -1; |
4336 | } |
4337 | |
4338 | /*! |
4339 | Returns the length of the last matched string, or -1 if there was |
4340 | no match. |
4341 | |
4342 | \sa exactMatch(), indexIn(), lastIndexIn() |
4343 | */ |
4344 | int QRegExp::matchedLength() const |
4345 | { |
4346 | return priv->matchState.captured[1]; |
4347 | } |
4348 | |
4349 | #ifndef QT_NO_REGEXP_CAPTURE |
4350 | |
4351 | /*! |
4352 | \since 4.6 |
4353 | Returns the number of captures contained in the regular expression. |
4354 | */ |
4355 | int QRegExp::captureCount() const |
4356 | { |
4357 | prepareEngine(priv); |
4358 | return priv->eng->captureCount(); |
4359 | } |
4360 | |
4361 | /*! |
4362 | Returns a list of the captured text strings. |
4363 | |
4364 | The first string in the list is the entire matched string. Each |
4365 | subsequent list element contains a string that matched a |
4366 | (capturing) subexpression of the regexp. |
4367 | |
4368 | For example: |
4369 | \snippet code/src_corelib_tools_qregexp.cpp 14 |
4370 | |
4371 | The above example also captures elements that may be present but |
4372 | which we have no interest in. This problem can be solved by using |
4373 | non-capturing parentheses: |
4374 | |
4375 | \snippet code/src_corelib_tools_qregexp.cpp 15 |
4376 | |
4377 | Note that if you want to iterate over the list, you should iterate |
4378 | over a copy, e.g. |
4379 | \snippet code/src_corelib_tools_qregexp.cpp 16 |
4380 | |
4381 | Some regexps can match an indeterminate number of times. For |
4382 | example if the input string is "Offsets: 12 14 99 231 7" and the |
4383 | regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of |
4384 | all the numbers matched. However, after calling |
4385 | \c{rx.indexIn(str)}, capturedTexts() will return the list ("12", |
4386 | "12"), i.e. the entire match was "12" and the first subexpression |
4387 | matched was "12". The correct approach is to use cap() in a |
4388 | \l{QRegExp#cap_in_a_loop}{loop}. |
4389 | |
4390 | The order of elements in the string list is as follows. The first |
4391 | element is the entire matching string. Each subsequent element |
4392 | corresponds to the next capturing open left parentheses. Thus |
4393 | capturedTexts()[1] is the text of the first capturing parentheses, |
4394 | capturedTexts()[2] is the text of the second and so on |
4395 | (corresponding to $1, $2, etc., in some other regexp languages). |
4396 | |
4397 | \sa cap(), pos() |
4398 | */ |
4399 | QStringList QRegExp::capturedTexts() const |
4400 | { |
4401 | if (priv->capturedCache.isEmpty()) { |
4402 | prepareEngine(priv); |
4403 | const int *captured = priv->matchState.captured; |
4404 | int n = priv->matchState.capturedSize; |
4405 | |
4406 | for (int i = 0; i < n; i += 2) { |
4407 | QString m; |
4408 | if (captured[i + 1] == 0) |
4409 | m = QLatin1String("" ); // ### Qt 5: don't distinguish between null and empty |
4410 | else if (captured[i] >= 0) |
4411 | m = priv->t.mid(position: captured[i], n: captured[i + 1]); |
4412 | priv->capturedCache.append(t: m); |
4413 | } |
4414 | priv->t.clear(); |
4415 | } |
4416 | return priv->capturedCache; |
4417 | } |
4418 | |
4419 | /*! |
4420 | \internal |
4421 | */ |
4422 | QStringList QRegExp::capturedTexts() |
4423 | { |
4424 | return const_cast<const QRegExp *>(this)->capturedTexts(); |
4425 | } |
4426 | |
4427 | /*! |
4428 | Returns the text captured by the \a nth subexpression. The entire |
4429 | match has index 0 and the parenthesized subexpressions have |
4430 | indexes starting from 1 (excluding non-capturing parentheses). |
4431 | |
4432 | \snippet code/src_corelib_tools_qregexp.cpp 17 |
4433 | |
4434 | The order of elements matched by cap() is as follows. The first |
4435 | element, cap(0), is the entire matching string. Each subsequent |
4436 | element corresponds to the next capturing open left parentheses. |
4437 | Thus cap(1) is the text of the first capturing parentheses, cap(2) |
4438 | is the text of the second, and so on. |
4439 | |
4440 | \sa capturedTexts(), pos() |
4441 | */ |
4442 | QString QRegExp::cap(int nth) const |
4443 | { |
4444 | return capturedTexts().value(i: nth); |
4445 | } |
4446 | |
4447 | /*! |
4448 | \internal |
4449 | */ |
4450 | QString QRegExp::cap(int nth) |
4451 | { |
4452 | return const_cast<const QRegExp *>(this)->cap(nth); |
4453 | } |
4454 | |
4455 | /*! |
4456 | Returns the position of the \a nth captured text in the searched |
4457 | string. If \a nth is 0 (the default), pos() returns the position |
4458 | of the whole match. |
4459 | |
4460 | Example: |
4461 | \snippet code/src_corelib_tools_qregexp.cpp 18 |
4462 | |
4463 | For zero-length matches, pos() always returns -1. (For example, if |
4464 | cap(4) would return an empty string, pos(4) returns -1.) This is |
4465 | a feature of the implementation. |
4466 | |
4467 | \sa cap(), capturedTexts() |
4468 | */ |
4469 | int QRegExp::pos(int nth) const |
4470 | { |
4471 | if (nth < 0 || nth >= priv->matchState.capturedSize / 2) |
4472 | return -1; |
4473 | else |
4474 | return priv->matchState.captured[2 * nth]; |
4475 | } |
4476 | |
4477 | /*! |
4478 | \internal |
4479 | */ |
4480 | int QRegExp::pos(int nth) |
4481 | { |
4482 | return const_cast<const QRegExp *>(this)->pos(nth); |
4483 | } |
4484 | |
4485 | /*! |
    Returns a text string that explains why a regexp pattern is
    invalid, if that is the case; otherwise returns "no error occurred".
4488 | |
4489 | \sa isValid() |
4490 | */ |
4491 | QString QRegExp::errorString() const |
4492 | { |
4493 | if (isValid()) { |
4494 | return QString::fromLatin1(RXERR_OK); |
4495 | } else { |
4496 | return priv->eng->errorString(); |
4497 | } |
4498 | } |
4499 | |
4500 | /*! |
4501 | \internal |
4502 | */ |
4503 | QString QRegExp::errorString() |
4504 | { |
4505 | return const_cast<const QRegExp *>(this)->errorString(); |
4506 | } |
4507 | #endif |
4508 | |
4509 | /*! |
4510 | Returns the string \a str with every regexp special character |
    escaped with a backslash. The special characters are $, (, ), *, +,
    ., ?, [, \, ], ^, {, | and }.
4513 | |
4514 | Example: |
4515 | |
4516 | \snippet code/src_corelib_tools_qregexp.cpp 19 |
4517 | |
4518 | This function is useful to construct regexp patterns dynamically: |
4519 | |
4520 | \snippet code/src_corelib_tools_qregexp.cpp 20 |
4521 | |
4522 | \sa setPatternSyntax() |
4523 | */ |
4524 | QString QRegExp::escape(const QString &str) |
4525 | { |
4526 | QString quoted; |
4527 | const int count = str.count(); |
4528 | quoted.reserve(asize: count * 2); |
4529 | const QLatin1Char backslash('\\'); |
4530 | for (int i = 0; i < count; i++) { |
4531 | switch (str.at(i).toLatin1()) { |
4532 | case '$': |
4533 | case '(': |
4534 | case ')': |
4535 | case '*': |
4536 | case '+': |
4537 | case '.': |
4538 | case '?': |
4539 | case '[': |
4540 | case '\\': |
4541 | case ']': |
4542 | case '^': |
4543 | case '{': |
4544 | case '|': |
4545 | case '}': |
4546 | quoted.append(c: backslash); |
4547 | } |
4548 | quoted.append(c: str.at(i)); |
4549 | } |
4550 | return quoted; |
4551 | } |
4552 | |
4553 | |
4554 | #ifndef QT_NO_DATASTREAM |
4555 | /*! |
4556 | \relates QRegExp |
4557 | |
4558 | Writes the regular expression \a regExp to stream \a out. |
4559 | |
4560 | \sa {Serializing Qt Data Types} |
4561 | */ |
4562 | QDataStream &operator<<(QDataStream &out, const QRegExp ®Exp) |
4563 | { |
4564 | return out << regExp.pattern() << (quint8)regExp.caseSensitivity() |
4565 | << (quint8)regExp.patternSyntax() |
4566 | << (quint8)!!regExp.isMinimal(); |
4567 | } |
4568 | |
4569 | /*! |
4570 | \relates QRegExp |
4571 | |
4572 | Reads a regular expression from stream \a in into \a regExp. |
4573 | |
4574 | \sa {Serializing Qt Data Types} |
4575 | */ |
4576 | QDataStream &operator>>(QDataStream &in, QRegExp ®Exp) |
4577 | { |
4578 | QString pattern; |
4579 | quint8 cs; |
4580 | quint8 patternSyntax; |
4581 | quint8 isMinimal; |
4582 | |
4583 | in >> pattern >> cs >> patternSyntax >> isMinimal; |
4584 | |
4585 | QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs), |
4586 | QRegExp::PatternSyntax(patternSyntax)); |
4587 | |
4588 | newRegExp.setMinimal(isMinimal); |
4589 | regExp = newRegExp; |
4590 | return in; |
4591 | } |
4592 | #endif // QT_NO_DATASTREAM |
4593 | |
4594 | #ifndef QT_NO_DEBUG_STREAM |
4595 | QDebug operator<<(QDebug dbg, const QRegExp &r) |
4596 | { |
4597 | QDebugStateSaver saver(dbg); |
4598 | dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax() |
4599 | << ", pattern='" << r.pattern() << "')" ; |
4600 | return dbg; |
4601 | } |
4602 | #endif |
4603 | |
4604 | QT_END_NAMESPACE |
4605 | |