1/* This is JavaScriptCore's variant of the PCRE library. While this library
2started out as a copy of PCRE, many of the features of PCRE have been
3removed. This library now supports only the regular expression features
4required by the JavaScript language specification, and has only the functions
5needed by JavaScriptCore and the rest of WebKit.
6
7 Originally written by Philip Hazel
8 Copyright (c) 1997-2006 University of Cambridge
9 Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
10 Copyright (C) 2007 Eric Seidel <eric@webkit.org>
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41/* This module contains the external function jsRegExpExecute(), along with
42supporting internal functions that are not used by other modules. */
43
44#include "config.h"
45
46#include "pcre_internal.h"
47
48#include <string.h>
49#include <wtf/ASCIICType.h>
50#include <wtf/FastMalloc.h>
51
52using namespace WTF;
53
/* Negative sentinel values for the first-character and required-character
variables (firstByte / reqByte in compileBranch). Real character values are
non-negative, so these can never collide with one.

  REQ_UNSET  no character-matching item has been encountered yet
  REQ_NONE   an item was encountered that rules out a fixed character */

#define REQ_UNSET (-2)
#define REQ_NONE (-1)
58
59/*************************************************
60* Code parameters and static tables *
61*************************************************/
62
/* Maximum number of items on the nested bracket stacks at compile time. This
applies to the nesting of all kinds of parentheses. It does not limit
un-nested, non-capturing parentheses. This number can be made bigger if
necessary - it is used to dimension one int and one unsigned char vector at
compile time. (The code that uses it is not in this part of the file.) */

#define BRASTACK_SIZE 200
70
/* Table for handling escaped characters in the range '0'-'z'. It is indexed
by (character - '0'), so it must contain exactly one entry for every character
from '0' to 'z' inclusive, in order. Positive entries are simple data values
that the escape stands for; negative entries are negated ESC_xxx codes for
special things like \d and so on. Zero means further processing is needed (for
things like \x), or the escape has no special meaning. */

static const short escapes[] = {
    0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
    0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
    '@', 0, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
    0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
    0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
    0, 0, 0, '[', '\\', ']', '^', '_', /* X - _ */
    '`', 7, -ESC_b, 0, -ESC_d, 0, '\f', 0, /* ` - g */
    0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
    0, 0, '\r', -ESC_s, '\t', 0, '\v', -ESC_w, /* p - w */
    0, 0, 0 /* x - z */
};
88
/* Error code numbers. They are given names so that they can more easily be
tracked. ERR0 is zero and means "no error": callers test the error code for
being non-zero. ERR1..ERR17 correspond, in order, to the messages stored in
errorText(), so the two lists must be kept in step. */

enum ErrorCode {
    ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
    ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17
};
96
97/* The texts of compile-time error messages. These are "char *" because they
98are passed to the outside world. */
99
100static const char* errorText(ErrorCode code)
101{
102 static const char errorTexts[] =
103 /* 1 */
104 "\\ at end of pattern\0"
105 "\\c at end of pattern\0"
106 "character value in \\x{...} sequence is too large\0"
107 "numbers out of order in {} quantifier\0"
108 /* 5 */
109 "number too big in {} quantifier\0"
110 "missing terminating ] for character class\0"
111 "internal error: code overflow\0"
112 "range out of order in character class\0"
113 "nothing to repeat\0"
114 /* 10 */
115 "unmatched parentheses\0"
116 "internal error: unexpected repeat\0"
117 "unrecognized character after (?\0"
118 "failed to get memory\0"
119 "missing )\0"
120 /* 15 */
121 "reference to non-existent subpattern\0"
122 "regular expression too large\0"
123 "parentheses nested too deeply"
124 ;
125
126 int i = code;
127 const char* text = errorTexts;
128 while (i > 1)
129 i -= !*text++;
130 return text;
131}
132
/* Bundle of state that is shared between the compiling functions and passed
around by reference, instead of being kept in static variables. A freshly
constructed object has every field zeroed / false. */

struct CompileData {
    CompileData()
        : topBackref(0)
        , backrefMap(0)
        , reqVaryOpt(0)
        , needOuterBracket(false)
        , numCapturingBrackets(0)
    {
    }

    int topBackref; /* Maximum back reference */
    unsigned backrefMap; /* Bitmap of low back refs */
    int reqVaryOpt; /* "After variable item" flag for reqByte */
    bool needOuterBracket;
    int numCapturingBrackets;
};
150
/* Forward declarations to allow mutual recursion: these functions are called
by code that appears earlier in the file than their definitions. */

static bool compileBracket(int, int*, unsigned char**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&);
static bool bracketIsAnchored(const unsigned char* code);
static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap);
static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert);
157
158/*************************************************
159* Handle escapes *
160*************************************************/
161
162/* This function is called when a \ has been encountered. It either returns a
163positive value for a simple escape such as \n, or a negative value which
164encodes one of the more complicated things such as \d. When UTF-8 is enabled,
165a positive value greater than 255 may be returned. On entry, ptr is pointing at
166the \. On exit, it is on the final character of the escape sequence.
167
168Arguments:
169 ptrPtr points to the pattern position pointer
170 errorCodePtr points to the errorcode variable
171 bracount number of previous extracting brackets
172 options the options bits
173 isClass true if inside a character class
174
175Returns: zero or positive => a data character
176 negative => a special escape sequence
177 on error, errorPtr is set
178*/
179
180static int checkEscape(const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int bracount, bool isClass)
181{
182 const UChar* ptr = *ptrPtr + 1;
183
184 /* If backslash is at the end of the pattern, it's an error. */
185 if (ptr == patternEnd) {
186 *errorCodePtr = ERR1;
187 *ptrPtr = ptr;
188 return 0;
189 }
190
191 int c = *ptr;
192
193 /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
194 a table. A non-zero result is something that can be returned immediately.
195 Otherwise further processing may be required. */
196
197 if (c < '0' || c > 'z') { /* Not alphameric */
198 } else if (int escapeValue = escapes[c - '0']) {
199 c = escapeValue;
200 if (isClass) {
201 if (-c == ESC_b)
202 c = '\b'; /* \b is backslash in a class */
203 else if (-c == ESC_B)
204 c = 'B'; /* and \B is a capital B in a class (in browsers event though ECMAScript 15.10.2.19 says it raises an error) */
205 }
206 /* Escapes that need further processing, or are illegal. */
207
208 } else {
209 switch (c) {
210 case '1':
211 case '2':
212 case '3':
213 case '4':
214 case '5':
215 case '6':
216 case '7':
217 case '8':
218 case '9':
219 /* Escape sequences starting with a non-zero digit are backreferences,
220 unless there are insufficient brackets, in which case they are octal
221 escape sequences. Those sequences end on the first non-octal character
222 or when we overflow 0-255, whichever comes first. */
223
224 if (!isClass) {
225 const UChar* oldptr = ptr;
226 c -= '0';
227 while ((ptr + 1 < patternEnd) && isASCIIDigit(c: ptr[1]) && c <= bracount)
228 c = c * 10 + *(++ptr) - '0';
229 if (c <= bracount) {
230 c = -(ESC_REF + c);
231 break;
232 }
233 ptr = oldptr; /* Put the pointer back and fall through */
234 }
235
236 /* Handle an octal number following \. If the first digit is 8 or 9,
237 this is not octal. */
238
239 if ((c = *ptr) >= '8') {
240 c = '\\';
241 ptr -= 1;
242 break;
243 }
244
245 /* \0 always starts an octal number, but we may drop through to here with a
246 larger first octal digit. */
247
248 case '0': {
249 c -= '0';
250 int i;
251 for (i = 1; i <= 2; ++i) {
252 if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7')
253 break;
254 int cc = c * 8 + ptr[i] - '0';
255 if (cc > 255)
256 break;
257 c = cc;
258 }
259 ptr += i - 1;
260 break;
261 }
262
263 case 'x': {
264 c = 0;
265 int i;
266 for (i = 1; i <= 2; ++i) {
267 if (ptr + i >= patternEnd || !isASCIIHexDigit(c: ptr[i])) {
268 c = 'x';
269 i = 1;
270 break;
271 }
272 int cc = ptr[i];
273 if (cc >= 'a')
274 cc -= 32; /* Convert to upper case */
275 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
276 }
277 ptr += i - 1;
278 break;
279 }
280
281 case 'u': {
282 c = 0;
283 int i;
284 for (i = 1; i <= 4; ++i) {
285 if (ptr + i >= patternEnd || !isASCIIHexDigit(c: ptr[i])) {
286 c = 'u';
287 i = 1;
288 break;
289 }
290 int cc = ptr[i];
291 if (cc >= 'a')
292 cc -= 32; /* Convert to upper case */
293 c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
294 }
295 ptr += i - 1;
296 break;
297 }
298
299 case 'c':
300 if (++ptr == patternEnd) {
301 *errorCodePtr = ERR2;
302 return 0;
303 }
304
305 c = *ptr;
306
307 /* To match Firefox, inside a character class, we also accept
308 numbers and '_' as control characters */
309 if ((!isClass && !isASCIIAlpha(c)) || (!isASCIIAlphanumeric(c) && c != '_')) {
310 c = '\\';
311 ptr -= 2;
312 break;
313 }
314
315 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
316 is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
317 c = toASCIIUpper(c) ^ 0x40;
318 break;
319 }
320 }
321
322 *ptrPtr = ptr;
323 return c;
324}
325
326/*************************************************
327* Check for counted repeat *
328*************************************************/
329
330/* This function is called when a '{' is encountered in a place where it might
331start a quantifier. It looks ahead to see if it really is a quantifier or not.
332It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
333where the ddds are digits.
334
335Arguments:
336 p pointer to the first char after '{'
337
338Returns: true or false
339*/
340
341static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)
342{
343 if (p >= patternEnd || !isASCIIDigit(c: *p))
344 return false;
345 p++;
346 while (p < patternEnd && isASCIIDigit(c: *p))
347 p++;
348 if (p < patternEnd && *p == '}')
349 return true;
350
351 if (p >= patternEnd || *p++ != ',')
352 return false;
353 if (p < patternEnd && *p == '}')
354 return true;
355
356 if (p >= patternEnd || !isASCIIDigit(c: *p))
357 return false;
358 p++;
359 while (p < patternEnd && isASCIIDigit(c: *p))
360 p++;
361
362 return (p < patternEnd && *p == '}');
363}
364
365/*************************************************
366* Read repeat counts *
367*************************************************/
368
369/* Read an item of the form {n,m} and return the values. This is called only
370after isCountedRepeat() has confirmed that a repeat-count quantifier exists,
371so the syntax is guaranteed to be correct, but we need to check the values.
372
373Arguments:
374 p pointer to first char after '{'
375 minp pointer to int for min
376 maxp pointer to int for max
377 returned as -1 if no max
378 errorCodePtr points to error code variable
379
380Returns: pointer to '}' on success;
381 current ptr on error, with errorCodePtr set non-zero
382*/
383
384static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, ErrorCode* errorCodePtr)
385{
386 int min = 0;
387 int max = -1;
388
389 /* Read the minimum value and do a paranoid check: a negative value indicates
390 an integer overflow. */
391
392 while (isASCIIDigit(c: *p))
393 min = min * 10 + *p++ - '0';
394 if (min < 0 || min > 65535) {
395 *errorCodePtr = ERR5;
396 return p;
397 }
398
399 /* Read the maximum value if there is one, and again do a paranoid on its size.
400 Also, max must not be less than min. */
401
402 if (*p == '}')
403 max = min;
404 else {
405 if (*(++p) != '}') {
406 max = 0;
407 while (isASCIIDigit(c: *p))
408 max = max * 10 + *p++ - '0';
409 if (max < 0 || max > 65535) {
410 *errorCodePtr = ERR5;
411 return p;
412 }
413 if (max < min) {
414 *errorCodePtr = ERR4;
415 return p;
416 }
417 }
418 }
419
420 /* Fill in the required variables, and pass back the pointer to the terminating
421 '}'. */
422
423 *minp = min;
424 *maxp = max;
425 return p;
426}
427
428/*************************************************
429* Find first significant op code *
430*************************************************/
431
432/* This is called by several functions that scan a compiled expression looking
433for a fixed first character, or an anchoring op code etc. It skips over things
434that do not influence this.
435
436Arguments:
437 code pointer to the start of the group
438Returns: pointer to the first significant opcode
439*/
440
441static const unsigned char* firstSignificantOpcode(const unsigned char* code)
442{
443 while (*code == OP_BRANUMBER)
444 code += 3;
445 return code;
446}
447
448static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsigned char* code)
449{
450 while (true) {
451 switch (*code) {
452 case OP_ASSERT_NOT:
453 advanceToEndOfBracket(opcodePtr&: code);
454 code += 1 + LINK_SIZE;
455 break;
456 case OP_WORD_BOUNDARY:
457 case OP_NOT_WORD_BOUNDARY:
458 ++code;
459 break;
460 case OP_BRANUMBER:
461 code += 3;
462 break;
463 default:
464 return code;
465 }
466 }
467}
468
469/*************************************************
470* Get othercase range *
471*************************************************/
472
473/* This function is passed the start and end of a class range, in UTF-8 mode
474with UCP support. It searches up the characters, looking for internal ranges of
475characters in the "other" case. Each call returns the next one, updating the
476start address.
477
478Arguments:
479 cptr points to starting character value; updated
480 d end value
481 ocptr where to put start of othercase range
482 odptr where to put end of othercase range
483
484Yield: true when range returned; false when no more
485*/
486
487static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)
488{
489 int c, othercase = 0;
490
491 for (c = *cptr; c <= d; c++) {
492 if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0)
493 break;
494 }
495
496 if (c > d)
497 return false;
498
499 *ocptr = othercase;
500 int next = othercase + 1;
501
502 for (++c; c <= d; c++) {
503 if (jsc_pcre_ucp_othercase(c) != next)
504 break;
505 next++;
506 }
507
508 *odptr = next - 1;
509 *cptr = c;
510
511 return true;
512}
513
514/*************************************************
515 * Convert character value to UTF-8 *
516 *************************************************/
517
518/* This function takes an integer value in the range 0 - 0x7fffffff
519 and encodes it as a UTF-8 character in 0 to 6 bytes.
520
521 Arguments:
522 cvalue the character value
523 buffer pointer to buffer for result - at least 6 bytes long
524
525 Returns: number of characters placed in the buffer
526 */
527
528static int encodeUTF8(int cvalue, unsigned char *buffer)
529{
530 int i;
531 for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
532 if (cvalue <= jsc_pcre_utf8_table1[i])
533 break;
534 buffer += i;
535 for (int j = i; j > 0; j--) {
536 *buffer-- = 0x80 | (cvalue & 0x3f);
537 cvalue >>= 6;
538 }
539 *buffer = jsc_pcre_utf8_table2[i] | cvalue;
540 return i + 1;
541}
542
543/*************************************************
544* Compile one branch *
545*************************************************/
546
547/* Scan the pattern, compiling it into the code vector.
548
549Arguments:
550 options the option bits
551 brackets points to number of extracting brackets used
552 codePtr points to the pointer to the current code point
553 ptrPtr points to the current pattern pointer
554 errorCodePtr points to error code variable
555 firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
556 reqbyteptr set to the last literal character required, else < 0
557 cd contains pointers to tables etc.
558
559Returns: true on success
560 false, with *errorCodePtr set non-zero on error
561*/
562
563static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd, UChar expected)
564{
565 return ((ptr + 1 < patternEnd) && ptr[1] == expected);
566}
567
568static bool
569compileBranch(int options, int* brackets, unsigned char** codePtr,
570 const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int *firstbyteptr,
571 int* reqbyteptr, CompileData& cd)
572{
573 int repeatType, opType;
574 int repeatMin = 0, repeat_max = 0; /* To please picky compilers */
575 int bravalue = 0;
576 int reqvary, tempreqvary;
577 int c;
578 unsigned char* code = *codePtr;
579 unsigned char* tempcode;
580 bool didGroupSetFirstByte = false;
581 const UChar* ptr = *ptrPtr;
582 unsigned char* previous = NULL;
583 unsigned char classbits[32];
584
585 bool class_utf8;
586 unsigned char* class_utf8data;
587 unsigned char utf8_char[6];
588
589 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
590 matching encountered yet". It gets changed to REQ_NONE if we hit something that
591 matches a non-fixed char first char; reqByte just remains unset if we never
592 find one.
593
594 When we hit a repeat whose minimum is zero, we may have to adjust these values
595 to take the zero repeat into account. This is implemented by setting them to
596 zeroFirstByte and zeroReqByte when such a repeat is encountered. The individual
597 item types that can be repeated set these backoff variables appropriately. */
598
599 int firstByte = REQ_UNSET;
600 int reqByte = REQ_UNSET;
601 int zeroReqByte = REQ_UNSET;
602 int zeroFirstByte = REQ_UNSET;
603
604 /* The variable reqCaseOpt contains either the REQ_IGNORE_CASE value or zero,
605 according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit
606 value > 255. It is added into the firstByte or reqByte variables to record the
607 case status of the value. This is used only for ASCII characters. */
608
609 int reqCaseOpt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;
610
611 /* Switch on next character until the end of the branch */
612
613 for (;; ptr++) {
614 bool negateClass;
615 bool shouldFlipNegation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */
616 int classCharCount;
617 int classLastChar;
618 int skipBytes;
619 int subReqByte;
620 int subFirstByte;
621 int mcLength;
622 unsigned char mcbuffer[8];
623
624 /* Next byte in the pattern */
625
626 c = ptr < patternEnd ? *ptr : 0;
627
628 /* Fill in length of a previous callout, except when the next thing is
629 a quantifier. */
630
631 bool isQuantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(p: ptr + 1, patternEnd));
632
633 switch (c) {
634 /* The branch terminates at end of string, |, or ). */
635
636 case 0:
637 if (ptr < patternEnd)
638 goto NORMAL_CHAR;
639 // End of string; fall through
640 case '|':
641 case ')':
642 *firstbyteptr = firstByte;
643 *reqbyteptr = reqByte;
644 *codePtr = code;
645 *ptrPtr = ptr;
646 return true;
647
648 /* Handle single-character metacharacters. In multiline mode, ^ disables
649 the setting of any following char as a first character. */
650
651 case '^':
652 if (options & MatchAcrossMultipleLinesOption) {
653 if (firstByte == REQ_UNSET)
654 firstByte = REQ_NONE;
655 *code++ = OP_BOL;
656 } else
657 *code++ = OP_CIRC;
658 previous = NULL;
659 break;
660
661 case '$':
662 previous = NULL;
663 if (options & MatchAcrossMultipleLinesOption)
664 *code++ = OP_EOL;
665 else
666 *code++ = OP_DOLL;
667 break;
668
669 /* There can never be a first char if '.' is first, whatever happens about
670 repeats. The value of reqByte doesn't change either. */
671
672 case '.':
673 if (firstByte == REQ_UNSET)
674 firstByte = REQ_NONE;
675 zeroFirstByte = firstByte;
676 zeroReqByte = reqByte;
677 previous = code;
678 *code++ = OP_NOT_NEWLINE;
679 break;
680
681 /* Character classes. If the included characters are all < 256, we build a
682 32-byte bitmap of the permitted characters, except in the special case
683 where there is only one such character. For negated classes, we build the
684 map as usual, then invert it at the end. However, we use a different opcode
685 so that data characters > 255 can be handled correctly.
686
687 If the class contains characters outside the 0-255 range, a different
688 opcode is compiled. It may optionally have a bit map for characters < 256,
 but those above are explicitly listed afterwards. A flag byte tells
690 whether the bitmap is present, and whether this is a negated class or not.
691 */
692
693 case '[': {
694 previous = code;
695 shouldFlipNegation = false;
696
697 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
698 they are encountered at the top level, so we'll do that too. */
699
700 /* If the first character is '^', set the negation flag and skip it. */
701
702 if (ptr + 1 >= patternEnd) {
703 *errorCodePtr = ERR6;
704 return false;
705 }
706
707 if (ptr[1] == '^') {
708 negateClass = true;
709 ++ptr;
710 } else
711 negateClass = false;
712
713 /* Keep a count of chars with values < 256 so that we can optimize the case
714 of just a single character (as long as it's < 256). For higher valued UTF-8
715 characters, we don't yet do any optimization. */
716
717 classCharCount = 0;
718 classLastChar = -1;
719
720 class_utf8 = false; /* No chars >= 256 */
721 class_utf8data = code + LINK_SIZE + 34; /* For UTF-8 items */
722
723 /* Initialize the 32-char bit map to all zeros. We have to build the
724 map in a temporary bit of store, in case the class contains only 1
725 character (< 256), because in that case the compiled code doesn't use the
726 bit map. */
727
728 memset(s: classbits, c: 0, n: 32 * sizeof(unsigned char));
729
730 /* Process characters until ] is reached. The first pass
731 through the regex checked the overall syntax, so we don't need to be very
732 strict here. At the start of the loop, c contains the first byte of the
733 character. */
734
735 while ((++ptr < patternEnd) && (c = *ptr) != ']') {
736 /* Backslash may introduce a single character, or it may introduce one
737 of the specials, which just set a flag. Escaped items are checked for
738 validity in the pre-compiling pass. The sequence \b is a special case.
739 Inside a class (and only there) it is treated as backspace. Elsewhere
740 it marks a word boundary. Other escapes have preset maps ready to
741 or into the one we are building. We assume they have more than one
742 character in them, so set classCharCount bigger than one. */
743
744 if (c == '\\') {
745 c = checkEscape(ptrPtr: &ptr, patternEnd, errorCodePtr, bracount: cd.numCapturingBrackets, isClass: true);
746 if (c < 0) {
747 classCharCount += 2; /* Greater than 1 is what matters */
748 switch (-c) {
749 case ESC_d:
750 for (c = 0; c < 32; c++)
751 classbits[c] |= classBitmapForChar(c: c + cbit_digit);
752 continue;
753
754 case ESC_D:
755 shouldFlipNegation = true;
756 for (c = 0; c < 32; c++)
757 classbits[c] |= ~classBitmapForChar(c: c + cbit_digit);
758 continue;
759
760 case ESC_w:
761 for (c = 0; c < 32; c++)
762 classbits[c] |= classBitmapForChar(c: c + cbit_word);
763 continue;
764
765 case ESC_W:
766 shouldFlipNegation = true;
767 for (c = 0; c < 32; c++)
768 classbits[c] |= ~classBitmapForChar(c: c + cbit_word);
769 continue;
770
771 case ESC_s:
772 for (c = 0; c < 32; c++)
773 classbits[c] |= classBitmapForChar(c: c + cbit_space);
774 continue;
775
776 case ESC_S:
777 shouldFlipNegation = true;
778 for (c = 0; c < 32; c++)
779 classbits[c] |= ~classBitmapForChar(c: c + cbit_space);
780 continue;
781
782 /* Unrecognized escapes are faulted if PCRE is running in its
783 strict mode. By default, for compatibility with Perl, they are
784 treated as literals. */
785
786 default:
787 c = *ptr; /* The final character */
788 classCharCount -= 2; /* Undo the default count from above */
789 }
790 }
791
792 /* Fall through if we have a single character (c >= 0). This may be
793 > 256 in UTF-8 mode. */
794
795 } /* End of backslash handling */
796
797 /* A single character may be followed by '-' to form a range. However,
798 Perl does not permit ']' to be the end of the range. A '-' character
799 here is treated as a literal. */
800
801 if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']') {
802 ptr += 2;
803
804 int d = *ptr;
805
806 /* The second part of a range can be a single-character escape, but
807 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
808 in such circumstances. */
809
810 if (d == '\\') {
811 const UChar* oldptr = ptr;
812 d = checkEscape(ptrPtr: &ptr, patternEnd, errorCodePtr, bracount: cd.numCapturingBrackets, isClass: true);
813
814 /* \X is literal X; any other special means the '-' was literal */
815 if (d < 0) {
816 ptr = oldptr - 2;
817 goto LONE_SINGLE_CHARACTER; /* A few lines below */
818 }
819 }
820
821 /* The check that the two values are in the correct order happens in
822 the pre-pass. Optimize one-character ranges */
823
824 if (d == c)
825 goto LONE_SINGLE_CHARACTER; /* A few lines below */
826
827 /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
828 matching, we have to use an XCLASS with extra data items. Caseless
829 matching for characters > 127 is available only if UCP support is
830 available. */
831
832 if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) {
833 class_utf8 = true;
834
835 /* With UCP support, we can find the other case equivalents of
836 the relevant characters. There may be several ranges. Optimize how
837 they fit with the basic range. */
838
839 if (options & IgnoreCaseOption) {
840 int occ, ocd;
841 int cc = c;
842 int origd = d;
843 while (getOthercaseRange(cptr: &cc, d: origd, ocptr: &occ, odptr: &ocd)) {
844 if (occ >= c && ocd <= d)
845 continue; /* Skip embedded ranges */
846
847 if (occ < c && ocd >= c - 1) /* Extend the basic range */
848 { /* if there is overlap, */
849 c = occ; /* noting that if occ < c */
850 continue; /* we can't have ocd > d */
851 } /* because a subrange is */
852 if (ocd > d && occ <= d + 1) /* always shorter than */
853 { /* the basic range. */
854 d = ocd;
855 continue;
856 }
857
858 if (occ == ocd)
859 *class_utf8data++ = XCL_SINGLE;
860 else {
861 *class_utf8data++ = XCL_RANGE;
862 class_utf8data += encodeUTF8(cvalue: occ, buffer: class_utf8data);
863 }
864 class_utf8data += encodeUTF8(cvalue: ocd, buffer: class_utf8data);
865 }
866 }
867
868 /* Now record the original range, possibly modified for UCP caseless
869 overlapping ranges. */
870
871 *class_utf8data++ = XCL_RANGE;
872 class_utf8data += encodeUTF8(cvalue: c, buffer: class_utf8data);
873 class_utf8data += encodeUTF8(cvalue: d, buffer: class_utf8data);
874
875 /* With UCP support, we are done. Without UCP support, there is no
876 caseless matching for UTF-8 characters > 127; we can use the bit map
877 for the smaller ones. */
878
879 continue; /* With next character in the class */
880 }
881
882 /* We use the bit map for all cases when not in UTF-8 mode; else
883 ranges that lie entirely within 0-127 when there is UCP support; else
884 for partial ranges without UCP support. */
885
886 for (; c <= d; c++) {
887 classbits[c/8] |= (1 << (c&7));
888 if (options & IgnoreCaseOption) {
889 int uc = flipCase(c);
890 classbits[uc/8] |= (1 << (uc&7));
891 }
892 classCharCount++; /* in case a one-char range */
893 classLastChar = c;
894 }
895
896 continue; /* Go get the next char in the class */
897 }
898
899 /* Handle a lone single character - we can get here for a normal
900 non-escape char, or after \ that introduces a single character or for an
901 apparent range that isn't. */
902
903 LONE_SINGLE_CHARACTER:
904
905 /* Handle a character that cannot go in the bit map */
906
907 if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) {
908 class_utf8 = true;
909 *class_utf8data++ = XCL_SINGLE;
910 class_utf8data += encodeUTF8(cvalue: c, buffer: class_utf8data);
911
912 if (options & IgnoreCaseOption) {
913 int othercase;
914 if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0) {
915 *class_utf8data++ = XCL_SINGLE;
916 class_utf8data += encodeUTF8(cvalue: othercase, buffer: class_utf8data);
917 }
918 }
919 } else {
920 /* Handle a single-byte character */
921 classbits[c/8] |= (1 << (c&7));
922 if (options & IgnoreCaseOption) {
923 c = flipCase(c);
924 classbits[c/8] |= (1 << (c&7));
925 }
926 classCharCount++;
927 classLastChar = c;
928 }
929 }
930
931 /* If classCharCount is 1, we saw precisely one character whose value is
932 less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
933 can optimize the negative case only if there were no characters >= 128
934 because OP_NOT and the related opcodes like OP_NOTSTAR operate on
935 single-bytes only. This is an historical hangover. Maybe one day we can
936 tidy these opcodes to handle multi-byte characters.
937
938 The optimization throws away the bit map. We turn the item into a
939 1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
940 that OP_NOT does not support multibyte characters. In the positive case, it
941 can cause firstByte to be set. Otherwise, there can be no first char if
942 this item is first, whatever repeat count may follow. In the case of
943 reqByte, save the previous value for reinstating. */
944
945 if (classCharCount == 1 && (!class_utf8 && (!negateClass || classLastChar < 128))) {
946 zeroReqByte = reqByte;
947
948 /* The OP_NOT opcode works on one-byte characters only. */
949
950 if (negateClass) {
951 if (firstByte == REQ_UNSET)
952 firstByte = REQ_NONE;
953 zeroFirstByte = firstByte;
954 *code++ = OP_NOT;
955 *code++ = classLastChar;
956 break;
957 }
958
959 /* For a single, positive character, get the value into c, and
960 then we can handle this with the normal one-character code. */
961
962 c = classLastChar;
963 goto NORMAL_CHAR;
964 } /* End of 1-char optimization */
965
966 /* The general case - not the one-char optimization. If this is the first
967 thing in the branch, there can be no first char setting, whatever the
968 repeat count. Any reqByte setting must remain unchanged after any kind of
969 repeat. */
970
971 if (firstByte == REQ_UNSET) firstByte = REQ_NONE;
972 zeroFirstByte = firstByte;
973 zeroReqByte = reqByte;
974
975 /* If there are characters with values > 255, we have to compile an
976 extended class, with its own opcode. If there are no characters < 256,
977 we can omit the bitmap. */
978
979 if (class_utf8 && !shouldFlipNegation) {
980 *class_utf8data++ = XCL_END; /* Marks the end of extra data */
981 *code++ = OP_XCLASS;
982 code += LINK_SIZE;
983 *code = negateClass? XCL_NOT : 0;
984
985 /* If the map is required, install it, and move on to the end of
986 the extra data */
987
988 if (classCharCount > 0) {
989 *code++ |= XCL_MAP;
990 memcpy(dest: code, src: classbits, n: 32);
991 code = class_utf8data;
992 }
993
994 /* If the map is not required, slide down the extra data. */
995
996 else {
997 int len = class_utf8data - (code + 33);
998 memmove(dest: code + 1, src: code + 33, n: len);
999 code += len + 1;
1000 }
1001
1002 /* Now fill in the complete length of the item */
1003
1004 putLinkValue(opcodePtr: previous + 1, value: code - previous);
1005 break; /* End of class handling */
1006 }
1007
1008 /* If there are no characters > 255, negate the 32-byte map if necessary,
1009 and copy it into the code vector. If this is the first thing in the branch,
1010 there can be no first char setting, whatever the repeat count. Any reqByte
1011 setting must remain unchanged after any kind of repeat. */
1012
1013 *code++ = (negateClass == shouldFlipNegation) ? OP_CLASS : OP_NCLASS;
1014 if (negateClass)
1015 for (c = 0; c < 32; c++)
1016 code[c] = ~classbits[c];
1017 else
1018 memcpy(dest: code, src: classbits, n: 32);
1019 code += 32;
1020 break;
1021 }
1022
1023 /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
1024 has been tested above. */
1025
1026 case '{':
1027 if (!isQuantifier)
1028 goto NORMAL_CHAR;
1029 ptr = readRepeatCounts(p: ptr + 1, minp: &repeatMin, maxp: &repeat_max, errorCodePtr);
1030 if (*errorCodePtr)
1031 goto FAILED;
1032 goto REPEAT;
1033
1034 case '*':
1035 repeatMin = 0;
1036 repeat_max = -1;
1037 goto REPEAT;
1038
1039 case '+':
1040 repeatMin = 1;
1041 repeat_max = -1;
1042 goto REPEAT;
1043
1044 case '?':
1045 repeatMin = 0;
1046 repeat_max = 1;
1047
1048 REPEAT:
1049 if (!previous) {
1050 *errorCodePtr = ERR9;
1051 goto FAILED;
1052 }
1053
1054 if (repeatMin == 0) {
1055 firstByte = zeroFirstByte; /* Adjust for zero repeat */
1056 reqByte = zeroReqByte; /* Ditto */
1057 }
1058
1059 /* Remember whether this is a variable length repeat */
1060
1061 reqvary = (repeatMin == repeat_max) ? 0 : REQ_VARY;
1062
1063 opType = 0; /* Default single-char op codes */
1064
1065 /* Save start of previous item, in case we have to move it up to make space
1066 for an inserted OP_ONCE for the additional '+' extension. */
1067 /* FIXME: Probably don't need this because we don't use OP_ONCE. */
1068
1069 tempcode = previous;
1070
 /* If the next character is '?' this is a minimizing (non-greedy) repeat,
 and we change the repeat type to the non-default value; otherwise the
 repeat is greedy. Unlike the original PCRE, neither a possessive '+'
 quantifier nor a PCRE_UNGREEDY option is checked for here. */
1076
1077 if (safelyCheckNextChar(ptr, patternEnd, expected: '?')) {
1078 repeatType = 1;
1079 ptr++;
1080 } else
1081 repeatType = 0;
1082
 /* If previous was a character match, abolish the item and generate a
 repeat item instead. If a char item has a minimum of more than one, ensure
 that it is set in reqByte - it might not be if a sequence such as x{3} is
 the first thing in a branch because the x will have gone into firstByte
 instead. */
1088
1089 if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {
1090 /* Deal with UTF-8 characters that take up more than one byte. It's
1091 easier to write this out separately than try to macrify it. Use c to
1092 hold the length of the character in bytes, plus 0x80 to flag that it's a
1093 length rather than a small character. */
1094
1095 if (code[-1] & 0x80) {
1096 unsigned char *lastchar = code - 1;
1097 while((*lastchar & 0xc0) == 0x80)
1098 lastchar--;
1099 c = code - lastchar; /* Length of UTF-8 character */
1100 memcpy(dest: utf8_char, src: lastchar, n: c); /* Save the char */
1101 c |= 0x80; /* Flag c as a length */
1102 }
1103 else {
1104 c = code[-1];
1105 if (repeatMin > 1)
1106 reqByte = c | reqCaseOpt | cd.reqVaryOpt;
1107 }
1108
1109 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1110 }
1111
1112 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) {
1113 c = previous[1];
1114 if (repeatMin > 1)
1115 reqByte = c | reqCaseOpt | cd.reqVaryOpt;
1116 goto OUTPUT_SINGLE_REPEAT;
1117 }
1118
 /* If previous was a single negated character ([^a] or similar), we use
 one of the special opcodes, replacing it. The code is shared with single-
 character repeats by setting opType to add a suitable offset into
 repeatType. OP_NOT is currently used only for single-byte chars. */
1123
1124 else if (*previous == OP_NOT) {
1125 opType = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1126 c = previous[1];
1127 goto OUTPUT_SINGLE_REPEAT;
1128 }
1129
1130 /* If previous was a character type match (\d or similar), abolish it and
1131 create a suitable repeat item. The code is shared with single-character
1132 repeats by setting opType to add a suitable offset into repeatType. */
1133
1134 else if (*previous <= OP_NOT_NEWLINE) {
1135 opType = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1136 c = *previous;
1137
1138 OUTPUT_SINGLE_REPEAT:
1139 int prop_type = -1;
1140 int prop_value = -1;
1141
1142 unsigned char* oldcode = code;
1143 code = previous; /* Usually overwrite previous item */
1144
1145 /* If the maximum is zero then the minimum must also be zero; Perl allows
1146 this case, so we do too - by simply omitting the item altogether. */
1147
1148 if (repeat_max == 0)
1149 goto END_REPEAT;
1150
1151 /* Combine the opType with the repeatType */
1152
1153 repeatType += opType;
1154
1155 /* A minimum of zero is handled either as the special case * or ?, or as
1156 an UPTO, with the maximum given. */
1157
1158 if (repeatMin == 0) {
1159 if (repeat_max == -1)
1160 *code++ = OP_STAR + repeatType;
1161 else if (repeat_max == 1)
1162 *code++ = OP_QUERY + repeatType;
1163 else {
1164 *code++ = OP_UPTO + repeatType;
1165 put2ByteValueAndAdvance(opcodePtr&: code, value: repeat_max);
1166 }
1167 }
1168
 /* A repeat minimum of 1 is optimized into some special cases. If the
 maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
 left in place and, if the maximum is greater than 1, we use OP_UPTO with
 one less than the maximum. */
1173
1174 else if (repeatMin == 1) {
1175 if (repeat_max == -1)
1176 *code++ = OP_PLUS + repeatType;
1177 else {
1178 code = oldcode; /* leave previous item in place */
1179 if (repeat_max == 1)
1180 goto END_REPEAT;
1181 *code++ = OP_UPTO + repeatType;
1182 put2ByteValueAndAdvance(opcodePtr&: code, value: repeat_max - 1);
1183 }
1184 }
1185
1186 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1187 handled as an EXACT followed by an UPTO. */
1188
1189 else {
1190 *code++ = OP_EXACT + opType; /* NB EXACT doesn't have repeatType */
1191 put2ByteValueAndAdvance(opcodePtr&: code, value: repeatMin);
1192
1193 /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
1194 we have to insert the character for the previous code. For a repeated
1195 Unicode property match, there are two extra bytes that define the
1196 required property. In UTF-8 mode, long characters have their length in
1197 c, with the 0x80 bit as a flag. */
1198
1199 if (repeat_max < 0) {
1200 if (c >= 128) {
1201 memcpy(dest: code, src: utf8_char, n: c & 7);
1202 code += c & 7;
1203 } else {
1204 *code++ = c;
1205 if (prop_type >= 0) {
1206 *code++ = prop_type;
1207 *code++ = prop_value;
1208 }
1209 }
1210 *code++ = OP_STAR + repeatType;
1211 }
1212
1213 /* Else insert an UPTO if the max is greater than the min, again
1214 preceded by the character, for the previously inserted code. */
1215
1216 else if (repeat_max != repeatMin) {
1217 if (c >= 128) {
1218 memcpy(dest: code, src: utf8_char, n: c & 7);
1219 code += c & 7;
1220 } else
1221 *code++ = c;
1222 if (prop_type >= 0) {
1223 *code++ = prop_type;
1224 *code++ = prop_value;
1225 }
1226 repeat_max -= repeatMin;
1227 *code++ = OP_UPTO + repeatType;
1228 put2ByteValueAndAdvance(opcodePtr&: code, value: repeat_max);
1229 }
1230 }
1231
1232 /* The character or character type itself comes last in all cases. */
1233
1234 if (c >= 128) {
1235 memcpy(dest: code, src: utf8_char, n: c & 7);
1236 code += c & 7;
1237 } else
1238 *code++ = c;
1239
1240 /* For a repeated Unicode property match, there are two extra bytes that
1241 define the required property. */
1242
1243 if (prop_type >= 0) {
1244 *code++ = prop_type;
1245 *code++ = prop_value;
1246 }
1247 }
1248
1249 /* If previous was a character class or a back reference, we put the repeat
1250 stuff after it, but just skip the item if the repeat was {0,0}. */
1251
1252 else if (*previous == OP_CLASS ||
1253 *previous == OP_NCLASS ||
1254 *previous == OP_XCLASS ||
1255 *previous == OP_REF)
1256 {
1257 if (repeat_max == 0) {
1258 code = previous;
1259 goto END_REPEAT;
1260 }
1261
1262 if (repeatMin == 0 && repeat_max == -1)
1263 *code++ = OP_CRSTAR + repeatType;
1264 else if (repeatMin == 1 && repeat_max == -1)
1265 *code++ = OP_CRPLUS + repeatType;
1266 else if (repeatMin == 0 && repeat_max == 1)
1267 *code++ = OP_CRQUERY + repeatType;
1268 else {
1269 *code++ = OP_CRRANGE + repeatType;
1270 put2ByteValueAndAdvance(opcodePtr&: code, value: repeatMin);
1271 if (repeat_max == -1)
1272 repeat_max = 0; /* 2-byte encoding for max */
1273 put2ByteValueAndAdvance(opcodePtr&: code, value: repeat_max);
1274 }
1275 }
1276
1277 /* If previous was a bracket group, we may have to replicate it in certain
1278 cases. */
1279
1280 else if (*previous >= OP_BRA) {
1281 int ketoffset = 0;
1282 int len = code - previous;
1283 unsigned char* bralink = NULL;
1284
1285 /* If the maximum repeat count is unlimited, find the end of the bracket
1286 by scanning through from the start, and compute the offset back to it
1287 from the current code pointer. There may be an OP_OPT setting following
1288 the final KET, so we can't find the end just by going back from the code
1289 pointer. */
1290
1291 if (repeat_max == -1) {
1292 const unsigned char* ket = previous;
1293 advanceToEndOfBracket(opcodePtr&: ket);
1294 ketoffset = code - ket;
1295 }
1296
1297 /* The case of a zero minimum is special because of the need to stick
1298 OP_BRAZERO in front of it, and because the group appears once in the
1299 data, whereas in other cases it appears the minimum number of times. For
1300 this reason, it is simplest to treat this case separately, as otherwise
1301 the code gets far too messy. There are several special subcases when the
1302 minimum is zero. */
1303
1304 if (repeatMin == 0) {
1305 /* If the maximum is also zero, we just omit the group from the output
1306 altogether. */
1307
1308 if (repeat_max == 0) {
1309 code = previous;
1310 goto END_REPEAT;
1311 }
1312
1313 /* If the maximum is 1 or unlimited, we just have to stick in the
1314 BRAZERO and do no more at this point. However, we do need to adjust
1315 any OP_RECURSE calls inside the group that refer to the group itself or
1316 any internal group, because the offset is from the start of the whole
1317 regex. Temporarily terminate the pattern while doing this. */
1318
1319 if (repeat_max <= 1) {
1320 *code = OP_END;
1321 memmove(dest: previous+1, src: previous, n: len);
1322 code++;
1323 *previous++ = OP_BRAZERO + repeatType;
1324 }
1325
1326 /* If the maximum is greater than 1 and limited, we have to replicate
1327 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1328 The first one has to be handled carefully because it's the original
1329 copy, which has to be moved up. The remainder can be handled by code
1330 that is common with the non-zero minimum case below. We have to
1331 adjust the value of repeat_max, since one less copy is required. */
1332
1333 else {
1334 *code = OP_END;
1335 memmove(dest: previous + 2 + LINK_SIZE, src: previous, n: len);
1336 code += 2 + LINK_SIZE;
1337 *previous++ = OP_BRAZERO + repeatType;
1338 *previous++ = OP_BRA;
1339
1340 /* We chain together the bracket offset fields that have to be
1341 filled in later when the ends of the brackets are reached. */
1342
1343 int offset = (!bralink) ? 0 : previous - bralink;
1344 bralink = previous;
1345 putLinkValueAllowZeroAndAdvance(opcodePtr&: previous, value: offset);
1346 }
1347
1348 repeat_max--;
1349 }
1350
1351 /* If the minimum is greater than zero, replicate the group as many
1352 times as necessary, and adjust the maximum to the number of subsequent
1353 copies that we need. If we set a first char from the group, and didn't
1354 set a required char, copy the latter from the former. */
1355
1356 else {
1357 if (repeatMin > 1) {
1358 if (didGroupSetFirstByte && reqByte < 0)
1359 reqByte = firstByte;
1360 for (int i = 1; i < repeatMin; i++) {
1361 memcpy(dest: code, src: previous, n: len);
1362 code += len;
1363 }
1364 }
1365 if (repeat_max > 0)
1366 repeat_max -= repeatMin;
1367 }
1368
1369 /* This code is common to both the zero and non-zero minimum cases. If
1370 the maximum is limited, it replicates the group in a nested fashion,
1371 remembering the bracket starts on a stack. In the case of a zero minimum,
1372 the first one was set up above. In all cases the repeat_max now specifies
1373 the number of additional copies needed. */
1374
1375 if (repeat_max >= 0) {
1376 for (int i = repeat_max - 1; i >= 0; i--) {
1377 *code++ = OP_BRAZERO + repeatType;
1378
1379 /* All but the final copy start a new nesting, maintaining the
1380 chain of brackets outstanding. */
1381
1382 if (i != 0) {
1383 *code++ = OP_BRA;
1384 int offset = (!bralink) ? 0 : code - bralink;
1385 bralink = code;
1386 putLinkValueAllowZeroAndAdvance(opcodePtr&: code, value: offset);
1387 }
1388
1389 memcpy(dest: code, src: previous, n: len);
1390 code += len;
1391 }
1392
1393 /* Now chain through the pending brackets, and fill in their length
1394 fields (which are holding the chain links pro tem). */
1395
1396 while (bralink) {
1397 int offset = code - bralink + 1;
1398 unsigned char* bra = code - offset;
1399 int oldlinkoffset = getLinkValueAllowZero(opcodePtr: bra + 1);
1400 bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkoffset;
1401 *code++ = OP_KET;
1402 putLinkValueAndAdvance(opcodePtr&: code, value: offset);
1403 putLinkValue(opcodePtr: bra + 1, value: offset);
1404 }
1405 }
1406
1407 /* If the maximum is unlimited, set a repeater in the final copy. We
1408 can't just offset backwards from the current code point, because we
1409 don't know if there's been an options resetting after the ket. The
1410 correct offset was computed above. */
1411
1412 else
1413 code[-ketoffset] = OP_KETRMAX + repeatType;
1414 }
1415
1416 // A quantifier after an assertion is mostly meaningless, but it
1417 // can nullify the assertion if it has a 0 minimum.
1418 else if (*previous == OP_ASSERT || *previous == OP_ASSERT_NOT) {
1419 if (repeatMin == 0) {
1420 code = previous;
1421 goto END_REPEAT;
1422 }
1423 }
1424
1425 /* Else there's some kind of shambles */
1426
1427 else {
1428 *errorCodePtr = ERR11;
1429 goto FAILED;
1430 }
1431
 /* In all cases we no longer have a previous item. We also set the
 "follows varying string" flag for subsequently encountered reqbytes if
 it isn't already set and we have just passed a varying length item. */
1435
1436 END_REPEAT:
1437 previous = NULL;
1438 cd.reqVaryOpt |= reqvary;
1439 break;
1440
1441 /* Start of nested bracket sub-expression, or comment or lookahead or
1442 lookbehind or option setting or condition. First deal with special things
1443 that can come after a bracket; all are introduced by ?, and the appearance
1444 of any of them means that this is not a referencing group. They were
1445 checked for validity in the first pass over the string, so we don't have to
1446 check for syntax errors here. */
1447
1448 case '(':
1449 skipBytes = 0;
1450
1451 if (*(++ptr) == '?') {
1452 switch (*(++ptr)) {
1453 case ':': /* Non-extracting bracket */
1454 bravalue = OP_BRA;
1455 ptr++;
1456 break;
1457
1458 case '=': /* Positive lookahead */
1459 bravalue = OP_ASSERT;
1460 ptr++;
1461 break;
1462
1463 case '!': /* Negative lookahead */
1464 bravalue = OP_ASSERT_NOT;
1465 ptr++;
1466 break;
1467
1468 /* Character after (? not specially recognized */
1469
1470 default:
1471 *errorCodePtr = ERR12;
1472 goto FAILED;
1473 }
1474 }
1475
1476 /* Else we have a referencing group; adjust the opcode. If the bracket
1477 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1478 arrange for the true number to follow later, in an OP_BRANUMBER item. */
1479
1480 else {
1481 if (++(*brackets) > EXTRACT_BASIC_MAX) {
1482 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1483 code[1 + LINK_SIZE] = OP_BRANUMBER;
1484 put2ByteValue(opcodePtr: code + 2 + LINK_SIZE, value: *brackets);
1485 skipBytes = 3;
1486 }
1487 else
1488 bravalue = OP_BRA + *brackets;
1489 }
1490
1491 /* Process nested bracketed re. We copy code into a non-variable
1492 in order to be able to pass its address because some compilers
1493 complain otherwise. Pass in a new setting for the ims options
1494 if they have changed. */
1495
1496 previous = code;
1497 *code = bravalue;
1498 tempcode = code;
1499 tempreqvary = cd.reqVaryOpt; /* Save value before bracket */
1500
1501 if (!compileBracket(
1502 options,
1503 brackets, /* Extracting bracket count */
1504 &tempcode, /* Where to put code (updated) */
1505 &ptr, /* Input pointer (updated) */
1506 patternEnd,
1507 errorCodePtr, /* Where to put an error message */
1508 skipBytes, /* Skip over OP_BRANUMBER */
1509 &subFirstByte, /* For possible first char */
1510 &subReqByte, /* For possible last char */
1511 cd)) /* Tables block */
1512 goto FAILED;
1513
1514 /* At the end of compiling, code is still pointing to the start of the
1515 group, while tempcode has been updated to point past the end of the group
1516 and any option resetting that may follow it. The pattern pointer (ptr)
1517 is on the bracket. */
1518
1519 /* Handle updating of the required and first characters. Update for normal
1520 brackets of all kinds, and conditions with two branches (see code above).
1521 If the bracket is followed by a quantifier with zero repeat, we have to
1522 back off. Hence the definition of zeroReqByte and zeroFirstByte outside the
1523 main loop so that they can be accessed for the back off. */
1524
1525 zeroReqByte = reqByte;
1526 zeroFirstByte = firstByte;
1527 didGroupSetFirstByte = false;
1528
1529 if (bravalue >= OP_BRA) {
1530 /* If we have not yet set a firstByte in this branch, take it from the
1531 subpattern, remembering that it was set here so that a repeat of more
1532 than one can replicate it as reqByte if necessary. If the subpattern has
1533 no firstByte, set "none" for the whole branch. In both cases, a zero
1534 repeat forces firstByte to "none". */
1535
1536 if (firstByte == REQ_UNSET) {
1537 if (subFirstByte >= 0) {
1538 firstByte = subFirstByte;
1539 didGroupSetFirstByte = true;
1540 }
1541 else
1542 firstByte = REQ_NONE;
1543 zeroFirstByte = REQ_NONE;
1544 }
1545
1546 /* If firstByte was previously set, convert the subpattern's firstByte
1547 into reqByte if there wasn't one, using the vary flag that was in
1548 existence beforehand. */
1549
1550 else if (subFirstByte >= 0 && subReqByte < 0)
1551 subReqByte = subFirstByte | tempreqvary;
1552
1553 /* If the subpattern set a required byte (or set a first byte that isn't
1554 really the first byte - see above), set it. */
1555
1556 if (subReqByte >= 0)
1557 reqByte = subReqByte;
1558 }
1559
1560 /* For a forward assertion, we take the reqByte, if set. This can be
1561 helpful if the pattern that follows the assertion doesn't set a different
1562 char. For example, it's useful for /(?=abcde).+/. We can't set firstByte
1563 for an assertion, however because it leads to incorrect effect for patterns
1564 such as /(?=a)a.+/ when the "real" "a" would then become a reqByte instead
1565 of a firstByte. This is overcome by a scan at the end if there's no
1566 firstByte, looking for an asserted first char. */
1567
1568 else if (bravalue == OP_ASSERT && subReqByte >= 0)
1569 reqByte = subReqByte;
1570
1571 /* Now update the main code pointer to the end of the group. */
1572
1573 code = tempcode;
1574
1575 /* Error if hit end of pattern */
1576
1577 if (ptr >= patternEnd || *ptr != ')') {
1578 *errorCodePtr = ERR14;
1579 goto FAILED;
1580 }
1581 break;
1582
1583 /* Check \ for being a real metacharacter; if not, fall through and handle
1584 it as a data character at the start of a string. Escape items are checked
1585 for validity in the pre-compiling pass. */
1586
1587 case '\\':
1588 c = checkEscape(ptrPtr: &ptr, patternEnd, errorCodePtr, bracount: cd.numCapturingBrackets, isClass: false);
1589
1590 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1591 are arranged to be the negation of the corresponding OP_values. For the
1592 back references, the values are ESC_REF plus the reference number. Only
1593 back references and those types that consume a character may be repeated.
1594 We can test for values between ESC_b and ESC_w for the latter; this may
1595 have to change if any new ones are ever created. */
1596
1597 if (c < 0) {
1598 /* For metasequences that actually match a character, we disable the
1599 setting of a first character if it hasn't already been set. */
1600
1601 if (firstByte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
1602 firstByte = REQ_NONE;
1603
1604 /* Set values to reset to if this is followed by a zero repeat. */
1605
1606 zeroFirstByte = firstByte;
1607 zeroReqByte = reqByte;
1608
1609 /* Back references are handled specially */
1610
1611 if (-c >= ESC_REF) {
1612 int number = -c - ESC_REF;
1613 previous = code;
1614 *code++ = OP_REF;
1615 put2ByteValueAndAdvance(opcodePtr&: code, value: number);
1616 }
1617
1618 /* For the rest, we can obtain the OP value by negating the escape
1619 value */
1620
1621 else {
1622 previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;
1623 *code++ = -c;
1624 }
1625 continue;
1626 }
1627
1628 /* Fall through. */
1629
1630 /* Handle a literal character. It is guaranteed not to be whitespace or #
1631 when the extended flag is set. If we are in UTF-8 mode, it may be a
1632 multi-byte literal character. */
1633
1634 default:
1635 NORMAL_CHAR:
1636
1637 previous = code;
1638
1639 if (c < 128) {
1640 mcLength = 1;
1641 mcbuffer[0] = c;
1642
1643 if ((options & IgnoreCaseOption) && (c | 0x20) >= 'a' && (c | 0x20) <= 'z') {
1644 *code++ = OP_ASCII_LETTER_IGNORING_CASE;
1645 *code++ = c | 0x20;
1646 } else {
1647 *code++ = OP_ASCII_CHAR;
1648 *code++ = c;
1649 }
1650 } else {
1651 mcLength = encodeUTF8(cvalue: c, buffer: mcbuffer);
1652
1653 *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CASE : OP_CHAR;
1654 for (c = 0; c < mcLength; c++)
1655 *code++ = mcbuffer[c];
1656 }
1657
1658 /* Set the first and required bytes appropriately. If no previous first
1659 byte, set it from this character, but revert to none on a zero repeat.
1660 Otherwise, leave the firstByte value alone, and don't change it on a zero
1661 repeat. */
1662
1663 if (firstByte == REQ_UNSET) {
1664 zeroFirstByte = REQ_NONE;
1665 zeroReqByte = reqByte;
1666
1667 /* If the character is more than one byte long, we can set firstByte
1668 only if it is not to be matched caselessly. */
1669
1670 if (mcLength == 1 || reqCaseOpt == 0) {
1671 firstByte = mcbuffer[0] | reqCaseOpt;
1672 if (mcLength != 1)
1673 reqByte = code[-1] | cd.reqVaryOpt;
1674 }
1675 else
1676 firstByte = reqByte = REQ_NONE;
1677 }
1678
 /* firstByte was previously set; we can set reqByte only if the length is
 1 or the matching is caseful. */
1681
1682 else {
1683 zeroFirstByte = firstByte;
1684 zeroReqByte = reqByte;
1685 if (mcLength == 1 || reqCaseOpt == 0)
1686 reqByte = code[-1] | reqCaseOpt | cd.reqVaryOpt;
1687 }
1688
1689 break; /* End of literal character handling */
1690 }
1691 } /* end of big loop */
1692
1693 /* Control never reaches here by falling through, only by a goto for all the
1694 error states. Pass back the position in the pattern so that it can be displayed
1695 to the user for diagnosing the error. */
1696
1697FAILED:
1698 *ptrPtr = ptr;
1699 return false;
1700}
1701
1702/*************************************************
1703* Compile sequence of alternatives *
1704*************************************************/
1705
1706/* On entry, ptr is pointing past the bracket character, but on return
1707it points to the closing bracket, or vertical bar, or end of string.
1708The code variable is pointing at the byte into which the BRA operator has been
1709stored. If the ims options are changed at the start (for a (?ims: group) or
1710during any branch, we need to insert an OP_OPT item at the start of every
1711following branch to ensure they get set correctly at run time, and also pass
1712the new options into every subsequent branch compile.
1713
Arguments:
 options option bits, including any changes for this subpattern
 brackets -> int containing the number of extracting brackets used
 codePtr -> the address of the current code pointer
 ptrPtr -> the address of the current pattern pointer
 patternEnd the end of the pattern; characters at or beyond it are not read
 errorCodePtr -> pointer to error code variable
 skipBytes skip this many bytes at start (for OP_BRANUMBER)
 firstbyteptr place to put the first required character, or a negative number
 reqbyteptr place to put the last required character, or a negative number
 cd points to the data block with tables pointers etc.
1724
1725Returns: true on success
1726*/
1727
1728static bool
1729compileBracket(int options, int* brackets, unsigned char** codePtr,
1730 const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int skipBytes,
1731 int* firstbyteptr, int* reqbyteptr, CompileData& cd)
1732{
1733 const UChar* ptr = *ptrPtr;
1734 unsigned char* code = *codePtr;
1735 unsigned char* lastBranch = code;
1736 unsigned char* start_bracket = code;
1737 int firstByte = REQ_UNSET;
1738 int reqByte = REQ_UNSET;
1739
1740 /* Offset is set zero to mark that this bracket is still open */
1741
1742 putLinkValueAllowZero(opcodePtr: code + 1, value: 0);
1743 code += 1 + LINK_SIZE + skipBytes;
1744
1745 /* Loop for each alternative branch */
1746
1747 while (true) {
1748 /* Now compile the branch */
1749
1750 int branchFirstByte;
1751 int branchReqByte;
1752 if (!compileBranch(options, brackets, codePtr: &code, ptrPtr: &ptr, patternEnd, errorCodePtr,
1753 firstbyteptr: &branchFirstByte, reqbyteptr: &branchReqByte, cd)) {
1754 *ptrPtr = ptr;
1755 return false;
1756 }
1757
1758 /* If this is the first branch, the firstByte and reqByte values for the
1759 branch become the values for the regex. */
1760
1761 if (*lastBranch != OP_ALT) {
1762 firstByte = branchFirstByte;
1763 reqByte = branchReqByte;
1764 }
1765
1766 /* If this is not the first branch, the first char and reqByte have to
1767 match the values from all the previous branches, except that if the previous
1768 value for reqByte didn't have REQ_VARY set, it can still match, and we set
1769 REQ_VARY for the regex. */
1770
1771 else {
1772 /* If we previously had a firstByte, but it doesn't match the new branch,
1773 we have to abandon the firstByte for the regex, but if there was previously
1774 no reqByte, it takes on the value of the old firstByte. */
1775
1776 if (firstByte >= 0 && firstByte != branchFirstByte) {
1777 if (reqByte < 0)
1778 reqByte = firstByte;
1779 firstByte = REQ_NONE;
1780 }
1781
1782 /* If we (now or from before) have no firstByte, a firstByte from the
1783 branch becomes a reqByte if there isn't a branch reqByte. */
1784
1785 if (firstByte < 0 && branchFirstByte >= 0 && branchReqByte < 0)
1786 branchReqByte = branchFirstByte;
1787
1788 /* Now ensure that the reqbytes match */
1789
1790 if ((reqByte & ~REQ_VARY) != (branchReqByte & ~REQ_VARY))
1791 reqByte = REQ_NONE;
1792 else
1793 reqByte |= branchReqByte; /* To "or" REQ_VARY */
1794 }
1795
1796 /* Reached end of expression, either ')' or end of pattern. Go back through
1797 the alternative branches and reverse the chain of offsets, with the field in
1798 the BRA item now becoming an offset to the first alternative. If there are
1799 no alternatives, it points to the end of the group. The length in the
1800 terminating ket is always the length of the whole bracketed item. If any of
1801 the ims options were changed inside the group, compile a resetting op-code
1802 following, except at the very end of the pattern. Return leaving the pointer
1803 at the terminating char. */
1804
1805 if (ptr >= patternEnd || *ptr != '|') {
1806 int length = code - lastBranch;
1807 do {
1808 int prevLength = getLinkValueAllowZero(opcodePtr: lastBranch + 1);
1809 putLinkValue(opcodePtr: lastBranch + 1, value: length);
1810 length = prevLength;
1811 lastBranch -= length;
1812 } while (length > 0);
1813
1814 /* Fill in the ket */
1815
1816 *code = OP_KET;
1817 putLinkValue(opcodePtr: code + 1, value: code - start_bracket);
1818 code += 1 + LINK_SIZE;
1819
1820 /* Set values to pass back */
1821
1822 *codePtr = code;
1823 *ptrPtr = ptr;
1824 *firstbyteptr = firstByte;
1825 *reqbyteptr = reqByte;
1826 return true;
1827 }
1828
1829 /* Another branch follows; insert an "or" node. Its length field points back
1830 to the previous branch while the bracket remains open. At the end the chain
1831 is reversed. It's done like this so that the start of the bracket has a
1832 zero offset until it is closed, making it possible to detect recursion. */
1833
1834 *code = OP_ALT;
1835 putLinkValue(opcodePtr: code + 1, value: code - lastBranch);
1836 lastBranch = code;
1837 code += 1 + LINK_SIZE;
1838 ptr++;
1839 }
1840 ASSERT_NOT_REACHED();
1841}
1842
1843/*************************************************
1844* Check for anchored expression *
1845*************************************************/
1846
1847/* Try to find out if this is an anchored regular expression. Consider each
1848alternative branch. If they all start OP_CIRC, or with a bracket
1849all of whose alternatives start OP_CIRC (recurse ad lib), then
1850it's anchored.
1851
Arguments:
 code points to start of expression (the bracket)
1858*/
1859
1860static bool branchIsAnchored(const unsigned char* code)
1861{
1862 const unsigned char* scode = firstSignificantOpcode(code);
1863 int op = *scode;
1864
1865 /* Brackets */
1866 if (op >= OP_BRA || op == OP_ASSERT)
1867 return bracketIsAnchored(code: scode);
1868
1869 /* Check for explicit anchoring */
1870 return op == OP_CIRC;
1871}
1872
1873static bool bracketIsAnchored(const unsigned char* code)
1874{
1875 do {
1876 if (!branchIsAnchored(code: code + 1 + LINK_SIZE))
1877 return false;
1878 code += getLinkValue(opcodePtr: code + 1);
1879 } while (*code == OP_ALT); /* Loop for each alternative */
1880 return true;
1881}
1882
1883/*************************************************
1884* Check for starting with ^ or .* *
1885*************************************************/
1886
1887/* This is called to find out if every branch starts with ^ or .* so that
1888"first char" processing can be done to speed things up in multiline
1889matching and for non-DOTALL patterns that start with .* (which must start at
1890the beginning or after \n)
1891
1892Except when the .* appears inside capturing parentheses, and there is a
1893subsequent back reference to those parentheses. By keeping a bitmap of the
1894first 31 back references, we can catch some of the more common cases more
1895precisely; all the greater back references share a single bit.
1896
1897Arguments:
1898 code points to start of expression (the bracket)
1899 captureMap a bitmap of which brackets we are inside while testing; this
1900 handles up to substring 31; all brackets after that share
1901 the zero bit
1902 backrefMap the back reference bitmap
1903*/
1904
1905static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1906{
1907 const unsigned char* scode = firstSignificantOpcode(code);
1908 int op = *scode;
1909
1910 /* Capturing brackets */
1911 if (op > OP_BRA) {
1912 int captureNum = op - OP_BRA;
1913 if (captureNum > EXTRACT_BASIC_MAX)
1914 captureNum = get2ByteValue(opcodePtr: scode + 2 + LINK_SIZE);
1915 int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1;
1916 return bracketNeedsLineStart(code: scode, captureMap: captureMap | bracketMask, backrefMap);
1917 }
1918
1919 /* Other brackets */
1920 if (op == OP_BRA || op == OP_ASSERT)
1921 return bracketNeedsLineStart(code: scode, captureMap, backrefMap);
1922
1923 /* .* means "start at start or after \n" if it isn't in brackets that
1924 may be referenced. */
1925
1926 if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1927 return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap);
1928
1929 /* Explicit ^ */
1930 return op == OP_CIRC || op == OP_BOL;
1931}
1932
1933static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1934{
1935 do {
1936 if (!branchNeedsLineStart(code: code + 1 + LINK_SIZE, captureMap, backrefMap))
1937 return false;
1938 code += getLinkValue(opcodePtr: code + 1);
1939 } while (*code == OP_ALT); /* Loop for each alternative */
1940 return true;
1941}
1942
1943/*************************************************
1944* Check for asserted fixed first char *
1945*************************************************/
1946
1947/* During compilation, the "first char" settings from forward assertions are
1948discarded, because they can cause conflicts with actual literals that follow.
1949However, if we end up without a first char setting for an unanchored pattern,
1950it is worth scanning the regex to see if there is an initial asserted first
1951char. If all branches start with the same asserted char, or with a bracket all
1952of whose alternatives start with the same asserted char (recurse ad lib), then
1953we return that char, otherwise -1.
1954
Arguments:
  code       points to start of expression (the bracket)
  inassert   true if in an assertion
1959
1960Returns: -1 or the fixed first char
1961*/
1962
1963static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
1964{
1965 const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);
1966 int op = *scode;
1967
1968 if (op >= OP_BRA)
1969 op = OP_BRA;
1970
1971 switch (op) {
1972 default:
1973 return -1;
1974
1975 case OP_BRA:
1976 case OP_ASSERT:
1977 return bracketFindFirstAssertedCharacter(code: scode, inassert: op == OP_ASSERT);
1978
1979 case OP_EXACT:
1980 scode += 2;
1981 /* Fall through */
1982
1983 case OP_CHAR:
1984 case OP_CHAR_IGNORING_CASE:
1985 case OP_ASCII_CHAR:
1986 case OP_ASCII_LETTER_IGNORING_CASE:
1987 case OP_PLUS:
1988 case OP_MINPLUS:
1989 if (!inassert)
1990 return -1;
1991 return scode[1];
1992 }
1993}
1994
1995static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
1996{
1997 int c = -1;
1998 do {
1999 int d = branchFindFirstAssertedCharacter(code: code + 1 + LINK_SIZE, inassert);
2000 if (d < 0)
2001 return -1;
2002 if (c < 0)
2003 c = d;
2004 else if (c != d)
2005 return -1;
2006 code += getLinkValue(opcodePtr: code + 1);
2007 } while (*code == OP_ALT);
2008 return c;
2009}
2010
2011static inline int multiplyWithOverflowCheck(int a, int b)
2012{
2013 if (!a || !b)
2014 return 0;
2015 if (a > MAX_PATTERN_SIZE / b)
2016 return -1;
2017 return a * b;
2018}
2019
2020static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
2021 CompileData& cd, ErrorCode& errorcode)
2022{
2023 /* Make a pass over the pattern to compute the
2024 amount of store required to hold the compiled code. This does not have to be
2025 perfect as long as errors are overestimates. */
2026
2027 if (patternLength > MAX_PATTERN_SIZE) {
2028 errorcode = ERR16;
2029 return -1;
2030 }
2031
2032 int length = 1 + LINK_SIZE; /* For initial BRA plus length */
2033 int branch_extra = 0;
2034 int lastitemlength = 0;
2035 unsigned brastackptr = 0;
2036 int brastack[BRASTACK_SIZE];
2037 unsigned char bralenstack[BRASTACK_SIZE];
2038 int bracount = 0;
2039
2040 const UChar* ptr = (const UChar*)(pattern - 1);
2041 const UChar* patternEnd = (const UChar*)(pattern + patternLength);
2042
2043 while (++ptr < patternEnd) {
2044 int minRepeats = 0, maxRepeats = 0;
2045 int c = *ptr;
2046
2047 switch (c) {
2048 /* A backslashed item may be an escaped data character or it may be a
2049 character type. */
2050
2051 case '\\':
2052 c = checkEscape(ptrPtr: &ptr, patternEnd, errorCodePtr: &errorcode, bracount: cd.numCapturingBrackets, isClass: false);
2053 if (errorcode != 0)
2054 return -1;
2055
2056 lastitemlength = 1; /* Default length of last item for repeats */
2057
2058 if (c >= 0) { /* Data character */
2059 length += 2; /* For a one-byte character */
2060
2061 if (c > 127) {
2062 int i;
2063 for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
2064 if (c <= jsc_pcre_utf8_table1[i]) break;
2065 length += i;
2066 lastitemlength += i;
2067 }
2068
2069 continue;
2070 }
2071
2072 /* Other escapes need one byte */
2073
2074 length++;
2075
2076 /* A back reference needs an additional 2 bytes, plus either one or 5
2077 bytes for a repeat. We also need to keep the value of the highest
2078 back reference. */
2079
2080 if (c <= -ESC_REF) {
2081 int refnum = -c - ESC_REF;
2082 cd.backrefMap |= (refnum < 32) ? (1 << refnum) : 1;
2083 if (refnum > cd.topBackref)
2084 cd.topBackref = refnum;
2085 length += 2; /* For single back reference */
2086 if (safelyCheckNextChar(ptr, patternEnd, expected: '{') && isCountedRepeat(p: ptr + 2, patternEnd)) {
2087 ptr = readRepeatCounts(p: ptr + 2, minp: &minRepeats, maxp: &maxRepeats, errorCodePtr: &errorcode);
2088 if (errorcode)
2089 return -1;
2090 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2091 (minRepeats == 1 && maxRepeats == -1))
2092 length++;
2093 else
2094 length += 5;
2095 if (safelyCheckNextChar(ptr, patternEnd, expected: '?'))
2096 ptr++;
2097 }
2098 }
2099 continue;
2100
2101 case '^': /* Single-byte metacharacters */
2102 case '.':
2103 case '$':
2104 length++;
2105 lastitemlength = 1;
2106 continue;
2107
2108 case '*': /* These repeats won't be after brackets; */
2109 case '+': /* those are handled separately */
2110 case '?':
2111 length++;
2112 goto POSSESSIVE;
2113
2114 /* This covers the cases of braced repeats after a single char, metachar,
2115 class, or back reference. */
2116
2117 case '{':
2118 if (!isCountedRepeat(p: ptr + 1, patternEnd))
2119 goto NORMAL_CHAR;
2120 ptr = readRepeatCounts(p: ptr + 1, minp: &minRepeats, maxp: &maxRepeats, errorCodePtr: &errorcode);
2121 if (errorcode != 0)
2122 return -1;
2123
2124 /* These special cases just insert one extra opcode */
2125
2126 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2127 (minRepeats == 1 && maxRepeats == -1))
2128 length++;
2129
2130 /* These cases might insert additional copies of a preceding character. */
2131
2132 else {
2133 if (minRepeats != 1) {
2134 length -= lastitemlength; /* Uncount the original char or metachar */
2135 if (minRepeats > 0)
2136 length += 3 + lastitemlength;
2137 }
2138 length += lastitemlength + ((maxRepeats > 0) ? 3 : 1);
2139 }
2140
2141 if (safelyCheckNextChar(ptr, patternEnd, expected: '?'))
2142 ptr++; /* Needs no extra length */
2143
2144 POSSESSIVE: /* Test for possessive quantifier */
2145 if (safelyCheckNextChar(ptr, patternEnd, expected: '+')) {
2146 ptr++;
2147 length += 2 + 2 * LINK_SIZE; /* Allow for atomic brackets */
2148 }
2149 continue;
2150
2151 /* An alternation contains an offset to the next branch or ket. If any ims
2152 options changed in the previous branch(es), and/or if we are in a
2153 lookbehind assertion, extra space will be needed at the start of the
2154 branch. This is handled by branch_extra. */
2155
2156 case '|':
2157 if (brastackptr == 0)
2158 cd.needOuterBracket = true;
2159 length += 1 + LINK_SIZE + branch_extra;
2160 continue;
2161
2162 /* A character class uses 33 characters provided that all the character
2163 values are less than 256. Otherwise, it uses a bit map for low valued
2164 characters, and individual items for others. Don't worry about character
2165 types that aren't allowed in classes - they'll get picked up during the
2166 compile. A character class that contains only one single-byte character
2167 uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
2168 where we can. (In UTF-8 mode we can do this only for chars < 128.) */
2169
2170 case '[': {
2171 int class_optcount;
2172 if (*(++ptr) == '^') {
2173 class_optcount = 10; /* Greater than one */
2174 ptr++;
2175 }
2176 else
2177 class_optcount = 0;
2178
2179 bool class_utf8 = false;
2180
2181 for (; ptr < patternEnd && *ptr != ']'; ++ptr) {
2182 /* Check for escapes */
2183
2184 if (*ptr == '\\') {
2185 c = checkEscape(ptrPtr: &ptr, patternEnd, errorCodePtr: &errorcode, bracount: cd.numCapturingBrackets, isClass: true);
2186 if (errorcode != 0)
2187 return -1;
2188
2189 /* Handle escapes that turn into characters */
2190
2191 if (c >= 0)
2192 goto NON_SPECIAL_CHARACTER;
2193
2194 /* Escapes that are meta-things. The normal ones just affect the
2195 bit map, but Unicode properties require an XCLASS extended item. */
2196
2197 else
2198 class_optcount = 10; /* \d, \s etc; make sure > 1 */
2199 }
2200
2201 /* Anything else increments the possible optimization count. We have to
2202 detect ranges here so that we can compute the number of extra ranges for
2203 caseless wide characters when UCP support is available. If there are wide
2204 characters, we are going to have to use an XCLASS, even for single
2205 characters. */
2206
2207 else {
2208 c = *ptr;
2209
2210 /* Come here from handling \ above when it escapes to a char value */
2211
2212 NON_SPECIAL_CHARACTER:
2213 class_optcount++;
2214
2215 int d = -1;
2216 if (safelyCheckNextChar(ptr, patternEnd, expected: '-')) {
2217 const UChar* hyptr = ptr++;
2218 if (safelyCheckNextChar(ptr, patternEnd, expected: '\\')) {
2219 ptr++;
2220 d = checkEscape(ptrPtr: &ptr, patternEnd, errorCodePtr: &errorcode, bracount: cd.numCapturingBrackets, isClass: true);
2221 if (errorcode != 0)
2222 return -1;
2223 }
2224 else if ((ptr + 1 < patternEnd) && ptr[1] != ']')
2225 d = *++ptr;
2226 if (d < 0)
2227 ptr = hyptr; /* go back to hyphen as data */
2228 }
2229
2230 /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
2231 127 for caseless matching, we will need to use an XCLASS. */
2232
2233 if (d >= 0) {
2234 class_optcount = 10; /* Ensure > 1 */
2235 if (d < c) {
2236 errorcode = ERR8;
2237 return -1;
2238 }
2239
2240 if ((d > 255 || (ignoreCase && d > 127))) {
2241 unsigned char buffer[6];
2242 if (!class_utf8) /* Allow for XCLASS overhead */
2243 {
2244 class_utf8 = true;
2245 length += LINK_SIZE + 2;
2246 }
2247
2248 /* If we have UCP support, find out how many extra ranges are
2249 needed to map the other case of characters within this range. We
2250 have to mimic the range optimization here, because extending the
2251 range upwards might push d over a boundary that makes it use
2252 another byte in the UTF-8 representation. */
2253
2254 if (ignoreCase) {
2255 int occ, ocd;
2256 int cc = c;
2257 int origd = d;
2258 while (getOthercaseRange(cptr: &cc, d: origd, ocptr: &occ, odptr: &ocd)) {
2259 if (occ >= c && ocd <= d)
2260 continue; /* Skip embedded */
2261
2262 if (occ < c && ocd >= c - 1) /* Extend the basic range */
2263 { /* if there is overlap, */
2264 c = occ; /* noting that if occ < c */
2265 continue; /* we can't have ocd > d */
2266 } /* because a subrange is */
2267 if (ocd > d && occ <= d + 1) /* always shorter than */
2268 { /* the basic range. */
2269 d = ocd;
2270 continue;
2271 }
2272
2273 /* An extra item is needed */
2274
2275 length += 1 + encodeUTF8(cvalue: occ, buffer) +
2276 ((occ == ocd) ? 0 : encodeUTF8(cvalue: ocd, buffer));
2277 }
2278 }
2279
2280 /* The length of the (possibly extended) range */
2281
2282 length += 1 + encodeUTF8(cvalue: c, buffer) + encodeUTF8(cvalue: d, buffer);
2283 }
2284
2285 }
2286
2287 /* We have a single character. There is nothing to be done unless we
2288 are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
2289 allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
2290 support. */
2291
2292 else {
2293 if ((c > 255 || (ignoreCase && c > 127))) {
2294 unsigned char buffer[6];
2295 class_optcount = 10; /* Ensure > 1 */
2296 if (!class_utf8) /* Allow for XCLASS overhead */
2297 {
2298 class_utf8 = true;
2299 length += LINK_SIZE + 2;
2300 }
2301 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8(cvalue: c, buffer));
2302 }
2303 }
2304 }
2305 }
2306
2307 if (ptr >= patternEnd) { /* Missing terminating ']' */
2308 errorcode = ERR6;
2309 return -1;
2310 }
2311
2312 /* We can optimize when there was only one optimizable character.
2313 Note that this does not detect the case of a negated single character.
2314 In that case we do an incorrect length computation, but it's not a serious
2315 problem because the computed length is too large rather than too small. */
2316
2317 if (class_optcount == 1)
2318 goto NORMAL_CHAR;
2319
2320 /* Here, we handle repeats for the class opcodes. */
2321 {
2322 length += 33;
2323
2324 /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
2325 we also need extra for wrapping the whole thing in a sub-pattern. */
2326
2327 if (safelyCheckNextChar(ptr, patternEnd, expected: '{') && isCountedRepeat(p: ptr + 2, patternEnd)) {
2328 ptr = readRepeatCounts(p: ptr + 2, minp: &minRepeats, maxp: &maxRepeats, errorCodePtr: &errorcode);
2329 if (errorcode != 0)
2330 return -1;
2331 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2332 (minRepeats == 1 && maxRepeats == -1))
2333 length++;
2334 else
2335 length += 5;
2336 if (safelyCheckNextChar(ptr, patternEnd, expected: '+')) {
2337 ptr++;
2338 length += 2 + 2 * LINK_SIZE;
2339 } else if (safelyCheckNextChar(ptr, patternEnd, expected: '?'))
2340 ptr++;
2341 }
2342 }
2343 continue;
2344 }
2345
2346 /* Brackets may be genuine groups or special things */
2347
2348 case '(': {
2349 int branch_newextra = 0;
2350 int bracket_length = 1 + LINK_SIZE;
2351 bool capturing = false;
2352
2353 /* Handle special forms of bracket, which all start (? */
2354
2355 if (safelyCheckNextChar(ptr, patternEnd, expected: '?')) {
2356 switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {
2357 /* Non-referencing groups and lookaheads just move the pointer on, and
2358 then behave like a non-special bracket, except that they don't increment
2359 the count of extracting brackets. Ditto for the "once only" bracket,
2360 which is in Perl from version 5.005. */
2361
2362 case ':':
2363 case '=':
2364 case '!':
2365 ptr += 2;
2366 break;
2367
2368 /* Else loop checking valid options until ) is met. Anything else is an
2369 error. If we are without any brackets, i.e. at top level, the settings
2370 act as if specified in the options, so massage the options immediately.
2371 This is for backward compatibility with Perl 5.004. */
2372
2373 default:
2374 errorcode = ERR12;
2375 return -1;
2376 }
2377 } else
2378 capturing = 1;
2379
2380 /* Capturing brackets must be counted so we can process escapes in a
2381 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
2382 an additional 3 bytes of memory per capturing bracket. */
2383
2384 if (capturing) {
2385 bracount++;
2386 if (bracount > EXTRACT_BASIC_MAX)
2387 bracket_length += 3;
2388 }
2389
2390 /* Save length for computing whole length at end if there's a repeat that
2391 requires duplication of the group. Also save the current value of
2392 branch_extra, and start the new group with the new value. If non-zero, this
2393 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2394
2395 if (brastackptr >= sizeof(brastack)/sizeof(int)) {
2396 errorcode = ERR17;
2397 return -1;
2398 }
2399
2400 bralenstack[brastackptr] = branch_extra;
2401 branch_extra = branch_newextra;
2402
2403 brastack[brastackptr++] = length;
2404 length += bracket_length;
2405 continue;
2406 }
2407
2408 /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we
2409 have to replicate this bracket up to that many times. If brastackptr is
2410 0 this is an unmatched bracket which will generate an error, but take care
2411 not to try to access brastack[-1] when computing the length and restoring
2412 the branch_extra value. */
2413
2414 case ')': {
2415 int duplength;
2416 length += 1 + LINK_SIZE;
2417 if (brastackptr > 0) {
2418 duplength = length - brastack[--brastackptr];
2419 branch_extra = bralenstack[brastackptr];
2420 }
2421 else
2422 duplength = 0;
2423
2424 /* Leave ptr at the final char; for readRepeatCounts this happens
2425 automatically; for the others we need an increment. */
2426
2427 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRepeat(p: ptr + 2, patternEnd)) {
2428 ptr = readRepeatCounts(p: ptr + 2, minp: &minRepeats, maxp: &maxRepeats, errorCodePtr: &errorcode);
2429 if (errorcode)
2430 return -1;
2431 } else if (c == '*') {
2432 minRepeats = 0;
2433 maxRepeats = -1;
2434 ptr++;
2435 } else if (c == '+') {
2436 minRepeats = 1;
2437 maxRepeats = -1;
2438 ptr++;
2439 } else if (c == '?') {
2440 minRepeats = 0;
2441 maxRepeats = 1;
2442 ptr++;
2443 } else {
2444 minRepeats = 1;
2445 maxRepeats = 1;
2446 }
2447
2448 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2449 group, and if the maximum is greater than zero, we have to replicate
2450 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2451 bracket set. */
2452
2453 int repeatsLength;
2454 if (minRepeats == 0) {
2455 length++;
2456 if (maxRepeats > 0) {
2457 repeatsLength = multiplyWithOverflowCheck(a: maxRepeats - 1, b: duplength + 3 + 2 * LINK_SIZE);
2458 if (repeatsLength < 0) {
2459 errorcode = ERR16;
2460 return -1;
2461 }
2462 length += repeatsLength;
2463 if (length > MAX_PATTERN_SIZE) {
2464 errorcode = ERR16;
2465 return -1;
2466 }
2467 }
2468 }
2469
2470 /* When the minimum is greater than zero, we have to replicate up to
2471 minval-1 times, with no additions required in the copies. Then, if there
2472 is a limited maximum we have to replicate up to maxval-1 times allowing
2473 for a BRAZERO item before each optional copy and nesting brackets for all
2474 but one of the optional copies. */
2475
2476 else {
2477 repeatsLength = multiplyWithOverflowCheck(a: minRepeats - 1, b: duplength);
2478 if (repeatsLength < 0) {
2479 errorcode = ERR16;
2480 return -1;
2481 }
2482 length += repeatsLength;
2483 if (maxRepeats > minRepeats) { /* Need this test as maxRepeats=-1 means no limit */
2484 repeatsLength = multiplyWithOverflowCheck(a: maxRepeats - minRepeats, b: duplength + 3 + 2 * LINK_SIZE);
2485 if (repeatsLength < 0) {
2486 errorcode = ERR16;
2487 return -1;
2488 }
2489 length += repeatsLength - (2 + 2 * LINK_SIZE);
2490 }
2491 if (length > MAX_PATTERN_SIZE) {
2492 errorcode = ERR16;
2493 return -1;
2494 }
2495 }
2496
2497 /* Allow space for once brackets for "possessive quantifier" */
2498
2499 if (safelyCheckNextChar(ptr, patternEnd, expected: '+')) {
2500 ptr++;
2501 length += 2 + 2 * LINK_SIZE;
2502 }
2503 continue;
2504 }
2505
2506 /* Non-special character. It won't be space or # in extended mode, so it is
2507 always a genuine character. If we are in a \Q...\E sequence, check for the
2508 end; if not, we have a literal. */
2509
2510 default:
2511 NORMAL_CHAR:
2512 length += 2; /* For a one-byte character */
2513 lastitemlength = 1; /* Default length of last item for repeats */
2514
2515 if (c > 127) {
2516 int i;
2517 for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
2518 if (c <= jsc_pcre_utf8_table1[i])
2519 break;
2520 length += i;
2521 lastitemlength += i;
2522 }
2523
2524 continue;
2525 }
2526 }
2527
2528 length += 2 + LINK_SIZE; /* For final KET and END */
2529
2530 cd.numCapturingBrackets = bracount;
2531 return length;
2532}
2533
2534/*************************************************
2535* Compile a Regular Expression *
2536*************************************************/
2537
2538/* This function takes a string and returns a pointer to a block of store
2539holding a compiled version of the expression. The original API for this
2540function had no error code return variable; it is retained for backwards
2541compatibility. The new function is given a new name.
2542
Arguments:
  pattern         the regular expression, as an array of UChar
  patternLength   number of UChar units in pattern
  ignoreCase      non-zero to set the ignore-case option on the result
  multiline       non-zero to set the multiline option on the result
  numSubpatterns  if non-NULL, receives the top capturing bracket number
  errorPtr        pointer to pointer to error text; if this is NULL the
                  function returns NULL without compiling

Returns:          pointer to compiled data block, or NULL on error,
                  with *errorPtr set to the error text
2554*/
2555
2556static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorPtr)
2557{
2558 *errorPtr = errorText(code: errorcode);
2559 return 0;
2560}
2561
2562JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
2563 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
2564 unsigned* numSubpatterns, const char** errorPtr)
2565{
2566 /* We can't pass back an error message if errorPtr is NULL; I guess the best we
2567 can do is just return NULL, but we can set a code value if there is a code pointer. */
2568 if (!errorPtr)
2569 return 0;
2570 *errorPtr = NULL;
2571
2572 CompileData cd;
2573
2574 ErrorCode errorcode = ERR0;
2575 /* Call this once just to count the brackets. */
2576 calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2577 /* Call it again to compute the length. */
2578 int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2579 if (errorcode)
2580 return returnError(errorcode, errorPtr);
2581
2582 if (length > MAX_PATTERN_SIZE)
2583 return returnError(errorcode: ERR16, errorPtr);
2584
2585 size_t size = length + sizeof(JSRegExp);
2586#if REGEXP_HISTOGRAM
2587 size_t stringOffset = (size + sizeof(UChar) - 1) / sizeof(UChar) * sizeof(UChar);
2588 size = stringOffset + patternLength * sizeof(UChar);
2589#endif
2590 JSRegExp* re = reinterpret_cast<JSRegExp*>(new char[size]);
2591
2592 if (!re)
2593 return returnError(errorcode: ERR13, errorPtr);
2594
2595 re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcrossMultipleLinesOption : 0);
2596
2597 /* The starting points of the name/number translation table and of the code are
2598 passed around in the compile data block. */
2599
2600 const unsigned char* codeStart = (const unsigned char*)(re + 1);
2601
2602 /* Set up a starting, non-extracting bracket, then compile the expression. On
2603 error, errorcode will be set non-zero, so we don't need to look at the result
2604 of the function here. */
2605
2606 const UChar* ptr = (const UChar*)pattern;
2607 const UChar* patternEnd = pattern + patternLength;
2608 unsigned char* code = const_cast<unsigned char*>(codeStart);
2609 int firstByte, reqByte;
2610 int bracketCount = 0;
2611 if (!cd.needOuterBracket)
2612 compileBranch(options: re->options, brackets: &bracketCount, codePtr: &code, ptrPtr: &ptr, patternEnd, errorCodePtr: &errorcode, firstbyteptr: &firstByte, reqbyteptr: &reqByte, cd);
2613 else {
2614 *code = OP_BRA;
2615 compileBracket(options: re->options, brackets: &bracketCount, codePtr: &code, ptrPtr: &ptr, patternEnd, errorCodePtr: &errorcode, skipBytes: 0, firstbyteptr: &firstByte, reqbyteptr: &reqByte, cd);
2616 }
2617 re->topBracket = bracketCount;
2618 re->topBackref = cd.topBackref;
2619
2620 /* If not reached end of pattern on success, there's an excess bracket. */
2621
2622 if (errorcode == 0 && ptr < patternEnd)
2623 errorcode = ERR10;
2624
2625 /* Fill in the terminating state and check for disastrous overflow, but
2626 if debugging, leave the test till after things are printed out. */
2627
2628 *code++ = OP_END;
2629
2630 ASSERT(code - codeStart <= length);
2631 if (code - codeStart > length)
2632 errorcode = ERR7;
2633
2634 /* Give an error if there's back reference to a non-existent capturing
2635 subpattern. */
2636
2637 if (re->topBackref > re->topBracket)
2638 errorcode = ERR15;
2639
2640 /* Failed to compile, or error while post-processing */
2641
2642 if (errorcode != ERR0) {
2643 delete [] reinterpret_cast<char*>(re);
2644 return returnError(errorcode, errorPtr);
2645 }
2646
2647 /* If the anchored option was not passed, set the flag if we can determine that
2648 the pattern is anchored by virtue of ^ characters or \A or anything else (such
2649 as starting with .* when DOTALL is set).
2650
2651 Otherwise, if we know what the first character has to be, save it, because that
2652 speeds up unanchored matches no end. If not, see if we can set the
2653 UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches
2654 start with ^. and also when all branches start with .* for non-DOTALL matches.
2655 */
2656
2657 if (cd.needOuterBracket ? bracketIsAnchored(code: codeStart) : branchIsAnchored(code: codeStart))
2658 re->options |= IsAnchoredOption;
2659 else {
2660 if (firstByte < 0) {
2661 firstByte = (cd.needOuterBracket
2662 ? bracketFindFirstAssertedCharacter(code: codeStart, inassert: false)
2663 : branchFindFirstAssertedCharacter(code: codeStart, inassert: false))
2664 | ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0);
2665 }
2666 if (firstByte >= 0) {
2667 int ch = firstByte & 255;
2668 if (ch < 127) {
2669 re->firstByte = ((firstByte & REQ_IGNORE_CASE) && flipCase(c: ch) == ch) ? ch : firstByte;
2670 re->options |= UseFirstByteOptimizationOption;
2671 }
2672 } else {
2673 if (cd.needOuterBracket ? bracketNeedsLineStart(code: codeStart, captureMap: 0, backrefMap: cd.backrefMap) : branchNeedsLineStart(code: codeStart, captureMap: 0, backrefMap: cd.backrefMap))
2674 re->options |= UseMultiLineFirstByteOptimizationOption;
2675 }
2676 }
2677
2678 /* For an anchored pattern, we use the "required byte" only if it follows a
2679 variable length item in the regex. Remove the caseless flag for non-caseable
2680 bytes. */
2681
2682 if (reqByte >= 0 && (!(re->options & IsAnchoredOption) || (reqByte & REQ_VARY))) {
2683 int ch = reqByte & 255;
2684 if (ch < 127) {
2685 re->reqByte = ((reqByte & REQ_IGNORE_CASE) && flipCase(c: ch) == ch) ? (reqByte & ~REQ_IGNORE_CASE) : reqByte;
2686 re->options |= UseRequiredByteOptimizationOption;
2687 }
2688 }
2689
2690#if REGEXP_HISTOGRAM
2691 re->stringOffset = stringOffset;
2692 re->stringLength = patternLength;
2693 memcpy(reinterpret_cast<char*>(re) + stringOffset, pattern, patternLength * 2);
2694#endif
2695
2696 if (numSubpatterns)
2697 *numSubpatterns = re->topBracket;
2698 return re;
2699}
2700
2701void jsRegExpFree(JSRegExp* re)
2702{
2703 delete [] reinterpret_cast<char*>(re);
2704}
2705

source code of qtscript/src/3rdparty/javascriptcore/JavaScriptCore/pcre/pcre_compile.cpp