pcre_compile.cpp source code [qtscript/src/3rdparty/javascriptcore/JavaScriptCore/pcre/pcre_compile.cpp]

1	/ This is JavaScriptCore's variant of the PCRE library. While this library*
2	started out as a copy of PCRE, many of the features of PCRE have been
3	removed. This library now supports only the regular expression features
4	required by the JavaScript language specification, and has only the functions
5	needed by JavaScriptCore and the rest of WebKit.
6
7	Originally written by Philip Hazel
8	Copyright (c) 1997-2006 University of Cambridge
9	Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
10	Copyright (C) 2007 Eric Seidel <eric@webkit.org>
11
12	-----------------------------------------------------------------------------
13	Redistribution and use in source and binary forms, with or without
14	modification, are permitted provided that the following conditions are met:
15
16	* Redistributions of source code must retain the above copyright notice,
17	this list of conditions and the following disclaimer.
18
19	* Redistributions in binary form must reproduce the above copyright
20	notice, this list of conditions and the following disclaimer in the
21	documentation and/or other materials provided with the distribution.
22
23	* Neither the name of the University of Cambridge nor the names of its
24	contributors may be used to endorse or promote products derived from
25	this software without specific prior written permission.
26
27	THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28	AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29	IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30	ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31	LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32	CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33	SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34	INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35	CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36	ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37	POSSIBILITY OF SUCH DAMAGE.
38	-----------------------------------------------------------------------------
39	*/
40
41	/ This module contains the external function jsRegExpExecute(), along with*
42	supporting internal functions that are not used by other modules. /*
43
44	#include "config.h"
45
46	#include "pcre_internal.h"
47
48	#include <string.h>
49	#include <wtf/ASCIICType.h>
50	#include <wtf/FastMalloc.h>
51
52	using namespace WTF;
53
54	/ Negative values for the firstchar and reqchar variables /
55
/* Negative sentinels stored in the first-byte / required-byte tracking
   variables in place of a real character value. */
#define REQ_UNSET (-2) /* no character-matching item has been encountered yet */
#define REQ_NONE (-1) /* an item was seen, but it does not fix a single character */
58
59	/*************************************************
60	* Code parameters and static tables *
61	*************************************************/
62
63	/ Maximum number of items on the nested bracket stacks at compile time. This*
64	applies to the nesting of all kinds of parentheses. It does not limit
65	un-nested, non-capturing parentheses. This number can be made bigger if
66	necessary - it is used to dimension one int and one unsigned char vector at
67	compile time. /*
68
/* Maximum depth of the nested-bracket stacks used at compile time. */
#define BRASTACK_SIZE 200
70
71	/ Table for handling escaped characters in the range '0'-'z'. Positive returns*
72	are simple data values; negative values are for special things like \d and so
73	on. Zero means further processing is needed (for things like \x), or the escape
74	is invalid. /*
75
76	static const short escapes[] = {
77	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, / 0 - 7 /
78	`0`, `0`, `':'`, `';'`, `'<'`, `'='`, `'>'`, `'?'`, / 8 - ? /
79	`'@'`, `0`, -ESC_B, `0`, -ESC_D, `0`, `0`, `0`, / @ - G /
80	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, / H - O /
81	`0`, `0`, `0`, -ESC_S, `0`, `0`, `0`, -ESC_W, / P - W /
82	`0`, `0`, `0`, `'['`, `'\\'`, `']'`, `'^'`, `'_'`, / X - _ /
83	'`', `7`, -ESC_b, `0`, -ESC_d, `0`, `'\f'`, `0`, / ` - g /
84	`0`, `0`, `0`, `0`, `0`, `0`, `'\n'`, `0`, / h - o /
85	`0`, `0`, `'\r'`, -ESC_s, `'\t'`, `0`, `'\v'`, -ESC_w, / p - w /
86	`0`, `0`, `0` / x - z /
87	};
88
89	/ Error code numbers. They are given names so that they can more easily be*
90	tracked. /*
91
/* Compile-time error codes. The numeric value of each enumerator selects the
   matching message in errorText(). */
enum ErrorCode {
    ERR0 = 0,
    ERR1,  /* \ at end of pattern */
    ERR2,  /* \c at end of pattern */
    ERR3,  /* character value in \x{...} sequence is too large */
    ERR4,  /* numbers out of order in {} quantifier */
    ERR5,  /* number too big in {} quantifier */
    ERR6,  /* missing terminating ] for character class */
    ERR7,  /* internal error: code overflow */
    ERR8,  /* range out of order in character class */
    ERR9,  /* nothing to repeat */
    ERR10, /* unmatched parentheses */
    ERR11, /* internal error: unexpected repeat */
    ERR12, /* unrecognized character after (? */
    ERR13, /* failed to get memory */
    ERR14, /* missing ) */
    ERR15, /* reference to non-existent subpattern */
    ERR16, /* regular expression too large */
    ERR17  /* parentheses nested too deeply */
};
96
/* The texts of compile-time error messages. These are "char *" because they
are passed to the outside world. */
99
100	static const char* errorText(ErrorCode code)
101	{
102	static const char errorTexts[] =
103	/ 1 /
104	"\\ at end of pattern\0"
105	"\\c at end of pattern\0"
106	"character value in \\x{...} sequence is too large\0"
107	"numbers out of order in {} quantifier\0"
108	/ 5 /
109	"number too big in {} quantifier\0"
110	"missing terminating ] for character class\0"
111	"internal error: code overflow\0"
112	"range out of order in character class\0"
113	"nothing to repeat\0"
114	/ 10 /
115	"unmatched parentheses\0"
116	"internal error: unexpected repeat\0"
117	"unrecognized character after (?\0"
118	"failed to get memory\0"
119	"missing )\0"
120	/ 15 /
121	"reference to non-existent subpattern\0"
122	"regular expression too large\0"
123	"parentheses nested too deeply"
124	;
125
126	int i = code;
127	const char* text = errorTexts;
128	while (i > `1`)
129	i -= !*text++;
130	return text;
131	}
132
133	/ Structure for passing "static" information around between the functions*
134	doing the compiling. /*
135
/* State shared between the compiling functions for one pattern. Every field
   starts out zero / false. */
struct CompileData {
    CompileData()
        : topBackref(0)
        , backrefMap(0)
        , reqVaryOpt(0)
        , needOuterBracket(false)
        , numCapturingBrackets(0)
    {
    }

    int topBackref;            /* Maximum back reference */
    unsigned backrefMap;       /* Bitmap of low back refs */
    int reqVaryOpt;            /* "After variable item" flag for reqByte */
    bool needOuterBracket;
    int numCapturingBrackets;
};
150
151	/ Definitions to allow mutual recursion /
152
153	static bool compileBracket(int, int, unsigned* char*, const* UChar*, const* UChar, ErrorCode, int, int, int**, CompileData&);
154	static bool bracketIsAnchored(const unsigned char* code);
155	static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap);
156	static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert);
157
158	/*************************************************
159	* Handle escapes *
160	*************************************************/
161
/* This function is called when a \ has been encountered. It either returns a
positive value for a simple escape such as \n, or a negative value which
encodes one of the more complicated things such as \d. A positive value
greater than 255 may be returned (for example from a \u escape). On entry,
ptr is pointing at the \. On exit, it is on the final character of the escape
sequence.

Arguments:
  ptrPtr         points to the pattern position pointer
  patternEnd     points just past the last character of the pattern
  errorCodePtr   points to the errorcode variable
  bracount       number of previous extracting brackets
  isClass        true if inside a character class

Returns:         zero or positive => a data character
                 negative => a special escape sequence
                 on error, *errorCodePtr is set
*/
179
180	static int checkEscape(const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int bracount, bool isClass)
181	{
182	const UChar* ptr = *ptrPtr + `1`;
183
184	/ If backslash is at the end of the pattern, it's an error. /
185	if (ptr == patternEnd) {
186	*errorCodePtr = ERR1;
187	*ptrPtr = ptr;
188	return `0`;
189	}
190
191	int c = *ptr;
192
193	/ Non-alphamerics are literals. For digits or letters, do an initial lookup in*
194	a table. A non-zero result is something that can be returned immediately.
195	Otherwise further processing may be required. /*
196
197	if (c < `'0'` \|\| c > `'z'`) { / Not alphameric /
198	} else if (int escapeValue = escapes[c - `'0'`]) {
199	c = escapeValue;
200	if (isClass) {
201	if (-c == ESC_b)
202	c = `'\b'`; / \b is backslash in a class /
203	else if (-c == ESC_B)
204	c = `'B'`; / and \B is a capital B in a class (in browsers event though ECMAScript 15.10.2.19 says it raises an error) /
205	}
206	/ Escapes that need further processing, or are illegal. /
207
208	} else {
209	switch (c) {
210	case `'1'`:
211	case `'2'`:
212	case `'3'`:
213	case `'4'`:
214	case `'5'`:
215	case `'6'`:
216	case `'7'`:
217	case `'8'`:
218	case `'9'`:
219	/ Escape sequences starting with a non-zero digit are backreferences,*
220	unless there are insufficient brackets, in which case they are octal
221	escape sequences. Those sequences end on the first non-octal character
222	or when we overflow 0-255, whichever comes first. /*
223
224	if (!isClass) {
225	const UChar* oldptr = ptr;
226	c -= `'0'`;
227	while ((ptr + `1` < patternEnd) && isASCIIDigit(c: ptr[`1`]) && c <= bracount)
228	c = c * `10` + *(++ptr) - `'0'`;
229	if (c <= bracount) {
230	c = -(ESC_REF + c);
231	break;
232	}
233	ptr = oldptr; / Put the pointer back and fall through /
234	}
235
236	/ Handle an octal number following \. If the first digit is 8 or 9,*
237	this is not octal. /*
238
239	if ((c = *ptr) >= `'8'`) {
240	c = `'\\'`;
241	ptr -= `1`;
242	break;
243	}
244
245	/ \0 always starts an octal number, but we may drop through to here with a*
246	larger first octal digit. /*
247
248	case `'0'`: {
249	c -= `'0'`;
250	int i;
251	for (i = `1`; i <= `2`; ++i) {
252	if (ptr + i >= patternEnd \|\| ptr[i] < `'0'` \|\| ptr[i] > `'7'`)
253	break;
254	int cc = c * `8` + ptr[i] - `'0'`;
255	if (cc > `255`)
256	break;
257	c = cc;
258	}
259	ptr += i - `1`;
260	break;
261	}
262
263	case `'x'`: {
264	c = `0`;
265	int i;
266	for (i = `1`; i <= `2`; ++i) {
267	if (ptr + i >= patternEnd \|\| !isASCIIHexDigit(c: ptr[i])) {
268	c = `'x'`;
269	i = `1`;
270	break;
271	}
272	int cc = ptr[i];
273	if (cc >= `'a'`)
274	cc -= `32`; / Convert to upper case /
275	c = c * `16` + cc - ((cc < `'A'`) ? `'0'` : (`'A'` - `10`));
276	}
277	ptr += i - `1`;
278	break;
279	}
280
281	case `'u'`: {
282	c = `0`;
283	int i;
284	for (i = `1`; i <= `4`; ++i) {
285	if (ptr + i >= patternEnd \|\| !isASCIIHexDigit(c: ptr[i])) {
286	c = `'u'`;
287	i = `1`;
288	break;
289	}
290	int cc = ptr[i];
291	if (cc >= `'a'`)
292	cc -= `32`; / Convert to upper case /
293	c = c * `16` + cc - ((cc < `'A'`) ? `'0'` : (`'A'` - `10`));
294	}
295	ptr += i - `1`;
296	break;
297	}
298
299	case `'c'`:
300	if (++ptr == patternEnd) {
301	*errorCodePtr = ERR2;
302	return `0`;
303	}
304
305	c = *ptr;
306
307	/ To match Firefox, inside a character class, we also accept*
308	numbers and '_' as control characters /*
309	if ((!isClass && !isASCIIAlpha(c)) \|\| (!isASCIIAlphanumeric(c) && c != `'_'`)) {
310	c = `'\\'`;
311	ptr -= `2`;
312	break;
313	}
314
315	/ A letter is upper-cased; then the 0x40 bit is flipped. This coding*
316	is ASCII-specific, but then the whole concept of \cx is ASCII-specific. /*
317	c = toASCIIUpper(c) ^ `0x40`;
318	break;
319	}
320	}
321
322	*ptrPtr = ptr;
323	return c;
324	}
325
326	/*************************************************
327	* Check for counted repeat *
328	*************************************************/
329
330	/ This function is called when a '{' is encountered in a place where it might*
331	start a quantifier. It looks ahead to see if it really is a quantifier or not.
332	It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
333	where the ddds are digits.
334
335	Arguments:
336	p pointer to the first char after '{'
337
338	Returns: true or false
339	*/
340
341	static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)
342	{
343	if (p >= patternEnd \|\| !isASCIIDigit(c: *p))
344	return false;
345	p++;
346	while (p < patternEnd && isASCIIDigit(c: *p))
347	p++;
348	if (p < patternEnd && *p == `'}'`)
349	return true;
350
351	if (p >= patternEnd \|\| *p++ != `','`)
352	return false;
353	if (p < patternEnd && *p == `'}'`)
354	return true;
355
356	if (p >= patternEnd \|\| !isASCIIDigit(c: *p))
357	return false;
358	p++;
359	while (p < patternEnd && isASCIIDigit(c: *p))
360	p++;
361
362	return (p < patternEnd && *p == `'}'`);
363	}
364
365	/*************************************************
366	* Read repeat counts *
367	*************************************************/
368
369	/ Read an item of the form {n,m} and return the values. This is called only*
370	after isCountedRepeat() has confirmed that a repeat-count quantifier exists,
371	so the syntax is guaranteed to be correct, but we need to check the values.
372
373	Arguments:
374	p pointer to first char after '{'
375	minp pointer to int for min
376	maxp pointer to int for max
377	returned as -1 if no max
378	errorCodePtr points to error code variable
379
380	Returns: pointer to '}' on success;
381	current ptr on error, with errorCodePtr set non-zero
382	*/
383
384	static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, ErrorCode* errorCodePtr)
385	{
386	int min = `0`;
387	int max = -`1`;
388
389	/ Read the minimum value and do a paranoid check: a negative value indicates*
390	an integer overflow. /*
391
392	while (isASCIIDigit(c: *p))
393	min = min * `10` + *p++ - `'0'`;
394	if (min < `0` \|\| min > `65535`) {
395	*errorCodePtr = ERR5;
396	return p;
397	}
398
399	/ Read the maximum value if there is one, and again do a paranoid on its size.*
400	Also, max must not be less than min. /*
401
402	if (*p == `'}'`)
403	max = min;
404	else {
405	if (*(++p) != `'}'`) {
406	max = `0`;
407	while (isASCIIDigit(c: *p))
408	max = max * `10` + *p++ - `'0'`;
409	if (max < `0` \|\| max > `65535`) {
410	*errorCodePtr = ERR5;
411	return p;
412	}
413	if (max < min) {
414	*errorCodePtr = ERR4;
415	return p;
416	}
417	}
418	}
419
420	/ Fill in the required variables, and pass back the pointer to the terminating*
421	'}'. /*
422
423	*minp = min;
424	*maxp = max;
425	return p;
426	}
427
428	/*************************************************
429	* Find first significant op code *
430	*************************************************/
431
432	/ This is called by several functions that scan a compiled expression looking*
433	for a fixed first character, or an anchoring op code etc. It skips over things
434	that do not influence this.
435
436	Arguments:
437	code pointer to the start of the group
438	Returns: pointer to the first significant opcode
439	*/
440
441	static const unsigned char* firstSignificantOpcode(const unsigned char* code)
442	{
443	while (*code == OP_BRANUMBER)
444	code += `3`;
445	return code;
446	}
447
448	static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsigned char* code)
449	{
450	while (true) {
451	switch (*code) {
452	case OP_ASSERT_NOT:
453	advanceToEndOfBracket(opcodePtr&: code);
454	code += `1` + LINK_SIZE;
455	break;
456	case OP_WORD_BOUNDARY:
457	case OP_NOT_WORD_BOUNDARY:
458	++code;
459	break;
460	case OP_BRANUMBER:
461	code += `3`;
462	break;
463	default:
464	return code;
465	}
466	}
467	}
468
469	/*************************************************
470	* Get othercase range *
471	*************************************************/
472
473	/ This function is passed the start and end of a class range, in UTF-8 mode*
474	with UCP support. It searches up the characters, looking for internal ranges of
475	characters in the "other" case. Each call returns the next one, updating the
476	start address.
477
478	Arguments:
479	cptr points to starting character value; updated
480	d end value
481	ocptr where to put start of othercase range
482	odptr where to put end of othercase range
483
484	Yield: true when range returned; false when no more
485	*/
486
487	static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)
488	{
489	int c, othercase = `0`;
490
491	for (c = *cptr; c <= d; c++) {
492	if ((othercase = jsc_pcre_ucp_othercase(c)) >= `0`)
493	break;
494	}
495
496	if (c > d)
497	return false;
498
499	*ocptr = othercase;
500	int next = othercase + `1`;
501
502	for (++c; c <= d; c++) {
503	if (jsc_pcre_ucp_othercase(c) != next)
504	break;
505	next++;
506	}
507
508	*odptr = next - `1`;
509	*cptr = c;
510
511	return true;
512	}
513
514	/*************************************************
515	* Convert character value to UTF-8 *
516	*************************************************/
517
/* This function takes an integer value in the range 0 - 0x7fffffff
and encodes it as a UTF-8 character in 1 to 6 bytes.

Arguments:
  cvalue     the character value
  buffer     pointer to buffer for result - at least 6 bytes long

Returns:     number of bytes placed in the buffer
*/
527
528	static int encodeUTF8(int cvalue, unsigned char *buffer)
529	{
530	int i;
531	for (i = `0`; i < jsc_pcre_utf8_table1_size; i++)
532	if (cvalue <= jsc_pcre_utf8_table1[i])
533	break;
534	buffer += i;
535	for (int j = i; j > `0`; j--) {
536	*buffer-- = `0x80` \| (cvalue & `0x3f`);
537	cvalue >>= `6`;
538	}
539	*buffer = jsc_pcre_utf8_table2[i] \| cvalue;
540	return i + `1`;
541	}
542
543	/*************************************************
544	* Compile one branch *
545	*************************************************/
546
547	/ Scan the pattern, compiling it into the code vector.*
548
549	Arguments:
550	options the option bits
551	brackets points to number of extracting brackets used
552	codePtr points to the pointer to the current code point
553	ptrPtr points to the current pattern pointer
554	errorCodePtr points to error code variable
555	firstbyteptr set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
556	reqbyteptr set to the last literal character required, else < 0
557	cd contains pointers to tables etc.
558
559	Returns: true on success
560	false, with errorCodePtr set non-zero on error*
561	*/
562
563	static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd, UChar expected)
564	{
565	return ((ptr + `1` < patternEnd) && ptr[`1`] == expected);
566	}
567
568	static bool
569	compileBranch(int options, int* brackets, unsigned char** codePtr,
570	const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int *firstbyteptr,
571	int* reqbyteptr, CompileData& cd)
572	{
573	int repeatType, opType;
574	int repeatMin = `0`, repeat_max = `0`; / To please picky compilers /
575	int bravalue = `0`;
576	int reqvary, tempreqvary;
577	int c;
578	unsigned char* code = *codePtr;
579	unsigned char* tempcode;
580	bool didGroupSetFirstByte = false;
581	const UChar* ptr = *ptrPtr;
582	unsigned char* previous = NULL;
583	unsigned char classbits[`32`];
584
585	bool class_utf8;
586	unsigned char* class_utf8data;
587	unsigned char utf8_char[`6`];
588
589	/ Initialize no first byte, no required byte. REQ_UNSET means "no char*
590	matching encountered yet". It gets changed to REQ_NONE if we hit something that
591	matches a non-fixed char first char; reqByte just remains unset if we never
592	find one.
593
594	When we hit a repeat whose minimum is zero, we may have to adjust these values
595	to take the zero repeat into account. This is implemented by setting them to
596	zeroFirstByte and zeroReqByte when such a repeat is encountered. The individual
597	item types that can be repeated set these backoff variables appropriately. /*
598
599	int firstByte = REQ_UNSET;
600	int reqByte = REQ_UNSET;
601	int zeroReqByte = REQ_UNSET;
602	int zeroFirstByte = REQ_UNSET;
603
604	/ The variable reqCaseOpt contains either the REQ_IGNORE_CASE value or zero,*
605	according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit
606	value > 255. It is added into the firstByte or reqByte variables to record the
607	case status of the value. This is used only for ASCII characters. /*
608
609	int reqCaseOpt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : `0`;
610
611	/ Switch on next character until the end of the branch /
612
613	for (;; ptr++) {
614	bool negateClass;
615	bool shouldFlipNegation; / If a negative special such as \S is used, we should negate the whole class to properly support Unicode. /
616	int classCharCount;
617	int classLastChar;
618	int skipBytes;
619	int subReqByte;
620	int subFirstByte;
621	int mcLength;
622	unsigned char mcbuffer[`8`];
623
624	/ Next byte in the pattern /
625
626	c = ptr < patternEnd ? *ptr : `0`;
627
628	/ Fill in length of a previous callout, except when the next thing is*
629	a quantifier. /*
630
631	bool isQuantifier = c == `'*'` \|\| c == `'+'` \|\| c == `'?'` \|\| (c == `'{'` && isCountedRepeat(p: ptr + `1`, patternEnd));
632
633	switch (c) {
634	/ The branch terminates at end of string, \|, or ). /
635
636	case `0`:
637	if (ptr < patternEnd)
638	goto NORMAL_CHAR;
639	// End of string; fall through
640	case `'\|'`:
641	case `')'`:
642	*firstbyteptr = firstByte;
643	*reqbyteptr = reqByte;
644	*codePtr = code;
645	*ptrPtr = ptr;
646	return true;
647
648	/ Handle single-character metacharacters. In multiline mode, ^ disables*
649	the setting of any following char as a first character. /*
650
651	case `'^'`:
652	if (options & MatchAcrossMultipleLinesOption) {
653	if (firstByte == REQ_UNSET)
654	firstByte = REQ_NONE;
655	*code++ = OP_BOL;
656	} else
657	*code++ = OP_CIRC;
658	previous = NULL;
659	break;
660
661	case `'$'`:
662	previous = NULL;
663	if (options & MatchAcrossMultipleLinesOption)
664	*code++ = OP_EOL;
665	else
666	*code++ = OP_DOLL;
667	break;
668
669	/ There can never be a first char if '.' is first, whatever happens about*
670	repeats. The value of reqByte doesn't change either. /*
671
672	case `'.'`:
673	if (firstByte == REQ_UNSET)
674	firstByte = REQ_NONE;
675	zeroFirstByte = firstByte;
676	zeroReqByte = reqByte;
677	previous = code;
678	*code++ = OP_NOT_NEWLINE;
679	break;
680
681	/ Character classes. If the included characters are all < 256, we build a*
682	32-byte bitmap of the permitted characters, except in the special case
683	where there is only one such character. For negated classes, we build the
684	map as usual, then invert it at the end. However, we use a different opcode
685	so that data characters > 255 can be handled correctly.
686
687	If the class contains characters outside the 0-255 range, a different
688	opcode is compiled. It may optionally have a bit map for characters < 256,
689	but those above are are explicitly listed afterwards. A flag byte tells
690	whether the bitmap is present, and whether this is a negated class or not.
691	*/
692
693	case `'['`: {
694	previous = code;
695	shouldFlipNegation = false;
696
697	/ PCRE supports POSIX class stuff inside a class. Perl gives an error if*
698	they are encountered at the top level, so we'll do that too. /*
699
700	/ If the first character is '^', set the negation flag and skip it. /
701
702	if (ptr + `1` >= patternEnd) {
703	*errorCodePtr = ERR6;
704	return false;
705	}
706
707	if (ptr[`1`] == `'^'`) {
708	negateClass = true;
709	++ptr;
710	} else
711	negateClass = false;
712
713	/ Keep a count of chars with values < 256 so that we can optimize the case*
714	of just a single character (as long as it's < 256). For higher valued UTF-8
715	characters, we don't yet do any optimization. /*
716
717	classCharCount = `0`;
718	classLastChar = -`1`;
719
720	class_utf8 = false; / No chars >= 256 /
721	class_utf8data = code + LINK_SIZE + `34`; / For UTF-8 items /
722
723	/ Initialize the 32-char bit map to all zeros. We have to build the*
724	map in a temporary bit of store, in case the class contains only 1
725	character (< 256), because in that case the compiled code doesn't use the
726	bit map. /*
727
728	memset(s: classbits, c: `0`, n: `32` * sizeof(unsigned char));
729
730	/ Process characters until ] is reached. The first pass*
731	through the regex checked the overall syntax, so we don't need to be very
732	strict here. At the start of the loop, c contains the first byte of the
733	character. /*
734
735	while ((++ptr < patternEnd) && (c = *ptr) != `']'`) {
736	/ Backslash may introduce a single character, or it may introduce one*
737	of the specials, which just set a flag. Escaped items are checked for
738	validity in the pre-compiling pass. The sequence \b is a special case.
739	Inside a class (and only there) it is treated as backspace. Elsewhere
740	it marks a word boundary. Other escapes have preset maps ready to
741	or into the one we are building. We assume they have more than one
742	character in them, so set classCharCount bigger than one. /*
743
744	if (c == `'\\'`) {
745	c = checkEscape(ptrPtr: &ptr, patternEnd, errorCodePtr, bracount: cd.numCapturingBrackets, isClass: true);
746	if (c < `0`) {
747	classCharCount += `2`; / Greater than 1 is what matters /
748	switch (-c) {
749	case ESC_d:
750	for (c = `0`; c < `32`; c++)
751	classbits[c] \|= classBitmapForChar(c: c + cbit_digit);
752	continue;
753
754	case ESC_D:
755	shouldFlipNegation = true;
756	for (c = `0`; c < `32`; c++)
757	classbits[c] \|= ~classBitmapForChar(c: c + cbit_digit);
758	continue;
759
760	case ESC_w:
761	for (c = `0`; c < `32`; c++)
762	classbits[c] \|= classBitmapForChar(c: c + cbit_word);
763	continue;
764
765	case ESC_W:
766	shouldFlipNegation = true;
767	for (c = `0`; c < `32`; c++)
768	classbits[c] \|= ~classBitmapForChar(c: c + cbit_word);
769	continue;
770
771	case ESC_s:
772	for (c = `0`; c < `32`; c++)
773	classbits[c] \|= classBitmapForChar(c: c + cbit_space);
774	continue;
775
776	case ESC_S:
777	shouldFlipNegation = true;
778	for (c = `0`; c < `32`; c++)
779	classbits[c] \|= ~classBitmapForChar(c: c + cbit_space);
780	continue;
781
782	/ Unrecognized escapes are faulted if PCRE is running in its*
783	strict mode. By default, for compatibility with Perl, they are
784	treated as literals. /*
785
786	default:
787	c = ptr; /* The final character /
788	classCharCount -= `2`; / Undo the default count from above /
789	}
790	}
791
792	/ Fall through if we have a single character (c >= 0). This may be*
793	> 256 in UTF-8 mode. /*
794
795	} / End of backslash handling /
796
797	/ A single character may be followed by '-' to form a range. However,*
798	Perl does not permit ']' to be the end of the range. A '-' character
799	here is treated as a literal. /*
800
801	if ((ptr + `2` < patternEnd) && ptr[`1`] == `'-'` && ptr[`2`] != `']'`) {
802	ptr += `2`;
803
804	int d = *ptr;
805
806	/ The second part of a range can be a single-character escape, but*
807	not any of the other escapes. Perl 5.6 treats a hyphen as a literal
808	in such circumstances. /*
809
810	if (d == `'\\'`) {
811	const UChar* oldptr = ptr;
812	d = checkEscape(ptrPtr: &ptr, patternEnd, errorCodePtr, bracount: cd.numCapturingBrackets, isClass: true);
813
814	/ \X is literal X; any other special means the '-' was literal /
815	if (d < `0`) {
816	ptr = oldptr - `2`;
817	goto LONE_SINGLE_CHARACTER; / A few lines below /
818	}
819	}
820
821	/ The check that the two values are in the correct order happens in*
822	the pre-pass. Optimize one-character ranges /*
823
824	if (d == c)
825	goto LONE_SINGLE_CHARACTER; / A few lines below /
826
827	/ In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless*
828	matching, we have to use an XCLASS with extra data items. Caseless
829	matching for characters > 127 is available only if UCP support is
830	available. /*
831
832	if ((d > `255` \|\| ((options & IgnoreCaseOption) && d > `127`))) {
833	class_utf8 = true;
834
835	/ With UCP support, we can find the other case equivalents of*
836	the relevant characters. There may be several ranges. Optimize how
837	they fit with the basic range. /*
838
839	if (options & IgnoreCaseOption) {
840	int occ, ocd;
841	int cc = c;
842	int origd = d;
843	while (getOthercaseRange(cptr: &cc, d: origd, ocptr: &occ, odptr: &ocd)) {
844	if (occ >= c && ocd <= d)
845	continue; / Skip embedded ranges /
846
847	if (occ < c && ocd >= c - `1`) / Extend the basic range /
848	{ / if there is overlap, /
849	c = occ; / noting that if occ < c /
850	continue; / we can't have ocd > d /
851	} / because a subrange is /
852	if (ocd > d && occ <= d + `1`) / always shorter than /
853	{ / the basic range. /
854	d = ocd;
855	continue;
856	}
857
858	if (occ == ocd)
859	*class_utf8data++ = XCL_SINGLE;
860	else {
861	*class_utf8data++ = XCL_RANGE;
862	class_utf8data += encodeUTF8(cvalue: occ, buffer: class_utf8data);
863	}
864	class_utf8data += encodeUTF8(cvalue: ocd, buffer: class_utf8data);
865	}
866	}
867
868	/ Now record the original range, possibly modified for UCP caseless*
869	overlapping ranges. /*
870
871	*class_utf8data++ = XCL_RANGE;
872	class_utf8data += encodeUTF8(cvalue: c, buffer: class_utf8data);
873	class_utf8data += encodeUTF8(cvalue: d, buffer: class_utf8data);
874
875	/ With UCP support, we are done. Without UCP support, there is no*
876	caseless matching for UTF-8 characters > 127; we can use the bit map
877	for the smaller ones. /*
878
879	continue; / With next character in the class /
880	}
881
882	/ We use the bit map for all cases when not in UTF-8 mode; else*
883	ranges that lie entirely within 0-127 when there is UCP support; else
884	for partial ranges without UCP support. /*
885
886	for (; c <= d; c++) {
887	classbits[c/`8`] \|= (`1` << (c&`7`));
888	if (options & IgnoreCaseOption) {
889	int uc = flipCase(c);
890	classbits[uc/`8`] \|= (`1` << (uc&`7`));
891	}
892	classCharCount++; / in case a one-char range /
893	classLastChar = c;
894	}
895
896	continue; / Go get the next char in the class /
897	}
898
899	/ Handle a lone single character - we can get here for a normal*
900	non-escape char, or after \ that introduces a single character or for an
901	apparent range that isn't. /*
902
903	LONE_SINGLE_CHARACTER:
904
905	/ Handle a character that cannot go in the bit map /
906
907	if ((c > `255` \|\| ((options & IgnoreCaseOption) && c > `127`))) {
908	class_utf8 = true;
909	*class_utf8data++ = XCL_SINGLE;
910	class_utf8data += encodeUTF8(cvalue: c, buffer: class_utf8data);
911
912	if (options & IgnoreCaseOption) {
913	int othercase;
914	if ((othercase = jsc_pcre_ucp_othercase(c)) >= `0`) {
915	*class_utf8data++ = XCL_SINGLE;
916	class_utf8data += encodeUTF8(cvalue: othercase, buffer: class_utf8data);
917	}
918	}
919	} else {
920	/ Handle a single-byte character /
921	classbits[c/`8`] \|= (`1` << (c&`7`));
922	if (options & IgnoreCaseOption) {
923	c = flipCase(c);
924	classbits[c/`8`] \|= (`1` << (c&`7`));
925	}
926	classCharCount++;
927	classLastChar = c;
928	}
929	}
930
931	/ If classCharCount is 1, we saw precisely one character whose value is*
932	less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
933	can optimize the negative case only if there were no characters >= 128
934	because OP_NOT and the related opcodes like OP_NOTSTAR operate on
935	single-bytes only. This is an historical hangover. Maybe one day we can
936	tidy these opcodes to handle multi-byte characters.
937
938	The optimization throws away the bit map. We turn the item into a
939	1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
940	that OP_NOT does not support multibyte characters. In the positive case, it
941	can cause firstByte to be set. Otherwise, there can be no first char if
942	this item is first, whatever repeat count may follow. In the case of
943	reqByte, save the previous value for reinstating. /*
944
945	if (classCharCount == `1` && (!class_utf8 && (!negateClass \|\| classLastChar < `128`))) {
946	zeroReqByte = reqByte;
947
948	/ The OP_NOT opcode works on one-byte characters only. /
949
950	if (negateClass) {
951	if (firstByte == REQ_UNSET)
952	firstByte = REQ_NONE;
953	zeroFirstByte = firstByte;
954	*code++ = OP_NOT;
955	*code++ = classLastChar;
956	break;
957	}
958
959	/ For a single, positive character, get the value into c, and*
960	then we can handle this with the normal one-character code. /*
961
962	c = classLastChar;
963	goto NORMAL_CHAR;
964	} / End of 1-char optimization /
965
966	/ The general case - not the one-char optimization. If this is the first*
967	thing in the branch, there can be no first char setting, whatever the
968	repeat count. Any reqByte setting must remain unchanged after any kind of
969	repeat. /*
970
971	if (firstByte == REQ_UNSET) firstByte = REQ_NONE;
972	zeroFirstByte = firstByte;
973	zeroReqByte = reqByte;
974
975	/ If there are characters with values > 255, we have to compile an*
976	extended class, with its own opcode. If there are no characters < 256,
977	we can omit the bitmap. /*
978
979	if (class_utf8 && !shouldFlipNegation) {
980	class_utf8data++ = XCL_END; /* Marks the end of extra data /
981	*code++ = OP_XCLASS;
982	code += LINK_SIZE;
983	*code = negateClass? XCL_NOT : `0`;
984
985	/ If the map is required, install it, and move on to the end of*
986	the extra data /*
987
988	if (classCharCount > `0`) {
989	*code++ \|= XCL_MAP;
990	memcpy(dest: code, src: classbits, n: `32`);
991	code = class_utf8data;
992	}
993
994	/ If the map is not required, slide down the extra data. /
995
996	else {
997	int len = class_utf8data - (code + `33`);
998	memmove(dest: code + `1`, src: code + `33`, n: len);
999	code += len + `1`;
1000	}
1001
1002	/ Now fill in the complete length of the item /
1003
1004	putLinkValue(opcodePtr: previous + `1`, value: code - previous);
1005	break; / End of class handling /
1006	}
1007
1008	/ If there are no characters > 255, negate the 32-byte map if necessary,*
1009	and copy it into the code vector. If this is the first thing in the branch,
1010	there can be no first char setting, whatever the repeat count. Any reqByte
1011	setting must remain unchanged after any kind of repeat. /*
1012
1013	*code++ = (negateClass == shouldFlipNegation) ? OP_CLASS : OP_NCLASS;
1014	if (negateClass)
1015	for (c = `0`; c < `32`; c++)
1016	code[c] = ~classbits[c];
1017	else
1018	memcpy(dest: code, src: classbits, n: `32`);
1019	code += `32`;
1020	break;
1021	}
1022
1023	/ Various kinds of repeat; '{' is not necessarily a quantifier, but this*
1024	has been tested above. /*
1025
1026	case `'{'`:
1027	if (!isQuantifier)
1028	goto NORMAL_CHAR;
1029	ptr = readRepeatCounts(p: ptr + `1`, minp: &repeatMin, maxp: &repeat_max, errorCodePtr);
1030	if (*errorCodePtr)
1031	goto FAILED;
1032	goto REPEAT;
1033
1034	case `'*'`:
1035	repeatMin = `0`;
1036	repeat_max = -`1`;
1037	goto REPEAT;
1038
1039	case `'+'`:
1040	repeatMin = `1`;
1041	repeat_max = -`1`;
1042	goto REPEAT;
1043
1044	case `'?'`:
1045	repeatMin = `0`;
1046	repeat_max = `1`;
1047
1048	REPEAT:
1049	if (!previous) {
1050	*errorCodePtr = ERR9;
1051	goto FAILED;
1052	}
1053
1054	if (repeatMin == `0`) {
1055	firstByte = zeroFirstByte; / Adjust for zero repeat /
1056	reqByte = zeroReqByte; / Ditto /
1057	}
1058
1059	/ Remember whether this is a variable length repeat /
1060
1061	reqvary = (repeatMin == repeat_max) ? `0` : REQ_VARY;
1062
1063	opType = `0`; / Default single-char op codes /
1064
1065	/ Save start of previous item, in case we have to move it up to make space*
1066	for an inserted OP_ONCE for the additional '+' extension. /*
1067	/ FIXME: Probably don't need this because we don't use OP_ONCE. /
1068
1069	tempcode = previous;
1070
1071	/ If the next character is '+', we have a possessive quantifier. This*
1072	implies greediness, whatever the setting of the PCRE_UNGREEDY option.
1073	If the next character is '?' this is a minimizing repeat, by default,
1074	but if PCRE_UNGREEDY is set, it works the other way round. We change the
1075	repeat type to the non-default. /*
1076
1077	if (safelyCheckNextChar(ptr, patternEnd, expected: `'?'`)) {
1078	repeatType = `1`;
1079	ptr++;
1080	} else
1081	repeatType = `0`;
1082
1083	/* If previous was a character match, abolish the item and generate a
1084	repeat item instead. If a char item has a minimum of more than one, ensure
1085	that it is set in reqByte - it might not be if a sequence such as x{3} is
1086	the first thing in a branch because the x will have gone into firstByte
1087	instead. */
1088
1089	if (previous == OP_CHAR \|\| previous == OP_CHAR_IGNORING_CASE) {
1090	/ Deal with UTF-8 characters that take up more than one byte. It's*
1091	easier to write this out separately than try to macrify it. Use c to
1092	hold the length of the character in bytes, plus 0x80 to flag that it's a
1093	length rather than a small character. /*
1094
1095	if (code[-`1`] & `0x80`) {
1096	unsigned char *lastchar = code - `1`;
1097	while((*lastchar & `0xc0`) == `0x80`)
1098	lastchar--;
1099	c = code - lastchar; / Length of UTF-8 character /
1100	memcpy(dest: utf8_char, src: lastchar, n: c); / Save the char /
1101	c \|= `0x80`; / Flag c as a length /
1102	}
1103	else {
1104	c = code[-`1`];
1105	if (repeatMin > `1`)
1106	reqByte = c \| reqCaseOpt \| cd.reqVaryOpt;
1107	}
1108
1109	goto OUTPUT_SINGLE_REPEAT; / Code shared with single character types /
1110	}
1111
1112	else if (previous == OP_ASCII_CHAR \|\| previous == OP_ASCII_LETTER_IGNORING_CASE) {
1113	c = previous[`1`];
1114	if (repeatMin > `1`)
1115	reqByte = c \| reqCaseOpt \| cd.reqVaryOpt;
1116	goto OUTPUT_SINGLE_REPEAT;
1117	}
1118
1119	/* If previous was a single negated character ([^a] or similar), we use
1120	one of the special opcodes, replacing it. The code is shared with single-
1121	character repeats by setting opType to add a suitable offset into
1122	repeatType. OP_NOT is currently used only for single-byte chars. */
1123
1124	else if (*previous == OP_NOT) {
1125	opType = OP_NOTSTAR - OP_STAR; / Use "not" opcodes /
1126	c = previous[`1`];
1127	goto OUTPUT_SINGLE_REPEAT;
1128	}
1129
1130	/ If previous was a character type match (\d or similar), abolish it and*
1131	create a suitable repeat item. The code is shared with single-character
1132	repeats by setting opType to add a suitable offset into repeatType. /*
1133
1134	else if (*previous <= OP_NOT_NEWLINE) {
1135	opType = OP_TYPESTAR - OP_STAR; / Use type opcodes /
1136	c = *previous;
1137
1138	OUTPUT_SINGLE_REPEAT:
1139	int prop_type = -`1`;
1140	int prop_value = -`1`;
1141
1142	unsigned char* oldcode = code;
1143	code = previous; / Usually overwrite previous item /
1144
1145	/ If the maximum is zero then the minimum must also be zero; Perl allows*
1146	this case, so we do too - by simply omitting the item altogether. /*
1147
1148	if (repeat_max == `0`)
1149	goto END_REPEAT;
1150
1151	/ Combine the opType with the repeatType /
1152
1153	repeatType += opType;
1154
1155	/ A minimum of zero is handled either as the special case * or ?, or as*
1156	an UPTO, with the maximum given. /*
1157
1158	if (repeatMin == `0`) {
1159	if (repeat_max == -`1`)
1160	*code++ = OP_STAR + repeatType;
1161	else if (repeat_max == `1`)
1162	*code++ = OP_QUERY + repeatType;
1163	else {
1164	*code++ = OP_UPTO + repeatType;
1165	put2ByteValueAndAdvance(opcodePtr&: code, value: repeat_max);
1166	}
1167	}
1168
1169	/* A repeat minimum of 1 is optimized into some special cases. If the
1170	maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
1171	left in place and, if the maximum is greater than 1, we use OP_UPTO with
1172	one less than the maximum. */
1173
1174	else if (repeatMin == `1`) {
1175	if (repeat_max == -`1`)
1176	*code++ = OP_PLUS + repeatType;
1177	else {
1178	code = oldcode; / leave previous item in place /
1179	if (repeat_max == `1`)
1180	goto END_REPEAT;
1181	*code++ = OP_UPTO + repeatType;
1182	put2ByteValueAndAdvance(opcodePtr&: code, value: repeat_max - `1`);
1183	}
1184	}
1185
1186	/ The case {n,n} is just an EXACT, while the general case {n,m} is*
1187	handled as an EXACT followed by an UPTO. /*
1188
1189	else {
1190	code++ = OP_EXACT + opType; /* NB EXACT doesn't have repeatType /
1191	put2ByteValueAndAdvance(opcodePtr&: code, value: repeatMin);
1192
1193	/ If the maximum is unlimited, insert an OP_STAR. Before doing so,*
1194	we have to insert the character for the previous code. For a repeated
1195	Unicode property match, there are two extra bytes that define the
1196	required property. In UTF-8 mode, long characters have their length in
1197	c, with the 0x80 bit as a flag. /*
1198
1199	if (repeat_max < `0`) {
1200	if (c >= `128`) {
1201	memcpy(dest: code, src: utf8_char, n: c & `7`);
1202	code += c & `7`;
1203	} else {
1204	*code++ = c;
1205	if (prop_type >= `0`) {
1206	*code++ = prop_type;
1207	*code++ = prop_value;
1208	}
1209	}
1210	*code++ = OP_STAR + repeatType;
1211	}
1212
1213	/ Else insert an UPTO if the max is greater than the min, again*
1214	preceded by the character, for the previously inserted code. /*
1215
1216	else if (repeat_max != repeatMin) {
1217	if (c >= `128`) {
1218	memcpy(dest: code, src: utf8_char, n: c & `7`);
1219	code += c & `7`;
1220	} else
1221	*code++ = c;
1222	if (prop_type >= `0`) {
1223	*code++ = prop_type;
1224	*code++ = prop_value;
1225	}
1226	repeat_max -= repeatMin;
1227	*code++ = OP_UPTO + repeatType;
1228	put2ByteValueAndAdvance(opcodePtr&: code, value: repeat_max);
1229	}
1230	}
1231
1232	/ The character or character type itself comes last in all cases. /
1233
1234	if (c >= `128`) {
1235	memcpy(dest: code, src: utf8_char, n: c & `7`);
1236	code += c & `7`;
1237	} else
1238	*code++ = c;
1239
1240	/ For a repeated Unicode property match, there are two extra bytes that*
1241	define the required property. /*
1242
1243	if (prop_type >= `0`) {
1244	*code++ = prop_type;
1245	*code++ = prop_value;
1246	}
1247	}
1248
1249	/ If previous was a character class or a back reference, we put the repeat*
1250	stuff after it, but just skip the item if the repeat was {0,0}. /*
1251
1252	else if (*previous == OP_CLASS \|\|
1253	*previous == OP_NCLASS \|\|
1254	*previous == OP_XCLASS \|\|
1255	*previous == OP_REF)
1256	{
1257	if (repeat_max == `0`) {
1258	code = previous;
1259	goto END_REPEAT;
1260	}
1261
1262	if (repeatMin == `0` && repeat_max == -`1`)
1263	*code++ = OP_CRSTAR + repeatType;
1264	else if (repeatMin == `1` && repeat_max == -`1`)
1265	*code++ = OP_CRPLUS + repeatType;
1266	else if (repeatMin == `0` && repeat_max == `1`)
1267	*code++ = OP_CRQUERY + repeatType;
1268	else {
1269	*code++ = OP_CRRANGE + repeatType;
1270	put2ByteValueAndAdvance(opcodePtr&: code, value: repeatMin);
1271	if (repeat_max == -`1`)
1272	repeat_max = `0`; / 2-byte encoding for max /
1273	put2ByteValueAndAdvance(opcodePtr&: code, value: repeat_max);
1274	}
1275	}
1276
1277	/ If previous was a bracket group, we may have to replicate it in certain*
1278	cases. /*
1279
1280	else if (*previous >= OP_BRA) {
1281	int ketoffset = `0`;
1282	int len = code - previous;
1283	unsigned char* bralink = NULL;
1284
1285	/ If the maximum repeat count is unlimited, find the end of the bracket*
1286	by scanning through from the start, and compute the offset back to it
1287	from the current code pointer. There may be an OP_OPT setting following
1288	the final KET, so we can't find the end just by going back from the code
1289	pointer. /*
1290
1291	if (repeat_max == -`1`) {
1292	const unsigned char* ket = previous;
1293	advanceToEndOfBracket(opcodePtr&: ket);
1294	ketoffset = code - ket;
1295	}
1296
1297	/ The case of a zero minimum is special because of the need to stick*
1298	OP_BRAZERO in front of it, and because the group appears once in the
1299	data, whereas in other cases it appears the minimum number of times. For
1300	this reason, it is simplest to treat this case separately, as otherwise
1301	the code gets far too messy. There are several special subcases when the
1302	minimum is zero. /*
1303
1304	if (repeatMin == `0`) {
1305	/ If the maximum is also zero, we just omit the group from the output*
1306	altogether. /*
1307
1308	if (repeat_max == `0`) {
1309	code = previous;
1310	goto END_REPEAT;
1311	}
1312
1313	/ If the maximum is 1 or unlimited, we just have to stick in the*
1314	BRAZERO and do no more at this point. However, we do need to adjust
1315	any OP_RECURSE calls inside the group that refer to the group itself or
1316	any internal group, because the offset is from the start of the whole
1317	regex. Temporarily terminate the pattern while doing this. /*
1318
1319	if (repeat_max <= `1`) {
1320	*code = OP_END;
1321	memmove(dest: previous+`1`, src: previous, n: len);
1322	code++;
1323	*previous++ = OP_BRAZERO + repeatType;
1324	}
1325
1326	/ If the maximum is greater than 1 and limited, we have to replicate*
1327	in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1328	The first one has to be handled carefully because it's the original
1329	copy, which has to be moved up. The remainder can be handled by code
1330	that is common with the non-zero minimum case below. We have to
1331	adjust the value of repeat_max, since one less copy is required. /*
1332
1333	else {
1334	*code = OP_END;
1335	memmove(dest: previous + `2` + LINK_SIZE, src: previous, n: len);
1336	code += `2` + LINK_SIZE;
1337	*previous++ = OP_BRAZERO + repeatType;
1338	*previous++ = OP_BRA;
1339
1340	/ We chain together the bracket offset fields that have to be*
1341	filled in later when the ends of the brackets are reached. /*
1342
1343	int offset = (!bralink) ? `0` : previous - bralink;
1344	bralink = previous;
1345	putLinkValueAllowZeroAndAdvance(opcodePtr&: previous, value: offset);
1346	}
1347
1348	repeat_max--;
1349	}
1350
1351	/ If the minimum is greater than zero, replicate the group as many*
1352	times as necessary, and adjust the maximum to the number of subsequent
1353	copies that we need. If we set a first char from the group, and didn't
1354	set a required char, copy the latter from the former. /*
1355
1356	else {
1357	if (repeatMin > `1`) {
1358	if (didGroupSetFirstByte && reqByte < `0`)
1359	reqByte = firstByte;
1360	for (int i = `1`; i < repeatMin; i++) {
1361	memcpy(dest: code, src: previous, n: len);
1362	code += len;
1363	}
1364	}
1365	if (repeat_max > `0`)
1366	repeat_max -= repeatMin;
1367	}
1368
1369	/ This code is common to both the zero and non-zero minimum cases. If*
1370	the maximum is limited, it replicates the group in a nested fashion,
1371	remembering the bracket starts on a stack. In the case of a zero minimum,
1372	the first one was set up above. In all cases the repeat_max now specifies
1373	the number of additional copies needed. /*
1374
1375	if (repeat_max >= `0`) {
1376	for (int i = repeat_max - `1`; i >= `0`; i--) {
1377	*code++ = OP_BRAZERO + repeatType;
1378
1379	/ All but the final copy start a new nesting, maintaining the*
1380	chain of brackets outstanding. /*
1381
1382	if (i != `0`) {
1383	*code++ = OP_BRA;
1384	int offset = (!bralink) ? `0` : code - bralink;
1385	bralink = code;
1386	putLinkValueAllowZeroAndAdvance(opcodePtr&: code, value: offset);
1387	}
1388
1389	memcpy(dest: code, src: previous, n: len);
1390	code += len;
1391	}
1392
1393	/ Now chain through the pending brackets, and fill in their length*
1394	fields (which are holding the chain links pro tem). /*
1395
1396	while (bralink) {
1397	int offset = code - bralink + `1`;
1398	unsigned char* bra = code - offset;
1399	int oldlinkoffset = getLinkValueAllowZero(opcodePtr: bra + `1`);
1400	bralink = (!oldlinkoffset) ? `0` : bralink - oldlinkoffset;
1401	*code++ = OP_KET;
1402	putLinkValueAndAdvance(opcodePtr&: code, value: offset);
1403	putLinkValue(opcodePtr: bra + `1`, value: offset);
1404	}
1405	}
1406
1407	/ If the maximum is unlimited, set a repeater in the final copy. We*
1408	can't just offset backwards from the current code point, because we
1409	don't know if there's been an options resetting after the ket. The
1410	correct offset was computed above. /*
1411
1412	else
1413	code[-ketoffset] = OP_KETRMAX + repeatType;
1414	}
1415
1416	// A quantifier after an assertion is mostly meaningless, but it
1417	// can nullify the assertion if it has a 0 minimum.
1418	else if (previous == OP_ASSERT \|\| previous == OP_ASSERT_NOT) {
1419	if (repeatMin == `0`) {
1420	code = previous;
1421	goto END_REPEAT;
1422	}
1423	}
1424
1425	/ Else there's some kind of shambles /
1426
1427	else {
1428	*errorCodePtr = ERR11;
1429	goto FAILED;
1430	}
1431
1432	/* In all cases we no longer have a previous item. We also set the
1433	"follows varying string" flag for subsequently encountered reqbytes if
1434	it isn't already set and we have just passed a varying length item. */
1435
1436	END_REPEAT:
1437	previous = NULL;
1438	cd.reqVaryOpt \|= reqvary;
1439	break;
1440
1441	/ Start of nested bracket sub-expression, or comment or lookahead or*
1442	lookbehind or option setting or condition. First deal with special things
1443	that can come after a bracket; all are introduced by ?, and the appearance
1444	of any of them means that this is not a referencing group. They were
1445	checked for validity in the first pass over the string, so we don't have to
1446	check for syntax errors here. /*
1447
1448	case `'('`:
1449	skipBytes = `0`;
1450
1451	if (*(++ptr) == `'?'`) {
1452	switch (*(++ptr)) {
1453	case `':'`: / Non-extracting bracket /
1454	bravalue = OP_BRA;
1455	ptr++;
1456	break;
1457
1458	case `'='`: / Positive lookahead /
1459	bravalue = OP_ASSERT;
1460	ptr++;
1461	break;
1462
1463	case `'!'`: / Negative lookahead /
1464	bravalue = OP_ASSERT_NOT;
1465	ptr++;
1466	break;
1467
1468	/ Character after (? not specially recognized /
1469
1470	default:
1471	*errorCodePtr = ERR12;
1472	goto FAILED;
1473	}
1474	}
1475
1476	/ Else we have a referencing group; adjust the opcode. If the bracket*
1477	number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1478	arrange for the true number to follow later, in an OP_BRANUMBER item. /*
1479
1480	else {
1481	if (++(*brackets) > EXTRACT_BASIC_MAX) {
1482	bravalue = OP_BRA + EXTRACT_BASIC_MAX + `1`;
1483	code[`1` + LINK_SIZE] = OP_BRANUMBER;
1484	put2ByteValue(opcodePtr: code + `2` + LINK_SIZE, value: *brackets);
1485	skipBytes = `3`;
1486	}
1487	else
1488	bravalue = OP_BRA + *brackets;
1489	}
1490
1491	/ Process nested bracketed re. We copy code into a non-variable*
1492	in order to be able to pass its address because some compilers
1493	complain otherwise. Pass in a new setting for the ims options
1494	if they have changed. /*
1495
1496	previous = code;
1497	*code = bravalue;
1498	tempcode = code;
1499	tempreqvary = cd.reqVaryOpt; / Save value before bracket /
1500
1501	if (!compileBracket(
1502	options,
1503	brackets, / Extracting bracket count /
1504	&tempcode, / Where to put code (updated) /
1505	&ptr, / Input pointer (updated) /
1506	patternEnd,
1507	errorCodePtr, / Where to put an error message /
1508	skipBytes, / Skip over OP_BRANUMBER /
1509	&subFirstByte, / For possible first char /
1510	&subReqByte, / For possible last char /
1511	cd)) / Tables block /
1512	goto FAILED;
1513
1514	/ At the end of compiling, code is still pointing to the start of the*
1515	group, while tempcode has been updated to point past the end of the group
1516	and any option resetting that may follow it. The pattern pointer (ptr)
1517	is on the bracket. /*
1518
1519	/ Handle updating of the required and first characters. Update for normal*
1520	brackets of all kinds, and conditions with two branches (see code above).
1521	If the bracket is followed by a quantifier with zero repeat, we have to
1522	back off. Hence the definition of zeroReqByte and zeroFirstByte outside the
1523	main loop so that they can be accessed for the back off. /*
1524
1525	zeroReqByte = reqByte;
1526	zeroFirstByte = firstByte;
1527	didGroupSetFirstByte = false;
1528
1529	if (bravalue >= OP_BRA) {
1530	/ If we have not yet set a firstByte in this branch, take it from the*
1531	subpattern, remembering that it was set here so that a repeat of more
1532	than one can replicate it as reqByte if necessary. If the subpattern has
1533	no firstByte, set "none" for the whole branch. In both cases, a zero
1534	repeat forces firstByte to "none". /*
1535
1536	if (firstByte == REQ_UNSET) {
1537	if (subFirstByte >= `0`) {
1538	firstByte = subFirstByte;
1539	didGroupSetFirstByte = true;
1540	}
1541	else
1542	firstByte = REQ_NONE;
1543	zeroFirstByte = REQ_NONE;
1544	}
1545
1546	/ If firstByte was previously set, convert the subpattern's firstByte*
1547	into reqByte if there wasn't one, using the vary flag that was in
1548	existence beforehand. /*
1549
1550	else if (subFirstByte >= `0` && subReqByte < `0`)
1551	subReqByte = subFirstByte \| tempreqvary;
1552
1553	/ If the subpattern set a required byte (or set a first byte that isn't*
1554	really the first byte - see above), set it. /*
1555
1556	if (subReqByte >= `0`)
1557	reqByte = subReqByte;
1558	}
1559
1560	/ For a forward assertion, we take the reqByte, if set. This can be*
1561	helpful if the pattern that follows the assertion doesn't set a different
1562	char. For example, it's useful for /(?=abcde).+/. We can't set firstByte
1563	for an assertion, however because it leads to incorrect effect for patterns
1564	such as /(?=a)a.+/ when the "real" "a" would then become a reqByte instead
1565	of a firstByte. This is overcome by a scan at the end if there's no
1566	firstByte, looking for an asserted first char. /*
1567
1568	else if (bravalue == OP_ASSERT && subReqByte >= `0`)
1569	reqByte = subReqByte;
1570
1571	/ Now update the main code pointer to the end of the group. /
1572
1573	code = tempcode;
1574
1575	/ Error if hit end of pattern /
1576
1577	if (ptr >= patternEnd \|\| *ptr != `')'`) {
1578	*errorCodePtr = ERR14;
1579	goto FAILED;
1580	}
1581	break;
1582
1583	/ Check \ for being a real metacharacter; if not, fall through and handle*
1584	it as a data character at the start of a string. Escape items are checked
1585	for validity in the pre-compiling pass. /*
1586
1587	case `'\\'`:
1588	c = checkEscape(ptrPtr: &ptr, patternEnd, errorCodePtr, bracount: cd.numCapturingBrackets, isClass: false);
1589
1590	/ Handle metacharacters introduced by \. For ones like \d, the ESC_ values*
1591	are arranged to be the negation of the corresponding OP_values. For the
1592	back references, the values are ESC_REF plus the reference number. Only
1593	back references and those types that consume a character may be repeated.
1594	We can test for values between ESC_b and ESC_w for the latter; this may
1595	have to change if any new ones are ever created. /*
1596
1597	if (c < `0`) {
1598	/ For metasequences that actually match a character, we disable the*
1599	setting of a first character if it hasn't already been set. /*
1600
1601	if (firstByte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
1602	firstByte = REQ_NONE;
1603
1604	/ Set values to reset to if this is followed by a zero repeat. /
1605
1606	zeroFirstByte = firstByte;
1607	zeroReqByte = reqByte;
1608
1609	/ Back references are handled specially /
1610
1611	if (-c >= ESC_REF) {
1612	int number = -c - ESC_REF;
1613	previous = code;
1614	*code++ = OP_REF;
1615	put2ByteValueAndAdvance(opcodePtr&: code, value: number);
1616	}
1617
1618	/ For the rest, we can obtain the OP value by negating the escape*
1619	value /*
1620
1621	else {
1622	previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;
1623	*code++ = -c;
1624	}
1625	continue;
1626	}
1627
1628	/ Fall through. /
1629
1630	/ Handle a literal character. It is guaranteed not to be whitespace or #*
1631	when the extended flag is set. If we are in UTF-8 mode, it may be a
1632	multi-byte literal character. /*
1633
1634	default:
1635	NORMAL_CHAR:
1636
1637	previous = code;
1638
1639	if (c < `128`) {
1640	mcLength = `1`;
1641	mcbuffer[`0`] = c;
1642
1643	if ((options & IgnoreCaseOption) && (c \| `0x20`) >= `'a'` && (c \| `0x20`) <= `'z'`) {
1644	*code++ = OP_ASCII_LETTER_IGNORING_CASE;
1645	*code++ = c \| `0x20`;
1646	} else {
1647	*code++ = OP_ASCII_CHAR;
1648	*code++ = c;
1649	}
1650	} else {
1651	mcLength = encodeUTF8(cvalue: c, buffer: mcbuffer);
1652
1653	*code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CASE : OP_CHAR;
1654	for (c = `0`; c < mcLength; c++)
1655	*code++ = mcbuffer[c];
1656	}
1657
1658	/ Set the first and required bytes appropriately. If no previous first*
1659	byte, set it from this character, but revert to none on a zero repeat.
1660	Otherwise, leave the firstByte value alone, and don't change it on a zero
1661	repeat. /*
1662
1663	if (firstByte == REQ_UNSET) {
1664	zeroFirstByte = REQ_NONE;
1665	zeroReqByte = reqByte;
1666
1667	/ If the character is more than one byte long, we can set firstByte*
1668	only if it is not to be matched caselessly. /*
1669
1670	if (mcLength == `1` \|\| reqCaseOpt == `0`) {
1671	firstByte = mcbuffer[`0`] \| reqCaseOpt;
1672	if (mcLength != `1`)
1673	reqByte = code[-`1`] \| cd.reqVaryOpt;
1674	}
1675	else
1676	firstByte = reqByte = REQ_NONE;
1677	}
1678
1679	/* firstByte was previously set; we can set reqByte only if the length is
1680	1 or the matching is caseful. */
1681
1682	else {
1683	zeroFirstByte = firstByte;
1684	zeroReqByte = reqByte;
1685	if (mcLength == `1` \|\| reqCaseOpt == `0`)
1686	reqByte = code[-`1`] \| reqCaseOpt \| cd.reqVaryOpt;
1687	}
1688
1689	break; / End of literal character handling /
1690	}
1691	} / end of big loop /
1692
1693	/ Control never reaches here by falling through, only by a goto for all the*
1694	error states. Pass back the position in the pattern so that it can be displayed
1695	to the user for diagnosing the error. /*
1696
1697	FAILED:
1698	*ptrPtr = ptr;
1699	return false;
1700	}
1701
1702	/*************************************************
1703	* Compile sequence of alternatives *
1704	*************************************************/
1705
1706	/ On entry, ptr is pointing past the bracket character, but on return*
1707	it points to the closing bracket, or vertical bar, or end of string.
1708	The code variable is pointing at the byte into which the BRA operator has been
1709	stored. If the ims options are changed at the start (for a (?ims: group) or
1710	during any branch, we need to insert an OP_OPT item at the start of every
1711	following branch to ensure they get set correctly at run time, and also pass
1712	the new options into every subsequent branch compile.
1713
1714	Argument:
1715	options option bits, including any changes for this subpattern
1716	brackets -> int containing the number of extracting brackets used
1717	codePtr -> the address of the current code pointer
1718	ptrPtr -> the address of the current pattern pointer
1719	errorCodePtr -> pointer to error code variable
1720	skipBytes skip this many bytes at start (for OP_BRANUMBER)
1721	firstbyteptr place to put the first required character, or a negative number
1722	reqbyteptr place to put the last required character, or a negative number
1723	cd points to the data block with tables pointers etc.
1724
1725	Returns: true on success
1726	*/
1727
1728	static bool
1729	compileBracket(int options, int* brackets, unsigned char** codePtr,
1730	const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int skipBytes,
1731	int* firstbyteptr, int* reqbyteptr, CompileData& cd)
1732	{
1733	const UChar* ptr = *ptrPtr;
1734	unsigned char* code = *codePtr;
1735	unsigned char* lastBranch = code;
1736	unsigned char* start_bracket = code;
1737	int firstByte = REQ_UNSET;
1738	int reqByte = REQ_UNSET;
1739
1740	/ Offset is set zero to mark that this bracket is still open /
1741
1742	putLinkValueAllowZero(opcodePtr: code + `1`, value: `0`);
1743	code += `1` + LINK_SIZE + skipBytes;
1744
1745	/ Loop for each alternative branch /
1746
1747	while (true) {
1748	/ Now compile the branch /
1749
1750	int branchFirstByte;
1751	int branchReqByte;
1752	if (!compileBranch(options, brackets, codePtr: &code, ptrPtr: &ptr, patternEnd, errorCodePtr,
1753	firstbyteptr: &branchFirstByte, reqbyteptr: &branchReqByte, cd)) {
1754	*ptrPtr = ptr;
1755	return false;
1756	}
1757
1758	/ If this is the first branch, the firstByte and reqByte values for the*
1759	branch become the values for the regex. /*
1760
1761	if (*lastBranch != OP_ALT) {
1762	firstByte = branchFirstByte;
1763	reqByte = branchReqByte;
1764	}
1765
1766	/ If this is not the first branch, the first char and reqByte have to*
1767	match the values from all the previous branches, except that if the previous
1768	value for reqByte didn't have REQ_VARY set, it can still match, and we set
1769	REQ_VARY for the regex. /*
1770
1771	else {
1772	/ If we previously had a firstByte, but it doesn't match the new branch,*
1773	we have to abandon the firstByte for the regex, but if there was previously
1774	no reqByte, it takes on the value of the old firstByte. /*
1775
1776	if (firstByte >= `0` && firstByte != branchFirstByte) {
1777	if (reqByte < `0`)
1778	reqByte = firstByte;
1779	firstByte = REQ_NONE;
1780	}
1781
1782	/ If we (now or from before) have no firstByte, a firstByte from the*
1783	branch becomes a reqByte if there isn't a branch reqByte. /*
1784
1785	if (firstByte < `0` && branchFirstByte >= `0` && branchReqByte < `0`)
1786	branchReqByte = branchFirstByte;
1787
1788	/ Now ensure that the reqbytes match /
1789
1790	if ((reqByte & ~REQ_VARY) != (branchReqByte & ~REQ_VARY))
1791	reqByte = REQ_NONE;
1792	else
1793	reqByte \|= branchReqByte; / To "or" REQ_VARY /
1794	}
1795
1796	/ Reached end of expression, either ')' or end of pattern. Go back through*
1797	the alternative branches and reverse the chain of offsets, with the field in
1798	the BRA item now becoming an offset to the first alternative. If there are
1799	no alternatives, it points to the end of the group. The length in the
1800	terminating ket is always the length of the whole bracketed item. If any of
1801	the ims options were changed inside the group, compile a resetting op-code
1802	following, except at the very end of the pattern. Return leaving the pointer
1803	at the terminating char. /*
1804
1805	if (ptr >= patternEnd \|\| *ptr != `'\|'`) {
1806	int length = code - lastBranch;
1807	do {
1808	int prevLength = getLinkValueAllowZero(opcodePtr: lastBranch + `1`);
1809	putLinkValue(opcodePtr: lastBranch + `1`, value: length);
1810	length = prevLength;
1811	lastBranch -= length;
1812	} while (length > `0`);
1813
1814	/ Fill in the ket /
1815
1816	*code = OP_KET;
1817	putLinkValue(opcodePtr: code + `1`, value: code - start_bracket);
1818	code += `1` + LINK_SIZE;
1819
1820	/ Set values to pass back /
1821
1822	*codePtr = code;
1823	*ptrPtr = ptr;
1824	*firstbyteptr = firstByte;
1825	*reqbyteptr = reqByte;
1826	return true;
1827	}
1828
1829	/ Another branch follows; insert an "or" node. Its length field points back*
1830	to the previous branch while the bracket remains open. At the end the chain
1831	is reversed. It's done like this so that the start of the bracket has a
1832	zero offset until it is closed, making it possible to detect recursion. /*
1833
1834	*code = OP_ALT;
1835	putLinkValue(opcodePtr: code + `1`, value: code - lastBranch);
1836	lastBranch = code;
1837	code += `1` + LINK_SIZE;
1838	ptr++;
1839	}
1840	ASSERT_NOT_REACHED();
1841	}
1842
1843	/*************************************************
1844	* Check for anchored expression *
1845	*************************************************/
1846
1847	/ Try to find out if this is an anchored regular expression. Consider each*
1848	alternative branch. If they all start OP_CIRC, or with a bracket
1849	all of whose alternatives start OP_CIRC (recurse ad lib), then
1850	it's anchored.
1851
1852	Arguments:
1853	code points to start of expression (the bracket)
1854	captureMap a bitmap of which brackets we are inside while testing; this
1855	handles up to substring 31; all brackets after that share
1856	the zero bit
1857	backrefMap the back reference bitmap
1858	*/
1859
1860	static bool branchIsAnchored(const unsigned char* code)
1861	{
1862	const unsigned char* scode = firstSignificantOpcode(code);
1863	int op = *scode;
1864
1865	/ Brackets /
1866	if (op >= OP_BRA \|\| op == OP_ASSERT)
1867	return bracketIsAnchored(code: scode);
1868
1869	/ Check for explicit anchoring /
1870	return op == OP_CIRC;
1871	}
1872
1873	static bool bracketIsAnchored(const unsigned char* code)
1874	{
1875	do {
1876	if (!branchIsAnchored(code: code + `1` + LINK_SIZE))
1877	return false;
1878	code += getLinkValue(opcodePtr: code + `1`);
1879	} while (code == OP_ALT); /* Loop for each alternative /
1880	return true;
1881	}
1882
1883	/*************************************************
1884	* Check for starting with ^ or .* *
1885	*************************************************/
1886
1887	/ This is called to find out if every branch starts with ^ or .* so that*
1888	"first char" processing can be done to speed things up in multiline
1889	matching and for non-DOTALL patterns that start with . (which must start at*
1890	the beginning or after \n)
1891
1892	Except when the . appears inside capturing parentheses, and there is a*
1893	subsequent back reference to those parentheses. By keeping a bitmap of the
1894	first 31 back references, we can catch some of the more common cases more
1895	precisely; all the greater back references share a single bit.
1896
1897	Arguments:
1898	code points to start of expression (the bracket)
1899	captureMap a bitmap of which brackets we are inside while testing; this
1900	handles up to substring 31; all brackets after that share
1901	the zero bit
1902	backrefMap the back reference bitmap
1903	*/
1904
1905	static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1906	{
1907	const unsigned char* scode = firstSignificantOpcode(code);
1908	int op = *scode;
1909
1910	/ Capturing brackets /
1911	if (op > OP_BRA) {
1912	int captureNum = op - OP_BRA;
1913	if (captureNum > EXTRACT_BASIC_MAX)
1914	captureNum = get2ByteValue(opcodePtr: scode + `2` + LINK_SIZE);
1915	int bracketMask = (captureNum < `32`) ? (`1` << captureNum) : `1`;
1916	return bracketNeedsLineStart(code: scode, captureMap: captureMap \| bracketMask, backrefMap);
1917	}
1918
1919	/ Other brackets /
1920	if (op == OP_BRA \|\| op == OP_ASSERT)
1921	return bracketNeedsLineStart(code: scode, captureMap, backrefMap);
1922
1923	/ .* means "start at start or after \n" if it isn't in brackets that*
1924	may be referenced. /*
1925
1926	if (op == OP_TYPESTAR \|\| op == OP_TYPEMINSTAR)
1927	return scode[`1`] == OP_NOT_NEWLINE && !(captureMap & backrefMap);
1928
1929	/ Explicit ^ /
1930	return op == OP_CIRC \|\| op == OP_BOL;
1931	}
1932
1933	static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1934	{
1935	do {
1936	if (!branchNeedsLineStart(code: code + `1` + LINK_SIZE, captureMap, backrefMap))
1937	return false;
1938	code += getLinkValue(opcodePtr: code + `1`);
1939	} while (code == OP_ALT); /* Loop for each alternative /
1940	return true;
1941	}
1942
1943	/*************************************************
1944	* Check for asserted fixed first char *
1945	*************************************************/
1946
1947	/ During compilation, the "first char" settings from forward assertions are*
1948	discarded, because they can cause conflicts with actual literals that follow.
1949	However, if we end up without a first char setting for an unanchored pattern,
1950	it is worth scanning the regex to see if there is an initial asserted first
1951	char. If all branches start with the same asserted char, or with a bracket all
1952	of whose alternatives start with the same asserted char (recurse ad lib), then
1953	we return that char, otherwise -1.
1954
1955	Arguments:
1956	code points to start of expression (the bracket)
1957	options pointer to the options (used to check casing changes)
1958	inassert true if in an assertion
1959
1960	Returns: -1 or the fixed first char
1961	*/
1962
1963	static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
1964	{
1965	const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);
1966	int op = *scode;
1967
1968	if (op >= OP_BRA)
1969	op = OP_BRA;
1970
1971	switch (op) {
1972	default:
1973	return -`1`;
1974
1975	case OP_BRA:
1976	case OP_ASSERT:
1977	return bracketFindFirstAssertedCharacter(code: scode, inassert: op == OP_ASSERT);
1978
1979	case OP_EXACT:
1980	scode += `2`;
1981	/ Fall through /
1982
1983	case OP_CHAR:
1984	case OP_CHAR_IGNORING_CASE:
1985	case OP_ASCII_CHAR:
1986	case OP_ASCII_LETTER_IGNORING_CASE:
1987	case OP_PLUS:
1988	case OP_MINPLUS:
1989	if (!inassert)
1990	return -`1`;
1991	return scode[`1`];
1992	}
1993	}
1994
1995	static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
1996	{
1997	int c = -`1`;
1998	do {
1999	int d = branchFindFirstAssertedCharacter(code: code + `1` + LINK_SIZE, inassert);
2000	if (d < `0`)
2001	return -`1`;
2002	if (c < `0`)
2003	c = d;
2004	else if (c != d)
2005	return -`1`;
2006	code += getLinkValue(opcodePtr: code + `1`);
2007	} while (*code == OP_ALT);
2008	return c;
2009	}
2010
2011	static inline int multiplyWithOverflowCheck(int a, int b)
2012	{
2013	if (!a \|\| !b)
2014	return `0`;
2015	if (a > MAX_PATTERN_SIZE / b)
2016	return -`1`;
2017	return a * b;
2018	}
2019
2020	static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
2021	CompileData& cd, ErrorCode& errorcode)
2022	{
2023	/ Make a pass over the pattern to compute the*
2024	amount of store required to hold the compiled code. This does not have to be
2025	perfect as long as errors are overestimates. /*
2026
2027	if (patternLength > MAX_PATTERN_SIZE) {
2028	errorcode = ERR16;
2029	return -`1`;
2030	}
2031
2032	int length = `1` + LINK_SIZE; / For initial BRA plus length /
2033	int branch_extra = `0`;
2034	int lastitemlength = `0`;
2035	unsigned brastackptr = `0`;
2036	int brastack[BRASTACK_SIZE];
2037	unsigned char bralenstack[BRASTACK_SIZE];
2038	int bracount = `0`;
2039
2040	const UChar* ptr = (const UChar*)(pattern - `1`);
2041	const UChar* patternEnd = (const UChar*)(pattern + patternLength);
2042
2043	while (++ptr < patternEnd) {
2044	int minRepeats = `0`, maxRepeats = `0`;
2045	int c = *ptr;
2046
2047	switch (c) {
2048	/ A backslashed item may be an escaped data character or it may be a*
2049	character type. /*
2050
2051	case `'\\'`:
2052	c = checkEscape(ptrPtr: &ptr, patternEnd, errorCodePtr: &errorcode, bracount: cd.numCapturingBrackets, isClass: false);
2053	if (errorcode != `0`)
2054	return -`1`;
2055
2056	lastitemlength = `1`; / Default length of last item for repeats /
2057
2058	if (c >= `0`) { / Data character /
2059	length += `2`; / For a one-byte character /
2060
2061	if (c > `127`) {
2062	int i;
2063	for (i = `0`; i < jsc_pcre_utf8_table1_size; i++)
2064	if (c <= jsc_pcre_utf8_table1[i]) break;
2065	length += i;
2066	lastitemlength += i;
2067	}
2068
2069	continue;
2070	}
2071
2072	/ Other escapes need one byte /
2073
2074	length++;
2075
2076	/ A back reference needs an additional 2 bytes, plus either one or 5*
2077	bytes for a repeat. We also need to keep the value of the highest
2078	back reference. /*
2079
2080	if (c <= -ESC_REF) {
2081	int refnum = -c - ESC_REF;
2082	cd.backrefMap \|= (refnum < `32`) ? (`1` << refnum) : `1`;
2083	if (refnum > cd.topBackref)
2084	cd.topBackref = refnum;
2085	length += `2`; / For single back reference /
2086	if (safelyCheckNextChar(ptr, patternEnd, expected: `'{'`) && isCountedRepeat(p: ptr + `2`, patternEnd)) {
2087	ptr = readRepeatCounts(p: ptr + `2`, minp: &minRepeats, maxp: &maxRepeats, errorCodePtr: &errorcode);
2088	if (errorcode)
2089	return -`1`;
2090	if ((minRepeats == `0` && (maxRepeats == `1` \|\| maxRepeats == -`1`)) \|\|
2091	(minRepeats == `1` && maxRepeats == -`1`))
2092	length++;
2093	else
2094	length += `5`;
2095	if (safelyCheckNextChar(ptr, patternEnd, expected: `'?'`))
2096	ptr++;
2097	}
2098	}
2099	continue;
2100
2101	case `'^'`: / Single-byte metacharacters /
2102	case `'.'`:
2103	case `'$'`:
2104	length++;
2105	lastitemlength = `1`;
2106	continue;
2107
2108	case `''`: /* These repeats won't be after brackets; /
2109	case `'+'`: / those are handled separately /
2110	case `'?'`:
2111	length++;
2112	goto POSSESSIVE;
2113
2114	/ This covers the cases of braced repeats after a single char, metachar,*
2115	class, or back reference. /*
2116
2117	case `'{'`:
2118	if (!isCountedRepeat(p: ptr + `1`, patternEnd))
2119	goto NORMAL_CHAR;
2120	ptr = readRepeatCounts(p: ptr + `1`, minp: &minRepeats, maxp: &maxRepeats, errorCodePtr: &errorcode);
2121	if (errorcode != `0`)
2122	return -`1`;
2123
2124	/ These special cases just insert one extra opcode /
2125
2126	if ((minRepeats == `0` && (maxRepeats == `1` \|\| maxRepeats == -`1`)) \|\|
2127	(minRepeats == `1` && maxRepeats == -`1`))
2128	length++;
2129
2130	/ These cases might insert additional copies of a preceding character. /
2131
2132	else {
2133	if (minRepeats != `1`) {
2134	length -= lastitemlength; / Uncount the original char or metachar /
2135	if (minRepeats > `0`)
2136	length += `3` + lastitemlength;
2137	}
2138	length += lastitemlength + ((maxRepeats > `0`) ? `3` : `1`);
2139	}
2140
2141	if (safelyCheckNextChar(ptr, patternEnd, expected: `'?'`))
2142	ptr++; / Needs no extra length /
2143
2144	POSSESSIVE: / Test for possessive quantifier /
2145	if (safelyCheckNextChar(ptr, patternEnd, expected: `'+'`)) {
2146	ptr++;
2147	length += `2` + `2` * LINK_SIZE; / Allow for atomic brackets /
2148	}
2149	continue;
2150
2151	/ An alternation contains an offset to the next branch or ket. If any ims*
2152	options changed in the previous branch(es), and/or if we are in a
2153	lookbehind assertion, extra space will be needed at the start of the
2154	branch. This is handled by branch_extra. /*
2155
2156	case `'\|'`:
2157	if (brastackptr == `0`)
2158	cd.needOuterBracket = true;
2159	length += `1` + LINK_SIZE + branch_extra;
2160	continue;
2161
2162	/ A character class uses 33 characters provided that all the character*
2163	values are less than 256. Otherwise, it uses a bit map for low valued
2164	characters, and individual items for others. Don't worry about character
2165	types that aren't allowed in classes - they'll get picked up during the
2166	compile. A character class that contains only one single-byte character
2167	uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
2168	where we can. (In UTF-8 mode we can do this only for chars < 128.) /*
2169
2170	case `'['`: {
2171	int class_optcount;
2172	if (*(++ptr) == `'^'`) {
2173	class_optcount = `10`; / Greater than one /
2174	ptr++;
2175	}
2176	else
2177	class_optcount = `0`;
2178
2179	bool class_utf8 = false;
2180
2181	for (; ptr < patternEnd && *ptr != `']'`; ++ptr) {
2182	/ Check for escapes /
2183
2184	if (*ptr == `'\\'`) {
2185	c = checkEscape(ptrPtr: &ptr, patternEnd, errorCodePtr: &errorcode, bracount: cd.numCapturingBrackets, isClass: true);
2186	if (errorcode != `0`)
2187	return -`1`;
2188
2189	/ Handle escapes that turn into characters /
2190
2191	if (c >= `0`)
2192	goto NON_SPECIAL_CHARACTER;
2193
2194	/ Escapes that are meta-things. The normal ones just affect the*
2195	bit map, but Unicode properties require an XCLASS extended item. /*
2196
2197	else
2198	class_optcount = `10`; / \d, \s etc; make sure > 1 /
2199	}
2200
2201	/ Anything else increments the possible optimization count. We have to*
2202	detect ranges here so that we can compute the number of extra ranges for
2203	caseless wide characters when UCP support is available. If there are wide
2204	characters, we are going to have to use an XCLASS, even for single
2205	characters. /*
2206
2207	else {
2208	c = *ptr;
2209
2210	/ Come here from handling \ above when it escapes to a char value /
2211
2212	NON_SPECIAL_CHARACTER:
2213	class_optcount++;
2214
2215	int d = -`1`;
2216	if (safelyCheckNextChar(ptr, patternEnd, expected: `'-'`)) {
2217	const UChar* hyptr = ptr++;
2218	if (safelyCheckNextChar(ptr, patternEnd, expected: `'\\'`)) {
2219	ptr++;
2220	d = checkEscape(ptrPtr: &ptr, patternEnd, errorCodePtr: &errorcode, bracount: cd.numCapturingBrackets, isClass: true);
2221	if (errorcode != `0`)
2222	return -`1`;
2223	}
2224	else if ((ptr + `1` < patternEnd) && ptr[`1`] != `']'`)
2225	d = *++ptr;
2226	if (d < `0`)
2227	ptr = hyptr; / go back to hyphen as data /
2228	}
2229
2230	/ If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >*
2231	127 for caseless matching, we will need to use an XCLASS. /*
2232
2233	if (d >= `0`) {
2234	class_optcount = `10`; / Ensure > 1 /
2235	if (d < c) {
2236	errorcode = ERR8;
2237	return -`1`;
2238	}
2239
2240	if ((d > `255` \|\| (ignoreCase && d > `127`))) {
2241	unsigned char buffer[`6`];
2242	if (!class_utf8) / Allow for XCLASS overhead /
2243	{
2244	class_utf8 = true;
2245	length += LINK_SIZE + `2`;
2246	}
2247
2248	/ If we have UCP support, find out how many extra ranges are*
2249	needed to map the other case of characters within this range. We
2250	have to mimic the range optimization here, because extending the
2251	range upwards might push d over a boundary that makes it use
2252	another byte in the UTF-8 representation. /*
2253
2254	if (ignoreCase) {
2255	int occ, ocd;
2256	int cc = c;
2257	int origd = d;
2258	while (getOthercaseRange(cptr: &cc, d: origd, ocptr: &occ, odptr: &ocd)) {
2259	if (occ >= c && ocd <= d)
2260	continue; / Skip embedded /
2261
2262	if (occ < c && ocd >= c - `1`) / Extend the basic range /
2263	{ / if there is overlap, /
2264	c = occ; / noting that if occ < c /
2265	continue; / we can't have ocd > d /
2266	} / because a subrange is /
2267	if (ocd > d && occ <= d + `1`) / always shorter than /
2268	{ / the basic range. /
2269	d = ocd;
2270	continue;
2271	}
2272
2273	/ An extra item is needed /
2274
2275	length += `1` + encodeUTF8(cvalue: occ, buffer) +
2276	((occ == ocd) ? `0` : encodeUTF8(cvalue: ocd, buffer));
2277	}
2278	}
2279
2280	/ The length of the (possibly extended) range /
2281
2282	length += `1` + encodeUTF8(cvalue: c, buffer) + encodeUTF8(cvalue: d, buffer);
2283	}
2284
2285	}
2286
2287	/ We have a single character. There is nothing to be done unless we*
2288	are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
2289	allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
2290	support. /*
2291
2292	else {
2293	if ((c > `255` \|\| (ignoreCase && c > `127`))) {
2294	unsigned char buffer[`6`];
2295	class_optcount = `10`; / Ensure > 1 /
2296	if (!class_utf8) / Allow for XCLASS overhead /
2297	{
2298	class_utf8 = true;
2299	length += LINK_SIZE + `2`;
2300	}
2301	length += (ignoreCase ? `2` : `1`) * (`1` + encodeUTF8(cvalue: c, buffer));
2302	}
2303	}
2304	}
2305	}
2306
2307	if (ptr >= patternEnd) { / Missing terminating ']' /
2308	errorcode = ERR6;
2309	return -`1`;
2310	}
2311
2312	/ We can optimize when there was only one optimizable character.*
2313	Note that this does not detect the case of a negated single character.
2314	In that case we do an incorrect length computation, but it's not a serious
2315	problem because the computed length is too large rather than too small. /*
2316
2317	if (class_optcount == `1`)
2318	goto NORMAL_CHAR;
2319
2320	/ Here, we handle repeats for the class opcodes. /
2321	{
2322	length += `33`;
2323
2324	/ A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,*
2325	we also need extra for wrapping the whole thing in a sub-pattern. /*
2326
2327	if (safelyCheckNextChar(ptr, patternEnd, expected: `'{'`) && isCountedRepeat(p: ptr + `2`, patternEnd)) {
2328	ptr = readRepeatCounts(p: ptr + `2`, minp: &minRepeats, maxp: &maxRepeats, errorCodePtr: &errorcode);
2329	if (errorcode != `0`)
2330	return -`1`;
2331	if ((minRepeats == `0` && (maxRepeats == `1` \|\| maxRepeats == -`1`)) \|\|
2332	(minRepeats == `1` && maxRepeats == -`1`))
2333	length++;
2334	else
2335	length += `5`;
2336	if (safelyCheckNextChar(ptr, patternEnd, expected: `'+'`)) {
2337	ptr++;
2338	length += `2` + `2` * LINK_SIZE;
2339	} else if (safelyCheckNextChar(ptr, patternEnd, expected: `'?'`))
2340	ptr++;
2341	}
2342	}
2343	continue;
2344	}
2345
2346	/ Brackets may be genuine groups or special things /
2347
2348	case `'('`: {
2349	int branch_newextra = `0`;
2350	int bracket_length = `1` + LINK_SIZE;
2351	bool capturing = false;
2352
2353	/ Handle special forms of bracket, which all start (? /
2354
2355	if (safelyCheckNextChar(ptr, patternEnd, expected: `'?'`)) {
2356	switch (c = (ptr + `2` < patternEnd ? ptr[`2`] : `0`)) {
2357	/ Non-referencing groups and lookaheads just move the pointer on, and*
2358	then behave like a non-special bracket, except that they don't increment
2359	the count of extracting brackets. Ditto for the "once only" bracket,
2360	which is in Perl from version 5.005. /*
2361
2362	case `':'`:
2363	case `'='`:
2364	case `'!'`:
2365	ptr += `2`;
2366	break;
2367
2368	/ Else loop checking valid options until ) is met. Anything else is an*
2369	error. If we are without any brackets, i.e. at top level, the settings
2370	act as if specified in the options, so massage the options immediately.
2371	This is for backward compatibility with Perl 5.004. /*
2372
2373	default:
2374	errorcode = ERR12;
2375	return -`1`;
2376	}
2377	} else
2378	capturing = `1`;
2379
2380	/ Capturing brackets must be counted so we can process escapes in a*
2381	Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
2382	an additional 3 bytes of memory per capturing bracket. /*
2383
2384	if (capturing) {
2385	bracount++;
2386	if (bracount > EXTRACT_BASIC_MAX)
2387	bracket_length += `3`;
2388	}
2389
2390	/ Save length for computing whole length at end if there's a repeat that*
2391	requires duplication of the group. Also save the current value of
2392	branch_extra, and start the new group with the new value. If non-zero, this
2393	will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. /*
2394
2395	if (brastackptr >= sizeof(brastack)/sizeof(int)) {
2396	errorcode = ERR17;
2397	return -`1`;
2398	}
2399
2400	bralenstack[brastackptr] = branch_extra;
2401	branch_extra = branch_newextra;
2402
2403	brastack[brastackptr++] = length;
2404	length += bracket_length;
2405	continue;
2406	}
2407
2408	/ Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we*
2409	have to replicate this bracket up to that many times. If brastackptr is
2410	0 this is an unmatched bracket which will generate an error, but take care
2411	not to try to access brastack[-1] when computing the length and restoring
2412	the branch_extra value. /*
2413
2414	case `')'`: {
2415	int duplength;
2416	length += `1` + LINK_SIZE;
2417	if (brastackptr > `0`) {
2418	duplength = length - brastack[--brastackptr];
2419	branch_extra = bralenstack[brastackptr];
2420	}
2421	else
2422	duplength = `0`;
2423
2424	/ Leave ptr at the final char; for readRepeatCounts this happens*
2425	automatically; for the others we need an increment. /*
2426
2427	if ((ptr + `1` < patternEnd) && (c = ptr[`1`]) == `'{'` && isCountedRepeat(p: ptr + `2`, patternEnd)) {
2428	ptr = readRepeatCounts(p: ptr + `2`, minp: &minRepeats, maxp: &maxRepeats, errorCodePtr: &errorcode);
2429	if (errorcode)
2430	return -`1`;
2431	} else if (c == `'*'`) {
2432	minRepeats = `0`;
2433	maxRepeats = -`1`;
2434	ptr++;
2435	} else if (c == `'+'`) {
2436	minRepeats = `1`;
2437	maxRepeats = -`1`;
2438	ptr++;
2439	} else if (c == `'?'`) {
2440	minRepeats = `0`;
2441	maxRepeats = `1`;
2442	ptr++;
2443	} else {
2444	minRepeats = `1`;
2445	maxRepeats = `1`;
2446	}
2447
2448	/ If the minimum is zero, we have to allow for an OP_BRAZERO before the*
2449	group, and if the maximum is greater than zero, we have to replicate
2450	maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2451	bracket set. /*
2452
2453	int repeatsLength;
2454	if (minRepeats == `0`) {
2455	length++;
2456	if (maxRepeats > `0`) {
2457	repeatsLength = multiplyWithOverflowCheck(a: maxRepeats - `1`, b: duplength + `3` + `2` * LINK_SIZE);
2458	if (repeatsLength < `0`) {
2459	errorcode = ERR16;
2460	return -`1`;
2461	}
2462	length += repeatsLength;
2463	if (length > MAX_PATTERN_SIZE) {
2464	errorcode = ERR16;
2465	return -`1`;
2466	}
2467	}
2468	}
2469
2470	/ When the minimum is greater than zero, we have to replicate up to*
2471	minval-1 times, with no additions required in the copies. Then, if there
2472	is a limited maximum we have to replicate up to maxval-1 times allowing
2473	for a BRAZERO item before each optional copy and nesting brackets for all
2474	but one of the optional copies. /*
2475
2476	else {
2477	repeatsLength = multiplyWithOverflowCheck(a: minRepeats - `1`, b: duplength);
2478	if (repeatsLength < `0`) {
2479	errorcode = ERR16;
2480	return -`1`;
2481	}
2482	length += repeatsLength;
2483	if (maxRepeats > minRepeats) { / Need this test as maxRepeats=-1 means no limit /
2484	repeatsLength = multiplyWithOverflowCheck(a: maxRepeats - minRepeats, b: duplength + `3` + `2` * LINK_SIZE);
2485	if (repeatsLength < `0`) {
2486	errorcode = ERR16;
2487	return -`1`;
2488	}
2489	length += repeatsLength - (`2` + `2` * LINK_SIZE);
2490	}
2491	if (length > MAX_PATTERN_SIZE) {
2492	errorcode = ERR16;
2493	return -`1`;
2494	}
2495	}
2496
2497	/ Allow space for once brackets for "possessive quantifier" /
2498
2499	if (safelyCheckNextChar(ptr, patternEnd, expected: `'+'`)) {
2500	ptr++;
2501	length += `2` + `2` * LINK_SIZE;
2502	}
2503	continue;
2504	}
2505
2506	/ Non-special character. It won't be space or # in extended mode, so it is*
2507	always a genuine character. If we are in a \Q...\E sequence, check for the
2508	end; if not, we have a literal. /*
2509
2510	default:
2511	NORMAL_CHAR:
2512	length += `2`; / For a one-byte character /
2513	lastitemlength = `1`; / Default length of last item for repeats /
2514
2515	if (c > `127`) {
2516	int i;
2517	for (i = `0`; i < jsc_pcre_utf8_table1_size; i++)
2518	if (c <= jsc_pcre_utf8_table1[i])
2519	break;
2520	length += i;
2521	lastitemlength += i;
2522	}
2523
2524	continue;
2525	}
2526	}
2527
2528	length += `2` + LINK_SIZE; / For final KET and END /
2529
2530	cd.numCapturingBrackets = bracount;
2531	return length;
2532	}
2533
2534	/*************************************************
2535	* Compile a Regular Expression *
2536	*************************************************/
2537
/* jsRegExpCompile() takes a pattern and returns a pointer to a block of store
holding a compiled version of the expression.

Arguments:
  pattern         the regular expression
  patternLength   number of UChars in pattern
  ignoreCase      whether matching ignores case
  multiline       whether matching is multiline
  numSubpatterns  if not NULL, receives the number of capturing brackets
  errorPtr        pointer to pointer to error text; must not be NULL

Returns:          pointer to compiled data block, or NULL on error,
                  with errorPtr set
*/
2555
2556	static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorPtr)
2557	{
2558	*errorPtr = errorText(code: errorcode);
2559	return `0`;
2560	}
2561
2562	JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
2563	JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
2564	unsigned* numSubpatterns, const char** errorPtr)
2565	{
2566	/ We can't pass back an error message if errorPtr is NULL; I guess the best we*
2567	can do is just return NULL, but we can set a code value if there is a code pointer. /*
2568	if (!errorPtr)
2569	return `0`;
2570	*errorPtr = NULL;
2571
2572	CompileData cd;
2573
2574	ErrorCode errorcode = ERR0;
2575	/ Call this once just to count the brackets. /
2576	calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2577	/ Call it again to compute the length. /
2578	int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2579	if (errorcode)
2580	return returnError(errorcode, errorPtr);
2581
2582	if (length > MAX_PATTERN_SIZE)
2583	return returnError(errorcode: ERR16, errorPtr);
2584
2585	size_t size = length + sizeof(JSRegExp);
2586	#if REGEXP_HISTOGRAM
2587	size_t stringOffset = (size + sizeof(UChar) - `1`) / sizeof(UChar) * sizeof(UChar);
2588	size = stringOffset + patternLength * sizeof(UChar);
2589	#endif
2590	JSRegExp* re = reinterpret_cast<JSRegExp>(new* char[size]);
2591
2592	if (!re)
2593	return returnError(errorcode: ERR13, errorPtr);
2594
2595	re->options = (ignoreCase ? IgnoreCaseOption : `0`) \| (multiline ? MatchAcrossMultipleLinesOption : `0`);
2596
2597	/ The starting points of the name/number translation table and of the code are*
2598	passed around in the compile data block. /*
2599
2600	const unsigned char* codeStart = (const unsigned char*)(re + `1`);
2601
2602	/ Set up a starting, non-extracting bracket, then compile the expression. On*
2603	error, errorcode will be set non-zero, so we don't need to look at the result
2604	of the function here. /*
2605
2606	const UChar* ptr = (const UChar*)pattern;
2607	const UChar* patternEnd = pattern + patternLength;
2608	unsigned char* code = const_cast<unsigned char*>(codeStart);
2609	int firstByte, reqByte;
2610	int bracketCount = `0`;
2611	if (!cd.needOuterBracket)
2612	compileBranch(options: re->options, brackets: &bracketCount, codePtr: &code, ptrPtr: &ptr, patternEnd, errorCodePtr: &errorcode, firstbyteptr: &firstByte, reqbyteptr: &reqByte, cd);
2613	else {
2614	*code = OP_BRA;
2615	compileBracket(options: re->options, brackets: &bracketCount, codePtr: &code, ptrPtr: &ptr, patternEnd, errorCodePtr: &errorcode, skipBytes: `0`, firstbyteptr: &firstByte, reqbyteptr: &reqByte, cd);
2616	}
2617	re->topBracket = bracketCount;
2618	re->topBackref = cd.topBackref;
2619
2620	/ If not reached end of pattern on success, there's an excess bracket. /
2621
2622	if (errorcode == `0` && ptr < patternEnd)
2623	errorcode = ERR10;
2624
2625	/ Fill in the terminating state and check for disastrous overflow, but*
2626	if debugging, leave the test till after things are printed out. /*
2627
2628	*code++ = OP_END;
2629
2630	ASSERT(code - codeStart <= length);
2631	if (code - codeStart > length)
2632	errorcode = ERR7;
2633
2634	/ Give an error if there's back reference to a non-existent capturing*
2635	subpattern. /*
2636
2637	if (re->topBackref > re->topBracket)
2638	errorcode = ERR15;
2639
2640	/ Failed to compile, or error while post-processing /
2641
2642	if (errorcode != ERR0) {
2643	delete [] reinterpret_cast<char*>(re);
2644	return returnError(errorcode, errorPtr);
2645	}
2646
2647	/ If the anchored option was not passed, set the flag if we can determine that*
2648	the pattern is anchored by virtue of ^ characters or \A or anything else (such
2649	as starting with . when DOTALL is set).*
2650
2651	Otherwise, if we know what the first character has to be, save it, because that
2652	speeds up unanchored matches no end. If not, see if we can set the
2653	UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches
2654	start with ^. and also when all branches start with . for non-DOTALL matches.*
2655	*/
2656
2657	if (cd.needOuterBracket ? bracketIsAnchored(code: codeStart) : branchIsAnchored(code: codeStart))
2658	re->options \|= IsAnchoredOption;
2659	else {
2660	if (firstByte < `0`) {
2661	firstByte = (cd.needOuterBracket
2662	? bracketFindFirstAssertedCharacter(code: codeStart, inassert: false)
2663	: branchFindFirstAssertedCharacter(code: codeStart, inassert: false))
2664	\| ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : `0`);
2665	}
2666	if (firstByte >= `0`) {
2667	int ch = firstByte & `255`;
2668	if (ch < `127`) {
2669	re->firstByte = ((firstByte & REQ_IGNORE_CASE) && flipCase(c: ch) == ch) ? ch : firstByte;
2670	re->options \|= UseFirstByteOptimizationOption;
2671	}
2672	} else {
2673	if (cd.needOuterBracket ? bracketNeedsLineStart(code: codeStart, captureMap: `0`, backrefMap: cd.backrefMap) : branchNeedsLineStart(code: codeStart, captureMap: `0`, backrefMap: cd.backrefMap))
2674	re->options \|= UseMultiLineFirstByteOptimizationOption;
2675	}
2676	}
2677
2678	/ For an anchored pattern, we use the "required byte" only if it follows a*
2679	variable length item in the regex. Remove the caseless flag for non-caseable
2680	bytes. /*
2681
2682	if (reqByte >= `0` && (!(re->options & IsAnchoredOption) \|\| (reqByte & REQ_VARY))) {
2683	int ch = reqByte & `255`;
2684	if (ch < `127`) {
2685	re->reqByte = ((reqByte & REQ_IGNORE_CASE) && flipCase(c: ch) == ch) ? (reqByte & ~REQ_IGNORE_CASE) : reqByte;
2686	re->options \|= UseRequiredByteOptimizationOption;
2687	}
2688	}
2689
2690	#if REGEXP_HISTOGRAM
2691	re->stringOffset = stringOffset;
2692	re->stringLength = patternLength;
2693	memcpy(reinterpret_cast<char>(re) + stringOffset, pattern, patternLength `2`);
2694	#endif
2695
2696	if (numSubpatterns)
2697	*numSubpatterns = re->topBracket;
2698	return re;
2699	}
2700
2701	void jsRegExpFree(JSRegExp* re)
2702	{
2703	delete [] reinterpret_cast<char*>(re);
2704	}
2705

source code of qtscript/src/3rdparty/javascriptcore/JavaScriptCore/pcre/pcre_compile.cpp