RegexInterpreter.cpp source code [qtscript/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexInterpreter.cpp]

1	/*
2	* Copyright (C) 2009 Apple Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions
6	* are met:
7	* 1. Redistributions of source code must retain the above copyright
8	* notice, this list of conditions and the following disclaimer.
9	* 2. Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	*
13	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24	*/
25
26	#include "config.h"
27	#include "RegexInterpreter.h"
28
29	#include "RegexCompiler.h"
30	#include "RegexPattern.h"
31
32	#ifndef NDEBUG
33	#include <stdio.h>
34	#endif
35
36	#if ENABLE(YARR)
37
38	using namespace WTF;
39
40	namespace JSC { namespace Yarr {
41
42	class Interpreter {
43	public:
44	struct ParenthesesDisjunctionContext;
45
46	struct BackTrackInfoPatternCharacter {
47	uintptr_t matchAmount;
48	};
49	struct BackTrackInfoCharacterClass {
50	uintptr_t matchAmount;
51	};
52	struct BackTrackInfoBackReference {
53	uintptr_t begin; // Not really needed for greedy quantifiers.
54	uintptr_t matchAmount; // Not really needed for fixed quantifiers.
55	};
56	struct BackTrackInfoAlternative {
57	uintptr_t offset;
58	};
59	struct BackTrackInfoParentheticalAssertion {
60	uintptr_t begin;
61	};
62	struct BackTrackInfoParenthesesOnce {
63	uintptr_t inParentheses;
64	};
65	struct BackTrackInfoParentheses {
66	uintptr_t matchAmount;
67	ParenthesesDisjunctionContext* lastContext;
68	uintptr_t prevBegin;
69	uintptr_t prevEnd;
70	};
71
72	static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
73	{
74	context->next = backTrack->lastContext;
75	backTrack->lastContext = context;
76	++backTrack->matchAmount;
77	}
78
79	static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
80	{
81	ASSERT(backTrack->matchAmount);
82	ASSERT(backTrack->lastContext);
83	backTrack->lastContext = backTrack->lastContext->next;
84	--backTrack->matchAmount;
85	}
86
87	struct DisjunctionContext
88	{
89	DisjunctionContext()
90	: term(`0`)
91	{
92	}
93
94	void* operator new(size_t, void* where)
95	{
96	return where;
97	}
98
99	int term;
100	unsigned matchBegin;
101	unsigned matchEnd;
102	uintptr_t frame[`1`];
103	};
104
105	DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
106	{
107	return new(malloc(size: sizeof(DisjunctionContext) + (disjunction->m_frameSize - `1`) * sizeof(uintptr_t))) DisjunctionContext ();
108	}
109
110	void freeDisjunctionContext(DisjunctionContext* context)
111	{
112	free(ptr: context);
113	}
114
115	struct ParenthesesDisjunctionContext
116	{
117	ParenthesesDisjunctionContext(int* output, ByteTerm& term)
118	: next(`0`)
119	{
120	unsigned firstSubpatternId = term.atom.subpatternId;
121	unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
122
123	for (unsigned i = `0`; i < (numNestedSubpatterns << `1`); ++i) {
124	subpatternBackup[i] = output[(firstSubpatternId << `1`) + i];
125	output[(firstSubpatternId << `1`) + i] = -`1`;
126	}
127
128	new(getDisjunctionContext(term)) DisjunctionContext ();
129	}
130
131	void* operator new(size_t, void* where)
132	{
133	return where;
134	}
135
136	void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
137	{
138	for (unsigned i = `0`; i < (numNestedSubpatterns << `1`); ++i)
139	output[(firstSubpatternId << `1`) + i] = subpatternBackup[i];
140	}
141
142	DisjunctionContext* getDisjunctionContext(ByteTerm& term)
143	{
144	return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << `1`]));
145	}
146
147	ParenthesesDisjunctionContext* next;
148	int subpatternBackup[`1`];
149	};
150
151	ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term)
152	{
153	return new(malloc(size: sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << `1`) - `1`) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - `1`) * sizeof(uintptr_t))) ParenthesesDisjunctionContext (output, term);
154	}
155
156	void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
157	{
158	free(ptr: context);
159	}
160
161	class InputStream {
162	public:
163	InputStream(const UChar* input, unsigned start, unsigned length)
164	: input(input)
165	, pos(start)
166	, length(length)
167	{
168	}
169
170	void next()
171	{
172	++pos;
173	}
174
175	void rewind(unsigned amount)
176	{
177	ASSERT(pos >= amount);
178	pos -= amount;
179	}
180
181	int read()
182	{
183	ASSERT(pos < length);
184	if (pos < length)
185	return input[pos];
186	return -`1`;
187	}
188
189	int readChecked(int position)
190	{
191	ASSERT(position < `0`);
192	ASSERT((unsigned)-position <= pos);
193	unsigned p = pos + position;
194	ASSERT(p < length);
195	return input[p];
196	}
197
198	int reread(unsigned from)
199	{
200	ASSERT(from < length);
201	return input[from];
202	}
203
204	int prev()
205	{
206	ASSERT(!(pos > length));
207	if (pos && length)
208	return input[pos - `1`];
209	return -`1`;
210	}
211
212	unsigned getPos()
213	{
214	return pos;
215	}
216
217	void setPos(unsigned p)
218	{
219	pos = p;
220	}
221
222	bool atStart()
223	{
224	return pos == `0`;
225	}
226
227	bool atEnd()
228	{
229	return pos == length;
230	}
231
232	bool checkInput(int count)
233	{
234	if ((pos + count) <= length) {
235	pos += count;
236	return true;
237	} else
238	return false;
239	}
240
241	void uncheckInput(int count)
242	{
243	pos -= count;
244	}
245
246	bool atStart(int position)
247	{
248	return (pos + position) == `0`;
249	}
250
251	bool atEnd(int position)
252	{
253	return (pos + position) == length;
254	}
255
256	private:
257	const UChar* input;
258	unsigned pos;
259	unsigned length;
260	};
261
262	bool testCharacterClass(CharacterClass* characterClass, int ch)
263	{
264	if (ch & `0xFF80`) {
265	for (unsigned i = `0`; i < characterClass->m_matchesUnicode.size(); ++i)
266	if (ch == characterClass->m_matchesUnicode [i])
267	return true;
268	for (unsigned i = `0`; i < characterClass->m_rangesUnicode.size(); ++i)
269	if ((ch >= characterClass->m_rangesUnicode [i].begin) && (ch <= characterClass->m_rangesUnicode [i].end))
270	return true;
271	} else {
272	for (unsigned i = `0`; i < characterClass->m_matches.size(); ++i)
273	if (ch == characterClass->m_matches [i])
274	return true;
275	for (unsigned i = `0`; i < characterClass->m_ranges.size(); ++i)
276	if ((ch >= characterClass->m_ranges [i].begin) && (ch <= characterClass->m_ranges [i].end))
277	return true;
278	}
279
280	return false;
281	}
282
283	bool tryConsumeCharacter(int testChar)
284	{
285	if (input.atEnd())
286	return false;
287
288	int ch = input.read();
289
290	if (pattern->m_ignoreCase ? ((Unicode::toLower(ch: testChar) == ch) \|\| (Unicode::toUpper(ch: testChar) == ch)) : (testChar == ch)) {
291	input.next();
292	return true;
293	}
294	return false;
295	}
296
297	bool checkCharacter(int testChar, int inputPosition)
298	{
299	return testChar == input.readChecked(position: inputPosition);
300	}
301
302	bool checkCasedCharacter(int loChar, int hiChar, int inputPosition)
303	{
304	int ch = input.readChecked(position: inputPosition);
305	return (loChar == ch) \|\| (hiChar == ch);
306	}
307
308	bool tryConsumeCharacterClass(CharacterClass* characterClass, bool invert)
309	{
310	if (input.atEnd())
311	return false;
312
313	bool match = testCharacterClass(characterClass, ch: input.read());
314
315	if (invert)
316	match = !match;
317
318	if (match) {
319	input.next();
320	return true;
321	}
322	return false;
323	}
324
325	bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition)
326	{
327	bool match = testCharacterClass(characterClass, ch: input.readChecked(position: inputPosition));
328	return invert ? !match : match;
329	}
330
331	bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset)
332	{
333	int matchSize = matchEnd - matchBegin;
334
335	if (!input.checkInput(count: matchSize))
336	return false;
337
338	for (int i = `0`; i < matchSize; ++i) {
339	if (!checkCharacter(testChar: input.reread(from: matchBegin + i), inputPosition: inputOffset - matchSize + i)) {
340	input.uncheckInput(count: matchSize);
341	return false;
342	}
343	}
344
345	return true;
346	}
347
348	bool matchAssertionBOL(ByteTerm& term)
349	{
350	return (input.atStart(position: term.inputPosition)) \|\| (pattern->m_multiline && testCharacterClass(characterClass: pattern->newlineCharacterClass, ch: input.readChecked(position: term.inputPosition - `1`)));
351	}
352
353	bool matchAssertionEOL(ByteTerm& term)
354	{
355	if (term.inputPosition)
356	return (input.atEnd(position: term.inputPosition)) \|\| (pattern->m_multiline && testCharacterClass(characterClass: pattern->newlineCharacterClass, ch: input.readChecked(position: term.inputPosition)));
357	else
358	return (input.atEnd()) \|\| (pattern->m_multiline && testCharacterClass(characterClass: pattern->newlineCharacterClass, ch: input.read()));
359	}
360
361	bool matchAssertionWordBoundary(ByteTerm& term)
362	{
363	bool prevIsWordchar = !input.atStart(position: term.inputPosition) && testCharacterClass(characterClass: pattern->wordcharCharacterClass, ch: input.readChecked(position: term.inputPosition - `1`));
364	bool readIsWordchar;
365	if (term.inputPosition)
366	readIsWordchar = !input.atEnd(position: term.inputPosition) && testCharacterClass(characterClass: pattern->wordcharCharacterClass, ch: input.readChecked(position: term.inputPosition));
367	else
368	readIsWordchar = !input.atEnd() && testCharacterClass(characterClass: pattern->wordcharCharacterClass, ch: input.read());
369
370	bool wordBoundary = prevIsWordchar != readIsWordchar;
371	return term.invert() ? !wordBoundary : wordBoundary;
372	}
373
374	bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
375	{
376	BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
377
378	switch (term.atom.quantityType) {
379	case QuantifierFixedCount:
380	break;
381
382	case QuantifierGreedy:
383	if (backTrack->matchAmount) {
384	--backTrack->matchAmount;
385	input.uncheckInput(count: `1`);
386	return true;
387	}
388	break;
389
390	case QuantifierNonGreedy:
391	if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(count: `1`)) {
392	++backTrack->matchAmount;
393	if (checkCharacter(testChar: term.atom.patternCharacter, inputPosition: term.inputPosition - `1`))
394	return true;
395	}
396	input.uncheckInput(count: backTrack->matchAmount);
397	break;
398	}
399
400	return false;
401	}
402
403	bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
404	{
405	BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
406
407	switch (term.atom.quantityType) {
408	case QuantifierFixedCount:
409	break;
410
411	case QuantifierGreedy:
412	if (backTrack->matchAmount) {
413	--backTrack->matchAmount;
414	input.uncheckInput(count: `1`);
415	return true;
416	}
417	break;
418
419	case QuantifierNonGreedy:
420	if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(count: `1`)) {
421	++backTrack->matchAmount;
422	if (checkCasedCharacter(loChar: term.atom.casedCharacter.lo, hiChar: term.atom.casedCharacter.hi, inputPosition: term.inputPosition - `1`))
423	return true;
424	}
425	input.uncheckInput(count: backTrack->matchAmount);
426	break;
427	}
428
429	return false;
430	}
431
432	bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
433	{
434	ASSERT(term.type == ByteTerm::TypeCharacterClass);
435	BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
436
437	switch (term.atom.quantityType) {
438	case QuantifierFixedCount: {
439	for (unsigned matchAmount = `0`; matchAmount < term.atom.quantityCount; ++matchAmount) {
440	if (!checkCharacterClass(characterClass: term.atom.characterClass, invert: term.invert(), inputPosition: term.inputPosition + matchAmount))
441	return false;
442	}
443	return true;
444	}
445
446	case QuantifierGreedy: {
447	unsigned matchAmount = `0`;
448	while ((matchAmount < term.atom.quantityCount) && input.checkInput(count: `1`)) {
449	if (!checkCharacterClass(characterClass: term.atom.characterClass, invert: term.invert(), inputPosition: term.inputPosition - `1`)) {
450	input.uncheckInput(count: `1`);
451	break;
452	}
453	++matchAmount;
454	}
455	backTrack->matchAmount = matchAmount;
456
457	return true;
458	}
459
460	case QuantifierNonGreedy:
461	backTrack->matchAmount = `0`;
462	return true;
463	}
464
465	ASSERT_NOT_REACHED();
466	return false;
467	}
468
469	bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
470	{
471	ASSERT(term.type == ByteTerm::TypeCharacterClass);
472	BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
473
474	switch (term.atom.quantityType) {
475	case QuantifierFixedCount:
476	break;
477
478	case QuantifierGreedy:
479	if (backTrack->matchAmount) {
480	--backTrack->matchAmount;
481	input.uncheckInput(count: `1`);
482	return true;
483	}
484	break;
485
486	case QuantifierNonGreedy:
487	if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(count: `1`)) {
488	++backTrack->matchAmount;
489	if (checkCharacterClass(characterClass: term.atom.characterClass, invert: term.invert(), inputPosition: term.inputPosition - `1`))
490	return true;
491	}
492	input.uncheckInput(count: backTrack->matchAmount);
493	break;
494	}
495
496	return false;
497	}
498
499	bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
500	{
501	ASSERT(term.type == ByteTerm::TypeBackReference);
502	BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
503
504	int matchBegin = output[(term.atom.subpatternId << `1`)];
505	int matchEnd = output[(term.atom.subpatternId << `1`) + `1`];
506	ASSERT((matchBegin == -`1`) == (matchEnd == -`1`));
507	ASSERT(matchBegin <= matchEnd);
508
509	if (matchBegin == matchEnd)
510	return true;
511
512	switch (term.atom.quantityType) {
513	case QuantifierFixedCount: {
514	backTrack->begin = input.getPos();
515	for (unsigned matchAmount = `0`; matchAmount < term.atom.quantityCount; ++matchAmount) {
516	if (!tryConsumeBackReference(matchBegin, matchEnd, inputOffset: term.inputPosition)) {
517	input.setPos(backTrack->begin);
518	return false;
519	}
520	}
521	return true;
522	}
523
524	case QuantifierGreedy: {
525	unsigned matchAmount = `0`;
526	while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, inputOffset: term.inputPosition))
527	++matchAmount;
528	backTrack->matchAmount = matchAmount;
529	return true;
530	}
531
532	case QuantifierNonGreedy:
533	backTrack->begin = input.getPos();
534	backTrack->matchAmount = `0`;
535	return true;
536	}
537
538	ASSERT_NOT_REACHED();
539	return false;
540	}
541
542	bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
543	{
544	ASSERT(term.type == ByteTerm::TypeBackReference);
545	BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
546
547	int matchBegin = output[(term.atom.subpatternId << `1`)];
548	int matchEnd = output[(term.atom.subpatternId << `1`) + `1`];
549	ASSERT((matchBegin == -`1`) == (matchEnd == -`1`));
550	ASSERT(matchBegin <= matchEnd);
551
552	if (matchBegin == matchEnd)
553	return false;
554
555	switch (term.atom.quantityType) {
556	case QuantifierFixedCount:
557	// for quantityCount == 1, could rewind.
558	input.setPos(backTrack->begin);
559	break;
560
561	case QuantifierGreedy:
562	if (backTrack->matchAmount) {
563	--backTrack->matchAmount;
564	input.rewind(amount: matchEnd - matchBegin);
565	return true;
566	}
567	break;
568
569	case QuantifierNonGreedy:
570	if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, inputOffset: term.inputPosition)) {
571	++backTrack->matchAmount;
572	return true;
573	} else
574	input.setPos(backTrack->begin);
575	break;
576	}
577
578	return false;
579	}
580
581	void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
582	{
583	if (term.capture()) {
584	unsigned subpatternId = term.atom.subpatternId;
585	output[(subpatternId << `1`)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition;
586	output[(subpatternId << `1`) + `1`] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition;
587	}
588	}
589	void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
590	{
591	unsigned firstSubpatternId = term.atom.subpatternId;
592	unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
593	context->restoreOutput(output, firstSubpatternId, numNestedSubpatterns: count);
594	}
595	void resetAssertionMatches(ByteTerm& term)
596	{
597	unsigned firstSubpatternId = term.atom.subpatternId;
598	unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
599	for (unsigned i = `0`; i < (count << `1`); ++i)
600	output[(firstSubpatternId << `1`) + i] = -`1`;
601	}
602	bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
603	{
604	while (backTrack->matchAmount) {
605	ParenthesesDisjunctionContext* context = backTrack->lastContext;
606
607	if (matchDisjunction(disjunction: term.atom.parenthesesDisjunction, context: context->getDisjunctionContext(term), btrack: true))
608	return true;
609
610	resetMatches(term, context);
611	popParenthesesDisjunctionContext(backTrack);
612	freeParenthesesDisjunctionContext(context);
613	}
614
615	return false;
616	}
617
618	bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
619	{
620	ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
621	ASSERT(term.atom.quantityCount == `1`);
622
623	BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
624
625	switch (term.atom.quantityType) {
626	case QuantifierGreedy: {
627	// set this speculatively; if we get to the parens end this will be true.
628	backTrack->inParentheses = `1`;
629	break;
630	}
631	case QuantifierNonGreedy: {
632	backTrack->inParentheses = `0`;
633	context->term += term.atom.parenthesesWidth;
634	return true;
635	}
636	case QuantifierFixedCount:
637	break;
638	}
639
640	if (term.capture()) {
641	unsigned subpatternId = term.atom.subpatternId;
642	output[(subpatternId << `1`)] = input.getPos() + term.inputPosition;
643	}
644
645	return true;
646	}
647
648	bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*)
649	{
650	ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
651	ASSERT(term.atom.quantityCount == `1`);
652
653	if (term.capture()) {
654	unsigned subpatternId = term.atom.subpatternId;
655	output[(subpatternId << `1`) + `1`] = input.getPos() + term.inputPosition;
656	}
657	return true;
658	}
659
660	bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
661	{
662	ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
663	ASSERT(term.atom.quantityCount == `1`);
664
665	BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
666
667	if (term.capture()) {
668	unsigned subpatternId = term.atom.subpatternId;
669	output[(subpatternId << `1`)] = -`1`;
670	output[(subpatternId << `1`) + `1`] = -`1`;
671	}
672
673	switch (term.atom.quantityType) {
674	case QuantifierGreedy:
675	// if we backtrack to this point, there is another chance - try matching nothing.
676	ASSERT(backTrack->inParentheses);
677	backTrack->inParentheses = `0`;
678	context->term += term.atom.parenthesesWidth;
679	return true;
680	case QuantifierNonGreedy:
681	ASSERT(backTrack->inParentheses);
682	case QuantifierFixedCount:
683	break;
684	}
685
686	return false;
687	}
688
689	bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
690	{
691	ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
692	ASSERT(term.atom.quantityCount == `1`);
693
694	BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
695
696	switch (term.atom.quantityType) {
697	case QuantifierGreedy:
698	if (!backTrack->inParentheses) {
699	context->term -= term.atom.parenthesesWidth;
700	return false;
701	}
702	case QuantifierNonGreedy:
703	if (!backTrack->inParentheses) {
704	// now try to match the parens; set this speculatively.
705	backTrack->inParentheses = `1`;
706	if (term.capture()) {
707	unsigned subpatternId = term.atom.subpatternId;
708	output[(subpatternId << `1`) + `1`] = input.getPos() + term.inputPosition;
709	}
710	context->term -= term.atom.parenthesesWidth;
711	return true;
712	}
713	case QuantifierFixedCount:
714	break;
715	}
716
717	return false;
718	}
719
720	bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
721	{
722	ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
723	ASSERT(term.atom.quantityCount == `1`);
724
725	BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
726
727	backTrack->begin = input.getPos();
728	return true;
729	}
730
731	bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
732	{
733	ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
734	ASSERT(term.atom.quantityCount == `1`);
735
736	BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
737
738	input.setPos(backTrack->begin);
739
740	// We've reached the end of the parens; if they are inverted, this is failure.
741	if (term.invert()) {
742	context->term -= term.atom.parenthesesWidth;
743	return false;
744	}
745
746	return true;
747	}
748
749	bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
750	{
751	ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
752	ASSERT(term.atom.quantityCount == `1`);
753
754	// We've failed to match parens; if they are inverted, this is win!
755	if (term.invert()) {
756	context->term += term.atom.parenthesesWidth;
757	return true;
758	}
759
760	return false;
761	}
762
763	bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
764	{
765	ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
766	ASSERT(term.atom.quantityCount == `1`);
767
768	BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
769
770	input.setPos(backTrack->begin);
771
772	context->term -= term.atom.parenthesesWidth;
773	return false;
774	}
775
776	bool matchParentheses(ByteTerm& term, DisjunctionContext* context)
777	{
778	ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
779
780	BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
781
782	unsigned subpatternId = term.atom.subpatternId;
783	ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
784
785	backTrack->prevBegin = output[(subpatternId << `1`)];
786	backTrack->prevEnd = output[(subpatternId << `1`) + `1`];
787
788	backTrack->matchAmount = `0`;
789	backTrack->lastContext = `0`;
790
791	switch (term.atom.quantityType) {
792	case QuantifierFixedCount: {
793	// While we haven't yet reached our fixed limit,
794	while (backTrack->matchAmount < term.atom.quantityCount) {
795	// Try to do a match, and it it succeeds, add it to the list.
796	ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term);
797	if (matchDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term)))
798	appendParenthesesDisjunctionContext(backTrack, context);
799	else {
800	// The match failed; try to find an alternate point to carry on from.
801	resetMatches(term, context);
802	freeParenthesesDisjunctionContext(context);
803	if (!parenthesesDoBacktrack(term, backTrack))
804	return false;
805	}
806	}
807
808	ASSERT(backTrack->matchAmount == term.atom.quantityCount);
809	ParenthesesDisjunctionContext* context = backTrack->lastContext;
810	recordParenthesesMatch(term, context);
811	return true;
812	}
813
814	case QuantifierGreedy: {
815	while (backTrack->matchAmount < term.atom.quantityCount) {
816	ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term);
817	if (matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term)))
818	appendParenthesesDisjunctionContext(backTrack, context);
819	else {
820	resetMatches(term, context);
821	freeParenthesesDisjunctionContext(context);
822	break;
823	}
824	}
825
826	if (backTrack->matchAmount) {
827	ParenthesesDisjunctionContext* context = backTrack->lastContext;
828	recordParenthesesMatch(term, context);
829	}
830	return true;
831	}
832
833	case QuantifierNonGreedy:
834	return true;
835	}
836
837	ASSERT_NOT_REACHED();
838	return false;
839	}
840
841	// Rules for backtracking differ depending on whether this is greedy or non-greedy.
842	//
843	// Greedy matches never should try just adding more - you should already have done
844	// the 'more' cases. Always backtrack, at least a leetle bit. However cases where
845	// you backtrack an item off the list needs checking, since we'll never have matched
846	// the one less case. Tracking forwards, still add as much as possible.
847	//
848	// Non-greedy, we've already done the one less case, so don't match on popping.
849	// We haven't done the one more case, so always try to add that.
850	//
851	bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
852	{
853	ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
854
855	BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
856
857	if (term.capture()) {
858	unsigned subpatternId = term.atom.subpatternId;
859	output[(subpatternId << `1`)] = backTrack->prevBegin;
860	output[(subpatternId << `1`) + `1`] = backTrack->prevEnd;
861	}
862
863	ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
864
865	switch (term.atom.quantityType) {
866	case QuantifierFixedCount: {
867	ASSERT(backTrack->matchAmount == term.atom.quantityCount);
868
869	ParenthesesDisjunctionContext* context = `0`;
870
871	if (!parenthesesDoBacktrack(term, backTrack))
872	return false;
873
874	// While we haven't yet reached our fixed limit,
875	while (backTrack->matchAmount < term.atom.quantityCount) {
876	// Try to do a match, and it it succeeds, add it to the list.
877	context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term);
878	if (matchDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term)))
879	appendParenthesesDisjunctionContext(backTrack, context);
880	else {
881	// The match failed; try to find an alternate point to carry on from.
882	resetMatches(term, context);
883	freeParenthesesDisjunctionContext(context);
884	if (!parenthesesDoBacktrack(term, backTrack))
885	return false;
886	}
887	}
888
889	ASSERT(backTrack->matchAmount == term.atom.quantityCount);
890	context = backTrack->lastContext;
891	recordParenthesesMatch(term, context);
892	return true;
893	}
894
895	case QuantifierGreedy: {
896	if (!backTrack->matchAmount)
897	return false;
898
899	ParenthesesDisjunctionContext* context = backTrack->lastContext;
900	if (matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term), btrack: true)) {
901	while (backTrack->matchAmount < term.atom.quantityCount) {
902	ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term);
903	if (matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term)))
904	appendParenthesesDisjunctionContext(backTrack, context);
905	else {
906	resetMatches(term, context);
907	freeParenthesesDisjunctionContext(context);
908	break;
909	}
910	}
911	} else {
912	resetMatches(term, context);
913	popParenthesesDisjunctionContext(backTrack);
914	freeParenthesesDisjunctionContext(context);
915	}
916
917	if (backTrack->matchAmount) {
918	ParenthesesDisjunctionContext* context = backTrack->lastContext;
919	recordParenthesesMatch(term, context);
920	}
921	return true;
922	}
923
924	case QuantifierNonGreedy: {
925	// If we've not reached the limit, try to add one more match.
926	if (backTrack->matchAmount < term.atom.quantityCount) {
927	ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term);
928	if (matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term))) {
929	appendParenthesesDisjunctionContext(backTrack, context);
930	recordParenthesesMatch(term, context);
931	return true;
932	} else {
933	resetMatches(term, context);
934	freeParenthesesDisjunctionContext(context);
935	}
936	}
937
938	// Nope - okay backtrack looking for an alternative.
939	while (backTrack->matchAmount) {
940	ParenthesesDisjunctionContext* context = backTrack->lastContext;
941	if (matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term), btrack: true)) {
942	// successful backtrack! we're back in the game!
943	if (backTrack->matchAmount) {
944	context = backTrack->lastContext;
945	recordParenthesesMatch(term, context);
946	}
947	return true;
948	}
949
950	// pop a match off the stack
951	resetMatches(term, context);
952	popParenthesesDisjunctionContext(backTrack);
953	freeParenthesesDisjunctionContext(context);
954	}
955
956	return false;
957	}
958	}
959
960	ASSERT_NOT_REACHED();
961	return false;
962	}
963
964	#define MATCH_NEXT() { ++context->term; goto matchAgain; }
965	#define BACKTRACK() { --context->term; goto backtrack; }
966	#define currentTerm() (disjunction->terms[context->term])
967	bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
968	{
969	if (btrack)
970	BACKTRACK();
971
972	context->matchBegin = input.getPos();
973	context->term = `0`;
974
975	matchAgain:
976	ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
977
978	switch (currentTerm().type) {
979	case ByteTerm::TypeSubpatternBegin:
980	MATCH_NEXT();
981	case ByteTerm::TypeSubpatternEnd:
982	context->matchEnd = input.getPos();
983	return true;
984
985	case ByteTerm::TypeBodyAlternativeBegin:
986	MATCH_NEXT();
987	case ByteTerm::TypeBodyAlternativeDisjunction:
988	case ByteTerm::TypeBodyAlternativeEnd:
989	context->matchEnd = input.getPos();
990	return true;
991
992	case ByteTerm::TypeAlternativeBegin:
993	MATCH_NEXT();
994	case ByteTerm::TypeAlternativeDisjunction:
995	case ByteTerm::TypeAlternativeEnd: {
996	int offset = currentTerm().alternative.end;
997	BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
998	backTrack->offset = offset;
999	context->term += offset;
1000	MATCH_NEXT();
1001	}
1002
1003	case ByteTerm::TypeAssertionBOL:
1004	if (matchAssertionBOL(currentTerm()))
1005	MATCH_NEXT();
1006	BACKTRACK();
1007	case ByteTerm::TypeAssertionEOL:
1008	if (matchAssertionEOL(currentTerm()))
1009	MATCH_NEXT();
1010	BACKTRACK();
1011	case ByteTerm::TypeAssertionWordBoundary:
1012	if (matchAssertionWordBoundary(currentTerm()))
1013	MATCH_NEXT();
1014	BACKTRACK();
1015
1016	case ByteTerm::TypePatternCharacterOnce:
1017	case ByteTerm::TypePatternCharacterFixed: {
1018	for (unsigned matchAmount = `0`; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1019	if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount))
1020	BACKTRACK();
1021	}
1022	MATCH_NEXT();
1023	}
1024	case ByteTerm::TypePatternCharacterGreedy: {
1025	BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1026	unsigned matchAmount = `0`;
1027	while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(count: `1`)) {
1028	if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - `1`)) {
1029	input.uncheckInput(count: `1`);
1030	break;
1031	}
1032	++matchAmount;
1033	}
1034	backTrack->matchAmount = matchAmount;
1035
1036	MATCH_NEXT();
1037	}
1038	case ByteTerm::TypePatternCharacterNonGreedy: {
1039	BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1040	backTrack->matchAmount = `0`;
1041	MATCH_NEXT();
1042	}
1043
1044	case ByteTerm::TypePatternCasedCharacterOnce:
1045	case ByteTerm::TypePatternCasedCharacterFixed: {
1046	for (unsigned matchAmount = `0`; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) {
1047	if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount))
1048	BACKTRACK();
1049	}
1050	MATCH_NEXT();
1051	}
1052	case ByteTerm::TypePatternCasedCharacterGreedy: {
1053	BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1054	unsigned matchAmount = `0`;
1055	while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(count: `1`)) {
1056	if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - `1`)) {
1057	input.uncheckInput(count: `1`);
1058	break;
1059	}
1060	++matchAmount;
1061	}
1062	backTrack->matchAmount = matchAmount;
1063
1064	MATCH_NEXT();
1065	}
1066	case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1067	BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1068	backTrack->matchAmount = `0`;
1069	MATCH_NEXT();
1070	}
1071
1072	case ByteTerm::TypeCharacterClass:
1073	if (matchCharacterClass(currentTerm(), context))
1074	MATCH_NEXT();
1075	BACKTRACK();
1076	case ByteTerm::TypeBackReference:
1077	if (matchBackReference(currentTerm(), context))
1078	MATCH_NEXT();
1079	BACKTRACK();
1080	case ByteTerm::TypeParenthesesSubpattern:
1081	if (matchParentheses(currentTerm(), context))
1082	MATCH_NEXT();
1083	BACKTRACK();
1084	case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1085	if (matchParenthesesOnceBegin(currentTerm(), context))
1086	MATCH_NEXT();
1087	BACKTRACK();
1088	case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1089	if (matchParenthesesOnceEnd(currentTerm(), context))
1090	MATCH_NEXT();
1091	BACKTRACK();
1092	case ByteTerm::TypeParentheticalAssertionBegin:
1093	if (matchParentheticalAssertionBegin(currentTerm(), context))
1094	MATCH_NEXT();
1095	BACKTRACK();
1096	case ByteTerm::TypeParentheticalAssertionEnd:
1097	if (matchParentheticalAssertionEnd(currentTerm(), context))
1098	MATCH_NEXT();
1099	BACKTRACK();
1100
1101	case ByteTerm::TypeCheckInput:
1102	if (input.checkInput(currentTerm().checkInputCount))
1103	MATCH_NEXT();
1104	BACKTRACK();
1105	}
1106
1107	// We should never fall-through to here.
1108	ASSERT_NOT_REACHED();
1109
1110	backtrack:
1111	ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1112
1113	switch (currentTerm().type) {
1114	case ByteTerm::TypeSubpatternBegin:
1115	return false;
1116	case ByteTerm::TypeSubpatternEnd:
1117	ASSERT_NOT_REACHED();
1118
1119	case ByteTerm::TypeBodyAlternativeBegin:
1120	case ByteTerm::TypeBodyAlternativeDisjunction: {
1121	int offset = currentTerm().alternative.next;
1122	context->term += offset;
1123	if (offset > `0`)
1124	MATCH_NEXT();
1125
1126	if (input.atEnd())
1127	return false;
1128
1129	input.next();
1130	context->matchBegin = input.getPos();
1131	MATCH_NEXT();
1132	}
1133	case ByteTerm::TypeBodyAlternativeEnd:
1134	ASSERT_NOT_REACHED();
1135
1136	case ByteTerm::TypeAlternativeBegin:
1137	case ByteTerm::TypeAlternativeDisjunction: {
1138	int offset = currentTerm().alternative.next;
1139	context->term += offset;
1140	if (offset > `0`)
1141	MATCH_NEXT();
1142	BACKTRACK();
1143	}
1144	case ByteTerm::TypeAlternativeEnd: {
1145	// We should never backtrack back into an alternative of the main body of the regex.
1146	BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1147	unsigned offset = backTrack->offset;
1148	context->term -= offset;
1149	BACKTRACK();
1150	}
1151
1152	case ByteTerm::TypeAssertionBOL:
1153	case ByteTerm::TypeAssertionEOL:
1154	case ByteTerm::TypeAssertionWordBoundary:
1155	BACKTRACK();
1156
1157	case ByteTerm::TypePatternCharacterOnce:
1158	case ByteTerm::TypePatternCharacterFixed:
1159	case ByteTerm::TypePatternCharacterGreedy:
1160	case ByteTerm::TypePatternCharacterNonGreedy:
1161	if (backtrackPatternCharacter(currentTerm(), context))
1162	MATCH_NEXT();
1163	BACKTRACK();
1164	case ByteTerm::TypePatternCasedCharacterOnce:
1165	case ByteTerm::TypePatternCasedCharacterFixed:
1166	case ByteTerm::TypePatternCasedCharacterGreedy:
1167	case ByteTerm::TypePatternCasedCharacterNonGreedy:
1168	if (backtrackPatternCasedCharacter(currentTerm(), context))
1169	MATCH_NEXT();
1170	BACKTRACK();
1171	case ByteTerm::TypeCharacterClass:
1172	if (backtrackCharacterClass(currentTerm(), context))
1173	MATCH_NEXT();
1174	BACKTRACK();
1175	case ByteTerm::TypeBackReference:
1176	if (backtrackBackReference(currentTerm(), context))
1177	MATCH_NEXT();
1178	BACKTRACK();
1179	case ByteTerm::TypeParenthesesSubpattern:
1180	if (backtrackParentheses(currentTerm(), context))
1181	MATCH_NEXT();
1182	BACKTRACK();
1183	case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1184	if (backtrackParenthesesOnceBegin(currentTerm(), context))
1185	MATCH_NEXT();
1186	BACKTRACK();
1187	case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1188	if (backtrackParenthesesOnceEnd(currentTerm(), context))
1189	MATCH_NEXT();
1190	BACKTRACK();
1191	case ByteTerm::TypeParentheticalAssertionBegin:
1192	if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1193	MATCH_NEXT();
1194	BACKTRACK();
1195	case ByteTerm::TypeParentheticalAssertionEnd:
1196	if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1197	MATCH_NEXT();
1198	BACKTRACK();
1199
1200	case ByteTerm::TypeCheckInput:
1201	input.uncheckInput(currentTerm().checkInputCount);
1202	BACKTRACK();
1203	}
1204
1205	ASSERT_NOT_REACHED();
1206	return false;
1207	}
1208
1209	bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1210	{
1211	if (matchDisjunction(disjunction, context, btrack)) {
1212	while (context->matchBegin == context->matchEnd) {
1213	if (!matchDisjunction(disjunction, context, btrack: true))
1214	return false;
1215	}
1216	return true;
1217	}
1218
1219	return false;
1220	}
1221
1222	int interpret()
1223	{
1224	for (unsigned i = `0`; i < ((pattern->m_body ->m_numSubpatterns + `1`) << `1`); ++i)
1225	output[i] = -`1`;
1226
1227	DisjunctionContext* context = allocDisjunctionContext(disjunction: pattern->m_body.get());
1228
1229	if (matchDisjunction(disjunction: pattern->m_body.get(), context)) {
1230	output[`0`] = context->matchBegin;
1231	output[`1`] = context->matchEnd;
1232	}
1233
1234	freeDisjunctionContext(context);
1235
1236	return output[`0`];
1237	}
1238
1239	Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length)
1240	: pattern(pattern)
1241	, output(output)
1242	, input (inputChar, start, length)
1243	{
1244	}
1245
1246	private:
1247	BytecodePattern *pattern;
1248	int* output;
1249	InputStream input;
1250	};
1251
1252
1253
1254	class ByteCompiler {
1255	struct ParenthesesStackEntry {
1256	unsigned beginTerm;
1257	unsigned savedAlternativeIndex;
1258	ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/, unsigned subpatternId, bool capture = false/)
1259	: beginTerm(beginTerm)
1260	, savedAlternativeIndex(savedAlternativeIndex)
1261	{
1262	}
1263	};
1264
1265	public:
1266	ByteCompiler(RegexPattern& pattern)
1267	: m_pattern(pattern)
1268	{
1269	m_bodyDisjunction = `0`;
1270	m_currentAlternativeIndex = `0`;
1271	}
1272
1273	BytecodePattern* compile()
1274	{
1275	regexBegin(numSubpatterns: m_pattern.m_numSubpatterns, callFrameSize: m_pattern.m_body->m_callFrameSize);
1276	emitDisjunction(disjunction: m_pattern.m_body);
1277	regexEnd();
1278
1279	return new BytecodePattern (m_bodyDisjunction, m_allParenthesesInfo, m_pattern);
1280	}
1281
1282	void checkInput(unsigned count)
1283	{
1284	m_bodyDisjunction->terms.append(val: ByteTerm::CheckInput(count));
1285	}
1286
1287	void assertionBOL(int inputPosition)
1288	{
1289	m_bodyDisjunction->terms.append(val: ByteTerm::BOL(inputPos: inputPosition));
1290	}
1291
1292	void assertionEOL(int inputPosition)
1293	{
1294	m_bodyDisjunction->terms.append(val: ByteTerm::EOL(inputPos: inputPosition));
1295	}
1296
1297	void assertionWordBoundary(bool invert, int inputPosition)
1298	{
1299	m_bodyDisjunction->terms.append(val: ByteTerm::WordBoundary(invert, inputPos: inputPosition));
1300	}
1301
1302	void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1303	{
1304	if (m_pattern.m_ignoreCase) {
1305	UChar lo = Unicode::toLower(ch);
1306	UChar hi = Unicode::toUpper(ch);
1307
1308	if (lo != hi) {
1309	m_bodyDisjunction->terms.append(val: ByteTerm (lo, hi, inputPosition, frameLocation, quantityCount, quantityType));
1310	return;
1311	}
1312	}
1313
1314	m_bodyDisjunction->terms.append(val: ByteTerm (ch, inputPosition, frameLocation, quantityCount, quantityType));
1315	}
1316
1317	void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1318	{
1319	m_bodyDisjunction->terms.append(val: ByteTerm (characterClass, invert, inputPosition));
1320
1321	m_bodyDisjunction->terms [m_bodyDisjunction->terms.size() - `1`].atom.quantityCount = quantityCount;
1322	m_bodyDisjunction->terms [m_bodyDisjunction->terms.size() - `1`].atom.quantityType = quantityType;
1323	m_bodyDisjunction->terms [m_bodyDisjunction->terms.size() - `1`].frameLocation = frameLocation;
1324	}
1325
1326	void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType)
1327	{
1328	ASSERT(subpatternId);
1329
1330	m_bodyDisjunction->terms.append(val: ByteTerm::BackReference(subpatternId, inputPos: inputPosition));
1331
1332	m_bodyDisjunction->terms [m_bodyDisjunction->terms.size() - `1`].atom.quantityCount = quantityCount;
1333	m_bodyDisjunction->terms [m_bodyDisjunction->terms.size() - `1`].atom.quantityType = quantityType;
1334	m_bodyDisjunction->terms [m_bodyDisjunction->terms.size() - `1`].frameLocation = frameLocation;
1335	}
1336
1337	void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1338	{
1339	int beginTerm = m_bodyDisjunction->terms.size();
1340
1341	m_bodyDisjunction->terms.append(val: ByteTerm (ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition));
1342	m_bodyDisjunction->terms [m_bodyDisjunction->terms.size() - `1`].frameLocation = frameLocation;
1343	m_bodyDisjunction->terms.append(val: ByteTerm::AlternativeBegin());
1344	m_bodyDisjunction->terms [m_bodyDisjunction->terms.size() - `1`].frameLocation = alternativeFrameLocation;
1345
1346	m_parenthesesStack.append(val: ParenthesesStackEntry (beginTerm, m_currentAlternativeIndex));
1347	m_currentAlternativeIndex = beginTerm + `1`;
1348	}
1349
1350	void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1351	{
1352	int beginTerm = m_bodyDisjunction->terms.size();
1353
1354	m_bodyDisjunction->terms.append(val: ByteTerm (ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, `0`));
1355	m_bodyDisjunction->terms [m_bodyDisjunction->terms.size() - `1`].frameLocation = frameLocation;
1356	m_bodyDisjunction->terms.append(val: ByteTerm::AlternativeBegin());
1357	m_bodyDisjunction->terms [m_bodyDisjunction->terms.size() - `1`].frameLocation = alternativeFrameLocation;
1358
1359	m_parenthesesStack.append(val: ParenthesesStackEntry (beginTerm, m_currentAlternativeIndex));
1360	m_currentAlternativeIndex = beginTerm + `1`;
1361	}
1362
1363	unsigned popParenthesesStack()
1364	{
1365	ASSERT(m_parenthesesStack.size());
1366	int stackEnd = m_parenthesesStack.size() - `1`;
1367	unsigned beginTerm = m_parenthesesStack [stackEnd].beginTerm;
1368	m_currentAlternativeIndex = m_parenthesesStack [stackEnd].savedAlternativeIndex;
1369	m_parenthesesStack.shrink(size: stackEnd);
1370
1371	ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1372	ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1373
1374	return beginTerm;
1375	}
1376
1377	#ifndef NDEBUG
1378	void dumpDisjunction(ByteDisjunction* disjunction)
1379	{
1380	printf(format: "ByteDisjunction(%p):\n\t", disjunction);
1381	for (unsigned i = `0`; i < disjunction->terms.size(); ++i)
1382	printf(format: "{ %d } ", disjunction->terms [i].type);
1383	printf(format: "\n");
1384	}
1385	#endif
1386
1387	void closeAlternative(int beginTerm)
1388	{
1389	int origBeginTerm = beginTerm;
1390	ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1391	int endIndex = m_bodyDisjunction->terms.size();
1392
1393	unsigned frameLocation = m_bodyDisjunction->terms [beginTerm].frameLocation;
1394
1395	if (!m_bodyDisjunction->terms [beginTerm].alternative.next)
1396	m_bodyDisjunction->terms.remove(position: beginTerm);
1397	else {
1398	while (m_bodyDisjunction->terms [beginTerm].alternative.next) {
1399	beginTerm += m_bodyDisjunction->terms [beginTerm].alternative.next;
1400	ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1401	m_bodyDisjunction->terms [beginTerm].alternative.end = endIndex - beginTerm;
1402	m_bodyDisjunction->terms [beginTerm].frameLocation = frameLocation;
1403	}
1404
1405	m_bodyDisjunction->terms [beginTerm].alternative.next = origBeginTerm - beginTerm;
1406
1407	m_bodyDisjunction->terms.append(val: ByteTerm::AlternativeEnd());
1408	m_bodyDisjunction->terms [endIndex].frameLocation = frameLocation;
1409	}
1410	}
1411
1412	void closeBodyAlternative()
1413	{
1414	int beginTerm = `0`;
1415	int origBeginTerm = `0`;
1416	ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1417	int endIndex = m_bodyDisjunction->terms.size();
1418
1419	unsigned frameLocation = m_bodyDisjunction->terms [beginTerm].frameLocation;
1420
1421	while (m_bodyDisjunction->terms [beginTerm].alternative.next) {
1422	beginTerm += m_bodyDisjunction->terms [beginTerm].alternative.next;
1423	ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1424	m_bodyDisjunction->terms [beginTerm].alternative.end = endIndex - beginTerm;
1425	m_bodyDisjunction->terms [beginTerm].frameLocation = frameLocation;
1426	}
1427
1428	m_bodyDisjunction->terms [beginTerm].alternative.next = origBeginTerm - beginTerm;
1429
1430	m_bodyDisjunction->terms.append(val: ByteTerm::BodyAlternativeEnd());
1431	m_bodyDisjunction->terms [endIndex].frameLocation = frameLocation;
1432	}
1433
1434	void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = `0`)
1435	{
1436	unsigned beginTerm = popParenthesesStack();
1437	closeAlternative(beginTerm: beginTerm + `1`);
1438	unsigned endTerm = m_bodyDisjunction->terms.size();
1439
1440	bool isAssertion = m_bodyDisjunction->terms [beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin;
1441	bool invertOrCapture = m_bodyDisjunction->terms [beginTerm].invertOrCapture;
1442	unsigned subpatternId = m_bodyDisjunction->terms [beginTerm].atom.subpatternId;
1443
1444	m_bodyDisjunction->terms.append(val: ByteTerm (isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition));
1445	m_bodyDisjunction->terms [beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1446	m_bodyDisjunction->terms [endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1447	m_bodyDisjunction->terms [endTerm].frameLocation = frameLocation;
1448
1449	if (doInline) {
1450	m_bodyDisjunction->terms [beginTerm].atom.quantityCount = quantityCount;
1451	m_bodyDisjunction->terms [beginTerm].atom.quantityType = quantityType;
1452	m_bodyDisjunction->terms [endTerm].atom.quantityCount = quantityCount;
1453	m_bodyDisjunction->terms [endTerm].atom.quantityType = quantityType;
1454	} else {
1455	ByteTerm& parenthesesBegin = m_bodyDisjunction->terms [beginTerm];
1456	ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1457
1458	bool invertOrCapture = parenthesesBegin.invertOrCapture;
1459	unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1460
1461	unsigned numSubpatterns = lastSubpatternId - subpatternId + `1`;
1462	ByteDisjunction* parenthesesDisjunction = new ByteDisjunction (numSubpatterns, callFrameSize);
1463
1464	parenthesesDisjunction->terms.append(val: ByteTerm::SubpatternBegin());
1465	for (unsigned termInParentheses = beginTerm + `1`; termInParentheses < endTerm; ++termInParentheses)
1466	parenthesesDisjunction->terms.append(val: m_bodyDisjunction->terms [termInParentheses]);
1467	parenthesesDisjunction->terms.append(val: ByteTerm::SubpatternEnd());
1468
1469	m_bodyDisjunction->terms.shrink(size: beginTerm);
1470
1471	m_allParenthesesInfo.append(val: parenthesesDisjunction);
1472	m_bodyDisjunction->terms.append(val: ByteTerm (ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition));
1473
1474	m_bodyDisjunction->terms [beginTerm].atom.quantityCount = quantityCount;
1475	m_bodyDisjunction->terms [beginTerm].atom.quantityType = quantityType;
1476	m_bodyDisjunction->terms [beginTerm].frameLocation = frameLocation;
1477	}
1478	}
1479
1480	void regexBegin(unsigned numSubpatterns, unsigned callFrameSize)
1481	{
1482	m_bodyDisjunction = new ByteDisjunction (numSubpatterns, callFrameSize);
1483	m_bodyDisjunction->terms.append(val: ByteTerm::BodyAlternativeBegin());
1484	m_bodyDisjunction->terms [`0`].frameLocation = `0`;
1485	m_currentAlternativeIndex = `0`;
1486	}
1487
1488	void regexEnd()
1489	{
1490	closeBodyAlternative();
1491	}
1492
1493	void alternativeBodyDisjunction()
1494	{
1495	int newAlternativeIndex = m_bodyDisjunction->terms.size();
1496	m_bodyDisjunction->terms [m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1497	m_bodyDisjunction->terms.append(val: ByteTerm::BodyAlternativeDisjunction());
1498
1499	m_currentAlternativeIndex = newAlternativeIndex;
1500	}
1501
1502	void alternativeDisjunction()
1503	{
1504	int newAlternativeIndex = m_bodyDisjunction->terms.size();
1505	m_bodyDisjunction->terms [m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1506	m_bodyDisjunction->terms.append(val: ByteTerm::AlternativeDisjunction());
1507
1508	m_currentAlternativeIndex = newAlternativeIndex;
1509	}
1510
1511	void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = `0`, unsigned parenthesesInputCountAlreadyChecked = `0`)
1512	{
1513	for (unsigned alt = `0`; alt < disjunction->m_alternatives.size(); ++alt) {
1514	unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
1515
1516	if (alt) {
1517	if (disjunction == m_pattern.m_body)
1518	alternativeBodyDisjunction();
1519	else
1520	alternativeDisjunction();
1521	}
1522
1523	PatternAlternative* alternative = disjunction->m_alternatives [alt];
1524	unsigned minimumSize = alternative->m_minimumSize;
1525
1526	ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
1527	unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
1528	if (countToCheck)
1529	checkInput(count: countToCheck);
1530	currentCountAlreadyChecked += countToCheck;
1531
1532	for (unsigned i = `0`; i < alternative->m_terms.size(); ++i) {
1533	PatternTerm& term = alternative->m_terms [i];
1534
1535	switch (term.type) {
1536	case PatternTerm::TypeAssertionBOL:
1537	assertionBOL(inputPosition: term.inputPosition - currentCountAlreadyChecked);
1538	break;
1539
1540	case PatternTerm::TypeAssertionEOL:
1541	assertionEOL(inputPosition: term.inputPosition - currentCountAlreadyChecked);
1542	break;
1543
1544	case PatternTerm::TypeAssertionWordBoundary:
1545	assertionWordBoundary(invert: term.invertOrCapture, inputPosition: term.inputPosition - currentCountAlreadyChecked);
1546	break;
1547
1548	case PatternTerm::TypePatternCharacter:
1549	atomPatternCharacter(ch: term.patternCharacter, inputPosition: term.inputPosition - currentCountAlreadyChecked, frameLocation: term.frameLocation, quantityCount: term.quantityCount, quantityType: term.quantityType);
1550	break;
1551
1552	case PatternTerm::TypeCharacterClass:
1553	atomCharacterClass(characterClass: term.characterClass, invert: term.invertOrCapture, inputPosition: term.inputPosition - currentCountAlreadyChecked, frameLocation: term.frameLocation, quantityCount: term.quantityCount, quantityType: term.quantityType);
1554	break;
1555
1556	case PatternTerm::TypeBackReference:
1557	atomBackReference(subpatternId: term.subpatternId, inputPosition: term.inputPosition - currentCountAlreadyChecked, frameLocation: term.frameLocation, quantityCount: term.quantityCount, quantityType: term.quantityType);
1558	break;
1559
1560	case PatternTerm::TypeForwardReference:
1561	break;
1562
1563	case PatternTerm::TypeParenthesesSubpattern: {
1564	unsigned disjunctionAlreadyCheckedCount = `0`;
1565	if ((term.quantityCount == `1`) && !term.parentheses.isCopy) {
1566	if (term.quantityType == QuantifierFixedCount) {
1567	disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
1568	unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1569	atomParenthesesSubpatternBegin(subpatternId: term.parentheses.subpatternId, capture: term.invertOrCapture, inputPosition: delegateEndInputOffset - disjunctionAlreadyCheckedCount, frameLocation: term.frameLocation, alternativeFrameLocation: term.frameLocation);
1570	emitDisjunction(disjunction: term.parentheses.disjunction, inputCountAlreadyChecked: currentCountAlreadyChecked, parenthesesInputCountAlreadyChecked: term.parentheses.disjunction->m_minimumSize);
1571	atomParenthesesEnd(doInline: true, lastSubpatternId: term.parentheses.lastSubpatternId, inputPosition: delegateEndInputOffset, frameLocation: term.frameLocation, quantityCount: term.quantityCount, quantityType: term.quantityType, callFrameSize: term.parentheses.disjunction->m_callFrameSize);
1572	} else {
1573	unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1574	atomParenthesesSubpatternBegin(subpatternId: term.parentheses.subpatternId, capture: term.invertOrCapture, inputPosition: delegateEndInputOffset - disjunctionAlreadyCheckedCount, frameLocation: term.frameLocation, alternativeFrameLocation: term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce);
1575	emitDisjunction(disjunction: term.parentheses.disjunction, inputCountAlreadyChecked: currentCountAlreadyChecked, parenthesesInputCountAlreadyChecked: `0`);
1576	atomParenthesesEnd(doInline: true, lastSubpatternId: term.parentheses.lastSubpatternId, inputPosition: delegateEndInputOffset, frameLocation: term.frameLocation, quantityCount: term.quantityCount, quantityType: term.quantityType, callFrameSize: term.parentheses.disjunction->m_callFrameSize);
1577	}
1578	} else {
1579	unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked;
1580	atomParenthesesSubpatternBegin(subpatternId: term.parentheses.subpatternId, capture: term.invertOrCapture, inputPosition: delegateEndInputOffset - disjunctionAlreadyCheckedCount, frameLocation: term.frameLocation, alternativeFrameLocation: `0`);
1581	emitDisjunction(disjunction: term.parentheses.disjunction, inputCountAlreadyChecked: currentCountAlreadyChecked, parenthesesInputCountAlreadyChecked: `0`);
1582	atomParenthesesEnd(doInline: false, lastSubpatternId: term.parentheses.lastSubpatternId, inputPosition: delegateEndInputOffset, frameLocation: term.frameLocation, quantityCount: term.quantityCount, quantityType: term.quantityType, callFrameSize: term.parentheses.disjunction->m_callFrameSize);
1583	}
1584	break;
1585	}
1586
1587	case PatternTerm::TypeParentheticalAssertion: {
1588	unsigned alternativeFrameLocation = term.inputPosition + RegexStackSpaceForBackTrackInfoParentheticalAssertion;
1589
1590	atomParentheticalAssertionBegin(subpatternId: term.parentheses.subpatternId, invert: term.invertOrCapture, frameLocation: term.frameLocation, alternativeFrameLocation);
1591	emitDisjunction(disjunction: term.parentheses.disjunction, inputCountAlreadyChecked: currentCountAlreadyChecked, parenthesesInputCountAlreadyChecked: `0`);
1592	atomParenthesesEnd(doInline: true, lastSubpatternId: term.parentheses.lastSubpatternId, inputPosition: `0`, frameLocation: term.frameLocation, quantityCount: term.quantityCount, quantityType: term.quantityType);
1593	break;
1594	}
1595	}
1596	}
1597	}
1598	}
1599
1600	private:
1601	RegexPattern& m_pattern;
1602	ByteDisjunction* m_bodyDisjunction;
1603	unsigned m_currentAlternativeIndex;
1604	Vector<ParenthesesStackEntry> m_parenthesesStack;
1605	Vector<ByteDisjunction*> m_allParenthesesInfo;
1606	};
1607
1608
1609	BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char& error, bool* ignoreCase, bool multiline)
1610	{
1611	RegexPattern pattern(ignoreCase, multiline);
1612
1613	if ((error = compileRegex(patternString, pattern)))
1614	return `0`;
1615
1616	numSubpatterns = pattern.m_numSubpatterns;
1617
1618	return ByteCompiler (pattern).compile();
1619	}
1620
1621	int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output)
1622	{
1623	return Interpreter (regex, output, input, start, length).interpret();
1624	}
1625
1626
1627	COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter);
1628	COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass);
1629	COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference);
1630	COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative);
1631	COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion);
1632	COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce);
1633	COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses);
1634
1635
1636	} }
1637
1638	#endif
1639

source code of qtscript/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexInterpreter.cpp