RegexCompiler.cpp source code [qtscript/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexCompiler.cpp]

1	/*
2	* Copyright (C) 2009 Apple Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions
6	* are met:
7	* 1. Redistributions of source code must retain the above copyright
8	* notice, this list of conditions and the following disclaimer.
9	* 2. Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	*
13	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24	*/
25
26	#include "config.h"
27	#include "RegexCompiler.h"
28
29	#include "RegexInterpreter.h"
30	#include "RegexPattern.h"
31	#include <wtf/Vector.h>
32
33	#if ENABLE(YARR)
34
35	using namespace WTF;
36
37	namespace JSC { namespace Yarr {
38
39	class CharacterClassConstructor {
40	public:
41	CharacterClassConstructor(bool isCaseInsensitive = false)
42	: m_isCaseInsensitive(isCaseInsensitive)
43	{
44	}
45
46	void reset()
47	{
48	m_matches.clear();
49	m_ranges.clear();
50	m_matchesUnicode.clear();
51	m_rangesUnicode.clear();
52	}
53
54	void append(const CharacterClass* other)
55	{
56	for (size_t i = `0`; i < other->m_matches.size(); ++i)
57	addSorted(matches&: m_matches, ch: other->m_matches [i]);
58	for (size_t i = `0`; i < other->m_ranges.size(); ++i)
59	addSortedRange(ranges&: m_ranges, lo: other->m_ranges [i].begin, hi: other->m_ranges [i].end);
60	for (size_t i = `0`; i < other->m_matchesUnicode.size(); ++i)
61	addSorted(matches&: m_matchesUnicode, ch: other->m_matchesUnicode [i]);
62	for (size_t i = `0`; i < other->m_rangesUnicode.size(); ++i)
63	addSortedRange(ranges&: m_rangesUnicode, lo: other->m_rangesUnicode [i].begin, hi: other->m_rangesUnicode [i].end);
64	}
65
66	void putChar(UChar ch)
67	{
68	if (ch <= `0x7f`) {
69	if (m_isCaseInsensitive && isASCIIAlpha(c: ch)) {
70	addSorted(matches&: m_matches, ch: toASCIIUpper(c: ch));
71	addSorted(matches&: m_matches, ch: toASCIILower(c: ch));
72	} else
73	addSorted(matches&: m_matches, ch);
74	} else {
75	UChar upper, lower;
76	if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) {
77	addSorted(matches&: m_matchesUnicode, ch: upper);
78	addSorted(matches&: m_matchesUnicode, ch: lower);
79	} else
80	addSorted(matches&: m_matchesUnicode, ch);
81	}
82	}
83
84	// returns true if this character has another case, and 'ch' is the upper case form.
85	static inline bool isUnicodeUpper(UChar ch)
86	{
87	return ch != Unicode::toLower(ch);
88	}
89
90	// returns true if this character has another case, and 'ch' is the lower case form.
91	static inline bool isUnicodeLower(UChar ch)
92	{
93	return ch != Unicode::toUpper(ch);
94	}
95
96	void putRange(UChar lo, UChar hi)
97	{
98	if (lo <= `0x7f`) {
99	char asciiLo = lo;
100	char asciiHi = std::min(a: hi, b: (UChar)`0x7f`);
101	addSortedRange(ranges&: m_ranges, lo, hi: asciiHi);
102
103	if (m_isCaseInsensitive) {
104	if ((asciiLo <= `'Z'`) && (asciiHi >= `'A'`))
105	addSortedRange(ranges&: m_ranges, lo: std::max(a: asciiLo, b: `'A'`)+(`'a'`-`'A'`), hi: std::min(a: asciiHi, b: `'Z'`)+(`'a'`-`'A'`));
106	if ((asciiLo <= `'z'`) && (asciiHi >= `'a'`))
107	addSortedRange(ranges&: m_ranges, lo: std::max(a: asciiLo, b: `'a'`)+(`'A'`-`'a'`), hi: std::min(a: asciiHi, b: `'z'`)+(`'A'`-`'a'`));
108	}
109	}
110	if (hi >= `0x80`) {
111	uint32_t unicodeCurr = std::max(a: lo, b: (UChar)`0x80`);
112	addSortedRange(ranges&: m_rangesUnicode, lo: unicodeCurr, hi);
113
114	if (m_isCaseInsensitive) {
115	while (unicodeCurr <= hi) {
116	// If the upper bound of the range (hi) is 0xffff, the increments to
117	// unicodeCurr in this loop may take it to 0x10000. This is fine
118	// (if so we won't re-enter the loop, since the loop condition above
119	// will definitely fail) - but this does mean we cannot use a UChar
120	// to represent unicodeCurr, we must use a 32-bit value instead.
121	ASSERT(unicodeCurr <= `0xffff`);
122
123	if (isUnicodeUpper(ch: unicodeCurr)) {
124	UChar lowerCaseRangeBegin = Unicode::toLower(ch: unicodeCurr);
125	UChar lowerCaseRangeEnd = lowerCaseRangeBegin;
126	while ((++unicodeCurr <= hi) && isUnicodeUpper(ch: unicodeCurr) && (Unicode::toLower(ch: unicodeCurr) == (lowerCaseRangeEnd + `1`)))
127	lowerCaseRangeEnd++;
128	addSortedRange(ranges&: m_rangesUnicode, lo: lowerCaseRangeBegin, hi: lowerCaseRangeEnd);
129	} else if (isUnicodeLower(ch: unicodeCurr)) {
130	UChar upperCaseRangeBegin = Unicode::toUpper(ch: unicodeCurr);
131	UChar upperCaseRangeEnd = upperCaseRangeBegin;
132	while ((++unicodeCurr <= hi) && isUnicodeLower(ch: unicodeCurr) && (Unicode::toUpper(ch: unicodeCurr) == (upperCaseRangeEnd + `1`)))
133	upperCaseRangeEnd++;
134	addSortedRange(ranges&: m_rangesUnicode, lo: upperCaseRangeBegin, hi: upperCaseRangeEnd);
135	} else
136	++unicodeCurr;
137	}
138	}
139	}
140	}
141
142	CharacterClass* charClass()
143	{
144	CharacterClass* characterClass = new CharacterClass ();
145
146	characterClass->m_matches.append(val: m_matches);
147	characterClass->m_ranges.append(val: m_ranges);
148	characterClass->m_matchesUnicode.append(val: m_matchesUnicode);
149	characterClass->m_rangesUnicode.append(val: m_rangesUnicode);
150
151	reset();
152
153	return characterClass;
154	}
155
156	private:
157	void addSorted(Vector<UChar>& matches, UChar ch)
158	{
159	unsigned pos = `0`;
160	unsigned range = matches.size();
161
162	// binary chop, find position to insert char.
163	while (range) {
164	unsigned index = range >> `1`;
165
166	int val = matches [pos+index] - ch;
167	if (!val)
168	return;
169	else if (val > `0`)
170	range = index;
171	else {
172	pos += (index+`1`);
173	range -= (index+`1`);
174	}
175	}
176
177	if (pos == matches.size())
178	matches.append(val: ch);
179	else
180	matches.insert(position: pos, val: ch);
181	}
182
183	void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi)
184	{
185	unsigned end = ranges.size();
186
187	// Simple linear scan - I doubt there are that many ranges anyway...
188	// feel free to fix this with something faster (eg binary chop).
189	for (unsigned i = `0`; i < end; ++i) {
190	// does the new range fall before the current position in the array
191	if (hi < ranges [i].begin) {
192	// optional optimization: concatenate appending ranges? - may not be worthwhile.
193	if (hi == (ranges [i].begin - `1`)) {
194	ranges [i].begin = lo;
195	return;
196	}
197	ranges.insert(position: i, val: CharacterRange (lo, hi));
198	return;
199	}
200	// Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining
201	// If the new range start at or before the end of the last range, then the overlap (if it starts one after the
202	// end of the last range they concatenate, which is just as good.
203	if (lo <= (ranges [i].end + `1`)) {
204	// found an intersect! we'll replace this entry in the array.
205	ranges [i].begin = std::min(a: ranges [i].begin, b: lo);
206	ranges [i].end = std::max(a: ranges [i].end, b: hi);
207
208	// now check if the new range can subsume any subsequent ranges.
209	unsigned next = i+`1`;
210	// each iteration of the loop we will either remove something from the list, or break the loop.
211	while (next < ranges.size()) {
212	if (ranges [next].begin <= (ranges [i].end + `1`)) {
213	// the next entry now overlaps / concatenates this one.
214	ranges [i].end = std::max(a: ranges [i].end, b: ranges [next].end);
215	ranges.remove(position: next);
216	} else
217	break;
218	}
219
220	return;
221	}
222	}
223
224	// CharacterRange comes after all existing ranges.
225	ranges.append(val: CharacterRange (lo, hi));
226	}
227
228	bool m_isCaseInsensitive;
229
230	Vector<UChar> m_matches;
231	Vector<CharacterRange> m_ranges;
232	Vector<UChar> m_matchesUnicode;
233	Vector<CharacterRange> m_rangesUnicode;
234	};
235
236
237	CharacterClass* newlineCreate()
238	{
239	CharacterClass* characterClass = new CharacterClass ();
240
241	characterClass->m_matches.append(val: `'\n'`);
242	characterClass->m_matches.append(val: `'\r'`);
243	characterClass->m_matchesUnicode.append(val: `0x2028`);
244	characterClass->m_matchesUnicode.append(val: `0x2029`);
245
246	return characterClass;
247	}
248
249	CharacterClass* digitsCreate()
250	{
251	CharacterClass* characterClass = new CharacterClass ();
252
253	characterClass->m_ranges.append(val: CharacterRange (`'0'`, `'9'`));
254
255	return characterClass;
256	}
257
258	CharacterClass* spacesCreate()
259	{
260	CharacterClass* characterClass = new CharacterClass ();
261
262	characterClass->m_matches.append(val: `' '`);
263	characterClass->m_ranges.append(val: CharacterRange (`'\t'`, `'\r'`));
264	characterClass->m_matchesUnicode.append(val: `0x00a0`);
265	characterClass->m_matchesUnicode.append(val: `0x1680`);
266	characterClass->m_matchesUnicode.append(val: `0x180e`);
267	characterClass->m_matchesUnicode.append(val: `0x2028`);
268	characterClass->m_matchesUnicode.append(val: `0x2029`);
269	characterClass->m_matchesUnicode.append(val: `0x202f`);
270	characterClass->m_matchesUnicode.append(val: `0x205f`);
271	characterClass->m_matchesUnicode.append(val: `0x3000`);
272	characterClass->m_rangesUnicode.append(val: CharacterRange (`0x2000`, `0x200a`));
273
274	return characterClass;
275	}
276
277	CharacterClass* wordcharCreate()
278	{
279	CharacterClass* characterClass = new CharacterClass ();
280
281	characterClass->m_matches.append(val: `'_'`);
282	characterClass->m_ranges.append(val: CharacterRange (`'0'`, `'9'`));
283	characterClass->m_ranges.append(val: CharacterRange (`'A'`, `'Z'`));
284	characterClass->m_ranges.append(val: CharacterRange (`'a'`, `'z'`));
285
286	return characterClass;
287	}
288
289	CharacterClass* nondigitsCreate()
290	{
291	CharacterClass* characterClass = new CharacterClass ();
292
293	characterClass->m_ranges.append(val: CharacterRange (`0`, `'0'` - `1`));
294	characterClass->m_ranges.append(val: CharacterRange (`'9'` + `1`, `0x7f`));
295	characterClass->m_rangesUnicode.append(val: CharacterRange (`0x80`, `0xffff`));
296
297	return characterClass;
298	}
299
300	CharacterClass* nonspacesCreate()
301	{
302	CharacterClass* characterClass = new CharacterClass ();
303
304	characterClass->m_ranges.append(val: CharacterRange (`0`, `'\t'` - `1`));
305	characterClass->m_ranges.append(val: CharacterRange (`'\r'` + `1`, `' '` - `1`));
306	characterClass->m_ranges.append(val: CharacterRange (`' '` + `1`, `0x7f`));
307	characterClass->m_rangesUnicode.append(val: CharacterRange (`0x0080`, `0x009f`));
308	characterClass->m_rangesUnicode.append(val: CharacterRange (`0x00a1`, `0x167f`));
309	characterClass->m_rangesUnicode.append(val: CharacterRange (`0x1681`, `0x180d`));
310	characterClass->m_rangesUnicode.append(val: CharacterRange (`0x180f`, `0x1fff`));
311	characterClass->m_rangesUnicode.append(val: CharacterRange (`0x200b`, `0x2027`));
312	characterClass->m_rangesUnicode.append(val: CharacterRange (`0x202a`, `0x202e`));
313	characterClass->m_rangesUnicode.append(val: CharacterRange (`0x2030`, `0x205e`));
314	characterClass->m_rangesUnicode.append(val: CharacterRange (`0x2060`, `0x2fff`));
315	characterClass->m_rangesUnicode.append(val: CharacterRange (`0x3001`, `0xffff`));
316
317	return characterClass;
318	}
319
320	CharacterClass* nonwordcharCreate()
321	{
322	CharacterClass* characterClass = new CharacterClass ();
323
324	characterClass->m_matches.append(val: '`');
325	characterClass->m_ranges.append(val: CharacterRange (`0`, `'0'` - `1`));
326	characterClass->m_ranges.append(val: CharacterRange (`'9'` + `1`, `'A'` - `1`));
327	characterClass->m_ranges.append(val: CharacterRange (`'Z'` + `1`, `'_'` - `1`));
328	characterClass->m_ranges.append(val: CharacterRange (`'z'` + `1`, `0x7f`));
329	characterClass->m_rangesUnicode.append(val: CharacterRange (`0x80`, `0xffff`));
330
331	return characterClass;
332	}
333
334
335	class RegexPatternConstructor {
336	public:
337	RegexPatternConstructor(RegexPattern& pattern)
338	: m_pattern(pattern)
339	, m_characterClassConstructor (pattern.m_ignoreCase)
340	{
341	}
342
343	~RegexPatternConstructor()
344	{
345	}
346
347	void reset()
348	{
349	m_pattern.reset();
350	m_characterClassConstructor.reset();
351	}
352
353	void assertionBOL()
354	{
355	m_alternative->m_terms.append(val: PatternTerm::BOL());
356	}
357	void assertionEOL()
358	{
359	m_alternative->m_terms.append(val: PatternTerm::EOL());
360	}
361	void assertionWordBoundary(bool invert)
362	{
363	m_alternative->m_terms.append(val: PatternTerm::WordBoundary(invert));
364	}
365
366	void atomPatternCharacter(UChar ch)
367	{
368	// We handle case-insensitive checking of unicode characters which do have both
369	// cases by handling them as if they were defined using a CharacterClass.
370	if (m_pattern.m_ignoreCase && !isASCII(c: ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) {
371	atomCharacterClassBegin();
372	atomCharacterClassAtom(ch);
373	atomCharacterClassEnd();
374	} else
375	m_alternative->m_terms.append(val: PatternTerm (ch));
376	}
377
378	void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
379	{
380	switch (classID) {
381	case DigitClassID:
382	m_alternative->m_terms.append(val: PatternTerm (m_pattern.digitsCharacterClass(), invert));
383	break;
384	case SpaceClassID:
385	m_alternative->m_terms.append(val: PatternTerm (m_pattern.spacesCharacterClass(), invert));
386	break;
387	case WordClassID:
388	m_alternative->m_terms.append(val: PatternTerm (m_pattern.wordcharCharacterClass(), invert));
389	break;
390	case NewlineClassID:
391	m_alternative->m_terms.append(val: PatternTerm (m_pattern.newlineCharacterClass(), invert));
392	break;
393	}
394	}
395
396	void atomCharacterClassBegin(bool invert = false)
397	{
398	m_invertCharacterClass = invert;
399	}
400
401	void atomCharacterClassAtom(UChar ch)
402	{
403	m_characterClassConstructor.putChar(ch);
404	}
405
406	void atomCharacterClassRange(UChar begin, UChar end)
407	{
408	m_characterClassConstructor.putRange(lo: begin, hi: end);
409	}
410
411	void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
412	{
413	ASSERT(classID != NewlineClassID);
414
415	switch (classID) {
416	case DigitClassID:
417	m_characterClassConstructor.append(other: invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
418	break;
419
420	case SpaceClassID:
421	m_characterClassConstructor.append(other: invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
422	break;
423
424	case WordClassID:
425	m_characterClassConstructor.append(other: invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
426	break;
427
428	default:
429	ASSERT_NOT_REACHED();
430	}
431	}
432
433	void atomCharacterClassEnd()
434	{
435	CharacterClass* newCharacterClass = m_characterClassConstructor.charClass();
436	m_pattern.m_userCharacterClasses.append(val: newCharacterClass);
437	m_alternative->m_terms.append(val: PatternTerm (newCharacterClass, m_invertCharacterClass));
438	}
439
440	void atomParenthesesSubpatternBegin(bool capture = true)
441	{
442	unsigned subpatternId = m_pattern.m_numSubpatterns + `1`;
443	if (capture)
444	m_pattern.m_numSubpatterns++;
445
446	PatternDisjunction* parenthesesDisjunction = new PatternDisjunction (m_alternative);
447	m_pattern.m_disjunctions.append(val: parenthesesDisjunction);
448	m_alternative->m_terms.append(val: PatternTerm (PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture));
449	m_alternative = parenthesesDisjunction->addNewAlternative();
450	}
451
452	void atomParentheticalAssertionBegin(bool invert = false)
453	{
454	PatternDisjunction* parenthesesDisjunction = new PatternDisjunction (m_alternative);
455	m_pattern.m_disjunctions.append(val: parenthesesDisjunction);
456	m_alternative->m_terms.append(val: PatternTerm (PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + `1`, parenthesesDisjunction, invert));
457	m_alternative = parenthesesDisjunction->addNewAlternative();
458	}
459
460	void atomParenthesesEnd()
461	{
462	ASSERT(m_alternative->m_parent);
463	ASSERT(m_alternative->m_parent->m_parent);
464	m_alternative = m_alternative->m_parent->m_parent;
465
466	m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
467	}
468
469	void atomBackReference(unsigned subpatternId)
470	{
471	ASSERT(subpatternId);
472	m_pattern.m_maxBackReference = std::max(a: m_pattern.m_maxBackReference, b: subpatternId);
473
474	if (subpatternId > m_pattern.m_numSubpatterns) {
475	m_alternative->m_terms.append(val: PatternTerm::ForwardReference());
476	return;
477	}
478
479	PatternAlternative* currentAlternative = m_alternative;
480	ASSERT(currentAlternative);
481
482	// Note to self: if we waited until the AST was baked, we could also remove forwards refs
483	while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
484	PatternTerm& term = currentAlternative->lastTerm();
485	ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) \|\| (term.type == PatternTerm::TypeParentheticalAssertion));
486
487	if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId)) {
488	m_alternative->m_terms.append(val: PatternTerm::ForwardReference());
489	return;
490	}
491	}
492
493	m_alternative->m_terms.append(val: PatternTerm (subpatternId));
494	}
495
496	PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction)
497	{
498	PatternDisjunction* newDisjunction = new PatternDisjunction ();
499
500	newDisjunction->m_parent = disjunction->m_parent;
501	for (unsigned alt = `0`; alt < disjunction->m_alternatives.size(); ++alt) {
502	PatternAlternative* alternative = disjunction->m_alternatives [alt];
503	PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
504	for (unsigned i = `0`; i < alternative->m_terms.size(); ++i)
505	newAlternative->m_terms.append(val: copyTerm(term&: alternative->m_terms [i]));
506	}
507
508	m_pattern.m_disjunctions.append(val: newDisjunction);
509	return newDisjunction;
510	}
511
512	PatternTerm copyTerm(PatternTerm& term)
513	{
514	if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
515	return PatternTerm (term);
516
517	PatternTerm termCopy = term;
518	termCopy.parentheses.disjunction = copyDisjunction(disjunction: termCopy.parentheses.disjunction);
519	return termCopy;
520	}
521
522	void quantifyAtom(unsigned min, unsigned max, bool greedy)
523	{
524	ASSERT(min <= max);
525	ASSERT(m_alternative->m_terms.size());
526
527	if (!max) {
528	m_alternative->removeLastTerm();
529	return;
530	}
531
532	PatternTerm& term = m_alternative->lastTerm();
533	ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
534	ASSERT((term.quantityCount == `1`) && (term.quantityType == QuantifierFixedCount));
535
536	// For any assertion with a zero minimum, not matching is valid and has no effect,
537	// remove it. Otherwise, we need to match as least once, but there is no point
538	// matching more than once, so remove the quantifier. It is not entirely clear
539	// from the spec whether or not this behavior is correct, but I believe this
540	// matches Firefox. :-/
541	if (term.type == PatternTerm::TypeParentheticalAssertion) {
542	if (!min)
543	m_alternative->removeLastTerm();
544	return;
545	}
546
547	if (min == `0`)
548	term.quantify(count: max, type: greedy ? QuantifierGreedy : QuantifierNonGreedy);
549	else if (min == max)
550	term.quantify(count: min, type: QuantifierFixedCount);
551	else {
552	term.quantify(count: min, type: QuantifierFixedCount);
553	m_alternative->m_terms.append(val: copyTerm(term));
554	// NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
555	m_alternative->lastTerm().quantify(count: (max == UINT_MAX) ? max : max - min, type: greedy ? QuantifierGreedy : QuantifierNonGreedy);
556	if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
557	m_alternative->lastTerm().parentheses.isCopy = true;
558	}
559	}
560
561	void disjunction()
562	{
563	m_alternative = m_alternative->m_parent->addNewAlternative();
564	}
565
566	void regexBegin()
567	{
568	m_pattern.m_body = new PatternDisjunction ();
569	m_alternative = m_pattern.m_body->addNewAlternative();
570	m_pattern.m_disjunctions.append(val: m_pattern.m_body);
571	}
572	void regexEnd()
573	{
574	}
575	void regexError()
576	{
577	}
578
579	unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition)
580	{
581	alternative->m_hasFixedSize = true;
582	unsigned currentInputPosition = initialInputPosition;
583
584	for (unsigned i = `0`; i < alternative->m_terms.size(); ++i) {
585	PatternTerm& term = alternative->m_terms [i];
586
587	switch (term.type) {
588	case PatternTerm::TypeAssertionBOL:
589	case PatternTerm::TypeAssertionEOL:
590	case PatternTerm::TypeAssertionWordBoundary:
591	term.inputPosition = currentInputPosition;
592	break;
593
594	case PatternTerm::TypeBackReference:
595	term.inputPosition = currentInputPosition;
596	term.frameLocation = currentCallFrameSize;
597	currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference;
598	alternative->m_hasFixedSize = false;
599	break;
600
601	case PatternTerm::TypeForwardReference:
602	break;
603
604	case PatternTerm::TypePatternCharacter:
605	term.inputPosition = currentInputPosition;
606	if (term.quantityType != QuantifierFixedCount) {
607	term.frameLocation = currentCallFrameSize;
608	currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter;
609	alternative->m_hasFixedSize = false;
610	} else
611	currentInputPosition += term.quantityCount;
612	break;
613
614	case PatternTerm::TypeCharacterClass:
615	term.inputPosition = currentInputPosition;
616	if (term.quantityType != QuantifierFixedCount) {
617	term.frameLocation = currentCallFrameSize;
618	currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass;
619	alternative->m_hasFixedSize = false;
620	} else
621	currentInputPosition += term.quantityCount;
622	break;
623
624	case PatternTerm::TypeParenthesesSubpattern:
625	// Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
626	term.frameLocation = currentCallFrameSize;
627	if ((term.quantityCount == `1`) && !term.parentheses.isCopy) {
628	if (term.quantityType == QuantifierFixedCount) {
629	currentCallFrameSize = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition);
630	currentInputPosition += term.parentheses.disjunction->m_minimumSize;
631	} else {
632	currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce;
633	currentCallFrameSize = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition);
634	}
635	term.inputPosition = currentInputPosition;
636	} else {
637	term.inputPosition = currentInputPosition;
638	setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: `0`, initialInputPosition: currentInputPosition);
639	currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses;
640	}
641	// Fixed count of 1 could be accepted, if they have a fixed size AND* if all alternatives are of the same length.*
642	alternative->m_hasFixedSize = false;
643	break;
644
645	case PatternTerm::TypeParentheticalAssertion:
646	term.inputPosition = currentInputPosition;
647	term.frameLocation = currentCallFrameSize;
648	currentCallFrameSize = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, initialInputPosition: currentInputPosition);
649	break;
650	}
651	}
652
653	alternative->m_minimumSize = currentInputPosition - initialInputPosition;
654	return currentCallFrameSize;
655	}
656
657	unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition)
658	{
659	if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > `1`))
660	initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative;
661
662	unsigned minimumInputSize = UINT_MAX;
663	unsigned maximumCallFrameSize = `0`;
664	bool hasFixedSize = true;
665
666	for (unsigned alt = `0`; alt < disjunction->m_alternatives.size(); ++alt) {
667	PatternAlternative* alternative = disjunction->m_alternatives [alt];
668	unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, currentCallFrameSize: initialCallFrameSize, initialInputPosition);
669	minimumInputSize = min(a: minimumInputSize, b: alternative->m_minimumSize);
670	maximumCallFrameSize = max(a: maximumCallFrameSize, b: currentAlternativeCallFrameSize);
671	hasFixedSize &= alternative->m_hasFixedSize;
672	}
673
674	ASSERT(minimumInputSize != UINT_MAX);
675	ASSERT(maximumCallFrameSize >= initialCallFrameSize);
676
677	disjunction->m_hasFixedSize = hasFixedSize;
678	disjunction->m_minimumSize = minimumInputSize;
679	disjunction->m_callFrameSize = maximumCallFrameSize;
680	return maximumCallFrameSize;
681	}
682
683	void setupOffsets()
684	{
685	setupDisjunctionOffsets(disjunction: m_pattern.m_body, initialCallFrameSize: `0`, initialInputPosition: `0`);
686	}
687
688	private:
689	RegexPattern& m_pattern;
690	PatternAlternative* m_alternative;
691	CharacterClassConstructor m_characterClassConstructor;
692	bool m_invertCharacterClass;
693	};
694
695
696	const char* compileRegex(const UString& patternString, RegexPattern& pattern)
697	{
698	RegexPatternConstructor constructor(pattern);
699
700	if (const char* error = parse(delegate&: constructor, pattern: patternString))
701	return error;
702
703	// If the pattern contains illegal backreferences reset & reparse.
704	// Quoting Netscape's "What's new in JavaScript 1.2",
705	// "Note: if the number of left parentheses is less than the number specified
706	// in \#, the \# is taken as an octal escape as described in the next row."
707	if (pattern.containsIllegalBackReference()) {
708	unsigned numSubpatterns = pattern.m_numSubpatterns;
709
710	constructor.reset();
711	#if !ASSERT_DISABLED
712	const char* error =
713	#endif
714	parse(delegate&: constructor, pattern: patternString, backReferenceLimit: numSubpatterns);
715
716	ASSERT(!error);
717	ASSERT(numSubpatterns == pattern.m_numSubpatterns);
718	}
719
720	constructor.setupOffsets();
721
722	return NULL;
723	};
724
725
726	} }
727
728	#endif
729

source code of qtscript/src/3rdparty/javascriptcore/JavaScriptCore/yarr/RegexCompiler.cpp