YarrPattern.cpp source code [qtdeclarative/src/3rdparty/masm/yarr/YarrPattern.cpp]

1	/*
2	* Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved.
3	* Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4	*
5	* Redistribution and use in source and binary forms, with or without
6	* modification, are permitted provided that the following conditions
7	* are met:
8	* 1. Redistributions of source code must retain the above copyright
9	* notice, this list of conditions and the following disclaimer.
10	* 2. Redistributions in binary form must reproduce the above copyright
11	* notice, this list of conditions and the following disclaimer in the
12	* documentation and/or other materials provided with the distribution.
13	*
14	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25	*/
26
27	#include "config.h"
28	#include "YarrPattern.h"
29
30	#include "Options.h"
31	#include "Yarr.h"
32	#include "YarrCanonicalize.h"
33	#include "YarrParser.h"
34	#include <wtf/DataLog.h>
35	#include <wtf/Optional.h>
36	#include <wtf/Vector.h>
37	#include <wtf/text/WTFString.h>
38
39	namespace JSC { namespace Yarr {
40
41	#include "RegExpJitTables.h"
42
43	class CharacterClassConstructor {
44	public:
45	CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode)
46	: m_isCaseInsensitive(isCaseInsensitive)
47	, m_hasNonBMPCharacters(false)
48	, m_anyCharacter(false)
49	, m_canonicalMode(canonicalMode)
50	{
51	}
52
53	void reset()
54	{
55	m_matches.clear();
56	m_ranges.clear();
57	m_matchesUnicode.clear();
58	m_rangesUnicode.clear();
59	m_hasNonBMPCharacters = false;
60	m_anyCharacter = false;
61	}
62
63	void append(const CharacterClass* other)
64	{
65	for (size_t i = `0`; i < other->m_matches.size(); ++i)
66	addSorted(matches&: m_matches, ch: other->m_matches [i]);
67	for (size_t i = `0`; i < other->m_ranges.size(); ++i)
68	addSortedRange(ranges&: m_ranges, lo: other->m_ranges [i].begin, hi: other->m_ranges [i].end);
69	for (size_t i = `0`; i < other->m_matchesUnicode.size(); ++i)
70	addSorted(matches&: m_matchesUnicode, ch: other->m_matchesUnicode [i]);
71	for (size_t i = `0`; i < other->m_rangesUnicode.size(); ++i)
72	addSortedRange(ranges&: m_rangesUnicode, lo: other->m_rangesUnicode [i].begin, hi: other->m_rangesUnicode [i].end);
73	}
74
75	void appendInverted(const CharacterClass* other)
76	{
77	auto addSortedInverted = [&](UChar32 min, UChar32 max,
78	const Vector<UChar32>& srcMatches, const Vector<CharacterRange>& srcRanges,
79	Vector<UChar32>& destMatches, Vector<CharacterRange>& destRanges) {
80
81	auto addSortedMatchOrRange = [&](UChar32 lo, UChar32 hiPlusOne) {
82	if (lo < hiPlusOne) {
83	if (lo + `1` == hiPlusOne)
84	addSorted(destMatches, lo);
85	else
86	addSortedRange(ranges&: destRanges, lo, hi: hiPlusOne - `1`);
87	}
88	};
89
90	UChar32 lo = min;
91	size_t matchesIndex = `0`;
92	size_t rangesIndex = `0`;
93	bool matchesRemaining = matchesIndex < srcMatches.size();
94	bool rangesRemaining = rangesIndex < srcRanges.size();
95
96	if (!matchesRemaining && !rangesRemaining) {
97	addSortedMatchOrRange(min, max + `1`);
98	return;
99	}
100
101	while (matchesRemaining \|\| rangesRemaining) {
102	UChar32 hiPlusOne;
103	UChar32 nextLo;
104
105	if (matchesRemaining
106	&& (!rangesRemaining \|\| srcMatches[matchesIndex] < srcRanges[rangesIndex].begin)) {
107	hiPlusOne = srcMatches[matchesIndex];
108	nextLo = hiPlusOne + `1`;
109	++matchesIndex;
110	matchesRemaining = matchesIndex < srcMatches.size();
111	} else {
112	hiPlusOne = srcRanges[rangesIndex].begin;
113	nextLo = srcRanges[rangesIndex].end + `1`;
114	++rangesIndex;
115	rangesRemaining = rangesIndex < srcRanges.size();
116	}
117
118	addSortedMatchOrRange(lo, hiPlusOne);
119
120	lo = nextLo;
121	}
122
123	addSortedMatchOrRange(lo, max + `1`);
124	};
125
126	addSortedInverted(`0`, `0x7f`, other->m_matches, other->m_ranges, m_matches, m_ranges);
127	addSortedInverted(`0x80`, `0x10ffff`, other->m_matchesUnicode, other->m_rangesUnicode, m_matchesUnicode, m_rangesUnicode);
128	}
129
130	void putChar(UChar32 ch)
131	{
132	if (!m_isCaseInsensitive) {
133	addSorted(ch);
134	return;
135	}
136
137	if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) {
138	// Handle ASCII cases.
139	if (isASCIIAlpha(ch)) {
140	addSorted(m_matches, toASCIIUpper(ch));
141	addSorted(m_matches, toASCIILower(ch));
142	} else
143	addSorted(matches&: m_matches, ch);
144	return;
145	}
146
147	// Add multiple matches, if necessary.
148	const CanonicalizationRange* info = canonicalRangeInfoFor(ch, canonicalMode: m_canonicalMode);
149	if (info->type == CanonicalizeUnique)
150	addSorted(ch);
151	else
152	putUnicodeIgnoreCase(ch, info);
153	}
154
155	void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info)
156	{
157	ASSERT(m_isCaseInsensitive);
158	ASSERT(ch >= info->begin && ch <= info->end);
159	ASSERT(info->type != CanonicalizeUnique);
160	if (info->type == CanonicalizeSet) {
161	for (const UChar32* set = canonicalCharacterSetInfo(index: info->value, canonicalMode: m_canonicalMode); (ch = *set); ++set)
162	addSorted(ch);
163	} else {
164	addSorted(ch);
165	addSorted(ch: getCanonicalPair(info, ch));
166	}
167	}
168
169	void putRange(UChar32 lo, UChar32 hi)
170	{
171	if (isASCII(lo)) {
172	char asciiLo = lo;
173	char asciiHi = std::min(hi, (UChar32)`0x7f`);
174	addSortedRange(ranges&: m_ranges, lo, hi: asciiHi);
175
176	if (m_isCaseInsensitive) {
177	if ((asciiLo <= `'Z'`) && (asciiHi >= `'A'`))
178	addSortedRange(ranges&: m_ranges, lo: std::max(asciiLo, `'A'`)+(`'a'`-`'A'`), hi: std::min(asciiHi, `'Z'`)+(`'a'`-`'A'`));
179	if ((asciiLo <= `'z'`) && (asciiHi >= `'a'`))
180	addSortedRange(ranges&: m_ranges, lo: std::max(asciiLo, `'a'`)+(`'A'`-`'a'`), hi: std::min(asciiHi, `'z'`)+(`'A'`-`'a'`));
181	}
182	}
183	if (isASCII(hi))
184	return;
185
186	lo = std::max(lo, (UChar32)`0x80`);
187	addSortedRange(ranges&: m_rangesUnicode, lo, hi);
188
189	if (!m_isCaseInsensitive)
190	return;
191
192	const CanonicalizationRange* info = canonicalRangeInfoFor(ch: lo, canonicalMode: m_canonicalMode);
193	while (true) {
194	// Handle the range [lo .. end]
195	UChar32 end = std::min<UChar32>(info->end, hi);
196
197	switch (info->type) {
198	case CanonicalizeUnique:
199	// Nothing to do - no canonical equivalents.
200	break;
201	case CanonicalizeSet: {
202	UChar ch;
203	for (const UChar32* set = canonicalCharacterSetInfo(index: info->value, canonicalMode: m_canonicalMode); (ch = *set); ++set)
204	addSorted(matches&: m_matchesUnicode, ch);
205	break;
206	}
207	case CanonicalizeRangeLo:
208	addSortedRange(ranges&: m_rangesUnicode, lo: lo + info->value, hi: end + info->value);
209	break;
210	case CanonicalizeRangeHi:
211	addSortedRange(ranges&: m_rangesUnicode, lo: lo - info->value, hi: end - info->value);
212	break;
213	case CanonicalizeAlternatingAligned:
214	// Use addSortedRange since there is likely an abutting range to combine with.
215	if (lo & `1`)
216	addSortedRange(ranges&: m_rangesUnicode, lo: lo - `1`, hi: lo - `1`);
217	if (!(end & `1`))
218	addSortedRange(ranges&: m_rangesUnicode, lo: end + `1`, hi: end + `1`);
219	break;
220	case CanonicalizeAlternatingUnaligned:
221	// Use addSortedRange since there is likely an abutting range to combine with.
222	if (!(lo & `1`))
223	addSortedRange(ranges&: m_rangesUnicode, lo: lo - `1`, hi: lo - `1`);
224	if (end & `1`)
225	addSortedRange(ranges&: m_rangesUnicode, lo: end + `1`, hi: end + `1`);
226	break;
227	}
228
229	if (hi == end)
230	return;
231
232	++info;
233	lo = info->begin;
234	};
235
236	}
237
238	std::unique_ptr<CharacterClass> charClass()
239	{
240	coalesceTables();
241
242	auto characterClass = std::make_unique<CharacterClass>();
243
244	characterClass->m_matches.swap(m_matches);
245	characterClass->m_ranges.swap(m_ranges);
246	characterClass->m_matchesUnicode.swap(m_matchesUnicode);
247	characterClass->m_rangesUnicode.swap(m_rangesUnicode);
248	characterClass->m_hasNonBMPCharacters = hasNonBMPCharacters();
249	characterClass->m_anyCharacter = anyCharacter();
250
251	m_hasNonBMPCharacters = false;
252	m_anyCharacter = false;
253
254	return characterClass;
255	}
256
257	private:
258	void addSorted(UChar32 ch)
259	{
260	addSorted(matches&: isASCII(ch) ? m_matches : m_matchesUnicode, ch);
261	}
262
263	void addSorted(Vector<UChar32>& matches, UChar32 ch)
264	{
265	unsigned pos = `0`;
266	unsigned range = matches.size();
267
268	if (!U_IS_BMP(ch))
269	m_hasNonBMPCharacters = true;
270
271	// binary chop, find position to insert char.
272	while (range) {
273	unsigned index = range >> `1`;
274
275	int val = matches [pos+index] - ch;
276	if (!val)
277	return;
278	else if (val > `0`) {
279	if (val == `1`) {
280	UChar32 lo = ch;
281	UChar32 hi = ch + `1`;
282	matches.remove(position: pos + index);
283	if (pos + index > `0` && matches [pos + index - `1`] == ch - `1`) {
284	lo = ch - `1`;
285	matches.remove(position: pos + index - `1`);
286	}
287	addSortedRange(ranges&: isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
288	return;
289	}
290	range = index;
291	} else {
292	if (val == -`1`) {
293	UChar32 lo = ch - `1`;
294	UChar32 hi = ch;
295	matches.remove(position: pos + index);
296	if (pos + index + `1` < matches.size() && matches [pos + index + `1`] == ch + `1`) {
297	hi = ch + `1`;
298	matches.remove(position: pos + index + `1`);
299	}
300	addSortedRange(ranges&: isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
301	return;
302	}
303	pos += (index+`1`);
304	range -= (index+`1`);
305	}
306	}
307
308	if (pos == matches.size())
309	matches.append(value: ch);
310	else
311	matches.insert(position: pos, value: ch);
312	}
313
314	void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi)
315	{
316	size_t end = ranges.size();
317
318	if (!U_IS_BMP(hi))
319	m_hasNonBMPCharacters = true;
320
321	// Simple linear scan - I doubt there are that many ranges anyway...
322	// feel free to fix this with something faster (eg binary chop).
323	for (size_t i = `0`; i < end; ++i) {
324	// does the new range fall before the current position in the array
325	if (hi < ranges [i].begin) {
326	// Concatenate appending ranges.
327	if (hi == (ranges [i].begin - `1`)) {
328	ranges [i].begin = lo;
329	return;
330	}
331	ranges.insert(position: i, value: CharacterRange (lo, hi));
332	return;
333	}
334	// Okay, since we didn't hit the last case, the end of the new range is definitely at or after the beginning
335	// If the new range start at or before the end of the last range, then the overlap (if it starts one after the
336	// end of the last range they concatenate, which is just as good.
337	if (lo <= (ranges [i].end + `1`)) {
338	// found an intersect! we'll replace this entry in the array.
339	ranges [i].begin = std::min(ranges [i].begin, lo);
340	ranges [i].end = std::max(ranges [i].end, hi);
341
342	mergeRangesFrom(ranges, index: i);
343	return;
344	}
345	}
346
347	// CharacterRange comes after all existing ranges.
348	ranges.append(other: CharacterRange (lo, hi));
349	}
350
351	void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index)
352	{
353	size_t next = index + `1`;
354
355	// each iteration of the loop we will either remove something from the list, or break out of the loop.
356	while (next < ranges.size()) {
357	if (ranges [next].begin <= (ranges [index].end + `1`)) {
358	// the next entry now overlaps / concatenates with this one.
359	ranges [index].end = std::max(ranges [index].end, ranges [next].end);
360	ranges.remove(position: next);
361	} else
362	break;
363	}
364
365	}
366
367	void coalesceTables()
368	{
369	auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) {
370
371	size_t matchesIndex = `0`;
372	size_t rangesIndex = `0`;
373
374	while (matchesIndex < matches.size() && rangesIndex < ranges.size()) {
375	while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].begin - `1`)
376	matchesIndex++;
377
378	if (matchesIndex < matches.size() && matches[matchesIndex] == ranges[rangesIndex].begin - `1`) {
379	ranges[rangesIndex].begin = matches[matchesIndex];
380	matches.remove(matchesIndex);
381	}
382
383	while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].end + `1`)
384	matchesIndex++;
385
386	if (matchesIndex < matches.size()) {
387	if (matches[matchesIndex] == ranges[rangesIndex].end + `1`) {
388	ranges[rangesIndex].end = matches[matchesIndex];
389	matches.remove(matchesIndex);
390
391	mergeRangesFrom(ranges&: ranges, index: rangesIndex);
392	} else
393	matchesIndex++;
394	}
395	}
396	};
397
398	coalesceMatchesAndRanges(m_matches, m_ranges);
399	coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode);
400
401	if (!m_matches.size() && !m_matchesUnicode.size()
402	&& m_ranges.size() == `1` && m_rangesUnicode.size() == `1`
403	&& m_ranges [`0`].begin == `0` && m_ranges [`0`].end == `0x7f`
404	&& m_rangesUnicode [`0`].begin == `0x80` && m_rangesUnicode [`0`].end == `0x10ffff`)
405	m_anyCharacter = true;
406	}
407
408	bool hasNonBMPCharacters()
409	{
410	return m_hasNonBMPCharacters;
411	}
412
413	bool anyCharacter()
414	{
415	return m_anyCharacter;
416	}
417
418	bool m_isCaseInsensitive : `1`;
419	bool m_hasNonBMPCharacters : `1`;
420	bool m_anyCharacter : `1`;
421	CanonicalMode m_canonicalMode;
422
423	Vector<UChar32> m_matches;
424	Vector<CharacterRange> m_ranges;
425	Vector<UChar32> m_matchesUnicode;
426	Vector<CharacterRange> m_rangesUnicode;
427	};
428
429	class YarrPatternConstructor {
430	public:
431	YarrPatternConstructor(YarrPattern& pattern, void* stackLimit)
432	: m_pattern(pattern)
433	, m_characterClassConstructor (pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2)
434	, m_stackLimit(stackLimit)
435	{
436	auto body = std::make_unique<PatternDisjunction>();
437	m_pattern.m_body = body.get();
438	m_alternative = body->addNewAlternative();
439	m_pattern.m_disjunctions.append(WTFMove(body));
440	}
441
442	~YarrPatternConstructor()
443	{
444	}
445
446	void resetForReparsing()
447	{
448	m_pattern.resetForReparsing();
449	m_characterClassConstructor.reset();
450
451	auto body = std::make_unique<PatternDisjunction>();
452	m_pattern.m_body = body.get();
453	m_alternative = body->addNewAlternative();
454	m_pattern.m_disjunctions.append(WTFMove(body));
455	}
456
457	void saveUnmatchedNamedForwardReferences()
458	{
459	m_unmatchedNamedForwardReferences.shrink(`0`);
460
461	for (auto& entry : m_pattern.m_namedForwardReferences) {
462	if (!m_pattern.m_captureGroupNames.contains(entry))
463	m_unmatchedNamedForwardReferences.append(entry);
464	}
465	}
466
467	void assertionBOL()
468	{
469	if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) {
470	m_alternative->m_startsWithBOL = true;
471	m_alternative->m_containsBOL = true;
472	m_pattern.m_containsBOL = true;
473	}
474	m_alternative->m_terms.append(other: PatternTerm::BOL());
475	}
476	void assertionEOL()
477	{
478	m_alternative->m_terms.append(other: PatternTerm::EOL());
479	}
480	void assertionWordBoundary(bool invert)
481	{
482	m_alternative->m_terms.append(other: PatternTerm::WordBoundary(invert));
483	}
484
485	void atomPatternCharacter(UChar32 ch)
486	{
487	// We handle case-insensitive checking of unicode characters which do have both
488	// cases by handling them as if they were defined using a CharacterClass.
489	if (!m_pattern.ignoreCase() \|\| (isASCII(ch) && !m_pattern.unicode())) {
490	m_alternative->m_terms.append(other: PatternTerm (ch));
491	return;
492	}
493
494	const CanonicalizationRange* info = canonicalRangeInfoFor(ch, canonicalMode: m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2);
495	if (info->type == CanonicalizeUnique) {
496	m_alternative->m_terms.append(other: PatternTerm (ch));
497	return;
498	}
499
500	m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
501	auto newCharacterClass = m_characterClassConstructor.charClass();
502	m_alternative->m_terms.append(other: PatternTerm(newCharacterClass.get(), false));
503	m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
504	}
505
506	void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
507	{
508	switch (classID) {
509	case BuiltInCharacterClassID::DigitClassID:
510	m_alternative->m_terms.append(other: PatternTerm (m_pattern.digitsCharacterClass(), invert));
511	break;
512	case BuiltInCharacterClassID::SpaceClassID:
513	m_alternative->m_terms.append(other: PatternTerm (m_pattern.spacesCharacterClass(), invert));
514	break;
515	case BuiltInCharacterClassID::WordClassID:
516	if (m_pattern.unicode() && m_pattern.ignoreCase())
517	m_alternative->m_terms.append(other: PatternTerm (m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
518	else
519	m_alternative->m_terms.append(other: PatternTerm (m_pattern.wordcharCharacterClass(), invert));
520	break;
521	case BuiltInCharacterClassID::DotClassID:
522	ASSERT(!invert);
523	if (m_pattern.dotAll())
524	m_alternative->m_terms.append(other: PatternTerm (m_pattern.anyCharacterClass(), false));
525	else
526	m_alternative->m_terms.append(other: PatternTerm (m_pattern.newlineCharacterClass(), true));
527	break;
528	default:
529	m_alternative->m_terms.append(other: PatternTerm (m_pattern.unicodeCharacterClassFor(unicodeClassID: classID), invert));
530	break;
531	}
532	}
533
534	void atomCharacterClassBegin(bool invert = false)
535	{
536	m_invertCharacterClass = invert;
537	}
538
539	void atomCharacterClassAtom(UChar32 ch)
540	{
541	m_characterClassConstructor.putChar(ch);
542	}
543
544	void atomCharacterClassRange(UChar32 begin, UChar32 end)
545	{
546	m_characterClassConstructor.putRange(lo: begin, hi: end);
547	}
548
549	void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
550	{
551	ASSERT(classID != BuiltInCharacterClassID::DotClassID);
552
553	switch (classID) {
554	case BuiltInCharacterClassID::DigitClassID:
555	m_characterClassConstructor.append(other: invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
556	break;
557
558	case BuiltInCharacterClassID::SpaceClassID:
559	m_characterClassConstructor.append(other: invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
560	break;
561
562	case BuiltInCharacterClassID::WordClassID:
563	if (m_pattern.unicode() && m_pattern.ignoreCase())
564	m_characterClassConstructor.append(other: invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass());
565	else
566	m_characterClassConstructor.append(other: invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
567	break;
568
569	default:
570	if (!invert)
571	m_characterClassConstructor.append(other: m_pattern.unicodeCharacterClassFor(unicodeClassID: classID));
572	else
573	m_characterClassConstructor.appendInverted(other: m_pattern.unicodeCharacterClassFor(unicodeClassID: classID));
574	}
575	}
576
577	void atomCharacterClassEnd()
578	{
579	auto newCharacterClass = m_characterClassConstructor.charClass();
580
581	if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) {
582	m_alternative->m_terms.append(other: PatternTerm (m_pattern.anyCharacterClass(), false));
583	return;
584	}
585	m_alternative->m_terms.append(other: PatternTerm(newCharacterClass.get(), m_invertCharacterClass));
586	m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
587	}
588
589	void atomParenthesesSubpatternBegin(bool capture = true, std::optional<String> optGroupName = std::nullopt)
590	{
591	unsigned subpatternId = m_pattern.m_numSubpatterns + `1`;
592	if (capture) {
593	m_pattern.m_numSubpatterns++;
594	if (optGroupName) {
595	while (m_pattern.m_captureGroupNames.size() < subpatternId)
596	m_pattern.m_captureGroupNames.append(other: String());
597	m_pattern.m_captureGroupNames.append(optGroupName.value());
598	m_pattern.m_namedGroupToParenIndex.add(k: optGroupName.value(), v: subpatternId);
599	}
600	} else
601	ASSERT(!optGroupName);
602
603	auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
604	m_alternative->m_terms.append(other: PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false));
605	m_alternative = parenthesesDisjunction->addNewAlternative();
606	m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
607	}
608
609	void atomParentheticalAssertionBegin(bool invert = false)
610	{
611	auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
612	m_alternative->m_terms.append(other: PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + `1`, parenthesesDisjunction.get(), false, invert));
613	m_alternative = parenthesesDisjunction->addNewAlternative();
614	m_invertParentheticalAssertion = invert;
615	m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
616	}
617
618	void atomParenthesesEnd()
619	{
620	ASSERT(m_alternative->m_parent);
621	ASSERT(m_alternative->m_parent->m_parent);
622
623	PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
624	m_alternative = m_alternative->m_parent->m_parent;
625
626	PatternTerm& lastTerm = m_alternative->lastTerm();
627
628	unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
629	unsigned numBOLAnchoredAlts = `0`;
630
631	for (unsigned i = `0`; i < numParenAlternatives; i++) {
632	// Bubble up BOL flags
633	if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
634	numBOLAnchoredAlts++;
635	}
636
637	if (numBOLAnchoredAlts) {
638	m_alternative->m_containsBOL = true;
639	// If all the alternatives in parens start with BOL, then so does this one
640	if (numBOLAnchoredAlts == numParenAlternatives)
641	m_alternative->m_startsWithBOL = true;
642	}
643
644	lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
645	m_invertParentheticalAssertion = false;
646	}
647
648	void atomBackReference(unsigned subpatternId)
649	{
650	ASSERT(subpatternId);
651	m_pattern.m_containsBackreferences = true;
652	m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
653
654	if (subpatternId > m_pattern.m_numSubpatterns) {
655	m_alternative->m_terms.append(other: PatternTerm::ForwardReference());
656	return;
657	}
658
659	PatternAlternative* currentAlternative = m_alternative;
660	ASSERT(currentAlternative);
661
662	// Note to self: if we waited until the AST was baked, we could also remove forwards refs
663	while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
664	PatternTerm& term = currentAlternative->lastTerm();
665	ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) \|\| (term.type == PatternTerm::TypeParentheticalAssertion));
666
667	if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
668	m_alternative->m_terms.append(other: PatternTerm::ForwardReference());
669	return;
670	}
671	}
672
673	m_alternative->m_terms.append(other: PatternTerm (subpatternId));
674	}
675
676	void atomNamedBackReference(const String& subpatternName)
677	{
678	ASSERT(m_pattern.m_namedGroupToParenIndex.find(subpatternName) != m_pattern.m_namedGroupToParenIndex.end());
679	atomBackReference(subpatternId: m_pattern.m_namedGroupToParenIndex.get(k: subpatternName));
680	}
681
682	bool isValidNamedForwardReference(const String& subpatternName)
683	{
684	return !m_unmatchedNamedForwardReferences.contains(subpatternName);
685	}
686
687	void atomNamedForwardReference(const String& subpatternName)
688	{
689	if (!m_pattern.m_namedForwardReferences.contains(value: subpatternName))
690	m_pattern.m_namedForwardReferences.append(value: subpatternName);
691	m_alternative->m_terms.append(other: PatternTerm::ForwardReference());
692	}
693
694	// deep copy the argument disjunction. If filterStartsWithBOL is true,
695	// skip alternatives with m_startsWithBOL set true.
696	PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
697	{
698	std::unique_ptr<PatternDisjunction> newDisjunction;
699	for (unsigned alt = `0`; alt < disjunction->m_alternatives.size(); ++alt) {
700	PatternAlternative* alternative = disjunction->m_alternatives [alt].get();
701	if (!filterStartsWithBOL \|\| !alternative->m_startsWithBOL) {
702	if (!newDisjunction) {
703	newDisjunction = std::make_unique<PatternDisjunction>();
704	newDisjunction->m_parent = disjunction->m_parent;
705	}
706	PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
707	newAlternative->m_terms.reserveInitialCapacity(size: alternative->m_terms.size());
708	for (unsigned i = `0`; i < alternative->m_terms.size(); ++i)
709	newAlternative->m_terms.append(other: copyTerm(term&: alternative->m_terms [i], filterStartsWithBOL));
710	}
711	}
712
713	if (!newDisjunction)
714	return `0`;
715
716	PatternDisjunction* copiedDisjunction = newDisjunction.get();
717	m_pattern.m_disjunctions.append(WTFMove(newDisjunction));
718	return copiedDisjunction;
719	}
720
721	PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
722	{
723	if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
724	return PatternTerm (term);
725
726	PatternTerm termCopy = term;
727	termCopy.parentheses.disjunction = copyDisjunction(disjunction: termCopy.parentheses.disjunction, filterStartsWithBOL);
728	m_pattern.m_hasCopiedParenSubexpressions = true;
729	return termCopy;
730	}
731
732	void quantifyAtom(unsigned min, unsigned max, bool greedy)
733	{
734	ASSERT(min <= max);
735	ASSERT(m_alternative->m_terms.size());
736
737	if (!max) {
738	m_alternative->removeLastTerm();
739	return;
740	}
741
742	PatternTerm& term = m_alternative->lastTerm();
743	ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
744	ASSERT(term.quantityMinCount == `1` && term.quantityMaxCount == `1` && term.quantityType == QuantifierFixedCount);
745
746	if (term.type == PatternTerm::TypeParentheticalAssertion) {
747	// If an assertion is quantified with a minimum count of zero, it can simply be removed.
748	// This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
749	// results in any input being consumed, however the continuation passed to the assertion
750	// (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
751	// reject all zero length matches (see step 2.1). A match from the continuation of the
752	// expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
753	// this is that matches from the assertion are not required, and won't be accepted anyway,
754	// so no need to ever run it.
755	if (!min)
756	m_alternative->removeLastTerm();
757	// We never need to run an assertion more than once. Subsequent interations will be run
758	// with the same start index (since assertions are non-capturing) and the same captures
759	// (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
760	// same result and captures. If the first match succeeds then the subsequent (min - 1)
761	// matches will too. Any additional optional matches will fail (on the same basis as the
762	// minimum zero quantified assertions, above), but this will still result in a match.
763	return;
764	}
765
766	if (min == max)
767	term.quantify(minCount: min, maxCount: max, type: QuantifierFixedCount);
768	else if (!min \|\| (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions))
769	term.quantify(minCount: min, maxCount: max, type: greedy ? QuantifierGreedy : QuantifierNonGreedy);
770	else {
771	term.quantify(minCount: min, maxCount: min, type: QuantifierFixedCount);
772	m_alternative->m_terms.append(other: copyTerm(term));
773	// NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
774	m_alternative->lastTerm().quantify(count: (max == quantifyInfinite) ? max : max - min, type: greedy ? QuantifierGreedy : QuantifierNonGreedy);
775	if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
776	m_alternative->lastTerm().parentheses.isCopy = true;
777	}
778	}
779
780	void disjunction()
781	{
782	m_alternative = m_alternative->m_parent->addNewAlternative();
783	}
784
785	ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN
786	{
787	if (UNLIKELY(!isSafeToRecurse()))
788	return ErrorCode::TooManyDisjunctions;
789
790	ErrorCode error = ErrorCode::NoError;
791	alternative->m_hasFixedSize = true;
792	Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition;
793
794	for (unsigned i = `0`; i < alternative->m_terms.size(); ++i) {
795	PatternTerm& term = alternative->m_terms [i];
796
797	switch (term.type) {
798	case PatternTerm::TypeAssertionBOL:
799	case PatternTerm::TypeAssertionEOL:
800	case PatternTerm::TypeAssertionWordBoundary:
801	term.inputPosition = currentInputPosition.unsafeGet();
802	break;
803
804	case PatternTerm::TypeBackReference:
805	term.inputPosition = currentInputPosition.unsafeGet();
806	term.frameLocation = currentCallFrameSize;
807	currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
808	alternative->m_hasFixedSize = false;
809	break;
810
811	case PatternTerm::TypeForwardReference:
812	break;
813
814	case PatternTerm::TypePatternCharacter:
815	term.inputPosition = currentInputPosition.unsafeGet();
816	if (term.quantityType != QuantifierFixedCount) {
817	term.frameLocation = currentCallFrameSize;
818	currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
819	alternative->m_hasFixedSize = false;
820	} else if (m_pattern.unicode()) {
821	Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
822	tempCount *= U16_LENGTH(term.patternCharacter);
823	if (tempCount.hasOverflowed())
824	return ErrorCode::OffsetTooLarge;
825	currentInputPosition += tempCount;
826	} else
827	currentInputPosition += term.quantityMaxCount;
828	break;
829
830	case PatternTerm::TypeCharacterClass:
831	term.inputPosition = currentInputPosition.unsafeGet();
832	if (term.quantityType != QuantifierFixedCount) {
833	term.frameLocation = currentCallFrameSize;
834	currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
835	alternative->m_hasFixedSize = false;
836	} else if (m_pattern.unicode()) {
837	term.frameLocation = currentCallFrameSize;
838	currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
839	currentInputPosition += term.quantityMaxCount;
840	alternative->m_hasFixedSize = false;
841	} else
842	currentInputPosition += term.quantityMaxCount;
843	break;
844
845	case PatternTerm::TypeParenthesesSubpattern:
846	// Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
847	term.frameLocation = currentCallFrameSize;
848	if (term.quantityMaxCount == `1` && !term.parentheses.isCopy) {
849	currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
850	error = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition.unsafeGet(), callFrameSize&: currentCallFrameSize);
851	if (hasError(errorCode: error))
852	return error;
853	// If quantity is fixed, then pre-check its minimum size.
854	if (term.quantityType == QuantifierFixedCount)
855	currentInputPosition += term.parentheses.disjunction->m_minimumSize;
856	term.inputPosition = currentInputPosition.unsafeGet();
857	} else if (term.parentheses.isTerminal) {
858	currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
859	error = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition.unsafeGet(), callFrameSize&: currentCallFrameSize);
860	if (hasError(errorCode: error))
861	return error;
862	term.inputPosition = currentInputPosition.unsafeGet();
863	} else {
864	term.inputPosition = currentInputPosition.unsafeGet();
865	currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
866	error = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition.unsafeGet(), callFrameSize&: currentCallFrameSize);
867	if (hasError(errorCode: error))
868	return error;
869	}
870	// Fixed count of 1 could be accepted, if they have a fixed size AND* if all alternatives are of the same length.*
871	alternative->m_hasFixedSize = false;
872	break;
873
874	case PatternTerm::TypeParentheticalAssertion:
875	term.inputPosition = currentInputPosition.unsafeGet();
876	term.frameLocation = currentCallFrameSize;
877	error = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, initialInputPosition: currentInputPosition.unsafeGet(), callFrameSize&: currentCallFrameSize);
878	if (hasError(errorCode: error))
879	return error;
880	break;
881
882	case PatternTerm::TypeDotStarEnclosure:
883	ASSERT(!m_pattern.m_saveInitialStartValue);
884	alternative->m_hasFixedSize = false;
885	term.inputPosition = initialInputPosition;
886	m_pattern.m_initialStartValueFrameLocation = currentCallFrameSize;
887	currentCallFrameSize += YarrStackSpaceForDotStarEnclosure;
888	m_pattern.m_saveInitialStartValue = true;
889	break;
890	}
891	if (currentInputPosition.hasOverflowed())
892	return ErrorCode::OffsetTooLarge;
893	}
894
895	alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
896	newCallFrameSize = currentCallFrameSize;
897	return error;
898	}
899
900	ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize)
901	{
902	if (UNLIKELY(!isSafeToRecurse()))
903	return ErrorCode::TooManyDisjunctions;
904
905	if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > `1`))
906	initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
907
908	unsigned minimumInputSize = UINT_MAX;
909	unsigned maximumCallFrameSize = `0`;
910	bool hasFixedSize = true;
911	ErrorCode error = ErrorCode::NoError;
912
913	for (unsigned alt = `0`; alt < disjunction->m_alternatives.size(); ++alt) {
914	PatternAlternative* alternative = disjunction->m_alternatives [alt].get();
915	unsigned currentAlternativeCallFrameSize;
916	error = setupAlternativeOffsets(alternative, currentCallFrameSize: initialCallFrameSize, initialInputPosition, newCallFrameSize&: currentAlternativeCallFrameSize);
917	if (hasError(errorCode: error))
918	return error;
919	minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
920	maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
921	hasFixedSize &= alternative->m_hasFixedSize;
922	if (alternative->m_minimumSize > INT_MAX)
923	m_pattern.m_containsUnsignedLengthPattern = true;
924	}
925
926	ASSERT(minimumInputSize != UINT_MAX);
927	ASSERT(maximumCallFrameSize >= initialCallFrameSize);
928
929	disjunction->m_hasFixedSize = hasFixedSize;
930	disjunction->m_minimumSize = minimumInputSize;
931	disjunction->m_callFrameSize = maximumCallFrameSize;
932	callFrameSize = maximumCallFrameSize;
933	return error;
934	}
935
936	ErrorCode setupOffsets()
937	{
938	// FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314).
939	unsigned ignoredCallFrameSize;
940	return setupDisjunctionOffsets(disjunction: m_pattern.m_body, initialCallFrameSize: `0`, initialInputPosition: `0`, callFrameSize&: ignoredCallFrameSize);
941	}
942
943	// This optimization identifies sets of parentheses that we will never need to backtrack.
944	// In these cases we do not need to store state from prior iterations.
945	// We can presently avoid backtracking for:
946	// where the parens are at the end of the regular expression (last term in any of the*
947	// alternatives of the main body disjunction).
948	// where the parens are non-capturing, and quantified unbounded greedy ().
949	// where the parens do not contain any capturing subpatterns.*
950	void checkForTerminalParentheses()
951	{
952	// This check is much too crude; should be just checking whether the candidate
953	// node contains nested capturing subpatterns, not the whole expression!
954	if (m_pattern.m_numSubpatterns)
955	return;
956
957	Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
958	for (size_t i = `0`; i < alternatives.size(); ++i) {
959	Vector<PatternTerm>& terms = alternatives[i]->m_terms;
960	if (terms.size()) {
961	PatternTerm& term = terms.last();
962	if (term.type == PatternTerm::TypeParenthesesSubpattern
963	&& term.quantityType == QuantifierGreedy
964	&& term.quantityMinCount == `0`
965	&& term.quantityMaxCount == quantifyInfinite
966	&& !term.capture())
967	term.parentheses.isTerminal = true;
968	}
969	}
970	}
971
972	void optimizeBOL()
973	{
974	// Look for expressions containing beginning of line (^) anchoring and unroll them.
975	// e.g. /^a\|^b\|c/ becomes /^a\|^b\|c/ which is executed once followed by /c/ which loops
976	// This code relies on the parsing code tagging alternatives with m_containsBOL and
977	// m_startsWithBOL and rolling those up to containing alternatives.
978	// At this point, this is only valid for non-multiline expressions.
979	PatternDisjunction* disjunction = m_pattern.m_body;
980
981	if (!m_pattern.m_containsBOL \|\| m_pattern.multiline())
982	return;
983
984	PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, filterStartsWithBOL: true);
985
986	// Set alternatives in disjunction to "onceThrough"
987	for (unsigned alt = `0`; alt < disjunction->m_alternatives.size(); ++alt)
988	disjunction->m_alternatives[alt]->setOnceThrough();
989
990	if (loopDisjunction) {
991	// Move alternatives from loopDisjunction to disjunction
992	for (unsigned alt = `0`; alt < loopDisjunction->m_alternatives.size(); ++alt)
993	disjunction->m_alternatives.append(loopDisjunction->m_alternatives [alt].release());
994
995	loopDisjunction->m_alternatives.clear();
996	}
997	}
998
999	bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex)
1000	{
1001	Vector<PatternTerm>& terms = alternative->m_terms;
1002
1003	ASSERT(endIndex <= terms.size());
1004	for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) {
1005	PatternTerm& term = terms[termIndex];
1006
1007	if (term.m_capture)
1008	return true;
1009
1010	if (term.type == PatternTerm::TypeParenthesesSubpattern) {
1011	PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
1012	for (unsigned alt = `0`; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
1013	if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), `0`, nestedDisjunction->m_alternatives[alt]->m_terms.size()))
1014	return true;
1015	}
1016	}
1017	}
1018
1019	return false;
1020	}
1021
1022	// This optimization identifies alternatives in the form of
1023	// [^].[?]<expression>.[$] for expressions that don't have any
1024	// capturing terms. The alternative is changed to <expression>
1025	// followed by processing of the dot stars to find and adjust the
1026	// beginning and the end of the match.
1027	void optimizeDotStarWrappedExpressions()
1028	{
1029	Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
1030	if (alternatives.size() != `1`)
1031	return;
1032
1033	CharacterClass* dotCharacterClass = m_pattern.dotAll() ? m_pattern.anyCharacterClass() : m_pattern.newlineCharacterClass();
1034	PatternAlternative* alternative = alternatives[`0`].get();
1035	Vector<PatternTerm>& terms = alternative->m_terms;
1036	if (terms.size() >= `3`) {
1037	bool startsWithBOL = false;
1038	bool endsWithEOL = false;
1039	size_t termIndex, firstExpressionTerm;
1040
1041	termIndex = `0`;
1042	if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
1043	startsWithBOL = true;
1044	++termIndex;
1045	}
1046
1047	PatternTerm& firstNonAnchorTerm = terms[termIndex];
1048	if (firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass
1049	\|\| firstNonAnchorTerm.characterClass != dotCharacterClass
1050	\|\| firstNonAnchorTerm.quantityMinCount
1051	\|\| firstNonAnchorTerm.quantityMaxCount != quantifyInfinite)
1052	return;
1053
1054	firstExpressionTerm = termIndex + `1`;
1055
1056	termIndex = terms.size() - `1`;
1057	if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) {
1058	endsWithEOL = true;
1059	--termIndex;
1060	}
1061
1062	PatternTerm& lastNonAnchorTerm = terms[termIndex];
1063	if (lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass
1064	\|\| lastNonAnchorTerm.characterClass != dotCharacterClass
1065	\|\| lastNonAnchorTerm.quantityType != QuantifierGreedy
1066	\|\| lastNonAnchorTerm.quantityMinCount
1067	\|\| lastNonAnchorTerm.quantityMaxCount != quantifyInfinite)
1068	return;
1069
1070	size_t endIndex = termIndex;
1071	if (firstExpressionTerm >= endIndex)
1072	return;
1073
1074	if (!containsCapturingTerms(alternative, firstTermIndex: firstExpressionTerm, endIndex)) {
1075	for (termIndex = terms.size() - `1`; termIndex >= endIndex; --termIndex)
1076	terms.remove(termIndex);
1077
1078	for (termIndex = firstExpressionTerm; termIndex > `0`; --termIndex)
1079	terms.remove(termIndex - `1`);
1080
1081	terms.append(PatternTerm (startsWithBOL, endsWithEOL));
1082
1083	m_pattern.m_containsBOL = false;
1084	}
1085	}
1086	}
1087
1088	private:
1089	bool isSafeToRecurse() const
1090	{
1091	if (!m_stackLimit)
1092	return true;
1093	int8_t* curr = reinterpret_cast<int8_t*>(&curr);
1094	int8_t* limit = reinterpret_cast<int8_t*>(m_stackLimit);
1095	return curr >= limit;
1096	}
1097
1098	YarrPattern& m_pattern;
1099	PatternAlternative* m_alternative;
1100	CharacterClassConstructor m_characterClassConstructor;
1101	Vector<String> m_unmatchedNamedForwardReferences;
1102	void* m_stackLimit;
1103	bool m_invertCharacterClass;
1104	bool m_invertParentheticalAssertion { false };
1105	};
1106
1107	ErrorCode YarrPattern::compile(const String& patternString, void* stackLimit)
1108	{
1109	YarrPatternConstructor constructor(*this, stackLimit);
1110
1111	if (m_flags == InvalidFlags)
1112	return ErrorCode::InvalidRegularExpressionFlags;
1113
1114	{
1115	ErrorCode error = parse(constructor, patternString, unicode());
1116	if (hasError(errorCode: error))
1117	return error;
1118	}
1119
1120	// If the pattern contains illegal backreferences reset & reparse.
1121	// Quoting Netscape's "What's new in JavaScript 1.2",
1122	// "Note: if the number of left parentheses is less than the number specified
1123	// in \#, the \# is taken as an octal escape as described in the next row."
1124	if (containsIllegalBackReference() \|\| containsIllegalNamedForwardReferences()) {
1125	if (unicode())
1126	return ErrorCode::InvalidBackreference;
1127
1128	unsigned numSubpatterns = m_numSubpatterns;
1129
1130	constructor.saveUnmatchedNamedForwardReferences();
1131	constructor.resetForReparsing();
1132	ErrorCode error = parse(constructor, patternString, unicode(), numSubpatterns);
1133	ASSERT_UNUSED(error, !hasError(error));
1134	ASSERT(numSubpatterns == m_numSubpatterns);
1135	}
1136
1137	constructor.checkForTerminalParentheses();
1138	constructor.optimizeDotStarWrappedExpressions();
1139	constructor.optimizeBOL();
1140
1141	{
1142	ErrorCode error = constructor.setupOffsets();
1143	if (hasError(errorCode: error))
1144	return error;
1145	}
1146
1147	if (Options::dumpCompiledRegExpPatterns())
1148	dumpPattern(pattern: patternString);
1149
1150	return ErrorCode::NoError;
1151	}
1152
1153	YarrPattern::YarrPattern(const String& pattern, RegExpFlags flags, ErrorCode& error, void* stackLimit)
1154	: m_containsBackreferences(false)
1155	, m_containsBOL(false)
1156	, m_containsUnsignedLengthPattern(false)
1157	, m_hasCopiedParenSubexpressions(false)
1158	, m_saveInitialStartValue(false)
1159	, m_flags(flags)
1160	{
1161	error = compile(patternString: pattern, stackLimit);
1162	}
1163
1164	void indentForNestingLevel(PrintStream& out, unsigned nestingDepth)
1165	{
1166	out.print(" ");
1167	for (; nestingDepth; --nestingDepth)
1168	out.print(" ");
1169	}
1170
1171	void dumpUChar32(PrintStream& out, UChar32 c)
1172	{
1173	if (c >= `' '`&& c <= `0xff`)
1174	out.printf(format: "'%c'", static_cast<char>(c));
1175	else
1176	out.printf(format: "0x%04x", c);
1177	}
1178
1179	void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* characterClass)
1180	{
1181	if (characterClass == pattern->anyCharacterClass())
1182	out.print("<any character>");
1183	else if (characterClass == pattern->newlineCharacterClass())
1184	out.print("<newline>");
1185	else if (characterClass == pattern->digitsCharacterClass())
1186	out.print("<digits>");
1187	else if (characterClass == pattern->spacesCharacterClass())
1188	out.print("<whitespace>");
1189	else if (characterClass == pattern->wordcharCharacterClass())
1190	out.print("<word>");
1191	else if (characterClass == pattern->wordUnicodeIgnoreCaseCharCharacterClass())
1192	out.print("<unicode word ignore case>");
1193	else if (characterClass == pattern->nondigitsCharacterClass())
1194	out.print("<non-digits>");
1195	else if (characterClass == pattern->nonspacesCharacterClass())
1196	out.print("<non-whitespace>");
1197	else if (characterClass == pattern->nonwordcharCharacterClass())
1198	out.print("<non-word>");
1199	else if (characterClass == pattern->nonwordUnicodeIgnoreCaseCharCharacterClass())
1200	out.print("<unicode non-word ignore case>");
1201	else {
1202	bool needMatchesRangesSeperator = false;
1203
1204	auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) {
1205	size_t matchesSize = matches.size();
1206	if (matchesSize) {
1207	if (needMatchesRangesSeperator)
1208	out.print(",");
1209	needMatchesRangesSeperator = true;
1210
1211	out.print(prefix, ":(");
1212	for (size_t i = `0`; i < matchesSize; ++i) {
1213	if (i)
1214	out.print(",");
1215	dumpUChar32(out, matches[i]);
1216	}
1217	out.print(")");
1218	}
1219	};
1220
1221	auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) {
1222	size_t rangeSize = ranges.size();
1223	if (rangeSize) {
1224	if (needMatchesRangesSeperator)
1225	out.print(",");
1226	needMatchesRangesSeperator = true;
1227
1228	out.print(prefix, " ranges:(");
1229	for (size_t i = `0`; i < rangeSize; ++i) {
1230	if (i)
1231	out.print(",");
1232	CharacterRange range = ranges[i];
1233	out.print("(");
1234	dumpUChar32(out, c: range.begin);
1235	out.print("..");
1236	dumpUChar32(out, c: range.end);
1237	out.print(")");
1238	}
1239	out.print(")");
1240	}
1241	};
1242
1243	out.print("[");
1244	dumpMatches("ASCII", characterClass->m_matches);
1245	dumpRanges("ASCII", characterClass->m_ranges);
1246	dumpMatches("Unicode", characterClass->m_matchesUnicode);
1247	dumpRanges("Unicode", characterClass->m_rangesUnicode);
1248	out.print("]");
1249	}
1250	}
1251
1252	void PatternAlternative::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1253	{
1254	out.print("minimum size: ", m_minimumSize);
1255	if (m_hasFixedSize)
1256	out.print(",fixed size");
1257	if (m_onceThrough)
1258	out.print(",once through");
1259	if (m_startsWithBOL)
1260	out.print(",starts with ^");
1261	if (m_containsBOL)
1262	out.print(",contains ^");
1263	out.print("\n");
1264
1265	for (size_t i = `0`; i < m_terms.size(); ++i)
1266	m_terms [i].dump(out, thisPattern, nestingDepth);
1267	}
1268
1269	void PatternTerm::dumpQuantifier(PrintStream& out)
1270	{
1271	if (quantityType == QuantifierFixedCount && quantityMinCount == `1` && quantityMaxCount == `1`)
1272	return;
1273	out.print(" {", quantityMinCount.unsafeGet());
1274	if (quantityMinCount != quantityMaxCount) {
1275	if (quantityMaxCount == UINT_MAX)
1276	out.print(",...");
1277	else
1278	out.print(",", quantityMaxCount.unsafeGet());
1279	}
1280	out.print("}");
1281	if (quantityType == QuantifierGreedy)
1282	out.print(" greedy");
1283	else if (quantityType == QuantifierNonGreedy)
1284	out.print(" non-greedy");
1285	}
1286
1287	void PatternTerm::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1288	{
1289	indentForNestingLevel(out, nestingDepth);
1290
1291	if (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion) {
1292	if (invert())
1293	out.print("not ");
1294	}
1295
1296	switch (type) {
1297	case TypeAssertionBOL:
1298	out.println("BOL");
1299	break;
1300	case TypeAssertionEOL:
1301	out.println("EOL");
1302	break;
1303	case TypeAssertionWordBoundary:
1304	out.println("word boundary");
1305	break;
1306	case TypePatternCharacter:
1307	out.printf(format: "character ");
1308	out.printf(format: "inputPosition %u ", inputPosition);
1309	if (thisPattern->ignoreCase() && isASCIIAlpha(patternCharacter)) {
1310	dumpUChar32(out, toASCIIUpper(patternCharacter));
1311	out.print("/");
1312	dumpUChar32(out, toASCIILower(patternCharacter));
1313	} else
1314	dumpUChar32(out, c: patternCharacter);
1315	dumpQuantifier(out);
1316	if (quantityType != QuantifierFixedCount)
1317	out.print(",frame location ", frameLocation);
1318	out.println();
1319	break;
1320	case TypeCharacterClass:
1321	out.print("character class ");
1322	dumpCharacterClass(out, pattern: thisPattern, characterClass: characterClass);
1323	dumpQuantifier(out);
1324	if (quantityType != QuantifierFixedCount \|\| thisPattern->unicode())
1325	out.print(",frame location ", frameLocation);
1326	out.println();
1327	break;
1328	case TypeBackReference:
1329	out.print("back reference to subpattern #", backReferenceSubpatternId);
1330	out.println(",frame location ", frameLocation);
1331	break;
1332	case TypeForwardReference:
1333	out.println("forward reference");
1334	break;
1335	case TypeParenthesesSubpattern:
1336	if (m_capture)
1337	out.print("captured ");
1338	else
1339	out.print("non-captured ");
1340
1341	FALLTHROUGH;
1342	case TypeParentheticalAssertion:
1343	if (m_invert)
1344	out.print("inverted ");
1345
1346	if (type == TypeParenthesesSubpattern)
1347	out.print("subpattern");
1348	else if (type == TypeParentheticalAssertion)
1349	out.print("assertion");
1350
1351	if (m_capture)
1352	out.print(" #", parentheses.subpatternId);
1353
1354	dumpQuantifier(out);
1355
1356	if (parentheses.isCopy)
1357	out.print(",copy");
1358
1359	if (parentheses.isTerminal)
1360	out.print(",terminal");
1361
1362	out.println(",frame location ", frameLocation);
1363
1364	if (parentheses.disjunction->m_alternatives.size() > `1`) {
1365	indentForNestingLevel(out, nestingDepth: nestingDepth + `1`);
1366	unsigned alternativeFrameLocation = frameLocation;
1367	if (quantityMaxCount == `1` && !parentheses.isCopy)
1368	alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1369	else if (parentheses.isTerminal)
1370	alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
1371	else
1372	alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParentheses;
1373	out.println("alternative list,frame location ", alternativeFrameLocation);
1374	}
1375
1376	parentheses.disjunction->dump(out, thisPattern, nestingDepth + `1`);
1377	break;
1378	case TypeDotStarEnclosure:
1379	out.println(".* enclosure,frame location ", thisPattern->m_initialStartValueFrameLocation);
1380	break;
1381	}
1382	}
1383
1384	void PatternDisjunction::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth = `0`)
1385	{
1386	unsigned alternativeCount = m_alternatives.size();
1387	for (unsigned i = `0`; i < alternativeCount; ++i) {
1388	indentForNestingLevel(out, nestingDepth);
1389	if (alternativeCount > `1`)
1390	out.print("alternative #", i, ": ");
1391	m_alternatives [i].get()->dump(out, thisPattern, nestingDepth + (alternativeCount > `1`));
1392	}
1393	}
1394
1395	void YarrPattern::dumpPatternString(PrintStream& out, const String& patternString)
1396	{
1397	out.print("/", patternString, "/");
1398
1399	if (global())
1400	out.print("g");
1401	if (ignoreCase())
1402	out.print("i");
1403	if (multiline())
1404	out.print("m");
1405	if (unicode())
1406	out.print("u");
1407	if (sticky())
1408	out.print("y");
1409	}
1410
1411	void YarrPattern::dumpPattern(const String& patternString)
1412	{
1413	dumpPattern(out&: WTF::dataFile(), pattern: patternString);
1414	}
1415
1416	void YarrPattern::dumpPattern(PrintStream& out, const String& patternString)
1417	{
1418	out.print("RegExp pattern for ");
1419	dumpPatternString(out, patternString);
1420
1421	if (m_flags != NoFlags) {
1422	bool printSeperator = false;
1423	out.print(" (");
1424	if (global()) {
1425	out.print("global");
1426	printSeperator = true;
1427	}
1428	if (ignoreCase()) {
1429	if (printSeperator)
1430	out.print("\|");
1431	out.print("ignore case");
1432	printSeperator = true;
1433	}
1434	if (multiline()) {
1435	if (printSeperator)
1436	out.print("\|");
1437	out.print("multiline");
1438	printSeperator = true;
1439	}
1440	if (unicode()) {
1441	if (printSeperator)
1442	out.print("\|");
1443	out.print("unicode");
1444	printSeperator = true;
1445	}
1446	if (sticky()) {
1447	if (printSeperator)
1448	out.print("\|");
1449	out.print("sticky");
1450	printSeperator = true;
1451	}
1452	out.print(")");
1453	}
1454	out.print(":\n");
1455	if (m_body->m_callFrameSize)
1456	out.print(" callframe size: ", m_body->m_callFrameSize, "\n");
1457	m_body->dump(out, thisPattern: this);
1458	}
1459
1460	std::unique_ptr<CharacterClass> anycharCreate()
1461	{
1462	auto characterClass = std::make_unique<CharacterClass>();
1463	characterClass->m_ranges.append(CharacterRange (`0x00`, `0x7f`));
1464	characterClass->m_rangesUnicode.append(CharacterRange (`0x0080`, `0x10ffff`));
1465	characterClass->m_hasNonBMPCharacters = true;
1466	characterClass->m_anyCharacter = true;
1467	return characterClass;
1468	}
1469
1470	} } // namespace JSC::Yarr
1471

source code of qtdeclarative/src/3rdparty/masm/yarr/YarrPattern.cpp