YarrParser.h source code [qtdeclarative/src/3rdparty/masm/yarr/YarrParser.h]

1	/*
2	* Copyright (C) 2009, 2014-2016 Apple Inc. All rights reserved.
3	*
4	* Redistribution and use in source and binary forms, with or without
5	* modification, are permitted provided that the following conditions
6	* are met:
7	* 1. Redistributions of source code must retain the above copyright
8	* notice, this list of conditions and the following disclaimer.
9	* 2. Redistributions in binary form must reproduce the above copyright
10	* notice, this list of conditions and the following disclaimer in the
11	* documentation and/or other materials provided with the distribution.
12	*
13	* THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14	* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16	* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18	* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19	* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20	* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21	* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22	* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24	*/
25
26	#pragma once
27
28	#include "Yarr.h"
29	#include "YarrPattern.h"
30	#include "YarrUnicodeProperties.h"
31	#include <wtf/ASCIICType.h>
32	#include <wtf/HashSet.h>
33	#include <wtf/Optional.h>
34	#include <wtf/text/StringBuilder.h>
35	#include <wtf/text/WTFString.h>
36
37	namespace JSC { namespace Yarr {
38
39	// The Parser class should not be used directly - only via the Yarr::parse() method.
40	template<class Delegate, typename CharType>
41	class Parser {
42	private:
43	template<class FriendDelegate>
44	friend ErrorCode parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit);
45
46	/*
47	* CharacterClassParserDelegate:
48	*
49	* The class CharacterClassParserDelegate is used in the parsing of character
50	* classes. This class handles detection of character ranges. This class
51	* implements enough of the delegate interface such that it can be passed to
52	* parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
53	* to perform the parsing of escape characters in character sets.
54	*/
55	class CharacterClassParserDelegate {
56	public:
57	CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
58	: m_delegate(delegate)
59	, m_errorCode(err)
60	, m_state(Empty)
61	, m_character(`0`)
62	{
63	}
64
65	/*
66	* begin():
67	*
68	* Called at beginning of construction.
69	*/
70	void begin(bool invert)
71	{
72	m_delegate.atomCharacterClassBegin(invert);
73	}
74
75	/*
76	* atomPatternCharacter():
77	*
78	* This method is called either from parseCharacterClass() (for an unescaped
79	* character in a character class), or from parseEscape(). In the former case
80	* the value true will be passed for the argument 'hyphenIsRange', and in this
81	* mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
82	* is different to /[a\-z]/).
83	*/
84	void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false)
85	{
86	switch (m_state) {
87	case AfterCharacterClass:
88	// Following a builtin character class we need look out for a hyphen.
89	// We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
90	// If we see a hyphen following a charater class then unlike usual
91	// we'll report it to the delegate immediately, and put ourself into
92	// a poisoned state. Any following calls to add another character or
93	// character class will result in an error. (A hypen following a
94	// character-class is itself valid, but only at the end of a regex).
95	if (hyphenIsRange && ch == `'-'`) {
96	m_delegate.atomCharacterClassAtom(`'-'`);
97	m_state = AfterCharacterClassHyphen;
98	return;
99	}
100	// Otherwise just fall through - cached character so treat this as Empty.
101	FALLTHROUGH;
102
103	case Empty:
104	m_character = ch;
105	m_state = CachedCharacter;
106	return;
107
108	case CachedCharacter:
109	if (hyphenIsRange && ch == `'-'`)
110	m_state = CachedCharacterHyphen;
111	else {
112	m_delegate.atomCharacterClassAtom(m_character);
113	m_character = ch;
114	}
115	return;
116
117	case CachedCharacterHyphen:
118	if (ch < m_character) {
119	m_errorCode = ErrorCode::CharacterClassOutOfOrder;
120	return;
121	}
122	m_delegate.atomCharacterClassRange(m_character, ch);
123	m_state = Empty;
124	return;
125
126	// See coment in atomBuiltInCharacterClass below.
127	// This too is technically an error, per ECMA-262, and again we
128	// we chose to allow this. Note a subtlely here that while we
129	// diverge from the spec's definition of CharacterRange we do
130	// remain in compliance with the grammar. For example, consider
131	// the expression /[\d-a-z]/. We comply with the grammar in
132	// this case by not allowing a-z to be matched as a range.
133	case AfterCharacterClassHyphen:
134	m_delegate.atomCharacterClassAtom(ch);
135	m_state = Empty;
136	return;
137	}
138	}
139
140	/*
141	* atomBuiltInCharacterClass():
142	*
143	* Adds a built-in character class, called by parseEscape().
144	*/
145	void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
146	{
147	switch (m_state) {
148	case CachedCharacter:
149	// Flush the currently cached character, then fall through.
150	m_delegate.atomCharacterClassAtom(m_character);
151	FALLTHROUGH;
152	case Empty:
153	case AfterCharacterClass:
154	m_state = AfterCharacterClass;
155	m_delegate.atomCharacterClassBuiltIn(classID, invert);
156	return;
157
158	// If we hit either of these cases, we have an invalid range that
159	// looks something like /[x-\d]/ or /[\d-\d]/.
160	// According to ECMA-262 this should be a syntax error, but
161	// empirical testing shows this to break teh webz. Instead we
162	// comply with to the ECMA-262 grammar, and assume the grammar to
163	// have matched the range correctly, but tweak our interpretation
164	// of CharacterRange. Effectively we implicitly handle the hyphen
165	// as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
166	case CachedCharacterHyphen:
167	m_delegate.atomCharacterClassAtom(m_character);
168	m_delegate.atomCharacterClassAtom(`'-'`);
169	FALLTHROUGH;
170	case AfterCharacterClassHyphen:
171	m_delegate.atomCharacterClassBuiltIn(classID, invert);
172	m_state = Empty;
173	return;
174	}
175	}
176
177	/*
178	* end():
179	*
180	* Called at end of construction.
181	*/
182	void end()
183	{
184	if (m_state == CachedCharacter)
185	m_delegate.atomCharacterClassAtom(m_character);
186	else if (m_state == CachedCharacterHyphen) {
187	m_delegate.atomCharacterClassAtom(m_character);
188	m_delegate.atomCharacterClassAtom(`'-'`);
189	}
190	m_delegate.atomCharacterClassEnd();
191	}
192
193	// parseEscape() should never call these delegate methods when
194	// invoked with inCharacterClass set.
195	NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
196	NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
197	NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
198	bool isValidNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); return false; }
199	NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
200
201	private:
202	Delegate& m_delegate;
203	ErrorCode& m_errorCode;
204	enum CharacterClassConstructionState {
205	Empty,
206	CachedCharacter,
207	CachedCharacterHyphen,
208	AfterCharacterClass,
209	AfterCharacterClassHyphen,
210	} m_state;
211	UChar32 m_character;
212	};
213
214	Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit)
215	: m_delegate(delegate)
216	, m_backReferenceLimit(backReferenceLimit)
217	, m_data(pattern.characters<CharType>())
218	, m_size(pattern.size())
219	, m_isUnicode(isUnicode)
220	{
221	}
222
223	// The handling of IdentityEscapes is different depending on the unicode flag.
224	// For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'.
225	// For non-unicode patterns, most any character can be escaped.
226	bool isIdentityEscapeAnError(int ch)
227	{
228	if (m_isUnicode && !strchr(s: "^$\\.*+?()[]{}\|/", c: ch)) {
229	m_errorCode = ErrorCode::InvalidIdentityEscape;
230	return true;
231	}
232
233	return false;
234	}
235
236	/*
237	* parseEscape():
238	*
239	* Helper for parseTokens() AND parseCharacterClass().
240	* Unlike the other parser methods, this function does not report tokens
241	* directly to the member delegate (m_delegate), instead tokens are
242	* emitted to the delegate provided as an argument. In the case of atom
243	* escapes, parseTokens() will call parseEscape() passing m_delegate as
244	* an argument, and as such the escape will be reported to the delegate.
245	*
246	* However this method may also be used by parseCharacterClass(), in which
247	* case a CharacterClassParserDelegate will be passed as the delegate that
248	* tokens should be added to. A boolean flag is also provided to indicate
249	* whether that an escape in a CharacterClass is being parsed (some parsing
250	* rules change in this context).
251	*
252	* The boolean value returned by this method indicates whether the token
253	* parsed was an atom (outside of a characted class \b and \B will be
254	* interpreted as assertions).
255	*/
256	template<bool inCharacterClass, class EscapeDelegate>
257	bool parseEscape(EscapeDelegate& delegate)
258	{
259	ASSERT(!hasError(m_errorCode));
260	ASSERT(peek() == `'\\'`);
261	consume();
262
263	if (atEndOfPattern()) {
264	m_errorCode = ErrorCode::EscapeUnterminated;
265	return false;
266	}
267
268	switch (peek()) {
269	// Assertions
270	case `'b'`:
271	consume();
272	if (inCharacterClass) {
273	if (isIdentityEscapeAnError(ch: `'b'`))
274	break;
275
276	delegate.atomPatternCharacter(`'\b'`);
277	} else {
278	delegate.assertionWordBoundary(false);
279	return false;
280	}
281	break;
282	case `'B'`:
283	consume();
284	if (inCharacterClass) {
285	if (isIdentityEscapeAnError(ch: `'B'`))
286	break;
287
288	delegate.atomPatternCharacter(`'B'`);
289	} else {
290	delegate.assertionWordBoundary(true);
291	return false;
292	}
293	break;
294
295	// CharacterClassEscape
296	case `'d'`:
297	consume();
298	delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, false);
299	break;
300	case `'s'`:
301	consume();
302	delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, false);
303	break;
304	case `'w'`:
305	consume();
306	delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, false);
307	break;
308	case `'D'`:
309	consume();
310	delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, true);
311	break;
312	case `'S'`:
313	consume();
314	delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, true);
315	break;
316	case `'W'`:
317	consume();
318	delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, true);
319	break;
320
321	// DecimalEscape
322	case `'1'`:
323	case `'2'`:
324	case `'3'`:
325	case `'4'`:
326	case `'5'`:
327	case `'6'`:
328	case `'7'`:
329	case `'8'`:
330	case `'9'`: {
331	// To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
332	// First, try to parse this as backreference.
333	if (!inCharacterClass) {
334	ParseState state = saveState();
335
336	unsigned backReference = consumeNumber();
337	if (backReference <= m_backReferenceLimit) {
338	delegate.atomBackReference(backReference);
339	break;
340	}
341
342	restoreState(state);
343
344	if (m_isUnicode) {
345	m_errorCode = ErrorCode::InvalidBackreference;
346	return false;
347	}
348	}
349
350	// Not a backreference, and not octal. Just a number.
351	if (peek() >= `'8'`) {
352	delegate.atomPatternCharacter(consume());
353	break;
354	}
355
356	// Fall-through to handle this as an octal escape.
357	FALLTHROUGH;
358	}
359
360	// Octal escape
361	case `'0'`:
362	delegate.atomPatternCharacter(consumeOctal());
363	break;
364
365	// ControlEscape
366	case `'f'`:
367	consume();
368	delegate.atomPatternCharacter(`'\f'`);
369	break;
370	case `'n'`:
371	consume();
372	delegate.atomPatternCharacter(`'\n'`);
373	break;
374	case `'r'`:
375	consume();
376	delegate.atomPatternCharacter(`'\r'`);
377	break;
378	case `'t'`:
379	consume();
380	delegate.atomPatternCharacter(`'\t'`);
381	break;
382	case `'v'`:
383	consume();
384	delegate.atomPatternCharacter(`'\v'`);
385	break;
386
387	// ControlLetter
388	case `'c'`: {
389	ParseState state = saveState();
390	consume();
391	if (!atEndOfPattern()) {
392	int control = consume();
393
394	// To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
395	if (inCharacterClass ? WTF::isASCIIAlphanumeric(c: control) \|\| (control == `'_'`) : WTF::isASCIIAlpha(c: control)) {
396	delegate.atomPatternCharacter(control & `0x1f`);
397	break;
398	}
399	}
400	restoreState(state);
401	delegate.atomPatternCharacter(`'\\'`);
402	break;
403	}
404
405	// HexEscape
406	case `'x'`: {
407	consume();
408	int x = tryConsumeHex(count: `2`);
409	if (x == -`1`) {
410	if (isIdentityEscapeAnError(ch: `'x'`))
411	break;
412
413	delegate.atomPatternCharacter(`'x'`);
414	} else
415	delegate.atomPatternCharacter(x);
416	break;
417	}
418
419	// Named backreference
420	case `'k'`: {
421	consume();
422	ParseState state = saveState();
423	if (!atEndOfPattern() && !inCharacterClass) {
424	if (consume() == `'<'`) {
425	auto groupName = tryConsumeGroupName();
426	if (groupName) {
427	if (m_captureGroupNames.contains(groupName.value())) {
428	delegate.atomNamedBackReference(groupName.value());
429	break;
430	}
431
432	if (delegate.isValidNamedForwardReference(groupName.value())) {
433	delegate.atomNamedForwardReference(groupName.value());
434	break;
435	}
436	}
437	if (m_isUnicode) {
438	m_errorCode = ErrorCode::InvalidBackreference;
439	break;
440	}
441	}
442	}
443	restoreState(state);
444	delegate.atomPatternCharacter(`'k'`);
445	break;
446	}
447
448	// Unicode property escapes
449	case `'p'`:
450	case `'P'`: {
451	int escapeChar = consume();
452
453	if (!m_isUnicode) {
454	if (isIdentityEscapeAnError(ch: escapeChar))
455	break;
456	delegate.atomPatternCharacter(escapeChar);
457	break;
458	}
459
460	if (!atEndOfPattern() && peek() == `'{'`) {
461	consume();
462	auto optClassID = tryConsumeUnicodePropertyExpression();
463	if (!optClassID) {
464	// tryConsumeUnicodePropertyExpression() will set m_errorCode for a malformed property expression
465	break;
466	}
467	delegate.atomBuiltInCharacterClass(optClassID.value(), escapeChar == `'P'`);
468	} else
469	m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
470	break;
471	}
472
473	// UnicodeEscape
474	case `'u'`: {
475	consume();
476	if (atEndOfPattern()) {
477	if (isIdentityEscapeAnError(ch: `'u'`))
478	break;
479
480	delegate.atomPatternCharacter(`'u'`);
481	break;
482	}
483
484	if (m_isUnicode && peek() == `'{'`) {
485	consume();
486	UChar32 codePoint = `0`;
487	do {
488	if (atEndOfPattern() \|\| !isASCIIHexDigit(peek())) {
489	m_errorCode = ErrorCode::InvalidUnicodeEscape;
490	break;
491	}
492
493	codePoint = (codePoint << `4`) \| toASCIIHexValue(consume());
494
495	if (codePoint > UCHAR_MAX_VALUE)
496	m_errorCode = ErrorCode::InvalidUnicodeEscape;
497	} while (!atEndOfPattern() && peek() != `'}'`);
498	if (!atEndOfPattern() && peek() == `'}'`)
499	consume();
500	else if (!hasError(errorCode: m_errorCode))
501	m_errorCode = ErrorCode::InvalidUnicodeEscape;
502	if (hasError(errorCode: m_errorCode))
503	return false;
504
505	delegate.atomPatternCharacter(codePoint);
506	break;
507	}
508	int u = tryConsumeHex(count: `4`);
509	if (u == -`1`) {
510	if (isIdentityEscapeAnError(ch: `'u'`))
511	break;
512
513	delegate.atomPatternCharacter(`'u'`);
514	} else {
515	// If we have the first of a surrogate pair, look for the second.
516	if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= `6`) && peek() == `'\\'`) {
517	ParseState state = saveState();
518	consume();
519
520	if (tryConsume(ch: `'u'`)) {
521	int surrogate2 = tryConsumeHex(count: `4`);
522	if (U16_IS_TRAIL(surrogate2)) {
523	u = U16_GET_SUPPLEMENTARY(u, surrogate2);
524	delegate.atomPatternCharacter(u);
525	break;
526	}
527	}
528
529	restoreState(state);
530	}
531	delegate.atomPatternCharacter(u);
532	}
533	break;
534	}
535
536	// IdentityEscape
537	default:
538	int ch = peek();
539
540	if (ch == `'-'` && m_isUnicode && inCharacterClass) {
541	// \- is allowed for ClassEscape with unicode flag.
542	delegate.atomPatternCharacter(consume());
543	break;
544	}
545
546	if (isIdentityEscapeAnError(ch))
547	break;
548
549	delegate.atomPatternCharacter(consume());
550	}
551
552	return true;
553	}
554
555	UChar32 consumePossibleSurrogatePair()
556	{
557	UChar32 ch = consume();
558	if (U16_IS_LEAD(ch) && m_isUnicode && (patternRemaining() > `0`)) {
559	ParseState state = saveState();
560
561	UChar32 surrogate2 = consume();
562	if (U16_IS_TRAIL(surrogate2))
563	ch = U16_GET_SUPPLEMENTARY(ch, surrogate2);
564	else
565	restoreState(state);
566	}
567
568	return ch;
569	}
570
571	/*
572	* parseAtomEscape(), parseCharacterClassEscape():
573	*
574	* These methods alias to parseEscape().
575	*/
576	bool parseAtomEscape()
577	{
578	return parseEscape<false>(m_delegate);
579	}
580	void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
581	{
582	parseEscape<true>(delegate);
583	}
584
585	/*
586	* parseCharacterClass():
587	*
588	* Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
589	* to an instance of CharacterClassParserDelegate, to describe the character class to the
590	* delegate.
591	*/
592	void parseCharacterClass()
593	{
594	ASSERT(!hasError(m_errorCode));
595	ASSERT(peek() == `'['`);
596	consume();
597
598	CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode);
599
600	characterClassConstructor.begin(tryConsume(ch: `'^'`));
601
602	while (!atEndOfPattern()) {
603	switch (peek()) {
604	case `']'`:
605	consume();
606	characterClassConstructor.end();
607	return;
608
609	case `'\\'`:
610	parseCharacterClassEscape(delegate&: characterClassConstructor);
611	break;
612
613	default:
614	characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair(), true);
615	}
616
617	if (hasError(errorCode: m_errorCode))
618	return;
619	}
620
621	m_errorCode = ErrorCode::CharacterClassUnmatched;
622	}
623
624	/*
625	* parseParenthesesBegin():
626	*
627	* Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
628	*/
629	void parseParenthesesBegin()
630	{
631	ASSERT(!hasError(m_errorCode));
632	ASSERT(peek() == `'('`);
633	consume();
634
635	if (tryConsume(ch: `'?'`)) {
636	if (atEndOfPattern()) {
637	m_errorCode = ErrorCode::ParenthesesTypeInvalid;
638	return;
639	}
640
641	switch (consume()) {
642	case `':'`:
643	m_delegate.atomParenthesesSubpatternBegin(false);
644	break;
645
646	case `'='`:
647	m_delegate.atomParentheticalAssertionBegin();
648	break;
649
650	case `'!'`:
651	m_delegate.atomParentheticalAssertionBegin(true);
652	break;
653
654	case `'<'`: {
655	auto groupName = tryConsumeGroupName();
656	if (groupName) {
657	auto setAddResult = m_captureGroupNames.add(k: groupName.value());
658	if (setAddResult.isNewEntry)
659	m_delegate.atomParenthesesSubpatternBegin(true, groupName);
660	else
661	m_errorCode = ErrorCode::DuplicateGroupName;
662	} else
663	m_errorCode = ErrorCode::InvalidGroupName;
664
665	break;
666	}
667
668	default:
669	m_errorCode = ErrorCode::ParenthesesTypeInvalid;
670	}
671	} else
672	m_delegate.atomParenthesesSubpatternBegin();
673
674	++m_parenthesesNestingDepth;
675	}
676
677	/*
678	* parseParenthesesEnd():
679	*
680	* Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
681	*/
682	void parseParenthesesEnd()
683	{
684	ASSERT(!hasError(m_errorCode));
685	ASSERT(peek() == `')'`);
686	consume();
687
688	if (m_parenthesesNestingDepth > `0`)
689	m_delegate.atomParenthesesEnd();
690	else
691	m_errorCode = ErrorCode::ParenthesesUnmatched;
692
693	--m_parenthesesNestingDepth;
694	}
695
696	/*
697	* parseQuantifier():
698	*
699	* Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
700	*/
701	void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
702	{
703	ASSERT(!hasError(m_errorCode));
704	ASSERT(min <= max);
705
706	const unsigned quantifyLimit = `1` << `24`;
707	if (min > quantifyLimit \|\| (max != quantifyInfinite && max > quantifyLimit)) {
708	m_errorCode = ErrorCode::QuantifierTooLarge;
709	return;
710	}
711
712	if (lastTokenWasAnAtom)
713	m_delegate.quantifyAtom(min, max, !tryConsume(ch: `'?'`));
714	else
715	m_errorCode = ErrorCode::QuantifierWithoutAtom;
716	}
717
718	/*
719	* parseTokens():
720	*
721	* This method loops over the input pattern reporting tokens to the delegate.
722	* The method returns when a parse error is detected, or the end of the pattern
723	* is reached. One piece of state is tracked around the loop, which is whether
724	* the last token passed to the delegate was an atom (this is necessary to detect
725	* a parse error when a quantifier provided without an atom to quantify).
726	*/
727	void parseTokens()
728	{
729	bool lastTokenWasAnAtom = false;
730
731	while (!atEndOfPattern()) {
732	switch (peek()) {
733	case `'\|'`:
734	consume();
735	m_delegate.disjunction();
736	lastTokenWasAnAtom = false;
737	break;
738
739	case `'('`:
740	parseParenthesesBegin();
741	lastTokenWasAnAtom = false;
742	break;
743
744	case `')'`:
745	parseParenthesesEnd();
746	lastTokenWasAnAtom = true;
747	break;
748
749	case `'^'`:
750	consume();
751	m_delegate.assertionBOL();
752	lastTokenWasAnAtom = false;
753	break;
754
755	case `'$'`:
756	consume();
757	m_delegate.assertionEOL();
758	lastTokenWasAnAtom = false;
759	break;
760
761	case `'.'`:
762	consume();
763	m_delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DotClassID, false);
764	lastTokenWasAnAtom = true;
765	break;
766
767	case `'['`:
768	parseCharacterClass();
769	lastTokenWasAnAtom = true;
770	break;
771
772	case `'\\'`:
773	lastTokenWasAnAtom = parseAtomEscape();
774	break;
775
776	case `'*'`:
777	consume();
778	parseQuantifier(lastTokenWasAnAtom, min: `0`, max: quantifyInfinite);
779	lastTokenWasAnAtom = false;
780	break;
781
782	case `'+'`:
783	consume();
784	parseQuantifier(lastTokenWasAnAtom, min: `1`, max: quantifyInfinite);
785	lastTokenWasAnAtom = false;
786	break;
787
788	case `'?'`:
789	consume();
790	parseQuantifier(lastTokenWasAnAtom, min: `0`, max: `1`);
791	lastTokenWasAnAtom = false;
792	break;
793
794	case `'{'`: {
795	ParseState state = saveState();
796
797	consume();
798	if (peekIsDigit()) {
799	unsigned min = consumeNumber();
800	unsigned max = min;
801
802	if (tryConsume(ch: `','`))
803	max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
804
805	if (tryConsume(ch: `'}'`)) {
806	if (min <= max)
807	parseQuantifier(lastTokenWasAnAtom, min, max);
808	else
809	m_errorCode = ErrorCode::QuantifierOutOfOrder;
810	lastTokenWasAnAtom = false;
811	break;
812	}
813	}
814
815	restoreState(state);
816	}
817	// if we did not find a complete quantifer, fall through to the default case.
818	FALLTHROUGH;
819
820	default:
821	m_delegate.atomPatternCharacter(consumePossibleSurrogatePair());
822	lastTokenWasAnAtom = true;
823	}
824
825	if (hasError(errorCode: m_errorCode))
826	return;
827	}
828
829	if (m_parenthesesNestingDepth > `0`)
830	m_errorCode = ErrorCode::MissingParentheses;
831	}
832
833	/*
834	* parse():
835	*
836	* This method calls parseTokens() to parse over the input and returns error code for a result.
837	*/
838	ErrorCode parse()
839	{
840	if (m_size > MAX_PATTERN_SIZE)
841	m_errorCode = ErrorCode::PatternTooLarge;
842	else
843	parseTokens();
844	ASSERT(atEndOfPattern() \|\| hasError(m_errorCode));
845
846	return m_errorCode;
847	}
848
849	// Misc helper functions:
850
851	typedef unsigned ParseState;
852
853	ParseState saveState()
854	{
855	return m_index;
856	}
857
858	void restoreState(ParseState state)
859	{
860	m_index = state;
861	}
862
863	bool atEndOfPattern()
864	{
865	ASSERT(m_index <= m_size);
866	return m_index == m_size;
867	}
868
869	unsigned patternRemaining()
870	{
871	ASSERT(m_index <= m_size);
872	return m_size - m_index;
873	}
874
875	int peek()
876	{
877	ASSERT(m_index < m_size);
878	return m_data[m_index];
879	}
880
881	bool peekIsDigit()
882	{
883	return !atEndOfPattern() && WTF::isASCIIDigit(peek());
884	}
885
886	unsigned peekDigit()
887	{
888	ASSERT(peekIsDigit());
889	return peek() - `'0'`;
890	}
891
892	char16_t tryConsumeUnicodeEscape()
893	{
894	if (!tryConsume(ch: `'u'`))
895	return -`1`;
896
897	if (m_isUnicode && tryConsume(ch: `'{'`)) {
898	int codePoint = `0`;
899	do {
900	if (atEndOfPattern() \|\| !isASCIIHexDigit(peek())) {
901	m_errorCode = ErrorCode::InvalidUnicodeEscape;
902	return -`1`;
903	}
904
905	codePoint = (codePoint << `4`) \| toASCIIHexValue(consume());
906
907	if (codePoint > UCHAR_MAX_VALUE) {
908	m_errorCode = ErrorCode::InvalidUnicodeEscape;
909	return -`1`;
910	}
911	} while (!atEndOfPattern() && peek() != `'}'`);
912	if (!atEndOfPattern() && peek() == `'}'`)
913	consume();
914	else if (!hasError(errorCode: m_errorCode))
915	m_errorCode = ErrorCode::InvalidUnicodeEscape;
916	if (hasError(errorCode: m_errorCode))
917	return -`1`;
918
919	return codePoint;
920	}
921
922	int u = tryConsumeHex(count: `4`);
923	if (u == -`1`)
924	return -`1`;
925
926	// If we have the first of a surrogate pair, look for the second.
927	if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= `6`) && peek() == `'\\'`) {
928	ParseState state = saveState();
929	consume();
930
931	if (tryConsume(ch: `'u'`)) {
932	int surrogate2 = tryConsumeHex(count: `4`);
933	if (U16_IS_TRAIL(surrogate2)) {
934	u = U16_GET_SUPPLEMENTARY(u, surrogate2);
935	return u;
936	}
937	}
938
939	restoreState(state);
940	}
941
942	return u;
943	}
944
945	char16_t tryConsumeIdentifierCharacter()
946	{
947	char16_t ch = peek();
948
949	if (ch == `'\\'`) {
950	consume();
951	ch = tryConsumeUnicodeEscape();
952	} else
953	consume();
954
955	return ch;
956	}
957
958	bool isIdentifierStart(int ch)
959	{
960	return (WTF::isASCII(c: ch) && (WTF::isASCIIAlpha(c: ch) \|\| ch == `'_'` \|\| ch == `'$'`)) \|\| (U_GET_GC_MASK(ch) & U_GC_L_MASK);
961	}
962
963	bool isIdentifierPart(int ch)
964	{
965	return (WTF::isASCII(c: ch) && (WTF::isASCIIAlpha(c: ch) \|\| ch == `'_'` \|\| ch == `'$'`)) \|\| (U_GET_GC_MASK(ch) & (U_GC_L_MASK \| U_GC_MN_MASK \| U_GC_MC_MASK \| U_GC_ND_MASK \| U_GC_PC_MASK)) \|\| ch == `0x200C` \|\| ch == `0x200D`;
966	}
967
968	bool isUnicodePropertyValueExpressionChar(int ch)
969	{
970	return WTF::isASCIIAlphanumeric(c: ch) \|\| ch == `'_'` \|\| ch == `'='`;
971	}
972
973	char16_t consume()
974	{
975	ASSERT(m_index < m_size);
976	return m_data[m_index++];
977	}
978
979	unsigned consumeDigit()
980	{
981	ASSERT(peekIsDigit());
982	return consume() - `'0'`;
983	}
984
985	unsigned consumeNumber()
986	{
987	Checked<unsigned, RecordOverflow> n = consumeDigit();
988	while (peekIsDigit())
989	n = n * `10` + consumeDigit();
990	return n.hasOverflowed() ? quantifyInfinite : n.unsafeGet();
991	}
992
993	unsigned consumeOctal()
994	{
995	ASSERT(WTF::isASCIIOctalDigit(peek()));
996
997	unsigned n = consumeDigit();
998	while (n < `32` && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
999	n = n * `8` + consumeDigit();
1000	return n;
1001	}
1002
1003	bool tryConsume(UChar ch)
1004	{
1005	if (atEndOfPattern() \|\| (m_data[m_index] != ch))
1006	return false;
1007	++m_index;
1008	return true;
1009	}
1010
1011	int tryConsumeHex(int count)
1012	{
1013	ParseState state = saveState();
1014
1015	int n = `0`;
1016	while (count--) {
1017	if (atEndOfPattern() \|\| !WTF::isASCIIHexDigit(peek())) {
1018	restoreState(state);
1019	return -`1`;
1020	}
1021	n = (n << `4`) \| WTF::toASCIIHexValue(consume());
1022	}
1023	return n;
1024	}
1025
1026	std::optional<String> tryConsumeGroupName()
1027	{
1028	if (atEndOfPattern())
1029	return std::nullopt;
1030
1031	ParseState state = saveState();
1032
1033	char16_t ch = tryConsumeIdentifierCharacter();
1034
1035	if (isIdentifierStart(ch)) {
1036	StringBuilder identifierBuilder;
1037	identifierBuilder.append(c: ch);
1038
1039	while (!atEndOfPattern()) {
1040	ch = tryConsumeIdentifierCharacter();
1041	if (ch == `'>'`)
1042	return std::optional<String>(identifierBuilder.toString());
1043
1044	if (!isIdentifierPart(ch))
1045	break;
1046
1047	identifierBuilder.append(c: ch);
1048	}
1049	}
1050
1051	restoreState(state);
1052
1053	return std::nullopt;
1054	}
1055
1056	std::optional<BuiltInCharacterClassID> tryConsumeUnicodePropertyExpression()
1057	{
1058	if (atEndOfPattern() \|\| !isUnicodePropertyValueExpressionChar(ch: peek())) {
1059	m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1060	return std::nullopt;
1061	}
1062
1063	StringBuilder expressionBuilder;
1064	String unicodePropertyName;
1065	bool foundEquals = false;
1066	unsigned errors = `0`;
1067
1068	expressionBuilder.append(c: QChar(consume()));
1069
1070	while (!atEndOfPattern()) {
1071	char16_t ch = peek();
1072	if (ch == `'}'`) {
1073	consume();
1074	if (errors) {
1075	m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1076	return std::nullopt;
1077	}
1078
1079	if (foundEquals) {
1080	auto result = unicodeMatchPropertyValue(unicodePropertyName, expressionBuilder.toString());
1081	if (!result)
1082	m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1083	return result;
1084	}
1085
1086	auto result = unicodeMatchProperty(expressionBuilder.toString());
1087	if (!result)
1088	m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1089	return result;
1090	}
1091
1092	consume();
1093	if (ch == `'='`) {
1094	if (!foundEquals) {
1095	foundEquals = true;
1096	unicodePropertyName = expressionBuilder.toString();
1097	expressionBuilder.clear();
1098	} else
1099	errors++;
1100	} else if (!isUnicodePropertyValueExpressionChar(ch))
1101	errors++;
1102	else
1103	expressionBuilder.append(c: ch);
1104	}
1105
1106	m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1107	return std::nullopt;
1108	}
1109
1110	Delegate& m_delegate;
1111	unsigned m_backReferenceLimit;
1112	ErrorCode m_errorCode { ErrorCode::NoError };
1113	const CharType* m_data;
1114	unsigned m_size;
1115	unsigned m_index { `0` };
1116	bool m_isUnicode;
1117	unsigned m_parenthesesNestingDepth { `0` };
1118	HashSet<String> m_captureGroupNames;
1119
1120	// Derived by empirical testing of compile time in PCRE and WREC.
1121	static const unsigned MAX_PATTERN_SIZE = `1024` * `1024`;
1122	};
1123
1124	/*
1125	* Yarr::parse():
1126	*
1127	* The parse method is passed a pattern to be parsed and a delegate upon which
1128	* callbacks will be made to record the parsed tokens forming the regex.
1129	* Yarr::parse() returns null on success, or a const C string providing an error
1130	* message where a parse error occurs.
1131	*
1132	* The Delegate must implement the following interface:
1133	*
1134	* void assertionBOL();
1135	* void assertionEOL();
1136	* void assertionWordBoundary(bool invert);
1137	*
1138	* void atomPatternCharacter(UChar32 ch);
1139	* void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
1140	* void atomCharacterClassBegin(bool invert)
1141	* void atomCharacterClassAtom(UChar32 ch)
1142	* void atomCharacterClassRange(UChar32 begin, UChar32 end)
1143	* void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
1144	* void atomCharacterClassEnd()
1145	* void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> groupName);
1146	* void atomParentheticalAssertionBegin(bool invert = false);
1147	* void atomParenthesesEnd();
1148	* void atomBackReference(unsigned subpatternId);
1149	* void atomNamedBackReference(const String& subpatternName);
1150	* bool isValidNamedForwardReference(const String& subpatternName);
1151	* void atomNamedForwardReference(const String& subpatternName);
1152	*
1153	* void quantifyAtom(unsigned min, unsigned max, bool greedy);
1154	*
1155	* void disjunction();
1156	*
1157	* The regular expression is described by a sequence of assertion() and atom()
1158	* callbacks to the delegate, describing the terms in the regular expression.
1159	* Following an atom a quantifyAtom() call may occur to indicate that the previous
1160	* atom should be quantified. In the case of atoms described across multiple
1161	* calls (parentheses and character classes) the call to quantifyAtom() will come
1162	* after the call to the atomEnd() method, never after atomBegin().
1163	*
1164	* Character classes may either be described by a single call to
1165	* atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
1166	* In the latter case, ...Begin() will be called, followed by a sequence of
1167	* calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
1168	*
1169	* Sequences of atoms and assertions are broken into alternatives via calls to
1170	* disjunction(). Assertions, atoms, and disjunctions emitted between calls to
1171	* atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
1172	* atomParenthesesBegin() is passed a subpatternId. In the case of a regular
1173	* capturing subpattern, this will be the subpatternId associated with these
1174	* parentheses, and will also by definition be the lowest subpatternId of these
1175	* parentheses and of any nested paretheses. The atomParenthesesEnd() method
1176	* is passed the subpatternId of the last capturing subexpression nested within
1177	* these paretheses. In the case of a capturing subpattern with no nested
1178	* capturing subpatterns, the same subpatternId will be passed to the begin and
1179	* end functions. In the case of non-capturing subpatterns the subpatternId
1180	* passed to the begin method is also the first possible subpatternId that might
1181	* be nested within these paretheses. If a set of non-capturing parentheses does
1182	* not contain any capturing subpatterns, then the subpatternId passed to begin
1183	* will be greater than the subpatternId passed to end.
1184	*/
1185
1186	template<class Delegate>
1187	ErrorCode parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite)
1188	{
1189	if (pattern.is8Bit())
1190	return Parser<Delegate, LChar>(delegate, pattern, isUnicode, backReferenceLimit).parse();
1191	return Parser<Delegate, UChar>(delegate, pattern, isUnicode, backReferenceLimit).parse();
1192	}
1193
1194	} } // namespace JSC::Yarr
1195

source code of qtdeclarative/src/3rdparty/masm/yarr/YarrParser.h