1/*
2 * Copyright (C) 2009, 2014-2016 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#pragma once
27
28#include "Yarr.h"
29#include "YarrPattern.h"
30#include "YarrUnicodeProperties.h"
31#include <wtf/ASCIICType.h>
32#include <wtf/HashSet.h>
33#include <wtf/Optional.h>
34#include <wtf/text/StringBuilder.h>
35#include <wtf/text/WTFString.h>
36
37namespace JSC { namespace Yarr {
38
39// The Parser class should not be used directly - only via the Yarr::parse() method.
40template<class Delegate, typename CharType>
41class Parser {
42private:
43 template<class FriendDelegate>
44 friend ErrorCode parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit);
45
46 /*
47 * CharacterClassParserDelegate:
48 *
49 * The class CharacterClassParserDelegate is used in the parsing of character
50 * classes. This class handles detection of character ranges. This class
51 * implements enough of the delegate interface such that it can be passed to
52 * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused
53 * to perform the parsing of escape characters in character sets.
54 */
55 class CharacterClassParserDelegate {
56 public:
57 CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err)
58 : m_delegate(delegate)
59 , m_errorCode(err)
60 , m_state(Empty)
61 , m_character(0)
62 {
63 }
64
65 /*
66 * begin():
67 *
68 * Called at beginning of construction.
69 */
70 void begin(bool invert)
71 {
72 m_delegate.atomCharacterClassBegin(invert);
73 }
74
75 /*
76 * atomPatternCharacter():
77 *
78 * This method is called either from parseCharacterClass() (for an unescaped
79 * character in a character class), or from parseEscape(). In the former case
80 * the value true will be passed for the argument 'hyphenIsRange', and in this
81 * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/
82 * is different to /[a\-z]/).
83 */
84 void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false)
85 {
86 switch (m_state) {
87 case AfterCharacterClass:
88 // Following a builtin character class we need look out for a hyphen.
89 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/.
90 // If we see a hyphen following a charater class then unlike usual
91 // we'll report it to the delegate immediately, and put ourself into
92 // a poisoned state. Any following calls to add another character or
93 // character class will result in an error. (A hypen following a
94 // character-class is itself valid, but only at the end of a regex).
95 if (hyphenIsRange && ch == '-') {
96 m_delegate.atomCharacterClassAtom('-');
97 m_state = AfterCharacterClassHyphen;
98 return;
99 }
100 // Otherwise just fall through - cached character so treat this as Empty.
101 FALLTHROUGH;
102
103 case Empty:
104 m_character = ch;
105 m_state = CachedCharacter;
106 return;
107
108 case CachedCharacter:
109 if (hyphenIsRange && ch == '-')
110 m_state = CachedCharacterHyphen;
111 else {
112 m_delegate.atomCharacterClassAtom(m_character);
113 m_character = ch;
114 }
115 return;
116
117 case CachedCharacterHyphen:
118 if (ch < m_character) {
119 m_errorCode = ErrorCode::CharacterClassOutOfOrder;
120 return;
121 }
122 m_delegate.atomCharacterClassRange(m_character, ch);
123 m_state = Empty;
124 return;
125
126 // See coment in atomBuiltInCharacterClass below.
127 // This too is technically an error, per ECMA-262, and again we
128 // we chose to allow this. Note a subtlely here that while we
129 // diverge from the spec's definition of CharacterRange we do
130 // remain in compliance with the grammar. For example, consider
131 // the expression /[\d-a-z]/. We comply with the grammar in
132 // this case by not allowing a-z to be matched as a range.
133 case AfterCharacterClassHyphen:
134 m_delegate.atomCharacterClassAtom(ch);
135 m_state = Empty;
136 return;
137 }
138 }
139
140 /*
141 * atomBuiltInCharacterClass():
142 *
143 * Adds a built-in character class, called by parseEscape().
144 */
145 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
146 {
147 switch (m_state) {
148 case CachedCharacter:
149 // Flush the currently cached character, then fall through.
150 m_delegate.atomCharacterClassAtom(m_character);
151 FALLTHROUGH;
152 case Empty:
153 case AfterCharacterClass:
154 m_state = AfterCharacterClass;
155 m_delegate.atomCharacterClassBuiltIn(classID, invert);
156 return;
157
158 // If we hit either of these cases, we have an invalid range that
159 // looks something like /[x-\d]/ or /[\d-\d]/.
160 // According to ECMA-262 this should be a syntax error, but
161 // empirical testing shows this to break teh webz. Instead we
162 // comply with to the ECMA-262 grammar, and assume the grammar to
163 // have matched the range correctly, but tweak our interpretation
164 // of CharacterRange. Effectively we implicitly handle the hyphen
165 // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/.
166 case CachedCharacterHyphen:
167 m_delegate.atomCharacterClassAtom(m_character);
168 m_delegate.atomCharacterClassAtom('-');
169 FALLTHROUGH;
170 case AfterCharacterClassHyphen:
171 m_delegate.atomCharacterClassBuiltIn(classID, invert);
172 m_state = Empty;
173 return;
174 }
175 }
176
177 /*
178 * end():
179 *
180 * Called at end of construction.
181 */
182 void end()
183 {
184 if (m_state == CachedCharacter)
185 m_delegate.atomCharacterClassAtom(m_character);
186 else if (m_state == CachedCharacterHyphen) {
187 m_delegate.atomCharacterClassAtom(m_character);
188 m_delegate.atomCharacterClassAtom('-');
189 }
190 m_delegate.atomCharacterClassEnd();
191 }
192
193 // parseEscape() should never call these delegate methods when
194 // invoked with inCharacterClass set.
195 NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); }
196 NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); }
197 NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
198 bool isValidNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); return false; }
199 NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); }
200
201 private:
202 Delegate& m_delegate;
203 ErrorCode& m_errorCode;
204 enum CharacterClassConstructionState {
205 Empty,
206 CachedCharacter,
207 CachedCharacterHyphen,
208 AfterCharacterClass,
209 AfterCharacterClassHyphen,
210 } m_state;
211 UChar32 m_character;
212 };
213
214 Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit)
215 : m_delegate(delegate)
216 , m_backReferenceLimit(backReferenceLimit)
217 , m_data(pattern.characters<CharType>())
218 , m_size(pattern.length())
219 , m_isUnicode(isUnicode)
220 {
221 }
222
223 // The handling of IdentityEscapes is different depending on the unicode flag.
224 // For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'.
225 // For non-unicode patterns, most any character can be escaped.
226 bool isIdentityEscapeAnError(int ch)
227 {
228 if (m_isUnicode && !strchr(s: "^$\\.*+?()[]{}|/", c: ch)) {
229 m_errorCode = ErrorCode::InvalidIdentityEscape;
230 return true;
231 }
232
233 return false;
234 }
235
236 /*
237 * parseEscape():
238 *
239 * Helper for parseTokens() AND parseCharacterClass().
240 * Unlike the other parser methods, this function does not report tokens
241 * directly to the member delegate (m_delegate), instead tokens are
242 * emitted to the delegate provided as an argument. In the case of atom
243 * escapes, parseTokens() will call parseEscape() passing m_delegate as
244 * an argument, and as such the escape will be reported to the delegate.
245 *
246 * However this method may also be used by parseCharacterClass(), in which
247 * case a CharacterClassParserDelegate will be passed as the delegate that
248 * tokens should be added to. A boolean flag is also provided to indicate
249 * whether that an escape in a CharacterClass is being parsed (some parsing
250 * rules change in this context).
251 *
252 * The boolean value returned by this method indicates whether the token
253 * parsed was an atom (outside of a characted class \b and \B will be
254 * interpreted as assertions).
255 */
256 template<bool inCharacterClass, class EscapeDelegate>
257 bool parseEscape(EscapeDelegate& delegate)
258 {
259 ASSERT(!hasError(m_errorCode));
260 ASSERT(peek() == '\\');
261 consume();
262
263 if (atEndOfPattern()) {
264 m_errorCode = ErrorCode::EscapeUnterminated;
265 return false;
266 }
267
268 switch (peek()) {
269 // Assertions
270 case 'b':
271 consume();
272 if (inCharacterClass) {
273 if (isIdentityEscapeAnError(ch: 'b'))
274 break;
275
276 delegate.atomPatternCharacter('\b');
277 } else {
278 delegate.assertionWordBoundary(false);
279 return false;
280 }
281 break;
282 case 'B':
283 consume();
284 if (inCharacterClass) {
285 if (isIdentityEscapeAnError(ch: 'B'))
286 break;
287
288 delegate.atomPatternCharacter('B');
289 } else {
290 delegate.assertionWordBoundary(true);
291 return false;
292 }
293 break;
294
295 // CharacterClassEscape
296 case 'd':
297 consume();
298 delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, false);
299 break;
300 case 's':
301 consume();
302 delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, false);
303 break;
304 case 'w':
305 consume();
306 delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, false);
307 break;
308 case 'D':
309 consume();
310 delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, true);
311 break;
312 case 'S':
313 consume();
314 delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, true);
315 break;
316 case 'W':
317 consume();
318 delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, true);
319 break;
320
321 // DecimalEscape
322 case '1':
323 case '2':
324 case '3':
325 case '4':
326 case '5':
327 case '6':
328 case '7':
329 case '8':
330 case '9': {
331 // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape.
332 // First, try to parse this as backreference.
333 if (!inCharacterClass) {
334 ParseState state = saveState();
335
336 unsigned backReference = consumeNumber();
337 if (backReference <= m_backReferenceLimit) {
338 delegate.atomBackReference(backReference);
339 break;
340 }
341
342 restoreState(state);
343
344 if (m_isUnicode) {
345 m_errorCode = ErrorCode::InvalidBackreference;
346 return false;
347 }
348 }
349
350 // Not a backreference, and not octal. Just a number.
351 if (peek() >= '8') {
352 delegate.atomPatternCharacter(consume());
353 break;
354 }
355
356 // Fall-through to handle this as an octal escape.
357 FALLTHROUGH;
358 }
359
360 // Octal escape
361 case '0':
362 delegate.atomPatternCharacter(consumeOctal());
363 break;
364
365 // ControlEscape
366 case 'f':
367 consume();
368 delegate.atomPatternCharacter('\f');
369 break;
370 case 'n':
371 consume();
372 delegate.atomPatternCharacter('\n');
373 break;
374 case 'r':
375 consume();
376 delegate.atomPatternCharacter('\r');
377 break;
378 case 't':
379 consume();
380 delegate.atomPatternCharacter('\t');
381 break;
382 case 'v':
383 consume();
384 delegate.atomPatternCharacter('\v');
385 break;
386
387 // ControlLetter
388 case 'c': {
389 ParseState state = saveState();
390 consume();
391 if (!atEndOfPattern()) {
392 int control = consume();
393
394 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters.
395 if (inCharacterClass ? WTF::isASCIIAlphanumeric(c: control) || (control == '_') : WTF::isASCIIAlpha(c: control)) {
396 delegate.atomPatternCharacter(control & 0x1f);
397 break;
398 }
399 }
400 restoreState(state);
401 delegate.atomPatternCharacter('\\');
402 break;
403 }
404
405 // HexEscape
406 case 'x': {
407 consume();
408 int x = tryConsumeHex(count: 2);
409 if (x == -1) {
410 if (isIdentityEscapeAnError(ch: 'x'))
411 break;
412
413 delegate.atomPatternCharacter('x');
414 } else
415 delegate.atomPatternCharacter(x);
416 break;
417 }
418
419 // Named backreference
420 case 'k': {
421 consume();
422 ParseState state = saveState();
423 if (!atEndOfPattern() && !inCharacterClass) {
424 if (consume() == '<') {
425 auto groupName = tryConsumeGroupName();
426 if (groupName) {
427 if (m_captureGroupNames.contains(groupName.value())) {
428 delegate.atomNamedBackReference(groupName.value());
429 break;
430 }
431
432 if (delegate.isValidNamedForwardReference(groupName.value())) {
433 delegate.atomNamedForwardReference(groupName.value());
434 break;
435 }
436 }
437 if (m_isUnicode) {
438 m_errorCode = ErrorCode::InvalidBackreference;
439 break;
440 }
441 }
442 }
443 restoreState(state);
444 delegate.atomPatternCharacter('k');
445 break;
446 }
447
448 // Unicode property escapes
449 case 'p':
450 case 'P': {
451 int escapeChar = consume();
452
453 if (!m_isUnicode) {
454 if (isIdentityEscapeAnError(ch: escapeChar))
455 break;
456 delegate.atomPatternCharacter(escapeChar);
457 break;
458 }
459
460 if (!atEndOfPattern() && peek() == '{') {
461 consume();
462 auto optClassID = tryConsumeUnicodePropertyExpression();
463 if (!optClassID) {
464 // tryConsumeUnicodePropertyExpression() will set m_errorCode for a malformed property expression
465 break;
466 }
467 delegate.atomBuiltInCharacterClass(optClassID.value(), escapeChar == 'P');
468 } else
469 m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
470 break;
471 }
472
473 // UnicodeEscape
474 case 'u': {
475 consume();
476 if (atEndOfPattern()) {
477 if (isIdentityEscapeAnError(ch: 'u'))
478 break;
479
480 delegate.atomPatternCharacter('u');
481 break;
482 }
483
484 if (m_isUnicode && peek() == '{') {
485 consume();
486 UChar32 codePoint = 0;
487 do {
488 if (atEndOfPattern() || !isASCIIHexDigit(peek())) {
489 m_errorCode = ErrorCode::InvalidUnicodeEscape;
490 break;
491 }
492
493 codePoint = (codePoint << 4) | toASCIIHexValue(consume());
494
495 if (codePoint > UCHAR_MAX_VALUE)
496 m_errorCode = ErrorCode::InvalidUnicodeEscape;
497 } while (!atEndOfPattern() && peek() != '}');
498 if (!atEndOfPattern() && peek() == '}')
499 consume();
500 else if (!hasError(errorCode: m_errorCode))
501 m_errorCode = ErrorCode::InvalidUnicodeEscape;
502 if (hasError(errorCode: m_errorCode))
503 return false;
504
505 delegate.atomPatternCharacter(codePoint);
506 break;
507 }
508 int u = tryConsumeHex(count: 4);
509 if (u == -1) {
510 if (isIdentityEscapeAnError(ch: 'u'))
511 break;
512
513 delegate.atomPatternCharacter('u');
514 } else {
515 // If we have the first of a surrogate pair, look for the second.
516 if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') {
517 ParseState state = saveState();
518 consume();
519
520 if (tryConsume(ch: 'u')) {
521 int surrogate2 = tryConsumeHex(count: 4);
522 if (U16_IS_TRAIL(surrogate2)) {
523 u = U16_GET_SUPPLEMENTARY(u, surrogate2);
524 delegate.atomPatternCharacter(u);
525 break;
526 }
527 }
528
529 restoreState(state);
530 }
531 delegate.atomPatternCharacter(u);
532 }
533 break;
534 }
535
536 // IdentityEscape
537 default:
538 int ch = peek();
539
540 if (ch == '-' && m_isUnicode && inCharacterClass) {
541 // \- is allowed for ClassEscape with unicode flag.
542 delegate.atomPatternCharacter(consume());
543 break;
544 }
545
546 if (isIdentityEscapeAnError(ch))
547 break;
548
549 delegate.atomPatternCharacter(consume());
550 }
551
552 return true;
553 }
554
555 UChar32 consumePossibleSurrogatePair()
556 {
557 UChar32 ch = consume();
558 if (U16_IS_LEAD(ch) && m_isUnicode && (patternRemaining() > 0)) {
559 ParseState state = saveState();
560
561 UChar32 surrogate2 = consume();
562 if (U16_IS_TRAIL(surrogate2))
563 ch = U16_GET_SUPPLEMENTARY(ch, surrogate2);
564 else
565 restoreState(state);
566 }
567
568 return ch;
569 }
570
571 /*
572 * parseAtomEscape(), parseCharacterClassEscape():
573 *
574 * These methods alias to parseEscape().
575 */
576 bool parseAtomEscape()
577 {
578 return parseEscape<false>(m_delegate);
579 }
580 void parseCharacterClassEscape(CharacterClassParserDelegate& delegate)
581 {
582 parseEscape<true>(delegate);
583 }
584
585 /*
586 * parseCharacterClass():
587 *
588 * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape)
589 * to an instance of CharacterClassParserDelegate, to describe the character class to the
590 * delegate.
591 */
592 void parseCharacterClass()
593 {
594 ASSERT(!hasError(m_errorCode));
595 ASSERT(peek() == '[');
596 consume();
597
598 CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode);
599
600 characterClassConstructor.begin(tryConsume(ch: '^'));
601
602 while (!atEndOfPattern()) {
603 switch (peek()) {
604 case ']':
605 consume();
606 characterClassConstructor.end();
607 return;
608
609 case '\\':
610 parseCharacterClassEscape(delegate&: characterClassConstructor);
611 break;
612
613 default:
614 characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair(), true);
615 }
616
617 if (hasError(errorCode: m_errorCode))
618 return;
619 }
620
621 m_errorCode = ErrorCode::CharacterClassUnmatched;
622 }
623
624 /*
625 * parseParenthesesBegin():
626 *
627 * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns.
628 */
629 void parseParenthesesBegin()
630 {
631 ASSERT(!hasError(m_errorCode));
632 ASSERT(peek() == '(');
633 consume();
634
635 if (tryConsume(ch: '?')) {
636 if (atEndOfPattern()) {
637 m_errorCode = ErrorCode::ParenthesesTypeInvalid;
638 return;
639 }
640
641 switch (consume()) {
642 case ':':
643 m_delegate.atomParenthesesSubpatternBegin(false);
644 break;
645
646 case '=':
647 m_delegate.atomParentheticalAssertionBegin();
648 break;
649
650 case '!':
651 m_delegate.atomParentheticalAssertionBegin(true);
652 break;
653
654 case '<': {
655 auto groupName = tryConsumeGroupName();
656 if (groupName) {
657 auto setAddResult = m_captureGroupNames.add(k: groupName.value());
658 if (setAddResult.isNewEntry)
659 m_delegate.atomParenthesesSubpatternBegin(true, groupName);
660 else
661 m_errorCode = ErrorCode::DuplicateGroupName;
662 } else
663 m_errorCode = ErrorCode::InvalidGroupName;
664
665 break;
666 }
667
668 default:
669 m_errorCode = ErrorCode::ParenthesesTypeInvalid;
670 }
671 } else
672 m_delegate.atomParenthesesSubpatternBegin();
673
674 ++m_parenthesesNestingDepth;
675 }
676
677 /*
678 * parseParenthesesEnd():
679 *
680 * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses).
681 */
682 void parseParenthesesEnd()
683 {
684 ASSERT(!hasError(m_errorCode));
685 ASSERT(peek() == ')');
686 consume();
687
688 if (m_parenthesesNestingDepth > 0)
689 m_delegate.atomParenthesesEnd();
690 else
691 m_errorCode = ErrorCode::ParenthesesUnmatched;
692
693 --m_parenthesesNestingDepth;
694 }
695
696 /*
697 * parseQuantifier():
698 *
699 * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers.
700 */
701 void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max)
702 {
703 ASSERT(!hasError(m_errorCode));
704 ASSERT(min <= max);
705
706 const unsigned quantifyLimit = 1 << 24;
707 if (min > quantifyLimit || (max != quantifyInfinite && max > quantifyLimit)) {
708 m_errorCode = ErrorCode::QuantifierTooLarge;
709 return;
710 }
711
712 if (lastTokenWasAnAtom)
713 m_delegate.quantifyAtom(min, max, !tryConsume(ch: '?'));
714 else
715 m_errorCode = ErrorCode::QuantifierWithoutAtom;
716 }
717
718 /*
719 * parseTokens():
720 *
721 * This method loops over the input pattern reporting tokens to the delegate.
722 * The method returns when a parse error is detected, or the end of the pattern
723 * is reached. One piece of state is tracked around the loop, which is whether
724 * the last token passed to the delegate was an atom (this is necessary to detect
725 * a parse error when a quantifier provided without an atom to quantify).
726 */
727 void parseTokens()
728 {
729 bool lastTokenWasAnAtom = false;
730
731 while (!atEndOfPattern()) {
732 switch (peek()) {
733 case '|':
734 consume();
735 m_delegate.disjunction();
736 lastTokenWasAnAtom = false;
737 break;
738
739 case '(':
740 parseParenthesesBegin();
741 lastTokenWasAnAtom = false;
742 break;
743
744 case ')':
745 parseParenthesesEnd();
746 lastTokenWasAnAtom = true;
747 break;
748
749 case '^':
750 consume();
751 m_delegate.assertionBOL();
752 lastTokenWasAnAtom = false;
753 break;
754
755 case '$':
756 consume();
757 m_delegate.assertionEOL();
758 lastTokenWasAnAtom = false;
759 break;
760
761 case '.':
762 consume();
763 m_delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DotClassID, false);
764 lastTokenWasAnAtom = true;
765 break;
766
767 case '[':
768 parseCharacterClass();
769 lastTokenWasAnAtom = true;
770 break;
771
772 case '\\':
773 lastTokenWasAnAtom = parseAtomEscape();
774 break;
775
776 case '*':
777 consume();
778 parseQuantifier(lastTokenWasAnAtom, min: 0, max: quantifyInfinite);
779 lastTokenWasAnAtom = false;
780 break;
781
782 case '+':
783 consume();
784 parseQuantifier(lastTokenWasAnAtom, min: 1, max: quantifyInfinite);
785 lastTokenWasAnAtom = false;
786 break;
787
788 case '?':
789 consume();
790 parseQuantifier(lastTokenWasAnAtom, min: 0, max: 1);
791 lastTokenWasAnAtom = false;
792 break;
793
794 case '{': {
795 ParseState state = saveState();
796
797 consume();
798 if (peekIsDigit()) {
799 unsigned min = consumeNumber();
800 unsigned max = min;
801
802 if (tryConsume(ch: ','))
803 max = peekIsDigit() ? consumeNumber() : quantifyInfinite;
804
805 if (tryConsume(ch: '}')) {
806 if (min <= max)
807 parseQuantifier(lastTokenWasAnAtom, min, max);
808 else
809 m_errorCode = ErrorCode::QuantifierOutOfOrder;
810 lastTokenWasAnAtom = false;
811 break;
812 }
813 }
814
815 restoreState(state);
816 }
817 // if we did not find a complete quantifer, fall through to the default case.
818 FALLTHROUGH;
819
820 default:
821 m_delegate.atomPatternCharacter(consumePossibleSurrogatePair());
822 lastTokenWasAnAtom = true;
823 }
824
825 if (hasError(errorCode: m_errorCode))
826 return;
827 }
828
829 if (m_parenthesesNestingDepth > 0)
830 m_errorCode = ErrorCode::MissingParentheses;
831 }
832
833 /*
834 * parse():
835 *
836 * This method calls parseTokens() to parse over the input and returns error code for a result.
837 */
838 ErrorCode parse()
839 {
840 if (m_size > MAX_PATTERN_SIZE)
841 m_errorCode = ErrorCode::PatternTooLarge;
842 else
843 parseTokens();
844 ASSERT(atEndOfPattern() || hasError(m_errorCode));
845
846 return m_errorCode;
847 }
848
849 // Misc helper functions:
850
851 typedef unsigned ParseState;
852
853 ParseState saveState()
854 {
855 return m_index;
856 }
857
858 void restoreState(ParseState state)
859 {
860 m_index = state;
861 }
862
863 bool atEndOfPattern()
864 {
865 ASSERT(m_index <= m_size);
866 return m_index == m_size;
867 }
868
869 unsigned patternRemaining()
870 {
871 ASSERT(m_index <= m_size);
872 return m_size - m_index;
873 }
874
875 int peek()
876 {
877 ASSERT(m_index < m_size);
878 return m_data[m_index];
879 }
880
881 bool peekIsDigit()
882 {
883 return !atEndOfPattern() && WTF::isASCIIDigit(peek());
884 }
885
886 unsigned peekDigit()
887 {
888 ASSERT(peekIsDigit());
889 return peek() - '0';
890 }
891
892 int tryConsumeUnicodeEscape()
893 {
894 if (!tryConsume(ch: 'u'))
895 return -1;
896
897 if (m_isUnicode && tryConsume(ch: '{')) {
898 int codePoint = 0;
899 do {
900 if (atEndOfPattern() || !isASCIIHexDigit(peek())) {
901 m_errorCode = ErrorCode::InvalidUnicodeEscape;
902 return -1;
903 }
904
905 codePoint = (codePoint << 4) | toASCIIHexValue(consume());
906
907 if (codePoint > UCHAR_MAX_VALUE) {
908 m_errorCode = ErrorCode::InvalidUnicodeEscape;
909 return -1;
910 }
911 } while (!atEndOfPattern() && peek() != '}');
912 if (!atEndOfPattern() && peek() == '}')
913 consume();
914 else if (!hasError(errorCode: m_errorCode))
915 m_errorCode = ErrorCode::InvalidUnicodeEscape;
916 if (hasError(errorCode: m_errorCode))
917 return -1;
918
919 return codePoint;
920 }
921
922 int u = tryConsumeHex(count: 4);
923 if (u == -1)
924 return -1;
925
926 // If we have the first of a surrogate pair, look for the second.
927 if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') {
928 ParseState state = saveState();
929 consume();
930
931 if (tryConsume(ch: 'u')) {
932 int surrogate2 = tryConsumeHex(count: 4);
933 if (U16_IS_TRAIL(surrogate2)) {
934 u = U16_GET_SUPPLEMENTARY(u, surrogate2);
935 return u;
936 }
937 }
938
939 restoreState(state);
940 }
941
942 return u;
943 }
944
945 int tryConsumeIdentifierCharacter()
946 {
947 int ch = peek();
948
949 if (ch == '\\') {
950 consume();
951 ch = tryConsumeUnicodeEscape();
952 } else
953 consume();
954
955 return ch;
956 }
957
958 bool isIdentifierStart(int ch)
959 {
960 return (WTF::isASCII(c: ch) && (WTF::isASCIIAlpha(c: ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & U_GC_L_MASK);
961 }
962
963 bool isIdentifierPart(int ch)
964 {
965 return (WTF::isASCII(c: ch) && (WTF::isASCIIAlpha(c: ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || ch == 0x200C || ch == 0x200D;
966 }
967
968 bool isUnicodePropertyValueExpressionChar(int ch)
969 {
970 return WTF::isASCIIAlphanumeric(c: ch) || ch == '_' || ch == '=';
971 }
972
973 int consume()
974 {
975 ASSERT(m_index < m_size);
976 return m_data[m_index++];
977 }
978
979 unsigned consumeDigit()
980 {
981 ASSERT(peekIsDigit());
982 return consume() - '0';
983 }
984
985 unsigned consumeNumber()
986 {
987 Checked<unsigned, RecordOverflow> n = consumeDigit();
988 while (peekIsDigit())
989 n = n * 10 + consumeDigit();
990 return n.hasOverflowed() ? quantifyInfinite : n.unsafeGet();
991 }
992
993 unsigned consumeOctal()
994 {
995 ASSERT(WTF::isASCIIOctalDigit(peek()));
996
997 unsigned n = consumeDigit();
998 while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek()))
999 n = n * 8 + consumeDigit();
1000 return n;
1001 }
1002
1003 bool tryConsume(UChar ch)
1004 {
1005 if (atEndOfPattern() || (m_data[m_index] != ch))
1006 return false;
1007 ++m_index;
1008 return true;
1009 }
1010
1011 int tryConsumeHex(int count)
1012 {
1013 ParseState state = saveState();
1014
1015 int n = 0;
1016 while (count--) {
1017 if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) {
1018 restoreState(state);
1019 return -1;
1020 }
1021 n = (n << 4) | WTF::toASCIIHexValue(consume());
1022 }
1023 return n;
1024 }
1025
1026 std::optional<String> tryConsumeGroupName()
1027 {
1028 if (atEndOfPattern())
1029 return std::nullopt;
1030
1031 ParseState state = saveState();
1032
1033 int ch = tryConsumeIdentifierCharacter();
1034
1035 if (isIdentifierStart(ch)) {
1036 StringBuilder identifierBuilder;
1037 identifierBuilder.append(c: ch);
1038
1039 while (!atEndOfPattern()) {
1040 ch = tryConsumeIdentifierCharacter();
1041 if (ch == '>')
1042 return std::optional<String>(identifierBuilder.toString());
1043
1044 if (!isIdentifierPart(ch))
1045 break;
1046
1047 identifierBuilder.append(c: ch);
1048 }
1049 }
1050
1051 restoreState(state);
1052
1053 return std::nullopt;
1054 }
1055
1056 std::optional<BuiltInCharacterClassID> tryConsumeUnicodePropertyExpression()
1057 {
1058 if (atEndOfPattern() || !isUnicodePropertyValueExpressionChar(ch: peek())) {
1059 m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1060 return std::nullopt;
1061 }
1062
1063 StringBuilder expressionBuilder;
1064 String unicodePropertyName;
1065 bool foundEquals = false;
1066 unsigned errors = 0;
1067
1068 expressionBuilder.append(consume());
1069
1070 while (!atEndOfPattern()) {
1071 int ch = peek();
1072 if (ch == '}') {
1073 consume();
1074 if (errors) {
1075 m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1076 return std::nullopt;
1077 }
1078
1079 if (foundEquals) {
1080 auto result = unicodeMatchPropertyValue(unicodePropertyName, expressionBuilder.toString());
1081 if (!result)
1082 m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1083 return result;
1084 }
1085
1086 auto result = unicodeMatchProperty(expressionBuilder.toString());
1087 if (!result)
1088 m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1089 return result;
1090 }
1091
1092 consume();
1093 if (ch == '=') {
1094 if (!foundEquals) {
1095 foundEquals = true;
1096 unicodePropertyName = expressionBuilder.toString();
1097 expressionBuilder.clear();
1098 } else
1099 errors++;
1100 } else if (!isUnicodePropertyValueExpressionChar(ch))
1101 errors++;
1102 else
1103 expressionBuilder.append(c: ch);
1104 }
1105
1106 m_errorCode = ErrorCode::InvalidUnicodePropertyExpression;
1107 return std::nullopt;
1108 }
1109
1110 Delegate& m_delegate;
1111 unsigned m_backReferenceLimit;
1112 ErrorCode m_errorCode { ErrorCode::NoError };
1113 const CharType* m_data;
1114 unsigned m_size;
1115 unsigned m_index { 0 };
1116 bool m_isUnicode;
1117 unsigned m_parenthesesNestingDepth { 0 };
1118 HashSet<String> m_captureGroupNames;
1119
1120 // Derived by empirical testing of compile time in PCRE and WREC.
1121 static const unsigned MAX_PATTERN_SIZE = 1024 * 1024;
1122};
1123
1124/*
1125 * Yarr::parse():
1126 *
1127 * The parse method is passed a pattern to be parsed and a delegate upon which
1128 * callbacks will be made to record the parsed tokens forming the regex.
1129 * Yarr::parse() returns null on success, or a const C string providing an error
1130 * message where a parse error occurs.
1131 *
1132 * The Delegate must implement the following interface:
1133 *
1134 * void assertionBOL();
1135 * void assertionEOL();
1136 * void assertionWordBoundary(bool invert);
1137 *
1138 * void atomPatternCharacter(UChar32 ch);
1139 * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert);
1140 * void atomCharacterClassBegin(bool invert)
1141 * void atomCharacterClassAtom(UChar32 ch)
1142 * void atomCharacterClassRange(UChar32 begin, UChar32 end)
1143 * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
1144 * void atomCharacterClassEnd()
1145 * void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> groupName);
1146 * void atomParentheticalAssertionBegin(bool invert = false);
1147 * void atomParenthesesEnd();
1148 * void atomBackReference(unsigned subpatternId);
1149 * void atomNamedBackReference(const String& subpatternName);
1150 * bool isValidNamedForwardReference(const String& subpatternName);
1151 * void atomNamedForwardReference(const String& subpatternName);
1152 *
1153 * void quantifyAtom(unsigned min, unsigned max, bool greedy);
1154 *
1155 * void disjunction();
1156 *
1157 * The regular expression is described by a sequence of assertion*() and atom*()
1158 * callbacks to the delegate, describing the terms in the regular expression.
1159 * Following an atom a quantifyAtom() call may occur to indicate that the previous
1160 * atom should be quantified. In the case of atoms described across multiple
1161 * calls (parentheses and character classes) the call to quantifyAtom() will come
1162 * after the call to the atom*End() method, never after atom*Begin().
1163 *
1164 * Character classes may either be described by a single call to
1165 * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls.
1166 * In the latter case, ...Begin() will be called, followed by a sequence of
1167 * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End().
1168 *
1169 * Sequences of atoms and assertions are broken into alternatives via calls to
1170 * disjunction(). Assertions, atoms, and disjunctions emitted between calls to
1171 * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern.
1172 * atomParenthesesBegin() is passed a subpatternId. In the case of a regular
1173 * capturing subpattern, this will be the subpatternId associated with these
1174 * parentheses, and will also by definition be the lowest subpatternId of these
1175 * parentheses and of any nested paretheses. The atomParenthesesEnd() method
1176 * is passed the subpatternId of the last capturing subexpression nested within
1177 * these paretheses. In the case of a capturing subpattern with no nested
1178 * capturing subpatterns, the same subpatternId will be passed to the begin and
1179 * end functions. In the case of non-capturing subpatterns the subpatternId
1180 * passed to the begin method is also the first possible subpatternId that might
1181 * be nested within these paretheses. If a set of non-capturing parentheses does
1182 * not contain any capturing subpatterns, then the subpatternId passed to begin
1183 * will be greater than the subpatternId passed to end.
1184 */
1185
1186template<class Delegate>
1187ErrorCode parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite)
1188{
1189 if (pattern.is8Bit())
1190 return Parser<Delegate, LChar>(delegate, pattern, isUnicode, backReferenceLimit).parse();
1191 return Parser<Delegate, UChar>(delegate, pattern, isUnicode, backReferenceLimit).parse();
1192}
1193
1194} } // namespace JSC::Yarr
1195

source code of qtdeclarative/src/3rdparty/masm/yarr/YarrParser.h