1 | /* |
2 | * Copyright (C) 2009 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #ifndef RegexParser_h |
27 | #define RegexParser_h |
28 | |
29 | #include <wtf/Platform.h> |
30 | |
31 | #if ENABLE(YARR) |
32 | |
33 | #include <UString.h> |
34 | #include <wtf/ASCIICType.h> |
35 | #include <wtf/unicode/Unicode.h> |
36 | #include <limits.h> |
37 | |
38 | namespace JSC { namespace Yarr { |
39 | |
40 | enum BuiltInCharacterClassID { |
41 | DigitClassID, |
42 | SpaceClassID, |
43 | WordClassID, |
44 | NewlineClassID, |
45 | }; |
46 | |
47 | // The Parser class should not be used directly - only via the Yarr::parse() method. |
48 | template<class Delegate> |
49 | class Parser { |
50 | private: |
51 | template<class FriendDelegate> |
52 | friend const char* parse(FriendDelegate& delegate, const UString& pattern, unsigned backReferenceLimit); |
53 | |
54 | enum ErrorCode { |
55 | NoError, |
56 | PatternTooLarge, |
57 | QuantifierOutOfOrder, |
58 | QuantifierWithoutAtom, |
59 | MissingParentheses, |
60 | ParenthesesUnmatched, |
61 | ParenthesesTypeInvalid, |
62 | CharacterClassUnmatched, |
63 | CharacterClassOutOfOrder, |
64 | EscapeUnterminated, |
65 | NumberOfErrorCodes |
66 | }; |
67 | |
68 | /* |
69 | * CharacterClassParserDelegate: |
70 | * |
71 | * The class CharacterClassParserDelegate is used in the parsing of character |
72 | * classes. This class handles detection of character ranges. This class |
73 | * implements enough of the delegate interface such that it can be passed to |
74 | * parseEscape() as an EscapeDelegate. This allows parseEscape() to be reused |
75 | * to perform the parsing of escape characters in character sets. |
76 | */ |
77 | class CharacterClassParserDelegate { |
78 | public: |
79 | CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) |
80 | : m_delegate(delegate) |
81 | , m_err(err) |
82 | , m_state(empty) |
83 | { |
84 | } |
85 | |
86 | /* |
87 | * begin(): |
88 | * |
89 | * Called at beginning of construction. |
90 | */ |
91 | void begin(bool invert) |
92 | { |
93 | m_delegate.atomCharacterClassBegin(invert); |
94 | } |
95 | |
96 | /* |
97 | * atomPatternCharacterUnescaped(): |
98 | * |
99 | * This method is called directly from parseCharacterClass(), to report a new |
100 | * pattern character token. This method differs from atomPatternCharacter(), |
101 | * which will be called from parseEscape(), since a hypen provided via this |
102 | * method may be indicating a character range, but a hyphen parsed by |
103 | * parseEscape() cannot be interpreted as doing so. |
104 | */ |
105 | void atomPatternCharacterUnescaped(UChar ch) |
106 | { |
107 | switch (m_state) { |
108 | case empty: |
109 | m_character = ch; |
110 | m_state = cachedCharacter; |
111 | break; |
112 | |
113 | case cachedCharacter: |
114 | if (ch == '-') |
115 | m_state = cachedCharacterHyphen; |
116 | else { |
117 | m_delegate.atomCharacterClassAtom(m_character); |
118 | m_character = ch; |
119 | } |
120 | break; |
121 | |
122 | case cachedCharacterHyphen: |
123 | if (ch >= m_character) |
124 | m_delegate.atomCharacterClassRange(m_character, ch); |
125 | else |
126 | m_err = CharacterClassOutOfOrder; |
127 | m_state = empty; |
128 | } |
129 | } |
130 | |
131 | /* |
132 | * atomPatternCharacter(): |
133 | * |
134 | * Adds a pattern character, called by parseEscape(), as such will not |
135 | * interpret a hyphen as indicating a character range. |
136 | */ |
137 | void atomPatternCharacter(UChar ch) |
138 | { |
139 | // Flush if a character is already pending to prevent the |
140 | // hyphen from begin interpreted as indicating a range. |
141 | if((ch == '-') && (m_state == cachedCharacter)) |
142 | flush(); |
143 | |
144 | atomPatternCharacterUnescaped(ch); |
145 | } |
146 | |
147 | /* |
148 | * atomBuiltInCharacterClass(): |
149 | * |
150 | * Adds a built-in character class, called by parseEscape(). |
151 | */ |
152 | void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) |
153 | { |
154 | flush(); |
155 | m_delegate.atomCharacterClassBuiltIn(classID, invert); |
156 | } |
157 | |
158 | /* |
159 | * end(): |
160 | * |
161 | * Called at end of construction. |
162 | */ |
163 | void end() |
164 | { |
165 | flush(); |
166 | m_delegate.atomCharacterClassEnd(); |
167 | } |
168 | |
169 | // parseEscape() should never call these delegate methods when |
170 | // invoked with inCharacterClass set. |
171 | void assertionWordBoundary(bool) { ASSERT_NOT_REACHED(); } |
172 | void atomBackReference(unsigned) { ASSERT_NOT_REACHED(); } |
173 | |
174 | private: |
175 | void flush() |
176 | { |
177 | if (m_state != empty) // either cachedCharacter or cachedCharacterHyphen |
178 | m_delegate.atomCharacterClassAtom(m_character); |
179 | if (m_state == cachedCharacterHyphen) |
180 | m_delegate.atomCharacterClassAtom('-'); |
181 | m_state = empty; |
182 | } |
183 | |
184 | Delegate& m_delegate; |
185 | ErrorCode& m_err; |
186 | enum CharacterClassConstructionState { |
187 | empty, |
188 | cachedCharacter, |
189 | cachedCharacterHyphen, |
190 | } m_state; |
191 | UChar m_character; |
192 | }; |
193 | |
194 | Parser(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit) |
195 | : m_delegate(delegate) |
196 | , m_backReferenceLimit(backReferenceLimit) |
197 | , m_err(NoError) |
198 | , m_data(pattern.data()) |
199 | , m_size(pattern.size()) |
200 | , m_index(0) |
201 | , m_parenthesesNestingDepth(0) |
202 | { |
203 | } |
204 | |
205 | /* |
206 | * parseEscape(): |
207 | * |
208 | * Helper for parseTokens() AND parseCharacterClass(). |
209 | * Unlike the other parser methods, this function does not report tokens |
210 | * directly to the member delegate (m_delegate), instead tokens are |
211 | * emitted to the delegate provided as an argument. In the case of atom |
212 | * escapes, parseTokens() will call parseEscape() passing m_delegate as |
213 | * an argument, and as such the escape will be reported to the delegate. |
214 | * |
215 | * However this method may also be used by parseCharacterClass(), in which |
216 | * case a CharacterClassParserDelegate will be passed as the delegate that |
217 | * tokens should be added to. A boolean flag is also provided to indicate |
218 | * whether that an escape in a CharacterClass is being parsed (some parsing |
219 | * rules change in this context). |
220 | * |
221 | * The boolean value returned by this method indicates whether the token |
222 | * parsed was an atom (outside of a characted class \b and \B will be |
223 | * interpreted as assertions). |
224 | */ |
225 | template<bool inCharacterClass, class EscapeDelegate> |
226 | bool parseEscape(EscapeDelegate& delegate) |
227 | { |
228 | ASSERT(!m_err); |
229 | ASSERT(peek() == '\\'); |
230 | consume(); |
231 | |
232 | if (atEndOfPattern()) { |
233 | m_err = EscapeUnterminated; |
234 | return false; |
235 | } |
236 | |
237 | switch (peek()) { |
238 | // Assertions |
239 | case 'b': |
240 | consume(); |
241 | if (inCharacterClass) |
242 | delegate.atomPatternCharacter('\b'); |
243 | else { |
244 | delegate.assertionWordBoundary(false); |
245 | return false; |
246 | } |
247 | break; |
248 | case 'B': |
249 | consume(); |
250 | if (inCharacterClass) |
251 | delegate.atomPatternCharacter('B'); |
252 | else { |
253 | delegate.assertionWordBoundary(true); |
254 | return false; |
255 | } |
256 | break; |
257 | |
258 | // CharacterClassEscape |
259 | case 'd': |
260 | consume(); |
261 | delegate.atomBuiltInCharacterClass(DigitClassID, false); |
262 | break; |
263 | case 's': |
264 | consume(); |
265 | delegate.atomBuiltInCharacterClass(SpaceClassID, false); |
266 | break; |
267 | case 'w': |
268 | consume(); |
269 | delegate.atomBuiltInCharacterClass(WordClassID, false); |
270 | break; |
271 | case 'D': |
272 | consume(); |
273 | delegate.atomBuiltInCharacterClass(DigitClassID, true); |
274 | break; |
275 | case 'S': |
276 | consume(); |
277 | delegate.atomBuiltInCharacterClass(SpaceClassID, true); |
278 | break; |
279 | case 'W': |
280 | consume(); |
281 | delegate.atomBuiltInCharacterClass(WordClassID, true); |
282 | break; |
283 | |
284 | // DecimalEscape |
285 | case '1': |
286 | case '2': |
287 | case '3': |
288 | case '4': |
289 | case '5': |
290 | case '6': |
291 | case '7': |
292 | case '8': |
293 | case '9': { |
294 | // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. |
295 | // First, try to parse this as backreference. |
296 | if (!inCharacterClass) { |
297 | ParseState state = saveState(); |
298 | |
299 | unsigned backReference = consumeNumber(); |
300 | if (backReference <= m_backReferenceLimit) { |
301 | delegate.atomBackReference(backReference); |
302 | break; |
303 | } |
304 | |
305 | restoreState(state); |
306 | } |
307 | |
308 | // Not a backreference, and not octal. |
309 | if (peek() >= '8') { |
310 | delegate.atomPatternCharacter('\\'); |
311 | break; |
312 | } |
313 | |
314 | // Fall-through to handle this as an octal escape. |
315 | } |
316 | |
317 | // Octal escape |
318 | case '0': |
319 | delegate.atomPatternCharacter(consumeOctal()); |
320 | break; |
321 | |
322 | // ControlEscape |
323 | case 'f': |
324 | consume(); |
325 | delegate.atomPatternCharacter('\f'); |
326 | break; |
327 | case 'n': |
328 | consume(); |
329 | delegate.atomPatternCharacter('\n'); |
330 | break; |
331 | case 'r': |
332 | consume(); |
333 | delegate.atomPatternCharacter('\r'); |
334 | break; |
335 | case 't': |
336 | consume(); |
337 | delegate.atomPatternCharacter('\t'); |
338 | break; |
339 | case 'v': |
340 | consume(); |
341 | delegate.atomPatternCharacter('\v'); |
342 | break; |
343 | |
344 | // ControlLetter |
345 | case 'c': { |
346 | ParseState state = saveState(); |
347 | consume(); |
348 | if (!atEndOfPattern()) { |
349 | int control = consume(); |
350 | |
351 | // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. |
352 | if (inCharacterClass ? WTF::isASCIIAlphanumeric(c: control) || (control == '_') : WTF::isASCIIAlpha(c: control)) { |
353 | delegate.atomPatternCharacter(control & 0x1f); |
354 | break; |
355 | } |
356 | } |
357 | restoreState(state); |
358 | delegate.atomPatternCharacter('\\'); |
359 | break; |
360 | } |
361 | |
362 | // HexEscape |
363 | case 'x': { |
364 | consume(); |
365 | int x = tryConsumeHex(count: 2); |
366 | if (x == -1) |
367 | delegate.atomPatternCharacter('x'); |
368 | else |
369 | delegate.atomPatternCharacter(x); |
370 | break; |
371 | } |
372 | |
373 | // UnicodeEscape |
374 | case 'u': { |
375 | consume(); |
376 | int u = tryConsumeHex(count: 4); |
377 | if (u == -1) |
378 | delegate.atomPatternCharacter('u'); |
379 | else |
380 | delegate.atomPatternCharacter(u); |
381 | break; |
382 | } |
383 | |
384 | // IdentityEscape |
385 | default: |
386 | delegate.atomPatternCharacter(consume()); |
387 | } |
388 | |
389 | return true; |
390 | } |
391 | |
392 | /* |
393 | * parseAtomEscape(), parseCharacterClassEscape(): |
394 | * |
395 | * These methods alias to parseEscape(). |
396 | */ |
397 | bool parseAtomEscape() |
398 | { |
399 | return parseEscape<false>(m_delegate); |
400 | } |
401 | void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) |
402 | { |
403 | parseEscape<true>(delegate); |
404 | } |
405 | |
406 | /* |
407 | * parseCharacterClass(): |
408 | * |
409 | * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) |
410 | * to an instance of CharacterClassParserDelegate, to describe the character class to the |
411 | * delegate. |
412 | */ |
413 | void parseCharacterClass() |
414 | { |
415 | ASSERT(!m_err); |
416 | ASSERT(peek() == '['); |
417 | consume(); |
418 | |
419 | CharacterClassParserDelegate characterClassConstructor(m_delegate, m_err); |
420 | |
421 | characterClassConstructor.begin(tryConsume(ch: '^')); |
422 | |
423 | while (!atEndOfPattern()) { |
424 | switch (peek()) { |
425 | case ']': |
426 | consume(); |
427 | characterClassConstructor.end(); |
428 | return; |
429 | |
430 | case '\\': |
431 | parseCharacterClassEscape(delegate&: characterClassConstructor); |
432 | break; |
433 | |
434 | default: |
435 | characterClassConstructor.atomPatternCharacterUnescaped(consume()); |
436 | } |
437 | |
438 | if (m_err) |
439 | return; |
440 | } |
441 | |
442 | m_err = CharacterClassUnmatched; |
443 | } |
444 | |
445 | /* |
446 | * parseParenthesesBegin(): |
447 | * |
448 | * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. |
449 | */ |
450 | void parseParenthesesBegin() |
451 | { |
452 | ASSERT(!m_err); |
453 | ASSERT(peek() == '('); |
454 | consume(); |
455 | |
456 | if (tryConsume(ch: '?')) { |
457 | if (atEndOfPattern()) { |
458 | m_err = ParenthesesTypeInvalid; |
459 | return; |
460 | } |
461 | |
462 | switch (consume()) { |
463 | case ':': |
464 | m_delegate.atomParenthesesSubpatternBegin(false); |
465 | break; |
466 | |
467 | case '=': |
468 | m_delegate.atomParentheticalAssertionBegin(); |
469 | break; |
470 | |
471 | case '!': |
472 | m_delegate.atomParentheticalAssertionBegin(true); |
473 | break; |
474 | |
475 | default: |
476 | m_err = ParenthesesTypeInvalid; |
477 | } |
478 | } else |
479 | m_delegate.atomParenthesesSubpatternBegin(); |
480 | |
481 | ++m_parenthesesNestingDepth; |
482 | } |
483 | |
484 | /* |
485 | * parseParenthesesEnd(): |
486 | * |
487 | * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). |
488 | */ |
489 | void parseParenthesesEnd() |
490 | { |
491 | ASSERT(!m_err); |
492 | ASSERT(peek() == ')'); |
493 | consume(); |
494 | |
495 | if (m_parenthesesNestingDepth > 0) |
496 | m_delegate.atomParenthesesEnd(); |
497 | else |
498 | m_err = ParenthesesUnmatched; |
499 | |
500 | --m_parenthesesNestingDepth; |
501 | } |
502 | |
503 | /* |
504 | * parseQuantifier(): |
505 | * |
506 | * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. |
507 | */ |
508 | void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) |
509 | { |
510 | ASSERT(!m_err); |
511 | ASSERT(min <= max); |
512 | |
513 | if (lastTokenWasAnAtom) |
514 | m_delegate.quantifyAtom(min, max, !tryConsume(ch: '?')); |
515 | else |
516 | m_err = QuantifierWithoutAtom; |
517 | } |
518 | |
519 | /* |
520 | * parseTokens(): |
521 | * |
522 | * This method loops over the input pattern reporting tokens to the delegate. |
523 | * The method returns when a parse error is detected, or the end of the pattern |
524 | * is reached. One piece of state is tracked around the loop, which is whether |
525 | * the last token passed to the delegate was an atom (this is necessary to detect |
526 | * a parse error when a quantifier provided without an atom to quantify). |
527 | */ |
528 | void parseTokens() |
529 | { |
530 | bool lastTokenWasAnAtom = false; |
531 | |
532 | while (!atEndOfPattern()) { |
533 | switch (peek()) { |
534 | case '|': |
535 | consume(); |
536 | m_delegate.disjunction(); |
537 | lastTokenWasAnAtom = false; |
538 | break; |
539 | |
540 | case '(': |
541 | parseParenthesesBegin(); |
542 | lastTokenWasAnAtom = false; |
543 | break; |
544 | |
545 | case ')': |
546 | parseParenthesesEnd(); |
547 | lastTokenWasAnAtom = true; |
548 | break; |
549 | |
550 | case '^': |
551 | consume(); |
552 | m_delegate.assertionBOL(); |
553 | lastTokenWasAnAtom = false; |
554 | break; |
555 | |
556 | case '$': |
557 | consume(); |
558 | m_delegate.assertionEOL(); |
559 | lastTokenWasAnAtom = false; |
560 | break; |
561 | |
562 | case '.': |
563 | consume(); |
564 | m_delegate.atomBuiltInCharacterClass(NewlineClassID, true); |
565 | lastTokenWasAnAtom = true; |
566 | break; |
567 | |
568 | case '[': |
569 | parseCharacterClass(); |
570 | lastTokenWasAnAtom = true; |
571 | break; |
572 | |
573 | case '\\': |
574 | lastTokenWasAnAtom = parseAtomEscape(); |
575 | break; |
576 | |
577 | case '*': |
578 | consume(); |
579 | parseQuantifier(lastTokenWasAnAtom, min: 0, UINT_MAX); |
580 | lastTokenWasAnAtom = false; |
581 | break; |
582 | |
583 | case '+': |
584 | consume(); |
585 | parseQuantifier(lastTokenWasAnAtom, min: 1, UINT_MAX); |
586 | lastTokenWasAnAtom = false; |
587 | break; |
588 | |
589 | case '?': |
590 | consume(); |
591 | parseQuantifier(lastTokenWasAnAtom, min: 0, max: 1); |
592 | lastTokenWasAnAtom = false; |
593 | break; |
594 | |
595 | case '{': { |
596 | ParseState state = saveState(); |
597 | |
598 | consume(); |
599 | if (peekIsDigit()) { |
600 | unsigned min = consumeNumber(); |
601 | unsigned max = min; |
602 | |
603 | if (tryConsume(ch: ',')) |
604 | max = peekIsDigit() ? consumeNumber() : UINT_MAX; |
605 | |
606 | if (tryConsume(ch: '}')) { |
607 | if (min <= max) |
608 | parseQuantifier(lastTokenWasAnAtom, min, max); |
609 | else |
610 | m_err = QuantifierOutOfOrder; |
611 | lastTokenWasAnAtom = false; |
612 | break; |
613 | } |
614 | } |
615 | |
616 | restoreState(state); |
617 | } // if we did not find a complete quantifer, fall through to the default case. |
618 | |
619 | default: |
620 | m_delegate.atomPatternCharacter(consume()); |
621 | lastTokenWasAnAtom = true; |
622 | } |
623 | |
624 | if (m_err) |
625 | return; |
626 | } |
627 | |
628 | if (m_parenthesesNestingDepth > 0) |
629 | m_err = MissingParentheses; |
630 | } |
631 | |
632 | /* |
633 | * parse(): |
634 | * |
635 | * This method calls regexBegin(), calls parseTokens() to parse over the input |
636 | * patterns, calls regexEnd() or regexError() as appropriate, and converts any |
637 | * error code to a const char* for a result. |
638 | */ |
639 | const char* parse() |
640 | { |
641 | m_delegate.regexBegin(); |
642 | |
643 | if (m_size > MAX_PATTERN_SIZE) |
644 | m_err = PatternTooLarge; |
645 | else |
646 | parseTokens(); |
647 | ASSERT(atEndOfPattern() || m_err); |
648 | |
649 | if (m_err) |
650 | m_delegate.regexError(); |
651 | else |
652 | m_delegate.regexEnd(); |
653 | |
654 | // The order of this array must match the ErrorCode enum. |
655 | static const char* errorMessages[NumberOfErrorCodes] = { |
656 | 0, // NoError |
657 | "regular expression too large" , |
658 | "numbers out of order in {} quantifier" , |
659 | "nothing to repeat" , |
660 | "missing )" , |
661 | "unmatched parentheses" , |
662 | "unrecognized character after (?" , |
663 | "missing terminating ] for character class" , |
664 | "range out of order in character class" , |
665 | "\\ at end of pattern" |
666 | }; |
667 | |
668 | return errorMessages[m_err]; |
669 | } |
670 | |
671 | |
672 | // Misc helper functions: |
673 | |
674 | typedef unsigned ParseState; |
675 | |
676 | ParseState saveState() |
677 | { |
678 | return m_index; |
679 | } |
680 | |
681 | void restoreState(ParseState state) |
682 | { |
683 | m_index = state; |
684 | } |
685 | |
686 | bool atEndOfPattern() |
687 | { |
688 | ASSERT(m_index <= m_size); |
689 | return m_index == m_size; |
690 | } |
691 | |
692 | int peek() |
693 | { |
694 | ASSERT(m_index < m_size); |
695 | return m_data[m_index]; |
696 | } |
697 | |
698 | bool peekIsDigit() |
699 | { |
700 | return !atEndOfPattern() && WTF::isASCIIDigit(peek()); |
701 | } |
702 | |
703 | unsigned peekDigit() |
704 | { |
705 | ASSERT(peekIsDigit()); |
706 | return peek() - '0'; |
707 | } |
708 | |
709 | int consume() |
710 | { |
711 | ASSERT(m_index < m_size); |
712 | return m_data[m_index++]; |
713 | } |
714 | |
715 | unsigned consumeDigit() |
716 | { |
717 | ASSERT(peekIsDigit()); |
718 | return consume() - '0'; |
719 | } |
720 | |
721 | unsigned () |
722 | { |
723 | unsigned n = consumeDigit(); |
724 | // check for overflow. |
725 | for (unsigned newValue; peekIsDigit() && ((newValue = n * 10 + peekDigit()) >= n); ) { |
726 | n = newValue; |
727 | consume(); |
728 | } |
729 | return n; |
730 | } |
731 | |
732 | unsigned consumeOctal() |
733 | { |
734 | ASSERT(WTF::isASCIIOctalDigit(peek())); |
735 | |
736 | unsigned n = consumeDigit(); |
737 | while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) |
738 | n = n * 8 + consumeDigit(); |
739 | return n; |
740 | } |
741 | |
742 | bool tryConsume(UChar ch) |
743 | { |
744 | if (atEndOfPattern() || (m_data[m_index] != ch)) |
745 | return false; |
746 | ++m_index; |
747 | return true; |
748 | } |
749 | |
750 | int tryConsumeHex(int count) |
751 | { |
752 | ParseState state = saveState(); |
753 | |
754 | int n = 0; |
755 | while (count--) { |
756 | if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { |
757 | restoreState(state); |
758 | return -1; |
759 | } |
760 | n = (n << 4) | WTF::toASCIIHexValue(consume()); |
761 | } |
762 | return n; |
763 | } |
764 | |
765 | Delegate& m_delegate; |
766 | unsigned m_backReferenceLimit; |
767 | ErrorCode m_err; |
768 | const UChar* m_data; |
769 | unsigned m_size; |
770 | unsigned m_index; |
771 | unsigned m_parenthesesNestingDepth; |
772 | |
773 | // Derived by empirical testing of compile time in PCRE and WREC. |
774 | static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; |
775 | }; |
776 | |
777 | /* |
778 | * Yarr::parse(): |
779 | * |
780 | * The parse method is passed a pattern to be parsed and a delegate upon which |
781 | * callbacks will be made to record the parsed tokens forming the regex. |
782 | * Yarr::parse() returns null on success, or a const C string providing an error |
783 | * message where a parse error occurs. |
784 | * |
785 | * The Delegate must implement the following interface: |
786 | * |
787 | * void assertionBOL(); |
788 | * void assertionEOL(); |
789 | * void assertionWordBoundary(bool invert); |
790 | * |
791 | * void atomPatternCharacter(UChar ch); |
792 | * void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); |
793 | * void atomCharacterClassBegin(bool invert) |
794 | * void atomCharacterClassAtom(UChar ch) |
795 | * void atomCharacterClassRange(UChar begin, UChar end) |
796 | * void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) |
797 | * void atomCharacterClassEnd() |
798 | * void atomParenthesesSubpatternBegin(bool capture = true); |
799 | * void atomParentheticalAssertionBegin(bool invert = false); |
800 | * void atomParenthesesEnd(); |
801 | * void atomBackReference(unsigned subpatternId); |
802 | * |
803 | * void quantifyAtom(unsigned min, unsigned max, bool greedy); |
804 | * |
805 | * void disjunction(); |
806 | * |
807 | * void regexBegin(); |
808 | * void regexEnd(); |
809 | * void regexError(); |
810 | * |
811 | * Before any call recording tokens are made, regexBegin() will be called on the |
812 | * delegate once. Once parsing is complete either regexEnd() or regexError() will |
813 | * be called, as appropriate. |
814 | * |
815 | * The regular expression is described by a sequence of assertion*() and atom*() |
816 | * callbacks to the delegate, describing the terms in the regular expression. |
817 | * Following an atom a quantifyAtom() call may occur to indicate that the previous |
818 | * atom should be quantified. In the case of atoms described across multiple |
819 | * calls (parentheses and character classes) the call to quantifyAtom() will come |
820 | * after the call to the atom*End() method, never after atom*Begin(). |
821 | * |
822 | * Character classes may either be described by a single call to |
823 | * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. |
824 | * In the latter case, ...Begin() will be called, followed by a sequence of |
825 | * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). |
826 | * |
827 | * Sequences of atoms and assertions are broken into alternatives via calls to |
828 | * disjunction(). Assertions, atoms, and disjunctions emitted between calls to |
829 | * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. |
830 | * atomParenthesesBegin() is passed a subpatternId. In the case of a regular |
831 | * capturing subpattern, this will be the subpatternId associated with these |
832 | * parentheses, and will also by definition be the lowest subpatternId of these |
833 | * parentheses and of any nested paretheses. The atomParenthesesEnd() method |
834 | * is passed the subpatternId of the last capturing subexpression nested within |
835 | * these paretheses. In the case of a capturing subpattern with no nested |
836 | * capturing subpatterns, the same subpatternId will be passed to the begin and |
837 | * end functions. In the case of non-capturing subpatterns the subpatternId |
838 | * passed to the begin method is also the first possible subpatternId that might |
839 | * be nested within these paretheses. If a set of non-capturing parentheses does |
840 | * not contain any capturing subpatterns, then the subpatternId passed to begin |
841 | * will be greater than the subpatternId passed to end. |
842 | */ |
843 | |
844 | template<class Delegate> |
845 | const char* parse(Delegate& delegate, const UString& pattern, unsigned backReferenceLimit = UINT_MAX) |
846 | { |
847 | return Parser<Delegate>(delegate, pattern, backReferenceLimit).parse(); |
848 | } |
849 | |
850 | } } // namespace JSC::Yarr |
851 | |
852 | #endif |
853 | |
854 | #endif // RegexParser_h |
855 | |