1/*
2 * Copyright (C) 2009, 2013-2017 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "YarrInterpreter.h"
29
30#include "Options.h"
31#include "SuperSampler.h"
32#include "Yarr.h"
33#include "YarrCanonicalize.h"
34#include <wtf/BumpPointerAllocator.h>
35#include <wtf/CheckedArithmetic.h>
36#include <wtf/DataLog.h>
37#include <wtf/StdLibExtras.h>
38#include <wtf/text/CString.h>
39#include <wtf/text/WTFString.h>
40
41namespace JSC { namespace Yarr {
42
43template<typename CharType>
44class Interpreter {
45public:
46 struct ParenthesesDisjunctionContext;
47
48 struct BackTrackInfoParentheses {
49 uintptr_t matchAmount;
50 ParenthesesDisjunctionContext* lastContext;
51 };
52
53 static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context)
54 {
55 context->next = backTrack->lastContext;
56 backTrack->lastContext = context;
57 ++backTrack->matchAmount;
58 }
59
60 static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack)
61 {
62 RELEASE_ASSERT(backTrack->matchAmount);
63 RELEASE_ASSERT(backTrack->lastContext);
64 backTrack->lastContext = backTrack->lastContext->next;
65 --backTrack->matchAmount;
66 }
67
68 struct DisjunctionContext
69 {
70 DisjunctionContext() = default;
71
72 void* operator new(size_t, void* where)
73 {
74 return where;
75 }
76
77 static size_t allocationSize(unsigned numberOfFrames)
78 {
79 static_assert(alignof(DisjunctionContext) <= sizeof(void*), "");
80 size_t rawSize = (sizeof(DisjunctionContext) - sizeof(uintptr_t) + Checked<size_t>(numberOfFrames) * sizeof(uintptr_t)).unsafeGet();
81 size_t roundedSize = WTF::roundUpToMultipleOf<sizeof(void*)>(x: rawSize);
82 RELEASE_ASSERT(roundedSize >= rawSize);
83 return roundedSize;
84 }
85
86 int term { 0 };
87 unsigned matchBegin;
88 unsigned matchEnd;
89 uintptr_t frame[1];
90 };
91
92 DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction)
93 {
94 size_t size = DisjunctionContext::allocationSize(disjunction->m_frameSize);
95 allocatorPool = allocatorPool->ensureCapacity(size);
96 RELEASE_ASSERT(allocatorPool);
97 return new (allocatorPool->alloc(size)) DisjunctionContext();
98 }
99
100 void freeDisjunctionContext(DisjunctionContext* context)
101 {
102 allocatorPool = allocatorPool->dealloc(position: context);
103 }
104
105 struct ParenthesesDisjunctionContext
106 {
107 ParenthesesDisjunctionContext(unsigned* output, ByteTerm& term)
108 {
109 unsigned firstSubpatternId = term.atom.subpatternId;
110 unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns;
111
112 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) {
113 subpatternBackup[i] = output[(firstSubpatternId << 1) + i];
114 output[(firstSubpatternId << 1) + i] = offsetNoMatch;
115 }
116
117 new (getDisjunctionContext(term)) DisjunctionContext();
118 }
119
120 void* operator new(size_t, void* where)
121 {
122 return where;
123 }
124
125 void restoreOutput(unsigned* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns)
126 {
127 for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i)
128 output[(firstSubpatternId << 1) + i] = subpatternBackup[i];
129 }
130
131 DisjunctionContext* getDisjunctionContext(ByteTerm& term)
132 {
133 return bitwise_cast<DisjunctionContext*>(bitwise_cast<uintptr_t>(this) + allocationSize(numberOfSubpatterns: term.atom.parenthesesDisjunction->m_numSubpatterns));
134 }
135
136 static size_t allocationSize(unsigned numberOfSubpatterns)
137 {
138 static_assert(alignof(ParenthesesDisjunctionContext) <= sizeof(void*), "");
139 size_t rawSize = (sizeof(ParenthesesDisjunctionContext) - sizeof(unsigned) + (Checked<size_t>(numberOfSubpatterns) * 2U) * sizeof(unsigned)).unsafeGet();
140 size_t roundedSize = WTF::roundUpToMultipleOf<sizeof(void*)>(x: rawSize);
141 RELEASE_ASSERT(roundedSize >= rawSize);
142 return roundedSize;
143 }
144
145 ParenthesesDisjunctionContext* next { nullptr };
146 unsigned subpatternBackup[1];
147 };
148
149 ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, unsigned* output, ByteTerm& term)
150 {
151 size_t size = (Checked<size_t>(ParenthesesDisjunctionContext::allocationSize(term.atom.parenthesesDisjunction->m_numSubpatterns)) + DisjunctionContext::allocationSize(disjunction->m_frameSize)).unsafeGet();
152 allocatorPool = allocatorPool->ensureCapacity(size);
153 RELEASE_ASSERT(allocatorPool);
154 return new (allocatorPool->alloc(size)) ParenthesesDisjunctionContext(output, term);
155 }
156
157 void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context)
158 {
159 allocatorPool = allocatorPool->dealloc(position: context);
160 }
161
162 class InputStream {
163 public:
164 InputStream(const CharType* input, unsigned start, unsigned length, bool decodeSurrogatePairs)
165 : input(input)
166 , pos(start)
167 , length(length)
168 , decodeSurrogatePairs(decodeSurrogatePairs)
169 {
170 }
171
172 void next()
173 {
174 ++pos;
175 }
176
177 void rewind(unsigned amount)
178 {
179 ASSERT(pos >= amount);
180 pos -= amount;
181 }
182
183 int read()
184 {
185 ASSERT(pos < length);
186 if (pos < length)
187 return input[pos];
188 return -1;
189 }
190
191 int readPair()
192 {
193 ASSERT(pos + 1 < length);
194 return input[pos] | input[pos + 1] << 16;
195 }
196
197 int readChecked(unsigned negativePositionOffest)
198 {
199 RELEASE_ASSERT(pos >= negativePositionOffest);
200 unsigned p = pos - negativePositionOffest;
201 ASSERT(p < length);
202 int result = input[p];
203 if (U16_IS_LEAD(result) && decodeSurrogatePairs && p + 1 < length && U16_IS_TRAIL(input[p + 1])) {
204 if (atEnd())
205 return -1;
206
207 result = U16_GET_SUPPLEMENTARY(result, input[p + 1]);
208 next();
209 }
210 return result;
211 }
212
213 int readSurrogatePairChecked(unsigned negativePositionOffset)
214 {
215 RELEASE_ASSERT(pos >= negativePositionOffset);
216 unsigned p = pos - negativePositionOffset;
217 ASSERT(p < length);
218 if (p + 1 >= length)
219 return -1;
220
221 int first = input[p];
222 int second = input[p + 1];
223 if (U16_IS_LEAD(first) && U16_IS_TRAIL(second))
224 return U16_GET_SUPPLEMENTARY(first, second);
225
226 return -1;
227 }
228
229 int reread(unsigned from)
230 {
231 ASSERT(from < length);
232 int result = input[from];
233 if (U16_IS_LEAD(result) && decodeSurrogatePairs && from + 1 < length && U16_IS_TRAIL(input[from + 1]))
234 result = U16_GET_SUPPLEMENTARY(result, input[from + 1]);
235 return result;
236 }
237
238 int prev()
239 {
240 ASSERT(!(pos > length));
241 if (pos && length)
242 return input[pos - 1];
243 return -1;
244 }
245
246 unsigned getPos()
247 {
248 return pos;
249 }
250
251 void setPos(unsigned p)
252 {
253 pos = p;
254 }
255
256 bool atStart()
257 {
258 return pos == 0;
259 }
260
261 bool atEnd()
262 {
263 return pos == length;
264 }
265
266 unsigned end()
267 {
268 return length;
269 }
270
271 bool checkInput(unsigned count)
272 {
273 if (((pos + count) <= length) && ((pos + count) >= pos)) {
274 pos += count;
275 return true;
276 }
277 return false;
278 }
279
280 void uncheckInput(unsigned count)
281 {
282 RELEASE_ASSERT(pos >= count);
283 pos -= count;
284 }
285
286 bool atStart(unsigned negativePositionOffset)
287 {
288 return pos == negativePositionOffset;
289 }
290
291 bool atEnd(unsigned negativePositionOffest)
292 {
293 RELEASE_ASSERT(pos >= negativePositionOffest);
294 return (pos - negativePositionOffest) == length;
295 }
296
297 bool isAvailableInput(unsigned offset)
298 {
299 return (((pos + offset) <= length) && ((pos + offset) >= pos));
300 }
301
302 private:
303 const CharType* input;
304 unsigned pos;
305 unsigned length;
306 bool decodeSurrogatePairs;
307 };
308
309 bool testCharacterClass(CharacterClass* characterClass, int ch)
310 {
311 auto linearSearchMatches = [&ch](const Vector<UChar32>& matches) {
312 for (unsigned i = 0; i < matches.size(); ++i) {
313 if (ch == matches[i])
314 return true;
315 }
316
317 return false;
318 };
319
320 auto binarySearchMatches = [&ch](const Vector<UChar32>& matches) {
321 size_t low = 0;
322 size_t high = matches.size() - 1;
323
324 while (low <= high) {
325 size_t mid = low + (high - low) / 2;
326 int diff = ch - matches[mid];
327 if (!diff)
328 return true;
329
330 if (diff < 0) {
331 if (mid == low)
332 return false;
333 high = mid - 1;
334 } else
335 low = mid + 1;
336 }
337 return false;
338 };
339
340 auto linearSearchRanges = [&ch](const Vector<CharacterRange>& ranges) {
341 for (unsigned i = 0; i < ranges.size(); ++i) {
342 if ((ch >= ranges[i].begin) && (ch <= ranges[i].end))
343 return true;
344 }
345
346 return false;
347 };
348
349 auto binarySearchRanges = [&ch](const Vector<CharacterRange>& ranges) {
350 size_t low = 0;
351 size_t high = ranges.size() - 1;
352
353 while (low <= high) {
354 size_t mid = low + (high - low) / 2;
355 int rangeBeginDiff = ch - ranges[mid].begin;
356 if (rangeBeginDiff >= 0 && ch <= ranges[mid].end)
357 return true;
358
359 if (rangeBeginDiff < 0) {
360 if (mid == low)
361 return false;
362 high = mid - 1;
363 } else
364 low = mid + 1;
365 }
366 return false;
367 };
368
369 if (characterClass->m_anyCharacter)
370 return true;
371
372 const size_t thresholdForBinarySearch = 6;
373
374 if (!isASCII(c: ch)) {
375 if (characterClass->m_matchesUnicode.size()) {
376 if (characterClass->m_matchesUnicode.size() > thresholdForBinarySearch) {
377 if (binarySearchMatches(characterClass->m_matchesUnicode))
378 return true;
379 } else if (linearSearchMatches(characterClass->m_matchesUnicode))
380 return true;
381 }
382
383 if (characterClass->m_rangesUnicode.size()) {
384 if (characterClass->m_rangesUnicode.size() > thresholdForBinarySearch) {
385 if (binarySearchRanges(characterClass->m_rangesUnicode))
386 return true;
387 } else if (linearSearchRanges(characterClass->m_rangesUnicode))
388 return true;
389 }
390 } else {
391 if (characterClass->m_matches.size()) {
392 if (characterClass->m_matches.size() > thresholdForBinarySearch) {
393 if (binarySearchMatches(characterClass->m_matches))
394 return true;
395 } else if (linearSearchMatches(characterClass->m_matches))
396 return true;
397 }
398
399 if (characterClass->m_ranges.size()) {
400 if (characterClass->m_ranges.size() > thresholdForBinarySearch) {
401 if (binarySearchRanges(characterClass->m_ranges))
402 return true;
403 } else if (linearSearchRanges(characterClass->m_ranges))
404 return true;
405 }
406 }
407
408 return false;
409 }
410
411 bool checkCharacter(int testChar, unsigned negativeInputOffset)
412 {
413 return testChar == input.readChecked(negativeInputOffset);
414 }
415
416 bool checkSurrogatePair(int testUnicodeChar, unsigned negativeInputOffset)
417 {
418 return testUnicodeChar == input.readSurrogatePairChecked(negativeInputOffset);
419 }
420
421 bool checkCasedCharacter(int loChar, int hiChar, unsigned negativeInputOffset)
422 {
423 int ch = input.readChecked(negativeInputOffset);
424 return (loChar == ch) || (hiChar == ch);
425 }
426
427 bool checkCharacterClass(CharacterClass* characterClass, bool invert, unsigned negativeInputOffset)
428 {
429 bool match = testCharacterClass(characterClass, ch: input.readChecked(negativeInputOffset));
430 return invert ? !match : match;
431 }
432
433 bool tryConsumeBackReference(int matchBegin, int matchEnd, unsigned negativeInputOffset)
434 {
435 unsigned matchSize = (unsigned)(matchEnd - matchBegin);
436
437 if (!input.checkInput(matchSize))
438 return false;
439
440 for (unsigned i = 0; i < matchSize; ++i) {
441 int oldCh = input.reread(matchBegin + i);
442 int ch;
443 if (!U_IS_BMP(oldCh)) {
444 ch = input.readSurrogatePairChecked(negativeInputOffset + matchSize - i);
445 ++i;
446 } else
447 ch = input.readChecked(negativeInputOffset + matchSize - i);
448
449 if (oldCh == ch)
450 continue;
451
452 if (pattern->ignoreCase()) {
453 // See ES 6.0, 21.2.2.8.2 for the definition of Canonicalize(). For non-Unicode
454 // patterns, Unicode values are never allowed to match against ASCII ones.
455 // For Unicode, we need to check all canonical equivalents of a character.
456 if (!unicode && (isASCII(c: oldCh) || isASCII(c: ch))) {
457 if (toASCIIUpper(c: oldCh) == toASCIIUpper(c: ch))
458 continue;
459 } else if (areCanonicallyEquivalent(a: oldCh, b: ch, canonicalMode: unicode ? CanonicalMode::Unicode : CanonicalMode::UCS2))
460 continue;
461 }
462
463 input.uncheckInput(matchSize);
464 return false;
465 }
466
467 return true;
468 }
469
470 bool matchAssertionBOL(ByteTerm& term)
471 {
472 return (input.atStart(term.inputPosition)) || (pattern->multiline() && testCharacterClass(characterClass: pattern->newlineCharacterClass, ch: input.readChecked(term.inputPosition + 1)));
473 }
474
475 bool matchAssertionEOL(ByteTerm& term)
476 {
477 if (term.inputPosition)
478 return (input.atEnd(term.inputPosition)) || (pattern->multiline() && testCharacterClass(characterClass: pattern->newlineCharacterClass, ch: input.readChecked(term.inputPosition)));
479
480 return (input.atEnd()) || (pattern->multiline() && testCharacterClass(characterClass: pattern->newlineCharacterClass, ch: input.read()));
481 }
482
483 bool matchAssertionWordBoundary(ByteTerm& term)
484 {
485 bool prevIsWordchar = !input.atStart(term.inputPosition) && testCharacterClass(characterClass: pattern->wordcharCharacterClass, ch: input.readChecked(term.inputPosition + 1));
486 bool readIsWordchar;
487 if (term.inputPosition)
488 readIsWordchar = !input.atEnd(term.inputPosition) && testCharacterClass(characterClass: pattern->wordcharCharacterClass, ch: input.readChecked(term.inputPosition));
489 else
490 readIsWordchar = !input.atEnd() && testCharacterClass(characterClass: pattern->wordcharCharacterClass, ch: input.read());
491
492 bool wordBoundary = prevIsWordchar != readIsWordchar;
493 return term.invert() ? !wordBoundary : wordBoundary;
494 }
495
496 bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context)
497 {
498 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
499
500 switch (term.atom.quantityType) {
501 case QuantifierFixedCount:
502 break;
503
504 case QuantifierGreedy:
505 if (backTrack->matchAmount) {
506 --backTrack->matchAmount;
507 input.uncheckInput(U16_LENGTH(term.atom.patternCharacter));
508 return true;
509 }
510 break;
511
512 case QuantifierNonGreedy:
513 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
514 ++backTrack->matchAmount;
515 if (checkCharacter(testChar: term.atom.patternCharacter, negativeInputOffset: term.inputPosition + 1))
516 return true;
517 }
518 input.setPos(backTrack->begin);
519 break;
520 }
521
522 return false;
523 }
524
525 bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context)
526 {
527 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation);
528
529 switch (term.atom.quantityType) {
530 case QuantifierFixedCount:
531 break;
532
533 case QuantifierGreedy:
534 if (backTrack->matchAmount) {
535 --backTrack->matchAmount;
536 input.uncheckInput(1);
537 return true;
538 }
539 break;
540
541 case QuantifierNonGreedy:
542 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
543 ++backTrack->matchAmount;
544 if (checkCasedCharacter(loChar: term.atom.casedCharacter.lo, hiChar: term.atom.casedCharacter.hi, negativeInputOffset: term.inputPosition + 1))
545 return true;
546 }
547 input.uncheckInput(backTrack->matchAmount);
548 break;
549 }
550
551 return false;
552 }
553
554 bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context)
555 {
556 ASSERT(term.type == ByteTerm::TypeCharacterClass);
557 BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation);
558
559 switch (term.atom.quantityType) {
560 case QuantifierFixedCount: {
561 if (unicode) {
562 backTrack->begin = input.getPos();
563 unsigned matchAmount = 0;
564 for (matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
565 if (!checkCharacterClass(characterClass: term.atom.characterClass, invert: term.invert(), negativeInputOffset: term.inputPosition - matchAmount)) {
566 input.setPos(backTrack->begin);
567 return false;
568 }
569 }
570
571 return true;
572 }
573
574 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
575 if (!checkCharacterClass(characterClass: term.atom.characterClass, invert: term.invert(), negativeInputOffset: term.inputPosition - matchAmount))
576 return false;
577 }
578 return true;
579 }
580
581 case QuantifierGreedy: {
582 unsigned position = input.getPos();
583 backTrack->begin = position;
584 unsigned matchAmount = 0;
585 while ((matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
586 if (!checkCharacterClass(characterClass: term.atom.characterClass, invert: term.invert(), negativeInputOffset: term.inputPosition + 1)) {
587 input.setPos(position);
588 break;
589 }
590 ++matchAmount;
591 position = input.getPos();
592 }
593 backTrack->matchAmount = matchAmount;
594
595 return true;
596 }
597
598 case QuantifierNonGreedy:
599 backTrack->begin = input.getPos();
600 backTrack->matchAmount = 0;
601 return true;
602 }
603
604 RELEASE_ASSERT_NOT_REACHED();
605 return false;
606 }
607
608 bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context)
609 {
610 ASSERT(term.type == ByteTerm::TypeCharacterClass);
611 BackTrackInfoCharacterClass* backTrack = reinterpret_cast<BackTrackInfoCharacterClass*>(context->frame + term.frameLocation);
612
613 switch (term.atom.quantityType) {
614 case QuantifierFixedCount:
615 if (unicode)
616 input.setPos(backTrack->begin);
617 break;
618
619 case QuantifierGreedy:
620 if (backTrack->matchAmount) {
621 if (unicode) {
622 // Rematch one less match
623 input.setPos(backTrack->begin);
624 --backTrack->matchAmount;
625 for (unsigned matchAmount = 0; (matchAmount < backTrack->matchAmount) && input.checkInput(1); ++matchAmount) {
626 if (!checkCharacterClass(characterClass: term.atom.characterClass, invert: term.invert(), negativeInputOffset: term.inputPosition + 1)) {
627 input.uncheckInput(1);
628 break;
629 }
630 }
631 return true;
632 }
633 --backTrack->matchAmount;
634 input.uncheckInput(1);
635 return true;
636 }
637 break;
638
639 case QuantifierNonGreedy:
640 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && input.checkInput(1)) {
641 ++backTrack->matchAmount;
642 if (checkCharacterClass(characterClass: term.atom.characterClass, invert: term.invert(), negativeInputOffset: term.inputPosition + 1))
643 return true;
644 }
645 input.setPos(backTrack->begin);
646 break;
647 }
648
649 return false;
650 }
651
652 bool matchBackReference(ByteTerm& term, DisjunctionContext* context)
653 {
654 ASSERT(term.type == ByteTerm::TypeBackReference);
655 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
656
657 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
658 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
659
660 // If the end position of the referenced match hasn't set yet then the backreference in the same parentheses where it references to that.
661 // In this case the result of match is empty string like when it references to a parentheses with zero-width match.
662 // Eg.: /(a\1)/
663 if (matchEnd == offsetNoMatch)
664 return true;
665
666 if (matchBegin == offsetNoMatch)
667 return true;
668
669 ASSERT(matchBegin <= matchEnd);
670
671 if (matchBegin == matchEnd)
672 return true;
673
674 switch (term.atom.quantityType) {
675 case QuantifierFixedCount: {
676 backTrack->begin = input.getPos();
677 for (unsigned matchAmount = 0; matchAmount < term.atom.quantityMaxCount; ++matchAmount) {
678 if (!tryConsumeBackReference(matchBegin, matchEnd, negativeInputOffset: term.inputPosition)) {
679 input.setPos(backTrack->begin);
680 return false;
681 }
682 }
683 return true;
684 }
685
686 case QuantifierGreedy: {
687 unsigned matchAmount = 0;
688 while ((matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, negativeInputOffset: term.inputPosition))
689 ++matchAmount;
690 backTrack->matchAmount = matchAmount;
691 return true;
692 }
693
694 case QuantifierNonGreedy:
695 backTrack->begin = input.getPos();
696 backTrack->matchAmount = 0;
697 return true;
698 }
699
700 RELEASE_ASSERT_NOT_REACHED();
701 return false;
702 }
703
704 bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context)
705 {
706 ASSERT(term.type == ByteTerm::TypeBackReference);
707 BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation);
708
709 unsigned matchBegin = output[(term.atom.subpatternId << 1)];
710 unsigned matchEnd = output[(term.atom.subpatternId << 1) + 1];
711
712 if (matchBegin == offsetNoMatch)
713 return false;
714
715 ASSERT(matchBegin <= matchEnd);
716
717 if (matchBegin == matchEnd)
718 return false;
719
720 switch (term.atom.quantityType) {
721 case QuantifierFixedCount:
722 // for quantityMaxCount == 1, could rewind.
723 input.setPos(backTrack->begin);
724 break;
725
726 case QuantifierGreedy:
727 if (backTrack->matchAmount) {
728 --backTrack->matchAmount;
729 input.rewind(matchEnd - matchBegin);
730 return true;
731 }
732 break;
733
734 case QuantifierNonGreedy:
735 if ((backTrack->matchAmount < term.atom.quantityMaxCount) && tryConsumeBackReference(matchBegin, matchEnd, negativeInputOffset: term.inputPosition)) {
736 ++backTrack->matchAmount;
737 return true;
738 }
739 input.setPos(backTrack->begin);
740 break;
741 }
742
743 return false;
744 }
745
746 void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context)
747 {
748 if (term.capture()) {
749 unsigned subpatternId = term.atom.subpatternId;
750 output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin - term.inputPosition;
751 output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd - term.inputPosition;
752 }
753 }
754 void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context)
755 {
756 unsigned firstSubpatternId = term.atom.subpatternId;
757 unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns;
758 context->restoreOutput(output, firstSubpatternId, count);
759 }
760 JSRegExpResult parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack)
761 {
762 while (backTrack->matchAmount) {
763 ParenthesesDisjunctionContext* context = backTrack->lastContext;
764
765 JSRegExpResult result = matchDisjunction(disjunction: term.atom.parenthesesDisjunction, context: context->getDisjunctionContext(term), btrack: true);
766 if (result == JSRegExpMatch)
767 return JSRegExpMatch;
768
769 resetMatches(term, context);
770 popParenthesesDisjunctionContext(backTrack);
771 freeParenthesesDisjunctionContext(context);
772
773 if (result != JSRegExpNoMatch)
774 return result;
775 }
776
777 return JSRegExpNoMatch;
778 }
779
780 bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
781 {
782 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
783 ASSERT(term.atom.quantityMaxCount == 1);
784
785 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
786
787 switch (term.atom.quantityType) {
788 case QuantifierGreedy: {
789 // set this speculatively; if we get to the parens end this will be true.
790 backTrack->begin = input.getPos();
791 break;
792 }
793 case QuantifierNonGreedy: {
794 backTrack->begin = notFound;
795 context->term += term.atom.parenthesesWidth;
796 return true;
797 }
798 case QuantifierFixedCount:
799 break;
800 }
801
802 if (term.capture()) {
803 unsigned subpatternId = term.atom.subpatternId;
804 output[(subpatternId << 1)] = input.getPos() - term.inputPosition;
805 }
806
807 return true;
808 }
809
810 bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
811 {
812 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
813 ASSERT(term.atom.quantityMaxCount == 1);
814
815 if (term.capture()) {
816 unsigned subpatternId = term.atom.subpatternId;
817 output[(subpatternId << 1) + 1] = input.getPos() - term.inputPosition;
818 }
819
820 if (term.atom.quantityType == QuantifierFixedCount)
821 return true;
822
823 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
824 return backTrack->begin != input.getPos();
825 }
826
827 bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context)
828 {
829 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
830 ASSERT(term.atom.quantityMaxCount == 1);
831
832 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
833
834 if (term.capture()) {
835 unsigned subpatternId = term.atom.subpatternId;
836 output[(subpatternId << 1)] = offsetNoMatch;
837 output[(subpatternId << 1) + 1] = offsetNoMatch;
838 }
839
840 switch (term.atom.quantityType) {
841 case QuantifierGreedy:
842 // if we backtrack to this point, there is another chance - try matching nothing.
843 ASSERT(backTrack->begin != notFound);
844 backTrack->begin = notFound;
845 context->term += term.atom.parenthesesWidth;
846 return true;
847 case QuantifierNonGreedy:
848 ASSERT(backTrack->begin != notFound);
849 FALLTHROUGH;
850 case QuantifierFixedCount:
851 break;
852 }
853
854 return false;
855 }
856
857 bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context)
858 {
859 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd);
860 ASSERT(term.atom.quantityMaxCount == 1);
861
862 BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation);
863
864 switch (term.atom.quantityType) {
865 case QuantifierGreedy:
866 if (backTrack->begin == notFound) {
867 context->term -= term.atom.parenthesesWidth;
868 return false;
869 }
870 FALLTHROUGH;
871 case QuantifierNonGreedy:
872 if (backTrack->begin == notFound) {
873 backTrack->begin = input.getPos();
874 if (term.capture()) {
875 // Technically this access to inputPosition should be accessing the begin term's
876 // inputPosition, but for repeats other than fixed these values should be
877 // the same anyway! (We don't pre-check for greedy or non-greedy matches.)
878 ASSERT((&term - term.atom.parenthesesWidth)->type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
879 ASSERT((&term - term.atom.parenthesesWidth)->inputPosition == term.inputPosition);
880 unsigned subpatternId = term.atom.subpatternId;
881 output[subpatternId << 1] = input.getPos() - term.inputPosition;
882 }
883 context->term -= term.atom.parenthesesWidth;
884 return true;
885 }
886 FALLTHROUGH;
887 case QuantifierFixedCount:
888 break;
889 }
890
891 return false;
892 }
893
894 bool matchParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
895 {
896 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
897 ASSERT(term.atom.quantityType == QuantifierGreedy);
898 ASSERT(term.atom.quantityMaxCount == quantifyInfinite);
899 ASSERT(!term.capture());
900
901 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
902 backTrack->begin = input.getPos();
903 return true;
904 }
905
906 bool matchParenthesesTerminalEnd(ByteTerm& term, DisjunctionContext* context)
907 {
908 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalEnd);
909
910 BackTrackInfoParenthesesTerminal* backTrack = reinterpret_cast<BackTrackInfoParenthesesTerminal*>(context->frame + term.frameLocation);
911 // Empty match is a failed match.
912 if (backTrack->begin == input.getPos())
913 return false;
914
915 // Successful match! Okay, what's next? - loop around and try to match more!
916 context->term -= (term.atom.parenthesesWidth + 1);
917 return true;
918 }
919
920 bool backtrackParenthesesTerminalBegin(ByteTerm& term, DisjunctionContext* context)
921 {
922 ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
923 ASSERT(term.atom.quantityType == QuantifierGreedy);
924 ASSERT(term.atom.quantityMaxCount == quantifyInfinite);
925 ASSERT(!term.capture());
926
927 // If we backtrack to this point, we have failed to match this iteration of the parens.
928 // Since this is greedy / zero minimum a failed is also accepted as a match!
929 context->term += term.atom.parenthesesWidth;
930 return true;
931 }
932
933 bool backtrackParenthesesTerminalEnd(ByteTerm&, DisjunctionContext*)
934 {
935 // 'Terminal' parentheses are at the end of the regex, and as such a match past end
936 // should always be returned as a successful match - we should never backtrack to here.
937 RELEASE_ASSERT_NOT_REACHED();
938 return false;
939 }
940
941 bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
942 {
943 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
944 ASSERT(term.atom.quantityMaxCount == 1);
945
946 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
947
948 backTrack->begin = input.getPos();
949 return true;
950 }
951
952 bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
953 {
954 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
955 ASSERT(term.atom.quantityMaxCount == 1);
956
957 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
958
959 input.setPos(backTrack->begin);
960
961 // We've reached the end of the parens; if they are inverted, this is failure.
962 if (term.invert()) {
963 context->term -= term.atom.parenthesesWidth;
964 return false;
965 }
966
967 return true;
968 }
969
970 bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context)
971 {
972 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin);
973 ASSERT(term.atom.quantityMaxCount == 1);
974
975 // We've failed to match parens; if they are inverted, this is win!
976 if (term.invert()) {
977 context->term += term.atom.parenthesesWidth;
978 return true;
979 }
980
981 return false;
982 }
983
984 bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context)
985 {
986 ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd);
987 ASSERT(term.atom.quantityMaxCount == 1);
988
989 BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation);
990
991 input.setPos(backTrack->begin);
992
993 context->term -= term.atom.parenthesesWidth;
994 return false;
995 }
996
997 JSRegExpResult matchParentheses(ByteTerm& term, DisjunctionContext* context)
998 {
999 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
1000
1001 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
1002 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
1003
1004 backTrack->matchAmount = 0;
1005 backTrack->lastContext = 0;
1006
1007 ASSERT(term.atom.quantityType != QuantifierFixedCount || term.atom.quantityMinCount == term.atom.quantityMaxCount);
1008
1009 unsigned minimumMatchCount = term.atom.quantityMinCount;
1010 JSRegExpResult fixedMatchResult;
1011
1012 // Handle fixed matches and the minimum part of a variable length match.
1013 if (minimumMatchCount) {
1014 // While we haven't yet reached our fixed limit,
1015 while (backTrack->matchAmount < minimumMatchCount) {
1016 // Try to do a match, and it it succeeds, add it to the list.
1017 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term);
1018 fixedMatchResult = matchDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term));
1019 if (fixedMatchResult == JSRegExpMatch)
1020 appendParenthesesDisjunctionContext(backTrack, context);
1021 else {
1022 // The match failed; try to find an alternate point to carry on from.
1023 resetMatches(term, context);
1024 freeParenthesesDisjunctionContext(context);
1025
1026 if (fixedMatchResult != JSRegExpNoMatch)
1027 return fixedMatchResult;
1028 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
1029 if (backtrackResult != JSRegExpMatch)
1030 return backtrackResult;
1031 }
1032 }
1033
1034 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1035 recordParenthesesMatch(term, context);
1036 }
1037
1038 switch (term.atom.quantityType) {
1039 case QuantifierFixedCount: {
1040 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1041 return JSRegExpMatch;
1042 }
1043
1044 case QuantifierGreedy: {
1045 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1046 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term);
1047 JSRegExpResult result = matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term));
1048 if (result == JSRegExpMatch)
1049 appendParenthesesDisjunctionContext(backTrack, context);
1050 else {
1051 resetMatches(term, context);
1052 freeParenthesesDisjunctionContext(context);
1053
1054 if (result != JSRegExpNoMatch)
1055 return result;
1056
1057 break;
1058 }
1059 }
1060
1061 if (backTrack->matchAmount) {
1062 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1063 recordParenthesesMatch(term, context);
1064 }
1065 return JSRegExpMatch;
1066 }
1067
1068 case QuantifierNonGreedy:
1069 return JSRegExpMatch;
1070 }
1071
1072 RELEASE_ASSERT_NOT_REACHED();
1073 return JSRegExpErrorNoMatch;
1074 }
1075
1076 // Rules for backtracking differ depending on whether this is greedy or non-greedy.
1077 //
1078 // Greedy matches never should try just adding more - you should already have done
1079 // the 'more' cases. Always backtrack, at least a leetle bit. However cases where
1080 // you backtrack an item off the list needs checking, since we'll never have matched
1081 // the one less case. Tracking forwards, still add as much as possible.
1082 //
1083 // Non-greedy, we've already done the one less case, so don't match on popping.
1084 // We haven't done the one more case, so always try to add that.
1085 //
1086 JSRegExpResult backtrackParentheses(ByteTerm& term, DisjunctionContext* context)
1087 {
1088 ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern);
1089
1090 BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation);
1091 ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction;
1092
1093 switch (term.atom.quantityType) {
1094 case QuantifierFixedCount: {
1095 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1096
1097 ParenthesesDisjunctionContext* context = 0;
1098 JSRegExpResult result = parenthesesDoBacktrack(term, backTrack);
1099
1100 if (result != JSRegExpMatch)
1101 return result;
1102
1103 // While we haven't yet reached our fixed limit,
1104 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1105 // Try to do a match, and it it succeeds, add it to the list.
1106 context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term);
1107 result = matchDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term));
1108
1109 if (result == JSRegExpMatch)
1110 appendParenthesesDisjunctionContext(backTrack, context);
1111 else {
1112 // The match failed; try to find an alternate point to carry on from.
1113 resetMatches(term, context);
1114 freeParenthesesDisjunctionContext(context);
1115
1116 if (result != JSRegExpNoMatch)
1117 return result;
1118 JSRegExpResult backtrackResult = parenthesesDoBacktrack(term, backTrack);
1119 if (backtrackResult != JSRegExpMatch)
1120 return backtrackResult;
1121 }
1122 }
1123
1124 ASSERT(backTrack->matchAmount == term.atom.quantityMaxCount);
1125 context = backTrack->lastContext;
1126 recordParenthesesMatch(term, context);
1127 return JSRegExpMatch;
1128 }
1129
1130 case QuantifierGreedy: {
1131 if (!backTrack->matchAmount)
1132 return JSRegExpNoMatch;
1133
1134 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1135 JSRegExpResult result = matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term), btrack: true);
1136 if (result == JSRegExpMatch) {
1137 while (backTrack->matchAmount < term.atom.quantityMaxCount) {
1138 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term);
1139 JSRegExpResult parenthesesResult = matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term));
1140 if (parenthesesResult == JSRegExpMatch)
1141 appendParenthesesDisjunctionContext(backTrack, context);
1142 else {
1143 resetMatches(term, context);
1144 freeParenthesesDisjunctionContext(context);
1145
1146 if (parenthesesResult != JSRegExpNoMatch)
1147 return parenthesesResult;
1148
1149 break;
1150 }
1151 }
1152 } else {
1153 resetMatches(term, context);
1154 popParenthesesDisjunctionContext(backTrack);
1155 freeParenthesesDisjunctionContext(context);
1156
1157 if (result != JSRegExpNoMatch || backTrack->matchAmount < term.atom.quantityMinCount)
1158 return result;
1159 }
1160
1161 if (backTrack->matchAmount) {
1162 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1163 recordParenthesesMatch(term, context);
1164 }
1165 return JSRegExpMatch;
1166 }
1167
1168 case QuantifierNonGreedy: {
1169 // If we've not reached the limit, try to add one more match.
1170 if (backTrack->matchAmount < term.atom.quantityMaxCount) {
1171 ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term);
1172 JSRegExpResult result = matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term));
1173 if (result == JSRegExpMatch) {
1174 appendParenthesesDisjunctionContext(backTrack, context);
1175 recordParenthesesMatch(term, context);
1176 return JSRegExpMatch;
1177 }
1178
1179 resetMatches(term, context);
1180 freeParenthesesDisjunctionContext(context);
1181
1182 if (result != JSRegExpNoMatch)
1183 return result;
1184 }
1185
1186 // Nope - okay backtrack looking for an alternative.
1187 while (backTrack->matchAmount) {
1188 ParenthesesDisjunctionContext* context = backTrack->lastContext;
1189 JSRegExpResult result = matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term), btrack: true);
1190 if (result == JSRegExpMatch) {
1191 // successful backtrack! we're back in the game!
1192 if (backTrack->matchAmount) {
1193 context = backTrack->lastContext;
1194 recordParenthesesMatch(term, context);
1195 }
1196 return JSRegExpMatch;
1197 }
1198
1199 // pop a match off the stack
1200 resetMatches(term, context);
1201 popParenthesesDisjunctionContext(backTrack);
1202 freeParenthesesDisjunctionContext(context);
1203
1204 if (result != JSRegExpNoMatch)
1205 return result;
1206 }
1207
1208 return JSRegExpNoMatch;
1209 }
1210 }
1211
1212 RELEASE_ASSERT_NOT_REACHED();
1213 return JSRegExpErrorNoMatch;
1214 }
1215
1216 bool matchDotStarEnclosure(ByteTerm& term, DisjunctionContext* context)
1217 {
1218 UNUSED_PARAM(term);
1219
1220 if (pattern->dotAll()) {
1221 context->matchBegin = startOffset;
1222 context->matchEnd = input.end();
1223 return true;
1224 }
1225
1226 unsigned matchBegin = context->matchBegin;
1227
1228 if (matchBegin > startOffset) {
1229 for (matchBegin--; true; matchBegin--) {
1230 if (testCharacterClass(characterClass: pattern->newlineCharacterClass, ch: input.reread(matchBegin))) {
1231 ++matchBegin;
1232 break;
1233 }
1234
1235 if (matchBegin == startOffset)
1236 break;
1237 }
1238 }
1239
1240 unsigned matchEnd = input.getPos();
1241
1242 for (; (matchEnd != input.end())
1243 && (!testCharacterClass(characterClass: pattern->newlineCharacterClass, ch: input.reread(matchEnd))); matchEnd++) { }
1244
1245 if (((matchBegin && term.anchors.m_bol)
1246 || ((matchEnd != input.end()) && term.anchors.m_eol))
1247 && !pattern->multiline())
1248 return false;
1249
1250 context->matchBegin = matchBegin;
1251 context->matchEnd = matchEnd;
1252 return true;
1253 }
1254
1255#define MATCH_NEXT() { ++context->term; goto matchAgain; }
1256#define BACKTRACK() { --context->term; goto backtrack; }
1257#define currentTerm() (disjunction->terms[context->term])
1258 JSRegExpResult matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1259 {
1260 if (!--remainingMatchCount)
1261 return JSRegExpErrorHitLimit;
1262
1263 if (btrack)
1264 BACKTRACK();
1265
1266 context->matchBegin = input.getPos();
1267 context->term = 0;
1268
1269 matchAgain:
1270 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1271
1272 switch (currentTerm().type) {
1273 case ByteTerm::TypeSubpatternBegin:
1274 MATCH_NEXT();
1275 case ByteTerm::TypeSubpatternEnd:
1276 context->matchEnd = input.getPos();
1277 return JSRegExpMatch;
1278
1279 case ByteTerm::TypeBodyAlternativeBegin:
1280 MATCH_NEXT();
1281 case ByteTerm::TypeBodyAlternativeDisjunction:
1282 case ByteTerm::TypeBodyAlternativeEnd:
1283 context->matchEnd = input.getPos();
1284 return JSRegExpMatch;
1285
1286 case ByteTerm::TypeAlternativeBegin:
1287 MATCH_NEXT();
1288 case ByteTerm::TypeAlternativeDisjunction:
1289 case ByteTerm::TypeAlternativeEnd: {
1290 int offset = currentTerm().alternative.end;
1291 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1292 backTrack->offset = offset;
1293 context->term += offset;
1294 MATCH_NEXT();
1295 }
1296
1297 case ByteTerm::TypeAssertionBOL:
1298 if (matchAssertionBOL(currentTerm()))
1299 MATCH_NEXT();
1300 BACKTRACK();
1301 case ByteTerm::TypeAssertionEOL:
1302 if (matchAssertionEOL(currentTerm()))
1303 MATCH_NEXT();
1304 BACKTRACK();
1305 case ByteTerm::TypeAssertionWordBoundary:
1306 if (matchAssertionWordBoundary(currentTerm()))
1307 MATCH_NEXT();
1308 BACKTRACK();
1309
1310 case ByteTerm::TypePatternCharacterOnce:
1311 case ByteTerm::TypePatternCharacterFixed: {
1312 if (unicode) {
1313 if (!U_IS_BMP(currentTerm().atom.patternCharacter)) {
1314 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1315 if (!checkSurrogatePair(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 2 * matchAmount)) {
1316 BACKTRACK();
1317 }
1318 }
1319 MATCH_NEXT();
1320 }
1321 }
1322 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1323
1324 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1325 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - matchAmount)) {
1326 input.setPos(position);
1327 BACKTRACK();
1328 }
1329 }
1330 MATCH_NEXT();
1331 }
1332 case ByteTerm::TypePatternCharacterGreedy: {
1333 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1334 unsigned matchAmount = 0;
1335 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1336 while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) {
1337 if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + 1)) {
1338 input.setPos(position);
1339 break;
1340 }
1341 ++matchAmount;
1342 position = input.getPos();
1343 }
1344 backTrack->matchAmount = matchAmount;
1345
1346 MATCH_NEXT();
1347 }
1348 case ByteTerm::TypePatternCharacterNonGreedy: {
1349 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1350 backTrack->begin = input.getPos();
1351 backTrack->matchAmount = 0;
1352 MATCH_NEXT();
1353 }
1354
1355 case ByteTerm::TypePatternCasedCharacterOnce:
1356 case ByteTerm::TypePatternCasedCharacterFixed: {
1357 if (unicode) {
1358 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1359 ASSERT(U_IS_BMP(currentTerm().atom.patternCharacter));
1360
1361 unsigned position = input.getPos(); // May need to back out reading a surrogate pair.
1362
1363 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1364 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount)) {
1365 input.setPos(position);
1366 BACKTRACK();
1367 }
1368 }
1369 MATCH_NEXT();
1370 }
1371
1372 for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityMaxCount; ++matchAmount) {
1373 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - matchAmount))
1374 BACKTRACK();
1375 }
1376 MATCH_NEXT();
1377 }
1378 case ByteTerm::TypePatternCasedCharacterGreedy: {
1379 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1380
1381 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1382 ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter));
1383
1384 unsigned matchAmount = 0;
1385 while ((matchAmount < currentTerm().atom.quantityMaxCount) && input.checkInput(1)) {
1386 if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + 1)) {
1387 input.uncheckInput(1);
1388 break;
1389 }
1390 ++matchAmount;
1391 }
1392 backTrack->matchAmount = matchAmount;
1393
1394 MATCH_NEXT();
1395 }
1396 case ByteTerm::TypePatternCasedCharacterNonGreedy: {
1397 BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation);
1398
1399 // Case insensitive matching of unicode characters is handled as TypeCharacterClass.
1400 ASSERT(!unicode || U_IS_BMP(currentTerm().atom.patternCharacter));
1401
1402 backTrack->matchAmount = 0;
1403 MATCH_NEXT();
1404 }
1405
1406 case ByteTerm::TypeCharacterClass:
1407 if (matchCharacterClass(currentTerm(), context))
1408 MATCH_NEXT();
1409 BACKTRACK();
1410 case ByteTerm::TypeBackReference:
1411 if (matchBackReference(currentTerm(), context))
1412 MATCH_NEXT();
1413 BACKTRACK();
1414 case ByteTerm::TypeParenthesesSubpattern: {
1415 JSRegExpResult result = matchParentheses(currentTerm(), context);
1416
1417 if (result == JSRegExpMatch) {
1418 MATCH_NEXT();
1419 } else if (result != JSRegExpNoMatch)
1420 return result;
1421
1422 BACKTRACK();
1423 }
1424 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1425 if (matchParenthesesOnceBegin(currentTerm(), context))
1426 MATCH_NEXT();
1427 BACKTRACK();
1428 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1429 if (matchParenthesesOnceEnd(currentTerm(), context))
1430 MATCH_NEXT();
1431 BACKTRACK();
1432 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1433 if (matchParenthesesTerminalBegin(currentTerm(), context))
1434 MATCH_NEXT();
1435 BACKTRACK();
1436 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1437 if (matchParenthesesTerminalEnd(currentTerm(), context))
1438 MATCH_NEXT();
1439 BACKTRACK();
1440 case ByteTerm::TypeParentheticalAssertionBegin:
1441 if (matchParentheticalAssertionBegin(currentTerm(), context))
1442 MATCH_NEXT();
1443 BACKTRACK();
1444 case ByteTerm::TypeParentheticalAssertionEnd:
1445 if (matchParentheticalAssertionEnd(currentTerm(), context))
1446 MATCH_NEXT();
1447 BACKTRACK();
1448
1449 case ByteTerm::TypeCheckInput:
1450 if (input.checkInput(currentTerm().checkInputCount))
1451 MATCH_NEXT();
1452 BACKTRACK();
1453
1454 case ByteTerm::TypeUncheckInput:
1455 input.uncheckInput(currentTerm().checkInputCount);
1456 MATCH_NEXT();
1457
1458 case ByteTerm::TypeDotStarEnclosure:
1459 if (matchDotStarEnclosure(currentTerm(), context))
1460 return JSRegExpMatch;
1461 BACKTRACK();
1462 }
1463
1464 // We should never fall-through to here.
1465 RELEASE_ASSERT_NOT_REACHED();
1466
1467 backtrack:
1468 ASSERT(context->term < static_cast<int>(disjunction->terms.size()));
1469
1470 switch (currentTerm().type) {
1471 case ByteTerm::TypeSubpatternBegin:
1472 return JSRegExpNoMatch;
1473 case ByteTerm::TypeSubpatternEnd:
1474 RELEASE_ASSERT_NOT_REACHED();
1475
1476 case ByteTerm::TypeBodyAlternativeBegin:
1477 case ByteTerm::TypeBodyAlternativeDisjunction: {
1478 int offset = currentTerm().alternative.next;
1479 context->term += offset;
1480 if (offset > 0)
1481 MATCH_NEXT();
1482
1483 if (input.atEnd() || pattern->sticky())
1484 return JSRegExpNoMatch;
1485
1486 input.next();
1487
1488 context->matchBegin = input.getPos();
1489
1490 if (currentTerm().alternative.onceThrough)
1491 context->term += currentTerm().alternative.next;
1492
1493 MATCH_NEXT();
1494 }
1495 case ByteTerm::TypeBodyAlternativeEnd:
1496 RELEASE_ASSERT_NOT_REACHED();
1497
1498 case ByteTerm::TypeAlternativeBegin:
1499 case ByteTerm::TypeAlternativeDisjunction: {
1500 int offset = currentTerm().alternative.next;
1501 context->term += offset;
1502 if (offset > 0)
1503 MATCH_NEXT();
1504 BACKTRACK();
1505 }
1506 case ByteTerm::TypeAlternativeEnd: {
1507 // We should never backtrack back into an alternative of the main body of the regex.
1508 BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation);
1509 unsigned offset = backTrack->offset;
1510 context->term -= offset;
1511 BACKTRACK();
1512 }
1513
1514 case ByteTerm::TypeAssertionBOL:
1515 case ByteTerm::TypeAssertionEOL:
1516 case ByteTerm::TypeAssertionWordBoundary:
1517 BACKTRACK();
1518
1519 case ByteTerm::TypePatternCharacterOnce:
1520 case ByteTerm::TypePatternCharacterFixed:
1521 case ByteTerm::TypePatternCharacterGreedy:
1522 case ByteTerm::TypePatternCharacterNonGreedy:
1523 if (backtrackPatternCharacter(currentTerm(), context))
1524 MATCH_NEXT();
1525 BACKTRACK();
1526 case ByteTerm::TypePatternCasedCharacterOnce:
1527 case ByteTerm::TypePatternCasedCharacterFixed:
1528 case ByteTerm::TypePatternCasedCharacterGreedy:
1529 case ByteTerm::TypePatternCasedCharacterNonGreedy:
1530 if (backtrackPatternCasedCharacter(currentTerm(), context))
1531 MATCH_NEXT();
1532 BACKTRACK();
1533 case ByteTerm::TypeCharacterClass:
1534 if (backtrackCharacterClass(currentTerm(), context))
1535 MATCH_NEXT();
1536 BACKTRACK();
1537 case ByteTerm::TypeBackReference:
1538 if (backtrackBackReference(currentTerm(), context))
1539 MATCH_NEXT();
1540 BACKTRACK();
1541 case ByteTerm::TypeParenthesesSubpattern: {
1542 JSRegExpResult result = backtrackParentheses(currentTerm(), context);
1543
1544 if (result == JSRegExpMatch) {
1545 MATCH_NEXT();
1546 } else if (result != JSRegExpNoMatch)
1547 return result;
1548
1549 BACKTRACK();
1550 }
1551 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
1552 if (backtrackParenthesesOnceBegin(currentTerm(), context))
1553 MATCH_NEXT();
1554 BACKTRACK();
1555 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
1556 if (backtrackParenthesesOnceEnd(currentTerm(), context))
1557 MATCH_NEXT();
1558 BACKTRACK();
1559 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
1560 if (backtrackParenthesesTerminalBegin(currentTerm(), context))
1561 MATCH_NEXT();
1562 BACKTRACK();
1563 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
1564 if (backtrackParenthesesTerminalEnd(currentTerm(), context))
1565 MATCH_NEXT();
1566 BACKTRACK();
1567 case ByteTerm::TypeParentheticalAssertionBegin:
1568 if (backtrackParentheticalAssertionBegin(currentTerm(), context))
1569 MATCH_NEXT();
1570 BACKTRACK();
1571 case ByteTerm::TypeParentheticalAssertionEnd:
1572 if (backtrackParentheticalAssertionEnd(currentTerm(), context))
1573 MATCH_NEXT();
1574 BACKTRACK();
1575
1576 case ByteTerm::TypeCheckInput:
1577 input.uncheckInput(currentTerm().checkInputCount);
1578 BACKTRACK();
1579
1580 case ByteTerm::TypeUncheckInput:
1581 input.checkInput(currentTerm().checkInputCount);
1582 BACKTRACK();
1583
1584 case ByteTerm::TypeDotStarEnclosure:
1585 RELEASE_ASSERT_NOT_REACHED();
1586 }
1587
1588 RELEASE_ASSERT_NOT_REACHED();
1589 return JSRegExpErrorNoMatch;
1590 }
1591
1592 JSRegExpResult matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false)
1593 {
1594 JSRegExpResult result = matchDisjunction(disjunction, context, btrack);
1595
1596 if (result == JSRegExpMatch) {
1597 while (context->matchBegin == context->matchEnd) {
1598 result = matchDisjunction(disjunction, context, btrack: true);
1599 if (result != JSRegExpMatch)
1600 return result;
1601 }
1602 return JSRegExpMatch;
1603 }
1604
1605 return result;
1606 }
1607
1608 unsigned interpret()
1609 {
1610 if (!input.isAvailableInput(0))
1611 return offsetNoMatch;
1612
1613 if (pattern->m_lock)
1614 pattern->m_lock->lock();
1615
1616 for (unsigned i = 0; i < pattern->m_body->m_numSubpatterns + 1; ++i)
1617 output[i << 1] = offsetNoMatch;
1618
1619 allocatorPool = pattern->m_allocator->startAllocator();
1620 RELEASE_ASSERT(allocatorPool);
1621
1622 DisjunctionContext* context = allocDisjunctionContext(disjunction: pattern->m_body.get());
1623
1624 JSRegExpResult result = matchDisjunction(disjunction: pattern->m_body.get(), context, btrack: false);
1625 if (result == JSRegExpMatch) {
1626 output[0] = context->matchBegin;
1627 output[1] = context->matchEnd;
1628 }
1629
1630 freeDisjunctionContext(context);
1631
1632 pattern->m_allocator->stopAllocator();
1633
1634 ASSERT((result == JSRegExpMatch) == (output[0] != offsetNoMatch));
1635
1636 if (pattern->m_lock)
1637 pattern->m_lock->unlock();
1638
1639 return output[0];
1640 }
1641
1642 Interpreter(BytecodePattern* pattern, unsigned* output, const CharType* input, unsigned length, unsigned start)
1643 : pattern(pattern)
1644 , unicode(pattern->unicode())
1645 , output(output)
1646 , input(input, start, length, pattern->unicode())
1647 , startOffset(start)
1648 , remainingMatchCount(matchLimit)
1649 {
1650 }
1651
1652private:
1653 BytecodePattern* pattern;
1654 bool unicode;
1655 unsigned* output;
1656 InputStream input;
1657 WTF::BumpPointerPool* allocatorPool { nullptr };
1658 unsigned startOffset;
1659 unsigned remainingMatchCount;
1660};
1661
1662class ByteCompiler {
1663 struct ParenthesesStackEntry {
1664 unsigned beginTerm;
1665 unsigned savedAlternativeIndex;
1666 ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/)
1667 : beginTerm(beginTerm)
1668 , savedAlternativeIndex(savedAlternativeIndex)
1669 {
1670 }
1671 };
1672
1673public:
1674 ByteCompiler(YarrPattern& pattern)
1675 : m_pattern(pattern)
1676 {
1677 m_currentAlternativeIndex = 0;
1678 }
1679
1680 std::unique_ptr<BytecodePattern> compile(BumpPointerAllocator* allocator, ConcurrentJSLock* lock)
1681 {
1682 regexBegin(numSubpatterns: m_pattern.m_numSubpatterns, callFrameSize: m_pattern.m_body->m_callFrameSize, onceThrough: m_pattern.m_body->m_alternatives[0]->onceThrough());
1683 emitDisjunction(disjunction: m_pattern.m_body);
1684 regexEnd();
1685
1686#ifndef NDEBUG
1687 if (Options::dumpCompiledRegExpPatterns())
1688 dumpDisjunction(disjunction: m_bodyDisjunction.get());
1689#endif
1690
1691 return std::make_unique<BytecodePattern>(WTFMove(m_bodyDisjunction), args&: m_allParenthesesInfo, args&: m_pattern, args&: allocator, args&: lock);
1692 }
1693
1694 void checkInput(unsigned count)
1695 {
1696 m_bodyDisjunction->terms.append(other: ByteTerm::CheckInput(count));
1697 }
1698
1699 void uncheckInput(unsigned count)
1700 {
1701 m_bodyDisjunction->terms.append(other: ByteTerm::UncheckInput(count));
1702 }
1703
1704 void assertionBOL(unsigned inputPosition)
1705 {
1706 m_bodyDisjunction->terms.append(other: ByteTerm::BOL(inputPos: inputPosition));
1707 }
1708
1709 void assertionEOL(unsigned inputPosition)
1710 {
1711 m_bodyDisjunction->terms.append(other: ByteTerm::EOL(inputPos: inputPosition));
1712 }
1713
1714 void assertionWordBoundary(bool invert, unsigned inputPosition)
1715 {
1716 m_bodyDisjunction->terms.append(other: ByteTerm::WordBoundary(invert, inputPos: inputPosition));
1717 }
1718
1719 void atomPatternCharacter(UChar32 ch, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1720 {
1721 if (m_pattern.ignoreCase()) {
1722 UChar32 lo = u_tolower(ch);
1723 UChar32 hi = u_toupper(ch);
1724
1725 if (lo != hi) {
1726 m_bodyDisjunction->terms.append(other: ByteTerm(lo, hi, inputPosition, frameLocation, quantityMaxCount, quantityType));
1727 return;
1728 }
1729 }
1730
1731 m_bodyDisjunction->terms.append(other: ByteTerm(ch, inputPosition, frameLocation, quantityMaxCount, quantityType));
1732 }
1733
1734 void atomCharacterClass(CharacterClass* characterClass, bool invert, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1735 {
1736 m_bodyDisjunction->terms.append(other: ByteTerm(characterClass, invert, inputPosition));
1737
1738 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1739 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1740 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1741 }
1742
1743 void atomBackReference(unsigned subpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1744 {
1745 ASSERT(subpatternId);
1746
1747 m_bodyDisjunction->terms.append(other: ByteTerm::BackReference(subpatternId, inputPos: inputPosition));
1748
1749 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1750 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType;
1751 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1752 }
1753
1754 void atomParenthesesOnceBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1755 {
1756 int beginTerm = m_bodyDisjunction->terms.size();
1757
1758 m_bodyDisjunction->terms.append(other: ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1759 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1760 m_bodyDisjunction->terms.append(other: ByteTerm::AlternativeBegin());
1761 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1762
1763 m_parenthesesStack.append(other: ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1764 m_currentAlternativeIndex = beginTerm + 1;
1765 }
1766
1767 void atomParenthesesTerminalBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1768 {
1769 int beginTerm = m_bodyDisjunction->terms.size();
1770
1771 m_bodyDisjunction->terms.append(other: ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalBegin, subpatternId, capture, false, inputPosition));
1772 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1773 m_bodyDisjunction->terms.append(other: ByteTerm::AlternativeBegin());
1774 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1775
1776 m_parenthesesStack.append(other: ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1777 m_currentAlternativeIndex = beginTerm + 1;
1778 }
1779
1780 void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, unsigned inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation)
1781 {
1782 // Errrk! - this is a little crazy, we initially generate as a TypeParenthesesSubpatternOnceBegin,
1783 // then fix this up at the end! - simplifying this should make it much clearer.
1784 // https://bugs.webkit.org/show_bug.cgi?id=50136
1785
1786 int beginTerm = m_bodyDisjunction->terms.size();
1787
1788 m_bodyDisjunction->terms.append(other: ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, false, inputPosition));
1789 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1790 m_bodyDisjunction->terms.append(other: ByteTerm::AlternativeBegin());
1791 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1792
1793 m_parenthesesStack.append(other: ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1794 m_currentAlternativeIndex = beginTerm + 1;
1795 }
1796
1797 void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation)
1798 {
1799 int beginTerm = m_bodyDisjunction->terms.size();
1800
1801 m_bodyDisjunction->terms.append(other: ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, false, invert, 0));
1802 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation;
1803 m_bodyDisjunction->terms.append(other: ByteTerm::AlternativeBegin());
1804 m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation;
1805
1806 m_parenthesesStack.append(other: ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex));
1807 m_currentAlternativeIndex = beginTerm + 1;
1808 }
1809
1810 void atomParentheticalAssertionEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1811 {
1812 unsigned beginTerm = popParenthesesStack();
1813 closeAlternative(beginTerm: beginTerm + 1);
1814 unsigned endTerm = m_bodyDisjunction->terms.size();
1815
1816 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin);
1817
1818 bool invert = m_bodyDisjunction->terms[beginTerm].invert();
1819 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1820
1821 m_bodyDisjunction->terms.append(other: ByteTerm(ByteTerm::TypeParentheticalAssertionEnd, subpatternId, false, invert, inputPosition));
1822 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1823 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1824 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1825
1826 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1827 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1828 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1829 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1830 }
1831
1832 void assertionDotStarEnclosure(bool bolAnchored, bool eolAnchored)
1833 {
1834 m_bodyDisjunction->terms.append(other: ByteTerm::DotStarEnclosure(bolAnchor: bolAnchored, eolAnchor: eolAnchored));
1835 }
1836
1837 unsigned popParenthesesStack()
1838 {
1839 ASSERT(m_parenthesesStack.size());
1840 int stackEnd = m_parenthesesStack.size() - 1;
1841 unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm;
1842 m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex;
1843 m_parenthesesStack.shrink(size: stackEnd);
1844
1845 ASSERT(beginTerm < m_bodyDisjunction->terms.size());
1846 ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size());
1847
1848 return beginTerm;
1849 }
1850
1851 void closeAlternative(int beginTerm)
1852 {
1853 int origBeginTerm = beginTerm;
1854 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin);
1855 int endIndex = m_bodyDisjunction->terms.size();
1856
1857 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1858
1859 if (!m_bodyDisjunction->terms[beginTerm].alternative.next)
1860 m_bodyDisjunction->terms.remove(position: beginTerm);
1861 else {
1862 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1863 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1864 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction);
1865 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1866 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1867 }
1868
1869 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1870
1871 m_bodyDisjunction->terms.append(other: ByteTerm::AlternativeEnd());
1872 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1873 }
1874 }
1875
1876 void closeBodyAlternative()
1877 {
1878 int beginTerm = 0;
1879 int origBeginTerm = 0;
1880 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin);
1881 int endIndex = m_bodyDisjunction->terms.size();
1882
1883 unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation;
1884
1885 while (m_bodyDisjunction->terms[beginTerm].alternative.next) {
1886 beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next;
1887 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction);
1888 m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm;
1889 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1890 }
1891
1892 m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm;
1893
1894 m_bodyDisjunction->terms.append(other: ByteTerm::BodyAlternativeEnd());
1895 m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation;
1896 }
1897
1898 void atomParenthesesSubpatternEnd(unsigned lastSubpatternId, unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType, unsigned callFrameSize = 0)
1899 {
1900 unsigned beginTerm = popParenthesesStack();
1901 closeAlternative(beginTerm: beginTerm + 1);
1902 unsigned endTerm = m_bodyDisjunction->terms.size();
1903
1904 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1905
1906 ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm];
1907
1908 bool capture = parenthesesBegin.capture();
1909 unsigned subpatternId = parenthesesBegin.atom.subpatternId;
1910
1911 unsigned numSubpatterns = lastSubpatternId - subpatternId + 1;
1912 auto parenthesesDisjunction = std::make_unique<ByteDisjunction>(args&: numSubpatterns, args&: callFrameSize);
1913
1914 unsigned firstTermInParentheses = beginTerm + 1;
1915 parenthesesDisjunction->terms.reserveInitialCapacity(size: endTerm - firstTermInParentheses + 2);
1916
1917 parenthesesDisjunction->terms.append(other: ByteTerm::SubpatternBegin());
1918 for (unsigned termInParentheses = firstTermInParentheses; termInParentheses < endTerm; ++termInParentheses)
1919 parenthesesDisjunction->terms.append(value: m_bodyDisjunction->terms[termInParentheses]);
1920 parenthesesDisjunction->terms.append(other: ByteTerm::SubpatternEnd());
1921
1922 m_bodyDisjunction->terms.shrink(size: beginTerm);
1923
1924 m_bodyDisjunction->terms.append(other: ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, inputPosition));
1925 m_allParenthesesInfo.append(WTFMove(parenthesesDisjunction));
1926
1927 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1928 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1929 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1930 m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation;
1931 }
1932
1933 void atomParenthesesOnceEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1934 {
1935 unsigned beginTerm = popParenthesesStack();
1936 closeAlternative(beginTerm: beginTerm + 1);
1937 unsigned endTerm = m_bodyDisjunction->terms.size();
1938
1939 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternOnceBegin);
1940
1941 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1942 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1943
1944 m_bodyDisjunction->terms.append(other: ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, capture, false, inputPosition));
1945 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1946 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1947 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1948
1949 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1950 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1951 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1952 m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1953 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1954 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1955 }
1956
1957 void atomParenthesesTerminalEnd(unsigned inputPosition, unsigned frameLocation, Checked<unsigned> quantityMinCount, Checked<unsigned> quantityMaxCount, QuantifierType quantityType)
1958 {
1959 unsigned beginTerm = popParenthesesStack();
1960 closeAlternative(beginTerm: beginTerm + 1);
1961 unsigned endTerm = m_bodyDisjunction->terms.size();
1962
1963 ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParenthesesSubpatternTerminalBegin);
1964
1965 bool capture = m_bodyDisjunction->terms[beginTerm].capture();
1966 unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId;
1967
1968 m_bodyDisjunction->terms.append(other: ByteTerm(ByteTerm::TypeParenthesesSubpatternTerminalEnd, subpatternId, capture, false, inputPosition));
1969 m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm;
1970 m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm;
1971 m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation;
1972
1973 m_bodyDisjunction->terms[beginTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1974 m_bodyDisjunction->terms[beginTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1975 m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType;
1976 m_bodyDisjunction->terms[endTerm].atom.quantityMinCount = quantityMinCount.unsafeGet();
1977 m_bodyDisjunction->terms[endTerm].atom.quantityMaxCount = quantityMaxCount.unsafeGet();
1978 m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType;
1979 }
1980
1981 void regexBegin(unsigned numSubpatterns, unsigned callFrameSize, bool onceThrough)
1982 {
1983 m_bodyDisjunction = std::make_unique<ByteDisjunction>(args&: numSubpatterns, args&: callFrameSize);
1984 m_bodyDisjunction->terms.append(other: ByteTerm::BodyAlternativeBegin(onceThrough));
1985 m_bodyDisjunction->terms[0].frameLocation = 0;
1986 m_currentAlternativeIndex = 0;
1987 }
1988
1989 void regexEnd()
1990 {
1991 closeBodyAlternative();
1992 }
1993
1994 void alternativeBodyDisjunction(bool onceThrough)
1995 {
1996 int newAlternativeIndex = m_bodyDisjunction->terms.size();
1997 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
1998 m_bodyDisjunction->terms.append(other: ByteTerm::BodyAlternativeDisjunction(onceThrough));
1999
2000 m_currentAlternativeIndex = newAlternativeIndex;
2001 }
2002
2003 void alternativeDisjunction()
2004 {
2005 int newAlternativeIndex = m_bodyDisjunction->terms.size();
2006 m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex;
2007 m_bodyDisjunction->terms.append(other: ByteTerm::AlternativeDisjunction());
2008
2009 m_currentAlternativeIndex = newAlternativeIndex;
2010 }
2011
2012 void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0)
2013 {
2014 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
2015 unsigned currentCountAlreadyChecked = inputCountAlreadyChecked;
2016
2017 PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
2018
2019 if (alt) {
2020 if (disjunction == m_pattern.m_body)
2021 alternativeBodyDisjunction(onceThrough: alternative->onceThrough());
2022 else
2023 alternativeDisjunction();
2024 }
2025
2026 unsigned minimumSize = alternative->m_minimumSize;
2027 ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked);
2028 unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked;
2029
2030 if (countToCheck) {
2031 checkInput(count: countToCheck);
2032 currentCountAlreadyChecked += countToCheck;
2033 }
2034
2035 for (auto& term : alternative->m_terms) {
2036 switch (term.type) {
2037 case PatternTerm::TypeAssertionBOL:
2038 assertionBOL(inputPosition: currentCountAlreadyChecked - term.inputPosition);
2039 break;
2040
2041 case PatternTerm::TypeAssertionEOL:
2042 assertionEOL(inputPosition: currentCountAlreadyChecked - term.inputPosition);
2043 break;
2044
2045 case PatternTerm::TypeAssertionWordBoundary:
2046 assertionWordBoundary(invert: term.invert(), inputPosition: currentCountAlreadyChecked - term.inputPosition);
2047 break;
2048
2049 case PatternTerm::TypePatternCharacter:
2050 atomPatternCharacter(ch: term.patternCharacter, inputPosition: currentCountAlreadyChecked - term.inputPosition, frameLocation: term.frameLocation, quantityMaxCount: term.quantityMaxCount, quantityType: term.quantityType);
2051 break;
2052
2053 case PatternTerm::TypeCharacterClass:
2054 atomCharacterClass(characterClass: term.characterClass, invert: term.invert(), inputPosition: currentCountAlreadyChecked- term.inputPosition, frameLocation: term.frameLocation, quantityMaxCount: term.quantityMaxCount, quantityType: term.quantityType);
2055 break;
2056
2057 case PatternTerm::TypeBackReference:
2058 atomBackReference(subpatternId: term.backReferenceSubpatternId, inputPosition: currentCountAlreadyChecked - term.inputPosition, frameLocation: term.frameLocation, quantityMaxCount: term.quantityMaxCount, quantityType: term.quantityType);
2059 break;
2060
2061 case PatternTerm::TypeForwardReference:
2062 break;
2063
2064 case PatternTerm::TypeParenthesesSubpattern: {
2065 unsigned disjunctionAlreadyCheckedCount = 0;
2066 if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
2067 unsigned alternativeFrameLocation = term.frameLocation;
2068 // For QuantifierFixedCount we pre-check the minimum size; for greedy/non-greedy we reserve a slot in the frame.
2069 if (term.quantityType == QuantifierFixedCount)
2070 disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize;
2071 else
2072 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
2073 ASSERT(currentCountAlreadyChecked >= term.inputPosition);
2074 unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition;
2075 atomParenthesesOnceBegin(subpatternId: term.parentheses.subpatternId, capture: term.capture(), inputPosition: disjunctionAlreadyCheckedCount + delegateEndInputOffset, frameLocation: term.frameLocation, alternativeFrameLocation);
2076 emitDisjunction(disjunction: term.parentheses.disjunction, inputCountAlreadyChecked: currentCountAlreadyChecked, parenthesesInputCountAlreadyChecked: disjunctionAlreadyCheckedCount);
2077 atomParenthesesOnceEnd(inputPosition: delegateEndInputOffset, frameLocation: term.frameLocation, quantityMinCount: term.quantityMinCount, quantityMaxCount: term.quantityMaxCount, quantityType: term.quantityType);
2078 } else if (term.parentheses.isTerminal) {
2079 ASSERT(currentCountAlreadyChecked >= term.inputPosition);
2080 unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition;
2081 atomParenthesesTerminalBegin(subpatternId: term.parentheses.subpatternId, capture: term.capture(), inputPosition: disjunctionAlreadyCheckedCount + delegateEndInputOffset, frameLocation: term.frameLocation, alternativeFrameLocation: term.frameLocation + YarrStackSpaceForBackTrackInfoParenthesesTerminal);
2082 emitDisjunction(disjunction: term.parentheses.disjunction, inputCountAlreadyChecked: currentCountAlreadyChecked, parenthesesInputCountAlreadyChecked: disjunctionAlreadyCheckedCount);
2083 atomParenthesesTerminalEnd(inputPosition: delegateEndInputOffset, frameLocation: term.frameLocation, quantityMinCount: term.quantityMinCount, quantityMaxCount: term.quantityMaxCount, quantityType: term.quantityType);
2084 } else {
2085 ASSERT(currentCountAlreadyChecked >= term.inputPosition);
2086 unsigned delegateEndInputOffset = currentCountAlreadyChecked - term.inputPosition;
2087 atomParenthesesSubpatternBegin(subpatternId: term.parentheses.subpatternId, capture: term.capture(), inputPosition: disjunctionAlreadyCheckedCount + delegateEndInputOffset, frameLocation: term.frameLocation, alternativeFrameLocation: 0);
2088 emitDisjunction(disjunction: term.parentheses.disjunction, inputCountAlreadyChecked: currentCountAlreadyChecked, parenthesesInputCountAlreadyChecked: 0);
2089 atomParenthesesSubpatternEnd(lastSubpatternId: term.parentheses.lastSubpatternId, inputPosition: delegateEndInputOffset, frameLocation: term.frameLocation, quantityMinCount: term.quantityMinCount, quantityMaxCount: term.quantityMaxCount, quantityType: term.quantityType, callFrameSize: term.parentheses.disjunction->m_callFrameSize);
2090 }
2091 break;
2092 }
2093
2094 case PatternTerm::TypeParentheticalAssertion: {
2095 unsigned alternativeFrameLocation = term.frameLocation + YarrStackSpaceForBackTrackInfoParentheticalAssertion;
2096
2097 ASSERT(currentCountAlreadyChecked >= term.inputPosition);
2098 unsigned positiveInputOffset = currentCountAlreadyChecked - term.inputPosition;
2099 unsigned uncheckAmount = 0;
2100 if (positiveInputOffset > term.parentheses.disjunction->m_minimumSize) {
2101 uncheckAmount = positiveInputOffset - term.parentheses.disjunction->m_minimumSize;
2102 uncheckInput(count: uncheckAmount);
2103 currentCountAlreadyChecked -= uncheckAmount;
2104 }
2105
2106 atomParentheticalAssertionBegin(subpatternId: term.parentheses.subpatternId, invert: term.invert(), frameLocation: term.frameLocation, alternativeFrameLocation);
2107 emitDisjunction(disjunction: term.parentheses.disjunction, inputCountAlreadyChecked: currentCountAlreadyChecked, parenthesesInputCountAlreadyChecked: positiveInputOffset - uncheckAmount);
2108 atomParentheticalAssertionEnd(inputPosition: 0, frameLocation: term.frameLocation, quantityMaxCount: term.quantityMaxCount, quantityType: term.quantityType);
2109 if (uncheckAmount) {
2110 checkInput(count: uncheckAmount);
2111 currentCountAlreadyChecked += uncheckAmount;
2112 }
2113 break;
2114 }
2115
2116 case PatternTerm::TypeDotStarEnclosure:
2117 assertionDotStarEnclosure(bolAnchored: term.anchors.bolAnchor, eolAnchored: term.anchors.eolAnchor);
2118 break;
2119 }
2120 }
2121 }
2122 }
2123#ifndef NDEBUG
2124 void dumpDisjunction(ByteDisjunction* disjunction, unsigned nesting = 0)
2125 {
2126 PrintStream& out = WTF::dataFile();
2127
2128 unsigned termIndexNest = 0;
2129
2130 if (!nesting) {
2131 out.printf(format: "ByteDisjunction(%p):\n", disjunction);
2132 nesting = 1;
2133 } else {
2134 termIndexNest = nesting - 1;
2135 nesting = 2;
2136 }
2137
2138 auto outputTermIndexAndNest = [&](size_t index, unsigned termNesting) {
2139 for (unsigned nestingDepth = 0; nestingDepth < termIndexNest; nestingDepth++)
2140 out.print(value: " ");
2141#if defined(WIN32) && defined(__MINGW32__)
2142# if __SIZEOF_POINTER__ == 8
2143 out.printf("%4I64u", index);
2144# else
2145 out.printf("%4I32u", index);
2146# endif
2147#else
2148 out.printf(format: "%4zu", index);
2149#endif
2150 for (unsigned nestingDepth = 0; nestingDepth < termNesting; nestingDepth++)
2151 out.print(value: " ");
2152 };
2153
2154 auto dumpQuantity = [&](ByteTerm& term) {
2155 if (term.atom.quantityType == QuantifierFixedCount && term.atom.quantityMinCount == 1 && term.atom.quantityMaxCount == 1)
2156 return;
2157
2158 out.print(value1: " {", value2: term.atom.quantityMinCount);
2159 if (term.atom.quantityMinCount != term.atom.quantityMaxCount) {
2160 if (term.atom.quantityMaxCount == UINT_MAX)
2161 out.print(value: ",inf");
2162 else
2163 out.print(value1: ",", value2: term.atom.quantityMaxCount);
2164 }
2165 out.print(value: "}");
2166 if (term.atom.quantityType == QuantifierGreedy)
2167 out.print(value: " greedy");
2168 else if (term.atom.quantityType == QuantifierNonGreedy)
2169 out.print(value: " non-greedy");
2170 };
2171
2172 auto dumpCaptured = [&](ByteTerm& term) {
2173 if (term.capture())
2174 out.print(value1: " captured (#", value2: term.atom.subpatternId, value3: ")");
2175 };
2176
2177 auto dumpInverted = [&](ByteTerm& term) {
2178 if (term.invert())
2179 out.print(value: " inverted");
2180 };
2181
2182 auto dumpInputPosition = [&](ByteTerm& term) {
2183 out.printf(format: " inputPosition %u", term.inputPosition);
2184 };
2185
2186 auto dumpFrameLocation = [&](ByteTerm& term) {
2187 out.printf(format: " frameLocation %u", term.frameLocation);
2188 };
2189
2190 auto dumpCharacter = [&](ByteTerm& term) {
2191 out.print(value: " ");
2192 dumpUChar32(out, term.atom.patternCharacter);
2193 };
2194
2195 auto dumpCharClass = [&](ByteTerm& term) {
2196 out.print(value: " ");
2197 dumpCharacterClass(out, &m_pattern, term.atom.characterClass);
2198 };
2199
2200 for (size_t idx = 0; idx < disjunction->terms.size(); ++idx) {
2201 ByteTerm term = disjunction->terms[idx];
2202
2203 bool outputNewline = true;
2204
2205 switch (term.type) {
2206 case ByteTerm::TypeBodyAlternativeBegin:
2207 outputTermIndexAndNest(idx, nesting++);
2208 out.print(value: "BodyAlternativeBegin");
2209 if (term.alternative.onceThrough)
2210 out.print(value: " onceThrough");
2211 dumpFrameLocation(term);
2212 break;
2213 case ByteTerm::TypeBodyAlternativeDisjunction:
2214 outputTermIndexAndNest(idx, nesting - 1);
2215 out.print(value: "BodyAlternativeDisjunction");
2216 dumpFrameLocation(term);
2217 break;
2218 case ByteTerm::TypeBodyAlternativeEnd:
2219 outputTermIndexAndNest(idx, --nesting);
2220 out.print(value: "BodyAlternativeEnd");
2221 dumpFrameLocation(term);
2222 break;
2223 case ByteTerm::TypeAlternativeBegin:
2224 outputTermIndexAndNest(idx, nesting++);
2225 out.print(value: "AlternativeBegin");
2226 dumpFrameLocation(term);
2227 break;
2228 case ByteTerm::TypeAlternativeDisjunction:
2229 outputTermIndexAndNest(idx, nesting - 1);
2230 out.print(value: "AlternativeDisjunction");
2231 dumpFrameLocation(term);
2232 break;
2233 case ByteTerm::TypeAlternativeEnd:
2234 outputTermIndexAndNest(idx, --nesting);
2235 out.print(value: "AlternativeEnd");
2236 dumpFrameLocation(term);
2237 break;
2238 case ByteTerm::TypeSubpatternBegin:
2239 outputTermIndexAndNest(idx, nesting++);
2240 out.print(value: "SubpatternBegin");
2241 break;
2242 case ByteTerm::TypeSubpatternEnd:
2243 outputTermIndexAndNest(idx, --nesting);
2244 out.print(value: "SubpatternEnd");
2245 break;
2246 case ByteTerm::TypeAssertionBOL:
2247 outputTermIndexAndNest(idx, nesting);
2248 out.print(value: "AssertionBOL");
2249 break;
2250 case ByteTerm::TypeAssertionEOL:
2251 outputTermIndexAndNest(idx, nesting);
2252 out.print(value: "AssertionEOL");
2253 break;
2254 case ByteTerm::TypeAssertionWordBoundary:
2255 outputTermIndexAndNest(idx, nesting);
2256 out.print(value: "AssertionWordBoundary");
2257 break;
2258 case ByteTerm::TypePatternCharacterOnce:
2259 outputTermIndexAndNest(idx, nesting);
2260 out.print(value: "PatternCharacterOnce");
2261 dumpInverted(term);
2262 dumpInputPosition(term);
2263 dumpFrameLocation(term);
2264 dumpCharacter(term);
2265 dumpQuantity(term);
2266 break;
2267 case ByteTerm::TypePatternCharacterFixed:
2268 outputTermIndexAndNest(idx, nesting);
2269 out.print(value: "PatternCharacterFixed");
2270 dumpInverted(term);
2271 dumpInputPosition(term);
2272 dumpFrameLocation(term);
2273 dumpCharacter(term);
2274 out.print(value1: " {", value2: term.atom.quantityMinCount, value3: "}");
2275 break;
2276 case ByteTerm::TypePatternCharacterGreedy:
2277 outputTermIndexAndNest(idx, nesting);
2278 out.print(value: "PatternCharacterGreedy");
2279 dumpInverted(term);
2280 dumpInputPosition(term);
2281 dumpFrameLocation(term);
2282 dumpCharacter(term);
2283 dumpQuantity(term);
2284 break;
2285 case ByteTerm::TypePatternCharacterNonGreedy:
2286 outputTermIndexAndNest(idx, nesting);
2287 out.print(value: "PatternCharacterNonGreedy");
2288 dumpInverted(term);
2289 dumpInputPosition(term);
2290 dumpFrameLocation(term);
2291 dumpCharacter(term);
2292 dumpQuantity(term);
2293 break;
2294 case ByteTerm::TypePatternCasedCharacterOnce:
2295 outputTermIndexAndNest(idx, nesting);
2296 out.print(value: "PatternCasedCharacterOnce");
2297 break;
2298 case ByteTerm::TypePatternCasedCharacterFixed:
2299 outputTermIndexAndNest(idx, nesting);
2300 out.print(value: "PatternCasedCharacterFixed");
2301 break;
2302 case ByteTerm::TypePatternCasedCharacterGreedy:
2303 outputTermIndexAndNest(idx, nesting);
2304 out.print(value: "PatternCasedCharacterGreedy");
2305 break;
2306 case ByteTerm::TypePatternCasedCharacterNonGreedy:
2307 outputTermIndexAndNest(idx, nesting);
2308 out.print(value: "PatternCasedCharacterNonGreedy");
2309 break;
2310 case ByteTerm::TypeCharacterClass:
2311 outputTermIndexAndNest(idx, nesting);
2312 out.print(value: "CharacterClass");
2313 dumpInverted(term);
2314 dumpInputPosition(term);
2315 dumpFrameLocation(term);
2316 dumpCharClass(term);
2317 dumpQuantity(term);
2318 break;
2319 case ByteTerm::TypeBackReference:
2320 outputTermIndexAndNest(idx, nesting);
2321 out.print(value1: "BackReference #", value2: term.atom.subpatternId);
2322 dumpQuantity(term);
2323 break;
2324 case ByteTerm::TypeParenthesesSubpattern:
2325 outputTermIndexAndNest(idx, nesting);
2326 out.print(value: "ParenthesesSubpattern");
2327 dumpCaptured(term);
2328 dumpInverted(term);
2329 dumpInputPosition(term);
2330 dumpFrameLocation(term);
2331 dumpQuantity(term);
2332 out.print(value: "\n");
2333 outputNewline = false;
2334 dumpDisjunction(disjunction: term.atom.parenthesesDisjunction, nesting);
2335 break;
2336 case ByteTerm::TypeParenthesesSubpatternOnceBegin:
2337 outputTermIndexAndNest(idx, nesting++);
2338 out.print(value: "ParenthesesSubpatternOnceBegin");
2339 dumpCaptured(term);
2340 dumpInverted(term);
2341 dumpInputPosition(term);
2342 dumpFrameLocation(term);
2343 break;
2344 case ByteTerm::TypeParenthesesSubpatternOnceEnd:
2345 outputTermIndexAndNest(idx, --nesting);
2346 out.print(value: "ParenthesesSubpatternOnceEnd");
2347 dumpFrameLocation(term);
2348 break;
2349 case ByteTerm::TypeParenthesesSubpatternTerminalBegin:
2350 outputTermIndexAndNest(idx, nesting++);
2351 out.print(value: "ParenthesesSubpatternTerminalBegin");
2352 dumpInverted(term);
2353 dumpInputPosition(term);
2354 dumpFrameLocation(term);
2355 break;
2356 case ByteTerm::TypeParenthesesSubpatternTerminalEnd:
2357 outputTermIndexAndNest(idx, --nesting);
2358 out.print(value: "ParenthesesSubpatternTerminalEnd");
2359 dumpFrameLocation(term);
2360 break;
2361 case ByteTerm::TypeParentheticalAssertionBegin:
2362 outputTermIndexAndNest(idx, nesting++);
2363 out.print(value: "ParentheticalAssertionBegin");
2364 dumpInverted(term);
2365 dumpInputPosition(term);
2366 dumpFrameLocation(term);
2367 break;
2368 case ByteTerm::TypeParentheticalAssertionEnd:
2369 outputTermIndexAndNest(idx, --nesting);
2370 out.print(value: "ParentheticalAssertionEnd");
2371 dumpFrameLocation(term);
2372 break;
2373 case ByteTerm::TypeCheckInput:
2374 outputTermIndexAndNest(idx, nesting);
2375 out.print(value1: "CheckInput ", value2: term.checkInputCount);
2376 break;
2377 case ByteTerm::TypeUncheckInput:
2378 outputTermIndexAndNest(idx, nesting);
2379 out.print(value1: "UncheckInput ", value2: term.checkInputCount);
2380 break;
2381 case ByteTerm::TypeDotStarEnclosure:
2382 outputTermIndexAndNest(idx, nesting);
2383 out.print(value: "DotStarEnclosure");
2384 break;
2385 }
2386 if (outputNewline)
2387 out.print(value: "\n");
2388 }
2389 }
2390#endif
2391
2392private:
2393 YarrPattern& m_pattern;
2394 std::unique_ptr<ByteDisjunction> m_bodyDisjunction;
2395 unsigned m_currentAlternativeIndex;
2396 Vector<ParenthesesStackEntry> m_parenthesesStack;
2397 Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo;
2398};
2399
2400std::unique_ptr<BytecodePattern> byteCompile(YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock)
2401{
2402 return ByteCompiler(pattern).compile(allocator, lock);
2403}
2404
2405unsigned interpret(BytecodePattern* bytecode, const String& input, unsigned start, unsigned* output)
2406{
2407 SuperSamplerScope superSamplerScope(false);
2408 if (input.is8Bit())
2409 return Interpreter<LChar>(bytecode, output, input.characters8(), input.size(), start).interpret();
2410 return Interpreter<UChar>(bytecode, output, input.characters16(), input.size(), start).interpret();
2411}
2412
2413unsigned interpret(BytecodePattern* bytecode, const LChar* input, unsigned length, unsigned start, unsigned* output)
2414{
2415 SuperSamplerScope superSamplerScope(false);
2416 return Interpreter<LChar>(bytecode, output, input, length, start).interpret();
2417}
2418
2419unsigned interpret(BytecodePattern* bytecode, const UChar* input, unsigned length, unsigned start, unsigned* output)
2420{
2421 SuperSamplerScope superSamplerScope(false);
2422 return Interpreter<UChar>(bytecode, output, input, length, start).interpret();
2423}
2424
2425// These should be the same for both UChar & LChar.
2426COMPILE_ASSERT(sizeof(BackTrackInfoPatternCharacter) == (YarrStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoPatternCharacter);
2427COMPILE_ASSERT(sizeof(BackTrackInfoCharacterClass) == (YarrStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoCharacterClass);
2428COMPILE_ASSERT(sizeof(BackTrackInfoBackReference) == (YarrStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoBackReference);
2429COMPILE_ASSERT(sizeof(BackTrackInfoAlternative) == (YarrStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoAlternative);
2430COMPILE_ASSERT(sizeof(BackTrackInfoParentheticalAssertion) == (YarrStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheticalAssertion);
2431COMPILE_ASSERT(sizeof(BackTrackInfoParenthesesOnce) == (YarrStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParenthesesOnce);
2432COMPILE_ASSERT(sizeof(Interpreter<UChar>::BackTrackInfoParentheses) <= (YarrStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckYarrStackSpaceForBackTrackInfoParentheses);
2433
2434
2435} }
2436

source code of qtdeclarative/src/3rdparty/masm/yarr/YarrInterpreter.cpp