1/*
2 * Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include "config.h"
28#include "YarrPattern.h"
29
30#include "Options.h"
31#include "Yarr.h"
32#include "YarrCanonicalize.h"
33#include "YarrParser.h"
34#include <wtf/DataLog.h>
35#include <wtf/Optional.h>
36#include <wtf/Vector.h>
37#include <wtf/text/WTFString.h>
38
39namespace JSC { namespace Yarr {
40
41#include "RegExpJitTables.h"
42
43class CharacterClassConstructor {
44public:
45 CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode)
46 : m_isCaseInsensitive(isCaseInsensitive)
47 , m_hasNonBMPCharacters(false)
48 , m_anyCharacter(false)
49 , m_canonicalMode(canonicalMode)
50 {
51 }
52
53 void reset()
54 {
55 m_matches.clear();
56 m_ranges.clear();
57 m_matchesUnicode.clear();
58 m_rangesUnicode.clear();
59 m_hasNonBMPCharacters = false;
60 m_anyCharacter = false;
61 }
62
63 void append(const CharacterClass* other)
64 {
65 for (size_t i = 0; i < other->m_matches.size(); ++i)
66 addSorted(matches&: m_matches, ch: other->m_matches[i]);
67 for (size_t i = 0; i < other->m_ranges.size(); ++i)
68 addSortedRange(ranges&: m_ranges, lo: other->m_ranges[i].begin, hi: other->m_ranges[i].end);
69 for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i)
70 addSorted(matches&: m_matchesUnicode, ch: other->m_matchesUnicode[i]);
71 for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i)
72 addSortedRange(ranges&: m_rangesUnicode, lo: other->m_rangesUnicode[i].begin, hi: other->m_rangesUnicode[i].end);
73 }
74
75 void appendInverted(const CharacterClass* other)
76 {
77 auto addSortedInverted = [&](UChar32 min, UChar32 max,
78 const Vector<UChar32>& srcMatches, const Vector<CharacterRange>& srcRanges,
79 Vector<UChar32>& destMatches, Vector<CharacterRange>& destRanges) {
80
81 auto addSortedMatchOrRange = [&](UChar32 lo, UChar32 hiPlusOne) {
82 if (lo < hiPlusOne) {
83 if (lo + 1 == hiPlusOne)
84 addSorted(destMatches, lo);
85 else
86 addSortedRange(ranges&: destRanges, lo, hi: hiPlusOne - 1);
87 }
88 };
89
90 UChar32 lo = min;
91 size_t matchesIndex = 0;
92 size_t rangesIndex = 0;
93 bool matchesRemaining = matchesIndex < srcMatches.size();
94 bool rangesRemaining = rangesIndex < srcRanges.size();
95
96 if (!matchesRemaining && !rangesRemaining) {
97 addSortedMatchOrRange(min, max + 1);
98 return;
99 }
100
101 while (matchesRemaining || rangesRemaining) {
102 UChar32 hiPlusOne;
103 UChar32 nextLo;
104
105 if (matchesRemaining
106 && (!rangesRemaining || srcMatches[matchesIndex] < srcRanges[rangesIndex].begin)) {
107 hiPlusOne = srcMatches[matchesIndex];
108 nextLo = hiPlusOne + 1;
109 ++matchesIndex;
110 matchesRemaining = matchesIndex < srcMatches.size();
111 } else {
112 hiPlusOne = srcRanges[rangesIndex].begin;
113 nextLo = srcRanges[rangesIndex].end + 1;
114 ++rangesIndex;
115 rangesRemaining = rangesIndex < srcRanges.size();
116 }
117
118 addSortedMatchOrRange(lo, hiPlusOne);
119
120 lo = nextLo;
121 }
122
123 addSortedMatchOrRange(lo, max + 1);
124 };
125
126 addSortedInverted(0, 0x7f, other->m_matches, other->m_ranges, m_matches, m_ranges);
127 addSortedInverted(0x80, 0x10ffff, other->m_matchesUnicode, other->m_rangesUnicode, m_matchesUnicode, m_rangesUnicode);
128 }
129
130 void putChar(UChar32 ch)
131 {
132 if (!m_isCaseInsensitive) {
133 addSorted(ch);
134 return;
135 }
136
137 if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) {
138 // Handle ASCII cases.
139 if (isASCIIAlpha(ch)) {
140 addSorted(m_matches, toASCIIUpper(ch));
141 addSorted(m_matches, toASCIILower(ch));
142 } else
143 addSorted(matches&: m_matches, ch);
144 return;
145 }
146
147 // Add multiple matches, if necessary.
148 const CanonicalizationRange* info = canonicalRangeInfoFor(ch, canonicalMode: m_canonicalMode);
149 if (info->type == CanonicalizeUnique)
150 addSorted(ch);
151 else
152 putUnicodeIgnoreCase(ch, info);
153 }
154
155 void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info)
156 {
157 ASSERT(m_isCaseInsensitive);
158 ASSERT(ch >= info->begin && ch <= info->end);
159 ASSERT(info->type != CanonicalizeUnique);
160 if (info->type == CanonicalizeSet) {
161 for (const UChar32* set = canonicalCharacterSetInfo(index: info->value, canonicalMode: m_canonicalMode); (ch = *set); ++set)
162 addSorted(ch);
163 } else {
164 addSorted(ch);
165 addSorted(ch: getCanonicalPair(info, ch));
166 }
167 }
168
169 void putRange(UChar32 lo, UChar32 hi)
170 {
171 if (isASCII(lo)) {
172 char asciiLo = lo;
173 char asciiHi = std::min(hi, (UChar32)0x7f);
174 addSortedRange(ranges&: m_ranges, lo, hi: asciiHi);
175
176 if (m_isCaseInsensitive) {
177 if ((asciiLo <= 'Z') && (asciiHi >= 'A'))
178 addSortedRange(ranges&: m_ranges, lo: std::max(asciiLo, 'A')+('a'-'A'), hi: std::min(asciiHi, 'Z')+('a'-'A'));
179 if ((asciiLo <= 'z') && (asciiHi >= 'a'))
180 addSortedRange(ranges&: m_ranges, lo: std::max(asciiLo, 'a')+('A'-'a'), hi: std::min(asciiHi, 'z')+('A'-'a'));
181 }
182 }
183 if (isASCII(hi))
184 return;
185
186 lo = std::max(lo, (UChar32)0x80);
187 addSortedRange(ranges&: m_rangesUnicode, lo, hi);
188
189 if (!m_isCaseInsensitive)
190 return;
191
192 const CanonicalizationRange* info = canonicalRangeInfoFor(ch: lo, canonicalMode: m_canonicalMode);
193 while (true) {
194 // Handle the range [lo .. end]
195 UChar32 end = std::min<UChar32>(info->end, hi);
196
197 switch (info->type) {
198 case CanonicalizeUnique:
199 // Nothing to do - no canonical equivalents.
200 break;
201 case CanonicalizeSet: {
202 UChar ch;
203 for (const UChar32* set = canonicalCharacterSetInfo(index: info->value, canonicalMode: m_canonicalMode); (ch = *set); ++set)
204 addSorted(matches&: m_matchesUnicode, ch);
205 break;
206 }
207 case CanonicalizeRangeLo:
208 addSortedRange(ranges&: m_rangesUnicode, lo: lo + info->value, hi: end + info->value);
209 break;
210 case CanonicalizeRangeHi:
211 addSortedRange(ranges&: m_rangesUnicode, lo: lo - info->value, hi: end - info->value);
212 break;
213 case CanonicalizeAlternatingAligned:
214 // Use addSortedRange since there is likely an abutting range to combine with.
215 if (lo & 1)
216 addSortedRange(ranges&: m_rangesUnicode, lo: lo - 1, hi: lo - 1);
217 if (!(end & 1))
218 addSortedRange(ranges&: m_rangesUnicode, lo: end + 1, hi: end + 1);
219 break;
220 case CanonicalizeAlternatingUnaligned:
221 // Use addSortedRange since there is likely an abutting range to combine with.
222 if (!(lo & 1))
223 addSortedRange(ranges&: m_rangesUnicode, lo: lo - 1, hi: lo - 1);
224 if (end & 1)
225 addSortedRange(ranges&: m_rangesUnicode, lo: end + 1, hi: end + 1);
226 break;
227 }
228
229 if (hi == end)
230 return;
231
232 ++info;
233 lo = info->begin;
234 };
235
236 }
237
238 std::unique_ptr<CharacterClass> charClass()
239 {
240 coalesceTables();
241
242 auto characterClass = std::make_unique<CharacterClass>();
243
244 characterClass->m_matches.swap(m_matches);
245 characterClass->m_ranges.swap(m_ranges);
246 characterClass->m_matchesUnicode.swap(m_matchesUnicode);
247 characterClass->m_rangesUnicode.swap(m_rangesUnicode);
248 characterClass->m_hasNonBMPCharacters = hasNonBMPCharacters();
249 characterClass->m_anyCharacter = anyCharacter();
250
251 m_hasNonBMPCharacters = false;
252 m_anyCharacter = false;
253
254 return characterClass;
255 }
256
257private:
258 void addSorted(UChar32 ch)
259 {
260 addSorted(matches&: isASCII(ch) ? m_matches : m_matchesUnicode, ch);
261 }
262
263 void addSorted(Vector<UChar32>& matches, UChar32 ch)
264 {
265 unsigned pos = 0;
266 unsigned range = matches.size();
267
268 if (!U_IS_BMP(ch))
269 m_hasNonBMPCharacters = true;
270
271 // binary chop, find position to insert char.
272 while (range) {
273 unsigned index = range >> 1;
274
275 int val = matches[pos+index] - ch;
276 if (!val)
277 return;
278 else if (val > 0) {
279 if (val == 1) {
280 UChar32 lo = ch;
281 UChar32 hi = ch + 1;
282 matches.remove(position: pos + index);
283 if (pos + index > 0 && matches[pos + index - 1] == ch - 1) {
284 lo = ch - 1;
285 matches.remove(position: pos + index - 1);
286 }
287 addSortedRange(ranges&: isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
288 return;
289 }
290 range = index;
291 } else {
292 if (val == -1) {
293 UChar32 lo = ch - 1;
294 UChar32 hi = ch;
295 matches.remove(position: pos + index);
296 if (pos + index + 1 < matches.size() && matches[pos + index + 1] == ch + 1) {
297 hi = ch + 1;
298 matches.remove(position: pos + index + 1);
299 }
300 addSortedRange(ranges&: isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi);
301 return;
302 }
303 pos += (index+1);
304 range -= (index+1);
305 }
306 }
307
308 if (pos == matches.size())
309 matches.append(value: ch);
310 else
311 matches.insert(position: pos, value: ch);
312 }
313
314 void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi)
315 {
316 size_t end = ranges.size();
317
318 if (!U_IS_BMP(hi))
319 m_hasNonBMPCharacters = true;
320
321 // Simple linear scan - I doubt there are that many ranges anyway...
322 // feel free to fix this with something faster (eg binary chop).
323 for (size_t i = 0; i < end; ++i) {
324 // does the new range fall before the current position in the array
325 if (hi < ranges[i].begin) {
326 // Concatenate appending ranges.
327 if (hi == (ranges[i].begin - 1)) {
328 ranges[i].begin = lo;
329 return;
330 }
331 ranges.insert(position: i, value: CharacterRange(lo, hi));
332 return;
333 }
334 // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the beginning
335 // If the new range start at or before the end of the last range, then the overlap (if it starts one after the
336 // end of the last range they concatenate, which is just as good.
337 if (lo <= (ranges[i].end + 1)) {
338 // found an intersect! we'll replace this entry in the array.
339 ranges[i].begin = std::min(ranges[i].begin, lo);
340 ranges[i].end = std::max(ranges[i].end, hi);
341
342 mergeRangesFrom(ranges, index: i);
343 return;
344 }
345 }
346
347 // CharacterRange comes after all existing ranges.
348 ranges.append(other: CharacterRange(lo, hi));
349 }
350
351 void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index)
352 {
353 size_t next = index + 1;
354
355 // each iteration of the loop we will either remove something from the list, or break out of the loop.
356 while (next < ranges.size()) {
357 if (ranges[next].begin <= (ranges[index].end + 1)) {
358 // the next entry now overlaps / concatenates with this one.
359 ranges[index].end = std::max(ranges[index].end, ranges[next].end);
360 ranges.remove(position: next);
361 } else
362 break;
363 }
364
365 }
366
367 void coalesceTables()
368 {
369 auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) {
370
371 size_t matchesIndex = 0;
372 size_t rangesIndex = 0;
373
374 while (matchesIndex < matches.size() && rangesIndex < ranges.size()) {
375 while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].begin - 1)
376 matchesIndex++;
377
378 if (matchesIndex < matches.size() && matches[matchesIndex] == ranges[rangesIndex].begin - 1) {
379 ranges[rangesIndex].begin = matches[matchesIndex];
380 matches.remove(matchesIndex);
381 }
382
383 while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].end + 1)
384 matchesIndex++;
385
386 if (matchesIndex < matches.size()) {
387 if (matches[matchesIndex] == ranges[rangesIndex].end + 1) {
388 ranges[rangesIndex].end = matches[matchesIndex];
389 matches.remove(matchesIndex);
390
391 mergeRangesFrom(ranges&: ranges, index: rangesIndex);
392 } else
393 matchesIndex++;
394 }
395 }
396 };
397
398 coalesceMatchesAndRanges(m_matches, m_ranges);
399 coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode);
400
401 if (!m_matches.size() && !m_matchesUnicode.size()
402 && m_ranges.size() == 1 && m_rangesUnicode.size() == 1
403 && m_ranges[0].begin == 0 && m_ranges[0].end == 0x7f
404 && m_rangesUnicode[0].begin == 0x80 && m_rangesUnicode[0].end == 0x10ffff)
405 m_anyCharacter = true;
406 }
407
408 bool hasNonBMPCharacters()
409 {
410 return m_hasNonBMPCharacters;
411 }
412
413 bool anyCharacter()
414 {
415 return m_anyCharacter;
416 }
417
418 bool m_isCaseInsensitive : 1;
419 bool m_hasNonBMPCharacters : 1;
420 bool m_anyCharacter : 1;
421 CanonicalMode m_canonicalMode;
422
423 Vector<UChar32> m_matches;
424 Vector<CharacterRange> m_ranges;
425 Vector<UChar32> m_matchesUnicode;
426 Vector<CharacterRange> m_rangesUnicode;
427};
428
429class YarrPatternConstructor {
430public:
431 YarrPatternConstructor(YarrPattern& pattern, void* stackLimit)
432 : m_pattern(pattern)
433 , m_characterClassConstructor(pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2)
434 , m_stackLimit(stackLimit)
435 {
436 auto body = std::make_unique<PatternDisjunction>();
437 m_pattern.m_body = body.get();
438 m_alternative = body->addNewAlternative();
439 m_pattern.m_disjunctions.append(WTFMove(body));
440 }
441
442 ~YarrPatternConstructor()
443 {
444 }
445
446 void resetForReparsing()
447 {
448 m_pattern.resetForReparsing();
449 m_characterClassConstructor.reset();
450
451 auto body = std::make_unique<PatternDisjunction>();
452 m_pattern.m_body = body.get();
453 m_alternative = body->addNewAlternative();
454 m_pattern.m_disjunctions.append(WTFMove(body));
455 }
456
457 void saveUnmatchedNamedForwardReferences()
458 {
459 m_unmatchedNamedForwardReferences.shrink(0);
460
461 for (auto& entry : m_pattern.m_namedForwardReferences) {
462 if (!m_pattern.m_captureGroupNames.contains(entry))
463 m_unmatchedNamedForwardReferences.append(entry);
464 }
465 }
466
467 void assertionBOL()
468 {
469 if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) {
470 m_alternative->m_startsWithBOL = true;
471 m_alternative->m_containsBOL = true;
472 m_pattern.m_containsBOL = true;
473 }
474 m_alternative->m_terms.append(other: PatternTerm::BOL());
475 }
476 void assertionEOL()
477 {
478 m_alternative->m_terms.append(other: PatternTerm::EOL());
479 }
480 void assertionWordBoundary(bool invert)
481 {
482 m_alternative->m_terms.append(other: PatternTerm::WordBoundary(invert));
483 }
484
485 void atomPatternCharacter(UChar32 ch)
486 {
487 // We handle case-insensitive checking of unicode characters which do have both
488 // cases by handling them as if they were defined using a CharacterClass.
489 if (!m_pattern.ignoreCase() || (isASCII(ch) && !m_pattern.unicode())) {
490 m_alternative->m_terms.append(other: PatternTerm(ch));
491 return;
492 }
493
494 const CanonicalizationRange* info = canonicalRangeInfoFor(ch, canonicalMode: m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2);
495 if (info->type == CanonicalizeUnique) {
496 m_alternative->m_terms.append(other: PatternTerm(ch));
497 return;
498 }
499
500 m_characterClassConstructor.putUnicodeIgnoreCase(ch, info);
501 auto newCharacterClass = m_characterClassConstructor.charClass();
502 m_alternative->m_terms.append(other: PatternTerm(newCharacterClass.get(), false));
503 m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
504 }
505
506 void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert)
507 {
508 switch (classID) {
509 case BuiltInCharacterClassID::DigitClassID:
510 m_alternative->m_terms.append(other: PatternTerm(m_pattern.digitsCharacterClass(), invert));
511 break;
512 case BuiltInCharacterClassID::SpaceClassID:
513 m_alternative->m_terms.append(other: PatternTerm(m_pattern.spacesCharacterClass(), invert));
514 break;
515 case BuiltInCharacterClassID::WordClassID:
516 if (m_pattern.unicode() && m_pattern.ignoreCase())
517 m_alternative->m_terms.append(other: PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert));
518 else
519 m_alternative->m_terms.append(other: PatternTerm(m_pattern.wordcharCharacterClass(), invert));
520 break;
521 case BuiltInCharacterClassID::DotClassID:
522 ASSERT(!invert);
523 if (m_pattern.dotAll())
524 m_alternative->m_terms.append(other: PatternTerm(m_pattern.anyCharacterClass(), false));
525 else
526 m_alternative->m_terms.append(other: PatternTerm(m_pattern.newlineCharacterClass(), true));
527 break;
528 default:
529 m_alternative->m_terms.append(other: PatternTerm(m_pattern.unicodeCharacterClassFor(unicodeClassID: classID), invert));
530 break;
531 }
532 }
533
534 void atomCharacterClassBegin(bool invert = false)
535 {
536 m_invertCharacterClass = invert;
537 }
538
539 void atomCharacterClassAtom(UChar32 ch)
540 {
541 m_characterClassConstructor.putChar(ch);
542 }
543
544 void atomCharacterClassRange(UChar32 begin, UChar32 end)
545 {
546 m_characterClassConstructor.putRange(lo: begin, hi: end);
547 }
548
549 void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert)
550 {
551 ASSERT(classID != BuiltInCharacterClassID::DotClassID);
552
553 switch (classID) {
554 case BuiltInCharacterClassID::DigitClassID:
555 m_characterClassConstructor.append(other: invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass());
556 break;
557
558 case BuiltInCharacterClassID::SpaceClassID:
559 m_characterClassConstructor.append(other: invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass());
560 break;
561
562 case BuiltInCharacterClassID::WordClassID:
563 if (m_pattern.unicode() && m_pattern.ignoreCase())
564 m_characterClassConstructor.append(other: invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass());
565 else
566 m_characterClassConstructor.append(other: invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass());
567 break;
568
569 default:
570 if (!invert)
571 m_characterClassConstructor.append(other: m_pattern.unicodeCharacterClassFor(unicodeClassID: classID));
572 else
573 m_characterClassConstructor.appendInverted(other: m_pattern.unicodeCharacterClassFor(unicodeClassID: classID));
574 }
575 }
576
577 void atomCharacterClassEnd()
578 {
579 auto newCharacterClass = m_characterClassConstructor.charClass();
580
581 if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) {
582 m_alternative->m_terms.append(other: PatternTerm(m_pattern.anyCharacterClass(), false));
583 return;
584 }
585 m_alternative->m_terms.append(other: PatternTerm(newCharacterClass.get(), m_invertCharacterClass));
586 m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass));
587 }
588
589 void atomParenthesesSubpatternBegin(bool capture = true, std::optional<String> optGroupName = std::nullopt)
590 {
591 unsigned subpatternId = m_pattern.m_numSubpatterns + 1;
592 if (capture) {
593 m_pattern.m_numSubpatterns++;
594 if (optGroupName) {
595 while (m_pattern.m_captureGroupNames.size() < subpatternId)
596 m_pattern.m_captureGroupNames.append(other: String());
597 m_pattern.m_captureGroupNames.append(value: optGroupName.value());
598 m_pattern.m_namedGroupToParenIndex.add(k: optGroupName.value(), v: subpatternId);
599 }
600 } else
601 ASSERT(!optGroupName);
602
603 auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
604 m_alternative->m_terms.append(other: PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false));
605 m_alternative = parenthesesDisjunction->addNewAlternative();
606 m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
607 }
608
609 void atomParentheticalAssertionBegin(bool invert = false)
610 {
611 auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative);
612 m_alternative->m_terms.append(other: PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert));
613 m_alternative = parenthesesDisjunction->addNewAlternative();
614 m_invertParentheticalAssertion = invert;
615 m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction));
616 }
617
618 void atomParenthesesEnd()
619 {
620 ASSERT(m_alternative->m_parent);
621 ASSERT(m_alternative->m_parent->m_parent);
622
623 PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent;
624 m_alternative = m_alternative->m_parent->m_parent;
625
626 PatternTerm& lastTerm = m_alternative->lastTerm();
627
628 unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size();
629 unsigned numBOLAnchoredAlts = 0;
630
631 for (unsigned i = 0; i < numParenAlternatives; i++) {
632 // Bubble up BOL flags
633 if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL)
634 numBOLAnchoredAlts++;
635 }
636
637 if (numBOLAnchoredAlts) {
638 m_alternative->m_containsBOL = true;
639 // If all the alternatives in parens start with BOL, then so does this one
640 if (numBOLAnchoredAlts == numParenAlternatives)
641 m_alternative->m_startsWithBOL = true;
642 }
643
644 lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns;
645 m_invertParentheticalAssertion = false;
646 }
647
648 void atomBackReference(unsigned subpatternId)
649 {
650 ASSERT(subpatternId);
651 m_pattern.m_containsBackreferences = true;
652 m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId);
653
654 if (subpatternId > m_pattern.m_numSubpatterns) {
655 m_alternative->m_terms.append(other: PatternTerm::ForwardReference());
656 return;
657 }
658
659 PatternAlternative* currentAlternative = m_alternative;
660 ASSERT(currentAlternative);
661
662 // Note to self: if we waited until the AST was baked, we could also remove forwards refs
663 while ((currentAlternative = currentAlternative->m_parent->m_parent)) {
664 PatternTerm& term = currentAlternative->lastTerm();
665 ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion));
666
667 if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) {
668 m_alternative->m_terms.append(other: PatternTerm::ForwardReference());
669 return;
670 }
671 }
672
673 m_alternative->m_terms.append(other: PatternTerm(subpatternId));
674 }
675
676 void atomNamedBackReference(const String& subpatternName)
677 {
678 ASSERT(m_pattern.m_namedGroupToParenIndex.find(subpatternName) != m_pattern.m_namedGroupToParenIndex.end());
679 atomBackReference(subpatternId: m_pattern.m_namedGroupToParenIndex.get(k: subpatternName));
680 }
681
682 bool isValidNamedForwardReference(const String& subpatternName)
683 {
684 return !m_unmatchedNamedForwardReferences.contains(subpatternName);
685 }
686
687 void atomNamedForwardReference(const String& subpatternName)
688 {
689 if (!m_pattern.m_namedForwardReferences.contains(value: subpatternName))
690 m_pattern.m_namedForwardReferences.append(value: subpatternName);
691 m_alternative->m_terms.append(other: PatternTerm::ForwardReference());
692 }
693
694 // deep copy the argument disjunction. If filterStartsWithBOL is true,
695 // skip alternatives with m_startsWithBOL set true.
696 PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false)
697 {
698 std::unique_ptr<PatternDisjunction> newDisjunction;
699 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
700 PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
701 if (!filterStartsWithBOL || !alternative->m_startsWithBOL) {
702 if (!newDisjunction) {
703 newDisjunction = std::make_unique<PatternDisjunction>();
704 newDisjunction->m_parent = disjunction->m_parent;
705 }
706 PatternAlternative* newAlternative = newDisjunction->addNewAlternative();
707 newAlternative->m_terms.reserveInitialCapacity(size: alternative->m_terms.size());
708 for (unsigned i = 0; i < alternative->m_terms.size(); ++i)
709 newAlternative->m_terms.append(other: copyTerm(term&: alternative->m_terms[i], filterStartsWithBOL));
710 }
711 }
712
713 if (!newDisjunction)
714 return 0;
715
716 PatternDisjunction* copiedDisjunction = newDisjunction.get();
717 m_pattern.m_disjunctions.append(WTFMove(newDisjunction));
718 return copiedDisjunction;
719 }
720
721 PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false)
722 {
723 if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion))
724 return PatternTerm(term);
725
726 PatternTerm termCopy = term;
727 termCopy.parentheses.disjunction = copyDisjunction(disjunction: termCopy.parentheses.disjunction, filterStartsWithBOL);
728 m_pattern.m_hasCopiedParenSubexpressions = true;
729 return termCopy;
730 }
731
732 void quantifyAtom(unsigned min, unsigned max, bool greedy)
733 {
734 ASSERT(min <= max);
735 ASSERT(m_alternative->m_terms.size());
736
737 if (!max) {
738 m_alternative->removeLastTerm();
739 return;
740 }
741
742 PatternTerm& term = m_alternative->lastTerm();
743 ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary);
744 ASSERT(term.quantityMinCount == 1 && term.quantityMaxCount == 1 && term.quantityType == QuantifierFixedCount);
745
746 if (term.type == PatternTerm::TypeParentheticalAssertion) {
747 // If an assertion is quantified with a minimum count of zero, it can simply be removed.
748 // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never
749 // results in any input being consumed, however the continuation passed to the assertion
750 // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will
751 // reject all zero length matches (see step 2.1). A match from the continuation of the
752 // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all
753 // this is that matches from the assertion are not required, and won't be accepted anyway,
754 // so no need to ever run it.
755 if (!min)
756 m_alternative->removeLastTerm();
757 // We never need to run an assertion more than once. Subsequent interations will be run
758 // with the same start index (since assertions are non-capturing) and the same captures
759 // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the
760 // same result and captures. If the first match succeeds then the subsequent (min - 1)
761 // matches will too. Any additional optional matches will fail (on the same basis as the
762 // minimum zero quantified assertions, above), but this will still result in a match.
763 return;
764 }
765
766 if (min == max)
767 term.quantify(minCount: min, maxCount: max, type: QuantifierFixedCount);
768 else if (!min || (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions))
769 term.quantify(minCount: min, maxCount: max, type: greedy ? QuantifierGreedy : QuantifierNonGreedy);
770 else {
771 term.quantify(minCount: min, maxCount: min, type: QuantifierFixedCount);
772 m_alternative->m_terms.append(other: copyTerm(term));
773 // NOTE: this term is interesting from an analysis perspective, in that it can be ignored.....
774 m_alternative->lastTerm().quantify(count: (max == quantifyInfinite) ? max : max - min, type: greedy ? QuantifierGreedy : QuantifierNonGreedy);
775 if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern)
776 m_alternative->lastTerm().parentheses.isCopy = true;
777 }
778 }
779
780 void disjunction()
781 {
782 m_alternative = m_alternative->m_parent->addNewAlternative();
783 }
784
785 ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN
786 {
787 if (UNLIKELY(!isSafeToRecurse()))
788 return ErrorCode::TooManyDisjunctions;
789
790 ErrorCode error = ErrorCode::NoError;
791 alternative->m_hasFixedSize = true;
792 Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition;
793
794 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) {
795 PatternTerm& term = alternative->m_terms[i];
796
797 switch (term.type) {
798 case PatternTerm::TypeAssertionBOL:
799 case PatternTerm::TypeAssertionEOL:
800 case PatternTerm::TypeAssertionWordBoundary:
801 term.inputPosition = currentInputPosition.unsafeGet();
802 break;
803
804 case PatternTerm::TypeBackReference:
805 term.inputPosition = currentInputPosition.unsafeGet();
806 term.frameLocation = currentCallFrameSize;
807 currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference;
808 alternative->m_hasFixedSize = false;
809 break;
810
811 case PatternTerm::TypeForwardReference:
812 break;
813
814 case PatternTerm::TypePatternCharacter:
815 term.inputPosition = currentInputPosition.unsafeGet();
816 if (term.quantityType != QuantifierFixedCount) {
817 term.frameLocation = currentCallFrameSize;
818 currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter;
819 alternative->m_hasFixedSize = false;
820 } else if (m_pattern.unicode()) {
821 Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount;
822 tempCount *= U16_LENGTH(term.patternCharacter);
823 if (tempCount.hasOverflowed())
824 return ErrorCode::OffsetTooLarge;
825 currentInputPosition += tempCount;
826 } else
827 currentInputPosition += term.quantityMaxCount;
828 break;
829
830 case PatternTerm::TypeCharacterClass:
831 term.inputPosition = currentInputPosition.unsafeGet();
832 if (term.quantityType != QuantifierFixedCount) {
833 term.frameLocation = currentCallFrameSize;
834 currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
835 alternative->m_hasFixedSize = false;
836 } else if (m_pattern.unicode()) {
837 term.frameLocation = currentCallFrameSize;
838 currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass;
839 currentInputPosition += term.quantityMaxCount;
840 alternative->m_hasFixedSize = false;
841 } else
842 currentInputPosition += term.quantityMaxCount;
843 break;
844
845 case PatternTerm::TypeParenthesesSubpattern:
846 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own.
847 term.frameLocation = currentCallFrameSize;
848 if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) {
849 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce;
850 error = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition.unsafeGet(), callFrameSize&: currentCallFrameSize);
851 if (hasError(errorCode: error))
852 return error;
853 // If quantity is fixed, then pre-check its minimum size.
854 if (term.quantityType == QuantifierFixedCount)
855 currentInputPosition += term.parentheses.disjunction->m_minimumSize;
856 term.inputPosition = currentInputPosition.unsafeGet();
857 } else if (term.parentheses.isTerminal) {
858 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
859 error = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition.unsafeGet(), callFrameSize&: currentCallFrameSize);
860 if (hasError(errorCode: error))
861 return error;
862 term.inputPosition = currentInputPosition.unsafeGet();
863 } else {
864 term.inputPosition = currentInputPosition.unsafeGet();
865 currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses;
866 error = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition.unsafeGet(), callFrameSize&: currentCallFrameSize);
867 if (hasError(errorCode: error))
868 return error;
869 }
870 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length.
871 alternative->m_hasFixedSize = false;
872 break;
873
874 case PatternTerm::TypeParentheticalAssertion:
875 term.inputPosition = currentInputPosition.unsafeGet();
876 term.frameLocation = currentCallFrameSize;
877 error = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, initialInputPosition: currentInputPosition.unsafeGet(), callFrameSize&: currentCallFrameSize);
878 if (hasError(errorCode: error))
879 return error;
880 break;
881
882 case PatternTerm::TypeDotStarEnclosure:
883 ASSERT(!m_pattern.m_saveInitialStartValue);
884 alternative->m_hasFixedSize = false;
885 term.inputPosition = initialInputPosition;
886 m_pattern.m_initialStartValueFrameLocation = currentCallFrameSize;
887 currentCallFrameSize += YarrStackSpaceForDotStarEnclosure;
888 m_pattern.m_saveInitialStartValue = true;
889 break;
890 }
891 if (currentInputPosition.hasOverflowed())
892 return ErrorCode::OffsetTooLarge;
893 }
894
895 alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet();
896 newCallFrameSize = currentCallFrameSize;
897 return error;
898 }
899
900 ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize)
901 {
902 if (UNLIKELY(!isSafeToRecurse()))
903 return ErrorCode::TooManyDisjunctions;
904
905 if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1))
906 initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative;
907
908 unsigned minimumInputSize = UINT_MAX;
909 unsigned maximumCallFrameSize = 0;
910 bool hasFixedSize = true;
911 ErrorCode error = ErrorCode::NoError;
912
913 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) {
914 PatternAlternative* alternative = disjunction->m_alternatives[alt].get();
915 unsigned currentAlternativeCallFrameSize;
916 error = setupAlternativeOffsets(alternative, currentCallFrameSize: initialCallFrameSize, initialInputPosition, newCallFrameSize&: currentAlternativeCallFrameSize);
917 if (hasError(errorCode: error))
918 return error;
919 minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize);
920 maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize);
921 hasFixedSize &= alternative->m_hasFixedSize;
922 if (alternative->m_minimumSize > INT_MAX)
923 m_pattern.m_containsUnsignedLengthPattern = true;
924 }
925
926 ASSERT(minimumInputSize != UINT_MAX);
927 ASSERT(maximumCallFrameSize >= initialCallFrameSize);
928
929 disjunction->m_hasFixedSize = hasFixedSize;
930 disjunction->m_minimumSize = minimumInputSize;
931 disjunction->m_callFrameSize = maximumCallFrameSize;
932 callFrameSize = maximumCallFrameSize;
933 return error;
934 }
935
936 ErrorCode setupOffsets()
937 {
938 // FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314).
939 unsigned ignoredCallFrameSize;
940 return setupDisjunctionOffsets(disjunction: m_pattern.m_body, initialCallFrameSize: 0, initialInputPosition: 0, callFrameSize&: ignoredCallFrameSize);
941 }
942
943 // This optimization identifies sets of parentheses that we will never need to backtrack.
944 // In these cases we do not need to store state from prior iterations.
945 // We can presently avoid backtracking for:
946 // * where the parens are at the end of the regular expression (last term in any of the
947 // alternatives of the main body disjunction).
948 // * where the parens are non-capturing, and quantified unbounded greedy (*).
949 // * where the parens do not contain any capturing subpatterns.
950 void checkForTerminalParentheses()
951 {
952 // This check is much too crude; should be just checking whether the candidate
953 // node contains nested capturing subpatterns, not the whole expression!
954 if (m_pattern.m_numSubpatterns)
955 return;
956
957 Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
958 for (size_t i = 0; i < alternatives.size(); ++i) {
959 Vector<PatternTerm>& terms = alternatives[i]->m_terms;
960 if (terms.size()) {
961 PatternTerm& term = terms.last();
962 if (term.type == PatternTerm::TypeParenthesesSubpattern
963 && term.quantityType == QuantifierGreedy
964 && term.quantityMinCount == 0
965 && term.quantityMaxCount == quantifyInfinite
966 && !term.capture())
967 term.parentheses.isTerminal = true;
968 }
969 }
970 }
971
972 void optimizeBOL()
973 {
974 // Look for expressions containing beginning of line (^) anchoring and unroll them.
975 // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops
976 // This code relies on the parsing code tagging alternatives with m_containsBOL and
977 // m_startsWithBOL and rolling those up to containing alternatives.
978 // At this point, this is only valid for non-multiline expressions.
979 PatternDisjunction* disjunction = m_pattern.m_body;
980
981 if (!m_pattern.m_containsBOL || m_pattern.multiline())
982 return;
983
984 PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, filterStartsWithBOL: true);
985
986 // Set alternatives in disjunction to "onceThrough"
987 for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt)
988 disjunction->m_alternatives[alt]->setOnceThrough();
989
990 if (loopDisjunction) {
991 // Move alternatives from loopDisjunction to disjunction
992 for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt)
993 disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt].release());
994
995 loopDisjunction->m_alternatives.clear();
996 }
997 }
998
999 bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex)
1000 {
1001 Vector<PatternTerm>& terms = alternative->m_terms;
1002
1003 ASSERT(endIndex <= terms.size());
1004 for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) {
1005 PatternTerm& term = terms[termIndex];
1006
1007 if (term.m_capture)
1008 return true;
1009
1010 if (term.type == PatternTerm::TypeParenthesesSubpattern) {
1011 PatternDisjunction* nestedDisjunction = term.parentheses.disjunction;
1012 for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) {
1013 if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size()))
1014 return true;
1015 }
1016 }
1017 }
1018
1019 return false;
1020 }
1021
1022 // This optimization identifies alternatives in the form of
1023 // [^].*[?]<expression>.*[$] for expressions that don't have any
1024 // capturing terms. The alternative is changed to <expression>
1025 // followed by processing of the dot stars to find and adjust the
1026 // beginning and the end of the match.
1027 void optimizeDotStarWrappedExpressions()
1028 {
1029 Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives;
1030 if (alternatives.size() != 1)
1031 return;
1032
1033 CharacterClass* dotCharacterClass = m_pattern.dotAll() ? m_pattern.anyCharacterClass() : m_pattern.newlineCharacterClass();
1034 PatternAlternative* alternative = alternatives[0].get();
1035 Vector<PatternTerm>& terms = alternative->m_terms;
1036 if (terms.size() >= 3) {
1037 bool startsWithBOL = false;
1038 bool endsWithEOL = false;
1039 size_t termIndex, firstExpressionTerm;
1040
1041 termIndex = 0;
1042 if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) {
1043 startsWithBOL = true;
1044 ++termIndex;
1045 }
1046
1047 PatternTerm& firstNonAnchorTerm = terms[termIndex];
1048 if (firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass
1049 || firstNonAnchorTerm.characterClass != dotCharacterClass
1050 || firstNonAnchorTerm.quantityMinCount
1051 || firstNonAnchorTerm.quantityMaxCount != quantifyInfinite)
1052 return;
1053
1054 firstExpressionTerm = termIndex + 1;
1055
1056 termIndex = terms.size() - 1;
1057 if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) {
1058 endsWithEOL = true;
1059 --termIndex;
1060 }
1061
1062 PatternTerm& lastNonAnchorTerm = terms[termIndex];
1063 if (lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass
1064 || lastNonAnchorTerm.characterClass != dotCharacterClass
1065 || lastNonAnchorTerm.quantityType != QuantifierGreedy
1066 || lastNonAnchorTerm.quantityMinCount
1067 || lastNonAnchorTerm.quantityMaxCount != quantifyInfinite)
1068 return;
1069
1070 size_t endIndex = termIndex;
1071 if (firstExpressionTerm >= endIndex)
1072 return;
1073
1074 if (!containsCapturingTerms(alternative, firstTermIndex: firstExpressionTerm, endIndex)) {
1075 for (termIndex = terms.size() - 1; termIndex >= endIndex; --termIndex)
1076 terms.remove(termIndex);
1077
1078 for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex)
1079 terms.remove(termIndex - 1);
1080
1081 terms.append(PatternTerm(startsWithBOL, endsWithEOL));
1082
1083 m_pattern.m_containsBOL = false;
1084 }
1085 }
1086 }
1087
1088private:
1089 bool isSafeToRecurse() const
1090 {
1091 if (!m_stackLimit)
1092 return true;
1093 int8_t* curr = reinterpret_cast<int8_t*>(&curr);
1094 int8_t* limit = reinterpret_cast<int8_t*>(m_stackLimit);
1095 return curr >= limit;
1096 }
1097
1098 YarrPattern& m_pattern;
1099 PatternAlternative* m_alternative;
1100 CharacterClassConstructor m_characterClassConstructor;
1101 Vector<String> m_unmatchedNamedForwardReferences;
1102 void* m_stackLimit;
1103 bool m_invertCharacterClass;
1104 bool m_invertParentheticalAssertion { false };
1105};
1106
1107ErrorCode YarrPattern::compile(const String& patternString, void* stackLimit)
1108{
1109 YarrPatternConstructor constructor(*this, stackLimit);
1110
1111 if (m_flags == InvalidFlags)
1112 return ErrorCode::InvalidRegularExpressionFlags;
1113
1114 {
1115 ErrorCode error = parse(constructor, patternString, unicode());
1116 if (hasError(errorCode: error))
1117 return error;
1118 }
1119
1120 // If the pattern contains illegal backreferences reset & reparse.
1121 // Quoting Netscape's "What's new in JavaScript 1.2",
1122 // "Note: if the number of left parentheses is less than the number specified
1123 // in \#, the \# is taken as an octal escape as described in the next row."
1124 if (containsIllegalBackReference() || containsIllegalNamedForwardReferences()) {
1125 if (unicode())
1126 return ErrorCode::InvalidBackreference;
1127
1128 unsigned numSubpatterns = m_numSubpatterns;
1129
1130 constructor.saveUnmatchedNamedForwardReferences();
1131 constructor.resetForReparsing();
1132 ErrorCode error = parse(constructor, patternString, unicode(), numSubpatterns);
1133 ASSERT_UNUSED(error, !hasError(error));
1134 ASSERT(numSubpatterns == m_numSubpatterns);
1135 }
1136
1137 constructor.checkForTerminalParentheses();
1138 constructor.optimizeDotStarWrappedExpressions();
1139 constructor.optimizeBOL();
1140
1141 {
1142 ErrorCode error = constructor.setupOffsets();
1143 if (hasError(errorCode: error))
1144 return error;
1145 }
1146
1147 if (Options::dumpCompiledRegExpPatterns())
1148 dumpPattern(pattern: patternString);
1149
1150 return ErrorCode::NoError;
1151}
1152
1153YarrPattern::YarrPattern(const String& pattern, RegExpFlags flags, ErrorCode& error, void* stackLimit)
1154 : m_containsBackreferences(false)
1155 , m_containsBOL(false)
1156 , m_containsUnsignedLengthPattern(false)
1157 , m_hasCopiedParenSubexpressions(false)
1158 , m_saveInitialStartValue(false)
1159 , m_flags(flags)
1160{
1161 error = compile(patternString: pattern, stackLimit);
1162}
1163
1164void indentForNestingLevel(PrintStream& out, unsigned nestingDepth)
1165{
1166 out.print(" ");
1167 for (; nestingDepth; --nestingDepth)
1168 out.print(" ");
1169}
1170
1171void dumpUChar32(PrintStream& out, UChar32 c)
1172{
1173 if (c >= ' '&& c <= 0xff)
1174 out.printf(format: "'%c'", static_cast<char>(c));
1175 else
1176 out.printf(format: "0x%04x", c);
1177}
1178
1179void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* characterClass)
1180{
1181 if (characterClass == pattern->anyCharacterClass())
1182 out.print("<any character>");
1183 else if (characterClass == pattern->newlineCharacterClass())
1184 out.print("<newline>");
1185 else if (characterClass == pattern->digitsCharacterClass())
1186 out.print("<digits>");
1187 else if (characterClass == pattern->spacesCharacterClass())
1188 out.print("<whitespace>");
1189 else if (characterClass == pattern->wordcharCharacterClass())
1190 out.print("<word>");
1191 else if (characterClass == pattern->wordUnicodeIgnoreCaseCharCharacterClass())
1192 out.print("<unicode word ignore case>");
1193 else if (characterClass == pattern->nondigitsCharacterClass())
1194 out.print("<non-digits>");
1195 else if (characterClass == pattern->nonspacesCharacterClass())
1196 out.print("<non-whitespace>");
1197 else if (characterClass == pattern->nonwordcharCharacterClass())
1198 out.print("<non-word>");
1199 else if (characterClass == pattern->nonwordUnicodeIgnoreCaseCharCharacterClass())
1200 out.print("<unicode non-word ignore case>");
1201 else {
1202 bool needMatchesRangesSeperator = false;
1203
1204 auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) {
1205 size_t matchesSize = matches.size();
1206 if (matchesSize) {
1207 if (needMatchesRangesSeperator)
1208 out.print(",");
1209 needMatchesRangesSeperator = true;
1210
1211 out.print(prefix, ":(");
1212 for (size_t i = 0; i < matchesSize; ++i) {
1213 if (i)
1214 out.print(",");
1215 dumpUChar32(out, matches[i]);
1216 }
1217 out.print(")");
1218 }
1219 };
1220
1221 auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) {
1222 size_t rangeSize = ranges.size();
1223 if (rangeSize) {
1224 if (needMatchesRangesSeperator)
1225 out.print(",");
1226 needMatchesRangesSeperator = true;
1227
1228 out.print(prefix, " ranges:(");
1229 for (size_t i = 0; i < rangeSize; ++i) {
1230 if (i)
1231 out.print(",");
1232 CharacterRange range = ranges[i];
1233 out.print("(");
1234 dumpUChar32(out, c: range.begin);
1235 out.print("..");
1236 dumpUChar32(out, c: range.end);
1237 out.print(")");
1238 }
1239 out.print(")");
1240 }
1241 };
1242
1243 out.print("[");
1244 dumpMatches("ASCII", characterClass->m_matches);
1245 dumpRanges("ASCII", characterClass->m_ranges);
1246 dumpMatches("Unicode", characterClass->m_matchesUnicode);
1247 dumpRanges("Unicode", characterClass->m_rangesUnicode);
1248 out.print("]");
1249 }
1250}
1251
1252void PatternAlternative::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1253{
1254 out.print("minimum size: ", m_minimumSize);
1255 if (m_hasFixedSize)
1256 out.print(",fixed size");
1257 if (m_onceThrough)
1258 out.print(",once through");
1259 if (m_startsWithBOL)
1260 out.print(",starts with ^");
1261 if (m_containsBOL)
1262 out.print(",contains ^");
1263 out.print("\n");
1264
1265 for (size_t i = 0; i < m_terms.size(); ++i)
1266 m_terms[i].dump(out, thisPattern, nestingDepth);
1267}
1268
1269void PatternTerm::dumpQuantifier(PrintStream& out)
1270{
1271 if (quantityType == QuantifierFixedCount && quantityMinCount == 1 && quantityMaxCount == 1)
1272 return;
1273 out.print(" {", quantityMinCount.unsafeGet());
1274 if (quantityMinCount != quantityMaxCount) {
1275 if (quantityMaxCount == UINT_MAX)
1276 out.print(",...");
1277 else
1278 out.print(",", quantityMaxCount.unsafeGet());
1279 }
1280 out.print("}");
1281 if (quantityType == QuantifierGreedy)
1282 out.print(" greedy");
1283 else if (quantityType == QuantifierNonGreedy)
1284 out.print(" non-greedy");
1285}
1286
1287void PatternTerm::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth)
1288{
1289 indentForNestingLevel(out, nestingDepth);
1290
1291 if (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion) {
1292 if (invert())
1293 out.print("not ");
1294 }
1295
1296 switch (type) {
1297 case TypeAssertionBOL:
1298 out.println("BOL");
1299 break;
1300 case TypeAssertionEOL:
1301 out.println("EOL");
1302 break;
1303 case TypeAssertionWordBoundary:
1304 out.println("word boundary");
1305 break;
1306 case TypePatternCharacter:
1307 out.printf(format: "character ");
1308 out.printf(format: "inputPosition %u ", inputPosition);
1309 if (thisPattern->ignoreCase() && isASCIIAlpha(patternCharacter)) {
1310 dumpUChar32(out, toASCIIUpper(patternCharacter));
1311 out.print("/");
1312 dumpUChar32(out, toASCIILower(patternCharacter));
1313 } else
1314 dumpUChar32(out, c: patternCharacter);
1315 dumpQuantifier(out);
1316 if (quantityType != QuantifierFixedCount)
1317 out.print(",frame location ", frameLocation);
1318 out.println();
1319 break;
1320 case TypeCharacterClass:
1321 out.print("character class ");
1322 dumpCharacterClass(out, pattern: thisPattern, characterClass: characterClass);
1323 dumpQuantifier(out);
1324 if (quantityType != QuantifierFixedCount || thisPattern->unicode())
1325 out.print(",frame location ", frameLocation);
1326 out.println();
1327 break;
1328 case TypeBackReference:
1329 out.print("back reference to subpattern #", backReferenceSubpatternId);
1330 out.println(",frame location ", frameLocation);
1331 break;
1332 case TypeForwardReference:
1333 out.println("forward reference");
1334 break;
1335 case TypeParenthesesSubpattern:
1336 if (m_capture)
1337 out.print("captured ");
1338 else
1339 out.print("non-captured ");
1340
1341 FALLTHROUGH;
1342 case TypeParentheticalAssertion:
1343 if (m_invert)
1344 out.print("inverted ");
1345
1346 if (type == TypeParenthesesSubpattern)
1347 out.print("subpattern");
1348 else if (type == TypeParentheticalAssertion)
1349 out.print("assertion");
1350
1351 if (m_capture)
1352 out.print(" #", parentheses.subpatternId);
1353
1354 dumpQuantifier(out);
1355
1356 if (parentheses.isCopy)
1357 out.print(",copy");
1358
1359 if (parentheses.isTerminal)
1360 out.print(",terminal");
1361
1362 out.println(",frame location ", frameLocation);
1363
1364 if (parentheses.disjunction->m_alternatives.size() > 1) {
1365 indentForNestingLevel(out, nestingDepth: nestingDepth + 1);
1366 unsigned alternativeFrameLocation = frameLocation;
1367 if (quantityMaxCount == 1 && !parentheses.isCopy)
1368 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce;
1369 else if (parentheses.isTerminal)
1370 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesTerminal;
1371 else
1372 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParentheses;
1373 out.println("alternative list,frame location ", alternativeFrameLocation);
1374 }
1375
1376 parentheses.disjunction->dump(out, thisPattern, nestingDepth + 1);
1377 break;
1378 case TypeDotStarEnclosure:
1379 out.println(".* enclosure,frame location ", thisPattern->m_initialStartValueFrameLocation);
1380 break;
1381 }
1382}
1383
1384void PatternDisjunction::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth = 0)
1385{
1386 unsigned alternativeCount = m_alternatives.size();
1387 for (unsigned i = 0; i < alternativeCount; ++i) {
1388 indentForNestingLevel(out, nestingDepth);
1389 if (alternativeCount > 1)
1390 out.print("alternative #", i, ": ");
1391 m_alternatives[i].get()->dump(out, thisPattern, nestingDepth + (alternativeCount > 1));
1392 }
1393}
1394
1395void YarrPattern::dumpPatternString(PrintStream& out, const String& patternString)
1396{
1397 out.print("/", patternString, "/");
1398
1399 if (global())
1400 out.print("g");
1401 if (ignoreCase())
1402 out.print("i");
1403 if (multiline())
1404 out.print("m");
1405 if (unicode())
1406 out.print("u");
1407 if (sticky())
1408 out.print("y");
1409}
1410
1411void YarrPattern::dumpPattern(const String& patternString)
1412{
1413 dumpPattern(out&: WTF::dataFile(), pattern: patternString);
1414}
1415
1416void YarrPattern::dumpPattern(PrintStream& out, const String& patternString)
1417{
1418 out.print("RegExp pattern for ");
1419 dumpPatternString(out, patternString);
1420
1421 if (m_flags != NoFlags) {
1422 bool printSeperator = false;
1423 out.print(" (");
1424 if (global()) {
1425 out.print("global");
1426 printSeperator = true;
1427 }
1428 if (ignoreCase()) {
1429 if (printSeperator)
1430 out.print("|");
1431 out.print("ignore case");
1432 printSeperator = true;
1433 }
1434 if (multiline()) {
1435 if (printSeperator)
1436 out.print("|");
1437 out.print("multiline");
1438 printSeperator = true;
1439 }
1440 if (unicode()) {
1441 if (printSeperator)
1442 out.print("|");
1443 out.print("unicode");
1444 printSeperator = true;
1445 }
1446 if (sticky()) {
1447 if (printSeperator)
1448 out.print("|");
1449 out.print("sticky");
1450 printSeperator = true;
1451 }
1452 out.print(")");
1453 }
1454 out.print(":\n");
1455 if (m_body->m_callFrameSize)
1456 out.print(" callframe size: ", m_body->m_callFrameSize, "\n");
1457 m_body->dump(out, thisPattern: this);
1458}
1459
1460std::unique_ptr<CharacterClass> anycharCreate()
1461{
1462 auto characterClass = std::make_unique<CharacterClass>();
1463 characterClass->m_ranges.append(CharacterRange(0x00, 0x7f));
1464 characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff));
1465 characterClass->m_hasNonBMPCharacters = true;
1466 characterClass->m_anyCharacter = true;
1467 return characterClass;
1468}
1469
1470} } // namespace JSC::Yarr
1471

source code of qtdeclarative/src/3rdparty/masm/yarr/YarrPattern.cpp