1/*
2 * Copyright (C) 2009, 2013-2017 Apple Inc. All rights reserved.
3 * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#pragma once
28
29#include "RegExpKey.h"
30#include "YarrErrorCode.h"
31#include "YarrUnicodeProperties.h"
32#include <wtf/CheckedArithmetic.h>
33#include <wtf/HashMap.h>
34#include <wtf/Optional.h>
35#include <wtf/PrintStream.h>
36#include <wtf/Vector.h>
37#include <wtf/text/WTFString.h>
38
39namespace JSC { namespace Yarr {
40
41struct YarrPattern;
42struct PatternDisjunction;
43
44struct CharacterRange {
45 UChar32 begin { 0 };
46 UChar32 end { 0x10ffff };
47
48 CharacterRange(UChar32 begin, UChar32 end)
49 : begin(begin)
50 , end(end)
51 {
52 }
53};
54
55struct CharacterClass {
56 WTF_MAKE_FAST_ALLOCATED;
57public:
58 // All CharacterClass instances have to have the full set of matches and ranges,
59 // they may have an optional m_table for faster lookups (which must match the
60 // specified matches and ranges)
61 CharacterClass()
62 : m_table(0)
63 , m_hasNonBMPCharacters(false)
64 , m_anyCharacter(false)
65 {
66 }
67 CharacterClass(const char* table, bool inverted)
68 : m_table(table)
69 , m_tableInverted(inverted)
70 , m_hasNonBMPCharacters(false)
71 , m_anyCharacter(false)
72 {
73 }
74 CharacterClass(std::initializer_list<UChar32> matches, std::initializer_list<CharacterRange> ranges, std::initializer_list<UChar32> matchesUnicode, std::initializer_list<CharacterRange> rangesUnicode)
75 : m_matches(matches)
76 , m_ranges(ranges)
77 , m_matchesUnicode(matchesUnicode)
78 , m_rangesUnicode(rangesUnicode)
79 , m_table(0)
80 , m_tableInverted(false)
81 , m_hasNonBMPCharacters(false)
82 , m_anyCharacter(false)
83 {
84 }
85
86 Vector<UChar32> m_matches;
87 Vector<CharacterRange> m_ranges;
88 Vector<UChar32> m_matchesUnicode;
89 Vector<CharacterRange> m_rangesUnicode;
90
91 const char* m_table;
92 bool m_tableInverted : 1;
93 bool m_hasNonBMPCharacters : 1;
94 bool m_anyCharacter : 1;
95};
96
97enum QuantifierType {
98 QuantifierFixedCount,
99 QuantifierGreedy,
100 QuantifierNonGreedy,
101};
102
103struct PatternTerm {
104 enum Type {
105 TypeAssertionBOL,
106 TypeAssertionEOL,
107 TypeAssertionWordBoundary,
108 TypePatternCharacter,
109 TypeCharacterClass,
110 TypeBackReference,
111 TypeForwardReference,
112 TypeParenthesesSubpattern,
113 TypeParentheticalAssertion,
114 TypeDotStarEnclosure,
115 } type;
116 bool m_capture :1;
117 bool m_invert :1;
118 union {
119 UChar32 patternCharacter;
120 CharacterClass* characterClass;
121 unsigned backReferenceSubpatternId;
122 struct {
123 PatternDisjunction* disjunction;
124 unsigned subpatternId;
125 unsigned lastSubpatternId;
126 bool isCopy;
127 bool isTerminal;
128 } parentheses;
129 struct {
130 bool bolAnchor : 1;
131 bool eolAnchor : 1;
132 } anchors;
133 };
134 QuantifierType quantityType;
135 Checked<unsigned> quantityMinCount;
136 Checked<unsigned> quantityMaxCount;
137 unsigned inputPosition;
138 unsigned frameLocation;
139
140 PatternTerm(UChar32 ch)
141 : type(PatternTerm::TypePatternCharacter)
142 , m_capture(false)
143 , m_invert(false)
144 {
145 patternCharacter = ch;
146 quantityType = QuantifierFixedCount;
147 quantityMinCount = quantityMaxCount = 1;
148 }
149
150 PatternTerm(CharacterClass* charClass, bool invert)
151 : type(PatternTerm::TypeCharacterClass)
152 , m_capture(false)
153 , m_invert(invert)
154 {
155 characterClass = charClass;
156 quantityType = QuantifierFixedCount;
157 quantityMinCount = quantityMaxCount = 1;
158 }
159
160 PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool capture = false, bool invert = false)
161 : type(type)
162 , m_capture(capture)
163 , m_invert(invert)
164 {
165 parentheses.disjunction = disjunction;
166 parentheses.subpatternId = subpatternId;
167 parentheses.isCopy = false;
168 parentheses.isTerminal = false;
169 quantityType = QuantifierFixedCount;
170 quantityMinCount = quantityMaxCount = 1;
171 }
172
173 PatternTerm(Type type, bool invert = false)
174 : type(type)
175 , m_capture(false)
176 , m_invert(invert)
177 {
178 quantityType = QuantifierFixedCount;
179 quantityMinCount = quantityMaxCount = 1;
180 }
181
182 PatternTerm(unsigned spatternId)
183 : type(TypeBackReference)
184 , m_capture(false)
185 , m_invert(false)
186 {
187 backReferenceSubpatternId = spatternId;
188 quantityType = QuantifierFixedCount;
189 quantityMinCount = quantityMaxCount = 1;
190 }
191
192 PatternTerm(bool bolAnchor, bool eolAnchor)
193 : type(TypeDotStarEnclosure)
194 , m_capture(false)
195 , m_invert(false)
196 {
197 anchors.bolAnchor = bolAnchor;
198 anchors.eolAnchor = eolAnchor;
199 quantityType = QuantifierFixedCount;
200 quantityMinCount = quantityMaxCount = 1;
201 }
202
203 static PatternTerm ForwardReference()
204 {
205 return PatternTerm(TypeForwardReference);
206 }
207
208 static PatternTerm BOL()
209 {
210 return PatternTerm(TypeAssertionBOL);
211 }
212
213 static PatternTerm EOL()
214 {
215 return PatternTerm(TypeAssertionEOL);
216 }
217
218 static PatternTerm WordBoundary(bool invert)
219 {
220 return PatternTerm(TypeAssertionWordBoundary, invert);
221 }
222
223 bool invert()
224 {
225 return m_invert;
226 }
227
228 bool capture()
229 {
230 return m_capture;
231 }
232
233 bool containsAnyCaptures()
234 {
235 ASSERT(this->type == TypeParenthesesSubpattern);
236 return parentheses.lastSubpatternId >= parentheses.subpatternId;
237 }
238
239 void quantify(unsigned count, QuantifierType type)
240 {
241 quantityMinCount = 0;
242 quantityMaxCount = count;
243 quantityType = type;
244 }
245
246 void quantify(unsigned minCount, unsigned maxCount, QuantifierType type)
247 {
248 // Currently only Parentheses can specify a non-zero min with a different max.
249 ASSERT(this->type == TypeParenthesesSubpattern || !minCount || minCount == maxCount);
250 ASSERT(minCount <= maxCount);
251 quantityMinCount = minCount;
252 quantityMaxCount = maxCount;
253 quantityType = type;
254 }
255
256 void dumpQuantifier(PrintStream&);
257 void dump(PrintStream&, YarrPattern*, unsigned);
258};
259
260struct PatternAlternative {
261 WTF_MAKE_FAST_ALLOCATED;
262public:
263 PatternAlternative(PatternDisjunction* disjunction)
264 : m_parent(disjunction)
265 , m_onceThrough(false)
266 , m_hasFixedSize(false)
267 , m_startsWithBOL(false)
268 , m_containsBOL(false)
269 {
270 }
271
272 PatternTerm& lastTerm()
273 {
274 ASSERT(m_terms.size());
275 return m_terms[m_terms.size() - 1];
276 }
277
278 void removeLastTerm()
279 {
280 ASSERT(m_terms.size());
281 m_terms.shrink(size: m_terms.size() - 1);
282 }
283
284 void setOnceThrough()
285 {
286 m_onceThrough = true;
287 }
288
289 bool onceThrough()
290 {
291 return m_onceThrough;
292 }
293
294 void dump(PrintStream&, YarrPattern*, unsigned);
295
296 Vector<PatternTerm> m_terms;
297 PatternDisjunction* m_parent;
298 unsigned m_minimumSize;
299 bool m_onceThrough : 1;
300 bool m_hasFixedSize : 1;
301 bool m_startsWithBOL : 1;
302 bool m_containsBOL : 1;
303};
304
305struct PatternDisjunction {
306 WTF_MAKE_FAST_ALLOCATED;
307public:
308 PatternDisjunction(PatternAlternative *parent = nullptr)
309 : m_parent(parent)
310 , m_hasFixedSize(false)
311 {
312 }
313
314 PatternAlternative* addNewAlternative()
315 {
316 m_alternatives.append(other: std::make_unique<PatternAlternative>(args: this));
317 return static_cast<PatternAlternative*>(m_alternatives.last().get());
318 }
319
320 void dump(PrintStream&, YarrPattern*, unsigned);
321
322 Vector<std::unique_ptr<PatternAlternative>> m_alternatives;
323 PatternAlternative* m_parent;
324 unsigned m_minimumSize;
325 unsigned m_callFrameSize;
326 bool m_hasFixedSize;
327};
328
329// You probably don't want to be calling these functions directly
330// (please to be calling newlineCharacterClass() et al on your
331// friendly neighborhood YarrPattern instance to get nicely
332// cached copies).
333
334std::unique_ptr<CharacterClass> anycharCreate();
335std::unique_ptr<CharacterClass> newlineCreate();
336std::unique_ptr<CharacterClass> digitsCreate();
337std::unique_ptr<CharacterClass> spacesCreate();
338std::unique_ptr<CharacterClass> wordcharCreate();
339std::unique_ptr<CharacterClass> wordUnicodeIgnoreCaseCharCreate();
340std::unique_ptr<CharacterClass> nondigitsCreate();
341std::unique_ptr<CharacterClass> nonspacesCreate();
342std::unique_ptr<CharacterClass> nonwordcharCreate();
343std::unique_ptr<CharacterClass> nonwordUnicodeIgnoreCaseCharCreate();
344
345struct TermChain {
346 TermChain(PatternTerm term)
347 : term(term)
348 {}
349
350 PatternTerm term;
351 Vector<TermChain> hotTerms;
352};
353
354
355struct YarrPattern {
356 JS_EXPORT_PRIVATE YarrPattern(const String& pattern, RegExpFlags, ErrorCode&, void* stackLimit = nullptr);
357
358 void resetForReparsing()
359 {
360 m_numSubpatterns = 0;
361 m_maxBackReference = 0;
362 m_initialStartValueFrameLocation = 0;
363
364 m_containsBackreferences = false;
365 m_containsBOL = false;
366 m_containsUnsignedLengthPattern = false;
367 m_hasCopiedParenSubexpressions = false;
368 m_saveInitialStartValue = false;
369
370 anycharCached = nullptr;
371 newlineCached = nullptr;
372 digitsCached = nullptr;
373 spacesCached = nullptr;
374 wordcharCached = nullptr;
375 wordUnicodeIgnoreCaseCharCached = nullptr;
376 nondigitsCached = nullptr;
377 nonspacesCached = nullptr;
378 nonwordcharCached = nullptr;
379 nonwordUnicodeIgnoreCasecharCached = nullptr;
380 unicodePropertiesCached.clear();
381
382 m_disjunctions.clear();
383 m_userCharacterClasses.clear();
384 m_captureGroupNames.shrink(size: 0);
385 m_namedForwardReferences.shrink(size: 0);
386 }
387
388 bool containsIllegalBackReference()
389 {
390 return m_maxBackReference > m_numSubpatterns;
391 }
392
393 bool containsIllegalNamedForwardReferences()
394 {
395 if (m_namedForwardReferences.isEmpty())
396 return false;
397
398 for (auto& entry : m_namedForwardReferences) {
399 if (m_captureGroupNames.contains(value: entry))
400 return true;
401 }
402
403 return false;
404 }
405
406 bool containsUnsignedLengthPattern()
407 {
408 return m_containsUnsignedLengthPattern;
409 }
410
411 CharacterClass* anyCharacterClass()
412 {
413 if (!anycharCached) {
414 m_userCharacterClasses.append(other: anycharCreate());
415 anycharCached = m_userCharacterClasses.last().get();
416 }
417 return anycharCached;
418 }
419 CharacterClass* newlineCharacterClass()
420 {
421 if (!newlineCached) {
422 m_userCharacterClasses.append(other: newlineCreate());
423 newlineCached = m_userCharacterClasses.last().get();
424 }
425 return newlineCached;
426 }
427 CharacterClass* digitsCharacterClass()
428 {
429 if (!digitsCached) {
430 m_userCharacterClasses.append(other: digitsCreate());
431 digitsCached = m_userCharacterClasses.last().get();
432 }
433 return digitsCached;
434 }
435 CharacterClass* spacesCharacterClass()
436 {
437 if (!spacesCached) {
438 m_userCharacterClasses.append(other: spacesCreate());
439 spacesCached = m_userCharacterClasses.last().get();
440 }
441 return spacesCached;
442 }
443 CharacterClass* wordcharCharacterClass()
444 {
445 if (!wordcharCached) {
446 m_userCharacterClasses.append(other: wordcharCreate());
447 wordcharCached = m_userCharacterClasses.last().get();
448 }
449 return wordcharCached;
450 }
451 CharacterClass* wordUnicodeIgnoreCaseCharCharacterClass()
452 {
453 if (!wordUnicodeIgnoreCaseCharCached) {
454 m_userCharacterClasses.append(other: wordUnicodeIgnoreCaseCharCreate());
455 wordUnicodeIgnoreCaseCharCached = m_userCharacterClasses.last().get();
456 }
457 return wordUnicodeIgnoreCaseCharCached;
458 }
459 CharacterClass* nondigitsCharacterClass()
460 {
461 if (!nondigitsCached) {
462 m_userCharacterClasses.append(other: nondigitsCreate());
463 nondigitsCached = m_userCharacterClasses.last().get();
464 }
465 return nondigitsCached;
466 }
467 CharacterClass* nonspacesCharacterClass()
468 {
469 if (!nonspacesCached) {
470 m_userCharacterClasses.append(other: nonspacesCreate());
471 nonspacesCached = m_userCharacterClasses.last().get();
472 }
473 return nonspacesCached;
474 }
475 CharacterClass* nonwordcharCharacterClass()
476 {
477 if (!nonwordcharCached) {
478 m_userCharacterClasses.append(other: nonwordcharCreate());
479 nonwordcharCached = m_userCharacterClasses.last().get();
480 }
481 return nonwordcharCached;
482 }
483 CharacterClass* nonwordUnicodeIgnoreCaseCharCharacterClass()
484 {
485 if (!nonwordUnicodeIgnoreCasecharCached) {
486 m_userCharacterClasses.append(other: nonwordUnicodeIgnoreCaseCharCreate());
487 nonwordUnicodeIgnoreCasecharCached = m_userCharacterClasses.last().get();
488 }
489 return nonwordUnicodeIgnoreCasecharCached;
490 }
491 CharacterClass* unicodeCharacterClassFor(BuiltInCharacterClassID unicodeClassID)
492 {
493 ASSERT(unicodeClassID >= BuiltInCharacterClassID::BaseUnicodePropertyID);
494
495 unsigned classID = static_cast<unsigned>(unicodeClassID);
496
497 if (unicodePropertiesCached.find(key: classID) == unicodePropertiesCached.end()) {
498 m_userCharacterClasses.append(other: createUnicodeCharacterClassFor(unicodeClassID));
499 CharacterClass* result = m_userCharacterClasses.last().get();
500 unicodePropertiesCached.add(k: classID, v: result);
501 return result;
502 }
503
504 return unicodePropertiesCached.get(k: classID);
505 }
506
507 void dumpPatternString(PrintStream& out, const String& patternString);
508 void dumpPattern(const String& pattern);
509 void dumpPattern(PrintStream& out, const String& pattern);
510
511 bool global() const { return m_flags & FlagGlobal; }
512 bool ignoreCase() const { return m_flags & FlagIgnoreCase; }
513 bool multiline() const { return m_flags & FlagMultiline; }
514 bool sticky() const { return m_flags & FlagSticky; }
515 bool unicode() const { return m_flags & FlagUnicode; }
516 bool dotAll() const { return m_flags & FlagDotAll; }
517
518 bool m_containsBackreferences : 1;
519 bool m_containsBOL : 1;
520 bool m_containsUnsignedLengthPattern : 1;
521 bool m_hasCopiedParenSubexpressions : 1;
522 bool m_saveInitialStartValue : 1;
523 RegExpFlags m_flags;
524 unsigned m_numSubpatterns { 0 };
525 unsigned m_maxBackReference { 0 };
526 unsigned m_initialStartValueFrameLocation { 0 };
527 PatternDisjunction* m_body;
528 Vector<std::unique_ptr<PatternDisjunction>, 4> m_disjunctions;
529 Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses;
530 Vector<String> m_captureGroupNames;
531 Vector<String> m_namedForwardReferences;
532 HashMap<String, unsigned> m_namedGroupToParenIndex;
533
534private:
535 ErrorCode compile(const String& patternString, void* stackLimit);
536
537 CharacterClass* anycharCached { nullptr };
538 CharacterClass* newlineCached { nullptr };
539 CharacterClass* digitsCached { nullptr };
540 CharacterClass* spacesCached { nullptr };
541 CharacterClass* wordcharCached { nullptr };
542 CharacterClass* wordUnicodeIgnoreCaseCharCached { nullptr };
543 CharacterClass* nondigitsCached { nullptr };
544 CharacterClass* nonspacesCached { nullptr };
545 CharacterClass* nonwordcharCached { nullptr };
546 CharacterClass* nonwordUnicodeIgnoreCasecharCached { nullptr };
547 HashMap<unsigned, CharacterClass*> unicodePropertiesCached;
548};
549
550 void indentForNestingLevel(PrintStream&, unsigned);
551 void dumpUChar32(PrintStream&, UChar32);
552 void dumpCharacterClass(PrintStream&, YarrPattern*, CharacterClass*);
553
554 struct BackTrackInfoPatternCharacter {
555 uintptr_t begin; // Only needed for unicode patterns
556 uintptr_t matchAmount;
557
558 static unsigned beginIndex() { return offsetof(BackTrackInfoPatternCharacter, begin) / sizeof(uintptr_t); }
559 static unsigned matchAmountIndex() { return offsetof(BackTrackInfoPatternCharacter, matchAmount) / sizeof(uintptr_t); }
560 };
561
562 struct BackTrackInfoCharacterClass {
563 uintptr_t begin; // Only needed for unicode patterns
564 uintptr_t matchAmount;
565
566 static unsigned beginIndex() { return offsetof(BackTrackInfoCharacterClass, begin) / sizeof(uintptr_t); }
567 static unsigned matchAmountIndex() { return offsetof(BackTrackInfoCharacterClass, matchAmount) / sizeof(uintptr_t); }
568 };
569
570 struct BackTrackInfoBackReference {
571 uintptr_t begin; // Not really needed for greedy quantifiers.
572 uintptr_t matchAmount; // Not really needed for fixed quantifiers.
573
574 static unsigned beginIndex() { return offsetof(BackTrackInfoBackReference, begin) / sizeof(uintptr_t); }
575 static unsigned matchAmountIndex() { return offsetof(BackTrackInfoBackReference, matchAmount) / sizeof(uintptr_t); }
576 };
577
578 struct BackTrackInfoAlternative {
579 union {
580 uintptr_t offset;
581 };
582 };
583
584 struct BackTrackInfoParentheticalAssertion {
585 uintptr_t begin;
586
587 static unsigned beginIndex() { return offsetof(BackTrackInfoParentheticalAssertion, begin) / sizeof(uintptr_t); }
588 };
589
590 struct BackTrackInfoParenthesesOnce {
591 uintptr_t begin;
592 uintptr_t returnAddress;
593
594 static unsigned beginIndex() { return offsetof(BackTrackInfoParenthesesOnce, begin) / sizeof(uintptr_t); }
595 static unsigned returnAddressIndex() { return offsetof(BackTrackInfoParenthesesOnce, returnAddress) / sizeof(uintptr_t); }
596 };
597
598 struct BackTrackInfoParenthesesTerminal {
599 uintptr_t begin;
600
601 static unsigned beginIndex() { return offsetof(BackTrackInfoParenthesesTerminal, begin) / sizeof(uintptr_t); }
602 };
603
604 struct BackTrackInfoParentheses {
605 uintptr_t begin;
606 uintptr_t returnAddress;
607 uintptr_t matchAmount;
608 uintptr_t parenContextHead;
609
610 static unsigned beginIndex() { return offsetof(BackTrackInfoParentheses, begin) / sizeof(uintptr_t); }
611 static unsigned returnAddressIndex() { return offsetof(BackTrackInfoParentheses, returnAddress) / sizeof(uintptr_t); }
612 static unsigned matchAmountIndex() { return offsetof(BackTrackInfoParentheses, matchAmount) / sizeof(uintptr_t); }
613 static unsigned parenContextHeadIndex() { return offsetof(BackTrackInfoParentheses, parenContextHead) / sizeof(uintptr_t); }
614 };
615
616} } // namespace JSC::Yarr
617

source code of qtdeclarative/src/3rdparty/masm/yarr/YarrPattern.h