1 | /* |
2 | * Copyright (C) 2009, 2010-2012, 2014, 2016 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #pragma once |
27 | |
28 | #include "ConcurrentJSLock.h" |
29 | #include "YarrPattern.h" |
30 | |
// Forward declaration only: the declarations below refer to
// BumpPointerAllocator solely through pointers, so the full definition is not
// required here. The using-declaration makes the unqualified name available.
namespace WTF {
class BumpPointerAllocator;
}
using WTF::BumpPointerAllocator;
35 | |
36 | namespace JSC { namespace Yarr { |
37 | |
38 | class ByteDisjunction; |
39 | |
40 | struct ByteTerm { |
41 | enum Type { |
42 | TypeBodyAlternativeBegin, |
43 | TypeBodyAlternativeDisjunction, |
44 | TypeBodyAlternativeEnd, |
45 | TypeAlternativeBegin, |
46 | TypeAlternativeDisjunction, |
47 | TypeAlternativeEnd, |
48 | TypeSubpatternBegin, |
49 | TypeSubpatternEnd, |
50 | TypeAssertionBOL, |
51 | TypeAssertionEOL, |
52 | TypeAssertionWordBoundary, |
53 | TypePatternCharacterOnce, |
54 | TypePatternCharacterFixed, |
55 | TypePatternCharacterGreedy, |
56 | TypePatternCharacterNonGreedy, |
57 | TypePatternCasedCharacterOnce, |
58 | TypePatternCasedCharacterFixed, |
59 | TypePatternCasedCharacterGreedy, |
60 | TypePatternCasedCharacterNonGreedy, |
61 | TypeCharacterClass, |
62 | TypeBackReference, |
63 | TypeParenthesesSubpattern, |
64 | TypeParenthesesSubpatternOnceBegin, |
65 | TypeParenthesesSubpatternOnceEnd, |
66 | TypeParenthesesSubpatternTerminalBegin, |
67 | TypeParenthesesSubpatternTerminalEnd, |
68 | TypeParentheticalAssertionBegin, |
69 | TypeParentheticalAssertionEnd, |
70 | TypeCheckInput, |
71 | TypeUncheckInput, |
72 | TypeDotStarEnclosure, |
73 | } type; |
74 | union { |
75 | struct { |
76 | union { |
77 | UChar32 patternCharacter; |
78 | struct { |
79 | UChar32 lo; |
80 | UChar32 hi; |
81 | } casedCharacter; |
82 | CharacterClass* characterClass; |
83 | unsigned subpatternId; |
84 | }; |
85 | union { |
86 | ByteDisjunction* parenthesesDisjunction; |
87 | unsigned parenthesesWidth; |
88 | }; |
89 | QuantifierType quantityType; |
90 | unsigned quantityMinCount; |
91 | unsigned quantityMaxCount; |
92 | } atom; |
93 | struct { |
94 | int next; |
95 | int end; |
96 | bool onceThrough; |
97 | } alternative; |
98 | struct { |
99 | bool m_bol : 1; |
100 | bool m_eol : 1; |
101 | } anchors; |
102 | unsigned checkInputCount; |
103 | }; |
104 | unsigned frameLocation; |
105 | bool m_capture : 1; |
106 | bool m_invert : 1; |
107 | unsigned inputPosition; |
108 | |
109 | ByteTerm(UChar32 ch, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) |
110 | : frameLocation(frameLocation) |
111 | , m_capture(false) |
112 | , m_invert(false) |
113 | { |
114 | atom.patternCharacter = ch; |
115 | atom.quantityType = quantityType; |
116 | atom.quantityMinCount = quantityCount.unsafeGet(); |
117 | atom.quantityMaxCount = quantityCount.unsafeGet(); |
118 | inputPosition = inputPos; |
119 | |
120 | switch (quantityType) { |
121 | case QuantifierFixedCount: |
122 | type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; |
123 | break; |
124 | case QuantifierGreedy: |
125 | type = ByteTerm::TypePatternCharacterGreedy; |
126 | break; |
127 | case QuantifierNonGreedy: |
128 | type = ByteTerm::TypePatternCharacterNonGreedy; |
129 | break; |
130 | } |
131 | } |
132 | |
133 | ByteTerm(UChar32 lo, UChar32 hi, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) |
134 | : frameLocation(frameLocation) |
135 | , m_capture(false) |
136 | , m_invert(false) |
137 | { |
138 | switch (quantityType) { |
139 | case QuantifierFixedCount: |
140 | type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; |
141 | break; |
142 | case QuantifierGreedy: |
143 | type = ByteTerm::TypePatternCasedCharacterGreedy; |
144 | break; |
145 | case QuantifierNonGreedy: |
146 | type = ByteTerm::TypePatternCasedCharacterNonGreedy; |
147 | break; |
148 | } |
149 | |
150 | atom.casedCharacter.lo = lo; |
151 | atom.casedCharacter.hi = hi; |
152 | atom.quantityType = quantityType; |
153 | atom.quantityMinCount = quantityCount.unsafeGet(); |
154 | atom.quantityMaxCount = quantityCount.unsafeGet(); |
155 | inputPosition = inputPos; |
156 | } |
157 | |
158 | ByteTerm(CharacterClass* characterClass, bool invert, unsigned inputPos) |
159 | : type(ByteTerm::TypeCharacterClass) |
160 | , m_capture(false) |
161 | , m_invert(invert) |
162 | { |
163 | atom.characterClass = characterClass; |
164 | atom.quantityType = QuantifierFixedCount; |
165 | atom.quantityMinCount = 1; |
166 | atom.quantityMaxCount = 1; |
167 | inputPosition = inputPos; |
168 | } |
169 | |
170 | ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, unsigned inputPos) |
171 | : type(type) |
172 | , m_capture(capture) |
173 | , m_invert(false) |
174 | { |
175 | atom.subpatternId = subpatternId; |
176 | atom.parenthesesDisjunction = parenthesesInfo; |
177 | atom.quantityType = QuantifierFixedCount; |
178 | atom.quantityMinCount = 1; |
179 | atom.quantityMaxCount = 1; |
180 | inputPosition = inputPos; |
181 | } |
182 | |
183 | ByteTerm(Type type, bool invert = false) |
184 | : type(type) |
185 | , m_capture(false) |
186 | , m_invert(invert) |
187 | { |
188 | atom.quantityType = QuantifierFixedCount; |
189 | atom.quantityMinCount = 1; |
190 | atom.quantityMaxCount = 1; |
191 | } |
192 | |
193 | ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, unsigned inputPos) |
194 | : type(type) |
195 | , m_capture(capture) |
196 | , m_invert(invert) |
197 | { |
198 | atom.subpatternId = subpatternId; |
199 | atom.quantityType = QuantifierFixedCount; |
200 | atom.quantityMinCount = 1; |
201 | atom.quantityMaxCount = 1; |
202 | inputPosition = inputPos; |
203 | } |
204 | |
205 | static ByteTerm BOL(unsigned inputPos) |
206 | { |
207 | ByteTerm term(TypeAssertionBOL); |
208 | term.inputPosition = inputPos; |
209 | return term; |
210 | } |
211 | |
212 | static ByteTerm CheckInput(Checked<unsigned> count) |
213 | { |
214 | ByteTerm term(TypeCheckInput); |
215 | term.checkInputCount = count.unsafeGet(); |
216 | return term; |
217 | } |
218 | |
219 | static ByteTerm UncheckInput(Checked<unsigned> count) |
220 | { |
221 | ByteTerm term(TypeUncheckInput); |
222 | term.checkInputCount = count.unsafeGet(); |
223 | return term; |
224 | } |
225 | |
226 | static ByteTerm EOL(unsigned inputPos) |
227 | { |
228 | ByteTerm term(TypeAssertionEOL); |
229 | term.inputPosition = inputPos; |
230 | return term; |
231 | } |
232 | |
233 | static ByteTerm WordBoundary(bool invert, unsigned inputPos) |
234 | { |
235 | ByteTerm term(TypeAssertionWordBoundary, invert); |
236 | term.inputPosition = inputPos; |
237 | return term; |
238 | } |
239 | |
240 | static ByteTerm BackReference(unsigned subpatternId, unsigned inputPos) |
241 | { |
242 | return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); |
243 | } |
244 | |
245 | static ByteTerm BodyAlternativeBegin(bool onceThrough) |
246 | { |
247 | ByteTerm term(TypeBodyAlternativeBegin); |
248 | term.alternative.next = 0; |
249 | term.alternative.end = 0; |
250 | term.alternative.onceThrough = onceThrough; |
251 | return term; |
252 | } |
253 | |
254 | static ByteTerm BodyAlternativeDisjunction(bool onceThrough) |
255 | { |
256 | ByteTerm term(TypeBodyAlternativeDisjunction); |
257 | term.alternative.next = 0; |
258 | term.alternative.end = 0; |
259 | term.alternative.onceThrough = onceThrough; |
260 | return term; |
261 | } |
262 | |
263 | static ByteTerm BodyAlternativeEnd() |
264 | { |
265 | ByteTerm term(TypeBodyAlternativeEnd); |
266 | term.alternative.next = 0; |
267 | term.alternative.end = 0; |
268 | term.alternative.onceThrough = false; |
269 | return term; |
270 | } |
271 | |
272 | static ByteTerm AlternativeBegin() |
273 | { |
274 | ByteTerm term(TypeAlternativeBegin); |
275 | term.alternative.next = 0; |
276 | term.alternative.end = 0; |
277 | term.alternative.onceThrough = false; |
278 | return term; |
279 | } |
280 | |
281 | static ByteTerm AlternativeDisjunction() |
282 | { |
283 | ByteTerm term(TypeAlternativeDisjunction); |
284 | term.alternative.next = 0; |
285 | term.alternative.end = 0; |
286 | term.alternative.onceThrough = false; |
287 | return term; |
288 | } |
289 | |
290 | static ByteTerm AlternativeEnd() |
291 | { |
292 | ByteTerm term(TypeAlternativeEnd); |
293 | term.alternative.next = 0; |
294 | term.alternative.end = 0; |
295 | term.alternative.onceThrough = false; |
296 | return term; |
297 | } |
298 | |
299 | static ByteTerm SubpatternBegin() |
300 | { |
301 | return ByteTerm(TypeSubpatternBegin); |
302 | } |
303 | |
304 | static ByteTerm SubpatternEnd() |
305 | { |
306 | return ByteTerm(TypeSubpatternEnd); |
307 | } |
308 | |
309 | static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor) |
310 | { |
311 | ByteTerm term(TypeDotStarEnclosure); |
312 | term.anchors.m_bol = bolAnchor; |
313 | term.anchors.m_eol = eolAnchor; |
314 | return term; |
315 | } |
316 | |
317 | bool invert() |
318 | { |
319 | return m_invert; |
320 | } |
321 | |
322 | bool capture() |
323 | { |
324 | return m_capture; |
325 | } |
326 | }; |
327 | |
328 | class ByteDisjunction { |
329 | WTF_MAKE_FAST_ALLOCATED; |
330 | public: |
331 | ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) |
332 | : m_numSubpatterns(numSubpatterns) |
333 | , m_frameSize(frameSize) |
334 | { |
335 | } |
336 | |
337 | size_t estimatedSizeInBytes() const { return terms.capacity() * sizeof(ByteTerm); } |
338 | |
339 | Vector<ByteTerm> terms; |
340 | unsigned m_numSubpatterns; |
341 | unsigned m_frameSize; |
342 | }; |
343 | |
344 | struct BytecodePattern { |
345 | WTF_MAKE_FAST_ALLOCATED; |
346 | public: |
347 | BytecodePattern(std::unique_ptr<ByteDisjunction> body, Vector<std::unique_ptr<ByteDisjunction>>& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock) |
348 | : m_body(WTFMove(body)) |
349 | , m_flags(pattern.m_flags) |
350 | , m_allocator(allocator) |
351 | , m_lock(lock) |
352 | { |
353 | m_body->terms.shrinkToFit(); |
354 | |
355 | newlineCharacterClass = pattern.newlineCharacterClass(); |
356 | if (unicode() && ignoreCase()) |
357 | wordcharCharacterClass = pattern.wordUnicodeIgnoreCaseCharCharacterClass(); |
358 | else |
359 | wordcharCharacterClass = pattern.wordcharCharacterClass(); |
360 | |
361 | m_allParenthesesInfo.swap(x&: parenthesesInfoToAdopt); |
362 | m_allParenthesesInfo.shrinkToFit(); |
363 | |
364 | m_userCharacterClasses.swap(x&: pattern.m_userCharacterClasses); |
365 | m_userCharacterClasses.shrinkToFit(); |
366 | } |
367 | |
368 | size_t estimatedSizeInBytes() const { return m_body->estimatedSizeInBytes(); } |
369 | |
370 | bool ignoreCase() const { return m_flags & FlagIgnoreCase; } |
371 | bool multiline() const { return m_flags & FlagMultiline; } |
372 | bool sticky() const { return m_flags & FlagSticky; } |
373 | bool unicode() const { return m_flags & FlagUnicode; } |
374 | bool dotAll() const { return m_flags & FlagDotAll; } |
375 | |
376 | std::unique_ptr<ByteDisjunction> m_body; |
377 | RegExpFlags m_flags; |
378 | // Each BytecodePattern is associated with a RegExp, each RegExp is associated |
379 | // with a VM. Cache a pointer to out VM's m_regExpAllocator. |
380 | BumpPointerAllocator* m_allocator; |
381 | ConcurrentJSLock* m_lock; |
382 | |
383 | CharacterClass* newlineCharacterClass; |
384 | CharacterClass* wordcharCharacterClass; |
385 | |
386 | private: |
387 | Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo; |
388 | Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses; |
389 | }; |
390 | |
391 | JS_EXPORT_PRIVATE std::unique_ptr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*, ConcurrentJSLock* = nullptr); |
392 | JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output); |
393 | unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output); |
394 | unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output); |
395 | |
396 | } } // namespace JSC::Yarr |
397 | |