1 | /* |
2 | * Copyright (C) 2009 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #ifndef RegexInterpreter_h |
27 | #define RegexInterpreter_h |
28 | |
29 | #include <wtf/Platform.h> |
30 | |
31 | #if ENABLE(YARR) |
32 | |
33 | #include <wtf/unicode/Unicode.h> |
34 | #include "RegexParser.h" |
35 | #include "RegexPattern.h" |
36 | |
37 | namespace JSC { namespace Yarr { |
38 | |
39 | class ByteDisjunction; |
40 | |
41 | struct ByteTerm { |
42 | enum Type { |
43 | TypeBodyAlternativeBegin, |
44 | TypeBodyAlternativeDisjunction, |
45 | TypeBodyAlternativeEnd, |
46 | TypeAlternativeBegin, |
47 | TypeAlternativeDisjunction, |
48 | TypeAlternativeEnd, |
49 | TypeSubpatternBegin, |
50 | TypeSubpatternEnd, |
51 | TypeAssertionBOL, |
52 | TypeAssertionEOL, |
53 | TypeAssertionWordBoundary, |
54 | TypePatternCharacterOnce, |
55 | TypePatternCharacterFixed, |
56 | TypePatternCharacterGreedy, |
57 | TypePatternCharacterNonGreedy, |
58 | TypePatternCasedCharacterOnce, |
59 | TypePatternCasedCharacterFixed, |
60 | TypePatternCasedCharacterGreedy, |
61 | TypePatternCasedCharacterNonGreedy, |
62 | TypeCharacterClass, |
63 | TypeBackReference, |
64 | TypeParenthesesSubpattern, |
65 | TypeParenthesesSubpatternOnceBegin, |
66 | TypeParenthesesSubpatternOnceEnd, |
67 | TypeParentheticalAssertionBegin, |
68 | TypeParentheticalAssertionEnd, |
69 | TypeCheckInput, |
70 | } type; |
71 | bool invertOrCapture; |
72 | union { |
73 | struct { |
74 | union { |
75 | UChar patternCharacter; |
76 | struct { |
77 | UChar lo; |
78 | UChar hi; |
79 | } casedCharacter; |
80 | CharacterClass* characterClass; |
81 | unsigned subpatternId; |
82 | }; |
83 | union { |
84 | ByteDisjunction* parenthesesDisjunction; |
85 | unsigned parenthesesWidth; |
86 | }; |
87 | QuantifierType quantityType; |
88 | unsigned quantityCount; |
89 | } atom; |
90 | struct { |
91 | int next; |
92 | int end; |
93 | } alternative; |
94 | unsigned checkInputCount; |
95 | }; |
96 | unsigned frameLocation; |
97 | int inputPosition; |
98 | |
99 | ByteTerm(UChar ch, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) |
100 | : frameLocation(frameLocation) |
101 | { |
102 | switch (quantityType) { |
103 | case QuantifierFixedCount: |
104 | type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; |
105 | break; |
106 | case QuantifierGreedy: |
107 | type = ByteTerm::TypePatternCharacterGreedy; |
108 | break; |
109 | case QuantifierNonGreedy: |
110 | type = ByteTerm::TypePatternCharacterNonGreedy; |
111 | break; |
112 | } |
113 | |
114 | atom.patternCharacter = ch; |
115 | atom.quantityType = quantityType; |
116 | atom.quantityCount = quantityCount; |
117 | inputPosition = inputPos; |
118 | } |
119 | |
120 | ByteTerm(UChar lo, UChar hi, int inputPos, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) |
121 | : frameLocation(frameLocation) |
122 | { |
123 | switch (quantityType) { |
124 | case QuantifierFixedCount: |
125 | type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; |
126 | break; |
127 | case QuantifierGreedy: |
128 | type = ByteTerm::TypePatternCasedCharacterGreedy; |
129 | break; |
130 | case QuantifierNonGreedy: |
131 | type = ByteTerm::TypePatternCasedCharacterNonGreedy; |
132 | break; |
133 | } |
134 | |
135 | atom.casedCharacter.lo = lo; |
136 | atom.casedCharacter.hi = hi; |
137 | atom.quantityType = quantityType; |
138 | atom.quantityCount = quantityCount; |
139 | inputPosition = inputPos; |
140 | } |
141 | |
142 | ByteTerm(CharacterClass* characterClass, bool invert, int inputPos) |
143 | : type(ByteTerm::TypeCharacterClass) |
144 | , invertOrCapture(invert) |
145 | { |
146 | atom.characterClass = characterClass; |
147 | atom.quantityType = QuantifierFixedCount; |
148 | atom.quantityCount = 1; |
149 | inputPosition = inputPos; |
150 | } |
151 | |
152 | ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool invertOrCapture, int inputPos) |
153 | : type(type) |
154 | , invertOrCapture(invertOrCapture) |
155 | { |
156 | atom.subpatternId = subpatternId; |
157 | atom.parenthesesDisjunction = parenthesesInfo; |
158 | atom.quantityType = QuantifierFixedCount; |
159 | atom.quantityCount = 1; |
160 | inputPosition = inputPos; |
161 | } |
162 | |
163 | ByteTerm(Type type, bool invert = false) |
164 | : type(type) |
165 | , invertOrCapture(invert) |
166 | { |
167 | atom.quantityType = QuantifierFixedCount; |
168 | atom.quantityCount = 1; |
169 | } |
170 | |
171 | ByteTerm(Type type, unsigned subpatternId, bool invertOrCapture, int inputPos) |
172 | : type(type) |
173 | , invertOrCapture(invertOrCapture) |
174 | { |
175 | atom.subpatternId = subpatternId; |
176 | atom.quantityType = QuantifierFixedCount; |
177 | atom.quantityCount = 1; |
178 | inputPosition = inputPos; |
179 | } |
180 | |
181 | static ByteTerm BOL(int inputPos) |
182 | { |
183 | ByteTerm term(TypeAssertionBOL); |
184 | term.inputPosition = inputPos; |
185 | return term; |
186 | } |
187 | |
188 | static ByteTerm CheckInput(unsigned count) |
189 | { |
190 | ByteTerm term(TypeCheckInput); |
191 | term.checkInputCount = count; |
192 | return term; |
193 | } |
194 | |
195 | static ByteTerm EOL(int inputPos) |
196 | { |
197 | ByteTerm term(TypeAssertionEOL); |
198 | term.inputPosition = inputPos; |
199 | return term; |
200 | } |
201 | |
202 | static ByteTerm WordBoundary(bool invert, int inputPos) |
203 | { |
204 | ByteTerm term(TypeAssertionWordBoundary, invert); |
205 | term.inputPosition = inputPos; |
206 | return term; |
207 | } |
208 | |
209 | static ByteTerm BackReference(unsigned subpatternId, int inputPos) |
210 | { |
211 | return ByteTerm(TypeBackReference, subpatternId, false, inputPos); |
212 | } |
213 | |
214 | static ByteTerm BodyAlternativeBegin() |
215 | { |
216 | ByteTerm term(TypeBodyAlternativeBegin); |
217 | term.alternative.next = 0; |
218 | term.alternative.end = 0; |
219 | return term; |
220 | } |
221 | |
222 | static ByteTerm BodyAlternativeDisjunction() |
223 | { |
224 | ByteTerm term(TypeBodyAlternativeDisjunction); |
225 | term.alternative.next = 0; |
226 | term.alternative.end = 0; |
227 | return term; |
228 | } |
229 | |
230 | static ByteTerm BodyAlternativeEnd() |
231 | { |
232 | ByteTerm term(TypeBodyAlternativeEnd); |
233 | term.alternative.next = 0; |
234 | term.alternative.end = 0; |
235 | return term; |
236 | } |
237 | |
238 | static ByteTerm AlternativeBegin() |
239 | { |
240 | ByteTerm term(TypeAlternativeBegin); |
241 | term.alternative.next = 0; |
242 | term.alternative.end = 0; |
243 | return term; |
244 | } |
245 | |
246 | static ByteTerm AlternativeDisjunction() |
247 | { |
248 | ByteTerm term(TypeAlternativeDisjunction); |
249 | term.alternative.next = 0; |
250 | term.alternative.end = 0; |
251 | return term; |
252 | } |
253 | |
254 | static ByteTerm AlternativeEnd() |
255 | { |
256 | ByteTerm term(TypeAlternativeEnd); |
257 | term.alternative.next = 0; |
258 | term.alternative.end = 0; |
259 | return term; |
260 | } |
261 | |
262 | static ByteTerm SubpatternBegin() |
263 | { |
264 | return ByteTerm(TypeSubpatternBegin); |
265 | } |
266 | |
267 | static ByteTerm SubpatternEnd() |
268 | { |
269 | return ByteTerm(TypeSubpatternEnd); |
270 | } |
271 | |
272 | bool invert() |
273 | { |
274 | return invertOrCapture; |
275 | } |
276 | |
277 | bool capture() |
278 | { |
279 | return invertOrCapture; |
280 | } |
281 | }; |
282 | |
283 | class ByteDisjunction : public FastAllocBase { |
284 | public: |
285 | ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) |
286 | : m_numSubpatterns(numSubpatterns) |
287 | , m_frameSize(frameSize) |
288 | { |
289 | } |
290 | |
291 | Vector<ByteTerm> terms; |
292 | unsigned m_numSubpatterns; |
293 | unsigned m_frameSize; |
294 | }; |
295 | |
296 | struct BytecodePattern : FastAllocBase { |
297 | BytecodePattern(ByteDisjunction* body, Vector<ByteDisjunction*> allParenthesesInfo, RegexPattern& pattern) |
298 | : m_body(body) |
299 | , m_ignoreCase(pattern.m_ignoreCase) |
300 | , m_multiline(pattern.m_multiline) |
301 | { |
302 | newlineCharacterClass = pattern.newlineCharacterClass(); |
303 | wordcharCharacterClass = pattern.wordcharCharacterClass(); |
304 | |
305 | m_allParenthesesInfo.append(val: allParenthesesInfo); |
306 | m_userCharacterClasses.append(val: pattern.m_userCharacterClasses); |
307 | // 'Steal' the RegexPattern's CharacterClasses! We clear its |
308 | // array, so that it won't delete them on destruction. We'll |
309 | // take responsibility for that. |
310 | pattern.m_userCharacterClasses.clear(); |
311 | } |
312 | |
313 | ~BytecodePattern() |
314 | { |
315 | deleteAllValues(collection: m_allParenthesesInfo); |
316 | deleteAllValues(collection: m_userCharacterClasses); |
317 | } |
318 | |
319 | OwnPtr<ByteDisjunction> m_body; |
320 | bool m_ignoreCase; |
321 | bool m_multiline; |
322 | |
323 | CharacterClass* newlineCharacterClass; |
324 | CharacterClass* wordcharCharacterClass; |
325 | private: |
326 | Vector<ByteDisjunction*> m_allParenthesesInfo; |
327 | Vector<CharacterClass*> m_userCharacterClasses; |
328 | }; |
329 | |
330 | BytecodePattern* byteCompileRegex(const UString& pattern, unsigned& numSubpatterns, const char*& error, bool ignoreCase = false, bool multiline = false); |
331 | int interpretRegex(BytecodePattern* v_regex, const UChar* input, unsigned start, unsigned length, int* output); |
332 | |
333 | } } // namespace JSC::Yarr |
334 | |
335 | #endif |
336 | |
337 | #endif // RegexInterpreter_h |
338 | |