1 | /* |
2 | * Copyright (C) 2009 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #ifndef RegexPattern_h |
27 | #define RegexPattern_h |
28 | |
29 | #include <wtf/Platform.h> |
30 | |
31 | #if ENABLE(YARR) |
32 | |
33 | #include <wtf/Vector.h> |
34 | #include <wtf/unicode/Unicode.h> |
35 | |
36 | |
37 | namespace JSC { namespace Yarr { |
38 | |
39 | #define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. |
40 | #define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers. |
41 | #define RegexStackSpaceForBackTrackInfoBackReference 2 |
42 | #define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative. |
43 | #define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1 |
44 | #define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers. |
45 | #define RegexStackSpaceForBackTrackInfoParentheses 4 |
46 | |
47 | struct PatternDisjunction; |
48 | |
49 | struct CharacterRange { |
50 | UChar begin; |
51 | UChar end; |
52 | |
53 | CharacterRange(UChar begin, UChar end) |
54 | : begin(begin) |
55 | , end(end) |
56 | { |
57 | } |
58 | }; |
59 | |
60 | struct CharacterClass : FastAllocBase { |
61 | Vector<UChar> m_matches; |
62 | Vector<CharacterRange> m_ranges; |
63 | Vector<UChar> m_matchesUnicode; |
64 | Vector<CharacterRange> m_rangesUnicode; |
65 | }; |
66 | |
67 | enum QuantifierType { |
68 | QuantifierFixedCount, |
69 | QuantifierGreedy, |
70 | QuantifierNonGreedy, |
71 | }; |
72 | |
73 | struct PatternTerm { |
74 | enum Type { |
75 | TypeAssertionBOL, |
76 | TypeAssertionEOL, |
77 | TypeAssertionWordBoundary, |
78 | TypePatternCharacter, |
79 | TypeCharacterClass, |
80 | TypeBackReference, |
81 | TypeForwardReference, |
82 | TypeParenthesesSubpattern, |
83 | TypeParentheticalAssertion, |
84 | } type; |
85 | bool invertOrCapture; |
86 | union { |
87 | UChar patternCharacter; |
88 | CharacterClass* characterClass; |
89 | unsigned subpatternId; |
90 | struct { |
91 | PatternDisjunction* disjunction; |
92 | unsigned subpatternId; |
93 | unsigned lastSubpatternId; |
94 | bool isCopy; |
95 | } parentheses; |
96 | }; |
97 | QuantifierType quantityType; |
98 | unsigned quantityCount; |
99 | int inputPosition; |
100 | unsigned frameLocation; |
101 | |
102 | PatternTerm(UChar ch) |
103 | : type(PatternTerm::TypePatternCharacter) |
104 | { |
105 | patternCharacter = ch; |
106 | quantityType = QuantifierFixedCount; |
107 | quantityCount = 1; |
108 | } |
109 | |
110 | PatternTerm(CharacterClass* charClass, bool invert) |
111 | : type(PatternTerm::TypeCharacterClass) |
112 | , invertOrCapture(invert) |
113 | { |
114 | characterClass = charClass; |
115 | quantityType = QuantifierFixedCount; |
116 | quantityCount = 1; |
117 | } |
118 | |
119 | PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool invertOrCapture) |
120 | : type(type) |
121 | , invertOrCapture(invertOrCapture) |
122 | { |
123 | parentheses.disjunction = disjunction; |
124 | parentheses.subpatternId = subpatternId; |
125 | parentheses.isCopy = false; |
126 | quantityType = QuantifierFixedCount; |
127 | quantityCount = 1; |
128 | } |
129 | |
130 | PatternTerm(Type type, bool invert = false) |
131 | : type(type) |
132 | , invertOrCapture(invert) |
133 | { |
134 | quantityType = QuantifierFixedCount; |
135 | quantityCount = 1; |
136 | } |
137 | |
138 | PatternTerm(unsigned spatternId) |
139 | : type(TypeBackReference) |
140 | , invertOrCapture(false) |
141 | { |
142 | subpatternId = spatternId; |
143 | quantityType = QuantifierFixedCount; |
144 | quantityCount = 1; |
145 | } |
146 | |
147 | static PatternTerm ForwardReference() |
148 | { |
149 | return PatternTerm(TypeForwardReference); |
150 | } |
151 | |
152 | static PatternTerm BOL() |
153 | { |
154 | return PatternTerm(TypeAssertionBOL); |
155 | } |
156 | |
157 | static PatternTerm EOL() |
158 | { |
159 | return PatternTerm(TypeAssertionEOL); |
160 | } |
161 | |
162 | static PatternTerm WordBoundary(bool invert) |
163 | { |
164 | return PatternTerm(TypeAssertionWordBoundary, invert); |
165 | } |
166 | |
167 | bool invert() |
168 | { |
169 | return invertOrCapture; |
170 | } |
171 | |
172 | bool capture() |
173 | { |
174 | return invertOrCapture; |
175 | } |
176 | |
177 | void quantify(unsigned count, QuantifierType type) |
178 | { |
179 | quantityCount = count; |
180 | quantityType = type; |
181 | } |
182 | }; |
183 | |
184 | struct PatternAlternative : FastAllocBase { |
185 | PatternAlternative(PatternDisjunction* disjunction) |
186 | : m_parent(disjunction) |
187 | { |
188 | } |
189 | |
190 | PatternTerm& lastTerm() |
191 | { |
192 | ASSERT(m_terms.size()); |
193 | return m_terms[m_terms.size() - 1]; |
194 | } |
195 | |
196 | void removeLastTerm() |
197 | { |
198 | ASSERT(m_terms.size()); |
199 | m_terms.shrink(size: m_terms.size() - 1); |
200 | } |
201 | |
202 | Vector<PatternTerm> m_terms; |
203 | PatternDisjunction* m_parent; |
204 | unsigned m_minimumSize; |
205 | bool m_hasFixedSize; |
206 | }; |
207 | |
208 | struct PatternDisjunction : FastAllocBase { |
209 | PatternDisjunction(PatternAlternative* parent = 0) |
210 | : m_parent(parent) |
211 | { |
212 | } |
213 | |
214 | ~PatternDisjunction() |
215 | { |
216 | deleteAllValues(collection: m_alternatives); |
217 | } |
218 | |
219 | PatternAlternative* addNewAlternative() |
220 | { |
221 | PatternAlternative* alternative = new PatternAlternative(this); |
222 | m_alternatives.append(val: alternative); |
223 | return alternative; |
224 | } |
225 | |
226 | Vector<PatternAlternative*> m_alternatives; |
227 | PatternAlternative* m_parent; |
228 | unsigned m_minimumSize; |
229 | unsigned m_callFrameSize; |
230 | bool m_hasFixedSize; |
231 | }; |
232 | |
233 | // You probably don't want to be calling these functions directly |
234 | // (please to be calling newlineCharacterClass() et al on your |
235 | // friendly neighborhood RegexPattern instance to get nicely |
236 | // cached copies). |
237 | CharacterClass* newlineCreate(); |
238 | CharacterClass* digitsCreate(); |
239 | CharacterClass* spacesCreate(); |
240 | CharacterClass* wordcharCreate(); |
241 | CharacterClass* nondigitsCreate(); |
242 | CharacterClass* nonspacesCreate(); |
243 | CharacterClass* nonwordcharCreate(); |
244 | |
245 | struct RegexPattern { |
246 | RegexPattern(bool ignoreCase, bool multiline) |
247 | : m_ignoreCase(ignoreCase) |
248 | , m_multiline(multiline) |
249 | , m_numSubpatterns(0) |
250 | , m_maxBackReference(0) |
251 | , newlineCached(0) |
252 | , digitsCached(0) |
253 | , spacesCached(0) |
254 | , wordcharCached(0) |
255 | , nondigitsCached(0) |
256 | , nonspacesCached(0) |
257 | , nonwordcharCached(0) |
258 | { |
259 | } |
260 | |
261 | ~RegexPattern() |
262 | { |
263 | deleteAllValues(collection: m_disjunctions); |
264 | deleteAllValues(collection: m_userCharacterClasses); |
265 | } |
266 | |
267 | void reset() |
268 | { |
269 | m_numSubpatterns = 0; |
270 | m_maxBackReference = 0; |
271 | |
272 | newlineCached = 0; |
273 | digitsCached = 0; |
274 | spacesCached = 0; |
275 | wordcharCached = 0; |
276 | nondigitsCached = 0; |
277 | nonspacesCached = 0; |
278 | nonwordcharCached = 0; |
279 | |
280 | deleteAllValues(collection: m_disjunctions); |
281 | m_disjunctions.clear(); |
282 | deleteAllValues(collection: m_userCharacterClasses); |
283 | m_userCharacterClasses.clear(); |
284 | } |
285 | |
286 | bool containsIllegalBackReference() |
287 | { |
288 | return m_maxBackReference > m_numSubpatterns; |
289 | } |
290 | |
291 | CharacterClass* newlineCharacterClass() |
292 | { |
293 | if (!newlineCached) |
294 | m_userCharacterClasses.append(val: newlineCached = newlineCreate()); |
295 | return newlineCached; |
296 | } |
297 | CharacterClass* digitsCharacterClass() |
298 | { |
299 | if (!digitsCached) |
300 | m_userCharacterClasses.append(val: digitsCached = digitsCreate()); |
301 | return digitsCached; |
302 | } |
303 | CharacterClass* spacesCharacterClass() |
304 | { |
305 | if (!spacesCached) |
306 | m_userCharacterClasses.append(val: spacesCached = spacesCreate()); |
307 | return spacesCached; |
308 | } |
309 | CharacterClass* wordcharCharacterClass() |
310 | { |
311 | if (!wordcharCached) |
312 | m_userCharacterClasses.append(val: wordcharCached = wordcharCreate()); |
313 | return wordcharCached; |
314 | } |
315 | CharacterClass* nondigitsCharacterClass() |
316 | { |
317 | if (!nondigitsCached) |
318 | m_userCharacterClasses.append(val: nondigitsCached = nondigitsCreate()); |
319 | return nondigitsCached; |
320 | } |
321 | CharacterClass* nonspacesCharacterClass() |
322 | { |
323 | if (!nonspacesCached) |
324 | m_userCharacterClasses.append(val: nonspacesCached = nonspacesCreate()); |
325 | return nonspacesCached; |
326 | } |
327 | CharacterClass* nonwordcharCharacterClass() |
328 | { |
329 | if (!nonwordcharCached) |
330 | m_userCharacterClasses.append(val: nonwordcharCached = nonwordcharCreate()); |
331 | return nonwordcharCached; |
332 | } |
333 | |
334 | bool m_ignoreCase; |
335 | bool m_multiline; |
336 | unsigned m_numSubpatterns; |
337 | unsigned m_maxBackReference; |
338 | PatternDisjunction* m_body; |
339 | Vector<PatternDisjunction*, 4> m_disjunctions; |
340 | Vector<CharacterClass*> m_userCharacterClasses; |
341 | |
342 | private: |
343 | CharacterClass* newlineCached; |
344 | CharacterClass* digitsCached; |
345 | CharacterClass* spacesCached; |
346 | CharacterClass* wordcharCached; |
347 | CharacterClass* nondigitsCached; |
348 | CharacterClass* nonspacesCached; |
349 | CharacterClass* nonwordcharCached; |
350 | }; |
351 | |
352 | } } // namespace JSC::Yarr |
353 | |
354 | #endif |
355 | |
356 | #endif // RegexPattern_h |
357 | |