1 | /* |
2 | * Copyright (C) 2009 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #include "config.h" |
27 | #include "RegexCompiler.h" |
28 | |
29 | #include "RegexInterpreter.h" |
30 | #include "RegexPattern.h" |
31 | #include <wtf/Vector.h> |
32 | |
33 | #if ENABLE(YARR) |
34 | |
35 | using namespace WTF; |
36 | |
37 | namespace JSC { namespace Yarr { |
38 | |
39 | class CharacterClassConstructor { |
40 | public: |
41 | CharacterClassConstructor(bool isCaseInsensitive = false) |
42 | : m_isCaseInsensitive(isCaseInsensitive) |
43 | { |
44 | } |
45 | |
46 | void reset() |
47 | { |
48 | m_matches.clear(); |
49 | m_ranges.clear(); |
50 | m_matchesUnicode.clear(); |
51 | m_rangesUnicode.clear(); |
52 | } |
53 | |
54 | void append(const CharacterClass* other) |
55 | { |
56 | for (size_t i = 0; i < other->m_matches.size(); ++i) |
57 | addSorted(matches&: m_matches, ch: other->m_matches[i]); |
58 | for (size_t i = 0; i < other->m_ranges.size(); ++i) |
59 | addSortedRange(ranges&: m_ranges, lo: other->m_ranges[i].begin, hi: other->m_ranges[i].end); |
60 | for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) |
61 | addSorted(matches&: m_matchesUnicode, ch: other->m_matchesUnicode[i]); |
62 | for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) |
63 | addSortedRange(ranges&: m_rangesUnicode, lo: other->m_rangesUnicode[i].begin, hi: other->m_rangesUnicode[i].end); |
64 | } |
65 | |
66 | void putChar(UChar ch) |
67 | { |
68 | if (ch <= 0x7f) { |
69 | if (m_isCaseInsensitive && isASCIIAlpha(c: ch)) { |
70 | addSorted(matches&: m_matches, ch: toASCIIUpper(c: ch)); |
71 | addSorted(matches&: m_matches, ch: toASCIILower(c: ch)); |
72 | } else |
73 | addSorted(matches&: m_matches, ch); |
74 | } else { |
75 | UChar upper, lower; |
76 | if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { |
77 | addSorted(matches&: m_matchesUnicode, ch: upper); |
78 | addSorted(matches&: m_matchesUnicode, ch: lower); |
79 | } else |
80 | addSorted(matches&: m_matchesUnicode, ch); |
81 | } |
82 | } |
83 | |
84 | // returns true if this character has another case, and 'ch' is the upper case form. |
85 | static inline bool isUnicodeUpper(UChar ch) |
86 | { |
87 | return ch != Unicode::toLower(ch); |
88 | } |
89 | |
90 | // returns true if this character has another case, and 'ch' is the lower case form. |
91 | static inline bool isUnicodeLower(UChar ch) |
92 | { |
93 | return ch != Unicode::toUpper(ch); |
94 | } |
95 | |
96 | void putRange(UChar lo, UChar hi) |
97 | { |
98 | if (lo <= 0x7f) { |
99 | char asciiLo = lo; |
100 | char asciiHi = std::min(a: hi, b: (UChar)0x7f); |
101 | addSortedRange(ranges&: m_ranges, lo, hi: asciiHi); |
102 | |
103 | if (m_isCaseInsensitive) { |
104 | if ((asciiLo <= 'Z') && (asciiHi >= 'A')) |
105 | addSortedRange(ranges&: m_ranges, lo: std::max(a: asciiLo, b: 'A')+('a'-'A'), hi: std::min(a: asciiHi, b: 'Z')+('a'-'A')); |
106 | if ((asciiLo <= 'z') && (asciiHi >= 'a')) |
107 | addSortedRange(ranges&: m_ranges, lo: std::max(a: asciiLo, b: 'a')+('A'-'a'), hi: std::min(a: asciiHi, b: 'z')+('A'-'a')); |
108 | } |
109 | } |
110 | if (hi >= 0x80) { |
111 | uint32_t unicodeCurr = std::max(a: lo, b: (UChar)0x80); |
112 | addSortedRange(ranges&: m_rangesUnicode, lo: unicodeCurr, hi); |
113 | |
114 | if (m_isCaseInsensitive) { |
115 | while (unicodeCurr <= hi) { |
116 | // If the upper bound of the range (hi) is 0xffff, the increments to |
117 | // unicodeCurr in this loop may take it to 0x10000. This is fine |
118 | // (if so we won't re-enter the loop, since the loop condition above |
119 | // will definitely fail) - but this does mean we cannot use a UChar |
120 | // to represent unicodeCurr, we must use a 32-bit value instead. |
121 | ASSERT(unicodeCurr <= 0xffff); |
122 | |
123 | if (isUnicodeUpper(ch: unicodeCurr)) { |
124 | UChar lowerCaseRangeBegin = Unicode::toLower(ch: unicodeCurr); |
125 | UChar lowerCaseRangeEnd = lowerCaseRangeBegin; |
126 | while ((++unicodeCurr <= hi) && isUnicodeUpper(ch: unicodeCurr) && (Unicode::toLower(ch: unicodeCurr) == (lowerCaseRangeEnd + 1))) |
127 | lowerCaseRangeEnd++; |
128 | addSortedRange(ranges&: m_rangesUnicode, lo: lowerCaseRangeBegin, hi: lowerCaseRangeEnd); |
129 | } else if (isUnicodeLower(ch: unicodeCurr)) { |
130 | UChar upperCaseRangeBegin = Unicode::toUpper(ch: unicodeCurr); |
131 | UChar upperCaseRangeEnd = upperCaseRangeBegin; |
132 | while ((++unicodeCurr <= hi) && isUnicodeLower(ch: unicodeCurr) && (Unicode::toUpper(ch: unicodeCurr) == (upperCaseRangeEnd + 1))) |
133 | upperCaseRangeEnd++; |
134 | addSortedRange(ranges&: m_rangesUnicode, lo: upperCaseRangeBegin, hi: upperCaseRangeEnd); |
135 | } else |
136 | ++unicodeCurr; |
137 | } |
138 | } |
139 | } |
140 | } |
141 | |
142 | CharacterClass* charClass() |
143 | { |
144 | CharacterClass* characterClass = new CharacterClass(); |
145 | |
146 | characterClass->m_matches.append(val: m_matches); |
147 | characterClass->m_ranges.append(val: m_ranges); |
148 | characterClass->m_matchesUnicode.append(val: m_matchesUnicode); |
149 | characterClass->m_rangesUnicode.append(val: m_rangesUnicode); |
150 | |
151 | reset(); |
152 | |
153 | return characterClass; |
154 | } |
155 | |
156 | private: |
157 | void addSorted(Vector<UChar>& matches, UChar ch) |
158 | { |
159 | unsigned pos = 0; |
160 | unsigned range = matches.size(); |
161 | |
162 | // binary chop, find position to insert char. |
163 | while (range) { |
164 | unsigned index = range >> 1; |
165 | |
166 | int val = matches[pos+index] - ch; |
167 | if (!val) |
168 | return; |
169 | else if (val > 0) |
170 | range = index; |
171 | else { |
172 | pos += (index+1); |
173 | range -= (index+1); |
174 | } |
175 | } |
176 | |
177 | if (pos == matches.size()) |
178 | matches.append(val: ch); |
179 | else |
180 | matches.insert(position: pos, val: ch); |
181 | } |
182 | |
183 | void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) |
184 | { |
185 | unsigned end = ranges.size(); |
186 | |
187 | // Simple linear scan - I doubt there are that many ranges anyway... |
188 | // feel free to fix this with something faster (eg binary chop). |
189 | for (unsigned i = 0; i < end; ++i) { |
190 | // does the new range fall before the current position in the array |
191 | if (hi < ranges[i].begin) { |
192 | // optional optimization: concatenate appending ranges? - may not be worthwhile. |
193 | if (hi == (ranges[i].begin - 1)) { |
194 | ranges[i].begin = lo; |
195 | return; |
196 | } |
197 | ranges.insert(position: i, val: CharacterRange(lo, hi)); |
198 | return; |
199 | } |
200 | // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining |
201 | // If the new range start at or before the end of the last range, then the overlap (if it starts one after the |
202 | // end of the last range they concatenate, which is just as good. |
203 | if (lo <= (ranges[i].end + 1)) { |
204 | // found an intersect! we'll replace this entry in the array. |
205 | ranges[i].begin = std::min(a: ranges[i].begin, b: lo); |
206 | ranges[i].end = std::max(a: ranges[i].end, b: hi); |
207 | |
208 | // now check if the new range can subsume any subsequent ranges. |
209 | unsigned next = i+1; |
210 | // each iteration of the loop we will either remove something from the list, or break the loop. |
211 | while (next < ranges.size()) { |
212 | if (ranges[next].begin <= (ranges[i].end + 1)) { |
213 | // the next entry now overlaps / concatenates this one. |
214 | ranges[i].end = std::max(a: ranges[i].end, b: ranges[next].end); |
215 | ranges.remove(position: next); |
216 | } else |
217 | break; |
218 | } |
219 | |
220 | return; |
221 | } |
222 | } |
223 | |
224 | // CharacterRange comes after all existing ranges. |
225 | ranges.append(val: CharacterRange(lo, hi)); |
226 | } |
227 | |
228 | bool m_isCaseInsensitive; |
229 | |
230 | Vector<UChar> m_matches; |
231 | Vector<CharacterRange> m_ranges; |
232 | Vector<UChar> m_matchesUnicode; |
233 | Vector<CharacterRange> m_rangesUnicode; |
234 | }; |
235 | |
236 | |
237 | CharacterClass* newlineCreate() |
238 | { |
239 | CharacterClass* characterClass = new CharacterClass(); |
240 | |
241 | characterClass->m_matches.append(val: '\n'); |
242 | characterClass->m_matches.append(val: '\r'); |
243 | characterClass->m_matchesUnicode.append(val: 0x2028); |
244 | characterClass->m_matchesUnicode.append(val: 0x2029); |
245 | |
246 | return characterClass; |
247 | } |
248 | |
249 | CharacterClass* digitsCreate() |
250 | { |
251 | CharacterClass* characterClass = new CharacterClass(); |
252 | |
253 | characterClass->m_ranges.append(val: CharacterRange('0', '9')); |
254 | |
255 | return characterClass; |
256 | } |
257 | |
258 | CharacterClass* spacesCreate() |
259 | { |
260 | CharacterClass* characterClass = new CharacterClass(); |
261 | |
262 | characterClass->m_matches.append(val: ' '); |
263 | characterClass->m_ranges.append(val: CharacterRange('\t', '\r')); |
264 | characterClass->m_matchesUnicode.append(val: 0x00a0); |
265 | characterClass->m_matchesUnicode.append(val: 0x1680); |
266 | characterClass->m_matchesUnicode.append(val: 0x180e); |
267 | characterClass->m_matchesUnicode.append(val: 0x2028); |
268 | characterClass->m_matchesUnicode.append(val: 0x2029); |
269 | characterClass->m_matchesUnicode.append(val: 0x202f); |
270 | characterClass->m_matchesUnicode.append(val: 0x205f); |
271 | characterClass->m_matchesUnicode.append(val: 0x3000); |
272 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x2000, 0x200a)); |
273 | |
274 | return characterClass; |
275 | } |
276 | |
277 | CharacterClass* wordcharCreate() |
278 | { |
279 | CharacterClass* characterClass = new CharacterClass(); |
280 | |
281 | characterClass->m_matches.append(val: '_'); |
282 | characterClass->m_ranges.append(val: CharacterRange('0', '9')); |
283 | characterClass->m_ranges.append(val: CharacterRange('A', 'Z')); |
284 | characterClass->m_ranges.append(val: CharacterRange('a', 'z')); |
285 | |
286 | return characterClass; |
287 | } |
288 | |
289 | CharacterClass* nondigitsCreate() |
290 | { |
291 | CharacterClass* characterClass = new CharacterClass(); |
292 | |
293 | characterClass->m_ranges.append(val: CharacterRange(0, '0' - 1)); |
294 | characterClass->m_ranges.append(val: CharacterRange('9' + 1, 0x7f)); |
295 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x80, 0xffff)); |
296 | |
297 | return characterClass; |
298 | } |
299 | |
300 | CharacterClass* nonspacesCreate() |
301 | { |
302 | CharacterClass* characterClass = new CharacterClass(); |
303 | |
304 | characterClass->m_ranges.append(val: CharacterRange(0, '\t' - 1)); |
305 | characterClass->m_ranges.append(val: CharacterRange('\r' + 1, ' ' - 1)); |
306 | characterClass->m_ranges.append(val: CharacterRange(' ' + 1, 0x7f)); |
307 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x0080, 0x009f)); |
308 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x00a1, 0x167f)); |
309 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x1681, 0x180d)); |
310 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x180f, 0x1fff)); |
311 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x200b, 0x2027)); |
312 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x202a, 0x202e)); |
313 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x2030, 0x205e)); |
314 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x2060, 0x2fff)); |
315 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x3001, 0xffff)); |
316 | |
317 | return characterClass; |
318 | } |
319 | |
320 | CharacterClass* nonwordcharCreate() |
321 | { |
322 | CharacterClass* characterClass = new CharacterClass(); |
323 | |
324 | characterClass->m_matches.append(val: '`'); |
325 | characterClass->m_ranges.append(val: CharacterRange(0, '0' - 1)); |
326 | characterClass->m_ranges.append(val: CharacterRange('9' + 1, 'A' - 1)); |
327 | characterClass->m_ranges.append(val: CharacterRange('Z' + 1, '_' - 1)); |
328 | characterClass->m_ranges.append(val: CharacterRange('z' + 1, 0x7f)); |
329 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x80, 0xffff)); |
330 | |
331 | return characterClass; |
332 | } |
333 | |
334 | |
335 | class RegexPatternConstructor { |
336 | public: |
337 | RegexPatternConstructor(RegexPattern& pattern) |
338 | : m_pattern(pattern) |
339 | , m_characterClassConstructor(pattern.m_ignoreCase) |
340 | { |
341 | } |
342 | |
343 | ~RegexPatternConstructor() |
344 | { |
345 | } |
346 | |
347 | void reset() |
348 | { |
349 | m_pattern.reset(); |
350 | m_characterClassConstructor.reset(); |
351 | } |
352 | |
353 | void assertionBOL() |
354 | { |
355 | m_alternative->m_terms.append(val: PatternTerm::BOL()); |
356 | } |
357 | void assertionEOL() |
358 | { |
359 | m_alternative->m_terms.append(val: PatternTerm::EOL()); |
360 | } |
361 | void assertionWordBoundary(bool invert) |
362 | { |
363 | m_alternative->m_terms.append(val: PatternTerm::WordBoundary(invert)); |
364 | } |
365 | |
366 | void atomPatternCharacter(UChar ch) |
367 | { |
368 | // We handle case-insensitive checking of unicode characters which do have both |
369 | // cases by handling them as if they were defined using a CharacterClass. |
370 | if (m_pattern.m_ignoreCase && !isASCII(c: ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { |
371 | atomCharacterClassBegin(); |
372 | atomCharacterClassAtom(ch); |
373 | atomCharacterClassEnd(); |
374 | } else |
375 | m_alternative->m_terms.append(val: PatternTerm(ch)); |
376 | } |
377 | |
378 | void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) |
379 | { |
380 | switch (classID) { |
381 | case DigitClassID: |
382 | m_alternative->m_terms.append(val: PatternTerm(m_pattern.digitsCharacterClass(), invert)); |
383 | break; |
384 | case SpaceClassID: |
385 | m_alternative->m_terms.append(val: PatternTerm(m_pattern.spacesCharacterClass(), invert)); |
386 | break; |
387 | case WordClassID: |
388 | m_alternative->m_terms.append(val: PatternTerm(m_pattern.wordcharCharacterClass(), invert)); |
389 | break; |
390 | case NewlineClassID: |
391 | m_alternative->m_terms.append(val: PatternTerm(m_pattern.newlineCharacterClass(), invert)); |
392 | break; |
393 | } |
394 | } |
395 | |
396 | void atomCharacterClassBegin(bool invert = false) |
397 | { |
398 | m_invertCharacterClass = invert; |
399 | } |
400 | |
401 | void atomCharacterClassAtom(UChar ch) |
402 | { |
403 | m_characterClassConstructor.putChar(ch); |
404 | } |
405 | |
406 | void atomCharacterClassRange(UChar begin, UChar end) |
407 | { |
408 | m_characterClassConstructor.putRange(lo: begin, hi: end); |
409 | } |
410 | |
411 | void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) |
412 | { |
413 | ASSERT(classID != NewlineClassID); |
414 | |
415 | switch (classID) { |
416 | case DigitClassID: |
417 | m_characterClassConstructor.append(other: invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); |
418 | break; |
419 | |
420 | case SpaceClassID: |
421 | m_characterClassConstructor.append(other: invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); |
422 | break; |
423 | |
424 | case WordClassID: |
425 | m_characterClassConstructor.append(other: invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); |
426 | break; |
427 | |
428 | default: |
429 | ASSERT_NOT_REACHED(); |
430 | } |
431 | } |
432 | |
433 | void atomCharacterClassEnd() |
434 | { |
435 | CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); |
436 | m_pattern.m_userCharacterClasses.append(val: newCharacterClass); |
437 | m_alternative->m_terms.append(val: PatternTerm(newCharacterClass, m_invertCharacterClass)); |
438 | } |
439 | |
440 | void atomParenthesesSubpatternBegin(bool capture = true) |
441 | { |
442 | unsigned subpatternId = m_pattern.m_numSubpatterns + 1; |
443 | if (capture) |
444 | m_pattern.m_numSubpatterns++; |
445 | |
446 | PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); |
447 | m_pattern.m_disjunctions.append(val: parenthesesDisjunction); |
448 | m_alternative->m_terms.append(val: PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture)); |
449 | m_alternative = parenthesesDisjunction->addNewAlternative(); |
450 | } |
451 | |
452 | void atomParentheticalAssertionBegin(bool invert = false) |
453 | { |
454 | PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); |
455 | m_pattern.m_disjunctions.append(val: parenthesesDisjunction); |
456 | m_alternative->m_terms.append(val: PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert)); |
457 | m_alternative = parenthesesDisjunction->addNewAlternative(); |
458 | } |
459 | |
460 | void atomParenthesesEnd() |
461 | { |
462 | ASSERT(m_alternative->m_parent); |
463 | ASSERT(m_alternative->m_parent->m_parent); |
464 | m_alternative = m_alternative->m_parent->m_parent; |
465 | |
466 | m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; |
467 | } |
468 | |
469 | void atomBackReference(unsigned subpatternId) |
470 | { |
471 | ASSERT(subpatternId); |
472 | m_pattern.m_maxBackReference = std::max(a: m_pattern.m_maxBackReference, b: subpatternId); |
473 | |
474 | if (subpatternId > m_pattern.m_numSubpatterns) { |
475 | m_alternative->m_terms.append(val: PatternTerm::ForwardReference()); |
476 | return; |
477 | } |
478 | |
479 | PatternAlternative* currentAlternative = m_alternative; |
480 | ASSERT(currentAlternative); |
481 | |
482 | // Note to self: if we waited until the AST was baked, we could also remove forwards refs |
483 | while ((currentAlternative = currentAlternative->m_parent->m_parent)) { |
484 | PatternTerm& term = currentAlternative->lastTerm(); |
485 | ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); |
486 | |
487 | if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId)) { |
488 | m_alternative->m_terms.append(val: PatternTerm::ForwardReference()); |
489 | return; |
490 | } |
491 | } |
492 | |
493 | m_alternative->m_terms.append(val: PatternTerm(subpatternId)); |
494 | } |
495 | |
496 | PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction) |
497 | { |
498 | PatternDisjunction* newDisjunction = new PatternDisjunction(); |
499 | |
500 | newDisjunction->m_parent = disjunction->m_parent; |
501 | for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { |
502 | PatternAlternative* alternative = disjunction->m_alternatives[alt]; |
503 | PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); |
504 | for (unsigned i = 0; i < alternative->m_terms.size(); ++i) |
505 | newAlternative->m_terms.append(val: copyTerm(term&: alternative->m_terms[i])); |
506 | } |
507 | |
508 | m_pattern.m_disjunctions.append(val: newDisjunction); |
509 | return newDisjunction; |
510 | } |
511 | |
512 | PatternTerm copyTerm(PatternTerm& term) |
513 | { |
514 | if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) |
515 | return PatternTerm(term); |
516 | |
517 | PatternTerm termCopy = term; |
518 | termCopy.parentheses.disjunction = copyDisjunction(disjunction: termCopy.parentheses.disjunction); |
519 | return termCopy; |
520 | } |
521 | |
522 | void quantifyAtom(unsigned min, unsigned max, bool greedy) |
523 | { |
524 | ASSERT(min <= max); |
525 | ASSERT(m_alternative->m_terms.size()); |
526 | |
527 | if (!max) { |
528 | m_alternative->removeLastTerm(); |
529 | return; |
530 | } |
531 | |
532 | PatternTerm& term = m_alternative->lastTerm(); |
533 | ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); |
534 | ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); |
535 | |
536 | // For any assertion with a zero minimum, not matching is valid and has no effect, |
537 | // remove it. Otherwise, we need to match as least once, but there is no point |
538 | // matching more than once, so remove the quantifier. It is not entirely clear |
539 | // from the spec whether or not this behavior is correct, but I believe this |
540 | // matches Firefox. :-/ |
541 | if (term.type == PatternTerm::TypeParentheticalAssertion) { |
542 | if (!min) |
543 | m_alternative->removeLastTerm(); |
544 | return; |
545 | } |
546 | |
547 | if (min == 0) |
548 | term.quantify(count: max, type: greedy ? QuantifierGreedy : QuantifierNonGreedy); |
549 | else if (min == max) |
550 | term.quantify(count: min, type: QuantifierFixedCount); |
551 | else { |
552 | term.quantify(count: min, type: QuantifierFixedCount); |
553 | m_alternative->m_terms.append(val: copyTerm(term)); |
554 | // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... |
555 | m_alternative->lastTerm().quantify(count: (max == UINT_MAX) ? max : max - min, type: greedy ? QuantifierGreedy : QuantifierNonGreedy); |
556 | if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) |
557 | m_alternative->lastTerm().parentheses.isCopy = true; |
558 | } |
559 | } |
560 | |
561 | void disjunction() |
562 | { |
563 | m_alternative = m_alternative->m_parent->addNewAlternative(); |
564 | } |
565 | |
566 | void regexBegin() |
567 | { |
568 | m_pattern.m_body = new PatternDisjunction(); |
569 | m_alternative = m_pattern.m_body->addNewAlternative(); |
570 | m_pattern.m_disjunctions.append(val: m_pattern.m_body); |
571 | } |
572 | void regexEnd() |
573 | { |
574 | } |
575 | void regexError() |
576 | { |
577 | } |
578 | |
579 | unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) |
580 | { |
581 | alternative->m_hasFixedSize = true; |
582 | unsigned currentInputPosition = initialInputPosition; |
583 | |
584 | for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { |
585 | PatternTerm& term = alternative->m_terms[i]; |
586 | |
587 | switch (term.type) { |
588 | case PatternTerm::TypeAssertionBOL: |
589 | case PatternTerm::TypeAssertionEOL: |
590 | case PatternTerm::TypeAssertionWordBoundary: |
591 | term.inputPosition = currentInputPosition; |
592 | break; |
593 | |
594 | case PatternTerm::TypeBackReference: |
595 | term.inputPosition = currentInputPosition; |
596 | term.frameLocation = currentCallFrameSize; |
597 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference; |
598 | alternative->m_hasFixedSize = false; |
599 | break; |
600 | |
601 | case PatternTerm::TypeForwardReference: |
602 | break; |
603 | |
604 | case PatternTerm::TypePatternCharacter: |
605 | term.inputPosition = currentInputPosition; |
606 | if (term.quantityType != QuantifierFixedCount) { |
607 | term.frameLocation = currentCallFrameSize; |
608 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter; |
609 | alternative->m_hasFixedSize = false; |
610 | } else |
611 | currentInputPosition += term.quantityCount; |
612 | break; |
613 | |
614 | case PatternTerm::TypeCharacterClass: |
615 | term.inputPosition = currentInputPosition; |
616 | if (term.quantityType != QuantifierFixedCount) { |
617 | term.frameLocation = currentCallFrameSize; |
618 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass; |
619 | alternative->m_hasFixedSize = false; |
620 | } else |
621 | currentInputPosition += term.quantityCount; |
622 | break; |
623 | |
624 | case PatternTerm::TypeParenthesesSubpattern: |
625 | // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. |
626 | term.frameLocation = currentCallFrameSize; |
627 | if ((term.quantityCount == 1) && !term.parentheses.isCopy) { |
628 | if (term.quantityType == QuantifierFixedCount) { |
629 | currentCallFrameSize = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition); |
630 | currentInputPosition += term.parentheses.disjunction->m_minimumSize; |
631 | } else { |
632 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce; |
633 | currentCallFrameSize = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition); |
634 | } |
635 | term.inputPosition = currentInputPosition; |
636 | } else { |
637 | term.inputPosition = currentInputPosition; |
638 | setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: 0, initialInputPosition: currentInputPosition); |
639 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses; |
640 | } |
641 | // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. |
642 | alternative->m_hasFixedSize = false; |
643 | break; |
644 | |
645 | case PatternTerm::TypeParentheticalAssertion: |
646 | term.inputPosition = currentInputPosition; |
647 | term.frameLocation = currentCallFrameSize; |
648 | currentCallFrameSize = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, initialInputPosition: currentInputPosition); |
649 | break; |
650 | } |
651 | } |
652 | |
653 | alternative->m_minimumSize = currentInputPosition - initialInputPosition; |
654 | return currentCallFrameSize; |
655 | } |
656 | |
657 | unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) |
658 | { |
659 | if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) |
660 | initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative; |
661 | |
662 | unsigned minimumInputSize = UINT_MAX; |
663 | unsigned maximumCallFrameSize = 0; |
664 | bool hasFixedSize = true; |
665 | |
666 | for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { |
667 | PatternAlternative* alternative = disjunction->m_alternatives[alt]; |
668 | unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, currentCallFrameSize: initialCallFrameSize, initialInputPosition); |
669 | minimumInputSize = min(a: minimumInputSize, b: alternative->m_minimumSize); |
670 | maximumCallFrameSize = max(a: maximumCallFrameSize, b: currentAlternativeCallFrameSize); |
671 | hasFixedSize &= alternative->m_hasFixedSize; |
672 | } |
673 | |
674 | ASSERT(minimumInputSize != UINT_MAX); |
675 | ASSERT(maximumCallFrameSize >= initialCallFrameSize); |
676 | |
677 | disjunction->m_hasFixedSize = hasFixedSize; |
678 | disjunction->m_minimumSize = minimumInputSize; |
679 | disjunction->m_callFrameSize = maximumCallFrameSize; |
680 | return maximumCallFrameSize; |
681 | } |
682 | |
683 | void setupOffsets() |
684 | { |
685 | setupDisjunctionOffsets(disjunction: m_pattern.m_body, initialCallFrameSize: 0, initialInputPosition: 0); |
686 | } |
687 | |
688 | private: |
689 | RegexPattern& m_pattern; |
690 | PatternAlternative* m_alternative; |
691 | CharacterClassConstructor m_characterClassConstructor; |
692 | bool m_invertCharacterClass; |
693 | }; |
694 | |
695 | |
696 | const char* compileRegex(const UString& patternString, RegexPattern& pattern) |
697 | { |
698 | RegexPatternConstructor constructor(pattern); |
699 | |
700 | if (const char* error = parse(delegate&: constructor, pattern: patternString)) |
701 | return error; |
702 | |
703 | // If the pattern contains illegal backreferences reset & reparse. |
704 | // Quoting Netscape's "What's new in JavaScript 1.2", |
705 | // "Note: if the number of left parentheses is less than the number specified |
706 | // in \#, the \# is taken as an octal escape as described in the next row." |
707 | if (pattern.containsIllegalBackReference()) { |
708 | unsigned numSubpatterns = pattern.m_numSubpatterns; |
709 | |
710 | constructor.reset(); |
711 | #if !ASSERT_DISABLED |
712 | const char* error = |
713 | #endif |
714 | parse(delegate&: constructor, pattern: patternString, backReferenceLimit: numSubpatterns); |
715 | |
716 | ASSERT(!error); |
717 | ASSERT(numSubpatterns == pattern.m_numSubpatterns); |
718 | } |
719 | |
720 | constructor.setupOffsets(); |
721 | |
722 | return NULL; |
723 | }; |
724 | |
725 | |
726 | } } |
727 | |
728 | #endif |
729 | |