1 | /* |
2 | * Copyright (C) 2009 Apple Inc. All rights reserved. |
3 | * |
4 | * Redistribution and use in source and binary forms, with or without |
5 | * modification, are permitted provided that the following conditions |
6 | * are met: |
7 | * 1. Redistributions of source code must retain the above copyright |
8 | * notice, this list of conditions and the following disclaimer. |
9 | * 2. Redistributions in binary form must reproduce the above copyright |
10 | * notice, this list of conditions and the following disclaimer in the |
11 | * documentation and/or other materials provided with the distribution. |
12 | * |
13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
24 | */ |
25 | |
26 | #include "config.h" |
27 | #include "RegexInterpreter.h" |
28 | |
29 | #include "RegexCompiler.h" |
30 | #include "RegexPattern.h" |
31 | |
32 | #ifndef NDEBUG |
33 | #include <stdio.h> |
34 | #endif |
35 | |
36 | #if ENABLE(YARR) |
37 | |
38 | using namespace WTF; |
39 | |
40 | namespace JSC { namespace Yarr { |
41 | |
42 | class Interpreter { |
43 | public: |
44 | struct ParenthesesDisjunctionContext; |
45 | |
46 | struct BackTrackInfoPatternCharacter { |
47 | uintptr_t matchAmount; |
48 | }; |
49 | struct BackTrackInfoCharacterClass { |
50 | uintptr_t matchAmount; |
51 | }; |
52 | struct BackTrackInfoBackReference { |
53 | uintptr_t begin; // Not really needed for greedy quantifiers. |
54 | uintptr_t matchAmount; // Not really needed for fixed quantifiers. |
55 | }; |
56 | struct BackTrackInfoAlternative { |
57 | uintptr_t offset; |
58 | }; |
59 | struct BackTrackInfoParentheticalAssertion { |
60 | uintptr_t begin; |
61 | }; |
62 | struct BackTrackInfoParenthesesOnce { |
63 | uintptr_t inParentheses; |
64 | }; |
65 | struct BackTrackInfoParentheses { |
66 | uintptr_t matchAmount; |
67 | ParenthesesDisjunctionContext* lastContext; |
68 | uintptr_t prevBegin; |
69 | uintptr_t prevEnd; |
70 | }; |
71 | |
72 | static inline void appendParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack, ParenthesesDisjunctionContext* context) |
73 | { |
74 | context->next = backTrack->lastContext; |
75 | backTrack->lastContext = context; |
76 | ++backTrack->matchAmount; |
77 | } |
78 | |
79 | static inline void popParenthesesDisjunctionContext(BackTrackInfoParentheses* backTrack) |
80 | { |
81 | ASSERT(backTrack->matchAmount); |
82 | ASSERT(backTrack->lastContext); |
83 | backTrack->lastContext = backTrack->lastContext->next; |
84 | --backTrack->matchAmount; |
85 | } |
86 | |
87 | struct DisjunctionContext |
88 | { |
89 | DisjunctionContext() |
90 | : term(0) |
91 | { |
92 | } |
93 | |
94 | void* operator new(size_t, void* where) |
95 | { |
96 | return where; |
97 | } |
98 | |
99 | int term; |
100 | unsigned matchBegin; |
101 | unsigned matchEnd; |
102 | uintptr_t frame[1]; |
103 | }; |
104 | |
105 | DisjunctionContext* allocDisjunctionContext(ByteDisjunction* disjunction) |
106 | { |
107 | return new(malloc(size: sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) DisjunctionContext(); |
108 | } |
109 | |
110 | void freeDisjunctionContext(DisjunctionContext* context) |
111 | { |
112 | free(ptr: context); |
113 | } |
114 | |
115 | struct ParenthesesDisjunctionContext |
116 | { |
117 | ParenthesesDisjunctionContext(int* output, ByteTerm& term) |
118 | : next(0) |
119 | { |
120 | unsigned firstSubpatternId = term.atom.subpatternId; |
121 | unsigned numNestedSubpatterns = term.atom.parenthesesDisjunction->m_numSubpatterns; |
122 | |
123 | for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) { |
124 | subpatternBackup[i] = output[(firstSubpatternId << 1) + i]; |
125 | output[(firstSubpatternId << 1) + i] = -1; |
126 | } |
127 | |
128 | new(getDisjunctionContext(term)) DisjunctionContext(); |
129 | } |
130 | |
131 | void* operator new(size_t, void* where) |
132 | { |
133 | return where; |
134 | } |
135 | |
136 | void restoreOutput(int* output, unsigned firstSubpatternId, unsigned numNestedSubpatterns) |
137 | { |
138 | for (unsigned i = 0; i < (numNestedSubpatterns << 1); ++i) |
139 | output[(firstSubpatternId << 1) + i] = subpatternBackup[i]; |
140 | } |
141 | |
142 | DisjunctionContext* getDisjunctionContext(ByteTerm& term) |
143 | { |
144 | return reinterpret_cast<DisjunctionContext*>(&(subpatternBackup[term.atom.parenthesesDisjunction->m_numSubpatterns << 1])); |
145 | } |
146 | |
147 | ParenthesesDisjunctionContext* next; |
148 | int subpatternBackup[1]; |
149 | }; |
150 | |
151 | ParenthesesDisjunctionContext* allocParenthesesDisjunctionContext(ByteDisjunction* disjunction, int* output, ByteTerm& term) |
152 | { |
153 | return new(malloc(size: sizeof(ParenthesesDisjunctionContext) + (((term.atom.parenthesesDisjunction->m_numSubpatterns << 1) - 1) * sizeof(int)) + sizeof(DisjunctionContext) + (disjunction->m_frameSize - 1) * sizeof(uintptr_t))) ParenthesesDisjunctionContext(output, term); |
154 | } |
155 | |
156 | void freeParenthesesDisjunctionContext(ParenthesesDisjunctionContext* context) |
157 | { |
158 | free(ptr: context); |
159 | } |
160 | |
161 | class InputStream { |
162 | public: |
163 | InputStream(const UChar* input, unsigned start, unsigned length) |
164 | : input(input) |
165 | , pos(start) |
166 | , length(length) |
167 | { |
168 | } |
169 | |
170 | void next() |
171 | { |
172 | ++pos; |
173 | } |
174 | |
175 | void rewind(unsigned amount) |
176 | { |
177 | ASSERT(pos >= amount); |
178 | pos -= amount; |
179 | } |
180 | |
181 | int read() |
182 | { |
183 | ASSERT(pos < length); |
184 | if (pos < length) |
185 | return input[pos]; |
186 | return -1; |
187 | } |
188 | |
189 | int readChecked(int position) |
190 | { |
191 | ASSERT(position < 0); |
192 | ASSERT((unsigned)-position <= pos); |
193 | unsigned p = pos + position; |
194 | ASSERT(p < length); |
195 | return input[p]; |
196 | } |
197 | |
198 | int reread(unsigned from) |
199 | { |
200 | ASSERT(from < length); |
201 | return input[from]; |
202 | } |
203 | |
204 | int prev() |
205 | { |
206 | ASSERT(!(pos > length)); |
207 | if (pos && length) |
208 | return input[pos - 1]; |
209 | return -1; |
210 | } |
211 | |
212 | unsigned getPos() |
213 | { |
214 | return pos; |
215 | } |
216 | |
217 | void setPos(unsigned p) |
218 | { |
219 | pos = p; |
220 | } |
221 | |
222 | bool atStart() |
223 | { |
224 | return pos == 0; |
225 | } |
226 | |
227 | bool atEnd() |
228 | { |
229 | return pos == length; |
230 | } |
231 | |
232 | bool checkInput(int count) |
233 | { |
234 | if ((pos + count) <= length) { |
235 | pos += count; |
236 | return true; |
237 | } else |
238 | return false; |
239 | } |
240 | |
241 | void uncheckInput(int count) |
242 | { |
243 | pos -= count; |
244 | } |
245 | |
246 | bool atStart(int position) |
247 | { |
248 | return (pos + position) == 0; |
249 | } |
250 | |
251 | bool atEnd(int position) |
252 | { |
253 | return (pos + position) == length; |
254 | } |
255 | |
256 | private: |
257 | const UChar* input; |
258 | unsigned pos; |
259 | unsigned length; |
260 | }; |
261 | |
262 | bool testCharacterClass(CharacterClass* characterClass, int ch) |
263 | { |
264 | if (ch & 0xFF80) { |
265 | for (unsigned i = 0; i < characterClass->m_matchesUnicode.size(); ++i) |
266 | if (ch == characterClass->m_matchesUnicode[i]) |
267 | return true; |
268 | for (unsigned i = 0; i < characterClass->m_rangesUnicode.size(); ++i) |
269 | if ((ch >= characterClass->m_rangesUnicode[i].begin) && (ch <= characterClass->m_rangesUnicode[i].end)) |
270 | return true; |
271 | } else { |
272 | for (unsigned i = 0; i < characterClass->m_matches.size(); ++i) |
273 | if (ch == characterClass->m_matches[i]) |
274 | return true; |
275 | for (unsigned i = 0; i < characterClass->m_ranges.size(); ++i) |
276 | if ((ch >= characterClass->m_ranges[i].begin) && (ch <= characterClass->m_ranges[i].end)) |
277 | return true; |
278 | } |
279 | |
280 | return false; |
281 | } |
282 | |
283 | bool tryConsumeCharacter(int testChar) |
284 | { |
285 | if (input.atEnd()) |
286 | return false; |
287 | |
288 | int ch = input.read(); |
289 | |
290 | if (pattern->m_ignoreCase ? ((Unicode::toLower(ch: testChar) == ch) || (Unicode::toUpper(ch: testChar) == ch)) : (testChar == ch)) { |
291 | input.next(); |
292 | return true; |
293 | } |
294 | return false; |
295 | } |
296 | |
297 | bool checkCharacter(int testChar, int inputPosition) |
298 | { |
299 | return testChar == input.readChecked(position: inputPosition); |
300 | } |
301 | |
302 | bool checkCasedCharacter(int loChar, int hiChar, int inputPosition) |
303 | { |
304 | int ch = input.readChecked(position: inputPosition); |
305 | return (loChar == ch) || (hiChar == ch); |
306 | } |
307 | |
308 | bool tryConsumeCharacterClass(CharacterClass* characterClass, bool invert) |
309 | { |
310 | if (input.atEnd()) |
311 | return false; |
312 | |
313 | bool match = testCharacterClass(characterClass, ch: input.read()); |
314 | |
315 | if (invert) |
316 | match = !match; |
317 | |
318 | if (match) { |
319 | input.next(); |
320 | return true; |
321 | } |
322 | return false; |
323 | } |
324 | |
325 | bool checkCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition) |
326 | { |
327 | bool match = testCharacterClass(characterClass, ch: input.readChecked(position: inputPosition)); |
328 | return invert ? !match : match; |
329 | } |
330 | |
331 | bool tryConsumeBackReference(int matchBegin, int matchEnd, int inputOffset) |
332 | { |
333 | int matchSize = matchEnd - matchBegin; |
334 | |
335 | if (!input.checkInput(count: matchSize)) |
336 | return false; |
337 | |
338 | for (int i = 0; i < matchSize; ++i) { |
339 | if (!checkCharacter(testChar: input.reread(from: matchBegin + i), inputPosition: inputOffset - matchSize + i)) { |
340 | input.uncheckInput(count: matchSize); |
341 | return false; |
342 | } |
343 | } |
344 | |
345 | return true; |
346 | } |
347 | |
348 | bool matchAssertionBOL(ByteTerm& term) |
349 | { |
350 | return (input.atStart(position: term.inputPosition)) || (pattern->m_multiline && testCharacterClass(characterClass: pattern->newlineCharacterClass, ch: input.readChecked(position: term.inputPosition - 1))); |
351 | } |
352 | |
353 | bool matchAssertionEOL(ByteTerm& term) |
354 | { |
355 | if (term.inputPosition) |
356 | return (input.atEnd(position: term.inputPosition)) || (pattern->m_multiline && testCharacterClass(characterClass: pattern->newlineCharacterClass, ch: input.readChecked(position: term.inputPosition))); |
357 | else |
358 | return (input.atEnd()) || (pattern->m_multiline && testCharacterClass(characterClass: pattern->newlineCharacterClass, ch: input.read())); |
359 | } |
360 | |
361 | bool matchAssertionWordBoundary(ByteTerm& term) |
362 | { |
363 | bool prevIsWordchar = !input.atStart(position: term.inputPosition) && testCharacterClass(characterClass: pattern->wordcharCharacterClass, ch: input.readChecked(position: term.inputPosition - 1)); |
364 | bool readIsWordchar; |
365 | if (term.inputPosition) |
366 | readIsWordchar = !input.atEnd(position: term.inputPosition) && testCharacterClass(characterClass: pattern->wordcharCharacterClass, ch: input.readChecked(position: term.inputPosition)); |
367 | else |
368 | readIsWordchar = !input.atEnd() && testCharacterClass(characterClass: pattern->wordcharCharacterClass, ch: input.read()); |
369 | |
370 | bool wordBoundary = prevIsWordchar != readIsWordchar; |
371 | return term.invert() ? !wordBoundary : wordBoundary; |
372 | } |
373 | |
374 | bool backtrackPatternCharacter(ByteTerm& term, DisjunctionContext* context) |
375 | { |
376 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); |
377 | |
378 | switch (term.atom.quantityType) { |
379 | case QuantifierFixedCount: |
380 | break; |
381 | |
382 | case QuantifierGreedy: |
383 | if (backTrack->matchAmount) { |
384 | --backTrack->matchAmount; |
385 | input.uncheckInput(count: 1); |
386 | return true; |
387 | } |
388 | break; |
389 | |
390 | case QuantifierNonGreedy: |
391 | if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(count: 1)) { |
392 | ++backTrack->matchAmount; |
393 | if (checkCharacter(testChar: term.atom.patternCharacter, inputPosition: term.inputPosition - 1)) |
394 | return true; |
395 | } |
396 | input.uncheckInput(count: backTrack->matchAmount); |
397 | break; |
398 | } |
399 | |
400 | return false; |
401 | } |
402 | |
403 | bool backtrackPatternCasedCharacter(ByteTerm& term, DisjunctionContext* context) |
404 | { |
405 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); |
406 | |
407 | switch (term.atom.quantityType) { |
408 | case QuantifierFixedCount: |
409 | break; |
410 | |
411 | case QuantifierGreedy: |
412 | if (backTrack->matchAmount) { |
413 | --backTrack->matchAmount; |
414 | input.uncheckInput(count: 1); |
415 | return true; |
416 | } |
417 | break; |
418 | |
419 | case QuantifierNonGreedy: |
420 | if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(count: 1)) { |
421 | ++backTrack->matchAmount; |
422 | if (checkCasedCharacter(loChar: term.atom.casedCharacter.lo, hiChar: term.atom.casedCharacter.hi, inputPosition: term.inputPosition - 1)) |
423 | return true; |
424 | } |
425 | input.uncheckInput(count: backTrack->matchAmount); |
426 | break; |
427 | } |
428 | |
429 | return false; |
430 | } |
431 | |
432 | bool matchCharacterClass(ByteTerm& term, DisjunctionContext* context) |
433 | { |
434 | ASSERT(term.type == ByteTerm::TypeCharacterClass); |
435 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); |
436 | |
437 | switch (term.atom.quantityType) { |
438 | case QuantifierFixedCount: { |
439 | for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { |
440 | if (!checkCharacterClass(characterClass: term.atom.characterClass, invert: term.invert(), inputPosition: term.inputPosition + matchAmount)) |
441 | return false; |
442 | } |
443 | return true; |
444 | } |
445 | |
446 | case QuantifierGreedy: { |
447 | unsigned matchAmount = 0; |
448 | while ((matchAmount < term.atom.quantityCount) && input.checkInput(count: 1)) { |
449 | if (!checkCharacterClass(characterClass: term.atom.characterClass, invert: term.invert(), inputPosition: term.inputPosition - 1)) { |
450 | input.uncheckInput(count: 1); |
451 | break; |
452 | } |
453 | ++matchAmount; |
454 | } |
455 | backTrack->matchAmount = matchAmount; |
456 | |
457 | return true; |
458 | } |
459 | |
460 | case QuantifierNonGreedy: |
461 | backTrack->matchAmount = 0; |
462 | return true; |
463 | } |
464 | |
465 | ASSERT_NOT_REACHED(); |
466 | return false; |
467 | } |
468 | |
469 | bool backtrackCharacterClass(ByteTerm& term, DisjunctionContext* context) |
470 | { |
471 | ASSERT(term.type == ByteTerm::TypeCharacterClass); |
472 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + term.frameLocation); |
473 | |
474 | switch (term.atom.quantityType) { |
475 | case QuantifierFixedCount: |
476 | break; |
477 | |
478 | case QuantifierGreedy: |
479 | if (backTrack->matchAmount) { |
480 | --backTrack->matchAmount; |
481 | input.uncheckInput(count: 1); |
482 | return true; |
483 | } |
484 | break; |
485 | |
486 | case QuantifierNonGreedy: |
487 | if ((backTrack->matchAmount < term.atom.quantityCount) && input.checkInput(count: 1)) { |
488 | ++backTrack->matchAmount; |
489 | if (checkCharacterClass(characterClass: term.atom.characterClass, invert: term.invert(), inputPosition: term.inputPosition - 1)) |
490 | return true; |
491 | } |
492 | input.uncheckInput(count: backTrack->matchAmount); |
493 | break; |
494 | } |
495 | |
496 | return false; |
497 | } |
498 | |
499 | bool matchBackReference(ByteTerm& term, DisjunctionContext* context) |
500 | { |
501 | ASSERT(term.type == ByteTerm::TypeBackReference); |
502 | BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); |
503 | |
504 | int matchBegin = output[(term.atom.subpatternId << 1)]; |
505 | int matchEnd = output[(term.atom.subpatternId << 1) + 1]; |
506 | ASSERT((matchBegin == -1) == (matchEnd == -1)); |
507 | ASSERT(matchBegin <= matchEnd); |
508 | |
509 | if (matchBegin == matchEnd) |
510 | return true; |
511 | |
512 | switch (term.atom.quantityType) { |
513 | case QuantifierFixedCount: { |
514 | backTrack->begin = input.getPos(); |
515 | for (unsigned matchAmount = 0; matchAmount < term.atom.quantityCount; ++matchAmount) { |
516 | if (!tryConsumeBackReference(matchBegin, matchEnd, inputOffset: term.inputPosition)) { |
517 | input.setPos(backTrack->begin); |
518 | return false; |
519 | } |
520 | } |
521 | return true; |
522 | } |
523 | |
524 | case QuantifierGreedy: { |
525 | unsigned matchAmount = 0; |
526 | while ((matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, inputOffset: term.inputPosition)) |
527 | ++matchAmount; |
528 | backTrack->matchAmount = matchAmount; |
529 | return true; |
530 | } |
531 | |
532 | case QuantifierNonGreedy: |
533 | backTrack->begin = input.getPos(); |
534 | backTrack->matchAmount = 0; |
535 | return true; |
536 | } |
537 | |
538 | ASSERT_NOT_REACHED(); |
539 | return false; |
540 | } |
541 | |
542 | bool backtrackBackReference(ByteTerm& term, DisjunctionContext* context) |
543 | { |
544 | ASSERT(term.type == ByteTerm::TypeBackReference); |
545 | BackTrackInfoBackReference* backTrack = reinterpret_cast<BackTrackInfoBackReference*>(context->frame + term.frameLocation); |
546 | |
547 | int matchBegin = output[(term.atom.subpatternId << 1)]; |
548 | int matchEnd = output[(term.atom.subpatternId << 1) + 1]; |
549 | ASSERT((matchBegin == -1) == (matchEnd == -1)); |
550 | ASSERT(matchBegin <= matchEnd); |
551 | |
552 | if (matchBegin == matchEnd) |
553 | return false; |
554 | |
555 | switch (term.atom.quantityType) { |
556 | case QuantifierFixedCount: |
557 | // for quantityCount == 1, could rewind. |
558 | input.setPos(backTrack->begin); |
559 | break; |
560 | |
561 | case QuantifierGreedy: |
562 | if (backTrack->matchAmount) { |
563 | --backTrack->matchAmount; |
564 | input.rewind(amount: matchEnd - matchBegin); |
565 | return true; |
566 | } |
567 | break; |
568 | |
569 | case QuantifierNonGreedy: |
570 | if ((backTrack->matchAmount < term.atom.quantityCount) && tryConsumeBackReference(matchBegin, matchEnd, inputOffset: term.inputPosition)) { |
571 | ++backTrack->matchAmount; |
572 | return true; |
573 | } else |
574 | input.setPos(backTrack->begin); |
575 | break; |
576 | } |
577 | |
578 | return false; |
579 | } |
580 | |
581 | void recordParenthesesMatch(ByteTerm& term, ParenthesesDisjunctionContext* context) |
582 | { |
583 | if (term.capture()) { |
584 | unsigned subpatternId = term.atom.subpatternId; |
585 | output[(subpatternId << 1)] = context->getDisjunctionContext(term)->matchBegin + term.inputPosition; |
586 | output[(subpatternId << 1) + 1] = context->getDisjunctionContext(term)->matchEnd + term.inputPosition; |
587 | } |
588 | } |
589 | void resetMatches(ByteTerm& term, ParenthesesDisjunctionContext* context) |
590 | { |
591 | unsigned firstSubpatternId = term.atom.subpatternId; |
592 | unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; |
593 | context->restoreOutput(output, firstSubpatternId, numNestedSubpatterns: count); |
594 | } |
595 | void resetAssertionMatches(ByteTerm& term) |
596 | { |
597 | unsigned firstSubpatternId = term.atom.subpatternId; |
598 | unsigned count = term.atom.parenthesesDisjunction->m_numSubpatterns; |
599 | for (unsigned i = 0; i < (count << 1); ++i) |
600 | output[(firstSubpatternId << 1) + i] = -1; |
601 | } |
602 | bool parenthesesDoBacktrack(ByteTerm& term, BackTrackInfoParentheses* backTrack) |
603 | { |
604 | while (backTrack->matchAmount) { |
605 | ParenthesesDisjunctionContext* context = backTrack->lastContext; |
606 | |
607 | if (matchDisjunction(disjunction: term.atom.parenthesesDisjunction, context: context->getDisjunctionContext(term), btrack: true)) |
608 | return true; |
609 | |
610 | resetMatches(term, context); |
611 | popParenthesesDisjunctionContext(backTrack); |
612 | freeParenthesesDisjunctionContext(context); |
613 | } |
614 | |
615 | return false; |
616 | } |
617 | |
618 | bool matchParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) |
619 | { |
620 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); |
621 | ASSERT(term.atom.quantityCount == 1); |
622 | |
623 | BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); |
624 | |
625 | switch (term.atom.quantityType) { |
626 | case QuantifierGreedy: { |
627 | // set this speculatively; if we get to the parens end this will be true. |
628 | backTrack->inParentheses = 1; |
629 | break; |
630 | } |
631 | case QuantifierNonGreedy: { |
632 | backTrack->inParentheses = 0; |
633 | context->term += term.atom.parenthesesWidth; |
634 | return true; |
635 | } |
636 | case QuantifierFixedCount: |
637 | break; |
638 | } |
639 | |
640 | if (term.capture()) { |
641 | unsigned subpatternId = term.atom.subpatternId; |
642 | output[(subpatternId << 1)] = input.getPos() + term.inputPosition; |
643 | } |
644 | |
645 | return true; |
646 | } |
647 | |
648 | bool matchParenthesesOnceEnd(ByteTerm& term, DisjunctionContext*) |
649 | { |
650 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); |
651 | ASSERT(term.atom.quantityCount == 1); |
652 | |
653 | if (term.capture()) { |
654 | unsigned subpatternId = term.atom.subpatternId; |
655 | output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; |
656 | } |
657 | return true; |
658 | } |
659 | |
660 | bool backtrackParenthesesOnceBegin(ByteTerm& term, DisjunctionContext* context) |
661 | { |
662 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); |
663 | ASSERT(term.atom.quantityCount == 1); |
664 | |
665 | BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); |
666 | |
667 | if (term.capture()) { |
668 | unsigned subpatternId = term.atom.subpatternId; |
669 | output[(subpatternId << 1)] = -1; |
670 | output[(subpatternId << 1) + 1] = -1; |
671 | } |
672 | |
673 | switch (term.atom.quantityType) { |
674 | case QuantifierGreedy: |
675 | // if we backtrack to this point, there is another chance - try matching nothing. |
676 | ASSERT(backTrack->inParentheses); |
677 | backTrack->inParentheses = 0; |
678 | context->term += term.atom.parenthesesWidth; |
679 | return true; |
680 | case QuantifierNonGreedy: |
681 | ASSERT(backTrack->inParentheses); |
682 | case QuantifierFixedCount: |
683 | break; |
684 | } |
685 | |
686 | return false; |
687 | } |
688 | |
689 | bool backtrackParenthesesOnceEnd(ByteTerm& term, DisjunctionContext* context) |
690 | { |
691 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpatternOnceEnd); |
692 | ASSERT(term.atom.quantityCount == 1); |
693 | |
694 | BackTrackInfoParenthesesOnce* backTrack = reinterpret_cast<BackTrackInfoParenthesesOnce*>(context->frame + term.frameLocation); |
695 | |
696 | switch (term.atom.quantityType) { |
697 | case QuantifierGreedy: |
698 | if (!backTrack->inParentheses) { |
699 | context->term -= term.atom.parenthesesWidth; |
700 | return false; |
701 | } |
702 | case QuantifierNonGreedy: |
703 | if (!backTrack->inParentheses) { |
704 | // now try to match the parens; set this speculatively. |
705 | backTrack->inParentheses = 1; |
706 | if (term.capture()) { |
707 | unsigned subpatternId = term.atom.subpatternId; |
708 | output[(subpatternId << 1) + 1] = input.getPos() + term.inputPosition; |
709 | } |
710 | context->term -= term.atom.parenthesesWidth; |
711 | return true; |
712 | } |
713 | case QuantifierFixedCount: |
714 | break; |
715 | } |
716 | |
717 | return false; |
718 | } |
719 | |
720 | bool matchParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) |
721 | { |
722 | ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); |
723 | ASSERT(term.atom.quantityCount == 1); |
724 | |
725 | BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); |
726 | |
727 | backTrack->begin = input.getPos(); |
728 | return true; |
729 | } |
730 | |
731 | bool matchParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) |
732 | { |
733 | ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); |
734 | ASSERT(term.atom.quantityCount == 1); |
735 | |
736 | BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); |
737 | |
738 | input.setPos(backTrack->begin); |
739 | |
740 | // We've reached the end of the parens; if they are inverted, this is failure. |
741 | if (term.invert()) { |
742 | context->term -= term.atom.parenthesesWidth; |
743 | return false; |
744 | } |
745 | |
746 | return true; |
747 | } |
748 | |
749 | bool backtrackParentheticalAssertionBegin(ByteTerm& term, DisjunctionContext* context) |
750 | { |
751 | ASSERT(term.type == ByteTerm::TypeParentheticalAssertionBegin); |
752 | ASSERT(term.atom.quantityCount == 1); |
753 | |
754 | // We've failed to match parens; if they are inverted, this is win! |
755 | if (term.invert()) { |
756 | context->term += term.atom.parenthesesWidth; |
757 | return true; |
758 | } |
759 | |
760 | return false; |
761 | } |
762 | |
763 | bool backtrackParentheticalAssertionEnd(ByteTerm& term, DisjunctionContext* context) |
764 | { |
765 | ASSERT(term.type == ByteTerm::TypeParentheticalAssertionEnd); |
766 | ASSERT(term.atom.quantityCount == 1); |
767 | |
768 | BackTrackInfoParentheticalAssertion* backTrack = reinterpret_cast<BackTrackInfoParentheticalAssertion*>(context->frame + term.frameLocation); |
769 | |
770 | input.setPos(backTrack->begin); |
771 | |
772 | context->term -= term.atom.parenthesesWidth; |
773 | return false; |
774 | } |
775 | |
776 | bool matchParentheses(ByteTerm& term, DisjunctionContext* context) |
777 | { |
778 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); |
779 | |
780 | BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); |
781 | |
782 | unsigned subpatternId = term.atom.subpatternId; |
783 | ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; |
784 | |
785 | backTrack->prevBegin = output[(subpatternId << 1)]; |
786 | backTrack->prevEnd = output[(subpatternId << 1) + 1]; |
787 | |
788 | backTrack->matchAmount = 0; |
789 | backTrack->lastContext = 0; |
790 | |
791 | switch (term.atom.quantityType) { |
792 | case QuantifierFixedCount: { |
793 | // While we haven't yet reached our fixed limit, |
794 | while (backTrack->matchAmount < term.atom.quantityCount) { |
795 | // Try to do a match, and it it succeeds, add it to the list. |
796 | ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term); |
797 | if (matchDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term))) |
798 | appendParenthesesDisjunctionContext(backTrack, context); |
799 | else { |
800 | // The match failed; try to find an alternate point to carry on from. |
801 | resetMatches(term, context); |
802 | freeParenthesesDisjunctionContext(context); |
803 | if (!parenthesesDoBacktrack(term, backTrack)) |
804 | return false; |
805 | } |
806 | } |
807 | |
808 | ASSERT(backTrack->matchAmount == term.atom.quantityCount); |
809 | ParenthesesDisjunctionContext* context = backTrack->lastContext; |
810 | recordParenthesesMatch(term, context); |
811 | return true; |
812 | } |
813 | |
814 | case QuantifierGreedy: { |
815 | while (backTrack->matchAmount < term.atom.quantityCount) { |
816 | ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term); |
817 | if (matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term))) |
818 | appendParenthesesDisjunctionContext(backTrack, context); |
819 | else { |
820 | resetMatches(term, context); |
821 | freeParenthesesDisjunctionContext(context); |
822 | break; |
823 | } |
824 | } |
825 | |
826 | if (backTrack->matchAmount) { |
827 | ParenthesesDisjunctionContext* context = backTrack->lastContext; |
828 | recordParenthesesMatch(term, context); |
829 | } |
830 | return true; |
831 | } |
832 | |
833 | case QuantifierNonGreedy: |
834 | return true; |
835 | } |
836 | |
837 | ASSERT_NOT_REACHED(); |
838 | return false; |
839 | } |
840 | |
841 | // Rules for backtracking differ depending on whether this is greedy or non-greedy. |
842 | // |
843 | // Greedy matches never should try just adding more - you should already have done |
844 | // the 'more' cases. Always backtrack, at least a leetle bit. However cases where |
845 | // you backtrack an item off the list needs checking, since we'll never have matched |
846 | // the one less case. Tracking forwards, still add as much as possible. |
847 | // |
848 | // Non-greedy, we've already done the one less case, so don't match on popping. |
849 | // We haven't done the one more case, so always try to add that. |
850 | // |
851 | bool backtrackParentheses(ByteTerm& term, DisjunctionContext* context) |
852 | { |
853 | ASSERT(term.type == ByteTerm::TypeParenthesesSubpattern); |
854 | |
855 | BackTrackInfoParentheses* backTrack = reinterpret_cast<BackTrackInfoParentheses*>(context->frame + term.frameLocation); |
856 | |
857 | if (term.capture()) { |
858 | unsigned subpatternId = term.atom.subpatternId; |
859 | output[(subpatternId << 1)] = backTrack->prevBegin; |
860 | output[(subpatternId << 1) + 1] = backTrack->prevEnd; |
861 | } |
862 | |
863 | ByteDisjunction* disjunctionBody = term.atom.parenthesesDisjunction; |
864 | |
865 | switch (term.atom.quantityType) { |
866 | case QuantifierFixedCount: { |
867 | ASSERT(backTrack->matchAmount == term.atom.quantityCount); |
868 | |
869 | ParenthesesDisjunctionContext* context = 0; |
870 | |
871 | if (!parenthesesDoBacktrack(term, backTrack)) |
872 | return false; |
873 | |
874 | // While we haven't yet reached our fixed limit, |
875 | while (backTrack->matchAmount < term.atom.quantityCount) { |
876 | // Try to do a match, and it it succeeds, add it to the list. |
877 | context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term); |
878 | if (matchDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term))) |
879 | appendParenthesesDisjunctionContext(backTrack, context); |
880 | else { |
881 | // The match failed; try to find an alternate point to carry on from. |
882 | resetMatches(term, context); |
883 | freeParenthesesDisjunctionContext(context); |
884 | if (!parenthesesDoBacktrack(term, backTrack)) |
885 | return false; |
886 | } |
887 | } |
888 | |
889 | ASSERT(backTrack->matchAmount == term.atom.quantityCount); |
890 | context = backTrack->lastContext; |
891 | recordParenthesesMatch(term, context); |
892 | return true; |
893 | } |
894 | |
895 | case QuantifierGreedy: { |
896 | if (!backTrack->matchAmount) |
897 | return false; |
898 | |
899 | ParenthesesDisjunctionContext* context = backTrack->lastContext; |
900 | if (matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term), btrack: true)) { |
901 | while (backTrack->matchAmount < term.atom.quantityCount) { |
902 | ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term); |
903 | if (matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term))) |
904 | appendParenthesesDisjunctionContext(backTrack, context); |
905 | else { |
906 | resetMatches(term, context); |
907 | freeParenthesesDisjunctionContext(context); |
908 | break; |
909 | } |
910 | } |
911 | } else { |
912 | resetMatches(term, context); |
913 | popParenthesesDisjunctionContext(backTrack); |
914 | freeParenthesesDisjunctionContext(context); |
915 | } |
916 | |
917 | if (backTrack->matchAmount) { |
918 | ParenthesesDisjunctionContext* context = backTrack->lastContext; |
919 | recordParenthesesMatch(term, context); |
920 | } |
921 | return true; |
922 | } |
923 | |
924 | case QuantifierNonGreedy: { |
925 | // If we've not reached the limit, try to add one more match. |
926 | if (backTrack->matchAmount < term.atom.quantityCount) { |
927 | ParenthesesDisjunctionContext* context = allocParenthesesDisjunctionContext(disjunction: disjunctionBody, output, term); |
928 | if (matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term))) { |
929 | appendParenthesesDisjunctionContext(backTrack, context); |
930 | recordParenthesesMatch(term, context); |
931 | return true; |
932 | } else { |
933 | resetMatches(term, context); |
934 | freeParenthesesDisjunctionContext(context); |
935 | } |
936 | } |
937 | |
938 | // Nope - okay backtrack looking for an alternative. |
939 | while (backTrack->matchAmount) { |
940 | ParenthesesDisjunctionContext* context = backTrack->lastContext; |
941 | if (matchNonZeroDisjunction(disjunction: disjunctionBody, context: context->getDisjunctionContext(term), btrack: true)) { |
942 | // successful backtrack! we're back in the game! |
943 | if (backTrack->matchAmount) { |
944 | context = backTrack->lastContext; |
945 | recordParenthesesMatch(term, context); |
946 | } |
947 | return true; |
948 | } |
949 | |
950 | // pop a match off the stack |
951 | resetMatches(term, context); |
952 | popParenthesesDisjunctionContext(backTrack); |
953 | freeParenthesesDisjunctionContext(context); |
954 | } |
955 | |
956 | return false; |
957 | } |
958 | } |
959 | |
960 | ASSERT_NOT_REACHED(); |
961 | return false; |
962 | } |
963 | |
964 | #define MATCH_NEXT() { ++context->term; goto matchAgain; } |
965 | #define BACKTRACK() { --context->term; goto backtrack; } |
966 | #define currentTerm() (disjunction->terms[context->term]) |
967 | bool matchDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) |
968 | { |
969 | if (btrack) |
970 | BACKTRACK(); |
971 | |
972 | context->matchBegin = input.getPos(); |
973 | context->term = 0; |
974 | |
975 | matchAgain: |
976 | ASSERT(context->term < static_cast<int>(disjunction->terms.size())); |
977 | |
978 | switch (currentTerm().type) { |
979 | case ByteTerm::TypeSubpatternBegin: |
980 | MATCH_NEXT(); |
981 | case ByteTerm::TypeSubpatternEnd: |
982 | context->matchEnd = input.getPos(); |
983 | return true; |
984 | |
985 | case ByteTerm::TypeBodyAlternativeBegin: |
986 | MATCH_NEXT(); |
987 | case ByteTerm::TypeBodyAlternativeDisjunction: |
988 | case ByteTerm::TypeBodyAlternativeEnd: |
989 | context->matchEnd = input.getPos(); |
990 | return true; |
991 | |
992 | case ByteTerm::TypeAlternativeBegin: |
993 | MATCH_NEXT(); |
994 | case ByteTerm::TypeAlternativeDisjunction: |
995 | case ByteTerm::TypeAlternativeEnd: { |
996 | int offset = currentTerm().alternative.end; |
997 | BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); |
998 | backTrack->offset = offset; |
999 | context->term += offset; |
1000 | MATCH_NEXT(); |
1001 | } |
1002 | |
1003 | case ByteTerm::TypeAssertionBOL: |
1004 | if (matchAssertionBOL(currentTerm())) |
1005 | MATCH_NEXT(); |
1006 | BACKTRACK(); |
1007 | case ByteTerm::TypeAssertionEOL: |
1008 | if (matchAssertionEOL(currentTerm())) |
1009 | MATCH_NEXT(); |
1010 | BACKTRACK(); |
1011 | case ByteTerm::TypeAssertionWordBoundary: |
1012 | if (matchAssertionWordBoundary(currentTerm())) |
1013 | MATCH_NEXT(); |
1014 | BACKTRACK(); |
1015 | |
1016 | case ByteTerm::TypePatternCharacterOnce: |
1017 | case ByteTerm::TypePatternCharacterFixed: { |
1018 | for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { |
1019 | if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition + matchAmount)) |
1020 | BACKTRACK(); |
1021 | } |
1022 | MATCH_NEXT(); |
1023 | } |
1024 | case ByteTerm::TypePatternCharacterGreedy: { |
1025 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); |
1026 | unsigned matchAmount = 0; |
1027 | while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(count: 1)) { |
1028 | if (!checkCharacter(currentTerm().atom.patternCharacter, currentTerm().inputPosition - 1)) { |
1029 | input.uncheckInput(count: 1); |
1030 | break; |
1031 | } |
1032 | ++matchAmount; |
1033 | } |
1034 | backTrack->matchAmount = matchAmount; |
1035 | |
1036 | MATCH_NEXT(); |
1037 | } |
1038 | case ByteTerm::TypePatternCharacterNonGreedy: { |
1039 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); |
1040 | backTrack->matchAmount = 0; |
1041 | MATCH_NEXT(); |
1042 | } |
1043 | |
1044 | case ByteTerm::TypePatternCasedCharacterOnce: |
1045 | case ByteTerm::TypePatternCasedCharacterFixed: { |
1046 | for (unsigned matchAmount = 0; matchAmount < currentTerm().atom.quantityCount; ++matchAmount) { |
1047 | if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition + matchAmount)) |
1048 | BACKTRACK(); |
1049 | } |
1050 | MATCH_NEXT(); |
1051 | } |
1052 | case ByteTerm::TypePatternCasedCharacterGreedy: { |
1053 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); |
1054 | unsigned matchAmount = 0; |
1055 | while ((matchAmount < currentTerm().atom.quantityCount) && input.checkInput(count: 1)) { |
1056 | if (!checkCasedCharacter(currentTerm().atom.casedCharacter.lo, currentTerm().atom.casedCharacter.hi, currentTerm().inputPosition - 1)) { |
1057 | input.uncheckInput(count: 1); |
1058 | break; |
1059 | } |
1060 | ++matchAmount; |
1061 | } |
1062 | backTrack->matchAmount = matchAmount; |
1063 | |
1064 | MATCH_NEXT(); |
1065 | } |
1066 | case ByteTerm::TypePatternCasedCharacterNonGreedy: { |
1067 | BackTrackInfoPatternCharacter* backTrack = reinterpret_cast<BackTrackInfoPatternCharacter*>(context->frame + currentTerm().frameLocation); |
1068 | backTrack->matchAmount = 0; |
1069 | MATCH_NEXT(); |
1070 | } |
1071 | |
1072 | case ByteTerm::TypeCharacterClass: |
1073 | if (matchCharacterClass(currentTerm(), context)) |
1074 | MATCH_NEXT(); |
1075 | BACKTRACK(); |
1076 | case ByteTerm::TypeBackReference: |
1077 | if (matchBackReference(currentTerm(), context)) |
1078 | MATCH_NEXT(); |
1079 | BACKTRACK(); |
1080 | case ByteTerm::TypeParenthesesSubpattern: |
1081 | if (matchParentheses(currentTerm(), context)) |
1082 | MATCH_NEXT(); |
1083 | BACKTRACK(); |
1084 | case ByteTerm::TypeParenthesesSubpatternOnceBegin: |
1085 | if (matchParenthesesOnceBegin(currentTerm(), context)) |
1086 | MATCH_NEXT(); |
1087 | BACKTRACK(); |
1088 | case ByteTerm::TypeParenthesesSubpatternOnceEnd: |
1089 | if (matchParenthesesOnceEnd(currentTerm(), context)) |
1090 | MATCH_NEXT(); |
1091 | BACKTRACK(); |
1092 | case ByteTerm::TypeParentheticalAssertionBegin: |
1093 | if (matchParentheticalAssertionBegin(currentTerm(), context)) |
1094 | MATCH_NEXT(); |
1095 | BACKTRACK(); |
1096 | case ByteTerm::TypeParentheticalAssertionEnd: |
1097 | if (matchParentheticalAssertionEnd(currentTerm(), context)) |
1098 | MATCH_NEXT(); |
1099 | BACKTRACK(); |
1100 | |
1101 | case ByteTerm::TypeCheckInput: |
1102 | if (input.checkInput(currentTerm().checkInputCount)) |
1103 | MATCH_NEXT(); |
1104 | BACKTRACK(); |
1105 | } |
1106 | |
1107 | // We should never fall-through to here. |
1108 | ASSERT_NOT_REACHED(); |
1109 | |
1110 | backtrack: |
1111 | ASSERT(context->term < static_cast<int>(disjunction->terms.size())); |
1112 | |
1113 | switch (currentTerm().type) { |
1114 | case ByteTerm::TypeSubpatternBegin: |
1115 | return false; |
1116 | case ByteTerm::TypeSubpatternEnd: |
1117 | ASSERT_NOT_REACHED(); |
1118 | |
1119 | case ByteTerm::TypeBodyAlternativeBegin: |
1120 | case ByteTerm::TypeBodyAlternativeDisjunction: { |
1121 | int offset = currentTerm().alternative.next; |
1122 | context->term += offset; |
1123 | if (offset > 0) |
1124 | MATCH_NEXT(); |
1125 | |
1126 | if (input.atEnd()) |
1127 | return false; |
1128 | |
1129 | input.next(); |
1130 | context->matchBegin = input.getPos(); |
1131 | MATCH_NEXT(); |
1132 | } |
1133 | case ByteTerm::TypeBodyAlternativeEnd: |
1134 | ASSERT_NOT_REACHED(); |
1135 | |
1136 | case ByteTerm::TypeAlternativeBegin: |
1137 | case ByteTerm::TypeAlternativeDisjunction: { |
1138 | int offset = currentTerm().alternative.next; |
1139 | context->term += offset; |
1140 | if (offset > 0) |
1141 | MATCH_NEXT(); |
1142 | BACKTRACK(); |
1143 | } |
1144 | case ByteTerm::TypeAlternativeEnd: { |
1145 | // We should never backtrack back into an alternative of the main body of the regex. |
1146 | BackTrackInfoAlternative* backTrack = reinterpret_cast<BackTrackInfoAlternative*>(context->frame + currentTerm().frameLocation); |
1147 | unsigned offset = backTrack->offset; |
1148 | context->term -= offset; |
1149 | BACKTRACK(); |
1150 | } |
1151 | |
1152 | case ByteTerm::TypeAssertionBOL: |
1153 | case ByteTerm::TypeAssertionEOL: |
1154 | case ByteTerm::TypeAssertionWordBoundary: |
1155 | BACKTRACK(); |
1156 | |
1157 | case ByteTerm::TypePatternCharacterOnce: |
1158 | case ByteTerm::TypePatternCharacterFixed: |
1159 | case ByteTerm::TypePatternCharacterGreedy: |
1160 | case ByteTerm::TypePatternCharacterNonGreedy: |
1161 | if (backtrackPatternCharacter(currentTerm(), context)) |
1162 | MATCH_NEXT(); |
1163 | BACKTRACK(); |
1164 | case ByteTerm::TypePatternCasedCharacterOnce: |
1165 | case ByteTerm::TypePatternCasedCharacterFixed: |
1166 | case ByteTerm::TypePatternCasedCharacterGreedy: |
1167 | case ByteTerm::TypePatternCasedCharacterNonGreedy: |
1168 | if (backtrackPatternCasedCharacter(currentTerm(), context)) |
1169 | MATCH_NEXT(); |
1170 | BACKTRACK(); |
1171 | case ByteTerm::TypeCharacterClass: |
1172 | if (backtrackCharacterClass(currentTerm(), context)) |
1173 | MATCH_NEXT(); |
1174 | BACKTRACK(); |
1175 | case ByteTerm::TypeBackReference: |
1176 | if (backtrackBackReference(currentTerm(), context)) |
1177 | MATCH_NEXT(); |
1178 | BACKTRACK(); |
1179 | case ByteTerm::TypeParenthesesSubpattern: |
1180 | if (backtrackParentheses(currentTerm(), context)) |
1181 | MATCH_NEXT(); |
1182 | BACKTRACK(); |
1183 | case ByteTerm::TypeParenthesesSubpatternOnceBegin: |
1184 | if (backtrackParenthesesOnceBegin(currentTerm(), context)) |
1185 | MATCH_NEXT(); |
1186 | BACKTRACK(); |
1187 | case ByteTerm::TypeParenthesesSubpatternOnceEnd: |
1188 | if (backtrackParenthesesOnceEnd(currentTerm(), context)) |
1189 | MATCH_NEXT(); |
1190 | BACKTRACK(); |
1191 | case ByteTerm::TypeParentheticalAssertionBegin: |
1192 | if (backtrackParentheticalAssertionBegin(currentTerm(), context)) |
1193 | MATCH_NEXT(); |
1194 | BACKTRACK(); |
1195 | case ByteTerm::TypeParentheticalAssertionEnd: |
1196 | if (backtrackParentheticalAssertionEnd(currentTerm(), context)) |
1197 | MATCH_NEXT(); |
1198 | BACKTRACK(); |
1199 | |
1200 | case ByteTerm::TypeCheckInput: |
1201 | input.uncheckInput(currentTerm().checkInputCount); |
1202 | BACKTRACK(); |
1203 | } |
1204 | |
1205 | ASSERT_NOT_REACHED(); |
1206 | return false; |
1207 | } |
1208 | |
1209 | bool matchNonZeroDisjunction(ByteDisjunction* disjunction, DisjunctionContext* context, bool btrack = false) |
1210 | { |
1211 | if (matchDisjunction(disjunction, context, btrack)) { |
1212 | while (context->matchBegin == context->matchEnd) { |
1213 | if (!matchDisjunction(disjunction, context, btrack: true)) |
1214 | return false; |
1215 | } |
1216 | return true; |
1217 | } |
1218 | |
1219 | return false; |
1220 | } |
1221 | |
1222 | int interpret() |
1223 | { |
1224 | for (unsigned i = 0; i < ((pattern->m_body->m_numSubpatterns + 1) << 1); ++i) |
1225 | output[i] = -1; |
1226 | |
1227 | DisjunctionContext* context = allocDisjunctionContext(disjunction: pattern->m_body.get()); |
1228 | |
1229 | if (matchDisjunction(disjunction: pattern->m_body.get(), context)) { |
1230 | output[0] = context->matchBegin; |
1231 | output[1] = context->matchEnd; |
1232 | } |
1233 | |
1234 | freeDisjunctionContext(context); |
1235 | |
1236 | return output[0]; |
1237 | } |
1238 | |
1239 | Interpreter(BytecodePattern* pattern, int* output, const UChar* inputChar, unsigned start, unsigned length) |
1240 | : pattern(pattern) |
1241 | , output(output) |
1242 | , input(inputChar, start, length) |
1243 | { |
1244 | } |
1245 | |
1246 | private: |
1247 | BytecodePattern *pattern; |
1248 | int* output; |
1249 | InputStream input; |
1250 | }; |
1251 | |
1252 | |
1253 | |
1254 | class ByteCompiler { |
1255 | struct ParenthesesStackEntry { |
1256 | unsigned beginTerm; |
1257 | unsigned savedAlternativeIndex; |
1258 | ParenthesesStackEntry(unsigned beginTerm, unsigned savedAlternativeIndex/*, unsigned subpatternId, bool capture = false*/) |
1259 | : beginTerm(beginTerm) |
1260 | , savedAlternativeIndex(savedAlternativeIndex) |
1261 | { |
1262 | } |
1263 | }; |
1264 | |
1265 | public: |
1266 | ByteCompiler(RegexPattern& pattern) |
1267 | : m_pattern(pattern) |
1268 | { |
1269 | m_bodyDisjunction = 0; |
1270 | m_currentAlternativeIndex = 0; |
1271 | } |
1272 | |
1273 | BytecodePattern* compile() |
1274 | { |
1275 | regexBegin(numSubpatterns: m_pattern.m_numSubpatterns, callFrameSize: m_pattern.m_body->m_callFrameSize); |
1276 | emitDisjunction(disjunction: m_pattern.m_body); |
1277 | regexEnd(); |
1278 | |
1279 | return new BytecodePattern(m_bodyDisjunction, m_allParenthesesInfo, m_pattern); |
1280 | } |
1281 | |
1282 | void checkInput(unsigned count) |
1283 | { |
1284 | m_bodyDisjunction->terms.append(val: ByteTerm::CheckInput(count)); |
1285 | } |
1286 | |
1287 | void assertionBOL(int inputPosition) |
1288 | { |
1289 | m_bodyDisjunction->terms.append(val: ByteTerm::BOL(inputPos: inputPosition)); |
1290 | } |
1291 | |
1292 | void assertionEOL(int inputPosition) |
1293 | { |
1294 | m_bodyDisjunction->terms.append(val: ByteTerm::EOL(inputPos: inputPosition)); |
1295 | } |
1296 | |
1297 | void assertionWordBoundary(bool invert, int inputPosition) |
1298 | { |
1299 | m_bodyDisjunction->terms.append(val: ByteTerm::WordBoundary(invert, inputPos: inputPosition)); |
1300 | } |
1301 | |
1302 | void atomPatternCharacter(UChar ch, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) |
1303 | { |
1304 | if (m_pattern.m_ignoreCase) { |
1305 | UChar lo = Unicode::toLower(ch); |
1306 | UChar hi = Unicode::toUpper(ch); |
1307 | |
1308 | if (lo != hi) { |
1309 | m_bodyDisjunction->terms.append(val: ByteTerm(lo, hi, inputPosition, frameLocation, quantityCount, quantityType)); |
1310 | return; |
1311 | } |
1312 | } |
1313 | |
1314 | m_bodyDisjunction->terms.append(val: ByteTerm(ch, inputPosition, frameLocation, quantityCount, quantityType)); |
1315 | } |
1316 | |
1317 | void atomCharacterClass(CharacterClass* characterClass, bool invert, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) |
1318 | { |
1319 | m_bodyDisjunction->terms.append(val: ByteTerm(characterClass, invert, inputPosition)); |
1320 | |
1321 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; |
1322 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; |
1323 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; |
1324 | } |
1325 | |
1326 | void atomBackReference(unsigned subpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType) |
1327 | { |
1328 | ASSERT(subpatternId); |
1329 | |
1330 | m_bodyDisjunction->terms.append(val: ByteTerm::BackReference(subpatternId, inputPos: inputPosition)); |
1331 | |
1332 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityCount = quantityCount; |
1333 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].atom.quantityType = quantityType; |
1334 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; |
1335 | } |
1336 | |
1337 | void atomParenthesesSubpatternBegin(unsigned subpatternId, bool capture, int inputPosition, unsigned frameLocation, unsigned alternativeFrameLocation) |
1338 | { |
1339 | int beginTerm = m_bodyDisjunction->terms.size(); |
1340 | |
1341 | m_bodyDisjunction->terms.append(val: ByteTerm(ByteTerm::TypeParenthesesSubpatternOnceBegin, subpatternId, capture, inputPosition)); |
1342 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; |
1343 | m_bodyDisjunction->terms.append(val: ByteTerm::AlternativeBegin()); |
1344 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; |
1345 | |
1346 | m_parenthesesStack.append(val: ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); |
1347 | m_currentAlternativeIndex = beginTerm + 1; |
1348 | } |
1349 | |
1350 | void atomParentheticalAssertionBegin(unsigned subpatternId, bool invert, unsigned frameLocation, unsigned alternativeFrameLocation) |
1351 | { |
1352 | int beginTerm = m_bodyDisjunction->terms.size(); |
1353 | |
1354 | m_bodyDisjunction->terms.append(val: ByteTerm(ByteTerm::TypeParentheticalAssertionBegin, subpatternId, invert, 0)); |
1355 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = frameLocation; |
1356 | m_bodyDisjunction->terms.append(val: ByteTerm::AlternativeBegin()); |
1357 | m_bodyDisjunction->terms[m_bodyDisjunction->terms.size() - 1].frameLocation = alternativeFrameLocation; |
1358 | |
1359 | m_parenthesesStack.append(val: ParenthesesStackEntry(beginTerm, m_currentAlternativeIndex)); |
1360 | m_currentAlternativeIndex = beginTerm + 1; |
1361 | } |
1362 | |
1363 | unsigned popParenthesesStack() |
1364 | { |
1365 | ASSERT(m_parenthesesStack.size()); |
1366 | int stackEnd = m_parenthesesStack.size() - 1; |
1367 | unsigned beginTerm = m_parenthesesStack[stackEnd].beginTerm; |
1368 | m_currentAlternativeIndex = m_parenthesesStack[stackEnd].savedAlternativeIndex; |
1369 | m_parenthesesStack.shrink(size: stackEnd); |
1370 | |
1371 | ASSERT(beginTerm < m_bodyDisjunction->terms.size()); |
1372 | ASSERT(m_currentAlternativeIndex < m_bodyDisjunction->terms.size()); |
1373 | |
1374 | return beginTerm; |
1375 | } |
1376 | |
1377 | #ifndef NDEBUG |
1378 | void dumpDisjunction(ByteDisjunction* disjunction) |
1379 | { |
1380 | printf(format: "ByteDisjunction(%p):\n\t" , disjunction); |
1381 | for (unsigned i = 0; i < disjunction->terms.size(); ++i) |
1382 | printf(format: "{ %d } " , disjunction->terms[i].type); |
1383 | printf(format: "\n" ); |
1384 | } |
1385 | #endif |
1386 | |
1387 | void closeAlternative(int beginTerm) |
1388 | { |
1389 | int origBeginTerm = beginTerm; |
1390 | ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeBegin); |
1391 | int endIndex = m_bodyDisjunction->terms.size(); |
1392 | |
1393 | unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; |
1394 | |
1395 | if (!m_bodyDisjunction->terms[beginTerm].alternative.next) |
1396 | m_bodyDisjunction->terms.remove(position: beginTerm); |
1397 | else { |
1398 | while (m_bodyDisjunction->terms[beginTerm].alternative.next) { |
1399 | beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; |
1400 | ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeAlternativeDisjunction); |
1401 | m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; |
1402 | m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; |
1403 | } |
1404 | |
1405 | m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; |
1406 | |
1407 | m_bodyDisjunction->terms.append(val: ByteTerm::AlternativeEnd()); |
1408 | m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; |
1409 | } |
1410 | } |
1411 | |
1412 | void closeBodyAlternative() |
1413 | { |
1414 | int beginTerm = 0; |
1415 | int origBeginTerm = 0; |
1416 | ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeBegin); |
1417 | int endIndex = m_bodyDisjunction->terms.size(); |
1418 | |
1419 | unsigned frameLocation = m_bodyDisjunction->terms[beginTerm].frameLocation; |
1420 | |
1421 | while (m_bodyDisjunction->terms[beginTerm].alternative.next) { |
1422 | beginTerm += m_bodyDisjunction->terms[beginTerm].alternative.next; |
1423 | ASSERT(m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeBodyAlternativeDisjunction); |
1424 | m_bodyDisjunction->terms[beginTerm].alternative.end = endIndex - beginTerm; |
1425 | m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; |
1426 | } |
1427 | |
1428 | m_bodyDisjunction->terms[beginTerm].alternative.next = origBeginTerm - beginTerm; |
1429 | |
1430 | m_bodyDisjunction->terms.append(val: ByteTerm::BodyAlternativeEnd()); |
1431 | m_bodyDisjunction->terms[endIndex].frameLocation = frameLocation; |
1432 | } |
1433 | |
1434 | void atomParenthesesEnd(bool doInline, unsigned lastSubpatternId, int inputPosition, unsigned frameLocation, unsigned quantityCount, QuantifierType quantityType, unsigned callFrameSize = 0) |
1435 | { |
1436 | unsigned beginTerm = popParenthesesStack(); |
1437 | closeAlternative(beginTerm: beginTerm + 1); |
1438 | unsigned endTerm = m_bodyDisjunction->terms.size(); |
1439 | |
1440 | bool isAssertion = m_bodyDisjunction->terms[beginTerm].type == ByteTerm::TypeParentheticalAssertionBegin; |
1441 | bool invertOrCapture = m_bodyDisjunction->terms[beginTerm].invertOrCapture; |
1442 | unsigned subpatternId = m_bodyDisjunction->terms[beginTerm].atom.subpatternId; |
1443 | |
1444 | m_bodyDisjunction->terms.append(val: ByteTerm(isAssertion ? ByteTerm::TypeParentheticalAssertionEnd : ByteTerm::TypeParenthesesSubpatternOnceEnd, subpatternId, invertOrCapture, inputPosition)); |
1445 | m_bodyDisjunction->terms[beginTerm].atom.parenthesesWidth = endTerm - beginTerm; |
1446 | m_bodyDisjunction->terms[endTerm].atom.parenthesesWidth = endTerm - beginTerm; |
1447 | m_bodyDisjunction->terms[endTerm].frameLocation = frameLocation; |
1448 | |
1449 | if (doInline) { |
1450 | m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; |
1451 | m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; |
1452 | m_bodyDisjunction->terms[endTerm].atom.quantityCount = quantityCount; |
1453 | m_bodyDisjunction->terms[endTerm].atom.quantityType = quantityType; |
1454 | } else { |
1455 | ByteTerm& parenthesesBegin = m_bodyDisjunction->terms[beginTerm]; |
1456 | ASSERT(parenthesesBegin.type == ByteTerm::TypeParenthesesSubpatternOnceBegin); |
1457 | |
1458 | bool invertOrCapture = parenthesesBegin.invertOrCapture; |
1459 | unsigned subpatternId = parenthesesBegin.atom.subpatternId; |
1460 | |
1461 | unsigned numSubpatterns = lastSubpatternId - subpatternId + 1; |
1462 | ByteDisjunction* parenthesesDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); |
1463 | |
1464 | parenthesesDisjunction->terms.append(val: ByteTerm::SubpatternBegin()); |
1465 | for (unsigned termInParentheses = beginTerm + 1; termInParentheses < endTerm; ++termInParentheses) |
1466 | parenthesesDisjunction->terms.append(val: m_bodyDisjunction->terms[termInParentheses]); |
1467 | parenthesesDisjunction->terms.append(val: ByteTerm::SubpatternEnd()); |
1468 | |
1469 | m_bodyDisjunction->terms.shrink(size: beginTerm); |
1470 | |
1471 | m_allParenthesesInfo.append(val: parenthesesDisjunction); |
1472 | m_bodyDisjunction->terms.append(val: ByteTerm(ByteTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, invertOrCapture, inputPosition)); |
1473 | |
1474 | m_bodyDisjunction->terms[beginTerm].atom.quantityCount = quantityCount; |
1475 | m_bodyDisjunction->terms[beginTerm].atom.quantityType = quantityType; |
1476 | m_bodyDisjunction->terms[beginTerm].frameLocation = frameLocation; |
1477 | } |
1478 | } |
1479 | |
1480 | void regexBegin(unsigned numSubpatterns, unsigned callFrameSize) |
1481 | { |
1482 | m_bodyDisjunction = new ByteDisjunction(numSubpatterns, callFrameSize); |
1483 | m_bodyDisjunction->terms.append(val: ByteTerm::BodyAlternativeBegin()); |
1484 | m_bodyDisjunction->terms[0].frameLocation = 0; |
1485 | m_currentAlternativeIndex = 0; |
1486 | } |
1487 | |
1488 | void regexEnd() |
1489 | { |
1490 | closeBodyAlternative(); |
1491 | } |
1492 | |
1493 | void alternativeBodyDisjunction() |
1494 | { |
1495 | int newAlternativeIndex = m_bodyDisjunction->terms.size(); |
1496 | m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; |
1497 | m_bodyDisjunction->terms.append(val: ByteTerm::BodyAlternativeDisjunction()); |
1498 | |
1499 | m_currentAlternativeIndex = newAlternativeIndex; |
1500 | } |
1501 | |
1502 | void alternativeDisjunction() |
1503 | { |
1504 | int newAlternativeIndex = m_bodyDisjunction->terms.size(); |
1505 | m_bodyDisjunction->terms[m_currentAlternativeIndex].alternative.next = newAlternativeIndex - m_currentAlternativeIndex; |
1506 | m_bodyDisjunction->terms.append(val: ByteTerm::AlternativeDisjunction()); |
1507 | |
1508 | m_currentAlternativeIndex = newAlternativeIndex; |
1509 | } |
1510 | |
1511 | void emitDisjunction(PatternDisjunction* disjunction, unsigned inputCountAlreadyChecked = 0, unsigned parenthesesInputCountAlreadyChecked = 0) |
1512 | { |
1513 | for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { |
1514 | unsigned currentCountAlreadyChecked = inputCountAlreadyChecked; |
1515 | |
1516 | if (alt) { |
1517 | if (disjunction == m_pattern.m_body) |
1518 | alternativeBodyDisjunction(); |
1519 | else |
1520 | alternativeDisjunction(); |
1521 | } |
1522 | |
1523 | PatternAlternative* alternative = disjunction->m_alternatives[alt]; |
1524 | unsigned minimumSize = alternative->m_minimumSize; |
1525 | |
1526 | ASSERT(minimumSize >= parenthesesInputCountAlreadyChecked); |
1527 | unsigned countToCheck = minimumSize - parenthesesInputCountAlreadyChecked; |
1528 | if (countToCheck) |
1529 | checkInput(count: countToCheck); |
1530 | currentCountAlreadyChecked += countToCheck; |
1531 | |
1532 | for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { |
1533 | PatternTerm& term = alternative->m_terms[i]; |
1534 | |
1535 | switch (term.type) { |
1536 | case PatternTerm::TypeAssertionBOL: |
1537 | assertionBOL(inputPosition: term.inputPosition - currentCountAlreadyChecked); |
1538 | break; |
1539 | |
1540 | case PatternTerm::TypeAssertionEOL: |
1541 | assertionEOL(inputPosition: term.inputPosition - currentCountAlreadyChecked); |
1542 | break; |
1543 | |
1544 | case PatternTerm::TypeAssertionWordBoundary: |
1545 | assertionWordBoundary(invert: term.invertOrCapture, inputPosition: term.inputPosition - currentCountAlreadyChecked); |
1546 | break; |
1547 | |
1548 | case PatternTerm::TypePatternCharacter: |
1549 | atomPatternCharacter(ch: term.patternCharacter, inputPosition: term.inputPosition - currentCountAlreadyChecked, frameLocation: term.frameLocation, quantityCount: term.quantityCount, quantityType: term.quantityType); |
1550 | break; |
1551 | |
1552 | case PatternTerm::TypeCharacterClass: |
1553 | atomCharacterClass(characterClass: term.characterClass, invert: term.invertOrCapture, inputPosition: term.inputPosition - currentCountAlreadyChecked, frameLocation: term.frameLocation, quantityCount: term.quantityCount, quantityType: term.quantityType); |
1554 | break; |
1555 | |
1556 | case PatternTerm::TypeBackReference: |
1557 | atomBackReference(subpatternId: term.subpatternId, inputPosition: term.inputPosition - currentCountAlreadyChecked, frameLocation: term.frameLocation, quantityCount: term.quantityCount, quantityType: term.quantityType); |
1558 | break; |
1559 | |
1560 | case PatternTerm::TypeForwardReference: |
1561 | break; |
1562 | |
1563 | case PatternTerm::TypeParenthesesSubpattern: { |
1564 | unsigned disjunctionAlreadyCheckedCount = 0; |
1565 | if ((term.quantityCount == 1) && !term.parentheses.isCopy) { |
1566 | if (term.quantityType == QuantifierFixedCount) { |
1567 | disjunctionAlreadyCheckedCount = term.parentheses.disjunction->m_minimumSize; |
1568 | unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; |
1569 | atomParenthesesSubpatternBegin(subpatternId: term.parentheses.subpatternId, capture: term.invertOrCapture, inputPosition: delegateEndInputOffset - disjunctionAlreadyCheckedCount, frameLocation: term.frameLocation, alternativeFrameLocation: term.frameLocation); |
1570 | emitDisjunction(disjunction: term.parentheses.disjunction, inputCountAlreadyChecked: currentCountAlreadyChecked, parenthesesInputCountAlreadyChecked: term.parentheses.disjunction->m_minimumSize); |
1571 | atomParenthesesEnd(doInline: true, lastSubpatternId: term.parentheses.lastSubpatternId, inputPosition: delegateEndInputOffset, frameLocation: term.frameLocation, quantityCount: term.quantityCount, quantityType: term.quantityType, callFrameSize: term.parentheses.disjunction->m_callFrameSize); |
1572 | } else { |
1573 | unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; |
1574 | atomParenthesesSubpatternBegin(subpatternId: term.parentheses.subpatternId, capture: term.invertOrCapture, inputPosition: delegateEndInputOffset - disjunctionAlreadyCheckedCount, frameLocation: term.frameLocation, alternativeFrameLocation: term.frameLocation + RegexStackSpaceForBackTrackInfoParenthesesOnce); |
1575 | emitDisjunction(disjunction: term.parentheses.disjunction, inputCountAlreadyChecked: currentCountAlreadyChecked, parenthesesInputCountAlreadyChecked: 0); |
1576 | atomParenthesesEnd(doInline: true, lastSubpatternId: term.parentheses.lastSubpatternId, inputPosition: delegateEndInputOffset, frameLocation: term.frameLocation, quantityCount: term.quantityCount, quantityType: term.quantityType, callFrameSize: term.parentheses.disjunction->m_callFrameSize); |
1577 | } |
1578 | } else { |
1579 | unsigned delegateEndInputOffset = term.inputPosition - currentCountAlreadyChecked; |
1580 | atomParenthesesSubpatternBegin(subpatternId: term.parentheses.subpatternId, capture: term.invertOrCapture, inputPosition: delegateEndInputOffset - disjunctionAlreadyCheckedCount, frameLocation: term.frameLocation, alternativeFrameLocation: 0); |
1581 | emitDisjunction(disjunction: term.parentheses.disjunction, inputCountAlreadyChecked: currentCountAlreadyChecked, parenthesesInputCountAlreadyChecked: 0); |
1582 | atomParenthesesEnd(doInline: false, lastSubpatternId: term.parentheses.lastSubpatternId, inputPosition: delegateEndInputOffset, frameLocation: term.frameLocation, quantityCount: term.quantityCount, quantityType: term.quantityType, callFrameSize: term.parentheses.disjunction->m_callFrameSize); |
1583 | } |
1584 | break; |
1585 | } |
1586 | |
1587 | case PatternTerm::TypeParentheticalAssertion: { |
1588 | unsigned alternativeFrameLocation = term.inputPosition + RegexStackSpaceForBackTrackInfoParentheticalAssertion; |
1589 | |
1590 | atomParentheticalAssertionBegin(subpatternId: term.parentheses.subpatternId, invert: term.invertOrCapture, frameLocation: term.frameLocation, alternativeFrameLocation); |
1591 | emitDisjunction(disjunction: term.parentheses.disjunction, inputCountAlreadyChecked: currentCountAlreadyChecked, parenthesesInputCountAlreadyChecked: 0); |
1592 | atomParenthesesEnd(doInline: true, lastSubpatternId: term.parentheses.lastSubpatternId, inputPosition: 0, frameLocation: term.frameLocation, quantityCount: term.quantityCount, quantityType: term.quantityType); |
1593 | break; |
1594 | } |
1595 | } |
1596 | } |
1597 | } |
1598 | } |
1599 | |
1600 | private: |
1601 | RegexPattern& m_pattern; |
1602 | ByteDisjunction* m_bodyDisjunction; |
1603 | unsigned m_currentAlternativeIndex; |
1604 | Vector<ParenthesesStackEntry> m_parenthesesStack; |
1605 | Vector<ByteDisjunction*> m_allParenthesesInfo; |
1606 | }; |
1607 | |
1608 | |
1609 | BytecodePattern* byteCompileRegex(const UString& patternString, unsigned& numSubpatterns, const char*& error, bool ignoreCase, bool multiline) |
1610 | { |
1611 | RegexPattern pattern(ignoreCase, multiline); |
1612 | |
1613 | if ((error = compileRegex(patternString, pattern))) |
1614 | return 0; |
1615 | |
1616 | numSubpatterns = pattern.m_numSubpatterns; |
1617 | |
1618 | return ByteCompiler(pattern).compile(); |
1619 | } |
1620 | |
1621 | int interpretRegex(BytecodePattern* regex, const UChar* input, unsigned start, unsigned length, int* output) |
1622 | { |
1623 | return Interpreter(regex, output, input, start, length).interpret(); |
1624 | } |
1625 | |
1626 | |
1627 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoPatternCharacter) == (RegexStackSpaceForBackTrackInfoPatternCharacter * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoPatternCharacter); |
1628 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoCharacterClass) == (RegexStackSpaceForBackTrackInfoCharacterClass * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoCharacterClass); |
1629 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoBackReference) == (RegexStackSpaceForBackTrackInfoBackReference * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoBackReference); |
1630 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoAlternative) == (RegexStackSpaceForBackTrackInfoAlternative * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoAlternative); |
1631 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheticalAssertion) == (RegexStackSpaceForBackTrackInfoParentheticalAssertion * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheticalAssertion); |
1632 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParenthesesOnce) == (RegexStackSpaceForBackTrackInfoParenthesesOnce * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParenthesesOnce); |
1633 | COMPILE_ASSERT(sizeof(Interpreter::BackTrackInfoParentheses) == (RegexStackSpaceForBackTrackInfoParentheses * sizeof(uintptr_t)), CheckRegexStackSpaceForBackTrackInfoParentheses); |
1634 | |
1635 | |
1636 | } } |
1637 | |
1638 | #endif |
1639 | |