1 | /* This is JavaScriptCore's variant of the PCRE library. While this library |
2 | started out as a copy of PCRE, many of the features of PCRE have been |
3 | removed. This library now supports only the regular expression features |
4 | required by the JavaScript language specification, and has only the functions |
5 | needed by JavaScriptCore and the rest of WebKit. |
6 | |
7 | Originally written by Philip Hazel |
8 | Copyright (c) 1997-2006 University of Cambridge |
9 | Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. |
10 | Copyright (C) 2007 Eric Seidel <eric@webkit.org> |
11 | |
12 | ----------------------------------------------------------------------------- |
13 | Redistribution and use in source and binary forms, with or without |
14 | modification, are permitted provided that the following conditions are met: |
15 | |
16 | * Redistributions of source code must retain the above copyright notice, |
17 | this list of conditions and the following disclaimer. |
18 | |
19 | * Redistributions in binary form must reproduce the above copyright |
20 | notice, this list of conditions and the following disclaimer in the |
21 | documentation and/or other materials provided with the distribution. |
22 | |
23 | * Neither the name of the University of Cambridge nor the names of its |
24 | contributors may be used to endorse or promote products derived from |
25 | this software without specific prior written permission. |
26 | |
27 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
28 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
29 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
30 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
31 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
32 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
33 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
34 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
35 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
36 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
37 | POSSIBILITY OF SUCH DAMAGE. |
38 | ----------------------------------------------------------------------------- |
39 | */ |
40 | |
41 | /* This module contains jsRegExpExecute(), the externally visible function |
42 | that does pattern matching using an NFA algorithm, following the rules from |
43 | the JavaScript specification. There are also some supporting functions. */ |
44 | |
45 | #include "config.h" |
46 | #include "pcre_internal.h" |
47 | |
48 | #include <limits.h> |
49 | #include <wtf/ASCIICType.h> |
50 | #include <wtf/Vector.h> |
51 | |
52 | #if REGEXP_HISTOGRAM |
53 | #include <wtf/DateMath.h> |
54 | #include <runtime/UString.h> |
55 | #endif |
56 | |
57 | using namespace WTF; |
58 | |
59 | #if COMPILER(GCC) |
60 | #define USE_COMPUTED_GOTO_FOR_MATCH_RECURSION |
61 | //#define USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP |
62 | #endif |
63 | |
64 | /* Avoid warnings on Windows. */ |
65 | #undef min |
66 | #undef max |
67 | |
68 | #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION |
69 | typedef int ReturnLocation; |
70 | #else |
71 | typedef void* ReturnLocation; |
72 | #endif |
73 | |
74 | #if !REGEXP_HISTOGRAM |
75 | |
76 | class HistogramTimeLogger { |
77 | public: |
78 | HistogramTimeLogger(const JSRegExp*) { } |
79 | }; |
80 | |
81 | #else |
82 | |
83 | using namespace JSC; |
84 | |
85 | class Histogram { |
86 | public: |
87 | ~Histogram(); |
88 | void add(const JSRegExp*, double); |
89 | |
90 | private: |
91 | typedef HashMap<RefPtr<UString::Rep>, double> Map; |
92 | Map times; |
93 | }; |
94 | |
95 | class HistogramTimeLogger { |
96 | public: |
97 | HistogramTimeLogger(const JSRegExp*); |
98 | ~HistogramTimeLogger(); |
99 | |
100 | private: |
101 | const JSRegExp* m_re; |
102 | double m_startTime; |
103 | }; |
104 | |
105 | #endif |
106 | |
107 | /* Structure for building a chain of data for holding the values of |
108 | the subject pointer at the start of each bracket, used to detect when |
109 | an empty string has been matched by a bracket to break infinite loops. */ |
110 | struct BracketChainNode { |
111 | BracketChainNode* previousBracket; |
112 | const UChar* bracketStart; |
113 | }; |
114 | |
115 | struct MatchFrame : FastAllocBase { |
116 | ReturnLocation returnLocation; |
117 | struct MatchFrame* previousFrame; |
118 | |
119 | /* Function arguments that may change */ |
120 | struct { |
121 | const UChar* subjectPtr; |
122 | const unsigned char* instructionPtr; |
123 | int offsetTop; |
124 | BracketChainNode* bracketChain; |
125 | } args; |
126 | |
127 | |
128 | /* PCRE uses "fake" recursion built off of gotos, thus |
129 | stack-based local variables are not safe to use. Instead we have to |
130 | store local variables on the current MatchFrame. */ |
131 | struct { |
132 | const unsigned char* data; |
133 | const unsigned char* startOfRepeatingBracket; |
134 | const UChar* subjectPtrAtStartOfInstruction; // Several instrutions stash away a subjectPtr here for later compare |
135 | const unsigned char* instructionPtrAtStartOfOnce; |
136 | |
137 | int repeatOthercase; |
138 | |
139 | int ctype; |
140 | int fc; |
141 | int fi; |
142 | int length; |
143 | int max; |
144 | int number; |
145 | int offset; |
146 | int saveOffset1; |
147 | int saveOffset2; |
148 | int saveOffset3; |
149 | |
150 | BracketChainNode bracketChainNode; |
151 | } locals; |
152 | }; |
153 | |
154 | /* Structure for passing "static" information around between the functions |
155 | doing traditional NFA matching, so that they are thread-safe. */ |
156 | |
157 | struct MatchData { |
158 | int* offsetVector; /* Offset vector */ |
159 | int offsetEnd; /* One past the end */ |
160 | int offsetMax; /* The maximum usable for return data */ |
161 | bool offsetOverflow; /* Set if too many extractions */ |
162 | const UChar* startSubject; /* Start of the subject string */ |
163 | const UChar* endSubject; /* End of the subject string */ |
164 | const UChar* endMatchPtr; /* Subject position at end match */ |
165 | int endOffsetTop; /* Highwater mark at end of match */ |
166 | bool multiline; |
167 | bool ignoreCase; |
168 | }; |
169 | |
170 | /* The maximum remaining length of subject we are prepared to search for a |
171 | reqByte match. */ |
172 | |
173 | #define REQ_BYTE_MAX 1000 |
174 | |
175 | /* The below limit restricts the number of "recursive" match calls in order to |
176 | avoid spending exponential time on complex regular expressions. */ |
177 | |
178 | static const unsigned matchLimit = 1000000; |
179 | |
180 | #ifdef DEBUG |
181 | /************************************************* |
182 | * Debugging function to print chars * |
183 | *************************************************/ |
184 | |
185 | /* Print a sequence of chars in printable format, stopping at the end of the |
186 | subject if the requested. |
187 | |
188 | Arguments: |
189 | p points to characters |
190 | length number to print |
191 | isSubject true if printing from within md.startSubject |
192 | md pointer to matching data block, if isSubject is true |
193 | */ |
194 | |
195 | static void pchars(const UChar* p, int length, bool isSubject, const MatchData& md) |
196 | { |
197 | if (isSubject && length > md.endSubject - p) |
198 | length = md.endSubject - p; |
199 | while (length-- > 0) { |
200 | int c; |
201 | if (isprint(c = *(p++))) |
202 | printf("%c" , c); |
203 | else if (c < 256) |
204 | printf("\\x%02x" , c); |
205 | else |
206 | printf("\\x{%x}" , c); |
207 | } |
208 | } |
209 | #endif |
210 | |
211 | /************************************************* |
212 | * Match a back-reference * |
213 | *************************************************/ |
214 | |
215 | /* If a back reference hasn't been set, the length that is passed is greater |
216 | than the number of characters left in the string, so the match fails. |
217 | |
218 | Arguments: |
219 | offset index into the offset vector |
220 | subjectPtr points into the subject |
221 | length length to be matched |
222 | md points to match data block |
223 | |
224 | Returns: true if matched |
225 | */ |
226 | |
227 | static bool matchRef(int offset, const UChar* subjectPtr, int length, const MatchData& md) |
228 | { |
229 | const UChar* p = md.startSubject + md.offsetVector[offset]; |
230 | |
231 | #ifdef DEBUG |
232 | if (subjectPtr >= md.endSubject) |
233 | printf("matching subject <null>" ); |
234 | else { |
235 | printf("matching subject " ); |
236 | pchars(subjectPtr, length, true, md); |
237 | } |
238 | printf(" against backref " ); |
239 | pchars(p, length, false, md); |
240 | printf("\n" ); |
241 | #endif |
242 | |
243 | /* Always fail if not enough characters left */ |
244 | |
245 | if (length > md.endSubject - subjectPtr) |
246 | return false; |
247 | |
248 | /* Separate the caselesss case for speed */ |
249 | |
250 | if (md.ignoreCase) { |
251 | while (length-- > 0) { |
252 | UChar c = *p++; |
253 | int othercase = jsc_pcre_ucp_othercase(c); |
254 | UChar d = *subjectPtr++; |
255 | if (c != d && othercase != d) |
256 | return false; |
257 | } |
258 | } |
259 | else { |
260 | while (length-- > 0) |
261 | if (*p++ != *subjectPtr++) |
262 | return false; |
263 | } |
264 | |
265 | return true; |
266 | } |
267 | |
268 | #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION |
269 | |
270 | /* Use numbered labels and switch statement at the bottom of the match function. */ |
271 | |
272 | #define RMATCH_WHERE(num) num |
273 | #define RRETURN_LABEL RRETURN_SWITCH |
274 | |
275 | #else |
276 | |
277 | /* Use GCC's computed goto extension. */ |
278 | |
279 | /* For one test case this is more than 40% faster than the switch statement. |
280 | We could avoid the use of the num argument entirely by using local labels, |
281 | but using it for the GCC case as well as the non-GCC case allows us to share |
282 | a bit more code and notice if we use conflicting numbers.*/ |
283 | |
284 | #define RMATCH_WHERE(num) &&RRETURN_##num |
285 | #define RRETURN_LABEL *stack.currentFrame->returnLocation |
286 | |
287 | #endif |
288 | |
289 | #define RECURSIVE_MATCH_COMMON(num) \ |
290 | goto RECURSE;\ |
291 | RRETURN_##num: \ |
292 | stack.popCurrentFrame(); |
293 | |
294 | #define RECURSIVE_MATCH(num, ra, rb) \ |
295 | do { \ |
296 | stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \ |
297 | RECURSIVE_MATCH_COMMON(num) \ |
298 | } while (0) |
299 | |
300 | #define RECURSIVE_MATCH_NEW_GROUP(num, ra, rb) \ |
301 | do { \ |
302 | stack.pushNewFrame((ra), (rb), RMATCH_WHERE(num)); \ |
303 | startNewGroup(stack.currentFrame); \ |
304 | RECURSIVE_MATCH_COMMON(num) \ |
305 | } while (0) |
306 | |
307 | #define RRETURN goto RRETURN_LABEL |
308 | |
309 | #define RRETURN_NO_MATCH do { isMatch = false; RRETURN; } while (0) |
310 | |
311 | /************************************************* |
312 | * Match from current position * |
313 | *************************************************/ |
314 | |
315 | /* On entry instructionPtr points to the first opcode, and subjectPtr to the first character |
316 | in the subject string, while substringStart holds the value of subjectPtr at the start of the |
317 | last bracketed group - used for breaking infinite loops matching zero-length |
318 | strings. This function is called recursively in many circumstances. Whenever it |
319 | returns a negative (error) response, the outer match() call must also return the |
320 | same response. |
321 | |
322 | Arguments: |
323 | subjectPtr pointer in subject |
324 | instructionPtr position in code |
325 | offsetTop current top pointer |
326 | md pointer to "static" info for the match |
327 | |
328 | Returns: 1 if matched ) these values are >= 0 |
329 | 0 if failed to match ) |
330 | a negative error value if aborted by an error condition |
331 | (e.g. stopped by repeated call or recursion limit) |
332 | */ |
333 | |
334 | static const unsigned numFramesOnStack = 16; |
335 | |
336 | struct MatchStack { |
337 | MatchStack() |
338 | : framesEnd(frames + numFramesOnStack) |
339 | , currentFrame(frames) |
340 | , size(1) // match() creates accesses the first frame w/o calling pushNewFrame |
341 | { |
342 | ASSERT((sizeof(frames) / sizeof(frames[0])) == numFramesOnStack); |
343 | } |
344 | |
345 | MatchFrame frames[numFramesOnStack]; |
346 | MatchFrame* framesEnd; |
347 | MatchFrame* currentFrame; |
348 | unsigned size; |
349 | |
350 | inline bool canUseStackBufferForNextFrame() |
351 | { |
352 | return size < numFramesOnStack; |
353 | } |
354 | |
355 | inline MatchFrame* allocateNextFrame() |
356 | { |
357 | if (canUseStackBufferForNextFrame()) |
358 | return currentFrame + 1; |
359 | return new MatchFrame; |
360 | } |
361 | |
362 | inline void pushNewFrame(const unsigned char* instructionPtr, BracketChainNode* bracketChain, ReturnLocation returnLocation) |
363 | { |
364 | MatchFrame* newframe = allocateNextFrame(); |
365 | newframe->previousFrame = currentFrame; |
366 | |
367 | newframe->args.subjectPtr = currentFrame->args.subjectPtr; |
368 | newframe->args.offsetTop = currentFrame->args.offsetTop; |
369 | newframe->args.instructionPtr = instructionPtr; |
370 | newframe->args.bracketChain = bracketChain; |
371 | newframe->returnLocation = returnLocation; |
372 | size++; |
373 | |
374 | currentFrame = newframe; |
375 | } |
376 | |
377 | inline void popCurrentFrame() |
378 | { |
379 | MatchFrame* oldFrame = currentFrame; |
380 | currentFrame = currentFrame->previousFrame; |
381 | if (size > numFramesOnStack) |
382 | delete oldFrame; |
383 | size--; |
384 | } |
385 | |
386 | void popAllFrames() |
387 | { |
388 | while (size) |
389 | popCurrentFrame(); |
390 | } |
391 | }; |
392 | |
393 | static int matchError(int errorCode, MatchStack& stack) |
394 | { |
395 | stack.popAllFrames(); |
396 | return errorCode; |
397 | } |
398 | |
399 | /* Get the next UTF-8 character, not advancing the pointer, incrementing length |
400 | if there are extra bytes. This is called when we know we are in UTF-8 mode. */ |
401 | |
402 | static inline void getUTF8CharAndIncrementLength(int& c, const unsigned char* subjectPtr, int& len) |
403 | { |
404 | c = *subjectPtr; |
405 | if ((c & 0xc0) == 0xc0) { |
406 | int gcaa = jsc_pcre_utf8_table4[c & 0x3f]; /* Number of additional bytes */ |
407 | int gcss = 6 * gcaa; |
408 | c = (c & jsc_pcre_utf8_table3[gcaa]) << gcss; |
409 | for (int gcii = 1; gcii <= gcaa; gcii++) { |
410 | gcss -= 6; |
411 | c |= (subjectPtr[gcii] & 0x3f) << gcss; |
412 | } |
413 | len += gcaa; |
414 | } |
415 | } |
416 | |
417 | static inline void startNewGroup(MatchFrame* currentFrame) |
418 | { |
419 | /* At the start of a bracketed group, add the current subject pointer to the |
420 | stack of such pointers, to be re-instated at the end of the group when we hit |
421 | the closing ket. When match() is called in other circumstances, we don't add to |
422 | this stack. */ |
423 | |
424 | currentFrame->locals.bracketChainNode.previousBracket = currentFrame->args.bracketChain; |
425 | currentFrame->locals.bracketChainNode.bracketStart = currentFrame->args.subjectPtr; |
426 | currentFrame->args.bracketChain = ¤tFrame->locals.bracketChainNode; |
427 | } |
428 | |
429 | // FIXME: "minimize" means "not greedy", we should invert the callers to ask for "greedy" to be less confusing |
430 | static inline void repeatInformationFromInstructionOffset(short instructionOffset, bool& minimize, int& minimumRepeats, int& maximumRepeats) |
431 | { |
432 | // Instruction offsets are based off of OP_CRSTAR, OP_STAR, OP_TYPESTAR, OP_NOTSTAR |
433 | static const char minimumRepeatsFromInstructionOffset[] = { 0, 0, 1, 1, 0, 0 }; |
434 | static const int maximumRepeatsFromInstructionOffset[] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, 1, 1 }; |
435 | |
436 | ASSERT(instructionOffset >= 0); |
437 | ASSERT(instructionOffset <= (OP_CRMINQUERY - OP_CRSTAR)); |
438 | |
439 | minimize = (instructionOffset & 1); // this assumes ordering: Instruction, MinimizeInstruction, Instruction2, MinimizeInstruction2 |
440 | minimumRepeats = minimumRepeatsFromInstructionOffset[instructionOffset]; |
441 | maximumRepeats = maximumRepeatsFromInstructionOffset[instructionOffset]; |
442 | } |
443 | |
444 | static int match(const UChar* subjectPtr, const unsigned char* instructionPtr, int offsetTop, MatchData& md) |
445 | { |
446 | bool isMatch = false; |
447 | int min; |
448 | bool minimize = false; /* Initialization not really needed, but some compilers think so. */ |
449 | unsigned remainingMatchCount = matchLimit; |
450 | int othercase; /* Declare here to avoid errors during jumps */ |
451 | |
452 | MatchStack stack; |
453 | |
454 | /* The opcode jump table. */ |
455 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP |
456 | #define EMIT_JUMP_TABLE_ENTRY(opcode) &&LABEL_OP_##opcode, |
457 | static void* opcodeJumpTable[256] = { FOR_EACH_OPCODE(EMIT_JUMP_TABLE_ENTRY) }; |
458 | #undef EMIT_JUMP_TABLE_ENTRY |
459 | #endif |
460 | |
461 | /* One-time setup of the opcode jump table. */ |
462 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP |
463 | for (int i = 255; !opcodeJumpTable[i]; i--) |
464 | opcodeJumpTable[i] = &&CAPTURING_BRACKET; |
465 | #endif |
466 | |
467 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION |
468 | // Shark shows this as a hot line |
469 | // Using a static const here makes this line disappear, but makes later access hotter (not sure why) |
470 | stack.currentFrame->returnLocation = &&RETURN; |
471 | #else |
472 | stack.currentFrame->returnLocation = 0; |
473 | #endif |
474 | stack.currentFrame->args.subjectPtr = subjectPtr; |
475 | stack.currentFrame->args.instructionPtr = instructionPtr; |
476 | stack.currentFrame->args.offsetTop = offsetTop; |
477 | stack.currentFrame->args.bracketChain = 0; |
478 | startNewGroup(currentFrame: stack.currentFrame); |
479 | |
480 | /* This is where control jumps back to to effect "recursion" */ |
481 | |
482 | RECURSE: |
483 | if (!--remainingMatchCount) |
484 | return matchError(errorCode: JSRegExpErrorHitLimit, stack); |
485 | |
486 | /* Now start processing the operations. */ |
487 | |
488 | #ifndef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP |
489 | while (true) |
490 | #endif |
491 | { |
492 | |
493 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP |
494 | #define BEGIN_OPCODE(opcode) LABEL_OP_##opcode |
495 | #define NEXT_OPCODE goto *opcodeJumpTable[*stack.currentFrame->args.instructionPtr] |
496 | #else |
497 | #define BEGIN_OPCODE(opcode) case OP_##opcode |
498 | #define NEXT_OPCODE continue |
499 | #endif |
500 | |
501 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP |
502 | NEXT_OPCODE; |
503 | #else |
504 | switch (*stack.currentFrame->args.instructionPtr) |
505 | #endif |
506 | { |
507 | /* Non-capturing bracket: optimized */ |
508 | |
509 | BEGIN_OPCODE(BRA): |
510 | NON_CAPTURING_BRACKET: |
511 | DPRINTF(("start bracket 0\n" )); |
512 | do { |
513 | RECURSIVE_MATCH_NEW_GROUP(2, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); |
514 | if (isMatch) |
515 | RRETURN; |
516 | stack.currentFrame->args.instructionPtr += getLinkValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); |
517 | } while (*stack.currentFrame->args.instructionPtr == OP_ALT); |
518 | DPRINTF(("bracket 0 failed\n" )); |
519 | RRETURN; |
520 | |
521 | /* Skip over large extraction number data if encountered. */ |
522 | |
523 | BEGIN_OPCODE(BRANUMBER): |
524 | stack.currentFrame->args.instructionPtr += 3; |
525 | NEXT_OPCODE; |
526 | |
527 | /* End of the pattern. */ |
528 | |
529 | BEGIN_OPCODE(END): |
530 | md.endMatchPtr = stack.currentFrame->args.subjectPtr; /* Record where we ended */ |
531 | md.endOffsetTop = stack.currentFrame->args.offsetTop; /* and how many extracts were taken */ |
532 | isMatch = true; |
533 | RRETURN; |
534 | |
535 | /* Assertion brackets. Check the alternative branches in turn - the |
536 | matching won't pass the KET for an assertion. If any one branch matches, |
537 | the assertion is true. Lookbehind assertions have an OP_REVERSE item at the |
538 | start of each branch to move the current point backwards, so the code at |
539 | this level is identical to the lookahead case. */ |
540 | |
541 | BEGIN_OPCODE(ASSERT): |
542 | do { |
543 | RECURSIVE_MATCH_NEW_GROUP(6, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL); |
544 | if (isMatch) |
545 | break; |
546 | stack.currentFrame->args.instructionPtr += getLinkValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); |
547 | } while (*stack.currentFrame->args.instructionPtr == OP_ALT); |
548 | if (*stack.currentFrame->args.instructionPtr == OP_KET) |
549 | RRETURN_NO_MATCH; |
550 | |
551 | /* Continue from after the assertion, updating the offsets high water |
552 | mark, since extracts may have been taken during the assertion. */ |
553 | |
554 | advanceToEndOfBracket(opcodePtr&: stack.currentFrame->args.instructionPtr); |
555 | stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; |
556 | stack.currentFrame->args.offsetTop = md.endOffsetTop; |
557 | NEXT_OPCODE; |
558 | |
559 | /* Negative assertion: all branches must fail to match */ |
560 | |
561 | BEGIN_OPCODE(ASSERT_NOT): |
562 | do { |
563 | RECURSIVE_MATCH_NEW_GROUP(7, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, NULL); |
564 | if (isMatch) |
565 | RRETURN_NO_MATCH; |
566 | stack.currentFrame->args.instructionPtr += getLinkValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); |
567 | } while (*stack.currentFrame->args.instructionPtr == OP_ALT); |
568 | |
569 | stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; |
570 | NEXT_OPCODE; |
571 | |
572 | /* An alternation is the end of a branch; scan along to find the end of the |
573 | bracketed group and go to there. */ |
574 | |
575 | BEGIN_OPCODE(ALT): |
576 | advanceToEndOfBracket(opcodePtr&: stack.currentFrame->args.instructionPtr); |
577 | NEXT_OPCODE; |
578 | |
579 | /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating |
580 | that it may occur zero times. It may repeat infinitely, or not at all - |
581 | i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper |
582 | repeat limits are compiled as a number of copies, with the optional ones |
583 | preceded by BRAZERO or BRAMINZERO. */ |
584 | |
585 | BEGIN_OPCODE(BRAZERO): { |
586 | stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1; |
587 | RECURSIVE_MATCH_NEW_GROUP(14, stack.currentFrame->locals.startOfRepeatingBracket, stack.currentFrame->args.bracketChain); |
588 | if (isMatch) |
589 | RRETURN; |
590 | advanceToEndOfBracket(opcodePtr&: stack.currentFrame->locals.startOfRepeatingBracket); |
591 | stack.currentFrame->args.instructionPtr = stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE; |
592 | NEXT_OPCODE; |
593 | } |
594 | |
595 | BEGIN_OPCODE(BRAMINZERO): { |
596 | stack.currentFrame->locals.startOfRepeatingBracket = stack.currentFrame->args.instructionPtr + 1; |
597 | advanceToEndOfBracket(opcodePtr&: stack.currentFrame->locals.startOfRepeatingBracket); |
598 | RECURSIVE_MATCH_NEW_GROUP(15, stack.currentFrame->locals.startOfRepeatingBracket + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); |
599 | if (isMatch) |
600 | RRETURN; |
601 | stack.currentFrame->args.instructionPtr++; |
602 | NEXT_OPCODE; |
603 | } |
604 | |
605 | /* End of a group, repeated or non-repeating. If we are at the end of |
606 | an assertion "group", stop matching and return 1, but record the |
607 | current high water mark for use by positive assertions. Do this also |
608 | for the "once" (not-backup up) groups. */ |
609 | |
610 | BEGIN_OPCODE(KET): |
611 | BEGIN_OPCODE(KETRMIN): |
612 | BEGIN_OPCODE(KETRMAX): |
613 | stack.currentFrame->locals.instructionPtrAtStartOfOnce = stack.currentFrame->args.instructionPtr - getLinkValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); |
614 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.bracketChain->bracketStart; |
615 | |
616 | /* Back up the stack of bracket start pointers. */ |
617 | |
618 | stack.currentFrame->args.bracketChain = stack.currentFrame->args.bracketChain->previousBracket; |
619 | |
620 | if (*stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT || *stack.currentFrame->locals.instructionPtrAtStartOfOnce == OP_ASSERT_NOT) { |
621 | md.endOffsetTop = stack.currentFrame->args.offsetTop; |
622 | isMatch = true; |
623 | RRETURN; |
624 | } |
625 | |
626 | /* In all other cases except a conditional group we have to check the |
627 | group number back at the start and if necessary complete handling an |
628 | extraction by setting the offsets and bumping the high water mark. */ |
629 | |
630 | stack.currentFrame->locals.number = *stack.currentFrame->locals.instructionPtrAtStartOfOnce - OP_BRA; |
631 | |
632 | /* For extended extraction brackets (large number), we have to fish out |
633 | the number from a dummy opcode at the start. */ |
634 | |
635 | if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX) |
636 | stack.currentFrame->locals.number = get2ByteValue(opcodePtr: stack.currentFrame->locals.instructionPtrAtStartOfOnce + 2 + LINK_SIZE); |
637 | stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1; |
638 | |
639 | #ifdef DEBUG |
640 | printf("end bracket %d" , stack.currentFrame->locals.number); |
641 | printf("\n" ); |
642 | #endif |
643 | |
644 | /* Test for a numbered group. This includes groups called as a result |
645 | of recursion. Note that whole-pattern recursion is coded as a recurse |
646 | into group 0, so it won't be picked up here. Instead, we catch it when |
647 | the OP_END is reached. */ |
648 | |
649 | if (stack.currentFrame->locals.number > 0) { |
650 | if (stack.currentFrame->locals.offset >= md.offsetMax) |
651 | md.offsetOverflow = true; |
652 | else { |
653 | md.offsetVector[stack.currentFrame->locals.offset] = |
654 | md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number]; |
655 | md.offsetVector[stack.currentFrame->locals.offset+1] = stack.currentFrame->args.subjectPtr - md.startSubject; |
656 | if (stack.currentFrame->args.offsetTop <= stack.currentFrame->locals.offset) |
657 | stack.currentFrame->args.offsetTop = stack.currentFrame->locals.offset + 2; |
658 | } |
659 | } |
660 | |
661 | /* For a non-repeating ket, just continue at this level. This also |
662 | happens for a repeating ket if no characters were matched in the group. |
663 | This is the forcible breaking of infinite loops as implemented in Perl |
664 | 5.005. If there is an options reset, it will get obeyed in the normal |
665 | course of events. */ |
666 | |
667 | if (*stack.currentFrame->args.instructionPtr == OP_KET || stack.currentFrame->args.subjectPtr == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { |
668 | stack.currentFrame->args.instructionPtr += 1 + LINK_SIZE; |
669 | NEXT_OPCODE; |
670 | } |
671 | |
672 | /* The repeating kets try the rest of the pattern or restart from the |
673 | preceding bracket, in the appropriate order. */ |
674 | |
675 | if (*stack.currentFrame->args.instructionPtr == OP_KETRMIN) { |
676 | RECURSIVE_MATCH(16, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); |
677 | if (isMatch) |
678 | RRETURN; |
679 | RECURSIVE_MATCH_NEW_GROUP(17, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain); |
680 | if (isMatch) |
681 | RRETURN; |
682 | } else { /* OP_KETRMAX */ |
683 | RECURSIVE_MATCH_NEW_GROUP(18, stack.currentFrame->locals.instructionPtrAtStartOfOnce, stack.currentFrame->args.bracketChain); |
684 | if (isMatch) |
685 | RRETURN; |
686 | RECURSIVE_MATCH(19, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); |
687 | if (isMatch) |
688 | RRETURN; |
689 | } |
690 | RRETURN; |
691 | |
692 | /* Start of subject. */ |
693 | |
694 | BEGIN_OPCODE(CIRC): |
695 | if (stack.currentFrame->args.subjectPtr != md.startSubject) |
696 | RRETURN_NO_MATCH; |
697 | stack.currentFrame->args.instructionPtr++; |
698 | NEXT_OPCODE; |
699 | |
700 | /* After internal newline if multiline. */ |
701 | |
702 | BEGIN_OPCODE(BOL): |
703 | if (stack.currentFrame->args.subjectPtr != md.startSubject && !isNewline(nl: stack.currentFrame->args.subjectPtr[-1])) |
704 | RRETURN_NO_MATCH; |
705 | stack.currentFrame->args.instructionPtr++; |
706 | NEXT_OPCODE; |
707 | |
708 | /* End of subject. */ |
709 | |
710 | BEGIN_OPCODE(DOLL): |
711 | if (stack.currentFrame->args.subjectPtr < md.endSubject) |
712 | RRETURN_NO_MATCH; |
713 | stack.currentFrame->args.instructionPtr++; |
714 | NEXT_OPCODE; |
715 | |
716 | /* Before internal newline if multiline. */ |
717 | |
718 | BEGIN_OPCODE(EOL): |
719 | if (stack.currentFrame->args.subjectPtr < md.endSubject && !isNewline(nl: *stack.currentFrame->args.subjectPtr)) |
720 | RRETURN_NO_MATCH; |
721 | stack.currentFrame->args.instructionPtr++; |
722 | NEXT_OPCODE; |
723 | |
724 | /* Word boundary assertions */ |
725 | |
726 | BEGIN_OPCODE(NOT_WORD_BOUNDARY): |
727 | BEGIN_OPCODE(WORD_BOUNDARY): { |
728 | bool currentCharIsWordChar = false; |
729 | bool previousCharIsWordChar = false; |
730 | |
731 | if (stack.currentFrame->args.subjectPtr > md.startSubject) |
732 | previousCharIsWordChar = isWordChar(c: stack.currentFrame->args.subjectPtr[-1]); |
733 | if (stack.currentFrame->args.subjectPtr < md.endSubject) |
734 | currentCharIsWordChar = isWordChar(c: *stack.currentFrame->args.subjectPtr); |
735 | |
736 | /* Now see if the situation is what we want */ |
737 | bool wordBoundaryDesired = (*stack.currentFrame->args.instructionPtr++ == OP_WORD_BOUNDARY); |
738 | if (wordBoundaryDesired ? currentCharIsWordChar == previousCharIsWordChar : currentCharIsWordChar != previousCharIsWordChar) |
739 | RRETURN_NO_MATCH; |
740 | NEXT_OPCODE; |
741 | } |
742 | |
743 | /* Match a single character type; inline for speed */ |
744 | |
745 | BEGIN_OPCODE(NOT_NEWLINE): |
746 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
747 | RRETURN_NO_MATCH; |
748 | if (isNewline(nl: *stack.currentFrame->args.subjectPtr++)) |
749 | RRETURN_NO_MATCH; |
750 | stack.currentFrame->args.instructionPtr++; |
751 | NEXT_OPCODE; |
752 | |
753 | BEGIN_OPCODE(NOT_DIGIT): |
754 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
755 | RRETURN_NO_MATCH; |
756 | if (isASCIIDigit(c: *stack.currentFrame->args.subjectPtr++)) |
757 | RRETURN_NO_MATCH; |
758 | stack.currentFrame->args.instructionPtr++; |
759 | NEXT_OPCODE; |
760 | |
761 | BEGIN_OPCODE(DIGIT): |
762 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
763 | RRETURN_NO_MATCH; |
764 | if (!isASCIIDigit(c: *stack.currentFrame->args.subjectPtr++)) |
765 | RRETURN_NO_MATCH; |
766 | stack.currentFrame->args.instructionPtr++; |
767 | NEXT_OPCODE; |
768 | |
769 | BEGIN_OPCODE(NOT_WHITESPACE): |
770 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
771 | RRETURN_NO_MATCH; |
772 | if (isSpaceChar(c: *stack.currentFrame->args.subjectPtr++)) |
773 | RRETURN_NO_MATCH; |
774 | stack.currentFrame->args.instructionPtr++; |
775 | NEXT_OPCODE; |
776 | |
777 | BEGIN_OPCODE(WHITESPACE): |
778 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
779 | RRETURN_NO_MATCH; |
780 | if (!isSpaceChar(c: *stack.currentFrame->args.subjectPtr++)) |
781 | RRETURN_NO_MATCH; |
782 | stack.currentFrame->args.instructionPtr++; |
783 | NEXT_OPCODE; |
784 | |
785 | BEGIN_OPCODE(NOT_WORDCHAR): |
786 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
787 | RRETURN_NO_MATCH; |
788 | if (isWordChar(c: *stack.currentFrame->args.subjectPtr++)) |
789 | RRETURN_NO_MATCH; |
790 | stack.currentFrame->args.instructionPtr++; |
791 | NEXT_OPCODE; |
792 | |
793 | BEGIN_OPCODE(WORDCHAR): |
794 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
795 | RRETURN_NO_MATCH; |
796 | if (!isWordChar(c: *stack.currentFrame->args.subjectPtr++)) |
797 | RRETURN_NO_MATCH; |
798 | stack.currentFrame->args.instructionPtr++; |
799 | NEXT_OPCODE; |
800 | |
801 | /* Match a back reference, possibly repeatedly. Look past the end of the |
802 | item to see if there is repeat information following. The code is similar |
803 | to that for character classes, but repeated for efficiency. Then obey |
804 | similar code to character type repeats - written out again for speed. |
805 | However, if the referenced string is the empty string, always treat |
806 | it as matched, any number of times (otherwise there could be infinite |
807 | loops). */ |
808 | |
809 | BEGIN_OPCODE(REF): |
810 | stack.currentFrame->locals.offset = get2ByteValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1) << 1; /* Doubled ref number */ |
811 | stack.currentFrame->args.instructionPtr += 3; /* Advance past item */ |
812 | |
813 | /* If the reference is unset, set the length to be longer than the amount |
814 | of subject left; this ensures that every attempt at a match fails. We |
815 | can't just fail here, because of the possibility of quantifiers with zero |
816 | minima. */ |
817 | |
818 | if (stack.currentFrame->locals.offset >= stack.currentFrame->args.offsetTop || md.offsetVector[stack.currentFrame->locals.offset] < 0) |
819 | stack.currentFrame->locals.length = 0; |
820 | else |
821 | stack.currentFrame->locals.length = md.offsetVector[stack.currentFrame->locals.offset+1] - md.offsetVector[stack.currentFrame->locals.offset]; |
822 | |
823 | /* Set up for repetition, or handle the non-repeated case */ |
824 | |
825 | switch (*stack.currentFrame->args.instructionPtr) { |
826 | case OP_CRSTAR: |
827 | case OP_CRMINSTAR: |
828 | case OP_CRPLUS: |
829 | case OP_CRMINPLUS: |
830 | case OP_CRQUERY: |
831 | case OP_CRMINQUERY: |
832 | repeatInformationFromInstructionOffset(instructionOffset: *stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, minimumRepeats&: min, maximumRepeats&: stack.currentFrame->locals.max); |
833 | break; |
834 | |
835 | case OP_CRRANGE: |
836 | case OP_CRMINRANGE: |
837 | minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); |
838 | min = get2ByteValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); |
839 | stack.currentFrame->locals.max = get2ByteValue(opcodePtr: stack.currentFrame->args.instructionPtr + 3); |
840 | if (stack.currentFrame->locals.max == 0) |
841 | stack.currentFrame->locals.max = INT_MAX; |
842 | stack.currentFrame->args.instructionPtr += 5; |
843 | break; |
844 | |
845 | default: /* No repeat follows */ |
846 | if (!matchRef(offset: stack.currentFrame->locals.offset, subjectPtr: stack.currentFrame->args.subjectPtr, length: stack.currentFrame->locals.length, md)) |
847 | RRETURN_NO_MATCH; |
848 | stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; |
849 | NEXT_OPCODE; |
850 | } |
851 | |
852 | /* If the length of the reference is zero, just continue with the |
853 | main loop. */ |
854 | |
855 | if (stack.currentFrame->locals.length == 0) |
856 | NEXT_OPCODE; |
857 | |
858 | /* First, ensure the minimum number of matches are present. */ |
859 | |
860 | for (int i = 1; i <= min; i++) { |
861 | if (!matchRef(offset: stack.currentFrame->locals.offset, subjectPtr: stack.currentFrame->args.subjectPtr, length: stack.currentFrame->locals.length, md)) |
862 | RRETURN_NO_MATCH; |
863 | stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; |
864 | } |
865 | |
866 | /* If min = max, continue at the same level without recursion. |
867 | They are not both allowed to be zero. */ |
868 | |
869 | if (min == stack.currentFrame->locals.max) |
870 | NEXT_OPCODE; |
871 | |
872 | /* If minimizing, keep trying and advancing the pointer */ |
873 | |
874 | if (minimize) { |
875 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { |
876 | RECURSIVE_MATCH(20, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
877 | if (isMatch) |
878 | RRETURN; |
879 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || !matchRef(offset: stack.currentFrame->locals.offset, subjectPtr: stack.currentFrame->args.subjectPtr, length: stack.currentFrame->locals.length, md)) |
880 | RRETURN; |
881 | stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; |
882 | } |
883 | /* Control never reaches here */ |
884 | } |
885 | |
886 | /* If maximizing, find the longest string and work backwards */ |
887 | |
888 | else { |
889 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; |
890 | for (int i = min; i < stack.currentFrame->locals.max; i++) { |
891 | if (!matchRef(offset: stack.currentFrame->locals.offset, subjectPtr: stack.currentFrame->args.subjectPtr, length: stack.currentFrame->locals.length, md)) |
892 | break; |
893 | stack.currentFrame->args.subjectPtr += stack.currentFrame->locals.length; |
894 | } |
895 | while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { |
896 | RECURSIVE_MATCH(21, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
897 | if (isMatch) |
898 | RRETURN; |
899 | stack.currentFrame->args.subjectPtr -= stack.currentFrame->locals.length; |
900 | } |
901 | RRETURN_NO_MATCH; |
902 | } |
903 | /* Control never reaches here */ |
904 | |
905 | /* Match a bit-mapped character class, possibly repeatedly. This op code is |
906 | used when all the characters in the class have values in the range 0-255, |
907 | and either the matching is caseful, or the characters are in the range |
908 | 0-127 when UTF-8 processing is enabled. The only difference between |
909 | OP_CLASS and OP_NCLASS occurs when a data character outside the range is |
910 | encountered. |
911 | |
912 | First, look past the end of the item to see if there is repeat information |
913 | following. Then obey similar code to character type repeats - written out |
914 | again for speed. */ |
915 | |
916 | BEGIN_OPCODE(NCLASS): |
917 | BEGIN_OPCODE(CLASS): |
918 | stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1; /* Save for matching */ |
919 | stack.currentFrame->args.instructionPtr += 33; /* Advance past the item */ |
920 | |
921 | switch (*stack.currentFrame->args.instructionPtr) { |
922 | case OP_CRSTAR: |
923 | case OP_CRMINSTAR: |
924 | case OP_CRPLUS: |
925 | case OP_CRMINPLUS: |
926 | case OP_CRQUERY: |
927 | case OP_CRMINQUERY: |
928 | repeatInformationFromInstructionOffset(instructionOffset: *stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, minimumRepeats&: min, maximumRepeats&: stack.currentFrame->locals.max); |
929 | break; |
930 | |
931 | case OP_CRRANGE: |
932 | case OP_CRMINRANGE: |
933 | minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); |
934 | min = get2ByteValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); |
935 | stack.currentFrame->locals.max = get2ByteValue(opcodePtr: stack.currentFrame->args.instructionPtr + 3); |
936 | if (stack.currentFrame->locals.max == 0) |
937 | stack.currentFrame->locals.max = INT_MAX; |
938 | stack.currentFrame->args.instructionPtr += 5; |
939 | break; |
940 | |
941 | default: /* No repeat follows */ |
942 | min = stack.currentFrame->locals.max = 1; |
943 | break; |
944 | } |
945 | |
946 | /* First, ensure the minimum number of matches are present. */ |
947 | |
948 | for (int i = 1; i <= min; i++) { |
949 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
950 | RRETURN_NO_MATCH; |
951 | int c = *stack.currentFrame->args.subjectPtr++; |
952 | if (c > 255) { |
953 | if (stack.currentFrame->locals.data[-1] == OP_CLASS) |
954 | RRETURN_NO_MATCH; |
955 | } else { |
956 | if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7)))) |
957 | RRETURN_NO_MATCH; |
958 | } |
959 | } |
960 | |
961 | /* If max == min we can continue with the main loop without the |
962 | need to recurse. */ |
963 | |
964 | if (min == stack.currentFrame->locals.max) |
965 | NEXT_OPCODE; |
966 | |
967 | /* If minimizing, keep testing the rest of the expression and advancing |
968 | the pointer while it matches the class. */ |
969 | if (minimize) { |
970 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { |
971 | RECURSIVE_MATCH(22, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
972 | if (isMatch) |
973 | RRETURN; |
974 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) |
975 | RRETURN; |
976 | int c = *stack.currentFrame->args.subjectPtr++; |
977 | if (c > 255) { |
978 | if (stack.currentFrame->locals.data[-1] == OP_CLASS) |
979 | RRETURN; |
980 | } else { |
981 | if ((stack.currentFrame->locals.data[c/8] & (1 << (c&7))) == 0) |
982 | RRETURN; |
983 | } |
984 | } |
985 | /* Control never reaches here */ |
986 | } |
987 | /* If maximizing, find the longest possible run, then work backwards. */ |
988 | else { |
989 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; |
990 | |
991 | for (int i = min; i < stack.currentFrame->locals.max; i++) { |
992 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
993 | break; |
994 | int c = *stack.currentFrame->args.subjectPtr; |
995 | if (c > 255) { |
996 | if (stack.currentFrame->locals.data[-1] == OP_CLASS) |
997 | break; |
998 | } else { |
999 | if (!(stack.currentFrame->locals.data[c / 8] & (1 << (c & 7)))) |
1000 | break; |
1001 | } |
1002 | ++stack.currentFrame->args.subjectPtr; |
1003 | } |
1004 | for (;;) { |
1005 | RECURSIVE_MATCH(24, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
1006 | if (isMatch) |
1007 | RRETURN; |
1008 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) |
1009 | break; /* Stop if tried at original pos */ |
1010 | } |
1011 | |
1012 | RRETURN; |
1013 | } |
1014 | /* Control never reaches here */ |
1015 | |
1016 | /* Match an extended character class. */ |
1017 | |
1018 | BEGIN_OPCODE(XCLASS): |
1019 | stack.currentFrame->locals.data = stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE; /* Save for matching */ |
1020 | stack.currentFrame->args.instructionPtr += getLinkValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); /* Advance past the item */ |
1021 | |
1022 | switch (*stack.currentFrame->args.instructionPtr) { |
1023 | case OP_CRSTAR: |
1024 | case OP_CRMINSTAR: |
1025 | case OP_CRPLUS: |
1026 | case OP_CRMINPLUS: |
1027 | case OP_CRQUERY: |
1028 | case OP_CRMINQUERY: |
1029 | repeatInformationFromInstructionOffset(instructionOffset: *stack.currentFrame->args.instructionPtr++ - OP_CRSTAR, minimize, minimumRepeats&: min, maximumRepeats&: stack.currentFrame->locals.max); |
1030 | break; |
1031 | |
1032 | case OP_CRRANGE: |
1033 | case OP_CRMINRANGE: |
1034 | minimize = (*stack.currentFrame->args.instructionPtr == OP_CRMINRANGE); |
1035 | min = get2ByteValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); |
1036 | stack.currentFrame->locals.max = get2ByteValue(opcodePtr: stack.currentFrame->args.instructionPtr + 3); |
1037 | if (stack.currentFrame->locals.max == 0) |
1038 | stack.currentFrame->locals.max = INT_MAX; |
1039 | stack.currentFrame->args.instructionPtr += 5; |
1040 | break; |
1041 | |
1042 | default: /* No repeat follows */ |
1043 | min = stack.currentFrame->locals.max = 1; |
1044 | } |
1045 | |
1046 | /* First, ensure the minimum number of matches are present. */ |
1047 | |
1048 | for (int i = 1; i <= min; i++) { |
1049 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
1050 | RRETURN_NO_MATCH; |
1051 | int c = *stack.currentFrame->args.subjectPtr++; |
1052 | if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) |
1053 | RRETURN_NO_MATCH; |
1054 | } |
1055 | |
1056 | /* If max == min we can continue with the main loop without the |
1057 | need to recurse. */ |
1058 | |
1059 | if (min == stack.currentFrame->locals.max) |
1060 | NEXT_OPCODE; |
1061 | |
1062 | /* If minimizing, keep testing the rest of the expression and advancing |
1063 | the pointer while it matches the class. */ |
1064 | |
1065 | if (minimize) { |
1066 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { |
1067 | RECURSIVE_MATCH(26, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
1068 | if (isMatch) |
1069 | RRETURN; |
1070 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) |
1071 | RRETURN; |
1072 | int c = *stack.currentFrame->args.subjectPtr++; |
1073 | if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) |
1074 | RRETURN; |
1075 | } |
1076 | /* Control never reaches here */ |
1077 | } |
1078 | |
1079 | /* If maximizing, find the longest possible run, then work backwards. */ |
1080 | |
1081 | else { |
1082 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; |
1083 | for (int i = min; i < stack.currentFrame->locals.max; i++) { |
1084 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
1085 | break; |
1086 | int c = *stack.currentFrame->args.subjectPtr; |
1087 | if (!jsc_pcre_xclass(c, stack.currentFrame->locals.data)) |
1088 | break; |
1089 | ++stack.currentFrame->args.subjectPtr; |
1090 | } |
1091 | for(;;) { |
1092 | RECURSIVE_MATCH(27, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
1093 | if (isMatch) |
1094 | RRETURN; |
1095 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) |
1096 | break; /* Stop if tried at original pos */ |
1097 | } |
1098 | RRETURN; |
1099 | } |
1100 | |
1101 | /* Control never reaches here */ |
1102 | |
1103 | /* Match a single character, casefully */ |
1104 | |
1105 | BEGIN_OPCODE(CHAR): |
1106 | stack.currentFrame->locals.length = 1; |
1107 | stack.currentFrame->args.instructionPtr++; |
1108 | getUTF8CharAndIncrementLength(c&: stack.currentFrame->locals.fc, subjectPtr: stack.currentFrame->args.instructionPtr, len&: stack.currentFrame->locals.length); |
1109 | stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; |
1110 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
1111 | RRETURN_NO_MATCH; |
1112 | if (stack.currentFrame->locals.fc != *stack.currentFrame->args.subjectPtr++) |
1113 | RRETURN_NO_MATCH; |
1114 | NEXT_OPCODE; |
1115 | |
1116 | /* Match a single character, caselessly */ |
1117 | |
1118 | BEGIN_OPCODE(CHAR_IGNORING_CASE): { |
1119 | stack.currentFrame->locals.length = 1; |
1120 | stack.currentFrame->args.instructionPtr++; |
1121 | getUTF8CharAndIncrementLength(c&: stack.currentFrame->locals.fc, subjectPtr: stack.currentFrame->args.instructionPtr, len&: stack.currentFrame->locals.length); |
1122 | stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; |
1123 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
1124 | RRETURN_NO_MATCH; |
1125 | int dc = *stack.currentFrame->args.subjectPtr++; |
1126 | if (stack.currentFrame->locals.fc != dc && jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) != dc) |
1127 | RRETURN_NO_MATCH; |
1128 | NEXT_OPCODE; |
1129 | } |
1130 | |
1131 | /* Match a single ASCII character. */ |
1132 | |
1133 | BEGIN_OPCODE(ASCII_CHAR): |
1134 | if (md.endSubject == stack.currentFrame->args.subjectPtr) |
1135 | RRETURN_NO_MATCH; |
1136 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->args.instructionPtr[1]) |
1137 | RRETURN_NO_MATCH; |
1138 | ++stack.currentFrame->args.subjectPtr; |
1139 | stack.currentFrame->args.instructionPtr += 2; |
1140 | NEXT_OPCODE; |
1141 | |
1142 | /* Match one of two cases of an ASCII letter. */ |
1143 | |
1144 | BEGIN_OPCODE(ASCII_LETTER_IGNORING_CASE): |
1145 | if (md.endSubject == stack.currentFrame->args.subjectPtr) |
1146 | RRETURN_NO_MATCH; |
1147 | if ((*stack.currentFrame->args.subjectPtr | 0x20) != stack.currentFrame->args.instructionPtr[1]) |
1148 | RRETURN_NO_MATCH; |
1149 | ++stack.currentFrame->args.subjectPtr; |
1150 | stack.currentFrame->args.instructionPtr += 2; |
1151 | NEXT_OPCODE; |
1152 | |
1153 | /* Match a single character repeatedly; different opcodes share code. */ |
1154 | |
1155 | BEGIN_OPCODE(EXACT): |
1156 | min = stack.currentFrame->locals.max = get2ByteValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); |
1157 | minimize = false; |
1158 | stack.currentFrame->args.instructionPtr += 3; |
1159 | goto REPEATCHAR; |
1160 | |
1161 | BEGIN_OPCODE(UPTO): |
1162 | BEGIN_OPCODE(MINUPTO): |
1163 | min = 0; |
1164 | stack.currentFrame->locals.max = get2ByteValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); |
1165 | minimize = *stack.currentFrame->args.instructionPtr == OP_MINUPTO; |
1166 | stack.currentFrame->args.instructionPtr += 3; |
1167 | goto REPEATCHAR; |
1168 | |
1169 | BEGIN_OPCODE(STAR): |
1170 | BEGIN_OPCODE(MINSTAR): |
1171 | BEGIN_OPCODE(PLUS): |
1172 | BEGIN_OPCODE(MINPLUS): |
1173 | BEGIN_OPCODE(QUERY): |
1174 | BEGIN_OPCODE(MINQUERY): |
1175 | repeatInformationFromInstructionOffset(instructionOffset: *stack.currentFrame->args.instructionPtr++ - OP_STAR, minimize, minimumRepeats&: min, maximumRepeats&: stack.currentFrame->locals.max); |
1176 | |
1177 | /* Common code for all repeated single-character matches. We can give |
1178 | up quickly if there are fewer than the minimum number of characters left in |
1179 | the subject. */ |
1180 | |
1181 | REPEATCHAR: |
1182 | |
1183 | stack.currentFrame->locals.length = 1; |
1184 | getUTF8CharAndIncrementLength(c&: stack.currentFrame->locals.fc, subjectPtr: stack.currentFrame->args.instructionPtr, len&: stack.currentFrame->locals.length); |
1185 | if (min * (stack.currentFrame->locals.fc > 0xFFFF ? 2 : 1) > md.endSubject - stack.currentFrame->args.subjectPtr) |
1186 | RRETURN_NO_MATCH; |
1187 | stack.currentFrame->args.instructionPtr += stack.currentFrame->locals.length; |
1188 | |
1189 | if (stack.currentFrame->locals.fc <= 0xFFFF) { |
1190 | othercase = md.ignoreCase ? jsc_pcre_ucp_othercase(stack.currentFrame->locals.fc) : -1; |
1191 | |
1192 | for (int i = 1; i <= min; i++) { |
1193 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase) |
1194 | RRETURN_NO_MATCH; |
1195 | ++stack.currentFrame->args.subjectPtr; |
1196 | } |
1197 | |
1198 | if (min == stack.currentFrame->locals.max) |
1199 | NEXT_OPCODE; |
1200 | |
1201 | if (minimize) { |
1202 | stack.currentFrame->locals.repeatOthercase = othercase; |
1203 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { |
1204 | RECURSIVE_MATCH(28, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
1205 | if (isMatch) |
1206 | RRETURN; |
1207 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) |
1208 | RRETURN; |
1209 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.repeatOthercase) |
1210 | RRETURN; |
1211 | ++stack.currentFrame->args.subjectPtr; |
1212 | } |
1213 | /* Control never reaches here */ |
1214 | } else { |
1215 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; |
1216 | for (int i = min; i < stack.currentFrame->locals.max; i++) { |
1217 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
1218 | break; |
1219 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc && *stack.currentFrame->args.subjectPtr != othercase) |
1220 | break; |
1221 | ++stack.currentFrame->args.subjectPtr; |
1222 | } |
1223 | while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { |
1224 | RECURSIVE_MATCH(29, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
1225 | if (isMatch) |
1226 | RRETURN; |
1227 | --stack.currentFrame->args.subjectPtr; |
1228 | } |
1229 | RRETURN_NO_MATCH; |
1230 | } |
1231 | /* Control never reaches here */ |
1232 | } else { |
1233 | /* No case on surrogate pairs, so no need to bother with "othercase". */ |
1234 | |
1235 | for (int i = 1; i <= min; i++) { |
1236 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) |
1237 | RRETURN_NO_MATCH; |
1238 | stack.currentFrame->args.subjectPtr += 2; |
1239 | } |
1240 | |
1241 | if (min == stack.currentFrame->locals.max) |
1242 | NEXT_OPCODE; |
1243 | |
1244 | if (minimize) { |
1245 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { |
1246 | RECURSIVE_MATCH(30, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
1247 | if (isMatch) |
1248 | RRETURN; |
1249 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) |
1250 | RRETURN; |
1251 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) |
1252 | RRETURN; |
1253 | stack.currentFrame->args.subjectPtr += 2; |
1254 | } |
1255 | /* Control never reaches here */ |
1256 | } else { |
1257 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; |
1258 | for (int i = min; i < stack.currentFrame->locals.max; i++) { |
1259 | if (stack.currentFrame->args.subjectPtr > md.endSubject - 2) |
1260 | break; |
1261 | if (*stack.currentFrame->args.subjectPtr != stack.currentFrame->locals.fc) |
1262 | break; |
1263 | stack.currentFrame->args.subjectPtr += 2; |
1264 | } |
1265 | while (stack.currentFrame->args.subjectPtr >= stack.currentFrame->locals.subjectPtrAtStartOfInstruction) { |
1266 | RECURSIVE_MATCH(31, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
1267 | if (isMatch) |
1268 | RRETURN; |
1269 | stack.currentFrame->args.subjectPtr -= 2; |
1270 | } |
1271 | RRETURN_NO_MATCH; |
1272 | } |
1273 | /* Control never reaches here */ |
1274 | } |
1275 | /* Control never reaches here */ |
1276 | |
1277 | /* Match a negated single one-byte character. */ |
1278 | |
1279 | BEGIN_OPCODE(NOT): { |
1280 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
1281 | RRETURN_NO_MATCH; |
1282 | int b = stack.currentFrame->args.instructionPtr[1]; |
1283 | int c = *stack.currentFrame->args.subjectPtr++; |
1284 | stack.currentFrame->args.instructionPtr += 2; |
1285 | if (md.ignoreCase) { |
1286 | if (c < 128) |
1287 | c = toLowerCase(c); |
1288 | if (toLowerCase(c: b) == c) |
1289 | RRETURN_NO_MATCH; |
1290 | } else { |
1291 | if (b == c) |
1292 | RRETURN_NO_MATCH; |
1293 | } |
1294 | NEXT_OPCODE; |
1295 | } |
1296 | |
1297 | /* Match a negated single one-byte character repeatedly. This is almost a |
1298 | repeat of the code for a repeated single character, but I haven't found a |
1299 | nice way of commoning these up that doesn't require a test of the |
1300 | positive/negative option for each character match. Maybe that wouldn't add |
1301 | very much to the time taken, but character matching *is* what this is all |
1302 | about... */ |
1303 | |
1304 | BEGIN_OPCODE(NOTEXACT): |
1305 | min = stack.currentFrame->locals.max = get2ByteValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); |
1306 | minimize = false; |
1307 | stack.currentFrame->args.instructionPtr += 3; |
1308 | goto REPEATNOTCHAR; |
1309 | |
1310 | BEGIN_OPCODE(NOTUPTO): |
1311 | BEGIN_OPCODE(NOTMINUPTO): |
1312 | min = 0; |
1313 | stack.currentFrame->locals.max = get2ByteValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); |
1314 | minimize = *stack.currentFrame->args.instructionPtr == OP_NOTMINUPTO; |
1315 | stack.currentFrame->args.instructionPtr += 3; |
1316 | goto REPEATNOTCHAR; |
1317 | |
1318 | BEGIN_OPCODE(NOTSTAR): |
1319 | BEGIN_OPCODE(NOTMINSTAR): |
1320 | BEGIN_OPCODE(NOTPLUS): |
1321 | BEGIN_OPCODE(NOTMINPLUS): |
1322 | BEGIN_OPCODE(NOTQUERY): |
1323 | BEGIN_OPCODE(NOTMINQUERY): |
1324 | repeatInformationFromInstructionOffset(instructionOffset: *stack.currentFrame->args.instructionPtr++ - OP_NOTSTAR, minimize, minimumRepeats&: min, maximumRepeats&: stack.currentFrame->locals.max); |
1325 | |
1326 | /* Common code for all repeated single-byte matches. We can give up quickly |
1327 | if there are fewer than the minimum number of bytes left in the |
1328 | subject. */ |
1329 | |
1330 | REPEATNOTCHAR: |
1331 | if (min > md.endSubject - stack.currentFrame->args.subjectPtr) |
1332 | RRETURN_NO_MATCH; |
1333 | stack.currentFrame->locals.fc = *stack.currentFrame->args.instructionPtr++; |
1334 | |
1335 | /* The code is duplicated for the caseless and caseful cases, for speed, |
1336 | since matching characters is likely to be quite common. First, ensure the |
1337 | minimum number of matches are present. If min = max, continue at the same |
1338 | level without recursing. Otherwise, if minimizing, keep trying the rest of |
1339 | the expression and advancing one matching character if failing, up to the |
1340 | maximum. Alternatively, if maximizing, find the maximum number of |
1341 | characters and work backwards. */ |
1342 | |
1343 | DPRINTF(("negative matching %c{%d,%d}\n" , stack.currentFrame->locals.fc, min, stack.currentFrame->locals.max)); |
1344 | |
1345 | if (md.ignoreCase) { |
1346 | if (stack.currentFrame->locals.fc < 128) |
1347 | stack.currentFrame->locals.fc = toLowerCase(c: stack.currentFrame->locals.fc); |
1348 | |
1349 | for (int i = 1; i <= min; i++) { |
1350 | int d = *stack.currentFrame->args.subjectPtr++; |
1351 | if (d < 128) |
1352 | d = toLowerCase(c: d); |
1353 | if (stack.currentFrame->locals.fc == d) |
1354 | RRETURN_NO_MATCH; |
1355 | } |
1356 | |
1357 | if (min == stack.currentFrame->locals.max) |
1358 | NEXT_OPCODE; |
1359 | |
1360 | if (minimize) { |
1361 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { |
1362 | RECURSIVE_MATCH(38, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
1363 | if (isMatch) |
1364 | RRETURN; |
1365 | int d = *stack.currentFrame->args.subjectPtr++; |
1366 | if (d < 128) |
1367 | d = toLowerCase(c: d); |
1368 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d) |
1369 | RRETURN; |
1370 | } |
1371 | /* Control never reaches here */ |
1372 | } |
1373 | |
1374 | /* Maximize case */ |
1375 | |
1376 | else { |
1377 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; |
1378 | |
1379 | for (int i = min; i < stack.currentFrame->locals.max; i++) { |
1380 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
1381 | break; |
1382 | int d = *stack.currentFrame->args.subjectPtr; |
1383 | if (d < 128) |
1384 | d = toLowerCase(c: d); |
1385 | if (stack.currentFrame->locals.fc == d) |
1386 | break; |
1387 | ++stack.currentFrame->args.subjectPtr; |
1388 | } |
1389 | for (;;) { |
1390 | RECURSIVE_MATCH(40, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
1391 | if (isMatch) |
1392 | RRETURN; |
1393 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) |
1394 | break; /* Stop if tried at original pos */ |
1395 | } |
1396 | |
1397 | RRETURN; |
1398 | } |
1399 | /* Control never reaches here */ |
1400 | } |
1401 | |
1402 | /* Caseful comparisons */ |
1403 | |
1404 | else { |
1405 | for (int i = 1; i <= min; i++) { |
1406 | int d = *stack.currentFrame->args.subjectPtr++; |
1407 | if (stack.currentFrame->locals.fc == d) |
1408 | RRETURN_NO_MATCH; |
1409 | } |
1410 | |
1411 | if (min == stack.currentFrame->locals.max) |
1412 | NEXT_OPCODE; |
1413 | |
1414 | if (minimize) { |
1415 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { |
1416 | RECURSIVE_MATCH(42, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
1417 | if (isMatch) |
1418 | RRETURN; |
1419 | int d = *stack.currentFrame->args.subjectPtr++; |
1420 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject || stack.currentFrame->locals.fc == d) |
1421 | RRETURN; |
1422 | } |
1423 | /* Control never reaches here */ |
1424 | } |
1425 | |
1426 | /* Maximize case */ |
1427 | |
1428 | else { |
1429 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; |
1430 | |
1431 | for (int i = min; i < stack.currentFrame->locals.max; i++) { |
1432 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
1433 | break; |
1434 | int d = *stack.currentFrame->args.subjectPtr; |
1435 | if (stack.currentFrame->locals.fc == d) |
1436 | break; |
1437 | ++stack.currentFrame->args.subjectPtr; |
1438 | } |
1439 | for (;;) { |
1440 | RECURSIVE_MATCH(44, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
1441 | if (isMatch) |
1442 | RRETURN; |
1443 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) |
1444 | break; /* Stop if tried at original pos */ |
1445 | } |
1446 | |
1447 | RRETURN; |
1448 | } |
1449 | } |
1450 | /* Control never reaches here */ |
1451 | |
1452 | /* Match a single character type repeatedly; several different opcodes |
1453 | share code. This is very similar to the code for single characters, but we |
1454 | repeat it in the interests of efficiency. */ |
1455 | |
1456 | BEGIN_OPCODE(TYPEEXACT): |
1457 | min = stack.currentFrame->locals.max = get2ByteValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); |
1458 | minimize = true; |
1459 | stack.currentFrame->args.instructionPtr += 3; |
1460 | goto REPEATTYPE; |
1461 | |
1462 | BEGIN_OPCODE(TYPEUPTO): |
1463 | BEGIN_OPCODE(TYPEMINUPTO): |
1464 | min = 0; |
1465 | stack.currentFrame->locals.max = get2ByteValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); |
1466 | minimize = *stack.currentFrame->args.instructionPtr == OP_TYPEMINUPTO; |
1467 | stack.currentFrame->args.instructionPtr += 3; |
1468 | goto REPEATTYPE; |
1469 | |
1470 | BEGIN_OPCODE(TYPESTAR): |
1471 | BEGIN_OPCODE(TYPEMINSTAR): |
1472 | BEGIN_OPCODE(TYPEPLUS): |
1473 | BEGIN_OPCODE(TYPEMINPLUS): |
1474 | BEGIN_OPCODE(TYPEQUERY): |
1475 | BEGIN_OPCODE(TYPEMINQUERY): |
1476 | repeatInformationFromInstructionOffset(instructionOffset: *stack.currentFrame->args.instructionPtr++ - OP_TYPESTAR, minimize, minimumRepeats&: min, maximumRepeats&: stack.currentFrame->locals.max); |
1477 | |
1478 | /* Common code for all repeated single character type matches. Note that |
1479 | in UTF-8 mode, '.' matches a character of any length, but for the other |
1480 | character types, the valid characters are all one-byte long. */ |
1481 | |
1482 | REPEATTYPE: |
1483 | stack.currentFrame->locals.ctype = *stack.currentFrame->args.instructionPtr++; /* Code for the character type */ |
1484 | |
1485 | /* First, ensure the minimum number of matches are present. Use inline |
1486 | code for maximizing the speed, and do the type test once at the start |
1487 | (i.e. keep it out of the loop). Also we can test that there are at least |
1488 | the minimum number of characters before we start. */ |
1489 | |
1490 | if (min > md.endSubject - stack.currentFrame->args.subjectPtr) |
1491 | RRETURN_NO_MATCH; |
1492 | if (min > 0) { |
1493 | switch (stack.currentFrame->locals.ctype) { |
1494 | case OP_NOT_NEWLINE: |
1495 | for (int i = 1; i <= min; i++) { |
1496 | if (isNewline(nl: *stack.currentFrame->args.subjectPtr)) |
1497 | RRETURN_NO_MATCH; |
1498 | ++stack.currentFrame->args.subjectPtr; |
1499 | } |
1500 | break; |
1501 | |
1502 | case OP_NOT_DIGIT: |
1503 | for (int i = 1; i <= min; i++) { |
1504 | if (isASCIIDigit(c: *stack.currentFrame->args.subjectPtr)) |
1505 | RRETURN_NO_MATCH; |
1506 | ++stack.currentFrame->args.subjectPtr; |
1507 | } |
1508 | break; |
1509 | |
1510 | case OP_DIGIT: |
1511 | for (int i = 1; i <= min; i++) { |
1512 | if (!isASCIIDigit(c: *stack.currentFrame->args.subjectPtr)) |
1513 | RRETURN_NO_MATCH; |
1514 | ++stack.currentFrame->args.subjectPtr; |
1515 | } |
1516 | break; |
1517 | |
1518 | case OP_NOT_WHITESPACE: |
1519 | for (int i = 1; i <= min; i++) { |
1520 | if (isSpaceChar(c: *stack.currentFrame->args.subjectPtr)) |
1521 | RRETURN_NO_MATCH; |
1522 | ++stack.currentFrame->args.subjectPtr; |
1523 | } |
1524 | break; |
1525 | |
1526 | case OP_WHITESPACE: |
1527 | for (int i = 1; i <= min; i++) { |
1528 | if (!isSpaceChar(c: *stack.currentFrame->args.subjectPtr)) |
1529 | RRETURN_NO_MATCH; |
1530 | ++stack.currentFrame->args.subjectPtr; |
1531 | } |
1532 | break; |
1533 | |
1534 | case OP_NOT_WORDCHAR: |
1535 | for (int i = 1; i <= min; i++) { |
1536 | if (isWordChar(c: *stack.currentFrame->args.subjectPtr)) |
1537 | RRETURN_NO_MATCH; |
1538 | ++stack.currentFrame->args.subjectPtr; |
1539 | } |
1540 | break; |
1541 | |
1542 | case OP_WORDCHAR: |
1543 | for (int i = 1; i <= min; i++) { |
1544 | if (!isWordChar(c: *stack.currentFrame->args.subjectPtr)) |
1545 | RRETURN_NO_MATCH; |
1546 | ++stack.currentFrame->args.subjectPtr; |
1547 | } |
1548 | break; |
1549 | |
1550 | default: |
1551 | ASSERT_NOT_REACHED(); |
1552 | return matchError(errorCode: JSRegExpErrorInternal, stack); |
1553 | } /* End switch(stack.currentFrame->locals.ctype) */ |
1554 | } |
1555 | |
1556 | /* If min = max, continue at the same level without recursing */ |
1557 | |
1558 | if (min == stack.currentFrame->locals.max) |
1559 | NEXT_OPCODE; |
1560 | |
1561 | /* If minimizing, we have to test the rest of the pattern before each |
1562 | subsequent match. */ |
1563 | |
1564 | if (minimize) { |
1565 | for (stack.currentFrame->locals.fi = min;; stack.currentFrame->locals.fi++) { |
1566 | RECURSIVE_MATCH(48, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
1567 | if (isMatch) |
1568 | RRETURN; |
1569 | if (stack.currentFrame->locals.fi >= stack.currentFrame->locals.max || stack.currentFrame->args.subjectPtr >= md.endSubject) |
1570 | RRETURN; |
1571 | |
1572 | int c = *stack.currentFrame->args.subjectPtr++; |
1573 | switch (stack.currentFrame->locals.ctype) { |
1574 | case OP_NOT_NEWLINE: |
1575 | if (isNewline(nl: c)) |
1576 | RRETURN; |
1577 | break; |
1578 | |
1579 | case OP_NOT_DIGIT: |
1580 | if (isASCIIDigit(c)) |
1581 | RRETURN; |
1582 | break; |
1583 | |
1584 | case OP_DIGIT: |
1585 | if (!isASCIIDigit(c)) |
1586 | RRETURN; |
1587 | break; |
1588 | |
1589 | case OP_NOT_WHITESPACE: |
1590 | if (isSpaceChar(c)) |
1591 | RRETURN; |
1592 | break; |
1593 | |
1594 | case OP_WHITESPACE: |
1595 | if (!isSpaceChar(c)) |
1596 | RRETURN; |
1597 | break; |
1598 | |
1599 | case OP_NOT_WORDCHAR: |
1600 | if (isWordChar(c)) |
1601 | RRETURN; |
1602 | break; |
1603 | |
1604 | case OP_WORDCHAR: |
1605 | if (!isWordChar(c)) |
1606 | RRETURN; |
1607 | break; |
1608 | |
1609 | default: |
1610 | ASSERT_NOT_REACHED(); |
1611 | return matchError(errorCode: JSRegExpErrorInternal, stack); |
1612 | } |
1613 | } |
1614 | /* Control never reaches here */ |
1615 | } |
1616 | |
1617 | /* If maximizing it is worth using inline code for speed, doing the type |
1618 | test once at the start (i.e. keep it out of the loop). */ |
1619 | |
1620 | else { |
1621 | stack.currentFrame->locals.subjectPtrAtStartOfInstruction = stack.currentFrame->args.subjectPtr; /* Remember where we started */ |
1622 | |
1623 | switch (stack.currentFrame->locals.ctype) { |
1624 | case OP_NOT_NEWLINE: |
1625 | for (int i = min; i < stack.currentFrame->locals.max; i++) { |
1626 | if (stack.currentFrame->args.subjectPtr >= md.endSubject || isNewline(nl: *stack.currentFrame->args.subjectPtr)) |
1627 | break; |
1628 | stack.currentFrame->args.subjectPtr++; |
1629 | } |
1630 | break; |
1631 | |
1632 | case OP_NOT_DIGIT: |
1633 | for (int i = min; i < stack.currentFrame->locals.max; i++) { |
1634 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
1635 | break; |
1636 | int c = *stack.currentFrame->args.subjectPtr; |
1637 | if (isASCIIDigit(c)) |
1638 | break; |
1639 | ++stack.currentFrame->args.subjectPtr; |
1640 | } |
1641 | break; |
1642 | |
1643 | case OP_DIGIT: |
1644 | for (int i = min; i < stack.currentFrame->locals.max; i++) { |
1645 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
1646 | break; |
1647 | int c = *stack.currentFrame->args.subjectPtr; |
1648 | if (!isASCIIDigit(c)) |
1649 | break; |
1650 | ++stack.currentFrame->args.subjectPtr; |
1651 | } |
1652 | break; |
1653 | |
1654 | case OP_NOT_WHITESPACE: |
1655 | for (int i = min; i < stack.currentFrame->locals.max; i++) { |
1656 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
1657 | break; |
1658 | int c = *stack.currentFrame->args.subjectPtr; |
1659 | if (isSpaceChar(c)) |
1660 | break; |
1661 | ++stack.currentFrame->args.subjectPtr; |
1662 | } |
1663 | break; |
1664 | |
1665 | case OP_WHITESPACE: |
1666 | for (int i = min; i < stack.currentFrame->locals.max; i++) { |
1667 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
1668 | break; |
1669 | int c = *stack.currentFrame->args.subjectPtr; |
1670 | if (!isSpaceChar(c)) |
1671 | break; |
1672 | ++stack.currentFrame->args.subjectPtr; |
1673 | } |
1674 | break; |
1675 | |
1676 | case OP_NOT_WORDCHAR: |
1677 | for (int i = min; i < stack.currentFrame->locals.max; i++) { |
1678 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
1679 | break; |
1680 | int c = *stack.currentFrame->args.subjectPtr; |
1681 | if (isWordChar(c)) |
1682 | break; |
1683 | ++stack.currentFrame->args.subjectPtr; |
1684 | } |
1685 | break; |
1686 | |
1687 | case OP_WORDCHAR: |
1688 | for (int i = min; i < stack.currentFrame->locals.max; i++) { |
1689 | if (stack.currentFrame->args.subjectPtr >= md.endSubject) |
1690 | break; |
1691 | int c = *stack.currentFrame->args.subjectPtr; |
1692 | if (!isWordChar(c)) |
1693 | break; |
1694 | ++stack.currentFrame->args.subjectPtr; |
1695 | } |
1696 | break; |
1697 | |
1698 | default: |
1699 | ASSERT_NOT_REACHED(); |
1700 | return matchError(errorCode: JSRegExpErrorInternal, stack); |
1701 | } |
1702 | |
1703 | /* stack.currentFrame->args.subjectPtr is now past the end of the maximum run */ |
1704 | |
1705 | for (;;) { |
1706 | RECURSIVE_MATCH(52, stack.currentFrame->args.instructionPtr, stack.currentFrame->args.bracketChain); |
1707 | if (isMatch) |
1708 | RRETURN; |
1709 | if (stack.currentFrame->args.subjectPtr-- == stack.currentFrame->locals.subjectPtrAtStartOfInstruction) |
1710 | break; /* Stop if tried at original pos */ |
1711 | } |
1712 | |
1713 | /* Get here if we can't make it match with any permitted repetitions */ |
1714 | |
1715 | RRETURN; |
1716 | } |
1717 | /* Control never reaches here */ |
1718 | |
1719 | BEGIN_OPCODE(CRMINPLUS): |
1720 | BEGIN_OPCODE(CRMINQUERY): |
1721 | BEGIN_OPCODE(CRMINRANGE): |
1722 | BEGIN_OPCODE(CRMINSTAR): |
1723 | BEGIN_OPCODE(CRPLUS): |
1724 | BEGIN_OPCODE(CRQUERY): |
1725 | BEGIN_OPCODE(CRRANGE): |
1726 | BEGIN_OPCODE(CRSTAR): |
1727 | ASSERT_NOT_REACHED(); |
1728 | return matchError(errorCode: JSRegExpErrorInternal, stack); |
1729 | |
1730 | #ifdef USE_COMPUTED_GOTO_FOR_MATCH_OPCODE_LOOP |
1731 | CAPTURING_BRACKET: |
1732 | #else |
1733 | default: |
1734 | #endif |
1735 | /* Opening capturing bracket. If there is space in the offset vector, save |
1736 | the current subject position in the working slot at the top of the vector. We |
1737 | mustn't change the current values of the data slot, because they may be set |
1738 | from a previous iteration of this group, and be referred to by a reference |
1739 | inside the group. |
1740 | |
1741 | If the bracket fails to match, we need to restore this value and also the |
1742 | values of the final offsets, in case they were set by a previous iteration of |
1743 | the same bracket. |
1744 | |
1745 | If there isn't enough space in the offset vector, treat this as if it were a |
1746 | non-capturing bracket. Don't worry about setting the flag for the error case |
1747 | here; that is handled in the code for KET. */ |
1748 | |
1749 | ASSERT(*stack.currentFrame->args.instructionPtr > OP_BRA); |
1750 | |
1751 | stack.currentFrame->locals.number = *stack.currentFrame->args.instructionPtr - OP_BRA; |
1752 | |
1753 | /* For extended extraction brackets (large number), we have to fish out the |
1754 | number from a dummy opcode at the start. */ |
1755 | |
1756 | if (stack.currentFrame->locals.number > EXTRACT_BASIC_MAX) |
1757 | stack.currentFrame->locals.number = get2ByteValue(opcodePtr: stack.currentFrame->args.instructionPtr + 2 + LINK_SIZE); |
1758 | stack.currentFrame->locals.offset = stack.currentFrame->locals.number << 1; |
1759 | |
1760 | #ifdef DEBUG |
1761 | printf("start bracket %d subject=" , stack.currentFrame->locals.number); |
1762 | pchars(stack.currentFrame->args.subjectPtr, 16, true, md); |
1763 | printf("\n" ); |
1764 | #endif |
1765 | |
1766 | if (stack.currentFrame->locals.offset < md.offsetMax) { |
1767 | stack.currentFrame->locals.saveOffset1 = md.offsetVector[stack.currentFrame->locals.offset]; |
1768 | stack.currentFrame->locals.saveOffset2 = md.offsetVector[stack.currentFrame->locals.offset + 1]; |
1769 | stack.currentFrame->locals.saveOffset3 = md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number]; |
1770 | |
1771 | DPRINTF(("saving %d %d %d\n" , stack.currentFrame->locals.saveOffset1, stack.currentFrame->locals.saveOffset2, stack.currentFrame->locals.saveOffset3)); |
1772 | md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->args.subjectPtr - md.startSubject; |
1773 | |
1774 | do { |
1775 | RECURSIVE_MATCH_NEW_GROUP(1, stack.currentFrame->args.instructionPtr + 1 + LINK_SIZE, stack.currentFrame->args.bracketChain); |
1776 | if (isMatch) |
1777 | RRETURN; |
1778 | stack.currentFrame->args.instructionPtr += getLinkValue(opcodePtr: stack.currentFrame->args.instructionPtr + 1); |
1779 | } while (*stack.currentFrame->args.instructionPtr == OP_ALT); |
1780 | |
1781 | DPRINTF(("bracket %d failed\n" , stack.currentFrame->locals.number)); |
1782 | |
1783 | md.offsetVector[stack.currentFrame->locals.offset] = stack.currentFrame->locals.saveOffset1; |
1784 | md.offsetVector[stack.currentFrame->locals.offset + 1] = stack.currentFrame->locals.saveOffset2; |
1785 | md.offsetVector[md.offsetEnd - stack.currentFrame->locals.number] = stack.currentFrame->locals.saveOffset3; |
1786 | |
1787 | RRETURN; |
1788 | } |
1789 | |
1790 | /* Insufficient room for saving captured contents */ |
1791 | |
1792 | goto NON_CAPTURING_BRACKET; |
1793 | } |
1794 | |
1795 | /* Do not stick any code in here without much thought; it is assumed |
1796 | that "continue" in the code above comes out to here to repeat the main |
1797 | loop. */ |
1798 | |
1799 | } /* End of main loop */ |
1800 | |
1801 | ASSERT_NOT_REACHED(); |
1802 | |
1803 | #ifndef USE_COMPUTED_GOTO_FOR_MATCH_RECURSION |
1804 | |
1805 | RRETURN_SWITCH: |
1806 | switch (stack.currentFrame->returnLocation) { |
1807 | case 0: goto RETURN; |
1808 | case 1: goto RRETURN_1; |
1809 | case 2: goto RRETURN_2; |
1810 | case 6: goto RRETURN_6; |
1811 | case 7: goto RRETURN_7; |
1812 | case 14: goto RRETURN_14; |
1813 | case 15: goto RRETURN_15; |
1814 | case 16: goto RRETURN_16; |
1815 | case 17: goto RRETURN_17; |
1816 | case 18: goto RRETURN_18; |
1817 | case 19: goto RRETURN_19; |
1818 | case 20: goto RRETURN_20; |
1819 | case 21: goto RRETURN_21; |
1820 | case 22: goto RRETURN_22; |
1821 | case 24: goto RRETURN_24; |
1822 | case 26: goto RRETURN_26; |
1823 | case 27: goto RRETURN_27; |
1824 | case 28: goto RRETURN_28; |
1825 | case 29: goto RRETURN_29; |
1826 | case 30: goto RRETURN_30; |
1827 | case 31: goto RRETURN_31; |
1828 | case 38: goto RRETURN_38; |
1829 | case 40: goto RRETURN_40; |
1830 | case 42: goto RRETURN_42; |
1831 | case 44: goto RRETURN_44; |
1832 | case 48: goto RRETURN_48; |
1833 | case 52: goto RRETURN_52; |
1834 | } |
1835 | |
1836 | ASSERT_NOT_REACHED(); |
1837 | return matchError(JSRegExpErrorInternal, stack); |
1838 | |
1839 | #endif |
1840 | |
1841 | RETURN: |
1842 | return isMatch; |
1843 | } |
1844 | |
1845 | |
1846 | /************************************************* |
1847 | * Execute a Regular Expression * |
1848 | *************************************************/ |
1849 | |
1850 | /* This function applies a compiled re to a subject string and picks out |
1851 | portions of the string if it matches. Two elements in the vector are set for |
1852 | each substring: the offsets to the start and end of the substring. |
1853 | |
1854 | Arguments: |
1855 | re points to the compiled expression |
1856 | extra_data points to extra data or is NULL |
1857 | subject points to the subject string |
1858 | length length of subject string (may contain binary zeros) |
1859 | start_offset where to start in the subject string |
1860 | options option bits |
1861 | offsets points to a vector of ints to be filled in with offsets |
1862 | offsetCount the number of elements in the vector |
1863 | |
1864 | Returns: > 0 => success; value is the number of elements filled in |
1865 | = 0 => success, but offsets is not big enough |
1866 | -1 => failed to match |
1867 | < -1 => some kind of unexpected problem |
1868 | */ |
1869 | |
1870 | static void tryFirstByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int firstByte, bool firstByteIsCaseless, bool useMultiLineFirstCharOptimization, const UChar* originalSubjectStart) |
1871 | { |
1872 | // If firstByte is set, try scanning to the first instance of that byte |
1873 | // no need to try and match against any earlier part of the subject string. |
1874 | if (firstByte >= 0) { |
1875 | UChar firstChar = firstByte; |
1876 | if (firstByteIsCaseless) |
1877 | while (subjectPtr < endSubject) { |
1878 | int c = *subjectPtr; |
1879 | if (c > 127) |
1880 | break; |
1881 | if (toLowerCase(c) == firstChar) |
1882 | break; |
1883 | subjectPtr++; |
1884 | } |
1885 | else { |
1886 | while (subjectPtr < endSubject && *subjectPtr != firstChar) |
1887 | subjectPtr++; |
1888 | } |
1889 | } else if (useMultiLineFirstCharOptimization) { |
1890 | /* Or to just after \n for a multiline match if possible */ |
1891 | // I'm not sure why this != originalSubjectStart check is necessary -- ecs 11/18/07 |
1892 | if (subjectPtr > originalSubjectStart) { |
1893 | while (subjectPtr < endSubject && !isNewline(nl: subjectPtr[-1])) |
1894 | subjectPtr++; |
1895 | } |
1896 | } |
1897 | } |
1898 | |
1899 | static bool tryRequiredByteOptimization(const UChar*& subjectPtr, const UChar* endSubject, int reqByte, int reqByte2, bool reqByteIsCaseless, bool hasFirstByte, const UChar*& reqBytePtr) |
1900 | { |
1901 | /* If reqByte is set, we know that that character must appear in the subject |
1902 | for the match to succeed. If the first character is set, reqByte must be |
1903 | later in the subject; otherwise the test starts at the match point. This |
1904 | optimization can save a huge amount of backtracking in patterns with nested |
1905 | unlimited repeats that aren't going to match. Writing separate code for |
1906 | cased/caseless versions makes it go faster, as does using an autoincrement |
1907 | and backing off on a match. |
1908 | |
1909 | HOWEVER: when the subject string is very, very long, searching to its end can |
1910 | take a long time, and give bad performance on quite ordinary patterns. This |
1911 | showed up when somebody was matching /^C/ on a 32-megabyte string... so we |
1912 | don't do this when the string is sufficiently long. |
1913 | */ |
1914 | |
1915 | if (reqByte >= 0 && endSubject - subjectPtr < REQ_BYTE_MAX) { |
1916 | const UChar* p = subjectPtr + (hasFirstByte ? 1 : 0); |
1917 | |
1918 | /* We don't need to repeat the search if we haven't yet reached the |
1919 | place we found it at last time. */ |
1920 | |
1921 | if (p > reqBytePtr) { |
1922 | if (reqByteIsCaseless) { |
1923 | while (p < endSubject) { |
1924 | int pp = *p++; |
1925 | if (pp == reqByte || pp == reqByte2) { |
1926 | p--; |
1927 | break; |
1928 | } |
1929 | } |
1930 | } else { |
1931 | while (p < endSubject) { |
1932 | if (*p++ == reqByte) { |
1933 | p--; |
1934 | break; |
1935 | } |
1936 | } |
1937 | } |
1938 | |
1939 | /* If we can't find the required character, break the matching loop */ |
1940 | |
1941 | if (p >= endSubject) |
1942 | return true; |
1943 | |
1944 | /* If we have found the required character, save the point where we |
1945 | found it, so that we don't search again next time round the loop if |
1946 | the start hasn't passed this character yet. */ |
1947 | |
1948 | reqBytePtr = p; |
1949 | } |
1950 | } |
1951 | return false; |
1952 | } |
1953 | |
1954 | int jsRegExpExecute(const JSRegExp* re, |
1955 | const UChar* subject, int length, int start_offset, int* offsets, |
1956 | int offsetCount) |
1957 | { |
1958 | ASSERT(re); |
1959 | ASSERT(subject || !length); |
1960 | ASSERT(offsetCount >= 0); |
1961 | ASSERT(offsets || offsetCount == 0); |
1962 | |
1963 | HistogramTimeLogger logger(re); |
1964 | |
1965 | MatchData matchBlock; |
1966 | matchBlock.startSubject = subject; |
1967 | matchBlock.endSubject = matchBlock.startSubject + length; |
1968 | const UChar* endSubject = matchBlock.endSubject; |
1969 | |
1970 | matchBlock.multiline = (re->options & MatchAcrossMultipleLinesOption); |
1971 | matchBlock.ignoreCase = (re->options & IgnoreCaseOption); |
1972 | |
1973 | /* If the expression has got more back references than the offsets supplied can |
1974 | hold, we get a temporary chunk of working store to use during the matching. |
1975 | Otherwise, we can use the vector supplied, rounding down its size to a multiple |
1976 | of 3. */ |
1977 | |
1978 | int ocount = offsetCount - (offsetCount % 3); |
1979 | |
1980 | // FIXME: This is lame that we have to second-guess our caller here. |
1981 | // The API should change to either fail-hard when we don't have enough offset space |
1982 | // or that we shouldn't ask our callers to pre-allocate in the first place. |
1983 | bool usingTemporaryOffsets = false; |
1984 | if (re->topBackref > 0 && re->topBackref >= ocount/3) { |
1985 | ocount = re->topBackref * 3 + 3; |
1986 | matchBlock.offsetVector = new int[ocount]; |
1987 | if (!matchBlock.offsetVector) |
1988 | return JSRegExpErrorNoMemory; |
1989 | usingTemporaryOffsets = true; |
1990 | } else |
1991 | matchBlock.offsetVector = offsets; |
1992 | |
1993 | matchBlock.offsetEnd = ocount; |
1994 | matchBlock.offsetMax = (2*ocount)/3; |
1995 | matchBlock.offsetOverflow = false; |
1996 | |
1997 | /* Compute the minimum number of offsets that we need to reset each time. Doing |
1998 | this makes a huge difference to execution time when there aren't many brackets |
1999 | in the pattern. */ |
2000 | |
2001 | int resetCount = 2 + re->topBracket * 2; |
2002 | if (resetCount > offsetCount) |
2003 | resetCount = ocount; |
2004 | |
2005 | /* Reset the working variable associated with each extraction. These should |
2006 | never be used unless previously set, but they get saved and restored, and so we |
2007 | initialize them to avoid reading uninitialized locations. */ |
2008 | |
2009 | if (matchBlock.offsetVector) { |
2010 | int* iptr = matchBlock.offsetVector + ocount; |
2011 | int* iend = iptr - resetCount/2 + 1; |
2012 | while (--iptr >= iend) |
2013 | *iptr = -1; |
2014 | } |
2015 | |
2016 | /* Set up the first character to match, if available. The firstByte value is |
2017 | never set for an anchored regular expression, but the anchoring may be forced |
2018 | at run time, so we have to test for anchoring. The first char may be unset for |
2019 | an unanchored pattern, of course. If there's no first char and the pattern was |
2020 | studied, there may be a bitmap of possible first characters. */ |
2021 | |
2022 | bool firstByteIsCaseless = false; |
2023 | int firstByte = -1; |
2024 | if (re->options & UseFirstByteOptimizationOption) { |
2025 | firstByte = re->firstByte & 255; |
2026 | if ((firstByteIsCaseless = (re->firstByte & REQ_IGNORE_CASE))) |
2027 | firstByte = toLowerCase(c: firstByte); |
2028 | } |
2029 | |
2030 | /* For anchored or unanchored matches, there may be a "last known required |
2031 | character" set. */ |
2032 | |
2033 | bool reqByteIsCaseless = false; |
2034 | int reqByte = -1; |
2035 | int reqByte2 = -1; |
2036 | if (re->options & UseRequiredByteOptimizationOption) { |
2037 | reqByte = re->reqByte & 255; // FIXME: This optimization could be made to work for UTF16 chars as well... |
2038 | reqByteIsCaseless = (re->reqByte & REQ_IGNORE_CASE); |
2039 | reqByte2 = flipCase(c: reqByte); |
2040 | } |
2041 | |
2042 | /* Loop for handling unanchored repeated matching attempts; for anchored regexs |
2043 | the loop runs just once. */ |
2044 | |
2045 | const UChar* startMatch = subject + start_offset; |
2046 | const UChar* reqBytePtr = startMatch - 1; |
2047 | bool useMultiLineFirstCharOptimization = re->options & UseMultiLineFirstByteOptimizationOption; |
2048 | |
2049 | do { |
2050 | /* Reset the maximum number of extractions we might see. */ |
2051 | if (matchBlock.offsetVector) { |
2052 | int* iptr = matchBlock.offsetVector; |
2053 | int* iend = iptr + resetCount; |
2054 | while (iptr < iend) |
2055 | *iptr++ = -1; |
2056 | } |
2057 | |
2058 | tryFirstByteOptimization(subjectPtr&: startMatch, endSubject, firstByte, firstByteIsCaseless, useMultiLineFirstCharOptimization, originalSubjectStart: matchBlock.startSubject + start_offset); |
2059 | if (tryRequiredByteOptimization(subjectPtr&: startMatch, endSubject, reqByte, reqByte2, reqByteIsCaseless, hasFirstByte: firstByte >= 0, reqBytePtr)) |
2060 | break; |
2061 | |
2062 | /* When a match occurs, substrings will be set for all internal extractions; |
2063 | we just need to set up the whole thing as substring 0 before returning. If |
2064 | there were too many extractions, set the return code to zero. In the case |
2065 | where we had to get some local store to hold offsets for backreferences, copy |
2066 | those back references that we can. In this case there need not be overflow |
2067 | if certain parts of the pattern were not used. */ |
2068 | |
2069 | /* The code starts after the JSRegExp block and the capture name table. */ |
2070 | const unsigned char* start_code = (const unsigned char*)(re + 1); |
2071 | |
2072 | int returnCode = match(subjectPtr: startMatch, instructionPtr: start_code, offsetTop: 2, md&: matchBlock); |
2073 | |
2074 | /* When the result is no match, advance the pointer to the next character |
2075 | and continue. */ |
2076 | if (returnCode == 0) { |
2077 | startMatch++; |
2078 | continue; |
2079 | } |
2080 | |
2081 | if (returnCode != 1) { |
2082 | ASSERT(returnCode == JSRegExpErrorHitLimit || returnCode == JSRegExpErrorNoMemory); |
2083 | DPRINTF((">>>> error: returning %d\n" , returnCode)); |
2084 | return returnCode; |
2085 | } |
2086 | |
2087 | /* We have a match! Copy the offset information from temporary store if |
2088 | necessary */ |
2089 | |
2090 | if (usingTemporaryOffsets) { |
2091 | if (offsetCount >= 4) { |
2092 | memcpy(dest: offsets + 2, src: matchBlock.offsetVector + 2, n: (offsetCount - 2) * sizeof(int)); |
2093 | DPRINTF(("Copied offsets from temporary memory\n" )); |
2094 | } |
2095 | if (matchBlock.endOffsetTop > offsetCount) |
2096 | matchBlock.offsetOverflow = true; |
2097 | |
2098 | DPRINTF(("Freeing temporary memory\n" )); |
2099 | delete [] matchBlock.offsetVector; |
2100 | } |
2101 | |
2102 | returnCode = matchBlock.offsetOverflow ? 0 : matchBlock.endOffsetTop / 2; |
2103 | |
2104 | if (offsetCount < 2) |
2105 | returnCode = 0; |
2106 | else { |
2107 | offsets[0] = startMatch - matchBlock.startSubject; |
2108 | offsets[1] = matchBlock.endMatchPtr - matchBlock.startSubject; |
2109 | } |
2110 | |
2111 | DPRINTF((">>>> returning %d\n" , returnCode)); |
2112 | return returnCode; |
2113 | } while (!(re->options & IsAnchoredOption) && startMatch <= endSubject); |
2114 | |
2115 | if (usingTemporaryOffsets) { |
2116 | DPRINTF(("Freeing temporary memory\n" )); |
2117 | delete [] matchBlock.offsetVector; |
2118 | } |
2119 | |
2120 | DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n" )); |
2121 | return JSRegExpErrorNoMatch; |
2122 | } |
2123 | |
2124 | #if REGEXP_HISTOGRAM |
2125 | |
2126 | class CompareHistogramEntries { |
2127 | public: |
2128 | bool operator()(const pair<UString, double>& a, const pair<UString, double>& b) |
2129 | { |
2130 | if (a.second == b.second) |
2131 | return a.first < b.first; |
2132 | return a.second < b.second; |
2133 | } |
2134 | }; |
2135 | |
2136 | Histogram::~Histogram() |
2137 | { |
2138 | Vector<pair<UString, double> > values; |
2139 | Map::iterator end = times.end(); |
2140 | for (Map::iterator it = times.begin(); it != end; ++it) |
2141 | values.append(*it); |
2142 | sort(values.begin(), values.end(), CompareHistogramEntries()); |
2143 | size_t size = values.size(); |
2144 | printf("Regular Expressions, sorted by time spent evaluating them:\n" ); |
2145 | for (size_t i = 0; i < size; ++i) |
2146 | printf(" %f - %s\n" , values[size - i - 1].second, values[size - i - 1].first.UTF8String().c_str()); |
2147 | } |
2148 | |
2149 | void Histogram::add(const JSRegExp* re, double elapsedTime) |
2150 | { |
2151 | UString string(reinterpret_cast<const UChar*>(reinterpret_cast<const char*>(re) + re->stringOffset), re->stringLength); |
2152 | if (re->options & IgnoreCaseOption && re->options & MatchAcrossMultipleLinesOption) |
2153 | string += " (multi-line, ignore case)" ; |
2154 | else { |
2155 | if (re->options & IgnoreCaseOption) |
2156 | string += " (ignore case)" ; |
2157 | if (re->options & MatchAcrossMultipleLinesOption) |
2158 | string += " (multi-line)" ; |
2159 | } |
2160 | pair<Map::iterator, bool> result = times.add(string.rep(), elapsedTime); |
2161 | if (!result.second) |
2162 | result.first->second += elapsedTime; |
2163 | } |
2164 | |
2165 | HistogramTimeLogger::HistogramTimeLogger(const JSRegExp* re) |
2166 | : m_re(re) |
2167 | , m_startTime(currentTimeMS()) |
2168 | { |
2169 | } |
2170 | |
2171 | HistogramTimeLogger::~HistogramTimeLogger() |
2172 | { |
2173 | static Histogram histogram; |
2174 | histogram.add(m_re, currentTimeMS() - m_startTime); |
2175 | } |
2176 | |
2177 | #endif |
2178 | |