| 1 | /* This is JavaScriptCore's variant of the PCRE library. While this library | 
| 2 | started out as a copy of PCRE, many of the features of PCRE have been | 
| 3 | removed. This library now supports only the regular expression features | 
| 4 | required by the JavaScript language specification, and has only the functions | 
| 5 | needed by JavaScriptCore and the rest of WebKit. | 
| 6 |  | 
| 7 |                  Originally written by Philip Hazel | 
| 8 |            Copyright (c) 1997-2006 University of Cambridge | 
| 9 |     Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. | 
| 10 |  | 
| 11 | ----------------------------------------------------------------------------- | 
| 12 | Redistribution and use in source and binary forms, with or without | 
| 13 | modification, are permitted provided that the following conditions are met: | 
| 14 |  | 
| 15 |     * Redistributions of source code must retain the above copyright notice, | 
| 16 |       this list of conditions and the following disclaimer. | 
| 17 |  | 
| 18 |     * Redistributions in binary form must reproduce the above copyright | 
| 19 |       notice, this list of conditions and the following disclaimer in the | 
| 20 |       documentation and/or other materials provided with the distribution. | 
| 21 |  | 
| 22 |     * Neither the name of the University of Cambridge nor the names of its | 
| 23 |       contributors may be used to endorse or promote products derived from | 
| 24 |       this software without specific prior written permission. | 
| 25 |  | 
| 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | 
| 27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
| 28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | 
| 29 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | 
| 30 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | 
| 31 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | 
| 32 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | 
| 33 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | 
| 34 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | 
| 35 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | 
| 36 | POSSIBILITY OF SUCH DAMAGE. | 
| 37 | ----------------------------------------------------------------------------- | 
| 38 | */ | 
| 39 |  | 
| 40 | /* This header contains definitions that are shared between the different | 
| 41 | modules, but which are not relevant to the exported API. This includes some | 
| 42 | functions whose names all begin with "_pcre_". */ | 
| 43 |  | 
| 44 | #ifndef PCRE_INTERNAL_H | 
| 45 | #define PCRE_INTERNAL_H | 
| 46 |  | 
| 47 | /* Bit definitions for entries in the pcre_ctypes table. */ | 
| 48 |  | 
| 49 | #define ctype_space   0x01 | 
| 50 | #define ctype_xdigit  0x08 | 
| 51 | #define ctype_word    0x10   /* alphameric or '_' */ | 
| 52 |  | 
| 53 | /* Offsets for the bitmap tables in pcre_cbits. Each table contains a set | 
| 54 | of bits for a class map. Some classes are built by combining these tables. */ | 
| 55 |  | 
| 56 | #define cbit_space     0      /* \s */ | 
| 57 | #define cbit_digit    32      /* \d */ | 
| 58 | #define cbit_word     64      /* \w */ | 
| 59 | #define cbit_length   96      /* Length of the cbits table */ | 
| 60 |  | 
| 61 | /* Offsets of the various tables from the base tables pointer, and | 
| 62 | total length. */ | 
| 63 |  | 
| 64 | #define lcc_offset      0 | 
| 65 | #define fcc_offset    128 | 
| 66 | #define cbits_offset  256 | 
| 67 | #define ctypes_offset (cbits_offset + cbit_length) | 
| 68 | #define tables_length (ctypes_offset + 128) | 
| 69 |  | 
| 70 | #ifndef DFTABLES | 
| 71 |  | 
| 72 | // Change the following to 1 to dump used regular expressions at process exit time. | 
| 73 | #define REGEXP_HISTOGRAM 0 | 
| 74 |  | 
| 75 | #include "Assertions.h" | 
| 76 |  | 
| 77 | #if COMPILER(MSVC) | 
| 78 | #pragma warning(disable: 4232) | 
| 79 | #pragma warning(disable: 4244) | 
| 80 | #endif | 
| 81 |  | 
| 82 | #include "pcre.h" | 
| 83 |  | 
| 84 | /* The value of LINK_SIZE determines the number of bytes used to store links as | 
| 85 | offsets within the compiled regex. The default is 2, which allows for compiled | 
| 86 | patterns up to 64K long. */ | 
| 87 |  | 
| 88 | #define LINK_SIZE   3 | 
| 89 |  | 
| 90 | /* Define DEBUG to get debugging output on stdout. */ | 
| 91 |  | 
| 92 | #if 0 | 
| 93 | #define DEBUG | 
| 94 | #endif | 
| 95 |  | 
| 96 | /* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef | 
| 97 | inline, and there are *still* stupid compilers about that don't like indented | 
| 98 | pre-processor statements, or at least there were when I first wrote this. After | 
| 99 | all, it had only been about 10 years then... */ | 
| 100 |  | 
| 101 | #ifdef DEBUG | 
| 102 | #define DPRINTF(p) printf p | 
| 103 | #else | 
| 104 | #define DPRINTF(p) /*nothing*/ | 
| 105 | #endif | 
| 106 |  | 
| 107 | /* PCRE keeps offsets in its compiled code as 2-byte quantities (always stored | 
| 108 | in big-endian order) by default. These are used, for example, to link from the | 
| 109 | start of a subpattern to its alternatives and its end. The use of 2 bytes per | 
| 110 | offset limits the size of the compiled regex to around 64K, which is big enough | 
| 111 | for almost everybody. However, I received a request for an even bigger limit. | 
| 112 | For this reason, and also to make the code easier to maintain, the storing and | 
| 113 | loading of offsets from the byte string is now handled by the functions that are | 
| 114 | defined here. */ | 
| 115 |  | 
| 116 | /* PCRE uses some other 2-byte quantities that do not change when the size of | 
| 117 | offsets changes. There are used for repeat counts and for other things such as | 
| 118 | capturing parenthesis numbers in back references. */ | 
| 119 |  | 
| 120 | static inline void put2ByteValue(unsigned char* opcodePtr, int value) | 
| 121 | { | 
| 122 |     ASSERT(value >= 0 && value <= 0xFFFF); | 
| 123 |     opcodePtr[0] = value >> 8; | 
| 124 |     opcodePtr[1] = value; | 
| 125 | } | 
| 126 |  | 
| 127 | static inline void put3ByteValue(unsigned char* opcodePtr, int value) | 
| 128 | { | 
| 129 |     ASSERT(value >= 0 && value <= 0xFFFFFF); | 
| 130 |     opcodePtr[0] = value >> 16; | 
| 131 |     opcodePtr[1] = value >> 8; | 
| 132 |     opcodePtr[2] = value; | 
| 133 | } | 
| 134 |  | 
| 135 | static inline int get2ByteValue(const unsigned char* opcodePtr) | 
| 136 | { | 
| 137 |     return (opcodePtr[0] << 8) | opcodePtr[1]; | 
| 138 | } | 
| 139 |  | 
| 140 | static inline int get3ByteValue(const unsigned char* opcodePtr) | 
| 141 | { | 
| 142 |     return (opcodePtr[0] << 16) | (opcodePtr[1] << 8) | opcodePtr[2]; | 
| 143 | } | 
| 144 |  | 
| 145 | static inline void put2ByteValueAndAdvance(unsigned char*& opcodePtr, int value) | 
| 146 | { | 
| 147 |     put2ByteValue(opcodePtr, value); | 
| 148 |     opcodePtr += 2; | 
| 149 | } | 
| 150 |  | 
| 151 | static inline void put3ByteValueAndAdvance(unsigned char*& opcodePtr, int value) | 
| 152 | { | 
| 153 |     put3ByteValue(opcodePtr, value); | 
| 154 |     opcodePtr += 3; | 
| 155 | } | 
| 156 |  | 
| 157 | static inline void putLinkValueAllowZero(unsigned char* opcodePtr, int value) | 
| 158 | { | 
| 159 | #if LINK_SIZE == 3 | 
| 160 |     put3ByteValue(opcodePtr, value); | 
| 161 | #elif LINK_SIZE == 2 | 
| 162 |     put2ByteValue(opcodePtr, value); | 
| 163 | #else | 
| 164 | #   error LINK_SIZE not supported. | 
| 165 | #endif | 
| 166 | } | 
| 167 |  | 
| 168 | static inline int getLinkValueAllowZero(const unsigned char* opcodePtr) | 
| 169 | { | 
| 170 | #if LINK_SIZE == 3 | 
| 171 |     return get3ByteValue(opcodePtr); | 
| 172 | #elif LINK_SIZE == 2 | 
| 173 |     return get2ByteValue(opcodePtr); | 
| 174 | #else | 
| 175 | #   error LINK_SIZE not supported. | 
| 176 | #endif | 
| 177 | } | 
| 178 |  | 
| 179 | #define MAX_PATTERN_SIZE 1024 * 1024 // Derived by empirical testing of compile time in PCRE and WREC. | 
| 180 | COMPILE_ASSERT(MAX_PATTERN_SIZE < (1 << (8 * LINK_SIZE)), pcre_max_pattern_fits_in_bytecode); | 
| 181 |  | 
| 182 | static inline void putLinkValue(unsigned char* opcodePtr, int value) | 
| 183 | { | 
| 184 |     ASSERT(value); | 
| 185 |     putLinkValueAllowZero(opcodePtr, value); | 
| 186 | } | 
| 187 |  | 
| 188 | static inline int getLinkValue(const unsigned char* opcodePtr) | 
| 189 | { | 
| 190 |     int value = getLinkValueAllowZero(opcodePtr); | 
| 191 |     ASSERT(value); | 
| 192 |     return value; | 
| 193 | } | 
| 194 |  | 
| 195 | static inline void putLinkValueAndAdvance(unsigned char*& opcodePtr, int value) | 
| 196 | { | 
| 197 |     putLinkValue(opcodePtr, value); | 
| 198 |     opcodePtr += LINK_SIZE; | 
| 199 | } | 
| 200 |  | 
| 201 | static inline void putLinkValueAllowZeroAndAdvance(unsigned char*& opcodePtr, int value) | 
| 202 | { | 
| 203 |     putLinkValueAllowZero(opcodePtr, value); | 
| 204 |     opcodePtr += LINK_SIZE; | 
| 205 | } | 
| 206 |  | 
| 207 | // FIXME: These are really more of a "compiled regexp state" than "regexp options" | 
| 208 | enum RegExpOptions { | 
| 209 |     UseFirstByteOptimizationOption = 0x40000000,  /* firstByte is set */ | 
| 210 |     UseRequiredByteOptimizationOption = 0x20000000,  /* reqByte is set */ | 
| 211 |     UseMultiLineFirstByteOptimizationOption = 0x10000000,  /* start after \n for multiline */ | 
| 212 |     IsAnchoredOption = 0x02000000,  /* can't use partial with this regex */ | 
| 213 |     IgnoreCaseOption = 0x00000001, | 
| 214 |     MatchAcrossMultipleLinesOption = 0x00000002 | 
| 215 | }; | 
| 216 |  | 
| 217 | /* Flags added to firstByte or reqByte; a "non-literal" item is either a | 
| 218 | variable-length repeat, or a anything other than literal characters. */ | 
| 219 |  | 
| 220 | #define REQ_IGNORE_CASE 0x0100    /* indicates should ignore case */ | 
| 221 | #define REQ_VARY     0x0200    /* reqByte followed non-literal item */ | 
| 222 |  | 
| 223 | /* Miscellaneous definitions */ | 
| 224 |  | 
| 225 | /* Flag bits and data types for the extended class (OP_XCLASS) for classes that | 
| 226 | contain UTF-8 characters with values greater than 255. */ | 
| 227 |  | 
| 228 | #define XCL_NOT    0x01    /* Flag: this is a negative class */ | 
| 229 | #define XCL_MAP    0x02    /* Flag: a 32-byte map is present */ | 
| 230 |  | 
| 231 | #define XCL_END       0    /* Marks end of individual items */ | 
| 232 | #define XCL_SINGLE    1    /* Single item (one multibyte char) follows */ | 
| 233 | #define XCL_RANGE     2    /* A range (two multibyte chars) follows */ | 
| 234 |  | 
| 235 | /* These are escaped items that aren't just an encoding of a particular data | 
| 236 | value such as \n. They must have non-zero values, as check_escape() returns | 
| 237 | their negation. Also, they must appear in the same order as in the opcode | 
| 238 | definitions below, up to ESC_w. The final one must be | 
| 239 | ESC_REF as subsequent values are used for \1, \2, \3, etc. There is are two | 
| 240 | tests in the code for an escape > ESC_b and <= ESC_w to | 
| 241 | detect the types that may be repeated. These are the types that consume | 
| 242 | characters. If any new escapes are put in between that don't consume a | 
| 243 | character, that code will have to change. */ | 
| 244 |  | 
| 245 | enum { ESC_B = 1, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w, ESC_REF }; | 
| 246 |  | 
| 247 | /* Opcode table: OP_BRA must be last, as all values >= it are used for brackets | 
| 248 | that extract substrings. Starting from 1 (i.e. after OP_END), the values up to | 
| 249 | OP_EOD must correspond in order to the list of escapes immediately above. | 
| 250 | Note that whenever this list is updated, the two macro definitions that follow | 
| 251 | must also be updated to match. */ | 
| 252 |  | 
| 253 | #define FOR_EACH_OPCODE(macro) \ | 
| 254 |     macro(END) \ | 
| 255 |     \ | 
| 256 |     macro(NOT_WORD_BOUNDARY) \ | 
| 257 |     macro(WORD_BOUNDARY) \ | 
| 258 |     macro(NOT_DIGIT) \ | 
| 259 |     macro(DIGIT) \ | 
| 260 |     macro(NOT_WHITESPACE) \ | 
| 261 |     macro(WHITESPACE) \ | 
| 262 |     macro(NOT_WORDCHAR) \ | 
| 263 |     macro(WORDCHAR) \ | 
| 264 |     \ | 
| 265 |     macro(NOT_NEWLINE) \ | 
| 266 |     \ | 
| 267 |     macro(CIRC) \ | 
| 268 |     macro(DOLL) \ | 
| 269 |     macro(BOL) \ | 
| 270 |     macro(EOL) \ | 
| 271 |     macro(CHAR) \ | 
| 272 |     macro(CHAR_IGNORING_CASE) \ | 
| 273 |     macro(ASCII_CHAR) \ | 
| 274 |     macro(ASCII_LETTER_IGNORING_CASE) \ | 
| 275 |     macro(NOT) \ | 
| 276 |     \ | 
| 277 |     macro(STAR) \ | 
| 278 |     macro(MINSTAR) \ | 
| 279 |     macro(PLUS) \ | 
| 280 |     macro(MINPLUS) \ | 
| 281 |     macro(QUERY) \ | 
| 282 |     macro(MINQUERY) \ | 
| 283 |     macro(UPTO) \ | 
| 284 |     macro(MINUPTO) \ | 
| 285 |     macro(EXACT) \ | 
| 286 |     \ | 
| 287 |     macro(NOTSTAR) \ | 
| 288 |     macro(NOTMINSTAR) \ | 
| 289 |     macro(NOTPLUS) \ | 
| 290 |     macro(NOTMINPLUS) \ | 
| 291 |     macro(NOTQUERY) \ | 
| 292 |     macro(NOTMINQUERY) \ | 
| 293 |     macro(NOTUPTO) \ | 
| 294 |     macro(NOTMINUPTO) \ | 
| 295 |     macro(NOTEXACT) \ | 
| 296 |     \ | 
| 297 |     macro(TYPESTAR) \ | 
| 298 |     macro(TYPEMINSTAR) \ | 
| 299 |     macro(TYPEPLUS) \ | 
| 300 |     macro(TYPEMINPLUS) \ | 
| 301 |     macro(TYPEQUERY) \ | 
| 302 |     macro(TYPEMINQUERY) \ | 
| 303 |     macro(TYPEUPTO) \ | 
| 304 |     macro(TYPEMINUPTO) \ | 
| 305 |     macro(TYPEEXACT) \ | 
| 306 |     \ | 
| 307 |     macro(CRSTAR) \ | 
| 308 |     macro(CRMINSTAR) \ | 
| 309 |     macro(CRPLUS) \ | 
| 310 |     macro(CRMINPLUS) \ | 
| 311 |     macro(CRQUERY) \ | 
| 312 |     macro(CRMINQUERY) \ | 
| 313 |     macro(CRRANGE) \ | 
| 314 |     macro(CRMINRANGE) \ | 
| 315 |     \ | 
| 316 |     macro(CLASS) \ | 
| 317 |     macro(NCLASS) \ | 
| 318 |     macro(XCLASS) \ | 
| 319 |     \ | 
| 320 |     macro(REF) \ | 
| 321 |     \ | 
| 322 |     macro(ALT) \ | 
| 323 |     macro(KET) \ | 
| 324 |     macro(KETRMAX) \ | 
| 325 |     macro(KETRMIN) \ | 
| 326 |     \ | 
| 327 |     macro(ASSERT) \ | 
| 328 |     macro(ASSERT_NOT) \ | 
| 329 |     \ | 
| 330 |     macro(BRAZERO) \ | 
| 331 |     macro(BRAMINZERO) \ | 
| 332 |     macro(BRANUMBER) \ | 
| 333 |     macro(BRA) | 
| 334 |  | 
| 335 | #define OPCODE_ENUM_VALUE(opcode) OP_##opcode, | 
| 336 | enum { FOR_EACH_OPCODE(OPCODE_ENUM_VALUE) }; | 
| 337 |  | 
| 338 | /* WARNING WARNING WARNING: There is an implicit assumption in pcre.c and | 
| 339 | study.c that all opcodes are less than 128 in value. This makes handling UTF-8 | 
| 340 | character sequences easier. */ | 
| 341 |  | 
| 342 | /* The highest extraction number before we have to start using additional | 
| 343 | bytes. (Originally PCRE didn't have support for extraction counts higher than | 
| 344 | this number.) The value is limited by the number of opcodes left after OP_BRA, | 
| 345 | i.e. 255 - OP_BRA. We actually set it a bit lower to leave room for additional | 
| 346 | opcodes. */ | 
| 347 |  | 
| 348 | /* FIXME: Note that OP_BRA + 100 is > 128, so the two comments above | 
| 349 | are in conflict! */ | 
| 350 |  | 
| 351 | #define   100 | 
| 352 |  | 
| 353 | /* The code vector runs on as long as necessary after the end. */ | 
| 354 |  | 
| 355 | struct JSRegExp { | 
| 356 |     unsigned options; | 
| 357 |  | 
| 358 |     unsigned short topBracket; | 
| 359 |     unsigned short topBackref; | 
| 360 |      | 
| 361 |     unsigned short firstByte; | 
| 362 |     unsigned short reqByte; | 
| 363 |  | 
| 364 | #if REGEXP_HISTOGRAM | 
| 365 |     size_t stringOffset; | 
| 366 |     size_t stringLength; | 
| 367 | #endif | 
| 368 | }; | 
| 369 |  | 
| 370 | /* Internal shared data tables. These are tables that are used by more than one | 
| 371 |  of the exported public functions. They have to be "external" in the C sense, | 
| 372 |  but are not part of the PCRE public API. The data for these tables is in the | 
| 373 |  pcre_tables.c module. */ | 
| 374 |  | 
| 375 | #define jsc_pcre_utf8_table1_size 6 | 
| 376 |  | 
| 377 | extern const int    jsc_pcre_utf8_table1[6]; | 
| 378 | extern const int    jsc_pcre_utf8_table2[6]; | 
| 379 | extern const int    jsc_pcre_utf8_table3[6]; | 
| 380 | extern const unsigned char jsc_pcre_utf8_table4[0x40]; | 
| 381 |  | 
| 382 | extern const unsigned char jsc_pcre_default_tables[tables_length]; | 
| 383 |  | 
| 384 | static inline unsigned char toLowerCase(unsigned char c) | 
| 385 | { | 
| 386 |     static const unsigned char* lowerCaseChars = jsc_pcre_default_tables + lcc_offset; | 
| 387 |     return lowerCaseChars[c]; | 
| 388 | } | 
| 389 |  | 
| 390 | static inline unsigned char flipCase(unsigned char c) | 
| 391 | { | 
| 392 |     static const unsigned char* flippedCaseChars = jsc_pcre_default_tables + fcc_offset; | 
| 393 |     return flippedCaseChars[c]; | 
| 394 | } | 
| 395 |  | 
| 396 | static inline unsigned char classBitmapForChar(unsigned char c) | 
| 397 | { | 
| 398 |     static const unsigned char* charClassBitmaps = jsc_pcre_default_tables + cbits_offset; | 
| 399 |     return charClassBitmaps[c]; | 
| 400 | } | 
| 401 |  | 
| 402 | static inline unsigned char charTypeForChar(unsigned char c) | 
| 403 | { | 
| 404 |     const unsigned char* charTypeMap = jsc_pcre_default_tables + ctypes_offset; | 
| 405 |     return charTypeMap[c]; | 
| 406 | } | 
| 407 |  | 
| 408 | static inline bool isWordChar(UChar c) | 
| 409 | { | 
| 410 |     return c < 128 && (charTypeForChar(c) & ctype_word); | 
| 411 | } | 
| 412 |  | 
| 413 | static inline bool isSpaceChar(UChar c) | 
| 414 | { | 
| 415 |     return (c < 128 && (charTypeForChar(c) & ctype_space)) || c == 0x00A0; | 
| 416 | } | 
| 417 |  | 
| 418 | static inline bool isNewline(UChar nl) | 
| 419 | { | 
| 420 |     return (nl == 0xA || nl == 0xD || nl == 0x2028 || nl == 0x2029); | 
| 421 | } | 
| 422 |  | 
| 423 | static inline bool isBracketStartOpcode(unsigned char opcode) | 
| 424 | { | 
| 425 |     if (opcode >= OP_BRA) | 
| 426 |         return true; | 
| 427 |     switch (opcode) { | 
| 428 |         case OP_ASSERT: | 
| 429 |         case OP_ASSERT_NOT: | 
| 430 |             return true; | 
| 431 |         default: | 
| 432 |             return false; | 
| 433 |     } | 
| 434 | } | 
| 435 |  | 
| 436 | static inline void advanceToEndOfBracket(const unsigned char*& opcodePtr) | 
| 437 | { | 
| 438 |     ASSERT(isBracketStartOpcode(*opcodePtr) || *opcodePtr == OP_ALT); | 
| 439 |     do | 
| 440 |         opcodePtr += getLinkValue(opcodePtr: opcodePtr + 1); | 
| 441 |     while (*opcodePtr == OP_ALT); | 
| 442 | } | 
| 443 |  | 
| 444 | /* Internal shared functions. These are functions that are used in more | 
| 445 | that one of the source files. They have to have external linkage, but | 
| 446 | but are not part of the public API and so not exported from the library. */ | 
| 447 |  | 
| 448 | extern int jsc_pcre_ucp_othercase(unsigned); | 
| 449 | extern bool jsc_pcre_xclass(int, const unsigned char*); | 
| 450 |  | 
| 451 | #endif | 
| 452 |  | 
| 453 | #endif | 
| 454 |  | 
| 455 | /* End of pcre_internal.h */ | 
| 456 |  |