| 1 | /* |
| 2 | * Copyright (C) 2009 Apple Inc. All rights reserved. |
| 3 | * |
| 4 | * Redistribution and use in source and binary forms, with or without |
| 5 | * modification, are permitted provided that the following conditions |
| 6 | * are met: |
| 7 | * 1. Redistributions of source code must retain the above copyright |
| 8 | * notice, this list of conditions and the following disclaimer. |
| 9 | * 2. Redistributions in binary form must reproduce the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer in the |
| 11 | * documentation and/or other materials provided with the distribution. |
| 12 | * |
| 13 | * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY |
| 14 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 15 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
| 16 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR |
| 17 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| 18 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| 19 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| 20 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY |
| 21 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 23 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 24 | */ |
| 25 | |
| 26 | #include "config.h" |
| 27 | #include "RegexCompiler.h" |
| 28 | |
| 29 | #include "RegexInterpreter.h" |
| 30 | #include "RegexPattern.h" |
| 31 | #include <wtf/Vector.h> |
| 32 | |
| 33 | #if ENABLE(YARR) |
| 34 | |
| 35 | using namespace WTF; |
| 36 | |
| 37 | namespace JSC { namespace Yarr { |
| 38 | |
| 39 | class CharacterClassConstructor { |
| 40 | public: |
| 41 | CharacterClassConstructor(bool isCaseInsensitive = false) |
| 42 | : m_isCaseInsensitive(isCaseInsensitive) |
| 43 | { |
| 44 | } |
| 45 | |
| 46 | void reset() |
| 47 | { |
| 48 | m_matches.clear(); |
| 49 | m_ranges.clear(); |
| 50 | m_matchesUnicode.clear(); |
| 51 | m_rangesUnicode.clear(); |
| 52 | } |
| 53 | |
| 54 | void append(const CharacterClass* other) |
| 55 | { |
| 56 | for (size_t i = 0; i < other->m_matches.size(); ++i) |
| 57 | addSorted(matches&: m_matches, ch: other->m_matches[i]); |
| 58 | for (size_t i = 0; i < other->m_ranges.size(); ++i) |
| 59 | addSortedRange(ranges&: m_ranges, lo: other->m_ranges[i].begin, hi: other->m_ranges[i].end); |
| 60 | for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) |
| 61 | addSorted(matches&: m_matchesUnicode, ch: other->m_matchesUnicode[i]); |
| 62 | for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) |
| 63 | addSortedRange(ranges&: m_rangesUnicode, lo: other->m_rangesUnicode[i].begin, hi: other->m_rangesUnicode[i].end); |
| 64 | } |
| 65 | |
| 66 | void putChar(UChar ch) |
| 67 | { |
| 68 | if (ch <= 0x7f) { |
| 69 | if (m_isCaseInsensitive && isASCIIAlpha(c: ch)) { |
| 70 | addSorted(matches&: m_matches, ch: toASCIIUpper(c: ch)); |
| 71 | addSorted(matches&: m_matches, ch: toASCIILower(c: ch)); |
| 72 | } else |
| 73 | addSorted(matches&: m_matches, ch); |
| 74 | } else { |
| 75 | UChar upper, lower; |
| 76 | if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { |
| 77 | addSorted(matches&: m_matchesUnicode, ch: upper); |
| 78 | addSorted(matches&: m_matchesUnicode, ch: lower); |
| 79 | } else |
| 80 | addSorted(matches&: m_matchesUnicode, ch); |
| 81 | } |
| 82 | } |
| 83 | |
| 84 | // returns true if this character has another case, and 'ch' is the upper case form. |
| 85 | static inline bool isUnicodeUpper(UChar ch) |
| 86 | { |
| 87 | return ch != Unicode::toLower(ch); |
| 88 | } |
| 89 | |
| 90 | // returns true if this character has another case, and 'ch' is the lower case form. |
| 91 | static inline bool isUnicodeLower(UChar ch) |
| 92 | { |
| 93 | return ch != Unicode::toUpper(ch); |
| 94 | } |
| 95 | |
| 96 | void putRange(UChar lo, UChar hi) |
| 97 | { |
| 98 | if (lo <= 0x7f) { |
| 99 | char asciiLo = lo; |
| 100 | char asciiHi = std::min(a: hi, b: (UChar)0x7f); |
| 101 | addSortedRange(ranges&: m_ranges, lo, hi: asciiHi); |
| 102 | |
| 103 | if (m_isCaseInsensitive) { |
| 104 | if ((asciiLo <= 'Z') && (asciiHi >= 'A')) |
| 105 | addSortedRange(ranges&: m_ranges, lo: std::max(a: asciiLo, b: 'A')+('a'-'A'), hi: std::min(a: asciiHi, b: 'Z')+('a'-'A')); |
| 106 | if ((asciiLo <= 'z') && (asciiHi >= 'a')) |
| 107 | addSortedRange(ranges&: m_ranges, lo: std::max(a: asciiLo, b: 'a')+('A'-'a'), hi: std::min(a: asciiHi, b: 'z')+('A'-'a')); |
| 108 | } |
| 109 | } |
| 110 | if (hi >= 0x80) { |
| 111 | uint32_t unicodeCurr = std::max(a: lo, b: (UChar)0x80); |
| 112 | addSortedRange(ranges&: m_rangesUnicode, lo: unicodeCurr, hi); |
| 113 | |
| 114 | if (m_isCaseInsensitive) { |
| 115 | while (unicodeCurr <= hi) { |
| 116 | // If the upper bound of the range (hi) is 0xffff, the increments to |
| 117 | // unicodeCurr in this loop may take it to 0x10000. This is fine |
| 118 | // (if so we won't re-enter the loop, since the loop condition above |
| 119 | // will definitely fail) - but this does mean we cannot use a UChar |
| 120 | // to represent unicodeCurr, we must use a 32-bit value instead. |
| 121 | ASSERT(unicodeCurr <= 0xffff); |
| 122 | |
| 123 | if (isUnicodeUpper(ch: unicodeCurr)) { |
| 124 | UChar lowerCaseRangeBegin = Unicode::toLower(ch: unicodeCurr); |
| 125 | UChar lowerCaseRangeEnd = lowerCaseRangeBegin; |
| 126 | while ((++unicodeCurr <= hi) && isUnicodeUpper(ch: unicodeCurr) && (Unicode::toLower(ch: unicodeCurr) == (lowerCaseRangeEnd + 1))) |
| 127 | lowerCaseRangeEnd++; |
| 128 | addSortedRange(ranges&: m_rangesUnicode, lo: lowerCaseRangeBegin, hi: lowerCaseRangeEnd); |
| 129 | } else if (isUnicodeLower(ch: unicodeCurr)) { |
| 130 | UChar upperCaseRangeBegin = Unicode::toUpper(ch: unicodeCurr); |
| 131 | UChar upperCaseRangeEnd = upperCaseRangeBegin; |
| 132 | while ((++unicodeCurr <= hi) && isUnicodeLower(ch: unicodeCurr) && (Unicode::toUpper(ch: unicodeCurr) == (upperCaseRangeEnd + 1))) |
| 133 | upperCaseRangeEnd++; |
| 134 | addSortedRange(ranges&: m_rangesUnicode, lo: upperCaseRangeBegin, hi: upperCaseRangeEnd); |
| 135 | } else |
| 136 | ++unicodeCurr; |
| 137 | } |
| 138 | } |
| 139 | } |
| 140 | } |
| 141 | |
| 142 | CharacterClass* charClass() |
| 143 | { |
| 144 | CharacterClass* characterClass = new CharacterClass(); |
| 145 | |
| 146 | characterClass->m_matches.append(val: m_matches); |
| 147 | characterClass->m_ranges.append(val: m_ranges); |
| 148 | characterClass->m_matchesUnicode.append(val: m_matchesUnicode); |
| 149 | characterClass->m_rangesUnicode.append(val: m_rangesUnicode); |
| 150 | |
| 151 | reset(); |
| 152 | |
| 153 | return characterClass; |
| 154 | } |
| 155 | |
| 156 | private: |
| 157 | void addSorted(Vector<UChar>& matches, UChar ch) |
| 158 | { |
| 159 | unsigned pos = 0; |
| 160 | unsigned range = matches.size(); |
| 161 | |
| 162 | // binary chop, find position to insert char. |
| 163 | while (range) { |
| 164 | unsigned index = range >> 1; |
| 165 | |
| 166 | int val = matches[pos+index] - ch; |
| 167 | if (!val) |
| 168 | return; |
| 169 | else if (val > 0) |
| 170 | range = index; |
| 171 | else { |
| 172 | pos += (index+1); |
| 173 | range -= (index+1); |
| 174 | } |
| 175 | } |
| 176 | |
| 177 | if (pos == matches.size()) |
| 178 | matches.append(val: ch); |
| 179 | else |
| 180 | matches.insert(position: pos, val: ch); |
| 181 | } |
| 182 | |
| 183 | void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) |
| 184 | { |
| 185 | unsigned end = ranges.size(); |
| 186 | |
| 187 | // Simple linear scan - I doubt there are that many ranges anyway... |
| 188 | // feel free to fix this with something faster (eg binary chop). |
| 189 | for (unsigned i = 0; i < end; ++i) { |
| 190 | // does the new range fall before the current position in the array |
| 191 | if (hi < ranges[i].begin) { |
| 192 | // optional optimization: concatenate appending ranges? - may not be worthwhile. |
| 193 | if (hi == (ranges[i].begin - 1)) { |
| 194 | ranges[i].begin = lo; |
| 195 | return; |
| 196 | } |
| 197 | ranges.insert(position: i, val: CharacterRange(lo, hi)); |
| 198 | return; |
| 199 | } |
| 200 | // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining |
| 201 | // If the new range start at or before the end of the last range, then the overlap (if it starts one after the |
| 202 | // end of the last range they concatenate, which is just as good. |
| 203 | if (lo <= (ranges[i].end + 1)) { |
| 204 | // found an intersect! we'll replace this entry in the array. |
| 205 | ranges[i].begin = std::min(a: ranges[i].begin, b: lo); |
| 206 | ranges[i].end = std::max(a: ranges[i].end, b: hi); |
| 207 | |
| 208 | // now check if the new range can subsume any subsequent ranges. |
| 209 | unsigned next = i+1; |
| 210 | // each iteration of the loop we will either remove something from the list, or break the loop. |
| 211 | while (next < ranges.size()) { |
| 212 | if (ranges[next].begin <= (ranges[i].end + 1)) { |
| 213 | // the next entry now overlaps / concatenates this one. |
| 214 | ranges[i].end = std::max(a: ranges[i].end, b: ranges[next].end); |
| 215 | ranges.remove(position: next); |
| 216 | } else |
| 217 | break; |
| 218 | } |
| 219 | |
| 220 | return; |
| 221 | } |
| 222 | } |
| 223 | |
| 224 | // CharacterRange comes after all existing ranges. |
| 225 | ranges.append(val: CharacterRange(lo, hi)); |
| 226 | } |
| 227 | |
| 228 | bool m_isCaseInsensitive; |
| 229 | |
| 230 | Vector<UChar> m_matches; |
| 231 | Vector<CharacterRange> m_ranges; |
| 232 | Vector<UChar> m_matchesUnicode; |
| 233 | Vector<CharacterRange> m_rangesUnicode; |
| 234 | }; |
| 235 | |
| 236 | |
| 237 | CharacterClass* newlineCreate() |
| 238 | { |
| 239 | CharacterClass* characterClass = new CharacterClass(); |
| 240 | |
| 241 | characterClass->m_matches.append(val: '\n'); |
| 242 | characterClass->m_matches.append(val: '\r'); |
| 243 | characterClass->m_matchesUnicode.append(val: 0x2028); |
| 244 | characterClass->m_matchesUnicode.append(val: 0x2029); |
| 245 | |
| 246 | return characterClass; |
| 247 | } |
| 248 | |
| 249 | CharacterClass* digitsCreate() |
| 250 | { |
| 251 | CharacterClass* characterClass = new CharacterClass(); |
| 252 | |
| 253 | characterClass->m_ranges.append(val: CharacterRange('0', '9')); |
| 254 | |
| 255 | return characterClass; |
| 256 | } |
| 257 | |
| 258 | CharacterClass* spacesCreate() |
| 259 | { |
| 260 | CharacterClass* characterClass = new CharacterClass(); |
| 261 | |
| 262 | characterClass->m_matches.append(val: ' '); |
| 263 | characterClass->m_ranges.append(val: CharacterRange('\t', '\r')); |
| 264 | characterClass->m_matchesUnicode.append(val: 0x00a0); |
| 265 | characterClass->m_matchesUnicode.append(val: 0x1680); |
| 266 | characterClass->m_matchesUnicode.append(val: 0x180e); |
| 267 | characterClass->m_matchesUnicode.append(val: 0x2028); |
| 268 | characterClass->m_matchesUnicode.append(val: 0x2029); |
| 269 | characterClass->m_matchesUnicode.append(val: 0x202f); |
| 270 | characterClass->m_matchesUnicode.append(val: 0x205f); |
| 271 | characterClass->m_matchesUnicode.append(val: 0x3000); |
| 272 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x2000, 0x200a)); |
| 273 | |
| 274 | return characterClass; |
| 275 | } |
| 276 | |
| 277 | CharacterClass* wordcharCreate() |
| 278 | { |
| 279 | CharacterClass* characterClass = new CharacterClass(); |
| 280 | |
| 281 | characterClass->m_matches.append(val: '_'); |
| 282 | characterClass->m_ranges.append(val: CharacterRange('0', '9')); |
| 283 | characterClass->m_ranges.append(val: CharacterRange('A', 'Z')); |
| 284 | characterClass->m_ranges.append(val: CharacterRange('a', 'z')); |
| 285 | |
| 286 | return characterClass; |
| 287 | } |
| 288 | |
| 289 | CharacterClass* nondigitsCreate() |
| 290 | { |
| 291 | CharacterClass* characterClass = new CharacterClass(); |
| 292 | |
| 293 | characterClass->m_ranges.append(val: CharacterRange(0, '0' - 1)); |
| 294 | characterClass->m_ranges.append(val: CharacterRange('9' + 1, 0x7f)); |
| 295 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x80, 0xffff)); |
| 296 | |
| 297 | return characterClass; |
| 298 | } |
| 299 | |
| 300 | CharacterClass* nonspacesCreate() |
| 301 | { |
| 302 | CharacterClass* characterClass = new CharacterClass(); |
| 303 | |
| 304 | characterClass->m_ranges.append(val: CharacterRange(0, '\t' - 1)); |
| 305 | characterClass->m_ranges.append(val: CharacterRange('\r' + 1, ' ' - 1)); |
| 306 | characterClass->m_ranges.append(val: CharacterRange(' ' + 1, 0x7f)); |
| 307 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x0080, 0x009f)); |
| 308 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x00a1, 0x167f)); |
| 309 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x1681, 0x180d)); |
| 310 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x180f, 0x1fff)); |
| 311 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x200b, 0x2027)); |
| 312 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x202a, 0x202e)); |
| 313 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x2030, 0x205e)); |
| 314 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x2060, 0x2fff)); |
| 315 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x3001, 0xffff)); |
| 316 | |
| 317 | return characterClass; |
| 318 | } |
| 319 | |
| 320 | CharacterClass* nonwordcharCreate() |
| 321 | { |
| 322 | CharacterClass* characterClass = new CharacterClass(); |
| 323 | |
| 324 | characterClass->m_matches.append(val: '`'); |
| 325 | characterClass->m_ranges.append(val: CharacterRange(0, '0' - 1)); |
| 326 | characterClass->m_ranges.append(val: CharacterRange('9' + 1, 'A' - 1)); |
| 327 | characterClass->m_ranges.append(val: CharacterRange('Z' + 1, '_' - 1)); |
| 328 | characterClass->m_ranges.append(val: CharacterRange('z' + 1, 0x7f)); |
| 329 | characterClass->m_rangesUnicode.append(val: CharacterRange(0x80, 0xffff)); |
| 330 | |
| 331 | return characterClass; |
| 332 | } |
| 333 | |
| 334 | |
| 335 | class RegexPatternConstructor { |
| 336 | public: |
| 337 | RegexPatternConstructor(RegexPattern& pattern) |
| 338 | : m_pattern(pattern) |
| 339 | , m_characterClassConstructor(pattern.m_ignoreCase) |
| 340 | { |
| 341 | } |
| 342 | |
| 343 | ~RegexPatternConstructor() |
| 344 | { |
| 345 | } |
| 346 | |
| 347 | void reset() |
| 348 | { |
| 349 | m_pattern.reset(); |
| 350 | m_characterClassConstructor.reset(); |
| 351 | } |
| 352 | |
| 353 | void assertionBOL() |
| 354 | { |
| 355 | m_alternative->m_terms.append(val: PatternTerm::BOL()); |
| 356 | } |
| 357 | void assertionEOL() |
| 358 | { |
| 359 | m_alternative->m_terms.append(val: PatternTerm::EOL()); |
| 360 | } |
| 361 | void assertionWordBoundary(bool invert) |
| 362 | { |
| 363 | m_alternative->m_terms.append(val: PatternTerm::WordBoundary(invert)); |
| 364 | } |
| 365 | |
| 366 | void atomPatternCharacter(UChar ch) |
| 367 | { |
| 368 | // We handle case-insensitive checking of unicode characters which do have both |
| 369 | // cases by handling them as if they were defined using a CharacterClass. |
| 370 | if (m_pattern.m_ignoreCase && !isASCII(c: ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { |
| 371 | atomCharacterClassBegin(); |
| 372 | atomCharacterClassAtom(ch); |
| 373 | atomCharacterClassEnd(); |
| 374 | } else |
| 375 | m_alternative->m_terms.append(val: PatternTerm(ch)); |
| 376 | } |
| 377 | |
| 378 | void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) |
| 379 | { |
| 380 | switch (classID) { |
| 381 | case DigitClassID: |
| 382 | m_alternative->m_terms.append(val: PatternTerm(m_pattern.digitsCharacterClass(), invert)); |
| 383 | break; |
| 384 | case SpaceClassID: |
| 385 | m_alternative->m_terms.append(val: PatternTerm(m_pattern.spacesCharacterClass(), invert)); |
| 386 | break; |
| 387 | case WordClassID: |
| 388 | m_alternative->m_terms.append(val: PatternTerm(m_pattern.wordcharCharacterClass(), invert)); |
| 389 | break; |
| 390 | case NewlineClassID: |
| 391 | m_alternative->m_terms.append(val: PatternTerm(m_pattern.newlineCharacterClass(), invert)); |
| 392 | break; |
| 393 | } |
| 394 | } |
| 395 | |
| 396 | void atomCharacterClassBegin(bool invert = false) |
| 397 | { |
| 398 | m_invertCharacterClass = invert; |
| 399 | } |
| 400 | |
| 401 | void atomCharacterClassAtom(UChar ch) |
| 402 | { |
| 403 | m_characterClassConstructor.putChar(ch); |
| 404 | } |
| 405 | |
| 406 | void atomCharacterClassRange(UChar begin, UChar end) |
| 407 | { |
| 408 | m_characterClassConstructor.putRange(lo: begin, hi: end); |
| 409 | } |
| 410 | |
| 411 | void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) |
| 412 | { |
| 413 | ASSERT(classID != NewlineClassID); |
| 414 | |
| 415 | switch (classID) { |
| 416 | case DigitClassID: |
| 417 | m_characterClassConstructor.append(other: invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); |
| 418 | break; |
| 419 | |
| 420 | case SpaceClassID: |
| 421 | m_characterClassConstructor.append(other: invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); |
| 422 | break; |
| 423 | |
| 424 | case WordClassID: |
| 425 | m_characterClassConstructor.append(other: invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); |
| 426 | break; |
| 427 | |
| 428 | default: |
| 429 | ASSERT_NOT_REACHED(); |
| 430 | } |
| 431 | } |
| 432 | |
| 433 | void atomCharacterClassEnd() |
| 434 | { |
| 435 | CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); |
| 436 | m_pattern.m_userCharacterClasses.append(val: newCharacterClass); |
| 437 | m_alternative->m_terms.append(val: PatternTerm(newCharacterClass, m_invertCharacterClass)); |
| 438 | } |
| 439 | |
| 440 | void atomParenthesesSubpatternBegin(bool capture = true) |
| 441 | { |
| 442 | unsigned subpatternId = m_pattern.m_numSubpatterns + 1; |
| 443 | if (capture) |
| 444 | m_pattern.m_numSubpatterns++; |
| 445 | |
| 446 | PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); |
| 447 | m_pattern.m_disjunctions.append(val: parenthesesDisjunction); |
| 448 | m_alternative->m_terms.append(val: PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture)); |
| 449 | m_alternative = parenthesesDisjunction->addNewAlternative(); |
| 450 | } |
| 451 | |
| 452 | void atomParentheticalAssertionBegin(bool invert = false) |
| 453 | { |
| 454 | PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); |
| 455 | m_pattern.m_disjunctions.append(val: parenthesesDisjunction); |
| 456 | m_alternative->m_terms.append(val: PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert)); |
| 457 | m_alternative = parenthesesDisjunction->addNewAlternative(); |
| 458 | } |
| 459 | |
| 460 | void atomParenthesesEnd() |
| 461 | { |
| 462 | ASSERT(m_alternative->m_parent); |
| 463 | ASSERT(m_alternative->m_parent->m_parent); |
| 464 | m_alternative = m_alternative->m_parent->m_parent; |
| 465 | |
| 466 | m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; |
| 467 | } |
| 468 | |
| 469 | void atomBackReference(unsigned subpatternId) |
| 470 | { |
| 471 | ASSERT(subpatternId); |
| 472 | m_pattern.m_maxBackReference = std::max(a: m_pattern.m_maxBackReference, b: subpatternId); |
| 473 | |
| 474 | if (subpatternId > m_pattern.m_numSubpatterns) { |
| 475 | m_alternative->m_terms.append(val: PatternTerm::ForwardReference()); |
| 476 | return; |
| 477 | } |
| 478 | |
| 479 | PatternAlternative* currentAlternative = m_alternative; |
| 480 | ASSERT(currentAlternative); |
| 481 | |
| 482 | // Note to self: if we waited until the AST was baked, we could also remove forwards refs |
| 483 | while ((currentAlternative = currentAlternative->m_parent->m_parent)) { |
| 484 | PatternTerm& term = currentAlternative->lastTerm(); |
| 485 | ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); |
| 486 | |
| 487 | if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId)) { |
| 488 | m_alternative->m_terms.append(val: PatternTerm::ForwardReference()); |
| 489 | return; |
| 490 | } |
| 491 | } |
| 492 | |
| 493 | m_alternative->m_terms.append(val: PatternTerm(subpatternId)); |
| 494 | } |
| 495 | |
| 496 | PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction) |
| 497 | { |
| 498 | PatternDisjunction* newDisjunction = new PatternDisjunction(); |
| 499 | |
| 500 | newDisjunction->m_parent = disjunction->m_parent; |
| 501 | for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { |
| 502 | PatternAlternative* alternative = disjunction->m_alternatives[alt]; |
| 503 | PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); |
| 504 | for (unsigned i = 0; i < alternative->m_terms.size(); ++i) |
| 505 | newAlternative->m_terms.append(val: copyTerm(term&: alternative->m_terms[i])); |
| 506 | } |
| 507 | |
| 508 | m_pattern.m_disjunctions.append(val: newDisjunction); |
| 509 | return newDisjunction; |
| 510 | } |
| 511 | |
| 512 | PatternTerm copyTerm(PatternTerm& term) |
| 513 | { |
| 514 | if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) |
| 515 | return PatternTerm(term); |
| 516 | |
| 517 | PatternTerm termCopy = term; |
| 518 | termCopy.parentheses.disjunction = copyDisjunction(disjunction: termCopy.parentheses.disjunction); |
| 519 | return termCopy; |
| 520 | } |
| 521 | |
| 522 | void quantifyAtom(unsigned min, unsigned max, bool greedy) |
| 523 | { |
| 524 | ASSERT(min <= max); |
| 525 | ASSERT(m_alternative->m_terms.size()); |
| 526 | |
| 527 | if (!max) { |
| 528 | m_alternative->removeLastTerm(); |
| 529 | return; |
| 530 | } |
| 531 | |
| 532 | PatternTerm& term = m_alternative->lastTerm(); |
| 533 | ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); |
| 534 | ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); |
| 535 | |
| 536 | // For any assertion with a zero minimum, not matching is valid and has no effect, |
| 537 | // remove it. Otherwise, we need to match as least once, but there is no point |
| 538 | // matching more than once, so remove the quantifier. It is not entirely clear |
| 539 | // from the spec whether or not this behavior is correct, but I believe this |
| 540 | // matches Firefox. :-/ |
| 541 | if (term.type == PatternTerm::TypeParentheticalAssertion) { |
| 542 | if (!min) |
| 543 | m_alternative->removeLastTerm(); |
| 544 | return; |
| 545 | } |
| 546 | |
| 547 | if (min == 0) |
| 548 | term.quantify(count: max, type: greedy ? QuantifierGreedy : QuantifierNonGreedy); |
| 549 | else if (min == max) |
| 550 | term.quantify(count: min, type: QuantifierFixedCount); |
| 551 | else { |
| 552 | term.quantify(count: min, type: QuantifierFixedCount); |
| 553 | m_alternative->m_terms.append(val: copyTerm(term)); |
| 554 | // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... |
| 555 | m_alternative->lastTerm().quantify(count: (max == UINT_MAX) ? max : max - min, type: greedy ? QuantifierGreedy : QuantifierNonGreedy); |
| 556 | if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) |
| 557 | m_alternative->lastTerm().parentheses.isCopy = true; |
| 558 | } |
| 559 | } |
| 560 | |
| 561 | void disjunction() |
| 562 | { |
| 563 | m_alternative = m_alternative->m_parent->addNewAlternative(); |
| 564 | } |
| 565 | |
| 566 | void regexBegin() |
| 567 | { |
| 568 | m_pattern.m_body = new PatternDisjunction(); |
| 569 | m_alternative = m_pattern.m_body->addNewAlternative(); |
| 570 | m_pattern.m_disjunctions.append(val: m_pattern.m_body); |
| 571 | } |
| 572 | void regexEnd() |
| 573 | { |
| 574 | } |
| 575 | void regexError() |
| 576 | { |
| 577 | } |
| 578 | |
| 579 | unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) |
| 580 | { |
| 581 | alternative->m_hasFixedSize = true; |
| 582 | unsigned currentInputPosition = initialInputPosition; |
| 583 | |
| 584 | for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { |
| 585 | PatternTerm& term = alternative->m_terms[i]; |
| 586 | |
| 587 | switch (term.type) { |
| 588 | case PatternTerm::TypeAssertionBOL: |
| 589 | case PatternTerm::TypeAssertionEOL: |
| 590 | case PatternTerm::TypeAssertionWordBoundary: |
| 591 | term.inputPosition = currentInputPosition; |
| 592 | break; |
| 593 | |
| 594 | case PatternTerm::TypeBackReference: |
| 595 | term.inputPosition = currentInputPosition; |
| 596 | term.frameLocation = currentCallFrameSize; |
| 597 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference; |
| 598 | alternative->m_hasFixedSize = false; |
| 599 | break; |
| 600 | |
| 601 | case PatternTerm::TypeForwardReference: |
| 602 | break; |
| 603 | |
| 604 | case PatternTerm::TypePatternCharacter: |
| 605 | term.inputPosition = currentInputPosition; |
| 606 | if (term.quantityType != QuantifierFixedCount) { |
| 607 | term.frameLocation = currentCallFrameSize; |
| 608 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter; |
| 609 | alternative->m_hasFixedSize = false; |
| 610 | } else |
| 611 | currentInputPosition += term.quantityCount; |
| 612 | break; |
| 613 | |
| 614 | case PatternTerm::TypeCharacterClass: |
| 615 | term.inputPosition = currentInputPosition; |
| 616 | if (term.quantityType != QuantifierFixedCount) { |
| 617 | term.frameLocation = currentCallFrameSize; |
| 618 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass; |
| 619 | alternative->m_hasFixedSize = false; |
| 620 | } else |
| 621 | currentInputPosition += term.quantityCount; |
| 622 | break; |
| 623 | |
| 624 | case PatternTerm::TypeParenthesesSubpattern: |
| 625 | // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. |
| 626 | term.frameLocation = currentCallFrameSize; |
| 627 | if ((term.quantityCount == 1) && !term.parentheses.isCopy) { |
| 628 | if (term.quantityType == QuantifierFixedCount) { |
| 629 | currentCallFrameSize = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition); |
| 630 | currentInputPosition += term.parentheses.disjunction->m_minimumSize; |
| 631 | } else { |
| 632 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce; |
| 633 | currentCallFrameSize = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition); |
| 634 | } |
| 635 | term.inputPosition = currentInputPosition; |
| 636 | } else { |
| 637 | term.inputPosition = currentInputPosition; |
| 638 | setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: 0, initialInputPosition: currentInputPosition); |
| 639 | currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses; |
| 640 | } |
| 641 | // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. |
| 642 | alternative->m_hasFixedSize = false; |
| 643 | break; |
| 644 | |
| 645 | case PatternTerm::TypeParentheticalAssertion: |
| 646 | term.inputPosition = currentInputPosition; |
| 647 | term.frameLocation = currentCallFrameSize; |
| 648 | currentCallFrameSize = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, initialInputPosition: currentInputPosition); |
| 649 | break; |
| 650 | } |
| 651 | } |
| 652 | |
| 653 | alternative->m_minimumSize = currentInputPosition - initialInputPosition; |
| 654 | return currentCallFrameSize; |
| 655 | } |
| 656 | |
| 657 | unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) |
| 658 | { |
| 659 | if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) |
| 660 | initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative; |
| 661 | |
| 662 | unsigned minimumInputSize = UINT_MAX; |
| 663 | unsigned maximumCallFrameSize = 0; |
| 664 | bool hasFixedSize = true; |
| 665 | |
| 666 | for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { |
| 667 | PatternAlternative* alternative = disjunction->m_alternatives[alt]; |
| 668 | unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, currentCallFrameSize: initialCallFrameSize, initialInputPosition); |
| 669 | minimumInputSize = min(a: minimumInputSize, b: alternative->m_minimumSize); |
| 670 | maximumCallFrameSize = max(a: maximumCallFrameSize, b: currentAlternativeCallFrameSize); |
| 671 | hasFixedSize &= alternative->m_hasFixedSize; |
| 672 | } |
| 673 | |
| 674 | ASSERT(minimumInputSize != UINT_MAX); |
| 675 | ASSERT(maximumCallFrameSize >= initialCallFrameSize); |
| 676 | |
| 677 | disjunction->m_hasFixedSize = hasFixedSize; |
| 678 | disjunction->m_minimumSize = minimumInputSize; |
| 679 | disjunction->m_callFrameSize = maximumCallFrameSize; |
| 680 | return maximumCallFrameSize; |
| 681 | } |
| 682 | |
| 683 | void setupOffsets() |
| 684 | { |
| 685 | setupDisjunctionOffsets(disjunction: m_pattern.m_body, initialCallFrameSize: 0, initialInputPosition: 0); |
| 686 | } |
| 687 | |
| 688 | private: |
| 689 | RegexPattern& m_pattern; |
| 690 | PatternAlternative* m_alternative; |
| 691 | CharacterClassConstructor m_characterClassConstructor; |
| 692 | bool m_invertCharacterClass; |
| 693 | }; |
| 694 | |
| 695 | |
| 696 | const char* compileRegex(const UString& patternString, RegexPattern& pattern) |
| 697 | { |
| 698 | RegexPatternConstructor constructor(pattern); |
| 699 | |
| 700 | if (const char* error = parse(delegate&: constructor, pattern: patternString)) |
| 701 | return error; |
| 702 | |
| 703 | // If the pattern contains illegal backreferences reset & reparse. |
| 704 | // Quoting Netscape's "What's new in JavaScript 1.2", |
| 705 | // "Note: if the number of left parentheses is less than the number specified |
| 706 | // in \#, the \# is taken as an octal escape as described in the next row." |
| 707 | if (pattern.containsIllegalBackReference()) { |
| 708 | unsigned numSubpatterns = pattern.m_numSubpatterns; |
| 709 | |
| 710 | constructor.reset(); |
| 711 | #if !ASSERT_DISABLED |
| 712 | const char* error = |
| 713 | #endif |
| 714 | parse(delegate&: constructor, pattern: patternString, backReferenceLimit: numSubpatterns); |
| 715 | |
| 716 | ASSERT(!error); |
| 717 | ASSERT(numSubpatterns == pattern.m_numSubpatterns); |
| 718 | } |
| 719 | |
| 720 | constructor.setupOffsets(); |
| 721 | |
| 722 | return NULL; |
| 723 | }; |
| 724 | |
| 725 | |
| 726 | } } |
| 727 | |
| 728 | #endif |
| 729 | |