| 1 | /* | 
| 2 |  * Copyright (C) 2009 Apple Inc. All rights reserved. | 
| 3 |  * | 
| 4 |  * Redistribution and use in source and binary forms, with or without | 
| 5 |  * modification, are permitted provided that the following conditions | 
| 6 |  * are met: | 
| 7 |  * 1. Redistributions of source code must retain the above copyright | 
| 8 |  *    notice, this list of conditions and the following disclaimer. | 
| 9 |  * 2. Redistributions in binary form must reproduce the above copyright | 
| 10 |  *    notice, this list of conditions and the following disclaimer in the | 
| 11 |  *    documentation and/or other materials provided with the distribution. | 
| 12 |  * | 
| 13 |  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | 
| 14 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
| 15 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 
| 16 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR | 
| 17 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | 
| 18 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | 
| 19 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 
| 20 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | 
| 21 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
| 22 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
| 23 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  | 
| 24 |  */ | 
| 25 |  | 
| 26 | #include "config.h" | 
| 27 | #include "RegexCompiler.h" | 
| 28 |  | 
| 29 | #include "RegexInterpreter.h" | 
| 30 | #include "RegexPattern.h" | 
| 31 | #include <wtf/Vector.h> | 
| 32 |  | 
| 33 | #if ENABLE(YARR) | 
| 34 |  | 
| 35 | using namespace WTF; | 
| 36 |  | 
| 37 | namespace JSC { namespace Yarr { | 
| 38 |  | 
| 39 | class CharacterClassConstructor { | 
| 40 | public: | 
| 41 |     CharacterClassConstructor(bool isCaseInsensitive = false) | 
| 42 |         : m_isCaseInsensitive(isCaseInsensitive) | 
| 43 |     { | 
| 44 |     } | 
| 45 |      | 
| 46 |     void reset() | 
| 47 |     { | 
| 48 |         m_matches.clear(); | 
| 49 |         m_ranges.clear(); | 
| 50 |         m_matchesUnicode.clear(); | 
| 51 |         m_rangesUnicode.clear(); | 
| 52 |     } | 
| 53 |  | 
| 54 |     void append(const CharacterClass* other) | 
| 55 |     { | 
| 56 |         for (size_t i = 0; i < other->m_matches.size(); ++i) | 
| 57 |             addSorted(matches&: m_matches, ch: other->m_matches[i]); | 
| 58 |         for (size_t i = 0; i < other->m_ranges.size(); ++i) | 
| 59 |             addSortedRange(ranges&: m_ranges, lo: other->m_ranges[i].begin, hi: other->m_ranges[i].end); | 
| 60 |         for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) | 
| 61 |             addSorted(matches&: m_matchesUnicode, ch: other->m_matchesUnicode[i]); | 
| 62 |         for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) | 
| 63 |             addSortedRange(ranges&: m_rangesUnicode, lo: other->m_rangesUnicode[i].begin, hi: other->m_rangesUnicode[i].end); | 
| 64 |     } | 
| 65 |  | 
| 66 |     void putChar(UChar ch) | 
| 67 |     { | 
| 68 |         if (ch <= 0x7f) { | 
| 69 |             if (m_isCaseInsensitive && isASCIIAlpha(c: ch)) { | 
| 70 |                 addSorted(matches&: m_matches, ch: toASCIIUpper(c: ch)); | 
| 71 |                 addSorted(matches&: m_matches, ch: toASCIILower(c: ch)); | 
| 72 |             } else | 
| 73 |                 addSorted(matches&: m_matches, ch); | 
| 74 |         } else { | 
| 75 |             UChar upper, lower; | 
| 76 |             if (m_isCaseInsensitive && ((upper = Unicode::toUpper(ch)) != (lower = Unicode::toLower(ch)))) { | 
| 77 |                 addSorted(matches&: m_matchesUnicode, ch: upper); | 
| 78 |                 addSorted(matches&: m_matchesUnicode, ch: lower); | 
| 79 |             } else | 
| 80 |                 addSorted(matches&: m_matchesUnicode, ch); | 
| 81 |         } | 
| 82 |     } | 
| 83 |  | 
| 84 |     // returns true if this character has another case, and 'ch' is the upper case form. | 
| 85 |     static inline bool isUnicodeUpper(UChar ch) | 
| 86 |     { | 
| 87 |         return ch != Unicode::toLower(ch); | 
| 88 |     } | 
| 89 |  | 
| 90 |     // returns true if this character has another case, and 'ch' is the lower case form. | 
| 91 |     static inline bool isUnicodeLower(UChar ch) | 
| 92 |     { | 
| 93 |         return ch != Unicode::toUpper(ch); | 
| 94 |     } | 
| 95 |  | 
| 96 |     void putRange(UChar lo, UChar hi) | 
| 97 |     { | 
| 98 |         if (lo <= 0x7f) { | 
| 99 |             char asciiLo = lo; | 
| 100 |             char asciiHi = std::min(a: hi, b: (UChar)0x7f); | 
| 101 |             addSortedRange(ranges&: m_ranges, lo, hi: asciiHi); | 
| 102 |              | 
| 103 |             if (m_isCaseInsensitive) { | 
| 104 |                 if ((asciiLo <= 'Z') && (asciiHi >= 'A')) | 
| 105 |                     addSortedRange(ranges&: m_ranges, lo: std::max(a: asciiLo, b: 'A')+('a'-'A'), hi: std::min(a: asciiHi, b: 'Z')+('a'-'A')); | 
| 106 |                 if ((asciiLo <= 'z') && (asciiHi >= 'a')) | 
| 107 |                     addSortedRange(ranges&: m_ranges, lo: std::max(a: asciiLo, b: 'a')+('A'-'a'), hi: std::min(a: asciiHi, b: 'z')+('A'-'a')); | 
| 108 |             } | 
| 109 |         } | 
| 110 |         if (hi >= 0x80) { | 
| 111 |             uint32_t unicodeCurr = std::max(a: lo, b: (UChar)0x80); | 
| 112 |             addSortedRange(ranges&: m_rangesUnicode, lo: unicodeCurr, hi); | 
| 113 |              | 
| 114 |             if (m_isCaseInsensitive) { | 
| 115 |                 while (unicodeCurr <= hi) { | 
| 116 |                     // If the upper bound of the range (hi) is 0xffff, the increments to | 
| 117 |                     // unicodeCurr in this loop may take it to 0x10000.  This is fine | 
| 118 |                     // (if so we won't re-enter the loop, since the loop condition above | 
| 119 |                     // will definitely fail) - but this does mean we cannot use a UChar | 
| 120 |                     // to represent unicodeCurr, we must use a 32-bit value instead. | 
| 121 |                     ASSERT(unicodeCurr <= 0xffff); | 
| 122 |  | 
| 123 |                     if (isUnicodeUpper(ch: unicodeCurr)) { | 
| 124 |                         UChar lowerCaseRangeBegin = Unicode::toLower(ch: unicodeCurr); | 
| 125 |                         UChar lowerCaseRangeEnd = lowerCaseRangeBegin; | 
| 126 |                         while ((++unicodeCurr <= hi) && isUnicodeUpper(ch: unicodeCurr) && (Unicode::toLower(ch: unicodeCurr) == (lowerCaseRangeEnd + 1))) | 
| 127 |                             lowerCaseRangeEnd++; | 
| 128 |                         addSortedRange(ranges&: m_rangesUnicode, lo: lowerCaseRangeBegin, hi: lowerCaseRangeEnd); | 
| 129 |                     } else if (isUnicodeLower(ch: unicodeCurr)) { | 
| 130 |                         UChar upperCaseRangeBegin = Unicode::toUpper(ch: unicodeCurr); | 
| 131 |                         UChar upperCaseRangeEnd = upperCaseRangeBegin; | 
| 132 |                         while ((++unicodeCurr <= hi) && isUnicodeLower(ch: unicodeCurr) && (Unicode::toUpper(ch: unicodeCurr) == (upperCaseRangeEnd + 1))) | 
| 133 |                             upperCaseRangeEnd++; | 
| 134 |                         addSortedRange(ranges&: m_rangesUnicode, lo: upperCaseRangeBegin, hi: upperCaseRangeEnd); | 
| 135 |                     } else | 
| 136 |                         ++unicodeCurr; | 
| 137 |                 } | 
| 138 |             } | 
| 139 |         } | 
| 140 |     } | 
| 141 |  | 
| 142 |     CharacterClass* charClass() | 
| 143 |     { | 
| 144 |         CharacterClass* characterClass = new CharacterClass(); | 
| 145 |  | 
| 146 |         characterClass->m_matches.append(val: m_matches); | 
| 147 |         characterClass->m_ranges.append(val: m_ranges); | 
| 148 |         characterClass->m_matchesUnicode.append(val: m_matchesUnicode); | 
| 149 |         characterClass->m_rangesUnicode.append(val: m_rangesUnicode); | 
| 150 |  | 
| 151 |         reset(); | 
| 152 |  | 
| 153 |         return characterClass; | 
| 154 |     } | 
| 155 |  | 
| 156 | private: | 
| 157 |     void addSorted(Vector<UChar>& matches, UChar ch) | 
| 158 |     { | 
| 159 |         unsigned pos = 0; | 
| 160 |         unsigned range = matches.size(); | 
| 161 |  | 
| 162 |         // binary chop, find position to insert char. | 
| 163 |         while (range) { | 
| 164 |             unsigned index = range >> 1; | 
| 165 |  | 
| 166 |             int val = matches[pos+index] - ch; | 
| 167 |             if (!val) | 
| 168 |                 return; | 
| 169 |             else if (val > 0) | 
| 170 |                 range = index; | 
| 171 |             else { | 
| 172 |                 pos += (index+1); | 
| 173 |                 range -= (index+1); | 
| 174 |             } | 
| 175 |         } | 
| 176 |          | 
| 177 |         if (pos == matches.size()) | 
| 178 |             matches.append(val: ch); | 
| 179 |         else | 
| 180 |             matches.insert(position: pos, val: ch); | 
| 181 |     } | 
| 182 |  | 
| 183 |     void addSortedRange(Vector<CharacterRange>& ranges, UChar lo, UChar hi) | 
| 184 |     { | 
| 185 |         unsigned end = ranges.size(); | 
| 186 |          | 
| 187 |         // Simple linear scan - I doubt there are that many ranges anyway... | 
| 188 |         // feel free to fix this with something faster (eg binary chop). | 
| 189 |         for (unsigned i = 0; i < end; ++i) { | 
| 190 |             // does the new range fall before the current position in the array | 
| 191 |             if (hi < ranges[i].begin) { | 
| 192 |                 // optional optimization: concatenate appending ranges? - may not be worthwhile. | 
| 193 |                 if (hi == (ranges[i].begin - 1)) { | 
| 194 |                     ranges[i].begin = lo; | 
| 195 |                     return; | 
| 196 |                 } | 
| 197 |                 ranges.insert(position: i, val: CharacterRange(lo, hi)); | 
| 198 |                 return; | 
| 199 |             } | 
| 200 |             // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the begining | 
| 201 |             // If the new range start at or before the end of the last range, then the overlap (if it starts one after the | 
| 202 |             // end of the last range they concatenate, which is just as good. | 
| 203 |             if (lo <= (ranges[i].end + 1)) { | 
| 204 |                 // found an intersect! we'll replace this entry in the array. | 
| 205 |                 ranges[i].begin = std::min(a: ranges[i].begin, b: lo); | 
| 206 |                 ranges[i].end = std::max(a: ranges[i].end, b: hi); | 
| 207 |  | 
| 208 |                 // now check if the new range can subsume any subsequent ranges. | 
| 209 |                 unsigned next = i+1; | 
| 210 |                 // each iteration of the loop we will either remove something from the list, or break the loop. | 
| 211 |                 while (next < ranges.size()) { | 
| 212 |                     if (ranges[next].begin <= (ranges[i].end + 1)) { | 
| 213 |                         // the next entry now overlaps / concatenates this one. | 
| 214 |                         ranges[i].end = std::max(a: ranges[i].end, b: ranges[next].end); | 
| 215 |                         ranges.remove(position: next); | 
| 216 |                     } else | 
| 217 |                         break; | 
| 218 |                 } | 
| 219 |                  | 
| 220 |                 return; | 
| 221 |             } | 
| 222 |         } | 
| 223 |  | 
| 224 |         // CharacterRange comes after all existing ranges. | 
| 225 |         ranges.append(val: CharacterRange(lo, hi)); | 
| 226 |     } | 
| 227 |  | 
| 228 |     bool m_isCaseInsensitive; | 
| 229 |  | 
| 230 |     Vector<UChar> m_matches; | 
| 231 |     Vector<CharacterRange> m_ranges; | 
| 232 |     Vector<UChar> m_matchesUnicode; | 
| 233 |     Vector<CharacterRange> m_rangesUnicode; | 
| 234 | }; | 
| 235 |  | 
| 236 |  | 
| 237 | CharacterClass* newlineCreate() | 
| 238 | { | 
| 239 |     CharacterClass* characterClass = new CharacterClass(); | 
| 240 |  | 
| 241 |     characterClass->m_matches.append(val: '\n'); | 
| 242 |     characterClass->m_matches.append(val: '\r'); | 
| 243 |     characterClass->m_matchesUnicode.append(val: 0x2028); | 
| 244 |     characterClass->m_matchesUnicode.append(val: 0x2029); | 
| 245 |      | 
| 246 |     return characterClass; | 
| 247 | } | 
| 248 |  | 
| 249 | CharacterClass* digitsCreate() | 
| 250 | { | 
| 251 |     CharacterClass* characterClass = new CharacterClass(); | 
| 252 |  | 
| 253 |     characterClass->m_ranges.append(val: CharacterRange('0', '9')); | 
| 254 |      | 
| 255 |     return characterClass; | 
| 256 | } | 
| 257 |  | 
| 258 | CharacterClass* spacesCreate() | 
| 259 | { | 
| 260 |     CharacterClass* characterClass = new CharacterClass(); | 
| 261 |  | 
| 262 |     characterClass->m_matches.append(val: ' '); | 
| 263 |     characterClass->m_ranges.append(val: CharacterRange('\t', '\r')); | 
| 264 |     characterClass->m_matchesUnicode.append(val: 0x00a0); | 
| 265 |     characterClass->m_matchesUnicode.append(val: 0x1680); | 
| 266 |     characterClass->m_matchesUnicode.append(val: 0x180e); | 
| 267 |     characterClass->m_matchesUnicode.append(val: 0x2028); | 
| 268 |     characterClass->m_matchesUnicode.append(val: 0x2029); | 
| 269 |     characterClass->m_matchesUnicode.append(val: 0x202f); | 
| 270 |     characterClass->m_matchesUnicode.append(val: 0x205f); | 
| 271 |     characterClass->m_matchesUnicode.append(val: 0x3000); | 
| 272 |     characterClass->m_rangesUnicode.append(val: CharacterRange(0x2000, 0x200a)); | 
| 273 |      | 
| 274 |     return characterClass; | 
| 275 | } | 
| 276 |  | 
| 277 | CharacterClass* wordcharCreate() | 
| 278 | { | 
| 279 |     CharacterClass* characterClass = new CharacterClass(); | 
| 280 |  | 
| 281 |     characterClass->m_matches.append(val: '_'); | 
| 282 |     characterClass->m_ranges.append(val: CharacterRange('0', '9')); | 
| 283 |     characterClass->m_ranges.append(val: CharacterRange('A', 'Z')); | 
| 284 |     characterClass->m_ranges.append(val: CharacterRange('a', 'z')); | 
| 285 |      | 
| 286 |     return characterClass; | 
| 287 | } | 
| 288 |  | 
| 289 | CharacterClass* nondigitsCreate() | 
| 290 | { | 
| 291 |     CharacterClass* characterClass = new CharacterClass(); | 
| 292 |  | 
| 293 |     characterClass->m_ranges.append(val: CharacterRange(0, '0' - 1)); | 
| 294 |     characterClass->m_ranges.append(val: CharacterRange('9' + 1, 0x7f)); | 
| 295 |     characterClass->m_rangesUnicode.append(val: CharacterRange(0x80, 0xffff)); | 
| 296 |      | 
| 297 |     return characterClass; | 
| 298 | } | 
| 299 |  | 
| 300 | CharacterClass* nonspacesCreate() | 
| 301 | { | 
| 302 |     CharacterClass* characterClass = new CharacterClass(); | 
| 303 |  | 
| 304 |     characterClass->m_ranges.append(val: CharacterRange(0, '\t' - 1)); | 
| 305 |     characterClass->m_ranges.append(val: CharacterRange('\r' + 1, ' ' - 1)); | 
| 306 |     characterClass->m_ranges.append(val: CharacterRange(' ' + 1, 0x7f)); | 
| 307 |     characterClass->m_rangesUnicode.append(val: CharacterRange(0x0080, 0x009f)); | 
| 308 |     characterClass->m_rangesUnicode.append(val: CharacterRange(0x00a1, 0x167f)); | 
| 309 |     characterClass->m_rangesUnicode.append(val: CharacterRange(0x1681, 0x180d)); | 
| 310 |     characterClass->m_rangesUnicode.append(val: CharacterRange(0x180f, 0x1fff)); | 
| 311 |     characterClass->m_rangesUnicode.append(val: CharacterRange(0x200b, 0x2027)); | 
| 312 |     characterClass->m_rangesUnicode.append(val: CharacterRange(0x202a, 0x202e)); | 
| 313 |     characterClass->m_rangesUnicode.append(val: CharacterRange(0x2030, 0x205e)); | 
| 314 |     characterClass->m_rangesUnicode.append(val: CharacterRange(0x2060, 0x2fff)); | 
| 315 |     characterClass->m_rangesUnicode.append(val: CharacterRange(0x3001, 0xffff)); | 
| 316 |      | 
| 317 |     return characterClass; | 
| 318 | } | 
| 319 |  | 
| 320 | CharacterClass* nonwordcharCreate() | 
| 321 | { | 
| 322 |     CharacterClass* characterClass = new CharacterClass(); | 
| 323 |  | 
| 324 |     characterClass->m_matches.append(val: '`'); | 
| 325 |     characterClass->m_ranges.append(val: CharacterRange(0, '0' - 1)); | 
| 326 |     characterClass->m_ranges.append(val: CharacterRange('9' + 1, 'A' - 1)); | 
| 327 |     characterClass->m_ranges.append(val: CharacterRange('Z' + 1, '_' - 1)); | 
| 328 |     characterClass->m_ranges.append(val: CharacterRange('z' + 1, 0x7f)); | 
| 329 |     characterClass->m_rangesUnicode.append(val: CharacterRange(0x80, 0xffff)); | 
| 330 |  | 
| 331 |     return characterClass; | 
| 332 | } | 
| 333 |  | 
| 334 |  | 
| 335 | class RegexPatternConstructor { | 
| 336 | public: | 
| 337 |     RegexPatternConstructor(RegexPattern& pattern) | 
| 338 |         : m_pattern(pattern) | 
| 339 |         , m_characterClassConstructor(pattern.m_ignoreCase) | 
| 340 |     { | 
| 341 |     } | 
| 342 |  | 
| 343 |     ~RegexPatternConstructor() | 
| 344 |     { | 
| 345 |     } | 
| 346 |  | 
| 347 |     void reset() | 
| 348 |     { | 
| 349 |         m_pattern.reset(); | 
| 350 |         m_characterClassConstructor.reset(); | 
| 351 |     } | 
| 352 |      | 
| 353 |     void assertionBOL() | 
| 354 |     { | 
| 355 |         m_alternative->m_terms.append(val: PatternTerm::BOL()); | 
| 356 |     } | 
| 357 |     void assertionEOL() | 
| 358 |     { | 
| 359 |         m_alternative->m_terms.append(val: PatternTerm::EOL()); | 
| 360 |     } | 
| 361 |     void assertionWordBoundary(bool invert) | 
| 362 |     { | 
| 363 |         m_alternative->m_terms.append(val: PatternTerm::WordBoundary(invert)); | 
| 364 |     } | 
| 365 |  | 
| 366 |     void atomPatternCharacter(UChar ch) | 
| 367 |     { | 
| 368 |         // We handle case-insensitive checking of unicode characters which do have both | 
| 369 |         // cases by handling them as if they were defined using a CharacterClass. | 
| 370 |         if (m_pattern.m_ignoreCase && !isASCII(c: ch) && (Unicode::toUpper(ch) != Unicode::toLower(ch))) { | 
| 371 |             atomCharacterClassBegin(); | 
| 372 |             atomCharacterClassAtom(ch); | 
| 373 |             atomCharacterClassEnd(); | 
| 374 |         } else | 
| 375 |             m_alternative->m_terms.append(val: PatternTerm(ch)); | 
| 376 |     } | 
| 377 |  | 
| 378 |     void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) | 
| 379 |     { | 
| 380 |         switch (classID) { | 
| 381 |         case DigitClassID: | 
| 382 |             m_alternative->m_terms.append(val: PatternTerm(m_pattern.digitsCharacterClass(), invert)); | 
| 383 |             break; | 
| 384 |         case SpaceClassID: | 
| 385 |             m_alternative->m_terms.append(val: PatternTerm(m_pattern.spacesCharacterClass(), invert)); | 
| 386 |             break; | 
| 387 |         case WordClassID: | 
| 388 |             m_alternative->m_terms.append(val: PatternTerm(m_pattern.wordcharCharacterClass(), invert)); | 
| 389 |             break; | 
| 390 |         case NewlineClassID: | 
| 391 |             m_alternative->m_terms.append(val: PatternTerm(m_pattern.newlineCharacterClass(), invert)); | 
| 392 |             break; | 
| 393 |         } | 
| 394 |     } | 
| 395 |  | 
| 396 |     void atomCharacterClassBegin(bool invert = false) | 
| 397 |     { | 
| 398 |         m_invertCharacterClass = invert; | 
| 399 |     } | 
| 400 |  | 
| 401 |     void atomCharacterClassAtom(UChar ch) | 
| 402 |     { | 
| 403 |         m_characterClassConstructor.putChar(ch); | 
| 404 |     } | 
| 405 |  | 
| 406 |     void atomCharacterClassRange(UChar begin, UChar end) | 
| 407 |     { | 
| 408 |         m_characterClassConstructor.putRange(lo: begin, hi: end); | 
| 409 |     } | 
| 410 |  | 
| 411 |     void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) | 
| 412 |     { | 
| 413 |         ASSERT(classID != NewlineClassID); | 
| 414 |  | 
| 415 |         switch (classID) { | 
| 416 |         case DigitClassID: | 
| 417 |             m_characterClassConstructor.append(other: invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); | 
| 418 |             break; | 
| 419 |          | 
| 420 |         case SpaceClassID: | 
| 421 |             m_characterClassConstructor.append(other: invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); | 
| 422 |             break; | 
| 423 |          | 
| 424 |         case WordClassID: | 
| 425 |             m_characterClassConstructor.append(other: invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); | 
| 426 |             break; | 
| 427 |          | 
| 428 |         default: | 
| 429 |             ASSERT_NOT_REACHED(); | 
| 430 |         } | 
| 431 |     } | 
| 432 |  | 
| 433 |     void atomCharacterClassEnd() | 
| 434 |     { | 
| 435 |         CharacterClass* newCharacterClass = m_characterClassConstructor.charClass(); | 
| 436 |         m_pattern.m_userCharacterClasses.append(val: newCharacterClass); | 
| 437 |         m_alternative->m_terms.append(val: PatternTerm(newCharacterClass, m_invertCharacterClass)); | 
| 438 |     } | 
| 439 |  | 
| 440 |     void atomParenthesesSubpatternBegin(bool capture = true) | 
| 441 |     { | 
| 442 |         unsigned subpatternId = m_pattern.m_numSubpatterns + 1; | 
| 443 |         if (capture) | 
| 444 |             m_pattern.m_numSubpatterns++; | 
| 445 |  | 
| 446 |         PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); | 
| 447 |         m_pattern.m_disjunctions.append(val: parenthesesDisjunction); | 
| 448 |         m_alternative->m_terms.append(val: PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction, capture)); | 
| 449 |         m_alternative = parenthesesDisjunction->addNewAlternative(); | 
| 450 |     } | 
| 451 |  | 
| 452 |     void atomParentheticalAssertionBegin(bool invert = false) | 
| 453 |     { | 
| 454 |         PatternDisjunction* parenthesesDisjunction = new PatternDisjunction(m_alternative); | 
| 455 |         m_pattern.m_disjunctions.append(val: parenthesesDisjunction); | 
| 456 |         m_alternative->m_terms.append(val: PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction, invert)); | 
| 457 |         m_alternative = parenthesesDisjunction->addNewAlternative(); | 
| 458 |     } | 
| 459 |  | 
| 460 |     void atomParenthesesEnd() | 
| 461 |     { | 
| 462 |         ASSERT(m_alternative->m_parent); | 
| 463 |         ASSERT(m_alternative->m_parent->m_parent); | 
| 464 |         m_alternative = m_alternative->m_parent->m_parent; | 
| 465 |          | 
| 466 |         m_alternative->lastTerm().parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; | 
| 467 |     } | 
| 468 |  | 
| 469 |     void atomBackReference(unsigned subpatternId) | 
| 470 |     { | 
| 471 |         ASSERT(subpatternId); | 
| 472 |         m_pattern.m_maxBackReference = std::max(a: m_pattern.m_maxBackReference, b: subpatternId); | 
| 473 |  | 
| 474 |         if (subpatternId > m_pattern.m_numSubpatterns) { | 
| 475 |             m_alternative->m_terms.append(val: PatternTerm::ForwardReference()); | 
| 476 |             return; | 
| 477 |         } | 
| 478 |  | 
| 479 |         PatternAlternative* currentAlternative = m_alternative; | 
| 480 |         ASSERT(currentAlternative); | 
| 481 |  | 
| 482 |         // Note to self: if we waited until the AST was baked, we could also remove forwards refs  | 
| 483 |         while ((currentAlternative = currentAlternative->m_parent->m_parent)) { | 
| 484 |             PatternTerm& term = currentAlternative->lastTerm(); | 
| 485 |             ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); | 
| 486 |  | 
| 487 |             if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.invertOrCapture && (subpatternId == term.subpatternId)) { | 
| 488 |                 m_alternative->m_terms.append(val: PatternTerm::ForwardReference()); | 
| 489 |                 return; | 
| 490 |             } | 
| 491 |         } | 
| 492 |  | 
| 493 |         m_alternative->m_terms.append(val: PatternTerm(subpatternId)); | 
| 494 |     } | 
| 495 |  | 
| 496 |     PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction) | 
| 497 |     { | 
| 498 |         PatternDisjunction* newDisjunction = new PatternDisjunction(); | 
| 499 |  | 
| 500 |         newDisjunction->m_parent = disjunction->m_parent; | 
| 501 |         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { | 
| 502 |             PatternAlternative* alternative = disjunction->m_alternatives[alt]; | 
| 503 |             PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); | 
| 504 |             for (unsigned i = 0; i < alternative->m_terms.size(); ++i) | 
| 505 |                 newAlternative->m_terms.append(val: copyTerm(term&: alternative->m_terms[i])); | 
| 506 |         } | 
| 507 |  | 
| 508 |         m_pattern.m_disjunctions.append(val: newDisjunction); | 
| 509 |         return newDisjunction; | 
| 510 |     } | 
| 511 |  | 
| 512 |     PatternTerm copyTerm(PatternTerm& term) | 
| 513 |     { | 
| 514 |         if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) | 
| 515 |             return PatternTerm(term); | 
| 516 |  | 
| 517 |         PatternTerm termCopy = term; | 
| 518 |         termCopy.parentheses.disjunction = copyDisjunction(disjunction: termCopy.parentheses.disjunction); | 
| 519 |         return termCopy; | 
| 520 |     } | 
| 521 |  | 
| 522 |     void quantifyAtom(unsigned min, unsigned max, bool greedy) | 
| 523 |     { | 
| 524 |         ASSERT(min <= max); | 
| 525 |         ASSERT(m_alternative->m_terms.size()); | 
| 526 |  | 
| 527 |         if (!max) { | 
| 528 |             m_alternative->removeLastTerm(); | 
| 529 |             return; | 
| 530 |         } | 
| 531 |  | 
| 532 |         PatternTerm& term = m_alternative->lastTerm(); | 
| 533 |         ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); | 
| 534 |         ASSERT((term.quantityCount == 1) && (term.quantityType == QuantifierFixedCount)); | 
| 535 |  | 
| 536 |         // For any assertion with a zero minimum, not matching is valid and has no effect, | 
| 537 |         // remove it.  Otherwise, we need to match as least once, but there is no point | 
| 538 |         // matching more than once, so remove the quantifier.  It is not entirely clear | 
| 539 |         // from the spec whether or not this behavior is correct, but I believe this | 
| 540 |         // matches Firefox. :-/ | 
| 541 |         if (term.type == PatternTerm::TypeParentheticalAssertion) { | 
| 542 |             if (!min) | 
| 543 |                 m_alternative->removeLastTerm(); | 
| 544 |             return; | 
| 545 |         } | 
| 546 |  | 
| 547 |         if (min == 0) | 
| 548 |             term.quantify(count: max, type: greedy   ? QuantifierGreedy : QuantifierNonGreedy); | 
| 549 |         else if (min == max) | 
| 550 |             term.quantify(count: min, type: QuantifierFixedCount); | 
| 551 |         else { | 
| 552 |             term.quantify(count: min, type: QuantifierFixedCount); | 
| 553 |             m_alternative->m_terms.append(val: copyTerm(term)); | 
| 554 |             // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... | 
| 555 |             m_alternative->lastTerm().quantify(count: (max == UINT_MAX) ? max : max - min, type: greedy ? QuantifierGreedy : QuantifierNonGreedy); | 
| 556 |             if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) | 
| 557 |                 m_alternative->lastTerm().parentheses.isCopy = true; | 
| 558 |         } | 
| 559 |     } | 
| 560 |  | 
| 561 |     void disjunction() | 
| 562 |     { | 
| 563 |         m_alternative = m_alternative->m_parent->addNewAlternative(); | 
| 564 |     } | 
| 565 |  | 
| 566 |     void regexBegin() | 
| 567 |     { | 
| 568 |         m_pattern.m_body = new PatternDisjunction(); | 
| 569 |         m_alternative = m_pattern.m_body->addNewAlternative(); | 
| 570 |         m_pattern.m_disjunctions.append(val: m_pattern.m_body); | 
| 571 |     } | 
| 572 |     void regexEnd() | 
| 573 |     { | 
| 574 |     } | 
| 575 |     void regexError() | 
| 576 |     { | 
| 577 |     } | 
| 578 |  | 
| 579 |     unsigned setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition) | 
| 580 |     { | 
| 581 |         alternative->m_hasFixedSize = true; | 
| 582 |         unsigned currentInputPosition = initialInputPosition; | 
| 583 |  | 
| 584 |         for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { | 
| 585 |             PatternTerm& term = alternative->m_terms[i]; | 
| 586 |  | 
| 587 |             switch (term.type) { | 
| 588 |             case PatternTerm::TypeAssertionBOL: | 
| 589 |             case PatternTerm::TypeAssertionEOL: | 
| 590 |             case PatternTerm::TypeAssertionWordBoundary: | 
| 591 |                 term.inputPosition = currentInputPosition; | 
| 592 |                 break; | 
| 593 |  | 
| 594 |             case PatternTerm::TypeBackReference: | 
| 595 |                 term.inputPosition = currentInputPosition; | 
| 596 |                 term.frameLocation = currentCallFrameSize; | 
| 597 |                 currentCallFrameSize += RegexStackSpaceForBackTrackInfoBackReference; | 
| 598 |                 alternative->m_hasFixedSize = false; | 
| 599 |                 break; | 
| 600 |  | 
| 601 |             case PatternTerm::TypeForwardReference: | 
| 602 |                 break; | 
| 603 |  | 
| 604 |             case PatternTerm::TypePatternCharacter: | 
| 605 |                 term.inputPosition = currentInputPosition; | 
| 606 |                 if (term.quantityType != QuantifierFixedCount) { | 
| 607 |                     term.frameLocation = currentCallFrameSize; | 
| 608 |                     currentCallFrameSize += RegexStackSpaceForBackTrackInfoPatternCharacter; | 
| 609 |                     alternative->m_hasFixedSize = false; | 
| 610 |                 } else | 
| 611 |                     currentInputPosition += term.quantityCount; | 
| 612 |                 break; | 
| 613 |  | 
| 614 |             case PatternTerm::TypeCharacterClass: | 
| 615 |                 term.inputPosition = currentInputPosition; | 
| 616 |                 if (term.quantityType != QuantifierFixedCount) { | 
| 617 |                     term.frameLocation = currentCallFrameSize; | 
| 618 |                     currentCallFrameSize += RegexStackSpaceForBackTrackInfoCharacterClass; | 
| 619 |                     alternative->m_hasFixedSize = false; | 
| 620 |                 } else | 
| 621 |                     currentInputPosition += term.quantityCount; | 
| 622 |                 break; | 
| 623 |  | 
| 624 |             case PatternTerm::TypeParenthesesSubpattern: | 
| 625 |                 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. | 
| 626 |                 term.frameLocation = currentCallFrameSize; | 
| 627 |                 if ((term.quantityCount == 1) && !term.parentheses.isCopy) { | 
| 628 |                     if (term.quantityType == QuantifierFixedCount) { | 
| 629 |                         currentCallFrameSize = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition); | 
| 630 |                         currentInputPosition += term.parentheses.disjunction->m_minimumSize; | 
| 631 |                     } else { | 
| 632 |                         currentCallFrameSize += RegexStackSpaceForBackTrackInfoParenthesesOnce; | 
| 633 |                         currentCallFrameSize = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition); | 
| 634 |                     } | 
| 635 |                     term.inputPosition = currentInputPosition; | 
| 636 |                 } else { | 
| 637 |                     term.inputPosition = currentInputPosition; | 
| 638 |                     setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: 0, initialInputPosition: currentInputPosition); | 
| 639 |                     currentCallFrameSize += RegexStackSpaceForBackTrackInfoParentheses; | 
| 640 |                 } | 
| 641 |                 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. | 
| 642 |                 alternative->m_hasFixedSize = false; | 
| 643 |                 break; | 
| 644 |  | 
| 645 |             case PatternTerm::TypeParentheticalAssertion: | 
| 646 |                 term.inputPosition = currentInputPosition; | 
| 647 |                 term.frameLocation = currentCallFrameSize; | 
| 648 |                 currentCallFrameSize = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize + RegexStackSpaceForBackTrackInfoParentheticalAssertion, initialInputPosition: currentInputPosition); | 
| 649 |                 break; | 
| 650 |             } | 
| 651 |         } | 
| 652 |  | 
| 653 |         alternative->m_minimumSize = currentInputPosition - initialInputPosition; | 
| 654 |         return currentCallFrameSize; | 
| 655 |     } | 
| 656 |  | 
| 657 |     unsigned setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition) | 
| 658 |     { | 
| 659 |         if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) | 
| 660 |             initialCallFrameSize += RegexStackSpaceForBackTrackInfoAlternative; | 
| 661 |  | 
| 662 |         unsigned minimumInputSize = UINT_MAX; | 
| 663 |         unsigned maximumCallFrameSize = 0; | 
| 664 |         bool hasFixedSize = true; | 
| 665 |  | 
| 666 |         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { | 
| 667 |             PatternAlternative* alternative = disjunction->m_alternatives[alt]; | 
| 668 |             unsigned currentAlternativeCallFrameSize = setupAlternativeOffsets(alternative, currentCallFrameSize: initialCallFrameSize, initialInputPosition); | 
| 669 |             minimumInputSize = min(a: minimumInputSize, b: alternative->m_minimumSize); | 
| 670 |             maximumCallFrameSize = max(a: maximumCallFrameSize, b: currentAlternativeCallFrameSize); | 
| 671 |             hasFixedSize &= alternative->m_hasFixedSize; | 
| 672 |         } | 
| 673 |          | 
| 674 |         ASSERT(minimumInputSize != UINT_MAX); | 
| 675 |         ASSERT(maximumCallFrameSize >= initialCallFrameSize); | 
| 676 |  | 
| 677 |         disjunction->m_hasFixedSize = hasFixedSize; | 
| 678 |         disjunction->m_minimumSize = minimumInputSize; | 
| 679 |         disjunction->m_callFrameSize = maximumCallFrameSize; | 
| 680 |         return maximumCallFrameSize; | 
| 681 |     } | 
| 682 |  | 
| 683 |     void setupOffsets() | 
| 684 |     { | 
| 685 |         setupDisjunctionOffsets(disjunction: m_pattern.m_body, initialCallFrameSize: 0, initialInputPosition: 0); | 
| 686 |     } | 
| 687 |  | 
| 688 | private: | 
| 689 |     RegexPattern& m_pattern; | 
| 690 |     PatternAlternative* m_alternative; | 
| 691 |     CharacterClassConstructor m_characterClassConstructor; | 
| 692 |     bool m_invertCharacterClass; | 
| 693 | }; | 
| 694 |  | 
| 695 |  | 
| 696 | const char* compileRegex(const UString& patternString, RegexPattern& pattern) | 
| 697 | { | 
| 698 |     RegexPatternConstructor constructor(pattern); | 
| 699 |  | 
| 700 |     if (const char* error = parse(delegate&: constructor, pattern: patternString)) | 
| 701 |         return error; | 
| 702 |      | 
| 703 |     // If the pattern contains illegal backreferences reset & reparse. | 
| 704 |     // Quoting Netscape's "What's new in JavaScript 1.2", | 
| 705 |     //      "Note: if the number of left parentheses is less than the number specified | 
| 706 |     //       in \#, the \# is taken as an octal escape as described in the next row." | 
| 707 |     if (pattern.containsIllegalBackReference()) { | 
| 708 |         unsigned numSubpatterns = pattern.m_numSubpatterns; | 
| 709 |  | 
| 710 |         constructor.reset(); | 
| 711 | #if !ASSERT_DISABLED | 
| 712 |         const char* error = | 
| 713 | #endif | 
| 714 |             parse(delegate&: constructor, pattern: patternString, backReferenceLimit: numSubpatterns); | 
| 715 |  | 
| 716 |         ASSERT(!error); | 
| 717 |         ASSERT(numSubpatterns == pattern.m_numSubpatterns); | 
| 718 |     } | 
| 719 |  | 
| 720 |     constructor.setupOffsets(); | 
| 721 |  | 
| 722 |     return NULL; | 
| 723 | }; | 
| 724 |  | 
| 725 |  | 
| 726 | } } | 
| 727 |  | 
| 728 | #endif | 
| 729 |  |