| 1 | /* | 
| 2 |  * Copyright (C) 2009, 2013-2016 Apple Inc. All rights reserved. | 
| 3 |  * Copyright (C) 2010 Peter Varga (pvarga@inf.u-szeged.hu), University of Szeged | 
| 4 |  * | 
| 5 |  * Redistribution and use in source and binary forms, with or without | 
| 6 |  * modification, are permitted provided that the following conditions | 
| 7 |  * are met: | 
| 8 |  * 1. Redistributions of source code must retain the above copyright | 
| 9 |  *    notice, this list of conditions and the following disclaimer. | 
| 10 |  * 2. Redistributions in binary form must reproduce the above copyright | 
| 11 |  *    notice, this list of conditions and the following disclaimer in the | 
| 12 |  *    documentation and/or other materials provided with the distribution. | 
| 13 |  * | 
| 14 |  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | 
| 15 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
| 16 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 
| 17 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR | 
| 18 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | 
| 19 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | 
| 20 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 
| 21 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | 
| 22 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
| 23 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
| 24 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  | 
| 25 |  */ | 
| 26 |  | 
| 27 | #include "config.h" | 
| 28 | #include "YarrPattern.h" | 
| 29 |  | 
| 30 | #include "Options.h" | 
| 31 | #include "Yarr.h" | 
| 32 | #include "YarrCanonicalize.h" | 
| 33 | #include "YarrParser.h" | 
| 34 | #include <wtf/DataLog.h> | 
| 35 | #include <wtf/Optional.h> | 
| 36 | #include <wtf/Vector.h> | 
| 37 | #include <wtf/text/WTFString.h> | 
| 38 |  | 
| 39 | namespace JSC { namespace Yarr { | 
| 40 |  | 
| 41 | #include "RegExpJitTables.h" | 
| 42 |  | 
| 43 | class CharacterClassConstructor { | 
| 44 | public: | 
| 45 |     CharacterClassConstructor(bool isCaseInsensitive, CanonicalMode canonicalMode) | 
| 46 |         : m_isCaseInsensitive(isCaseInsensitive) | 
| 47 |         , m_hasNonBMPCharacters(false) | 
| 48 |         , m_anyCharacter(false) | 
| 49 |         , m_canonicalMode(canonicalMode) | 
| 50 |     { | 
| 51 |     } | 
| 52 |      | 
| 53 |     void reset() | 
| 54 |     { | 
| 55 |         m_matches.clear(); | 
| 56 |         m_ranges.clear(); | 
| 57 |         m_matchesUnicode.clear(); | 
| 58 |         m_rangesUnicode.clear(); | 
| 59 |         m_hasNonBMPCharacters = false; | 
| 60 |         m_anyCharacter = false; | 
| 61 |     } | 
| 62 |  | 
| 63 |     void append(const CharacterClass* other) | 
| 64 |     { | 
| 65 |         for (size_t i = 0; i < other->m_matches.size(); ++i) | 
| 66 |             addSorted(matches&: m_matches, ch: other->m_matches[i]); | 
| 67 |         for (size_t i = 0; i < other->m_ranges.size(); ++i) | 
| 68 |             addSortedRange(ranges&: m_ranges, lo: other->m_ranges[i].begin, hi: other->m_ranges[i].end); | 
| 69 |         for (size_t i = 0; i < other->m_matchesUnicode.size(); ++i) | 
| 70 |             addSorted(matches&: m_matchesUnicode, ch: other->m_matchesUnicode[i]); | 
| 71 |         for (size_t i = 0; i < other->m_rangesUnicode.size(); ++i) | 
| 72 |             addSortedRange(ranges&: m_rangesUnicode, lo: other->m_rangesUnicode[i].begin, hi: other->m_rangesUnicode[i].end); | 
| 73 |     } | 
| 74 |  | 
| 75 |     void appendInverted(const CharacterClass* other) | 
| 76 |     { | 
| 77 |         auto addSortedInverted = [&](UChar32 min, UChar32 max, | 
| 78 |             const Vector<UChar32>& srcMatches, const Vector<CharacterRange>& srcRanges, | 
| 79 |             Vector<UChar32>& destMatches, Vector<CharacterRange>& destRanges) { | 
| 80 |  | 
| 81 |             auto addSortedMatchOrRange = [&](UChar32 lo, UChar32 hiPlusOne) { | 
| 82 |                 if (lo < hiPlusOne) { | 
| 83 |                     if (lo + 1 == hiPlusOne) | 
| 84 |                         addSorted(destMatches, lo); | 
| 85 |                     else | 
| 86 |                         addSortedRange(ranges&: destRanges, lo, hi: hiPlusOne - 1); | 
| 87 |                 } | 
| 88 |             }; | 
| 89 |  | 
| 90 |             UChar32 lo = min; | 
| 91 |             size_t matchesIndex = 0; | 
| 92 |             size_t rangesIndex = 0; | 
| 93 |             bool matchesRemaining = matchesIndex < srcMatches.size(); | 
| 94 |             bool rangesRemaining = rangesIndex < srcRanges.size(); | 
| 95 |  | 
| 96 |             if (!matchesRemaining && !rangesRemaining) { | 
| 97 |                 addSortedMatchOrRange(min, max + 1); | 
| 98 |                 return; | 
| 99 |             } | 
| 100 |  | 
| 101 |             while (matchesRemaining || rangesRemaining) { | 
| 102 |                 UChar32 hiPlusOne; | 
| 103 |                 UChar32 nextLo; | 
| 104 |  | 
| 105 |                 if (matchesRemaining | 
| 106 |                     && (!rangesRemaining || srcMatches[matchesIndex] < srcRanges[rangesIndex].begin)) { | 
| 107 |                     hiPlusOne = srcMatches[matchesIndex]; | 
| 108 |                     nextLo = hiPlusOne + 1; | 
| 109 |                     ++matchesIndex; | 
| 110 |                     matchesRemaining = matchesIndex < srcMatches.size(); | 
| 111 |                 } else { | 
| 112 |                     hiPlusOne = srcRanges[rangesIndex].begin; | 
| 113 |                     nextLo = srcRanges[rangesIndex].end + 1; | 
| 114 |                     ++rangesIndex; | 
| 115 |                     rangesRemaining = rangesIndex < srcRanges.size(); | 
| 116 |                 } | 
| 117 |  | 
| 118 |                 addSortedMatchOrRange(lo, hiPlusOne); | 
| 119 |  | 
| 120 |                 lo = nextLo; | 
| 121 |             } | 
| 122 |  | 
| 123 |             addSortedMatchOrRange(lo, max + 1); | 
| 124 |         }; | 
| 125 |  | 
| 126 |         addSortedInverted(0, 0x7f, other->m_matches, other->m_ranges, m_matches, m_ranges); | 
| 127 |         addSortedInverted(0x80, 0x10ffff, other->m_matchesUnicode, other->m_rangesUnicode, m_matchesUnicode, m_rangesUnicode); | 
| 128 |     } | 
| 129 |  | 
| 130 |     void putChar(UChar32 ch) | 
| 131 |     { | 
| 132 |         if (!m_isCaseInsensitive) { | 
| 133 |             addSorted(ch); | 
| 134 |             return; | 
| 135 |         } | 
| 136 |  | 
| 137 |         if (m_canonicalMode == CanonicalMode::UCS2 && isASCII(ch)) { | 
| 138 |             // Handle ASCII cases. | 
| 139 |             if (isASCIIAlpha(ch)) { | 
| 140 |                 addSorted(m_matches, toASCIIUpper(ch)); | 
| 141 |                 addSorted(m_matches, toASCIILower(ch)); | 
| 142 |             } else | 
| 143 |                 addSorted(matches&: m_matches, ch); | 
| 144 |             return; | 
| 145 |         } | 
| 146 |  | 
| 147 |         // Add multiple matches, if necessary. | 
| 148 |         const CanonicalizationRange* info = canonicalRangeInfoFor(ch, canonicalMode: m_canonicalMode); | 
| 149 |         if (info->type == CanonicalizeUnique) | 
| 150 |             addSorted(ch); | 
| 151 |         else | 
| 152 |             putUnicodeIgnoreCase(ch, info); | 
| 153 |     } | 
| 154 |  | 
| 155 |     void putUnicodeIgnoreCase(UChar32 ch, const CanonicalizationRange* info) | 
| 156 |     { | 
| 157 |         ASSERT(m_isCaseInsensitive); | 
| 158 |         ASSERT(ch >= info->begin && ch <= info->end); | 
| 159 |         ASSERT(info->type != CanonicalizeUnique); | 
| 160 |         if (info->type == CanonicalizeSet) { | 
| 161 |             for (const UChar32* set = canonicalCharacterSetInfo(index: info->value, canonicalMode: m_canonicalMode); (ch = *set); ++set) | 
| 162 |                 addSorted(ch); | 
| 163 |         } else { | 
| 164 |             addSorted(ch); | 
| 165 |             addSorted(ch: getCanonicalPair(info, ch)); | 
| 166 |         } | 
| 167 |     } | 
| 168 |  | 
| 169 |     void putRange(UChar32 lo, UChar32 hi) | 
| 170 |     { | 
| 171 |         if (isASCII(lo)) { | 
| 172 |             char asciiLo = lo; | 
| 173 |             char asciiHi = std::min(hi, (UChar32)0x7f); | 
| 174 |             addSortedRange(ranges&: m_ranges, lo, hi: asciiHi); | 
| 175 |              | 
| 176 |             if (m_isCaseInsensitive) { | 
| 177 |                 if ((asciiLo <= 'Z') && (asciiHi >= 'A')) | 
| 178 |                     addSortedRange(ranges&: m_ranges, lo: std::max(asciiLo, 'A')+('a'-'A'), hi: std::min(asciiHi, 'Z')+('a'-'A')); | 
| 179 |                 if ((asciiLo <= 'z') && (asciiHi >= 'a')) | 
| 180 |                     addSortedRange(ranges&: m_ranges, lo: std::max(asciiLo, 'a')+('A'-'a'), hi: std::min(asciiHi, 'z')+('A'-'a')); | 
| 181 |             } | 
| 182 |         } | 
| 183 |         if (isASCII(hi)) | 
| 184 |             return; | 
| 185 |  | 
| 186 |         lo = std::max(lo, (UChar32)0x80); | 
| 187 |         addSortedRange(ranges&: m_rangesUnicode, lo, hi); | 
| 188 |          | 
| 189 |         if (!m_isCaseInsensitive) | 
| 190 |             return; | 
| 191 |  | 
| 192 |         const CanonicalizationRange* info = canonicalRangeInfoFor(ch: lo, canonicalMode: m_canonicalMode); | 
| 193 |         while (true) { | 
| 194 |             // Handle the range [lo .. end] | 
| 195 |             UChar32 end = std::min<UChar32>(info->end, hi); | 
| 196 |  | 
| 197 |             switch (info->type) { | 
| 198 |             case CanonicalizeUnique: | 
| 199 |                 // Nothing to do - no canonical equivalents. | 
| 200 |                 break; | 
| 201 |             case CanonicalizeSet: { | 
| 202 |                 UChar ch; | 
| 203 |                 for (const UChar32* set = canonicalCharacterSetInfo(index: info->value, canonicalMode: m_canonicalMode); (ch = *set); ++set) | 
| 204 |                     addSorted(matches&: m_matchesUnicode, ch); | 
| 205 |                 break; | 
| 206 |             } | 
| 207 |             case CanonicalizeRangeLo: | 
| 208 |                 addSortedRange(ranges&: m_rangesUnicode, lo: lo + info->value, hi: end + info->value); | 
| 209 |                 break; | 
| 210 |             case CanonicalizeRangeHi: | 
| 211 |                 addSortedRange(ranges&: m_rangesUnicode, lo: lo - info->value, hi: end - info->value); | 
| 212 |                 break; | 
| 213 |             case CanonicalizeAlternatingAligned: | 
| 214 |                 // Use addSortedRange since there is likely an abutting range to combine with. | 
| 215 |                 if (lo & 1) | 
| 216 |                     addSortedRange(ranges&: m_rangesUnicode, lo: lo - 1, hi: lo - 1); | 
| 217 |                 if (!(end & 1)) | 
| 218 |                     addSortedRange(ranges&: m_rangesUnicode, lo: end + 1, hi: end + 1); | 
| 219 |                 break; | 
| 220 |             case CanonicalizeAlternatingUnaligned: | 
| 221 |                 // Use addSortedRange since there is likely an abutting range to combine with. | 
| 222 |                 if (!(lo & 1)) | 
| 223 |                     addSortedRange(ranges&: m_rangesUnicode, lo: lo - 1, hi: lo - 1); | 
| 224 |                 if (end & 1) | 
| 225 |                     addSortedRange(ranges&: m_rangesUnicode, lo: end + 1, hi: end + 1); | 
| 226 |                 break; | 
| 227 |             } | 
| 228 |  | 
| 229 |             if (hi == end) | 
| 230 |                 return; | 
| 231 |  | 
| 232 |             ++info; | 
| 233 |             lo = info->begin; | 
| 234 |         }; | 
| 235 |  | 
| 236 |     } | 
| 237 |  | 
| 238 |     std::unique_ptr<CharacterClass> charClass() | 
| 239 |     { | 
| 240 |         coalesceTables(); | 
| 241 |  | 
| 242 |         auto characterClass = std::make_unique<CharacterClass>(); | 
| 243 |  | 
| 244 |         characterClass->m_matches.swap(m_matches); | 
| 245 |         characterClass->m_ranges.swap(m_ranges); | 
| 246 |         characterClass->m_matchesUnicode.swap(m_matchesUnicode); | 
| 247 |         characterClass->m_rangesUnicode.swap(m_rangesUnicode); | 
| 248 |         characterClass->m_hasNonBMPCharacters = hasNonBMPCharacters(); | 
| 249 |         characterClass->m_anyCharacter = anyCharacter(); | 
| 250 |  | 
| 251 |         m_hasNonBMPCharacters = false; | 
| 252 |         m_anyCharacter = false; | 
| 253 |  | 
| 254 |         return characterClass; | 
| 255 |     } | 
| 256 |  | 
| 257 | private: | 
| 258 |     void addSorted(UChar32 ch) | 
| 259 |     { | 
| 260 |         addSorted(matches&: isASCII(ch) ? m_matches : m_matchesUnicode, ch); | 
| 261 |     } | 
| 262 |  | 
| 263 |     void addSorted(Vector<UChar32>& matches, UChar32 ch) | 
| 264 |     { | 
| 265 |         unsigned pos = 0; | 
| 266 |         unsigned range = matches.size(); | 
| 267 |  | 
| 268 |         if (!U_IS_BMP(ch)) | 
| 269 |             m_hasNonBMPCharacters = true; | 
| 270 |  | 
| 271 |         // binary chop, find position to insert char. | 
| 272 |         while (range) { | 
| 273 |             unsigned index = range >> 1; | 
| 274 |  | 
| 275 |             int val = matches[pos+index] - ch; | 
| 276 |             if (!val) | 
| 277 |                 return; | 
| 278 |             else if (val > 0) { | 
| 279 |                 if (val == 1) { | 
| 280 |                     UChar32 lo = ch; | 
| 281 |                     UChar32 hi = ch + 1; | 
| 282 |                     matches.remove(position: pos + index); | 
| 283 |                     if (pos + index > 0 && matches[pos + index - 1] == ch - 1) { | 
| 284 |                         lo = ch - 1; | 
| 285 |                         matches.remove(position: pos + index - 1); | 
| 286 |                     } | 
| 287 |                     addSortedRange(ranges&: isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi); | 
| 288 |                     return; | 
| 289 |                 } | 
| 290 |                 range = index; | 
| 291 |             } else { | 
| 292 |                 if (val == -1) { | 
| 293 |                     UChar32 lo = ch - 1; | 
| 294 |                     UChar32 hi = ch; | 
| 295 |                     matches.remove(position: pos + index); | 
| 296 |                     if (pos + index + 1 < matches.size() && matches[pos + index + 1] == ch + 1) { | 
| 297 |                         hi = ch + 1; | 
| 298 |                         matches.remove(position: pos + index + 1); | 
| 299 |                     } | 
| 300 |                     addSortedRange(ranges&: isASCII(ch) ? m_ranges : m_rangesUnicode, lo, hi); | 
| 301 |                     return; | 
| 302 |                 } | 
| 303 |                 pos += (index+1); | 
| 304 |                 range -= (index+1); | 
| 305 |             } | 
| 306 |         } | 
| 307 |          | 
| 308 |         if (pos == matches.size()) | 
| 309 |             matches.append(value: ch); | 
| 310 |         else | 
| 311 |             matches.insert(position: pos, value: ch); | 
| 312 |     } | 
| 313 |  | 
| 314 |     void addSortedRange(Vector<CharacterRange>& ranges, UChar32 lo, UChar32 hi) | 
| 315 |     { | 
| 316 |         size_t end = ranges.size(); | 
| 317 |  | 
| 318 |         if (!U_IS_BMP(hi)) | 
| 319 |             m_hasNonBMPCharacters = true; | 
| 320 |  | 
| 321 |         // Simple linear scan - I doubt there are that many ranges anyway... | 
| 322 |         // feel free to fix this with something faster (eg binary chop). | 
| 323 |         for (size_t i = 0; i < end; ++i) { | 
| 324 |             // does the new range fall before the current position in the array | 
| 325 |             if (hi < ranges[i].begin) { | 
| 326 |                 // Concatenate appending ranges. | 
| 327 |                 if (hi == (ranges[i].begin - 1)) { | 
| 328 |                     ranges[i].begin = lo; | 
| 329 |                     return; | 
| 330 |                 } | 
| 331 |                 ranges.insert(position: i, value: CharacterRange(lo, hi)); | 
| 332 |                 return; | 
| 333 |             } | 
| 334 |             // Okay, since we didn't hit the last case, the end of the new range is definitely at or after the beginning | 
| 335 |             // If the new range start at or before the end of the last range, then the overlap (if it starts one after the | 
| 336 |             // end of the last range they concatenate, which is just as good. | 
| 337 |             if (lo <= (ranges[i].end + 1)) { | 
| 338 |                 // found an intersect! we'll replace this entry in the array. | 
| 339 |                 ranges[i].begin = std::min(ranges[i].begin, lo); | 
| 340 |                 ranges[i].end = std::max(ranges[i].end, hi); | 
| 341 |  | 
| 342 |                 mergeRangesFrom(ranges, index: i); | 
| 343 |                 return; | 
| 344 |             } | 
| 345 |         } | 
| 346 |  | 
| 347 |         // CharacterRange comes after all existing ranges. | 
| 348 |         ranges.append(other: CharacterRange(lo, hi)); | 
| 349 |     } | 
| 350 |  | 
| 351 |     void mergeRangesFrom(Vector<CharacterRange>& ranges, size_t index) | 
| 352 |     { | 
| 353 |         size_t next = index + 1; | 
| 354 |  | 
| 355 |         // each iteration of the loop we will either remove something from the list, or break out of the loop. | 
| 356 |         while (next < ranges.size()) { | 
| 357 |             if (ranges[next].begin <= (ranges[index].end + 1)) { | 
| 358 |                 // the next entry now overlaps / concatenates with this one. | 
| 359 |                 ranges[index].end = std::max(ranges[index].end, ranges[next].end); | 
| 360 |                 ranges.remove(position: next); | 
| 361 |             } else | 
| 362 |                 break; | 
| 363 |         } | 
| 364 |  | 
| 365 |     } | 
| 366 |  | 
| 367 |     void coalesceTables() | 
| 368 |     { | 
| 369 |         auto coalesceMatchesAndRanges = [&](Vector<UChar32>& matches, Vector<CharacterRange>& ranges) { | 
| 370 |  | 
| 371 |             size_t matchesIndex = 0; | 
| 372 |             size_t rangesIndex = 0; | 
| 373 |  | 
| 374 |             while (matchesIndex < matches.size() && rangesIndex < ranges.size()) { | 
| 375 |                 while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].begin - 1) | 
| 376 |                     matchesIndex++; | 
| 377 |  | 
| 378 |                 if (matchesIndex < matches.size() && matches[matchesIndex] == ranges[rangesIndex].begin - 1) { | 
| 379 |                     ranges[rangesIndex].begin = matches[matchesIndex]; | 
| 380 |                     matches.remove(matchesIndex); | 
| 381 |                 } | 
| 382 |  | 
| 383 |                 while (matchesIndex < matches.size() && matches[matchesIndex] < ranges[rangesIndex].end + 1) | 
| 384 |                     matchesIndex++; | 
| 385 |  | 
| 386 |                 if (matchesIndex < matches.size()) { | 
| 387 |                     if (matches[matchesIndex] == ranges[rangesIndex].end + 1) { | 
| 388 |                         ranges[rangesIndex].end = matches[matchesIndex]; | 
| 389 |                         matches.remove(matchesIndex); | 
| 390 |  | 
| 391 |                         mergeRangesFrom(ranges&: ranges, index: rangesIndex); | 
| 392 |                     } else | 
| 393 |                         matchesIndex++; | 
| 394 |                 } | 
| 395 |             } | 
| 396 |         }; | 
| 397 |  | 
| 398 |         coalesceMatchesAndRanges(m_matches, m_ranges); | 
| 399 |         coalesceMatchesAndRanges(m_matchesUnicode, m_rangesUnicode); | 
| 400 |  | 
| 401 |         if (!m_matches.size() && !m_matchesUnicode.size() | 
| 402 |             && m_ranges.size() == 1 && m_rangesUnicode.size() == 1 | 
| 403 |             && m_ranges[0].begin == 0 && m_ranges[0].end == 0x7f | 
| 404 |             && m_rangesUnicode[0].begin == 0x80 && m_rangesUnicode[0].end == 0x10ffff) | 
| 405 |             m_anyCharacter = true; | 
| 406 |     } | 
| 407 |  | 
| 408 |     bool hasNonBMPCharacters() | 
| 409 |     { | 
| 410 |         return m_hasNonBMPCharacters; | 
| 411 |     } | 
| 412 |  | 
| 413 |     bool anyCharacter() | 
| 414 |     { | 
| 415 |         return m_anyCharacter; | 
| 416 |     } | 
| 417 |  | 
| 418 |     bool m_isCaseInsensitive : 1; | 
| 419 |     bool m_hasNonBMPCharacters : 1; | 
| 420 |     bool m_anyCharacter : 1; | 
| 421 |     CanonicalMode m_canonicalMode; | 
| 422 |  | 
| 423 |     Vector<UChar32> m_matches; | 
| 424 |     Vector<CharacterRange> m_ranges; | 
| 425 |     Vector<UChar32> m_matchesUnicode; | 
| 426 |     Vector<CharacterRange> m_rangesUnicode; | 
| 427 | }; | 
| 428 |  | 
| 429 | class YarrPatternConstructor { | 
| 430 | public: | 
| 431 |     YarrPatternConstructor(YarrPattern& pattern, void* stackLimit) | 
| 432 |         : m_pattern(pattern) | 
| 433 |         , m_characterClassConstructor(pattern.ignoreCase(), pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2) | 
| 434 |         , m_stackLimit(stackLimit) | 
| 435 |     { | 
| 436 |         auto body = std::make_unique<PatternDisjunction>(); | 
| 437 |         m_pattern.m_body = body.get(); | 
| 438 |         m_alternative = body->addNewAlternative(); | 
| 439 |         m_pattern.m_disjunctions.append(WTFMove(body)); | 
| 440 |     } | 
| 441 |  | 
| 442 |     ~YarrPatternConstructor() | 
| 443 |     { | 
| 444 |     } | 
| 445 |  | 
| 446 |     void resetForReparsing() | 
| 447 |     { | 
| 448 |         m_pattern.resetForReparsing(); | 
| 449 |         m_characterClassConstructor.reset(); | 
| 450 |  | 
| 451 |         auto body = std::make_unique<PatternDisjunction>(); | 
| 452 |         m_pattern.m_body = body.get(); | 
| 453 |         m_alternative = body->addNewAlternative(); | 
| 454 |         m_pattern.m_disjunctions.append(WTFMove(body)); | 
| 455 |     } | 
| 456 |  | 
| 457 |     void saveUnmatchedNamedForwardReferences() | 
| 458 |     { | 
| 459 |         m_unmatchedNamedForwardReferences.shrink(0); | 
| 460 |  | 
| 461 |         for (auto& entry : m_pattern.m_namedForwardReferences) { | 
| 462 |             if (!m_pattern.m_captureGroupNames.contains(entry)) | 
| 463 |                 m_unmatchedNamedForwardReferences.append(entry); | 
| 464 |         } | 
| 465 |     } | 
| 466 |  | 
| 467 |     void assertionBOL() | 
| 468 |     { | 
| 469 |         if (!m_alternative->m_terms.size() && !m_invertParentheticalAssertion) { | 
| 470 |             m_alternative->m_startsWithBOL = true; | 
| 471 |             m_alternative->m_containsBOL = true; | 
| 472 |             m_pattern.m_containsBOL = true; | 
| 473 |         } | 
| 474 |         m_alternative->m_terms.append(other: PatternTerm::BOL()); | 
| 475 |     } | 
| 476 |     void assertionEOL() | 
| 477 |     { | 
| 478 |         m_alternative->m_terms.append(other: PatternTerm::EOL()); | 
| 479 |     } | 
| 480 |     void assertionWordBoundary(bool invert) | 
| 481 |     { | 
| 482 |         m_alternative->m_terms.append(other: PatternTerm::WordBoundary(invert)); | 
| 483 |     } | 
| 484 |  | 
| 485 |     void atomPatternCharacter(UChar32 ch) | 
| 486 |     { | 
| 487 |         // We handle case-insensitive checking of unicode characters which do have both | 
| 488 |         // cases by handling them as if they were defined using a CharacterClass. | 
| 489 |         if (!m_pattern.ignoreCase() || (isASCII(ch) && !m_pattern.unicode())) { | 
| 490 |             m_alternative->m_terms.append(other: PatternTerm(ch)); | 
| 491 |             return; | 
| 492 |         } | 
| 493 |  | 
| 494 |         const CanonicalizationRange* info = canonicalRangeInfoFor(ch, canonicalMode: m_pattern.unicode() ? CanonicalMode::Unicode : CanonicalMode::UCS2); | 
| 495 |         if (info->type == CanonicalizeUnique) { | 
| 496 |             m_alternative->m_terms.append(other: PatternTerm(ch)); | 
| 497 |             return; | 
| 498 |         } | 
| 499 |  | 
| 500 |         m_characterClassConstructor.putUnicodeIgnoreCase(ch, info); | 
| 501 |         auto newCharacterClass = m_characterClassConstructor.charClass(); | 
| 502 |         m_alternative->m_terms.append(other: PatternTerm(newCharacterClass.get(), false)); | 
| 503 |         m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass)); | 
| 504 |     } | 
| 505 |  | 
| 506 |     void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) | 
| 507 |     { | 
| 508 |         switch (classID) { | 
| 509 |         case BuiltInCharacterClassID::DigitClassID: | 
| 510 |             m_alternative->m_terms.append(other: PatternTerm(m_pattern.digitsCharacterClass(), invert)); | 
| 511 |             break; | 
| 512 |         case BuiltInCharacterClassID::SpaceClassID: | 
| 513 |             m_alternative->m_terms.append(other: PatternTerm(m_pattern.spacesCharacterClass(), invert)); | 
| 514 |             break; | 
| 515 |         case BuiltInCharacterClassID::WordClassID: | 
| 516 |             if (m_pattern.unicode() && m_pattern.ignoreCase()) | 
| 517 |                 m_alternative->m_terms.append(other: PatternTerm(m_pattern.wordUnicodeIgnoreCaseCharCharacterClass(), invert)); | 
| 518 |             else | 
| 519 |                 m_alternative->m_terms.append(other: PatternTerm(m_pattern.wordcharCharacterClass(), invert)); | 
| 520 |             break; | 
| 521 |         case BuiltInCharacterClassID::DotClassID: | 
| 522 |             ASSERT(!invert); | 
| 523 |             if (m_pattern.dotAll()) | 
| 524 |                 m_alternative->m_terms.append(other: PatternTerm(m_pattern.anyCharacterClass(), false)); | 
| 525 |             else | 
| 526 |                 m_alternative->m_terms.append(other: PatternTerm(m_pattern.newlineCharacterClass(), true)); | 
| 527 |             break; | 
| 528 |         default: | 
| 529 |             m_alternative->m_terms.append(other: PatternTerm(m_pattern.unicodeCharacterClassFor(unicodeClassID: classID), invert)); | 
| 530 |             break; | 
| 531 |         } | 
| 532 |     } | 
| 533 |  | 
| 534 |     void atomCharacterClassBegin(bool invert = false) | 
| 535 |     { | 
| 536 |         m_invertCharacterClass = invert; | 
| 537 |     } | 
| 538 |  | 
| 539 |     void atomCharacterClassAtom(UChar32 ch) | 
| 540 |     { | 
| 541 |         m_characterClassConstructor.putChar(ch); | 
| 542 |     } | 
| 543 |  | 
| 544 |     void atomCharacterClassRange(UChar32 begin, UChar32 end) | 
| 545 |     { | 
| 546 |         m_characterClassConstructor.putRange(lo: begin, hi: end); | 
| 547 |     } | 
| 548 |  | 
| 549 |     void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) | 
| 550 |     { | 
| 551 |         ASSERT(classID != BuiltInCharacterClassID::DotClassID); | 
| 552 |  | 
| 553 |         switch (classID) { | 
| 554 |         case BuiltInCharacterClassID::DigitClassID: | 
| 555 |             m_characterClassConstructor.append(other: invert ? m_pattern.nondigitsCharacterClass() : m_pattern.digitsCharacterClass()); | 
| 556 |             break; | 
| 557 |          | 
| 558 |         case BuiltInCharacterClassID::SpaceClassID: | 
| 559 |             m_characterClassConstructor.append(other: invert ? m_pattern.nonspacesCharacterClass() : m_pattern.spacesCharacterClass()); | 
| 560 |             break; | 
| 561 |          | 
| 562 |         case BuiltInCharacterClassID::WordClassID: | 
| 563 |             if (m_pattern.unicode() && m_pattern.ignoreCase()) | 
| 564 |                 m_characterClassConstructor.append(other: invert ? m_pattern.nonwordUnicodeIgnoreCaseCharCharacterClass() : m_pattern.wordUnicodeIgnoreCaseCharCharacterClass()); | 
| 565 |             else | 
| 566 |                 m_characterClassConstructor.append(other: invert ? m_pattern.nonwordcharCharacterClass() : m_pattern.wordcharCharacterClass()); | 
| 567 |             break; | 
| 568 |          | 
| 569 |         default: | 
| 570 |             if (!invert) | 
| 571 |                 m_characterClassConstructor.append(other: m_pattern.unicodeCharacterClassFor(unicodeClassID: classID)); | 
| 572 |             else | 
| 573 |                 m_characterClassConstructor.appendInverted(other: m_pattern.unicodeCharacterClassFor(unicodeClassID: classID)); | 
| 574 |         } | 
| 575 |     } | 
| 576 |  | 
| 577 |     void atomCharacterClassEnd() | 
| 578 |     { | 
| 579 |         auto newCharacterClass = m_characterClassConstructor.charClass(); | 
| 580 |  | 
| 581 |         if (!m_invertCharacterClass && newCharacterClass.get()->m_anyCharacter) { | 
| 582 |             m_alternative->m_terms.append(other: PatternTerm(m_pattern.anyCharacterClass(), false)); | 
| 583 |             return; | 
| 584 |         } | 
| 585 |         m_alternative->m_terms.append(other: PatternTerm(newCharacterClass.get(), m_invertCharacterClass)); | 
| 586 |         m_pattern.m_userCharacterClasses.append(WTFMove(newCharacterClass)); | 
| 587 |     } | 
| 588 |  | 
| 589 |     void atomParenthesesSubpatternBegin(bool capture = true, std::optional<String> optGroupName = std::nullopt) | 
| 590 |     { | 
| 591 |         unsigned subpatternId = m_pattern.m_numSubpatterns + 1; | 
| 592 |         if (capture) { | 
| 593 |             m_pattern.m_numSubpatterns++; | 
| 594 |             if (optGroupName) { | 
| 595 |                 while (m_pattern.m_captureGroupNames.size() < subpatternId) | 
| 596 |                     m_pattern.m_captureGroupNames.append(other: String()); | 
| 597 |                 m_pattern.m_captureGroupNames.append(value: optGroupName.value()); | 
| 598 |                 m_pattern.m_namedGroupToParenIndex.add(k: optGroupName.value(), v: subpatternId); | 
| 599 |             } | 
| 600 |         } else | 
| 601 |             ASSERT(!optGroupName); | 
| 602 |  | 
| 603 |         auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative); | 
| 604 |         m_alternative->m_terms.append(other: PatternTerm(PatternTerm::TypeParenthesesSubpattern, subpatternId, parenthesesDisjunction.get(), capture, false)); | 
| 605 |         m_alternative = parenthesesDisjunction->addNewAlternative(); | 
| 606 |         m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction)); | 
| 607 |     } | 
| 608 |  | 
| 609 |     void atomParentheticalAssertionBegin(bool invert = false) | 
| 610 |     { | 
| 611 |         auto parenthesesDisjunction = std::make_unique<PatternDisjunction>(m_alternative); | 
| 612 |         m_alternative->m_terms.append(other: PatternTerm(PatternTerm::TypeParentheticalAssertion, m_pattern.m_numSubpatterns + 1, parenthesesDisjunction.get(), false, invert)); | 
| 613 |         m_alternative = parenthesesDisjunction->addNewAlternative(); | 
| 614 |         m_invertParentheticalAssertion = invert; | 
| 615 |         m_pattern.m_disjunctions.append(WTFMove(parenthesesDisjunction)); | 
| 616 |     } | 
| 617 |  | 
| 618 |     void atomParenthesesEnd() | 
| 619 |     { | 
| 620 |         ASSERT(m_alternative->m_parent); | 
| 621 |         ASSERT(m_alternative->m_parent->m_parent); | 
| 622 |  | 
| 623 |         PatternDisjunction* parenthesesDisjunction = m_alternative->m_parent; | 
| 624 |         m_alternative = m_alternative->m_parent->m_parent; | 
| 625 |  | 
| 626 |         PatternTerm& lastTerm = m_alternative->lastTerm(); | 
| 627 |  | 
| 628 |         unsigned numParenAlternatives = parenthesesDisjunction->m_alternatives.size(); | 
| 629 |         unsigned numBOLAnchoredAlts = 0; | 
| 630 |  | 
| 631 |         for (unsigned i = 0; i < numParenAlternatives; i++) { | 
| 632 |             // Bubble up BOL flags | 
| 633 |             if (parenthesesDisjunction->m_alternatives[i]->m_startsWithBOL) | 
| 634 |                 numBOLAnchoredAlts++; | 
| 635 |         } | 
| 636 |  | 
| 637 |         if (numBOLAnchoredAlts) { | 
| 638 |             m_alternative->m_containsBOL = true; | 
| 639 |             // If all the alternatives in parens start with BOL, then so does this one | 
| 640 |             if (numBOLAnchoredAlts == numParenAlternatives) | 
| 641 |                 m_alternative->m_startsWithBOL = true; | 
| 642 |         } | 
| 643 |  | 
| 644 |         lastTerm.parentheses.lastSubpatternId = m_pattern.m_numSubpatterns; | 
| 645 |         m_invertParentheticalAssertion = false; | 
| 646 |     } | 
| 647 |  | 
| 648 |     void atomBackReference(unsigned subpatternId) | 
| 649 |     { | 
| 650 |         ASSERT(subpatternId); | 
| 651 |         m_pattern.m_containsBackreferences = true; | 
| 652 |         m_pattern.m_maxBackReference = std::max(m_pattern.m_maxBackReference, subpatternId); | 
| 653 |  | 
| 654 |         if (subpatternId > m_pattern.m_numSubpatterns) { | 
| 655 |             m_alternative->m_terms.append(other: PatternTerm::ForwardReference()); | 
| 656 |             return; | 
| 657 |         } | 
| 658 |  | 
| 659 |         PatternAlternative* currentAlternative = m_alternative; | 
| 660 |         ASSERT(currentAlternative); | 
| 661 |  | 
| 662 |         // Note to self: if we waited until the AST was baked, we could also remove forwards refs  | 
| 663 |         while ((currentAlternative = currentAlternative->m_parent->m_parent)) { | 
| 664 |             PatternTerm& term = currentAlternative->lastTerm(); | 
| 665 |             ASSERT((term.type == PatternTerm::TypeParenthesesSubpattern) || (term.type == PatternTerm::TypeParentheticalAssertion)); | 
| 666 |  | 
| 667 |             if ((term.type == PatternTerm::TypeParenthesesSubpattern) && term.capture() && (subpatternId == term.parentheses.subpatternId)) { | 
| 668 |                 m_alternative->m_terms.append(other: PatternTerm::ForwardReference()); | 
| 669 |                 return; | 
| 670 |             } | 
| 671 |         } | 
| 672 |  | 
| 673 |         m_alternative->m_terms.append(other: PatternTerm(subpatternId)); | 
| 674 |     } | 
| 675 |  | 
| 676 |     void atomNamedBackReference(const String& subpatternName) | 
| 677 |     { | 
| 678 |         ASSERT(m_pattern.m_namedGroupToParenIndex.find(subpatternName) != m_pattern.m_namedGroupToParenIndex.end()); | 
| 679 |         atomBackReference(subpatternId: m_pattern.m_namedGroupToParenIndex.get(k: subpatternName)); | 
| 680 |     } | 
| 681 |  | 
| 682 |     bool isValidNamedForwardReference(const String& subpatternName) | 
| 683 |     { | 
| 684 |         return !m_unmatchedNamedForwardReferences.contains(subpatternName); | 
| 685 |     } | 
| 686 |  | 
| 687 |     void atomNamedForwardReference(const String& subpatternName) | 
| 688 |     { | 
| 689 |         if (!m_pattern.m_namedForwardReferences.contains(value: subpatternName)) | 
| 690 |             m_pattern.m_namedForwardReferences.append(value: subpatternName); | 
| 691 |         m_alternative->m_terms.append(other: PatternTerm::ForwardReference()); | 
| 692 |     } | 
| 693 |  | 
| 694 |     // deep copy the argument disjunction.  If filterStartsWithBOL is true, | 
| 695 |     // skip alternatives with m_startsWithBOL set true. | 
| 696 |     PatternDisjunction* copyDisjunction(PatternDisjunction* disjunction, bool filterStartsWithBOL = false) | 
| 697 |     { | 
| 698 |         std::unique_ptr<PatternDisjunction> newDisjunction; | 
| 699 |         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { | 
| 700 |             PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); | 
| 701 |             if (!filterStartsWithBOL || !alternative->m_startsWithBOL) { | 
| 702 |                 if (!newDisjunction) { | 
| 703 |                     newDisjunction = std::make_unique<PatternDisjunction>(); | 
| 704 |                     newDisjunction->m_parent = disjunction->m_parent; | 
| 705 |                 } | 
| 706 |                 PatternAlternative* newAlternative = newDisjunction->addNewAlternative(); | 
| 707 |                 newAlternative->m_terms.reserveInitialCapacity(size: alternative->m_terms.size()); | 
| 708 |                 for (unsigned i = 0; i < alternative->m_terms.size(); ++i) | 
| 709 |                     newAlternative->m_terms.append(other: copyTerm(term&: alternative->m_terms[i], filterStartsWithBOL)); | 
| 710 |             } | 
| 711 |         } | 
| 712 |          | 
| 713 |         if (!newDisjunction) | 
| 714 |             return 0; | 
| 715 |  | 
| 716 |         PatternDisjunction* copiedDisjunction = newDisjunction.get(); | 
| 717 |         m_pattern.m_disjunctions.append(WTFMove(newDisjunction)); | 
| 718 |         return copiedDisjunction; | 
| 719 |     } | 
| 720 |      | 
| 721 |     PatternTerm copyTerm(PatternTerm& term, bool filterStartsWithBOL = false) | 
| 722 |     { | 
| 723 |         if ((term.type != PatternTerm::TypeParenthesesSubpattern) && (term.type != PatternTerm::TypeParentheticalAssertion)) | 
| 724 |             return PatternTerm(term); | 
| 725 |          | 
| 726 |         PatternTerm termCopy = term; | 
| 727 |         termCopy.parentheses.disjunction = copyDisjunction(disjunction: termCopy.parentheses.disjunction, filterStartsWithBOL); | 
| 728 |         m_pattern.m_hasCopiedParenSubexpressions = true; | 
| 729 |         return termCopy; | 
| 730 |     } | 
| 731 |      | 
| 732 |     void quantifyAtom(unsigned min, unsigned max, bool greedy) | 
| 733 |     { | 
| 734 |         ASSERT(min <= max); | 
| 735 |         ASSERT(m_alternative->m_terms.size()); | 
| 736 |  | 
| 737 |         if (!max) { | 
| 738 |             m_alternative->removeLastTerm(); | 
| 739 |             return; | 
| 740 |         } | 
| 741 |  | 
| 742 |         PatternTerm& term = m_alternative->lastTerm(); | 
| 743 |         ASSERT(term.type > PatternTerm::TypeAssertionWordBoundary); | 
| 744 |         ASSERT(term.quantityMinCount == 1 && term.quantityMaxCount == 1 && term.quantityType == QuantifierFixedCount); | 
| 745 |  | 
| 746 |         if (term.type == PatternTerm::TypeParentheticalAssertion) { | 
| 747 |             // If an assertion is quantified with a minimum count of zero, it can simply be removed. | 
| 748 |             // This arises from the RepeatMatcher behaviour in the spec. Matching an assertion never | 
| 749 |             // results in any input being consumed, however the continuation passed to the assertion | 
| 750 |             // (called in steps, 8c and 9 of the RepeatMatcher definition, ES5.1 15.10.2.5) will | 
| 751 |             // reject all zero length matches (see step 2.1). A match from the continuation of the | 
| 752 |             // expression will still be accepted regardless (via steps 8a and 11) - the upshot of all | 
| 753 |             // this is that matches from the assertion are not required, and won't be accepted anyway, | 
| 754 |             // so no need to ever run it. | 
| 755 |             if (!min) | 
| 756 |                 m_alternative->removeLastTerm(); | 
| 757 |             // We never need to run an assertion more than once. Subsequent interations will be run | 
| 758 |             // with the same start index (since assertions are non-capturing) and the same captures | 
| 759 |             // (per step 4 of RepeatMatcher in ES5.1 15.10.2.5), and as such will always produce the | 
| 760 |             // same result and captures. If the first match succeeds then the subsequent (min - 1) | 
| 761 |             // matches will too. Any additional optional matches will fail (on the same basis as the | 
| 762 |             // minimum zero quantified assertions, above), but this will still result in a match. | 
| 763 |             return; | 
| 764 |         } | 
| 765 |  | 
| 766 |         if (min == max) | 
| 767 |             term.quantify(minCount: min, maxCount: max, type: QuantifierFixedCount); | 
| 768 |         else if (!min || (term.type == PatternTerm::TypeParenthesesSubpattern && m_pattern.m_hasCopiedParenSubexpressions)) | 
| 769 |             term.quantify(minCount: min, maxCount: max, type: greedy ? QuantifierGreedy : QuantifierNonGreedy); | 
| 770 |         else { | 
| 771 |             term.quantify(minCount: min, maxCount: min, type: QuantifierFixedCount); | 
| 772 |             m_alternative->m_terms.append(other: copyTerm(term)); | 
| 773 |             // NOTE: this term is interesting from an analysis perspective, in that it can be ignored..... | 
| 774 |             m_alternative->lastTerm().quantify(count: (max == quantifyInfinite) ? max : max - min, type: greedy ? QuantifierGreedy : QuantifierNonGreedy); | 
| 775 |             if (m_alternative->lastTerm().type == PatternTerm::TypeParenthesesSubpattern) | 
| 776 |                 m_alternative->lastTerm().parentheses.isCopy = true; | 
| 777 |         } | 
| 778 |     } | 
| 779 |  | 
| 780 |     void disjunction() | 
| 781 |     { | 
| 782 |         m_alternative = m_alternative->m_parent->addNewAlternative(); | 
| 783 |     } | 
| 784 |  | 
| 785 |     ErrorCode setupAlternativeOffsets(PatternAlternative* alternative, unsigned currentCallFrameSize, unsigned initialInputPosition, unsigned& newCallFrameSize) WARN_UNUSED_RETURN | 
| 786 |     { | 
| 787 |         if (UNLIKELY(!isSafeToRecurse())) | 
| 788 |             return ErrorCode::TooManyDisjunctions; | 
| 789 |  | 
| 790 |         ErrorCode error = ErrorCode::NoError; | 
| 791 |         alternative->m_hasFixedSize = true; | 
| 792 |         Checked<unsigned, RecordOverflow> currentInputPosition = initialInputPosition; | 
| 793 |  | 
| 794 |         for (unsigned i = 0; i < alternative->m_terms.size(); ++i) { | 
| 795 |             PatternTerm& term = alternative->m_terms[i]; | 
| 796 |  | 
| 797 |             switch (term.type) { | 
| 798 |             case PatternTerm::TypeAssertionBOL: | 
| 799 |             case PatternTerm::TypeAssertionEOL: | 
| 800 |             case PatternTerm::TypeAssertionWordBoundary: | 
| 801 |                 term.inputPosition = currentInputPosition.unsafeGet(); | 
| 802 |                 break; | 
| 803 |  | 
| 804 |             case PatternTerm::TypeBackReference: | 
| 805 |                 term.inputPosition = currentInputPosition.unsafeGet(); | 
| 806 |                 term.frameLocation = currentCallFrameSize; | 
| 807 |                 currentCallFrameSize += YarrStackSpaceForBackTrackInfoBackReference; | 
| 808 |                 alternative->m_hasFixedSize = false; | 
| 809 |                 break; | 
| 810 |  | 
| 811 |             case PatternTerm::TypeForwardReference: | 
| 812 |                 break; | 
| 813 |  | 
| 814 |             case PatternTerm::TypePatternCharacter: | 
| 815 |                 term.inputPosition = currentInputPosition.unsafeGet(); | 
| 816 |                 if (term.quantityType != QuantifierFixedCount) { | 
| 817 |                     term.frameLocation = currentCallFrameSize; | 
| 818 |                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoPatternCharacter; | 
| 819 |                     alternative->m_hasFixedSize = false; | 
| 820 |                 } else if (m_pattern.unicode()) { | 
| 821 |                     Checked<unsigned, RecordOverflow> tempCount = term.quantityMaxCount; | 
| 822 |                     tempCount *= U16_LENGTH(term.patternCharacter); | 
| 823 |                     if (tempCount.hasOverflowed()) | 
| 824 |                         return ErrorCode::OffsetTooLarge; | 
| 825 |                     currentInputPosition += tempCount; | 
| 826 |                 } else | 
| 827 |                     currentInputPosition += term.quantityMaxCount; | 
| 828 |                 break; | 
| 829 |  | 
| 830 |             case PatternTerm::TypeCharacterClass: | 
| 831 |                 term.inputPosition = currentInputPosition.unsafeGet(); | 
| 832 |                 if (term.quantityType != QuantifierFixedCount) { | 
| 833 |                     term.frameLocation = currentCallFrameSize; | 
| 834 |                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; | 
| 835 |                     alternative->m_hasFixedSize = false; | 
| 836 |                 } else if (m_pattern.unicode()) { | 
| 837 |                     term.frameLocation = currentCallFrameSize; | 
| 838 |                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoCharacterClass; | 
| 839 |                     currentInputPosition += term.quantityMaxCount; | 
| 840 |                     alternative->m_hasFixedSize = false; | 
| 841 |                 } else | 
| 842 |                     currentInputPosition += term.quantityMaxCount; | 
| 843 |                 break; | 
| 844 |  | 
| 845 |             case PatternTerm::TypeParenthesesSubpattern: | 
| 846 |                 // Note: for fixed once parentheses we will ensure at least the minimum is available; others are on their own. | 
| 847 |                 term.frameLocation = currentCallFrameSize; | 
| 848 |                 if (term.quantityMaxCount == 1 && !term.parentheses.isCopy) { | 
| 849 |                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesOnce; | 
| 850 |                     error = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition.unsafeGet(), callFrameSize&: currentCallFrameSize); | 
| 851 |                     if (hasError(errorCode: error)) | 
| 852 |                         return error; | 
| 853 |                     // If quantity is fixed, then pre-check its minimum size. | 
| 854 |                     if (term.quantityType == QuantifierFixedCount) | 
| 855 |                         currentInputPosition += term.parentheses.disjunction->m_minimumSize; | 
| 856 |                     term.inputPosition = currentInputPosition.unsafeGet(); | 
| 857 |                 } else if (term.parentheses.isTerminal) { | 
| 858 |                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParenthesesTerminal; | 
| 859 |                     error = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition.unsafeGet(), callFrameSize&: currentCallFrameSize); | 
| 860 |                     if (hasError(errorCode: error)) | 
| 861 |                         return error; | 
| 862 |                     term.inputPosition = currentInputPosition.unsafeGet(); | 
| 863 |                 } else { | 
| 864 |                     term.inputPosition = currentInputPosition.unsafeGet(); | 
| 865 |                     currentCallFrameSize += YarrStackSpaceForBackTrackInfoParentheses; | 
| 866 |                     error = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize, initialInputPosition: currentInputPosition.unsafeGet(), callFrameSize&: currentCallFrameSize); | 
| 867 |                     if (hasError(errorCode: error)) | 
| 868 |                         return error; | 
| 869 |                 } | 
| 870 |                 // Fixed count of 1 could be accepted, if they have a fixed size *AND* if all alternatives are of the same length. | 
| 871 |                 alternative->m_hasFixedSize = false; | 
| 872 |                 break; | 
| 873 |  | 
| 874 |             case PatternTerm::TypeParentheticalAssertion: | 
| 875 |                 term.inputPosition = currentInputPosition.unsafeGet(); | 
| 876 |                 term.frameLocation = currentCallFrameSize; | 
| 877 |                 error = setupDisjunctionOffsets(disjunction: term.parentheses.disjunction, initialCallFrameSize: currentCallFrameSize + YarrStackSpaceForBackTrackInfoParentheticalAssertion, initialInputPosition: currentInputPosition.unsafeGet(), callFrameSize&: currentCallFrameSize); | 
| 878 |                 if (hasError(errorCode: error)) | 
| 879 |                     return error; | 
| 880 |                 break; | 
| 881 |  | 
| 882 |             case PatternTerm::TypeDotStarEnclosure: | 
| 883 |                 ASSERT(!m_pattern.m_saveInitialStartValue); | 
| 884 |                 alternative->m_hasFixedSize = false; | 
| 885 |                 term.inputPosition = initialInputPosition; | 
| 886 |                 m_pattern.m_initialStartValueFrameLocation = currentCallFrameSize; | 
| 887 |                 currentCallFrameSize += YarrStackSpaceForDotStarEnclosure; | 
| 888 |                 m_pattern.m_saveInitialStartValue = true; | 
| 889 |                 break; | 
| 890 |             } | 
| 891 |             if (currentInputPosition.hasOverflowed()) | 
| 892 |                 return ErrorCode::OffsetTooLarge; | 
| 893 |         } | 
| 894 |  | 
| 895 |         alternative->m_minimumSize = (currentInputPosition - initialInputPosition).unsafeGet(); | 
| 896 |         newCallFrameSize = currentCallFrameSize; | 
| 897 |         return error; | 
| 898 |     } | 
| 899 |  | 
| 900 |     ErrorCode setupDisjunctionOffsets(PatternDisjunction* disjunction, unsigned initialCallFrameSize, unsigned initialInputPosition, unsigned& callFrameSize) | 
| 901 |     { | 
| 902 |         if (UNLIKELY(!isSafeToRecurse())) | 
| 903 |             return ErrorCode::TooManyDisjunctions; | 
| 904 |  | 
| 905 |         if ((disjunction != m_pattern.m_body) && (disjunction->m_alternatives.size() > 1)) | 
| 906 |             initialCallFrameSize += YarrStackSpaceForBackTrackInfoAlternative; | 
| 907 |  | 
| 908 |         unsigned minimumInputSize = UINT_MAX; | 
| 909 |         unsigned maximumCallFrameSize = 0; | 
| 910 |         bool hasFixedSize = true; | 
| 911 |         ErrorCode error = ErrorCode::NoError; | 
| 912 |  | 
| 913 |         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) { | 
| 914 |             PatternAlternative* alternative = disjunction->m_alternatives[alt].get(); | 
| 915 |             unsigned currentAlternativeCallFrameSize; | 
| 916 |             error = setupAlternativeOffsets(alternative, currentCallFrameSize: initialCallFrameSize, initialInputPosition, newCallFrameSize&: currentAlternativeCallFrameSize); | 
| 917 |             if (hasError(errorCode: error)) | 
| 918 |                 return error; | 
| 919 |             minimumInputSize = std::min(minimumInputSize, alternative->m_minimumSize); | 
| 920 |             maximumCallFrameSize = std::max(maximumCallFrameSize, currentAlternativeCallFrameSize); | 
| 921 |             hasFixedSize &= alternative->m_hasFixedSize; | 
| 922 |             if (alternative->m_minimumSize > INT_MAX) | 
| 923 |                 m_pattern.m_containsUnsignedLengthPattern = true; | 
| 924 |         } | 
| 925 |          | 
| 926 |         ASSERT(minimumInputSize != UINT_MAX); | 
| 927 |         ASSERT(maximumCallFrameSize >= initialCallFrameSize); | 
| 928 |  | 
| 929 |         disjunction->m_hasFixedSize = hasFixedSize; | 
| 930 |         disjunction->m_minimumSize = minimumInputSize; | 
| 931 |         disjunction->m_callFrameSize = maximumCallFrameSize; | 
| 932 |         callFrameSize = maximumCallFrameSize; | 
| 933 |         return error; | 
| 934 |     } | 
| 935 |  | 
| 936 |     ErrorCode setupOffsets() | 
| 937 |     { | 
| 938 |         // FIXME: Yarr should not use the stack to handle subpatterns (rdar://problem/26436314). | 
| 939 |         unsigned ignoredCallFrameSize; | 
| 940 |         return setupDisjunctionOffsets(disjunction: m_pattern.m_body, initialCallFrameSize: 0, initialInputPosition: 0, callFrameSize&: ignoredCallFrameSize); | 
| 941 |     } | 
| 942 |  | 
| 943 |     // This optimization identifies sets of parentheses that we will never need to backtrack. | 
| 944 |     // In these cases we do not need to store state from prior iterations. | 
| 945 |     // We can presently avoid backtracking for: | 
| 946 |     //   * where the parens are at the end of the regular expression (last term in any of the | 
| 947 |     //     alternatives of the main body disjunction). | 
| 948 |     //   * where the parens are non-capturing, and quantified unbounded greedy (*). | 
| 949 |     //   * where the parens do not contain any capturing subpatterns. | 
| 950 |     void checkForTerminalParentheses() | 
| 951 |     { | 
| 952 |         // This check is much too crude; should be just checking whether the candidate | 
| 953 |         // node contains nested capturing subpatterns, not the whole expression! | 
| 954 |         if (m_pattern.m_numSubpatterns) | 
| 955 |             return; | 
| 956 |  | 
| 957 |         Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives; | 
| 958 |         for (size_t i = 0; i < alternatives.size(); ++i) { | 
| 959 |             Vector<PatternTerm>& terms = alternatives[i]->m_terms; | 
| 960 |             if (terms.size()) { | 
| 961 |                 PatternTerm& term = terms.last(); | 
| 962 |                 if (term.type == PatternTerm::TypeParenthesesSubpattern | 
| 963 |                     && term.quantityType == QuantifierGreedy | 
| 964 |                     && term.quantityMinCount == 0 | 
| 965 |                     && term.quantityMaxCount == quantifyInfinite | 
| 966 |                     && !term.capture()) | 
| 967 |                     term.parentheses.isTerminal = true; | 
| 968 |             } | 
| 969 |         } | 
| 970 |     } | 
| 971 |  | 
| 972 |     void optimizeBOL() | 
| 973 |     { | 
| 974 |         // Look for expressions containing beginning of line (^) anchoring and unroll them. | 
| 975 |         // e.g. /^a|^b|c/ becomes /^a|^b|c/ which is executed once followed by /c/ which loops | 
| 976 |         // This code relies on the parsing code tagging alternatives with m_containsBOL and | 
| 977 |         // m_startsWithBOL and rolling those up to containing alternatives. | 
| 978 |         // At this point, this is only valid for non-multiline expressions. | 
| 979 |         PatternDisjunction* disjunction = m_pattern.m_body; | 
| 980 |          | 
| 981 |         if (!m_pattern.m_containsBOL || m_pattern.multiline()) | 
| 982 |             return; | 
| 983 |          | 
| 984 |         PatternDisjunction* loopDisjunction = copyDisjunction(disjunction, filterStartsWithBOL: true); | 
| 985 |  | 
| 986 |         // Set alternatives in disjunction to "onceThrough" | 
| 987 |         for (unsigned alt = 0; alt < disjunction->m_alternatives.size(); ++alt) | 
| 988 |             disjunction->m_alternatives[alt]->setOnceThrough(); | 
| 989 |  | 
| 990 |         if (loopDisjunction) { | 
| 991 |             // Move alternatives from loopDisjunction to disjunction | 
| 992 |             for (unsigned alt = 0; alt < loopDisjunction->m_alternatives.size(); ++alt) | 
| 993 |                 disjunction->m_alternatives.append(loopDisjunction->m_alternatives[alt].release()); | 
| 994 |                  | 
| 995 |             loopDisjunction->m_alternatives.clear(); | 
| 996 |         } | 
| 997 |     } | 
| 998 |  | 
| 999 |     bool containsCapturingTerms(PatternAlternative* alternative, size_t firstTermIndex, size_t endIndex) | 
| 1000 |     { | 
| 1001 |         Vector<PatternTerm>& terms = alternative->m_terms; | 
| 1002 |  | 
| 1003 |         ASSERT(endIndex <= terms.size()); | 
| 1004 |         for (size_t termIndex = firstTermIndex; termIndex < endIndex; ++termIndex) { | 
| 1005 |             PatternTerm& term = terms[termIndex]; | 
| 1006 |  | 
| 1007 |             if (term.m_capture) | 
| 1008 |                 return true; | 
| 1009 |  | 
| 1010 |             if (term.type == PatternTerm::TypeParenthesesSubpattern) { | 
| 1011 |                 PatternDisjunction* nestedDisjunction = term.parentheses.disjunction; | 
| 1012 |                 for (unsigned alt = 0; alt < nestedDisjunction->m_alternatives.size(); ++alt) { | 
| 1013 |                     if (containsCapturingTerms(nestedDisjunction->m_alternatives[alt].get(), 0, nestedDisjunction->m_alternatives[alt]->m_terms.size())) | 
| 1014 |                         return true; | 
| 1015 |                 } | 
| 1016 |             } | 
| 1017 |         } | 
| 1018 |  | 
| 1019 |         return false; | 
| 1020 |     } | 
| 1021 |  | 
| 1022 |     // This optimization identifies alternatives in the form of  | 
| 1023 |     // [^].*[?]<expression>.*[$] for expressions that don't have any  | 
| 1024 |     // capturing terms. The alternative is changed to <expression>  | 
| 1025 |     // followed by processing of the dot stars to find and adjust the  | 
| 1026 |     // beginning and the end of the match. | 
| 1027 |     void optimizeDotStarWrappedExpressions() | 
| 1028 |     { | 
| 1029 |         Vector<std::unique_ptr<PatternAlternative>>& alternatives = m_pattern.m_body->m_alternatives; | 
| 1030 |         if (alternatives.size() != 1) | 
| 1031 |             return; | 
| 1032 |  | 
| 1033 |         CharacterClass* dotCharacterClass = m_pattern.dotAll() ? m_pattern.anyCharacterClass() : m_pattern.newlineCharacterClass(); | 
| 1034 |         PatternAlternative* alternative = alternatives[0].get(); | 
| 1035 |         Vector<PatternTerm>& terms = alternative->m_terms; | 
| 1036 |         if (terms.size() >= 3) { | 
| 1037 |             bool startsWithBOL = false; | 
| 1038 |             bool endsWithEOL = false; | 
| 1039 |             size_t termIndex, firstExpressionTerm; | 
| 1040 |  | 
| 1041 |             termIndex = 0; | 
| 1042 |             if (terms[termIndex].type == PatternTerm::TypeAssertionBOL) { | 
| 1043 |                 startsWithBOL = true; | 
| 1044 |                 ++termIndex; | 
| 1045 |             } | 
| 1046 |              | 
| 1047 |             PatternTerm& firstNonAnchorTerm = terms[termIndex]; | 
| 1048 |             if (firstNonAnchorTerm.type != PatternTerm::TypeCharacterClass | 
| 1049 |                 || firstNonAnchorTerm.characterClass != dotCharacterClass | 
| 1050 |                 || firstNonAnchorTerm.quantityMinCount | 
| 1051 |                 || firstNonAnchorTerm.quantityMaxCount != quantifyInfinite) | 
| 1052 |                 return; | 
| 1053 |              | 
| 1054 |             firstExpressionTerm = termIndex + 1; | 
| 1055 |              | 
| 1056 |             termIndex = terms.size() - 1; | 
| 1057 |             if (terms[termIndex].type == PatternTerm::TypeAssertionEOL) { | 
| 1058 |                 endsWithEOL = true; | 
| 1059 |                 --termIndex; | 
| 1060 |             } | 
| 1061 |              | 
| 1062 |             PatternTerm& lastNonAnchorTerm = terms[termIndex]; | 
| 1063 |             if (lastNonAnchorTerm.type != PatternTerm::TypeCharacterClass | 
| 1064 |                 || lastNonAnchorTerm.characterClass != dotCharacterClass | 
| 1065 |                 || lastNonAnchorTerm.quantityType != QuantifierGreedy | 
| 1066 |                 || lastNonAnchorTerm.quantityMinCount | 
| 1067 |                 || lastNonAnchorTerm.quantityMaxCount != quantifyInfinite) | 
| 1068 |                 return; | 
| 1069 |  | 
| 1070 |             size_t endIndex = termIndex; | 
| 1071 |             if (firstExpressionTerm >= endIndex) | 
| 1072 |                 return; | 
| 1073 |  | 
| 1074 |             if (!containsCapturingTerms(alternative, firstTermIndex: firstExpressionTerm, endIndex)) { | 
| 1075 |                 for (termIndex = terms.size() - 1; termIndex >= endIndex; --termIndex) | 
| 1076 |                     terms.remove(termIndex); | 
| 1077 |  | 
| 1078 |                 for (termIndex = firstExpressionTerm; termIndex > 0; --termIndex) | 
| 1079 |                     terms.remove(termIndex - 1); | 
| 1080 |  | 
| 1081 |                 terms.append(PatternTerm(startsWithBOL, endsWithEOL)); | 
| 1082 |                  | 
| 1083 |                 m_pattern.m_containsBOL = false; | 
| 1084 |             } | 
| 1085 |         } | 
| 1086 |     } | 
| 1087 |  | 
| 1088 | private: | 
| 1089 |     bool isSafeToRecurse() const | 
| 1090 |     { | 
| 1091 |         if (!m_stackLimit) | 
| 1092 |             return true; | 
| 1093 |         int8_t* curr = reinterpret_cast<int8_t*>(&curr); | 
| 1094 |         int8_t* limit = reinterpret_cast<int8_t*>(m_stackLimit); | 
| 1095 |         return curr >= limit; | 
| 1096 |     } | 
| 1097 |  | 
| 1098 |     YarrPattern& m_pattern; | 
| 1099 |     PatternAlternative* m_alternative; | 
| 1100 |     CharacterClassConstructor m_characterClassConstructor; | 
| 1101 |     Vector<String> m_unmatchedNamedForwardReferences; | 
| 1102 |     void* m_stackLimit; | 
| 1103 |     bool m_invertCharacterClass; | 
| 1104 |     bool m_invertParentheticalAssertion { false }; | 
| 1105 | }; | 
| 1106 |  | 
| 1107 | ErrorCode YarrPattern::compile(const String& patternString, void* stackLimit) | 
| 1108 | { | 
| 1109 |     YarrPatternConstructor constructor(*this, stackLimit); | 
| 1110 |  | 
| 1111 |     if (m_flags == InvalidFlags) | 
| 1112 |         return ErrorCode::InvalidRegularExpressionFlags; | 
| 1113 |  | 
| 1114 |     { | 
| 1115 |         ErrorCode error = parse(constructor, patternString, unicode()); | 
| 1116 |         if (hasError(errorCode: error)) | 
| 1117 |             return error; | 
| 1118 |     } | 
| 1119 |      | 
| 1120 |     // If the pattern contains illegal backreferences reset & reparse. | 
| 1121 |     // Quoting Netscape's "What's new in JavaScript 1.2", | 
| 1122 |     //      "Note: if the number of left parentheses is less than the number specified | 
| 1123 |     //       in \#, the \# is taken as an octal escape as described in the next row." | 
| 1124 |     if (containsIllegalBackReference() || containsIllegalNamedForwardReferences()) { | 
| 1125 |         if (unicode()) | 
| 1126 |             return ErrorCode::InvalidBackreference; | 
| 1127 |  | 
| 1128 |         unsigned numSubpatterns = m_numSubpatterns; | 
| 1129 |  | 
| 1130 |         constructor.saveUnmatchedNamedForwardReferences(); | 
| 1131 |         constructor.resetForReparsing(); | 
| 1132 |         ErrorCode error = parse(constructor, patternString, unicode(), numSubpatterns); | 
| 1133 |         ASSERT_UNUSED(error, !hasError(error)); | 
| 1134 |         ASSERT(numSubpatterns == m_numSubpatterns); | 
| 1135 |     } | 
| 1136 |  | 
| 1137 |     constructor.checkForTerminalParentheses(); | 
| 1138 |     constructor.optimizeDotStarWrappedExpressions(); | 
| 1139 |     constructor.optimizeBOL(); | 
| 1140 |  | 
| 1141 |     { | 
| 1142 |         ErrorCode error = constructor.setupOffsets(); | 
| 1143 |         if (hasError(errorCode: error)) | 
| 1144 |             return error; | 
| 1145 |     } | 
| 1146 |  | 
| 1147 |     if (Options::dumpCompiledRegExpPatterns()) | 
| 1148 |         dumpPattern(pattern: patternString); | 
| 1149 |  | 
| 1150 |     return ErrorCode::NoError; | 
| 1151 | } | 
| 1152 |  | 
| 1153 | YarrPattern::YarrPattern(const String& pattern, RegExpFlags flags, ErrorCode& error, void* stackLimit) | 
| 1154 |     : m_containsBackreferences(false) | 
| 1155 |     , m_containsBOL(false) | 
| 1156 |     , m_containsUnsignedLengthPattern(false) | 
| 1157 |     , m_hasCopiedParenSubexpressions(false) | 
| 1158 |     , m_saveInitialStartValue(false) | 
| 1159 |     , m_flags(flags) | 
| 1160 | { | 
| 1161 |     error = compile(patternString: pattern, stackLimit); | 
| 1162 | } | 
| 1163 |  | 
| 1164 | void indentForNestingLevel(PrintStream& out, unsigned nestingDepth) | 
| 1165 | { | 
| 1166 |     out.print("    " ); | 
| 1167 |     for (; nestingDepth; --nestingDepth) | 
| 1168 |         out.print("  " ); | 
| 1169 | } | 
| 1170 |  | 
| 1171 | void dumpUChar32(PrintStream& out, UChar32 c) | 
| 1172 | { | 
| 1173 |     if (c >= ' '&& c <= 0xff) | 
| 1174 |         out.printf(format: "'%c'" , static_cast<char>(c)); | 
| 1175 |     else | 
| 1176 |         out.printf(format: "0x%04x" , c); | 
| 1177 | } | 
| 1178 |  | 
| 1179 | void dumpCharacterClass(PrintStream& out, YarrPattern* pattern, CharacterClass* characterClass) | 
| 1180 | { | 
| 1181 |     if (characterClass == pattern->anyCharacterClass()) | 
| 1182 |         out.print("<any character>" ); | 
| 1183 |     else if (characterClass == pattern->newlineCharacterClass()) | 
| 1184 |         out.print("<newline>" ); | 
| 1185 |     else if (characterClass == pattern->digitsCharacterClass()) | 
| 1186 |         out.print("<digits>" ); | 
| 1187 |     else if (characterClass == pattern->spacesCharacterClass()) | 
| 1188 |         out.print("<whitespace>" ); | 
| 1189 |     else if (characterClass == pattern->wordcharCharacterClass()) | 
| 1190 |         out.print("<word>" ); | 
| 1191 |     else if (characterClass == pattern->wordUnicodeIgnoreCaseCharCharacterClass()) | 
| 1192 |         out.print("<unicode word ignore case>" ); | 
| 1193 |     else if (characterClass == pattern->nondigitsCharacterClass()) | 
| 1194 |         out.print("<non-digits>" ); | 
| 1195 |     else if (characterClass == pattern->nonspacesCharacterClass()) | 
| 1196 |         out.print("<non-whitespace>" ); | 
| 1197 |     else if (characterClass == pattern->nonwordcharCharacterClass()) | 
| 1198 |         out.print("<non-word>" ); | 
| 1199 |     else if (characterClass == pattern->nonwordUnicodeIgnoreCaseCharCharacterClass()) | 
| 1200 |         out.print("<unicode non-word ignore case>" ); | 
| 1201 |     else { | 
| 1202 |         bool needMatchesRangesSeperator = false; | 
| 1203 |  | 
| 1204 |         auto dumpMatches = [&] (const char* prefix, Vector<UChar32> matches) { | 
| 1205 |             size_t matchesSize = matches.size(); | 
| 1206 |             if (matchesSize) { | 
| 1207 |                 if (needMatchesRangesSeperator) | 
| 1208 |                     out.print("," ); | 
| 1209 |                 needMatchesRangesSeperator = true; | 
| 1210 |  | 
| 1211 |                 out.print(prefix, ":(" ); | 
| 1212 |                 for (size_t i = 0; i < matchesSize; ++i) { | 
| 1213 |                     if (i) | 
| 1214 |                         out.print("," ); | 
| 1215 |                     dumpUChar32(out, matches[i]); | 
| 1216 |                 } | 
| 1217 |                 out.print(")" ); | 
| 1218 |             } | 
| 1219 |         }; | 
| 1220 |  | 
| 1221 |         auto dumpRanges = [&] (const char* prefix, Vector<CharacterRange> ranges) { | 
| 1222 |             size_t rangeSize = ranges.size(); | 
| 1223 |             if (rangeSize) { | 
| 1224 |                 if (needMatchesRangesSeperator) | 
| 1225 |                     out.print("," ); | 
| 1226 |                 needMatchesRangesSeperator = true; | 
| 1227 |  | 
| 1228 |                 out.print(prefix, " ranges:(" ); | 
| 1229 |                 for (size_t i = 0; i < rangeSize; ++i) { | 
| 1230 |                     if (i) | 
| 1231 |                         out.print("," ); | 
| 1232 |                     CharacterRange range = ranges[i]; | 
| 1233 |                     out.print("(" ); | 
| 1234 |                     dumpUChar32(out, c: range.begin); | 
| 1235 |                     out.print(".." ); | 
| 1236 |                     dumpUChar32(out, c: range.end); | 
| 1237 |                     out.print(")" ); | 
| 1238 |                 } | 
| 1239 |                 out.print(")" ); | 
| 1240 |             } | 
| 1241 |         }; | 
| 1242 |  | 
| 1243 |         out.print("[" ); | 
| 1244 |         dumpMatches("ASCII" , characterClass->m_matches); | 
| 1245 |         dumpRanges("ASCII" , characterClass->m_ranges); | 
| 1246 |         dumpMatches("Unicode" , characterClass->m_matchesUnicode); | 
| 1247 |         dumpRanges("Unicode" , characterClass->m_rangesUnicode); | 
| 1248 |         out.print("]" ); | 
| 1249 |     } | 
| 1250 | } | 
| 1251 |  | 
| 1252 | void PatternAlternative::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth) | 
| 1253 | { | 
| 1254 |     out.print("minimum size: " , m_minimumSize); | 
| 1255 |     if (m_hasFixedSize) | 
| 1256 |         out.print(",fixed size" ); | 
| 1257 |     if (m_onceThrough) | 
| 1258 |         out.print(",once through" ); | 
| 1259 |     if (m_startsWithBOL) | 
| 1260 |         out.print(",starts with ^" ); | 
| 1261 |     if (m_containsBOL) | 
| 1262 |         out.print(",contains ^" ); | 
| 1263 |     out.print("\n" ); | 
| 1264 |  | 
| 1265 |     for (size_t i = 0; i < m_terms.size(); ++i) | 
| 1266 |         m_terms[i].dump(out, thisPattern, nestingDepth); | 
| 1267 | } | 
| 1268 |  | 
| 1269 | void PatternTerm::dumpQuantifier(PrintStream& out) | 
| 1270 | { | 
| 1271 |     if (quantityType == QuantifierFixedCount && quantityMinCount == 1 && quantityMaxCount == 1) | 
| 1272 |         return; | 
| 1273 |     out.print(" {" , quantityMinCount.unsafeGet()); | 
| 1274 |     if (quantityMinCount != quantityMaxCount) { | 
| 1275 |         if (quantityMaxCount == UINT_MAX) | 
| 1276 |             out.print(",..." ); | 
| 1277 |         else | 
| 1278 |             out.print("," , quantityMaxCount.unsafeGet()); | 
| 1279 |     } | 
| 1280 |     out.print("}" ); | 
| 1281 |     if (quantityType == QuantifierGreedy) | 
| 1282 |         out.print(" greedy" ); | 
| 1283 |     else if (quantityType == QuantifierNonGreedy) | 
| 1284 |         out.print(" non-greedy" ); | 
| 1285 | } | 
| 1286 |  | 
| 1287 | void PatternTerm::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth) | 
| 1288 | { | 
| 1289 |     indentForNestingLevel(out, nestingDepth); | 
| 1290 |  | 
| 1291 |     if (type != TypeParenthesesSubpattern && type != TypeParentheticalAssertion) { | 
| 1292 |         if (invert()) | 
| 1293 |             out.print("not " ); | 
| 1294 |     } | 
| 1295 |  | 
| 1296 |     switch (type) { | 
| 1297 |     case TypeAssertionBOL: | 
| 1298 |         out.println("BOL" ); | 
| 1299 |         break; | 
| 1300 |     case TypeAssertionEOL: | 
| 1301 |         out.println("EOL" ); | 
| 1302 |         break; | 
| 1303 |     case TypeAssertionWordBoundary: | 
| 1304 |         out.println("word boundary" ); | 
| 1305 |         break; | 
| 1306 |     case TypePatternCharacter: | 
| 1307 |         out.printf(format: "character " ); | 
| 1308 |         out.printf(format: "inputPosition %u " , inputPosition); | 
| 1309 |         if (thisPattern->ignoreCase() && isASCIIAlpha(patternCharacter)) { | 
| 1310 |             dumpUChar32(out, toASCIIUpper(patternCharacter)); | 
| 1311 |             out.print("/" ); | 
| 1312 |             dumpUChar32(out, toASCIILower(patternCharacter)); | 
| 1313 |         } else | 
| 1314 |             dumpUChar32(out, c: patternCharacter); | 
| 1315 |         dumpQuantifier(out); | 
| 1316 |         if (quantityType != QuantifierFixedCount) | 
| 1317 |             out.print(",frame location " , frameLocation); | 
| 1318 |         out.println(); | 
| 1319 |         break; | 
| 1320 |     case TypeCharacterClass: | 
| 1321 |         out.print("character class " ); | 
| 1322 |         dumpCharacterClass(out, pattern: thisPattern, characterClass: characterClass); | 
| 1323 |         dumpQuantifier(out); | 
| 1324 |         if (quantityType != QuantifierFixedCount || thisPattern->unicode()) | 
| 1325 |             out.print(",frame location " , frameLocation); | 
| 1326 |         out.println(); | 
| 1327 |         break; | 
| 1328 |     case TypeBackReference: | 
| 1329 |         out.print("back reference to subpattern #" , backReferenceSubpatternId); | 
| 1330 |         out.println(",frame location " , frameLocation); | 
| 1331 |         break; | 
| 1332 |     case TypeForwardReference: | 
| 1333 |         out.println("forward reference" ); | 
| 1334 |         break; | 
| 1335 |     case TypeParenthesesSubpattern: | 
| 1336 |         if (m_capture) | 
| 1337 |             out.print("captured " ); | 
| 1338 |         else | 
| 1339 |             out.print("non-captured " ); | 
| 1340 |  | 
| 1341 |         FALLTHROUGH; | 
| 1342 |     case TypeParentheticalAssertion: | 
| 1343 |         if (m_invert) | 
| 1344 |             out.print("inverted " ); | 
| 1345 |  | 
| 1346 |         if (type == TypeParenthesesSubpattern) | 
| 1347 |             out.print("subpattern" ); | 
| 1348 |         else if (type == TypeParentheticalAssertion) | 
| 1349 |             out.print("assertion" ); | 
| 1350 |  | 
| 1351 |         if (m_capture) | 
| 1352 |             out.print(" #" , parentheses.subpatternId); | 
| 1353 |  | 
| 1354 |         dumpQuantifier(out); | 
| 1355 |  | 
| 1356 |         if (parentheses.isCopy) | 
| 1357 |             out.print(",copy" ); | 
| 1358 |  | 
| 1359 |         if (parentheses.isTerminal) | 
| 1360 |             out.print(",terminal" ); | 
| 1361 |  | 
| 1362 |         out.println(",frame location " , frameLocation); | 
| 1363 |  | 
| 1364 |         if (parentheses.disjunction->m_alternatives.size() > 1) { | 
| 1365 |             indentForNestingLevel(out, nestingDepth: nestingDepth + 1); | 
| 1366 |             unsigned alternativeFrameLocation = frameLocation; | 
| 1367 |             if (quantityMaxCount == 1 && !parentheses.isCopy) | 
| 1368 |                 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesOnce; | 
| 1369 |             else if (parentheses.isTerminal) | 
| 1370 |                 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParenthesesTerminal; | 
| 1371 |             else | 
| 1372 |                 alternativeFrameLocation += YarrStackSpaceForBackTrackInfoParentheses; | 
| 1373 |             out.println("alternative list,frame location " , alternativeFrameLocation); | 
| 1374 |         } | 
| 1375 |  | 
| 1376 |         parentheses.disjunction->dump(out, thisPattern, nestingDepth + 1); | 
| 1377 |         break; | 
| 1378 |     case TypeDotStarEnclosure: | 
| 1379 |         out.println(".* enclosure,frame location " , thisPattern->m_initialStartValueFrameLocation); | 
| 1380 |         break; | 
| 1381 |     } | 
| 1382 | } | 
| 1383 |  | 
| 1384 | void PatternDisjunction::dump(PrintStream& out, YarrPattern* thisPattern, unsigned nestingDepth = 0) | 
| 1385 | { | 
| 1386 |     unsigned alternativeCount = m_alternatives.size(); | 
| 1387 |     for (unsigned i = 0; i < alternativeCount; ++i) { | 
| 1388 |         indentForNestingLevel(out, nestingDepth); | 
| 1389 |         if (alternativeCount > 1) | 
| 1390 |             out.print("alternative #" , i, ": " ); | 
| 1391 |         m_alternatives[i].get()->dump(out, thisPattern, nestingDepth + (alternativeCount > 1)); | 
| 1392 |     } | 
| 1393 | } | 
| 1394 |  | 
| 1395 | void YarrPattern::dumpPatternString(PrintStream& out, const String& patternString) | 
| 1396 | { | 
| 1397 |     out.print("/" , patternString, "/" ); | 
| 1398 |  | 
| 1399 |     if (global()) | 
| 1400 |         out.print("g" ); | 
| 1401 |     if (ignoreCase()) | 
| 1402 |         out.print("i" ); | 
| 1403 |     if (multiline()) | 
| 1404 |         out.print("m" ); | 
| 1405 |     if (unicode()) | 
| 1406 |         out.print("u" ); | 
| 1407 |     if (sticky()) | 
| 1408 |         out.print("y" ); | 
| 1409 | } | 
| 1410 |  | 
| 1411 | void YarrPattern::dumpPattern(const String& patternString) | 
| 1412 | { | 
| 1413 |     dumpPattern(out&: WTF::dataFile(), pattern: patternString); | 
| 1414 | } | 
| 1415 |  | 
| 1416 | void YarrPattern::dumpPattern(PrintStream& out, const String& patternString) | 
| 1417 | { | 
| 1418 |     out.print("RegExp pattern for " ); | 
| 1419 |     dumpPatternString(out, patternString); | 
| 1420 |  | 
| 1421 |     if (m_flags != NoFlags) { | 
| 1422 |         bool printSeperator = false; | 
| 1423 |         out.print(" (" ); | 
| 1424 |         if (global()) { | 
| 1425 |             out.print("global" ); | 
| 1426 |             printSeperator = true; | 
| 1427 |         } | 
| 1428 |         if (ignoreCase()) { | 
| 1429 |             if (printSeperator) | 
| 1430 |                 out.print("|" ); | 
| 1431 |             out.print("ignore case" ); | 
| 1432 |             printSeperator = true; | 
| 1433 |         } | 
| 1434 |         if (multiline()) { | 
| 1435 |             if (printSeperator) | 
| 1436 |                 out.print("|" ); | 
| 1437 |             out.print("multiline" ); | 
| 1438 |             printSeperator = true; | 
| 1439 |         } | 
| 1440 |         if (unicode()) { | 
| 1441 |             if (printSeperator) | 
| 1442 |                 out.print("|" ); | 
| 1443 |             out.print("unicode" ); | 
| 1444 |             printSeperator = true; | 
| 1445 |         } | 
| 1446 |         if (sticky()) { | 
| 1447 |             if (printSeperator) | 
| 1448 |                 out.print("|" ); | 
| 1449 |             out.print("sticky" ); | 
| 1450 |             printSeperator = true; | 
| 1451 |         } | 
| 1452 |         out.print(")" ); | 
| 1453 |     } | 
| 1454 |     out.print(":\n" ); | 
| 1455 |     if (m_body->m_callFrameSize) | 
| 1456 |         out.print("    callframe size: " , m_body->m_callFrameSize, "\n" ); | 
| 1457 |     m_body->dump(out, thisPattern: this); | 
| 1458 | } | 
| 1459 |  | 
| 1460 | std::unique_ptr<CharacterClass> anycharCreate() | 
| 1461 | { | 
| 1462 |     auto characterClass = std::make_unique<CharacterClass>(); | 
| 1463 |     characterClass->m_ranges.append(CharacterRange(0x00, 0x7f)); | 
| 1464 |     characterClass->m_rangesUnicode.append(CharacterRange(0x0080, 0x10ffff)); | 
| 1465 |     characterClass->m_hasNonBMPCharacters = true; | 
| 1466 |     characterClass->m_anyCharacter = true; | 
| 1467 |     return characterClass; | 
| 1468 | } | 
| 1469 |  | 
| 1470 | } } // namespace JSC::Yarr | 
| 1471 |  |