| 1 | /* | 
| 2 |  * Copyright (C) 2009, 2014-2016 Apple Inc. All rights reserved. | 
| 3 |  * | 
| 4 |  * Redistribution and use in source and binary forms, with or without | 
| 5 |  * modification, are permitted provided that the following conditions | 
| 6 |  * are met: | 
| 7 |  * 1. Redistributions of source code must retain the above copyright | 
| 8 |  *    notice, this list of conditions and the following disclaimer. | 
| 9 |  * 2. Redistributions in binary form must reproduce the above copyright | 
| 10 |  *    notice, this list of conditions and the following disclaimer in the | 
| 11 |  *    documentation and/or other materials provided with the distribution. | 
| 12 |  * | 
| 13 |  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | 
| 14 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
| 15 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 
| 16 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR | 
| 17 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | 
| 18 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | 
| 19 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 
| 20 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | 
| 21 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
| 22 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
| 23 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  | 
| 24 |  */ | 
| 25 |  | 
| 26 | #pragma once | 
| 27 |  | 
| 28 | #include "Yarr.h" | 
| 29 | #include "YarrPattern.h" | 
| 30 | #include "YarrUnicodeProperties.h" | 
| 31 | #include <wtf/ASCIICType.h> | 
| 32 | #include <wtf/HashSet.h> | 
| 33 | #include <wtf/Optional.h> | 
| 34 | #include <wtf/text/StringBuilder.h> | 
| 35 | #include <wtf/text/WTFString.h> | 
| 36 |  | 
| 37 | namespace JSC { namespace Yarr { | 
| 38 |  | 
| 39 | // The Parser class should not be used directly - only via the Yarr::parse() method. | 
| 40 | template<class Delegate, typename CharType> | 
| 41 | class Parser { | 
| 42 | private: | 
| 43 |     template<class FriendDelegate> | 
| 44 |     friend ErrorCode parse(FriendDelegate&, const String& pattern, bool isUnicode, unsigned backReferenceLimit); | 
| 45 |  | 
| 46 |     /* | 
| 47 |      * CharacterClassParserDelegate: | 
| 48 |      * | 
| 49 |      * The class CharacterClassParserDelegate is used in the parsing of character | 
| 50 |      * classes.  This class handles detection of character ranges.  This class | 
| 51 |      * implements enough of the delegate interface such that it can be passed to | 
| 52 |      * parseEscape() as an EscapeDelegate.  This allows parseEscape() to be reused | 
| 53 |      * to perform the parsing of escape characters in character sets. | 
| 54 |      */ | 
| 55 |     class CharacterClassParserDelegate { | 
| 56 |     public: | 
| 57 |         CharacterClassParserDelegate(Delegate& delegate, ErrorCode& err) | 
| 58 |             : m_delegate(delegate) | 
| 59 |             , m_errorCode(err) | 
| 60 |             , m_state(Empty) | 
| 61 |             , m_character(0) | 
| 62 |         { | 
| 63 |         } | 
| 64 |  | 
| 65 |         /* | 
| 66 |          * begin(): | 
| 67 |          * | 
| 68 |          * Called at beginning of construction. | 
| 69 |          */ | 
| 70 |         void begin(bool invert) | 
| 71 |         { | 
| 72 |             m_delegate.atomCharacterClassBegin(invert); | 
| 73 |         } | 
| 74 |  | 
| 75 |         /* | 
| 76 |          * atomPatternCharacter(): | 
| 77 |          * | 
| 78 |          * This method is called either from parseCharacterClass() (for an unescaped | 
| 79 |          * character in a character class), or from parseEscape(). In the former case | 
| 80 |          * the value true will be passed for the argument 'hyphenIsRange', and in this | 
| 81 |          * mode we will allow a hypen to be treated as indicating a range (i.e. /[a-z]/ | 
| 82 |          * is different to /[a\-z]/). | 
| 83 |          */ | 
| 84 |         void atomPatternCharacter(UChar32 ch, bool hyphenIsRange = false) | 
| 85 |         { | 
| 86 |             switch (m_state) { | 
| 87 |             case AfterCharacterClass: | 
| 88 |                 // Following a builtin character class we need look out for a hyphen. | 
| 89 |                 // We're looking for invalid ranges, such as /[\d-x]/ or /[\d-\d]/. | 
| 90 |                 // If we see a hyphen following a charater class then unlike usual | 
| 91 |                 // we'll report it to the delegate immediately, and put ourself into | 
| 92 |                 // a poisoned state. Any following calls to add another character or | 
| 93 |                 // character class will result in an error. (A hypen following a | 
| 94 |                 // character-class is itself valid, but only  at the end of a regex). | 
| 95 |                 if (hyphenIsRange && ch == '-') { | 
| 96 |                     m_delegate.atomCharacterClassAtom('-'); | 
| 97 |                     m_state = AfterCharacterClassHyphen; | 
| 98 |                     return; | 
| 99 |                 } | 
| 100 |                 // Otherwise just fall through - cached character so treat this as Empty. | 
| 101 |                 FALLTHROUGH; | 
| 102 |  | 
| 103 |             case Empty: | 
| 104 |                 m_character = ch; | 
| 105 |                 m_state = CachedCharacter; | 
| 106 |                 return; | 
| 107 |  | 
| 108 |             case CachedCharacter: | 
| 109 |                 if (hyphenIsRange && ch == '-') | 
| 110 |                     m_state = CachedCharacterHyphen; | 
| 111 |                 else { | 
| 112 |                     m_delegate.atomCharacterClassAtom(m_character); | 
| 113 |                     m_character = ch; | 
| 114 |                 } | 
| 115 |                 return; | 
| 116 |  | 
| 117 |             case CachedCharacterHyphen: | 
| 118 |                 if (ch < m_character) { | 
| 119 |                     m_errorCode = ErrorCode::CharacterClassOutOfOrder; | 
| 120 |                     return; | 
| 121 |                 } | 
| 122 |                 m_delegate.atomCharacterClassRange(m_character, ch); | 
| 123 |                 m_state = Empty; | 
| 124 |                 return; | 
| 125 |  | 
| 126 |                 // See coment in atomBuiltInCharacterClass below. | 
| 127 |                 // This too is technically an error, per ECMA-262, and again we | 
| 128 |                 // we chose to allow this.  Note a subtlely here that while we | 
| 129 |                 // diverge from the spec's definition of CharacterRange we do | 
| 130 |                 // remain in compliance with the grammar.  For example, consider | 
| 131 |                 // the expression /[\d-a-z]/.  We comply with the grammar in | 
| 132 |                 // this case by not allowing a-z to be matched as a range. | 
| 133 |             case AfterCharacterClassHyphen: | 
| 134 |                 m_delegate.atomCharacterClassAtom(ch); | 
| 135 |                 m_state = Empty; | 
| 136 |                 return; | 
| 137 |             } | 
| 138 |         } | 
| 139 |  | 
| 140 |         /* | 
| 141 |          * atomBuiltInCharacterClass(): | 
| 142 |          * | 
| 143 |          * Adds a built-in character class, called by parseEscape(). | 
| 144 |          */ | 
| 145 |         void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert) | 
| 146 |         { | 
| 147 |             switch (m_state) { | 
| 148 |             case CachedCharacter: | 
| 149 |                 // Flush the currently cached character, then fall through. | 
| 150 |                 m_delegate.atomCharacterClassAtom(m_character); | 
| 151 |                 FALLTHROUGH; | 
| 152 |             case Empty: | 
| 153 |             case AfterCharacterClass: | 
| 154 |                 m_state = AfterCharacterClass; | 
| 155 |                 m_delegate.atomCharacterClassBuiltIn(classID, invert); | 
| 156 |                 return; | 
| 157 |  | 
| 158 |                 // If we hit either of these cases, we have an invalid range that | 
| 159 |                 // looks something like /[x-\d]/ or /[\d-\d]/. | 
| 160 |                 // According to ECMA-262 this should be a syntax error, but | 
| 161 |                 // empirical testing shows this to break teh webz.  Instead we | 
| 162 |                 // comply with to the ECMA-262 grammar, and assume the grammar to | 
| 163 |                 // have matched the range correctly, but tweak our interpretation | 
| 164 |                 // of CharacterRange.  Effectively we implicitly handle the hyphen | 
| 165 |                 // as if it were escaped, e.g. /[\w-_]/ is treated as /[\w\-_]/. | 
| 166 |             case CachedCharacterHyphen: | 
| 167 |                 m_delegate.atomCharacterClassAtom(m_character); | 
| 168 |                 m_delegate.atomCharacterClassAtom('-'); | 
| 169 |                 FALLTHROUGH; | 
| 170 |             case AfterCharacterClassHyphen: | 
| 171 |                 m_delegate.atomCharacterClassBuiltIn(classID, invert); | 
| 172 |                 m_state = Empty; | 
| 173 |                 return; | 
| 174 |             } | 
| 175 |         } | 
| 176 |  | 
| 177 |         /* | 
| 178 |          * end(): | 
| 179 |          * | 
| 180 |          * Called at end of construction. | 
| 181 |          */ | 
| 182 |         void end() | 
| 183 |         { | 
| 184 |             if (m_state == CachedCharacter) | 
| 185 |                 m_delegate.atomCharacterClassAtom(m_character); | 
| 186 |             else if (m_state == CachedCharacterHyphen) { | 
| 187 |                 m_delegate.atomCharacterClassAtom(m_character); | 
| 188 |                 m_delegate.atomCharacterClassAtom('-'); | 
| 189 |             } | 
| 190 |             m_delegate.atomCharacterClassEnd(); | 
| 191 |         } | 
| 192 |  | 
| 193 |         // parseEscape() should never call these delegate methods when | 
| 194 |         // invoked with inCharacterClass set. | 
| 195 |         NO_RETURN_DUE_TO_ASSERT void assertionWordBoundary(bool) { RELEASE_ASSERT_NOT_REACHED(); } | 
| 196 |         NO_RETURN_DUE_TO_ASSERT void atomBackReference(unsigned) { RELEASE_ASSERT_NOT_REACHED(); } | 
| 197 |         NO_RETURN_DUE_TO_ASSERT void atomNamedBackReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); } | 
| 198 |         bool isValidNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); return false; } | 
| 199 |         NO_RETURN_DUE_TO_ASSERT void atomNamedForwardReference(const String&) { RELEASE_ASSERT_NOT_REACHED(); } | 
| 200 |  | 
| 201 |     private: | 
| 202 |         Delegate& m_delegate; | 
| 203 |         ErrorCode& m_errorCode; | 
| 204 |         enum CharacterClassConstructionState { | 
| 205 |             Empty, | 
| 206 |             CachedCharacter, | 
| 207 |             CachedCharacterHyphen, | 
| 208 |             AfterCharacterClass, | 
| 209 |             AfterCharacterClassHyphen, | 
| 210 |         } m_state; | 
| 211 |         UChar32 m_character; | 
| 212 |     }; | 
| 213 |  | 
| 214 |     Parser(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit) | 
| 215 |         : m_delegate(delegate) | 
| 216 |         , m_backReferenceLimit(backReferenceLimit) | 
| 217 |         , m_data(pattern.characters<CharType>()) | 
| 218 |         , m_size(pattern.length()) | 
| 219 |         , m_isUnicode(isUnicode) | 
| 220 |     { | 
| 221 |     } | 
| 222 |  | 
| 223 |     // The handling of IdentityEscapes is different depending on the unicode flag. | 
| 224 |     // For Unicode patterns, IdentityEscapes only include SyntaxCharacters or '/'. | 
| 225 |     // For non-unicode patterns, most any character can be escaped. | 
| 226 |     bool isIdentityEscapeAnError(int ch) | 
| 227 |     { | 
| 228 |         if (m_isUnicode && !strchr(s: "^$\\.*+?()[]{}|/" , c: ch)) { | 
| 229 |             m_errorCode = ErrorCode::InvalidIdentityEscape; | 
| 230 |             return true; | 
| 231 |         } | 
| 232 |  | 
| 233 |         return false; | 
| 234 |     } | 
| 235 |  | 
| 236 |     /* | 
| 237 |      * parseEscape(): | 
| 238 |      * | 
| 239 |      * Helper for parseTokens() AND parseCharacterClass(). | 
| 240 |      * Unlike the other parser methods, this function does not report tokens | 
| 241 |      * directly to the member delegate (m_delegate), instead tokens are | 
| 242 |      * emitted to the delegate provided as an argument.  In the case of atom | 
| 243 |      * escapes, parseTokens() will call parseEscape() passing m_delegate as | 
| 244 |      * an argument, and as such the escape will be reported to the delegate. | 
| 245 |      * | 
| 246 |      * However this method may also be used by parseCharacterClass(), in which | 
| 247 |      * case a CharacterClassParserDelegate will be passed as the delegate that | 
| 248 |      * tokens should be added to.  A boolean flag is also provided to indicate | 
| 249 |      * whether that an escape in a CharacterClass is being parsed (some parsing | 
| 250 |      * rules change in this context). | 
| 251 |      * | 
| 252 |      * The boolean value returned by this method indicates whether the token | 
| 253 |      * parsed was an atom (outside of a characted class \b and \B will be | 
| 254 |      * interpreted as assertions). | 
| 255 |      */ | 
| 256 |     template<bool inCharacterClass, class EscapeDelegate> | 
| 257 |     bool parseEscape(EscapeDelegate& delegate) | 
| 258 |     { | 
| 259 |         ASSERT(!hasError(m_errorCode)); | 
| 260 |         ASSERT(peek() == '\\'); | 
| 261 |         consume(); | 
| 262 |  | 
| 263 |         if (atEndOfPattern()) { | 
| 264 |             m_errorCode = ErrorCode::EscapeUnterminated; | 
| 265 |             return false; | 
| 266 |         } | 
| 267 |  | 
| 268 |         switch (peek()) { | 
| 269 |         // Assertions | 
| 270 |         case 'b': | 
| 271 |             consume(); | 
| 272 |             if (inCharacterClass) { | 
| 273 |                 if (isIdentityEscapeAnError(ch: 'b')) | 
| 274 |                     break; | 
| 275 |  | 
| 276 |                 delegate.atomPatternCharacter('\b'); | 
| 277 |             } else { | 
| 278 |                 delegate.assertionWordBoundary(false); | 
| 279 |                 return false; | 
| 280 |             } | 
| 281 |             break; | 
| 282 |         case 'B': | 
| 283 |             consume(); | 
| 284 |             if (inCharacterClass) { | 
| 285 |                 if (isIdentityEscapeAnError(ch: 'B')) | 
| 286 |                     break; | 
| 287 |  | 
| 288 |                 delegate.atomPatternCharacter('B'); | 
| 289 |             } else { | 
| 290 |                 delegate.assertionWordBoundary(true); | 
| 291 |                 return false; | 
| 292 |             } | 
| 293 |             break; | 
| 294 |  | 
| 295 |         // CharacterClassEscape | 
| 296 |         case 'd': | 
| 297 |             consume(); | 
| 298 |             delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, false); | 
| 299 |             break; | 
| 300 |         case 's': | 
| 301 |             consume(); | 
| 302 |             delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, false); | 
| 303 |             break; | 
| 304 |         case 'w': | 
| 305 |             consume(); | 
| 306 |             delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, false); | 
| 307 |             break; | 
| 308 |         case 'D': | 
| 309 |             consume(); | 
| 310 |             delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DigitClassID, true); | 
| 311 |             break; | 
| 312 |         case 'S': | 
| 313 |             consume(); | 
| 314 |             delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::SpaceClassID, true); | 
| 315 |             break; | 
| 316 |         case 'W': | 
| 317 |             consume(); | 
| 318 |             delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::WordClassID, true); | 
| 319 |             break; | 
| 320 |  | 
| 321 |         // DecimalEscape | 
| 322 |         case '1': | 
| 323 |         case '2': | 
| 324 |         case '3': | 
| 325 |         case '4': | 
| 326 |         case '5': | 
| 327 |         case '6': | 
| 328 |         case '7': | 
| 329 |         case '8': | 
| 330 |         case '9': { | 
| 331 |             // To match Firefox, we parse an invalid backreference in the range [1-7] as an octal escape. | 
| 332 |             // First, try to parse this as backreference. | 
| 333 |             if (!inCharacterClass) { | 
| 334 |                 ParseState state = saveState(); | 
| 335 |  | 
| 336 |                 unsigned backReference = consumeNumber(); | 
| 337 |                 if (backReference <= m_backReferenceLimit) { | 
| 338 |                     delegate.atomBackReference(backReference); | 
| 339 |                     break; | 
| 340 |                 } | 
| 341 |  | 
| 342 |                 restoreState(state); | 
| 343 |  | 
| 344 |                 if (m_isUnicode) { | 
| 345 |                     m_errorCode = ErrorCode::InvalidBackreference; | 
| 346 |                     return false; | 
| 347 |                 } | 
| 348 |             } | 
| 349 |  | 
| 350 |             // Not a backreference, and not octal. Just a number. | 
| 351 |             if (peek() >= '8') { | 
| 352 |                 delegate.atomPatternCharacter(consume()); | 
| 353 |                 break; | 
| 354 |             } | 
| 355 |  | 
| 356 |             // Fall-through to handle this as an octal escape. | 
| 357 |             FALLTHROUGH; | 
| 358 |         } | 
| 359 |  | 
| 360 |         // Octal escape | 
| 361 |         case '0': | 
| 362 |             delegate.atomPatternCharacter(consumeOctal()); | 
| 363 |             break; | 
| 364 |  | 
| 365 |         // ControlEscape | 
| 366 |         case 'f': | 
| 367 |             consume(); | 
| 368 |             delegate.atomPatternCharacter('\f'); | 
| 369 |             break; | 
| 370 |         case 'n': | 
| 371 |             consume(); | 
| 372 |             delegate.atomPatternCharacter('\n'); | 
| 373 |             break; | 
| 374 |         case 'r': | 
| 375 |             consume(); | 
| 376 |             delegate.atomPatternCharacter('\r'); | 
| 377 |             break; | 
| 378 |         case 't': | 
| 379 |             consume(); | 
| 380 |             delegate.atomPatternCharacter('\t'); | 
| 381 |             break; | 
| 382 |         case 'v': | 
| 383 |             consume(); | 
| 384 |             delegate.atomPatternCharacter('\v'); | 
| 385 |             break; | 
| 386 |  | 
| 387 |         // ControlLetter | 
| 388 |         case 'c': { | 
| 389 |             ParseState state = saveState(); | 
| 390 |             consume(); | 
| 391 |             if (!atEndOfPattern()) { | 
| 392 |                 int control = consume(); | 
| 393 |  | 
| 394 |                 // To match Firefox, inside a character class, we also accept numbers and '_' as control characters. | 
| 395 |                 if (inCharacterClass ? WTF::isASCIIAlphanumeric(c: control) || (control == '_') : WTF::isASCIIAlpha(c: control)) { | 
| 396 |                     delegate.atomPatternCharacter(control & 0x1f); | 
| 397 |                     break; | 
| 398 |                 } | 
| 399 |             } | 
| 400 |             restoreState(state); | 
| 401 |             delegate.atomPatternCharacter('\\'); | 
| 402 |             break; | 
| 403 |         } | 
| 404 |  | 
| 405 |         // HexEscape | 
| 406 |         case 'x': { | 
| 407 |             consume(); | 
| 408 |             int x = tryConsumeHex(count: 2); | 
| 409 |             if (x == -1) { | 
| 410 |                 if (isIdentityEscapeAnError(ch: 'x')) | 
| 411 |                     break; | 
| 412 |  | 
| 413 |                 delegate.atomPatternCharacter('x'); | 
| 414 |             } else | 
| 415 |                 delegate.atomPatternCharacter(x); | 
| 416 |             break; | 
| 417 |         } | 
| 418 |  | 
| 419 |         // Named backreference | 
| 420 |         case 'k': { | 
| 421 |             consume(); | 
| 422 |             ParseState state = saveState(); | 
| 423 |             if (!atEndOfPattern() && !inCharacterClass) { | 
| 424 |                 if (consume() == '<') { | 
| 425 |                     auto groupName = tryConsumeGroupName(); | 
| 426 |                     if (groupName) { | 
| 427 |                         if (m_captureGroupNames.contains(groupName.value())) { | 
| 428 |                             delegate.atomNamedBackReference(groupName.value()); | 
| 429 |                             break; | 
| 430 |                         } | 
| 431 |  | 
| 432 |                         if (delegate.isValidNamedForwardReference(groupName.value())) { | 
| 433 |                             delegate.atomNamedForwardReference(groupName.value()); | 
| 434 |                             break; | 
| 435 |                         } | 
| 436 |                     } | 
| 437 |                     if (m_isUnicode) { | 
| 438 |                         m_errorCode = ErrorCode::InvalidBackreference; | 
| 439 |                         break; | 
| 440 |                     } | 
| 441 |                 } | 
| 442 |             } | 
| 443 |             restoreState(state); | 
| 444 |             delegate.atomPatternCharacter('k'); | 
| 445 |             break; | 
| 446 |         } | 
| 447 |  | 
| 448 |         // Unicode property escapes | 
| 449 |         case 'p': | 
| 450 |         case 'P': { | 
| 451 |             int escapeChar = consume(); | 
| 452 |  | 
| 453 |             if (!m_isUnicode) { | 
| 454 |                 if (isIdentityEscapeAnError(ch: escapeChar)) | 
| 455 |                     break; | 
| 456 |                 delegate.atomPatternCharacter(escapeChar); | 
| 457 |                 break; | 
| 458 |             } | 
| 459 |  | 
| 460 |             if (!atEndOfPattern() && peek() == '{') { | 
| 461 |                 consume(); | 
| 462 |                 auto optClassID = tryConsumeUnicodePropertyExpression(); | 
| 463 |                 if (!optClassID) { | 
| 464 |                     // tryConsumeUnicodePropertyExpression() will set m_errorCode for a malformed property expression | 
| 465 |                     break; | 
| 466 |                 } | 
| 467 |                 delegate.atomBuiltInCharacterClass(optClassID.value(), escapeChar == 'P'); | 
| 468 |             } else | 
| 469 |                 m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; | 
| 470 |             break; | 
| 471 |         } | 
| 472 |  | 
| 473 |         // UnicodeEscape | 
| 474 |         case 'u': { | 
| 475 |             consume(); | 
| 476 |             if (atEndOfPattern()) { | 
| 477 |                 if (isIdentityEscapeAnError(ch: 'u')) | 
| 478 |                     break; | 
| 479 |  | 
| 480 |                 delegate.atomPatternCharacter('u'); | 
| 481 |                 break; | 
| 482 |             } | 
| 483 |  | 
| 484 |             if (m_isUnicode && peek() == '{') { | 
| 485 |                 consume(); | 
| 486 |                 UChar32 codePoint = 0; | 
| 487 |                 do { | 
| 488 |                     if (atEndOfPattern() || !isASCIIHexDigit(peek())) { | 
| 489 |                         m_errorCode = ErrorCode::InvalidUnicodeEscape; | 
| 490 |                         break; | 
| 491 |                     } | 
| 492 |  | 
| 493 |                     codePoint = (codePoint << 4) | toASCIIHexValue(consume()); | 
| 494 |  | 
| 495 |                     if (codePoint > UCHAR_MAX_VALUE) | 
| 496 |                         m_errorCode = ErrorCode::InvalidUnicodeEscape; | 
| 497 |                 } while (!atEndOfPattern() && peek() != '}'); | 
| 498 |                 if (!atEndOfPattern() && peek() == '}') | 
| 499 |                     consume(); | 
| 500 |                 else if (!hasError(errorCode: m_errorCode)) | 
| 501 |                     m_errorCode = ErrorCode::InvalidUnicodeEscape; | 
| 502 |                 if (hasError(errorCode: m_errorCode)) | 
| 503 |                     return false; | 
| 504 |  | 
| 505 |                 delegate.atomPatternCharacter(codePoint); | 
| 506 |                 break; | 
| 507 |             } | 
| 508 |             int u = tryConsumeHex(count: 4); | 
| 509 |             if (u == -1) { | 
| 510 |                 if (isIdentityEscapeAnError(ch: 'u')) | 
| 511 |                     break; | 
| 512 |  | 
| 513 |                 delegate.atomPatternCharacter('u'); | 
| 514 |             } else { | 
| 515 |                 // If we have the first of a surrogate pair, look for the second. | 
| 516 |                 if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') { | 
| 517 |                     ParseState state = saveState(); | 
| 518 |                     consume(); | 
| 519 |                      | 
| 520 |                     if (tryConsume(ch: 'u')) { | 
| 521 |                         int surrogate2 = tryConsumeHex(count: 4); | 
| 522 |                         if (U16_IS_TRAIL(surrogate2)) { | 
| 523 |                             u = U16_GET_SUPPLEMENTARY(u, surrogate2); | 
| 524 |                             delegate.atomPatternCharacter(u); | 
| 525 |                             break; | 
| 526 |                         } | 
| 527 |                     } | 
| 528 |  | 
| 529 |                     restoreState(state); | 
| 530 |                 } | 
| 531 |                 delegate.atomPatternCharacter(u); | 
| 532 |             } | 
| 533 |             break; | 
| 534 |         } | 
| 535 |  | 
| 536 |         // IdentityEscape | 
| 537 |         default: | 
| 538 |             int ch = peek(); | 
| 539 |  | 
| 540 |             if (ch == '-' && m_isUnicode && inCharacterClass) { | 
| 541 |                 // \- is allowed for ClassEscape with unicode flag. | 
| 542 |                 delegate.atomPatternCharacter(consume()); | 
| 543 |                 break; | 
| 544 |             } | 
| 545 |  | 
| 546 |             if (isIdentityEscapeAnError(ch)) | 
| 547 |                 break; | 
| 548 |  | 
| 549 |             delegate.atomPatternCharacter(consume()); | 
| 550 |         } | 
| 551 |          | 
| 552 |         return true; | 
| 553 |     } | 
| 554 |  | 
| 555 |     UChar32 consumePossibleSurrogatePair() | 
| 556 |     { | 
| 557 |         UChar32 ch = consume(); | 
| 558 |         if (U16_IS_LEAD(ch) && m_isUnicode && (patternRemaining() > 0)) { | 
| 559 |             ParseState state = saveState(); | 
| 560 |  | 
| 561 |             UChar32 surrogate2 = consume(); | 
| 562 |             if (U16_IS_TRAIL(surrogate2)) | 
| 563 |                 ch = U16_GET_SUPPLEMENTARY(ch, surrogate2); | 
| 564 |             else | 
| 565 |                 restoreState(state); | 
| 566 |         } | 
| 567 |  | 
| 568 |         return ch; | 
| 569 |     } | 
| 570 |  | 
| 571 |     /* | 
| 572 |      * parseAtomEscape(), parseCharacterClassEscape(): | 
| 573 |      * | 
| 574 |      * These methods alias to parseEscape(). | 
| 575 |      */ | 
| 576 |     bool parseAtomEscape() | 
| 577 |     { | 
| 578 |         return parseEscape<false>(m_delegate); | 
| 579 |     } | 
| 580 |     void parseCharacterClassEscape(CharacterClassParserDelegate& delegate) | 
| 581 |     { | 
| 582 |         parseEscape<true>(delegate); | 
| 583 |     } | 
| 584 |  | 
| 585 |     /* | 
| 586 |      * parseCharacterClass(): | 
| 587 |      * | 
| 588 |      * Helper for parseTokens(); calls dirctly and indirectly (via parseCharacterClassEscape) | 
| 589 |      * to an instance of CharacterClassParserDelegate, to describe the character class to the | 
| 590 |      * delegate. | 
| 591 |      */ | 
| 592 |     void parseCharacterClass() | 
| 593 |     { | 
| 594 |         ASSERT(!hasError(m_errorCode)); | 
| 595 |         ASSERT(peek() == '['); | 
| 596 |         consume(); | 
| 597 |  | 
| 598 |         CharacterClassParserDelegate characterClassConstructor(m_delegate, m_errorCode); | 
| 599 |  | 
| 600 |         characterClassConstructor.begin(tryConsume(ch: '^')); | 
| 601 |  | 
| 602 |         while (!atEndOfPattern()) { | 
| 603 |             switch (peek()) { | 
| 604 |             case ']': | 
| 605 |                 consume(); | 
| 606 |                 characterClassConstructor.end(); | 
| 607 |                 return; | 
| 608 |  | 
| 609 |             case '\\': | 
| 610 |                 parseCharacterClassEscape(delegate&: characterClassConstructor); | 
| 611 |                 break; | 
| 612 |  | 
| 613 |             default: | 
| 614 |                 characterClassConstructor.atomPatternCharacter(consumePossibleSurrogatePair(), true); | 
| 615 |             } | 
| 616 |  | 
| 617 |             if (hasError(errorCode: m_errorCode)) | 
| 618 |                 return; | 
| 619 |         } | 
| 620 |  | 
| 621 |         m_errorCode = ErrorCode::CharacterClassUnmatched; | 
| 622 |     } | 
| 623 |  | 
| 624 |     /* | 
| 625 |      * parseParenthesesBegin(): | 
| 626 |      * | 
| 627 |      * Helper for parseTokens(); checks for parentheses types other than regular capturing subpatterns. | 
| 628 |      */ | 
| 629 |     void parseParenthesesBegin() | 
| 630 |     { | 
| 631 |         ASSERT(!hasError(m_errorCode)); | 
| 632 |         ASSERT(peek() == '('); | 
| 633 |         consume(); | 
| 634 |  | 
| 635 |         if (tryConsume(ch: '?')) { | 
| 636 |             if (atEndOfPattern()) { | 
| 637 |                 m_errorCode = ErrorCode::ParenthesesTypeInvalid; | 
| 638 |                 return; | 
| 639 |             } | 
| 640 |  | 
| 641 |             switch (consume()) { | 
| 642 |             case ':': | 
| 643 |                 m_delegate.atomParenthesesSubpatternBegin(false); | 
| 644 |                 break; | 
| 645 |              | 
| 646 |             case '=': | 
| 647 |                 m_delegate.atomParentheticalAssertionBegin(); | 
| 648 |                 break; | 
| 649 |  | 
| 650 |             case '!': | 
| 651 |                 m_delegate.atomParentheticalAssertionBegin(true); | 
| 652 |                 break; | 
| 653 |  | 
| 654 |             case '<': { | 
| 655 |                 auto groupName = tryConsumeGroupName(); | 
| 656 |                 if (groupName) { | 
| 657 |                     auto setAddResult = m_captureGroupNames.add(k: groupName.value()); | 
| 658 |                     if (setAddResult.isNewEntry) | 
| 659 |                         m_delegate.atomParenthesesSubpatternBegin(true, groupName); | 
| 660 |                     else | 
| 661 |                         m_errorCode = ErrorCode::DuplicateGroupName; | 
| 662 |                 } else | 
| 663 |                     m_errorCode = ErrorCode::InvalidGroupName; | 
| 664 |  | 
| 665 |                 break; | 
| 666 |             } | 
| 667 |  | 
| 668 |             default: | 
| 669 |                 m_errorCode = ErrorCode::ParenthesesTypeInvalid; | 
| 670 |             } | 
| 671 |         } else | 
| 672 |             m_delegate.atomParenthesesSubpatternBegin(); | 
| 673 |  | 
| 674 |         ++m_parenthesesNestingDepth; | 
| 675 |     } | 
| 676 |  | 
| 677 |     /* | 
| 678 |      * parseParenthesesEnd(): | 
| 679 |      * | 
| 680 |      * Helper for parseTokens(); checks for parse errors (due to unmatched parentheses). | 
| 681 |      */ | 
| 682 |     void parseParenthesesEnd() | 
| 683 |     { | 
| 684 |         ASSERT(!hasError(m_errorCode)); | 
| 685 |         ASSERT(peek() == ')'); | 
| 686 |         consume(); | 
| 687 |  | 
| 688 |         if (m_parenthesesNestingDepth > 0) | 
| 689 |             m_delegate.atomParenthesesEnd(); | 
| 690 |         else | 
| 691 |             m_errorCode = ErrorCode::ParenthesesUnmatched; | 
| 692 |  | 
| 693 |         --m_parenthesesNestingDepth; | 
| 694 |     } | 
| 695 |  | 
| 696 |     /* | 
| 697 |      * parseQuantifier(): | 
| 698 |      * | 
| 699 |      * Helper for parseTokens(); checks for parse errors and non-greedy quantifiers. | 
| 700 |      */ | 
| 701 |     void parseQuantifier(bool lastTokenWasAnAtom, unsigned min, unsigned max) | 
| 702 |     { | 
| 703 |         ASSERT(!hasError(m_errorCode)); | 
| 704 |         ASSERT(min <= max); | 
| 705 |  | 
| 706 |         const unsigned quantifyLimit = 1 << 24; | 
| 707 |         if (min > quantifyLimit || (max != quantifyInfinite && max > quantifyLimit)) { | 
| 708 |             m_errorCode = ErrorCode::QuantifierTooLarge; | 
| 709 |             return; | 
| 710 |         } | 
| 711 |  | 
| 712 |         if (lastTokenWasAnAtom) | 
| 713 |             m_delegate.quantifyAtom(min, max, !tryConsume(ch: '?')); | 
| 714 |         else | 
| 715 |             m_errorCode = ErrorCode::QuantifierWithoutAtom; | 
| 716 |     } | 
| 717 |  | 
| 718 |     /* | 
| 719 |      * parseTokens(): | 
| 720 |      * | 
| 721 |      * This method loops over the input pattern reporting tokens to the delegate. | 
| 722 |      * The method returns when a parse error is detected, or the end of the pattern | 
| 723 |      * is reached.  One piece of state is tracked around the loop, which is whether | 
| 724 |      * the last token passed to the delegate was an atom (this is necessary to detect | 
| 725 |      * a parse error when a quantifier provided without an atom to quantify). | 
| 726 |      */ | 
| 727 |     void parseTokens() | 
| 728 |     { | 
| 729 |         bool lastTokenWasAnAtom = false; | 
| 730 |  | 
| 731 |         while (!atEndOfPattern()) { | 
| 732 |             switch (peek()) { | 
| 733 |             case '|': | 
| 734 |                 consume(); | 
| 735 |                 m_delegate.disjunction(); | 
| 736 |                 lastTokenWasAnAtom = false; | 
| 737 |                 break; | 
| 738 |  | 
| 739 |             case '(': | 
| 740 |                 parseParenthesesBegin(); | 
| 741 |                 lastTokenWasAnAtom = false; | 
| 742 |                 break; | 
| 743 |  | 
| 744 |             case ')': | 
| 745 |                 parseParenthesesEnd(); | 
| 746 |                 lastTokenWasAnAtom = true; | 
| 747 |                 break; | 
| 748 |  | 
| 749 |             case '^': | 
| 750 |                 consume(); | 
| 751 |                 m_delegate.assertionBOL(); | 
| 752 |                 lastTokenWasAnAtom = false; | 
| 753 |                 break; | 
| 754 |  | 
| 755 |             case '$': | 
| 756 |                 consume(); | 
| 757 |                 m_delegate.assertionEOL(); | 
| 758 |                 lastTokenWasAnAtom = false; | 
| 759 |                 break; | 
| 760 |  | 
| 761 |             case '.': | 
| 762 |                 consume(); | 
| 763 |                 m_delegate.atomBuiltInCharacterClass(BuiltInCharacterClassID::DotClassID, false); | 
| 764 |                 lastTokenWasAnAtom = true; | 
| 765 |                 break; | 
| 766 |  | 
| 767 |             case '[': | 
| 768 |                 parseCharacterClass(); | 
| 769 |                 lastTokenWasAnAtom = true; | 
| 770 |                 break; | 
| 771 |  | 
| 772 |             case '\\': | 
| 773 |                 lastTokenWasAnAtom = parseAtomEscape(); | 
| 774 |                 break; | 
| 775 |  | 
| 776 |             case '*': | 
| 777 |                 consume(); | 
| 778 |                 parseQuantifier(lastTokenWasAnAtom, min: 0, max: quantifyInfinite); | 
| 779 |                 lastTokenWasAnAtom = false; | 
| 780 |                 break; | 
| 781 |  | 
| 782 |             case '+': | 
| 783 |                 consume(); | 
| 784 |                 parseQuantifier(lastTokenWasAnAtom, min: 1, max: quantifyInfinite); | 
| 785 |                 lastTokenWasAnAtom = false; | 
| 786 |                 break; | 
| 787 |  | 
| 788 |             case '?': | 
| 789 |                 consume(); | 
| 790 |                 parseQuantifier(lastTokenWasAnAtom, min: 0, max: 1); | 
| 791 |                 lastTokenWasAnAtom = false; | 
| 792 |                 break; | 
| 793 |  | 
| 794 |             case '{': { | 
| 795 |                 ParseState state = saveState(); | 
| 796 |  | 
| 797 |                 consume(); | 
| 798 |                 if (peekIsDigit()) { | 
| 799 |                     unsigned min = consumeNumber(); | 
| 800 |                     unsigned max = min; | 
| 801 |                      | 
| 802 |                     if (tryConsume(ch: ',')) | 
| 803 |                         max = peekIsDigit() ? consumeNumber() : quantifyInfinite; | 
| 804 |  | 
| 805 |                     if (tryConsume(ch: '}')) { | 
| 806 |                         if (min <= max) | 
| 807 |                             parseQuantifier(lastTokenWasAnAtom, min, max); | 
| 808 |                         else | 
| 809 |                             m_errorCode = ErrorCode::QuantifierOutOfOrder; | 
| 810 |                         lastTokenWasAnAtom = false; | 
| 811 |                         break; | 
| 812 |                     } | 
| 813 |                 } | 
| 814 |  | 
| 815 |                 restoreState(state); | 
| 816 |             } | 
| 817 |             // if we did not find a complete quantifer, fall through to the default case. | 
| 818 |             FALLTHROUGH; | 
| 819 |  | 
| 820 |             default: | 
| 821 |                 m_delegate.atomPatternCharacter(consumePossibleSurrogatePair()); | 
| 822 |                 lastTokenWasAnAtom = true; | 
| 823 |             } | 
| 824 |  | 
| 825 |             if (hasError(errorCode: m_errorCode)) | 
| 826 |                 return; | 
| 827 |         } | 
| 828 |  | 
| 829 |         if (m_parenthesesNestingDepth > 0) | 
| 830 |             m_errorCode = ErrorCode::MissingParentheses; | 
| 831 |     } | 
| 832 |  | 
| 833 |     /* | 
| 834 |      * parse(): | 
| 835 |      * | 
| 836 |      * This method calls parseTokens() to parse over the input and returns error code for a result. | 
| 837 |      */ | 
| 838 |     ErrorCode parse() | 
| 839 |     { | 
| 840 |         if (m_size > MAX_PATTERN_SIZE) | 
| 841 |             m_errorCode = ErrorCode::PatternTooLarge; | 
| 842 |         else | 
| 843 |             parseTokens(); | 
| 844 |         ASSERT(atEndOfPattern() || hasError(m_errorCode)); | 
| 845 |          | 
| 846 |         return m_errorCode; | 
| 847 |     } | 
| 848 |  | 
| 849 |     // Misc helper functions: | 
| 850 |  | 
| 851 |     typedef unsigned ParseState; | 
| 852 |      | 
| 853 |     ParseState saveState() | 
| 854 |     { | 
| 855 |         return m_index; | 
| 856 |     } | 
| 857 |  | 
| 858 |     void restoreState(ParseState state) | 
| 859 |     { | 
| 860 |         m_index = state; | 
| 861 |     } | 
| 862 |  | 
| 863 |     bool atEndOfPattern() | 
| 864 |     { | 
| 865 |         ASSERT(m_index <= m_size); | 
| 866 |         return m_index == m_size; | 
| 867 |     } | 
| 868 |  | 
| 869 |     unsigned patternRemaining() | 
| 870 |     { | 
| 871 |         ASSERT(m_index <= m_size); | 
| 872 |         return m_size - m_index; | 
| 873 |     } | 
| 874 |  | 
| 875 |     int peek() | 
| 876 |     { | 
| 877 |         ASSERT(m_index < m_size); | 
| 878 |         return m_data[m_index]; | 
| 879 |     } | 
| 880 |  | 
| 881 |     bool peekIsDigit() | 
| 882 |     { | 
| 883 |         return !atEndOfPattern() && WTF::isASCIIDigit(peek()); | 
| 884 |     } | 
| 885 |  | 
| 886 |     unsigned peekDigit() | 
| 887 |     { | 
| 888 |         ASSERT(peekIsDigit()); | 
| 889 |         return peek() - '0'; | 
| 890 |     } | 
| 891 |  | 
| 892 |     int tryConsumeUnicodeEscape() | 
| 893 |     { | 
| 894 |         if (!tryConsume(ch: 'u')) | 
| 895 |             return -1; | 
| 896 |  | 
| 897 |         if (m_isUnicode && tryConsume(ch: '{')) { | 
| 898 |             int codePoint = 0; | 
| 899 |             do { | 
| 900 |                 if (atEndOfPattern() || !isASCIIHexDigit(peek())) { | 
| 901 |                     m_errorCode = ErrorCode::InvalidUnicodeEscape; | 
| 902 |                     return -1; | 
| 903 |                 } | 
| 904 |  | 
| 905 |                 codePoint = (codePoint << 4) | toASCIIHexValue(consume()); | 
| 906 |  | 
| 907 |                 if (codePoint > UCHAR_MAX_VALUE) { | 
| 908 |                     m_errorCode = ErrorCode::InvalidUnicodeEscape; | 
| 909 |                     return -1; | 
| 910 |                 } | 
| 911 |             } while (!atEndOfPattern() && peek() != '}'); | 
| 912 |             if (!atEndOfPattern() && peek() == '}') | 
| 913 |                 consume(); | 
| 914 |             else if (!hasError(errorCode: m_errorCode)) | 
| 915 |                 m_errorCode = ErrorCode::InvalidUnicodeEscape; | 
| 916 |             if (hasError(errorCode: m_errorCode)) | 
| 917 |                 return -1; | 
| 918 |  | 
| 919 |             return codePoint; | 
| 920 |         } | 
| 921 |  | 
| 922 |         int u = tryConsumeHex(count: 4); | 
| 923 |         if (u == -1) | 
| 924 |             return -1; | 
| 925 |  | 
| 926 |         // If we have the first of a surrogate pair, look for the second. | 
| 927 |         if (U16_IS_LEAD(u) && m_isUnicode && (patternRemaining() >= 6) && peek() == '\\') { | 
| 928 |             ParseState state = saveState(); | 
| 929 |             consume(); | 
| 930 |  | 
| 931 |             if (tryConsume(ch: 'u')) { | 
| 932 |                 int surrogate2 = tryConsumeHex(count: 4); | 
| 933 |                 if (U16_IS_TRAIL(surrogate2)) { | 
| 934 |                     u = U16_GET_SUPPLEMENTARY(u, surrogate2); | 
| 935 |                     return u; | 
| 936 |                 } | 
| 937 |             } | 
| 938 |  | 
| 939 |             restoreState(state); | 
| 940 |         } | 
| 941 |  | 
| 942 |         return u; | 
| 943 |     } | 
| 944 |  | 
| 945 |     int tryConsumeIdentifierCharacter() | 
| 946 |     { | 
| 947 |         int ch = peek(); | 
| 948 |  | 
| 949 |         if (ch == '\\') { | 
| 950 |             consume(); | 
| 951 |             ch = tryConsumeUnicodeEscape(); | 
| 952 |         } else | 
| 953 |             consume(); | 
| 954 |  | 
| 955 |         return ch; | 
| 956 |     } | 
| 957 |  | 
| 958 |     bool isIdentifierStart(int ch) | 
| 959 |     { | 
| 960 |         return (WTF::isASCII(c: ch) && (WTF::isASCIIAlpha(c: ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & U_GC_L_MASK); | 
| 961 |     } | 
| 962 |  | 
| 963 |     bool isIdentifierPart(int ch) | 
| 964 |     { | 
| 965 |         return (WTF::isASCII(c: ch) && (WTF::isASCIIAlpha(c: ch) || ch == '_' || ch == '$')) || (U_GET_GC_MASK(ch) & (U_GC_L_MASK | U_GC_MN_MASK | U_GC_MC_MASK | U_GC_ND_MASK | U_GC_PC_MASK)) || ch == 0x200C || ch == 0x200D; | 
| 966 |     } | 
| 967 |  | 
| 968 |     bool isUnicodePropertyValueExpressionChar(int ch) | 
| 969 |     { | 
| 970 |         return WTF::isASCIIAlphanumeric(c: ch) || ch == '_' || ch == '='; | 
| 971 |     } | 
| 972 |  | 
| 973 |     int consume() | 
| 974 |     { | 
| 975 |         ASSERT(m_index < m_size); | 
| 976 |         return m_data[m_index++]; | 
| 977 |     } | 
| 978 |  | 
| 979 |     unsigned consumeDigit() | 
| 980 |     { | 
| 981 |         ASSERT(peekIsDigit()); | 
| 982 |         return consume() - '0'; | 
| 983 |     } | 
| 984 |  | 
| 985 |     unsigned () | 
| 986 |     { | 
| 987 |         Checked<unsigned, RecordOverflow> n = consumeDigit(); | 
| 988 |         while (peekIsDigit()) | 
| 989 |             n = n * 10 + consumeDigit(); | 
| 990 |         return n.hasOverflowed() ? quantifyInfinite : n.unsafeGet(); | 
| 991 |     } | 
| 992 |  | 
| 993 |     unsigned consumeOctal() | 
| 994 |     { | 
| 995 |         ASSERT(WTF::isASCIIOctalDigit(peek())); | 
| 996 |  | 
| 997 |         unsigned n = consumeDigit(); | 
| 998 |         while (n < 32 && !atEndOfPattern() && WTF::isASCIIOctalDigit(peek())) | 
| 999 |             n = n * 8 + consumeDigit(); | 
| 1000 |         return n; | 
| 1001 |     } | 
| 1002 |  | 
| 1003 |     bool tryConsume(UChar ch) | 
| 1004 |     { | 
| 1005 |         if (atEndOfPattern() || (m_data[m_index] != ch)) | 
| 1006 |             return false; | 
| 1007 |         ++m_index; | 
| 1008 |         return true; | 
| 1009 |     } | 
| 1010 |  | 
| 1011 |     int tryConsumeHex(int count) | 
| 1012 |     { | 
| 1013 |         ParseState state = saveState(); | 
| 1014 |  | 
| 1015 |         int n = 0; | 
| 1016 |         while (count--) { | 
| 1017 |             if (atEndOfPattern() || !WTF::isASCIIHexDigit(peek())) { | 
| 1018 |                 restoreState(state); | 
| 1019 |                 return -1; | 
| 1020 |             } | 
| 1021 |             n = (n << 4) | WTF::toASCIIHexValue(consume()); | 
| 1022 |         } | 
| 1023 |         return n; | 
| 1024 |     } | 
| 1025 |  | 
| 1026 |     std::optional<String> tryConsumeGroupName() | 
| 1027 |     { | 
| 1028 |         if (atEndOfPattern()) | 
| 1029 |             return std::nullopt; | 
| 1030 |  | 
| 1031 |         ParseState state = saveState(); | 
| 1032 |          | 
| 1033 |         int ch = tryConsumeIdentifierCharacter(); | 
| 1034 |  | 
| 1035 |         if (isIdentifierStart(ch)) { | 
| 1036 |             StringBuilder identifierBuilder; | 
| 1037 |             identifierBuilder.append(c: ch); | 
| 1038 |  | 
| 1039 |             while (!atEndOfPattern()) { | 
| 1040 |                 ch = tryConsumeIdentifierCharacter(); | 
| 1041 |                 if (ch == '>') | 
| 1042 |                     return std::optional<String>(identifierBuilder.toString()); | 
| 1043 |  | 
| 1044 |                 if (!isIdentifierPart(ch)) | 
| 1045 |                     break; | 
| 1046 |  | 
| 1047 |                 identifierBuilder.append(c: ch); | 
| 1048 |             } | 
| 1049 |         } | 
| 1050 |  | 
| 1051 |         restoreState(state); | 
| 1052 |  | 
| 1053 |         return std::nullopt; | 
| 1054 |     } | 
| 1055 |  | 
| 1056 |     std::optional<BuiltInCharacterClassID> tryConsumeUnicodePropertyExpression() | 
| 1057 |     { | 
| 1058 |         if (atEndOfPattern() || !isUnicodePropertyValueExpressionChar(ch: peek())) { | 
| 1059 |             m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; | 
| 1060 |             return std::nullopt; | 
| 1061 |         } | 
| 1062 |  | 
| 1063 |         StringBuilder expressionBuilder; | 
| 1064 |         String unicodePropertyName; | 
| 1065 |         bool foundEquals = false; | 
| 1066 |         unsigned errors = 0; | 
| 1067 |  | 
| 1068 |         expressionBuilder.append(consume()); | 
| 1069 |  | 
| 1070 |         while (!atEndOfPattern()) { | 
| 1071 |             int ch = peek(); | 
| 1072 |             if (ch == '}') { | 
| 1073 |                 consume(); | 
| 1074 |                 if (errors) { | 
| 1075 |                     m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; | 
| 1076 |                     return std::nullopt; | 
| 1077 |                 } | 
| 1078 |  | 
| 1079 |                 if (foundEquals) { | 
| 1080 |                     auto result = unicodeMatchPropertyValue(unicodePropertyName, expressionBuilder.toString()); | 
| 1081 |                     if (!result) | 
| 1082 |                         m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; | 
| 1083 |                     return result; | 
| 1084 |                 } | 
| 1085 |  | 
| 1086 |                 auto result = unicodeMatchProperty(expressionBuilder.toString()); | 
| 1087 |                 if (!result) | 
| 1088 |                     m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; | 
| 1089 |                 return result; | 
| 1090 |             } | 
| 1091 |  | 
| 1092 |             consume(); | 
| 1093 |             if (ch == '=') { | 
| 1094 |                 if (!foundEquals) { | 
| 1095 |                     foundEquals = true; | 
| 1096 |                     unicodePropertyName = expressionBuilder.toString(); | 
| 1097 |                     expressionBuilder.clear(); | 
| 1098 |                 } else | 
| 1099 |                     errors++; | 
| 1100 |             } else if (!isUnicodePropertyValueExpressionChar(ch)) | 
| 1101 |                 errors++; | 
| 1102 |             else | 
| 1103 |                 expressionBuilder.append(c: ch); | 
| 1104 |         } | 
| 1105 |  | 
| 1106 |         m_errorCode = ErrorCode::InvalidUnicodePropertyExpression; | 
| 1107 |         return std::nullopt; | 
| 1108 |     } | 
| 1109 |  | 
| 1110 |     Delegate& m_delegate; | 
| 1111 |     unsigned m_backReferenceLimit; | 
| 1112 |     ErrorCode m_errorCode { ErrorCode::NoError }; | 
| 1113 |     const CharType* m_data; | 
| 1114 |     unsigned m_size; | 
| 1115 |     unsigned m_index { 0 }; | 
| 1116 |     bool m_isUnicode; | 
| 1117 |     unsigned m_parenthesesNestingDepth { 0 }; | 
| 1118 |     HashSet<String> m_captureGroupNames; | 
| 1119 |  | 
| 1120 |     // Derived by empirical testing of compile time in PCRE and WREC. | 
| 1121 |     static const unsigned MAX_PATTERN_SIZE = 1024 * 1024; | 
| 1122 | }; | 
| 1123 |  | 
| 1124 | /* | 
| 1125 |  * Yarr::parse(): | 
| 1126 |  * | 
| 1127 |  * The parse method is passed a pattern to be parsed and a delegate upon which | 
| 1128 |  * callbacks will be made to record the parsed tokens forming the regex. | 
| 1129 |  * Yarr::parse() returns null on success, or a const C string providing an error | 
| 1130 |  * message where a parse error occurs. | 
| 1131 |  * | 
| 1132 |  * The Delegate must implement the following interface: | 
| 1133 |  * | 
| 1134 |  *    void assertionBOL(); | 
| 1135 |  *    void assertionEOL(); | 
| 1136 |  *    void assertionWordBoundary(bool invert); | 
| 1137 |  * | 
| 1138 |  *    void atomPatternCharacter(UChar32 ch); | 
| 1139 |  *    void atomBuiltInCharacterClass(BuiltInCharacterClassID classID, bool invert); | 
| 1140 |  *    void atomCharacterClassBegin(bool invert) | 
| 1141 |  *    void atomCharacterClassAtom(UChar32 ch) | 
| 1142 |  *    void atomCharacterClassRange(UChar32 begin, UChar32 end) | 
| 1143 |  *    void atomCharacterClassBuiltIn(BuiltInCharacterClassID classID, bool invert) | 
| 1144 |  *    void atomCharacterClassEnd() | 
| 1145 |  *    void atomParenthesesSubpatternBegin(bool capture = true, Optional<String> groupName); | 
| 1146 |  *    void atomParentheticalAssertionBegin(bool invert = false); | 
| 1147 |  *    void atomParenthesesEnd(); | 
| 1148 |  *    void atomBackReference(unsigned subpatternId); | 
| 1149 |  *    void atomNamedBackReference(const String& subpatternName); | 
| 1150 |  *    bool isValidNamedForwardReference(const String& subpatternName); | 
| 1151 |  *    void atomNamedForwardReference(const String& subpatternName); | 
| 1152 |  * | 
| 1153 |  *    void quantifyAtom(unsigned min, unsigned max, bool greedy); | 
| 1154 |  * | 
| 1155 |  *    void disjunction(); | 
| 1156 |  * | 
| 1157 |  * The regular expression is described by a sequence of assertion*() and atom*() | 
| 1158 |  * callbacks to the delegate, describing the terms in the regular expression. | 
| 1159 |  * Following an atom a quantifyAtom() call may occur to indicate that the previous | 
| 1160 |  * atom should be quantified.  In the case of atoms described across multiple | 
| 1161 |  * calls (parentheses and character classes) the call to quantifyAtom() will come | 
| 1162 |  * after the call to the atom*End() method, never after atom*Begin(). | 
| 1163 |  * | 
| 1164 |  * Character classes may either be described by a single call to | 
| 1165 |  * atomBuiltInCharacterClass(), or by a sequence of atomCharacterClass*() calls. | 
| 1166 |  * In the latter case, ...Begin() will be called, followed by a sequence of | 
| 1167 |  * calls to ...Atom(), ...Range(), and ...BuiltIn(), followed by a call to ...End(). | 
| 1168 |  * | 
| 1169 |  * Sequences of atoms and assertions are broken into alternatives via calls to | 
| 1170 |  * disjunction().  Assertions, atoms, and disjunctions emitted between calls to | 
| 1171 |  * atomParenthesesBegin() and atomParenthesesEnd() form the body of a subpattern. | 
| 1172 |  * atomParenthesesBegin() is passed a subpatternId.  In the case of a regular | 
| 1173 |  * capturing subpattern, this will be the subpatternId associated with these | 
| 1174 |  * parentheses, and will also by definition be the lowest subpatternId of these | 
| 1175 |  * parentheses and of any nested paretheses.  The atomParenthesesEnd() method | 
| 1176 |  * is passed the subpatternId of the last capturing subexpression nested within | 
| 1177 |  * these paretheses.  In the case of a capturing subpattern with no nested | 
| 1178 |  * capturing subpatterns, the same subpatternId will be passed to the begin and | 
| 1179 |  * end functions.  In the case of non-capturing subpatterns the subpatternId | 
| 1180 |  * passed to the begin method is also the first possible subpatternId that might | 
| 1181 |  * be nested within these paretheses.  If a set of non-capturing parentheses does | 
| 1182 |  * not contain any capturing subpatterns, then the subpatternId passed to begin | 
| 1183 |  * will be greater than the subpatternId passed to end. | 
| 1184 |  */ | 
| 1185 |  | 
| 1186 | template<class Delegate> | 
| 1187 | ErrorCode parse(Delegate& delegate, const String& pattern, bool isUnicode, unsigned backReferenceLimit = quantifyInfinite) | 
| 1188 | { | 
| 1189 |     if (pattern.is8Bit()) | 
| 1190 |         return Parser<Delegate, LChar>(delegate, pattern, isUnicode, backReferenceLimit).parse(); | 
| 1191 |     return Parser<Delegate, UChar>(delegate, pattern, isUnicode, backReferenceLimit).parse(); | 
| 1192 | } | 
| 1193 |  | 
| 1194 | } } // namespace JSC::Yarr | 
| 1195 |  |