| 1 | /* | 
| 2 |  * Copyright (C) 2009, 2010-2012, 2014, 2016 Apple Inc. All rights reserved. | 
| 3 |  * | 
| 4 |  * Redistribution and use in source and binary forms, with or without | 
| 5 |  * modification, are permitted provided that the following conditions | 
| 6 |  * are met: | 
| 7 |  * 1. Redistributions of source code must retain the above copyright | 
| 8 |  *    notice, this list of conditions and the following disclaimer. | 
| 9 |  * 2. Redistributions in binary form must reproduce the above copyright | 
| 10 |  *    notice, this list of conditions and the following disclaimer in the | 
| 11 |  *    documentation and/or other materials provided with the distribution. | 
| 12 |  * | 
| 13 |  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | 
| 14 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
| 15 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 
| 16 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR | 
| 17 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | 
| 18 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | 
| 19 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 
| 20 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | 
| 21 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
| 22 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
| 23 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  | 
| 24 |  */ | 
| 25 |  | 
| 26 | #pragma once | 
| 27 |  | 
| 28 | #include "ConcurrentJSLock.h" | 
| 29 | #include "YarrPattern.h" | 
| 30 |  | 
// Forward declaration only: this header stores and passes BumpPointerAllocator
// by pointer, so the full class definition is not needed here.
namespace WTF { class BumpPointerAllocator; }

// Make the name usable without the WTF:: qualifier in the declarations below.
using WTF::BumpPointerAllocator;
| 35 |  | 
| 36 | namespace JSC { namespace Yarr { | 
| 37 |  | 
| 38 | class ByteDisjunction; | 
| 39 |  | 
| 40 | struct ByteTerm { | 
| 41 |     enum Type { | 
| 42 |         TypeBodyAlternativeBegin, | 
| 43 |         TypeBodyAlternativeDisjunction, | 
| 44 |         TypeBodyAlternativeEnd, | 
| 45 |         TypeAlternativeBegin, | 
| 46 |         TypeAlternativeDisjunction, | 
| 47 |         TypeAlternativeEnd, | 
| 48 |         TypeSubpatternBegin, | 
| 49 |         TypeSubpatternEnd, | 
| 50 |         TypeAssertionBOL, | 
| 51 |         TypeAssertionEOL, | 
| 52 |         TypeAssertionWordBoundary, | 
| 53 |         TypePatternCharacterOnce, | 
| 54 |         TypePatternCharacterFixed, | 
| 55 |         TypePatternCharacterGreedy, | 
| 56 |         TypePatternCharacterNonGreedy, | 
| 57 |         TypePatternCasedCharacterOnce, | 
| 58 |         TypePatternCasedCharacterFixed, | 
| 59 |         TypePatternCasedCharacterGreedy, | 
| 60 |         TypePatternCasedCharacterNonGreedy, | 
| 61 |         TypeCharacterClass, | 
| 62 |         TypeBackReference, | 
| 63 |         TypeParenthesesSubpattern, | 
| 64 |         TypeParenthesesSubpatternOnceBegin, | 
| 65 |         TypeParenthesesSubpatternOnceEnd, | 
| 66 |         TypeParenthesesSubpatternTerminalBegin, | 
| 67 |         TypeParenthesesSubpatternTerminalEnd, | 
| 68 |         TypeParentheticalAssertionBegin, | 
| 69 |         TypeParentheticalAssertionEnd, | 
| 70 |         TypeCheckInput, | 
| 71 |         TypeUncheckInput, | 
| 72 |         TypeDotStarEnclosure, | 
| 73 |     } type; | 
| 74 |     union { | 
| 75 |         struct { | 
| 76 |             union { | 
| 77 |                 UChar32 patternCharacter; | 
| 78 |                 struct { | 
| 79 |                     UChar32 lo; | 
| 80 |                     UChar32 hi; | 
| 81 |                 } casedCharacter; | 
| 82 |                 CharacterClass* characterClass; | 
| 83 |                 unsigned subpatternId; | 
| 84 |             }; | 
| 85 |             union { | 
| 86 |                 ByteDisjunction* parenthesesDisjunction; | 
| 87 |                 unsigned parenthesesWidth; | 
| 88 |             }; | 
| 89 |             QuantifierType quantityType; | 
| 90 |             unsigned quantityMinCount; | 
| 91 |             unsigned quantityMaxCount; | 
| 92 |         } atom; | 
| 93 |         struct { | 
| 94 |             int next; | 
| 95 |             int end; | 
| 96 |             bool onceThrough; | 
| 97 |         } alternative; | 
| 98 |         struct { | 
| 99 |             bool m_bol : 1; | 
| 100 |             bool m_eol : 1; | 
| 101 |         } anchors; | 
| 102 |         unsigned checkInputCount; | 
| 103 |     }; | 
| 104 |     unsigned frameLocation; | 
| 105 |     bool m_capture : 1; | 
| 106 |     bool m_invert : 1; | 
| 107 |     unsigned inputPosition; | 
| 108 |  | 
| 109 |     ByteTerm(UChar32 ch, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) | 
| 110 |         : frameLocation(frameLocation) | 
| 111 |         , m_capture(false) | 
| 112 |         , m_invert(false) | 
| 113 |     { | 
| 114 |         atom.patternCharacter = ch; | 
| 115 |         atom.quantityType = quantityType; | 
| 116 |         atom.quantityMinCount = quantityCount.unsafeGet(); | 
| 117 |         atom.quantityMaxCount = quantityCount.unsafeGet(); | 
| 118 |         inputPosition = inputPos; | 
| 119 |  | 
| 120 |         switch (quantityType) { | 
| 121 |         case QuantifierFixedCount: | 
| 122 |             type = (quantityCount == 1) ? ByteTerm::TypePatternCharacterOnce : ByteTerm::TypePatternCharacterFixed; | 
| 123 |             break; | 
| 124 |         case QuantifierGreedy: | 
| 125 |             type = ByteTerm::TypePatternCharacterGreedy; | 
| 126 |             break; | 
| 127 |         case QuantifierNonGreedy: | 
| 128 |             type = ByteTerm::TypePatternCharacterNonGreedy; | 
| 129 |             break; | 
| 130 |         } | 
| 131 |     } | 
| 132 |  | 
| 133 |     ByteTerm(UChar32 lo, UChar32 hi, unsigned inputPos, unsigned frameLocation, Checked<unsigned> quantityCount, QuantifierType quantityType) | 
| 134 |         : frameLocation(frameLocation) | 
| 135 |         , m_capture(false) | 
| 136 |         , m_invert(false) | 
| 137 |     { | 
| 138 |         switch (quantityType) { | 
| 139 |         case QuantifierFixedCount: | 
| 140 |             type = (quantityCount == 1) ? ByteTerm::TypePatternCasedCharacterOnce : ByteTerm::TypePatternCasedCharacterFixed; | 
| 141 |             break; | 
| 142 |         case QuantifierGreedy: | 
| 143 |             type = ByteTerm::TypePatternCasedCharacterGreedy; | 
| 144 |             break; | 
| 145 |         case QuantifierNonGreedy: | 
| 146 |             type = ByteTerm::TypePatternCasedCharacterNonGreedy; | 
| 147 |             break; | 
| 148 |         } | 
| 149 |  | 
| 150 |         atom.casedCharacter.lo = lo; | 
| 151 |         atom.casedCharacter.hi = hi; | 
| 152 |         atom.quantityType = quantityType; | 
| 153 |         atom.quantityMinCount = quantityCount.unsafeGet(); | 
| 154 |         atom.quantityMaxCount = quantityCount.unsafeGet(); | 
| 155 |         inputPosition = inputPos; | 
| 156 |     } | 
| 157 |  | 
| 158 |     ByteTerm(CharacterClass* characterClass, bool invert, unsigned inputPos) | 
| 159 |         : type(ByteTerm::TypeCharacterClass) | 
| 160 |         , m_capture(false) | 
| 161 |         , m_invert(invert) | 
| 162 |     { | 
| 163 |         atom.characterClass = characterClass; | 
| 164 |         atom.quantityType = QuantifierFixedCount; | 
| 165 |         atom.quantityMinCount = 1; | 
| 166 |         atom.quantityMaxCount = 1; | 
| 167 |         inputPosition = inputPos; | 
| 168 |     } | 
| 169 |  | 
| 170 |     ByteTerm(Type type, unsigned subpatternId, ByteDisjunction* parenthesesInfo, bool capture, unsigned inputPos) | 
| 171 |         : type(type) | 
| 172 |         , m_capture(capture) | 
| 173 |         , m_invert(false) | 
| 174 |     { | 
| 175 |         atom.subpatternId = subpatternId; | 
| 176 |         atom.parenthesesDisjunction = parenthesesInfo; | 
| 177 |         atom.quantityType = QuantifierFixedCount; | 
| 178 |         atom.quantityMinCount = 1; | 
| 179 |         atom.quantityMaxCount = 1; | 
| 180 |         inputPosition = inputPos; | 
| 181 |     } | 
| 182 |      | 
| 183 |     ByteTerm(Type type, bool invert = false) | 
| 184 |         : type(type) | 
| 185 |         , m_capture(false) | 
| 186 |         , m_invert(invert) | 
| 187 |     { | 
| 188 |         atom.quantityType = QuantifierFixedCount; | 
| 189 |         atom.quantityMinCount = 1; | 
| 190 |         atom.quantityMaxCount = 1; | 
| 191 |     } | 
| 192 |  | 
| 193 |     ByteTerm(Type type, unsigned subpatternId, bool capture, bool invert, unsigned inputPos) | 
| 194 |         : type(type) | 
| 195 |         , m_capture(capture) | 
| 196 |         , m_invert(invert) | 
| 197 |     { | 
| 198 |         atom.subpatternId = subpatternId; | 
| 199 |         atom.quantityType = QuantifierFixedCount; | 
| 200 |         atom.quantityMinCount = 1; | 
| 201 |         atom.quantityMaxCount = 1; | 
| 202 |         inputPosition = inputPos; | 
| 203 |     } | 
| 204 |  | 
| 205 |     static ByteTerm BOL(unsigned inputPos) | 
| 206 |     { | 
| 207 |         ByteTerm term(TypeAssertionBOL); | 
| 208 |         term.inputPosition = inputPos; | 
| 209 |         return term; | 
| 210 |     } | 
| 211 |  | 
| 212 |     static ByteTerm CheckInput(Checked<unsigned> count) | 
| 213 |     { | 
| 214 |         ByteTerm term(TypeCheckInput); | 
| 215 |         term.checkInputCount = count.unsafeGet(); | 
| 216 |         return term; | 
| 217 |     } | 
| 218 |  | 
| 219 |     static ByteTerm UncheckInput(Checked<unsigned> count) | 
| 220 |     { | 
| 221 |         ByteTerm term(TypeUncheckInput); | 
| 222 |         term.checkInputCount = count.unsafeGet(); | 
| 223 |         return term; | 
| 224 |     } | 
| 225 |      | 
| 226 |     static ByteTerm EOL(unsigned inputPos) | 
| 227 |     { | 
| 228 |         ByteTerm term(TypeAssertionEOL); | 
| 229 |         term.inputPosition = inputPos; | 
| 230 |         return term; | 
| 231 |     } | 
| 232 |  | 
| 233 |     static ByteTerm WordBoundary(bool invert, unsigned inputPos) | 
| 234 |     { | 
| 235 |         ByteTerm term(TypeAssertionWordBoundary, invert); | 
| 236 |         term.inputPosition = inputPos; | 
| 237 |         return term; | 
| 238 |     } | 
| 239 |      | 
| 240 |     static ByteTerm BackReference(unsigned subpatternId, unsigned inputPos) | 
| 241 |     { | 
| 242 |         return ByteTerm(TypeBackReference, subpatternId, false, false, inputPos); | 
| 243 |     } | 
| 244 |  | 
| 245 |     static ByteTerm BodyAlternativeBegin(bool onceThrough) | 
| 246 |     { | 
| 247 |         ByteTerm term(TypeBodyAlternativeBegin); | 
| 248 |         term.alternative.next = 0; | 
| 249 |         term.alternative.end = 0; | 
| 250 |         term.alternative.onceThrough = onceThrough; | 
| 251 |         return term; | 
| 252 |     } | 
| 253 |  | 
| 254 |     static ByteTerm BodyAlternativeDisjunction(bool onceThrough) | 
| 255 |     { | 
| 256 |         ByteTerm term(TypeBodyAlternativeDisjunction); | 
| 257 |         term.alternative.next = 0; | 
| 258 |         term.alternative.end = 0; | 
| 259 |         term.alternative.onceThrough = onceThrough; | 
| 260 |         return term; | 
| 261 |     } | 
| 262 |  | 
| 263 |     static ByteTerm BodyAlternativeEnd() | 
| 264 |     { | 
| 265 |         ByteTerm term(TypeBodyAlternativeEnd); | 
| 266 |         term.alternative.next = 0; | 
| 267 |         term.alternative.end = 0; | 
| 268 |         term.alternative.onceThrough = false; | 
| 269 |         return term; | 
| 270 |     } | 
| 271 |  | 
| 272 |     static ByteTerm AlternativeBegin() | 
| 273 |     { | 
| 274 |         ByteTerm term(TypeAlternativeBegin); | 
| 275 |         term.alternative.next = 0; | 
| 276 |         term.alternative.end = 0; | 
| 277 |         term.alternative.onceThrough = false; | 
| 278 |         return term; | 
| 279 |     } | 
| 280 |  | 
| 281 |     static ByteTerm AlternativeDisjunction() | 
| 282 |     { | 
| 283 |         ByteTerm term(TypeAlternativeDisjunction); | 
| 284 |         term.alternative.next = 0; | 
| 285 |         term.alternative.end = 0; | 
| 286 |         term.alternative.onceThrough = false; | 
| 287 |         return term; | 
| 288 |     } | 
| 289 |  | 
| 290 |     static ByteTerm AlternativeEnd() | 
| 291 |     { | 
| 292 |         ByteTerm term(TypeAlternativeEnd); | 
| 293 |         term.alternative.next = 0; | 
| 294 |         term.alternative.end = 0; | 
| 295 |         term.alternative.onceThrough = false; | 
| 296 |         return term; | 
| 297 |     } | 
| 298 |  | 
| 299 |     static ByteTerm SubpatternBegin() | 
| 300 |     { | 
| 301 |         return ByteTerm(TypeSubpatternBegin); | 
| 302 |     } | 
| 303 |  | 
| 304 |     static ByteTerm SubpatternEnd() | 
| 305 |     { | 
| 306 |         return ByteTerm(TypeSubpatternEnd); | 
| 307 |     } | 
| 308 |      | 
| 309 |     static ByteTerm DotStarEnclosure(bool bolAnchor, bool eolAnchor) | 
| 310 |     { | 
| 311 |         ByteTerm term(TypeDotStarEnclosure); | 
| 312 |         term.anchors.m_bol = bolAnchor; | 
| 313 |         term.anchors.m_eol = eolAnchor; | 
| 314 |         return term; | 
| 315 |     } | 
| 316 |  | 
| 317 |     bool invert() | 
| 318 |     { | 
| 319 |         return m_invert; | 
| 320 |     } | 
| 321 |  | 
| 322 |     bool capture() | 
| 323 |     { | 
| 324 |         return m_capture; | 
| 325 |     } | 
| 326 | }; | 
| 327 |  | 
| 328 | class ByteDisjunction { | 
| 329 |     WTF_MAKE_FAST_ALLOCATED; | 
| 330 | public: | 
| 331 |     ByteDisjunction(unsigned numSubpatterns, unsigned frameSize) | 
| 332 |         : m_numSubpatterns(numSubpatterns) | 
| 333 |         , m_frameSize(frameSize) | 
| 334 |     { | 
| 335 |     } | 
| 336 |  | 
| 337 |     size_t estimatedSizeInBytes() const { return terms.capacity() * sizeof(ByteTerm); } | 
| 338 |  | 
| 339 |     Vector<ByteTerm> terms; | 
| 340 |     unsigned m_numSubpatterns; | 
| 341 |     unsigned m_frameSize; | 
| 342 | }; | 
| 343 |  | 
| 344 | struct BytecodePattern { | 
| 345 |     WTF_MAKE_FAST_ALLOCATED; | 
| 346 | public: | 
| 347 |     BytecodePattern(std::unique_ptr<ByteDisjunction> body, Vector<std::unique_ptr<ByteDisjunction>>& parenthesesInfoToAdopt, YarrPattern& pattern, BumpPointerAllocator* allocator, ConcurrentJSLock* lock) | 
| 348 |         : m_body(WTFMove(body)) | 
| 349 |         , m_flags(pattern.m_flags) | 
| 350 |         , m_allocator(allocator) | 
| 351 |         , m_lock(lock) | 
| 352 |     { | 
| 353 |         m_body->terms.shrinkToFit(); | 
| 354 |  | 
| 355 |         newlineCharacterClass = pattern.newlineCharacterClass(); | 
| 356 |         if (unicode() && ignoreCase()) | 
| 357 |             wordcharCharacterClass = pattern.wordUnicodeIgnoreCaseCharCharacterClass(); | 
| 358 |         else | 
| 359 |             wordcharCharacterClass = pattern.wordcharCharacterClass(); | 
| 360 |  | 
| 361 |         m_allParenthesesInfo.swap(x&: parenthesesInfoToAdopt); | 
| 362 |         m_allParenthesesInfo.shrinkToFit(); | 
| 363 |  | 
| 364 |         m_userCharacterClasses.swap(x&: pattern.m_userCharacterClasses); | 
| 365 |         m_userCharacterClasses.shrinkToFit(); | 
| 366 |     } | 
| 367 |  | 
| 368 |     size_t estimatedSizeInBytes() const { return m_body->estimatedSizeInBytes(); } | 
| 369 |      | 
| 370 |     bool ignoreCase() const { return m_flags & FlagIgnoreCase; } | 
| 371 |     bool multiline() const { return m_flags & FlagMultiline; } | 
| 372 |     bool sticky() const { return m_flags & FlagSticky; } | 
| 373 |     bool unicode() const { return m_flags & FlagUnicode; } | 
| 374 |     bool dotAll() const { return m_flags & FlagDotAll; } | 
| 375 |  | 
| 376 |     std::unique_ptr<ByteDisjunction> m_body; | 
| 377 |     RegExpFlags m_flags; | 
| 378 |     // Each BytecodePattern is associated with a RegExp, each RegExp is associated | 
| 379 |     // with a VM.  Cache a pointer to out VM's m_regExpAllocator. | 
| 380 |     BumpPointerAllocator* m_allocator; | 
| 381 |     ConcurrentJSLock* m_lock; | 
| 382 |  | 
| 383 |     CharacterClass* newlineCharacterClass; | 
| 384 |     CharacterClass* wordcharCharacterClass; | 
| 385 |  | 
| 386 | private: | 
| 387 |     Vector<std::unique_ptr<ByteDisjunction>> m_allParenthesesInfo; | 
| 388 |     Vector<std::unique_ptr<CharacterClass>> m_userCharacterClasses; | 
| 389 | }; | 
| 390 |  | 
| 391 | JS_EXPORT_PRIVATE std::unique_ptr<BytecodePattern> byteCompile(YarrPattern&, BumpPointerAllocator*, ConcurrentJSLock* = nullptr); | 
| 392 | JS_EXPORT_PRIVATE unsigned interpret(BytecodePattern*, const String& input, unsigned start, unsigned* output); | 
| 393 | unsigned interpret(BytecodePattern*, const LChar* input, unsigned length, unsigned start, unsigned* output); | 
| 394 | unsigned interpret(BytecodePattern*, const UChar* input, unsigned length, unsigned start, unsigned* output); | 
| 395 |  | 
| 396 | } } // namespace JSC::Yarr | 
| 397 |  |