| 1 | /* | 
| 2 |  * Copyright (C) 2009 Apple Inc. All rights reserved. | 
| 3 |  * | 
| 4 |  * Redistribution and use in source and binary forms, with or without | 
| 5 |  * modification, are permitted provided that the following conditions | 
| 6 |  * are met: | 
| 7 |  * 1. Redistributions of source code must retain the above copyright | 
| 8 |  *    notice, this list of conditions and the following disclaimer. | 
| 9 |  * 2. Redistributions in binary form must reproduce the above copyright | 
| 10 |  *    notice, this list of conditions and the following disclaimer in the | 
| 11 |  *    documentation and/or other materials provided with the distribution. | 
| 12 |  * | 
| 13 |  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | 
| 14 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | 
| 15 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | 
| 16 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR | 
| 17 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | 
| 18 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | 
| 19 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | 
| 20 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | 
| 21 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | 
| 22 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | 
| 23 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.  | 
| 24 |  */ | 
| 25 |  | 
| 26 | #ifndef RegexPattern_h | 
| 27 | #define RegexPattern_h | 
| 28 |  | 
| 29 | #include <wtf/Platform.h> | 
| 30 |  | 
| 31 | #if ENABLE(YARR) | 
| 32 |  | 
| 33 | #include <wtf/Vector.h> | 
| 34 | #include <wtf/unicode/Unicode.h> | 
| 35 |  | 
| 36 |  | 
| 37 | namespace JSC { namespace Yarr { | 
| 38 |  | 
| 39 | #define RegexStackSpaceForBackTrackInfoPatternCharacter 1 // Only for !fixed quantifiers. | 
| 40 | #define RegexStackSpaceForBackTrackInfoCharacterClass 1 // Only for !fixed quantifiers. | 
| 41 | #define RegexStackSpaceForBackTrackInfoBackReference 2 | 
| 42 | #define RegexStackSpaceForBackTrackInfoAlternative 1 // One per alternative. | 
| 43 | #define RegexStackSpaceForBackTrackInfoParentheticalAssertion 1 | 
| 44 | #define RegexStackSpaceForBackTrackInfoParenthesesOnce 1 // Only for !fixed quantifiers. | 
| 45 | #define RegexStackSpaceForBackTrackInfoParentheses 4 | 
| 46 |  | 
| 47 | struct PatternDisjunction; | 
| 48 |  | 
| 49 | struct CharacterRange { | 
| 50 |     UChar begin; | 
| 51 |     UChar end; | 
| 52 |  | 
| 53 |     CharacterRange(UChar begin, UChar end) | 
| 54 |         : begin(begin) | 
| 55 |         , end(end) | 
| 56 |     { | 
| 57 |     } | 
| 58 | }; | 
| 59 |  | 
| 60 | struct CharacterClass : FastAllocBase { | 
| 61 |     Vector<UChar> m_matches; | 
| 62 |     Vector<CharacterRange> m_ranges; | 
| 63 |     Vector<UChar> m_matchesUnicode; | 
| 64 |     Vector<CharacterRange> m_rangesUnicode; | 
| 65 | }; | 
| 66 |  | 
| 67 | enum QuantifierType { | 
| 68 |     QuantifierFixedCount, | 
| 69 |     QuantifierGreedy, | 
| 70 |     QuantifierNonGreedy, | 
| 71 | }; | 
| 72 |  | 
| 73 | struct PatternTerm { | 
| 74 |     enum Type { | 
| 75 |         TypeAssertionBOL, | 
| 76 |         TypeAssertionEOL, | 
| 77 |         TypeAssertionWordBoundary, | 
| 78 |         TypePatternCharacter, | 
| 79 |         TypeCharacterClass, | 
| 80 |         TypeBackReference, | 
| 81 |         TypeForwardReference, | 
| 82 |         TypeParenthesesSubpattern, | 
| 83 |         TypeParentheticalAssertion, | 
| 84 |     } type; | 
| 85 |     bool invertOrCapture; | 
| 86 |     union { | 
| 87 |         UChar patternCharacter; | 
| 88 |         CharacterClass* characterClass; | 
| 89 |         unsigned subpatternId; | 
| 90 |         struct { | 
| 91 |             PatternDisjunction* disjunction; | 
| 92 |             unsigned subpatternId; | 
| 93 |             unsigned lastSubpatternId; | 
| 94 |             bool isCopy; | 
| 95 |         } parentheses; | 
| 96 |     }; | 
| 97 |     QuantifierType quantityType; | 
| 98 |     unsigned quantityCount; | 
| 99 |     int inputPosition; | 
| 100 |     unsigned frameLocation; | 
| 101 |  | 
| 102 |     PatternTerm(UChar ch) | 
| 103 |         : type(PatternTerm::TypePatternCharacter) | 
| 104 |     { | 
| 105 |         patternCharacter = ch; | 
| 106 |         quantityType = QuantifierFixedCount; | 
| 107 |         quantityCount = 1; | 
| 108 |     } | 
| 109 |  | 
| 110 |     PatternTerm(CharacterClass* charClass, bool invert) | 
| 111 |         : type(PatternTerm::TypeCharacterClass) | 
| 112 |         , invertOrCapture(invert) | 
| 113 |     { | 
| 114 |         characterClass = charClass; | 
| 115 |         quantityType = QuantifierFixedCount; | 
| 116 |         quantityCount = 1; | 
| 117 |     } | 
| 118 |  | 
| 119 |     PatternTerm(Type type, unsigned subpatternId, PatternDisjunction* disjunction, bool invertOrCapture) | 
| 120 |         : type(type) | 
| 121 |         , invertOrCapture(invertOrCapture) | 
| 122 |     { | 
| 123 |         parentheses.disjunction = disjunction; | 
| 124 |         parentheses.subpatternId = subpatternId; | 
| 125 |         parentheses.isCopy = false; | 
| 126 |         quantityType = QuantifierFixedCount; | 
| 127 |         quantityCount = 1; | 
| 128 |     } | 
| 129 |      | 
| 130 |     PatternTerm(Type type, bool invert = false) | 
| 131 |         : type(type) | 
| 132 |         , invertOrCapture(invert) | 
| 133 |     { | 
| 134 |         quantityType = QuantifierFixedCount; | 
| 135 |         quantityCount = 1; | 
| 136 |     } | 
| 137 |  | 
| 138 |     PatternTerm(unsigned spatternId) | 
| 139 |         : type(TypeBackReference) | 
| 140 |         , invertOrCapture(false) | 
| 141 |     { | 
| 142 |         subpatternId = spatternId; | 
| 143 |         quantityType = QuantifierFixedCount; | 
| 144 |         quantityCount = 1; | 
| 145 |     } | 
| 146 |  | 
| 147 |     static PatternTerm ForwardReference() | 
| 148 |     { | 
| 149 |         return PatternTerm(TypeForwardReference); | 
| 150 |     } | 
| 151 |  | 
| 152 |     static PatternTerm BOL() | 
| 153 |     { | 
| 154 |         return PatternTerm(TypeAssertionBOL); | 
| 155 |     } | 
| 156 |  | 
| 157 |     static PatternTerm EOL() | 
| 158 |     { | 
| 159 |         return PatternTerm(TypeAssertionEOL); | 
| 160 |     } | 
| 161 |  | 
| 162 |     static PatternTerm WordBoundary(bool invert) | 
| 163 |     { | 
| 164 |         return PatternTerm(TypeAssertionWordBoundary, invert); | 
| 165 |     } | 
| 166 |      | 
| 167 |     bool invert() | 
| 168 |     { | 
| 169 |         return invertOrCapture; | 
| 170 |     } | 
| 171 |  | 
| 172 |     bool capture() | 
| 173 |     { | 
| 174 |         return invertOrCapture; | 
| 175 |     } | 
| 176 |      | 
| 177 |     void quantify(unsigned count, QuantifierType type) | 
| 178 |     { | 
| 179 |         quantityCount = count; | 
| 180 |         quantityType = type; | 
| 181 |     } | 
| 182 | }; | 
| 183 |  | 
| 184 | struct PatternAlternative : FastAllocBase { | 
| 185 |     PatternAlternative(PatternDisjunction* disjunction) | 
| 186 |         : m_parent(disjunction) | 
| 187 |     { | 
| 188 |     } | 
| 189 |  | 
| 190 |     PatternTerm& lastTerm() | 
| 191 |     { | 
| 192 |         ASSERT(m_terms.size()); | 
| 193 |         return m_terms[m_terms.size() - 1]; | 
| 194 |     } | 
| 195 |      | 
| 196 |     void removeLastTerm() | 
| 197 |     { | 
| 198 |         ASSERT(m_terms.size()); | 
| 199 |         m_terms.shrink(size: m_terms.size() - 1); | 
| 200 |     } | 
| 201 |  | 
| 202 |     Vector<PatternTerm> m_terms; | 
| 203 |     PatternDisjunction* m_parent; | 
| 204 |     unsigned m_minimumSize; | 
| 205 |     bool m_hasFixedSize; | 
| 206 | }; | 
| 207 |  | 
| 208 | struct PatternDisjunction : FastAllocBase { | 
| 209 |     PatternDisjunction(PatternAlternative* parent = 0) | 
| 210 |         : m_parent(parent) | 
| 211 |     { | 
| 212 |     } | 
| 213 |      | 
| 214 |     ~PatternDisjunction() | 
| 215 |     { | 
| 216 |         deleteAllValues(collection: m_alternatives); | 
| 217 |     } | 
| 218 |  | 
| 219 |     PatternAlternative* addNewAlternative() | 
| 220 |     { | 
| 221 |         PatternAlternative* alternative = new PatternAlternative(this); | 
| 222 |         m_alternatives.append(val: alternative); | 
| 223 |         return alternative; | 
| 224 |     } | 
| 225 |  | 
| 226 |     Vector<PatternAlternative*> m_alternatives; | 
| 227 |     PatternAlternative* m_parent; | 
| 228 |     unsigned m_minimumSize; | 
| 229 |     unsigned m_callFrameSize; | 
| 230 |     bool m_hasFixedSize; | 
| 231 | }; | 
| 232 |  | 
| 233 | // You probably don't want to be calling these functions directly | 
| 234 | // (please to be calling newlineCharacterClass() et al on your | 
| 235 | // friendly neighborhood RegexPattern instance to get nicely | 
| 236 | // cached copies). | 
| 237 | CharacterClass* newlineCreate(); | 
| 238 | CharacterClass* digitsCreate(); | 
| 239 | CharacterClass* spacesCreate(); | 
| 240 | CharacterClass* wordcharCreate(); | 
| 241 | CharacterClass* nondigitsCreate(); | 
| 242 | CharacterClass* nonspacesCreate(); | 
| 243 | CharacterClass* nonwordcharCreate(); | 
| 244 |  | 
| 245 | struct RegexPattern { | 
| 246 |     RegexPattern(bool ignoreCase, bool multiline) | 
| 247 |         : m_ignoreCase(ignoreCase) | 
| 248 |         , m_multiline(multiline) | 
| 249 |         , m_numSubpatterns(0) | 
| 250 |         , m_maxBackReference(0) | 
| 251 |         , newlineCached(0) | 
| 252 |         , digitsCached(0) | 
| 253 |         , spacesCached(0) | 
| 254 |         , wordcharCached(0) | 
| 255 |         , nondigitsCached(0) | 
| 256 |         , nonspacesCached(0) | 
| 257 |         , nonwordcharCached(0) | 
| 258 |     { | 
| 259 |     } | 
| 260 |  | 
| 261 |     ~RegexPattern() | 
| 262 |     { | 
| 263 |         deleteAllValues(collection: m_disjunctions); | 
| 264 |         deleteAllValues(collection: m_userCharacterClasses); | 
| 265 |     } | 
| 266 |  | 
| 267 |     void reset() | 
| 268 |     { | 
| 269 |         m_numSubpatterns = 0; | 
| 270 |         m_maxBackReference = 0; | 
| 271 |  | 
| 272 |         newlineCached = 0; | 
| 273 |         digitsCached = 0; | 
| 274 |         spacesCached = 0; | 
| 275 |         wordcharCached = 0; | 
| 276 |         nondigitsCached = 0; | 
| 277 |         nonspacesCached = 0; | 
| 278 |         nonwordcharCached = 0; | 
| 279 |  | 
| 280 |         deleteAllValues(collection: m_disjunctions); | 
| 281 |         m_disjunctions.clear(); | 
| 282 |         deleteAllValues(collection: m_userCharacterClasses); | 
| 283 |         m_userCharacterClasses.clear(); | 
| 284 |     } | 
| 285 |  | 
| 286 |     bool containsIllegalBackReference() | 
| 287 |     { | 
| 288 |         return m_maxBackReference > m_numSubpatterns; | 
| 289 |     } | 
| 290 |  | 
| 291 |     CharacterClass* newlineCharacterClass() | 
| 292 |     { | 
| 293 |         if (!newlineCached) | 
| 294 |             m_userCharacterClasses.append(val: newlineCached = newlineCreate()); | 
| 295 |         return newlineCached; | 
| 296 |     } | 
| 297 |     CharacterClass* digitsCharacterClass() | 
| 298 |     { | 
| 299 |         if (!digitsCached) | 
| 300 |             m_userCharacterClasses.append(val: digitsCached = digitsCreate()); | 
| 301 |         return digitsCached; | 
| 302 |     } | 
| 303 |     CharacterClass* spacesCharacterClass() | 
| 304 |     { | 
| 305 |         if (!spacesCached) | 
| 306 |             m_userCharacterClasses.append(val: spacesCached = spacesCreate()); | 
| 307 |         return spacesCached; | 
| 308 |     } | 
| 309 |     CharacterClass* wordcharCharacterClass() | 
| 310 |     { | 
| 311 |         if (!wordcharCached) | 
| 312 |             m_userCharacterClasses.append(val: wordcharCached = wordcharCreate()); | 
| 313 |         return wordcharCached; | 
| 314 |     } | 
| 315 |     CharacterClass* nondigitsCharacterClass() | 
| 316 |     { | 
| 317 |         if (!nondigitsCached) | 
| 318 |             m_userCharacterClasses.append(val: nondigitsCached = nondigitsCreate()); | 
| 319 |         return nondigitsCached; | 
| 320 |     } | 
| 321 |     CharacterClass* nonspacesCharacterClass() | 
| 322 |     { | 
| 323 |         if (!nonspacesCached) | 
| 324 |             m_userCharacterClasses.append(val: nonspacesCached = nonspacesCreate()); | 
| 325 |         return nonspacesCached; | 
| 326 |     } | 
| 327 |     CharacterClass* nonwordcharCharacterClass() | 
| 328 |     { | 
| 329 |         if (!nonwordcharCached) | 
| 330 |             m_userCharacterClasses.append(val: nonwordcharCached = nonwordcharCreate()); | 
| 331 |         return nonwordcharCached; | 
| 332 |     } | 
| 333 |  | 
| 334 |     bool m_ignoreCase; | 
| 335 |     bool m_multiline; | 
| 336 |     unsigned m_numSubpatterns; | 
| 337 |     unsigned m_maxBackReference; | 
| 338 |     PatternDisjunction* m_body; | 
| 339 |     Vector<PatternDisjunction*, 4> m_disjunctions; | 
| 340 |     Vector<CharacterClass*> m_userCharacterClasses; | 
| 341 |  | 
| 342 | private: | 
| 343 |     CharacterClass* newlineCached; | 
| 344 |     CharacterClass* digitsCached; | 
| 345 |     CharacterClass* spacesCached; | 
| 346 |     CharacterClass* wordcharCached; | 
| 347 |     CharacterClass* nondigitsCached; | 
| 348 |     CharacterClass* nonspacesCached; | 
| 349 |     CharacterClass* nonwordcharCached; | 
| 350 | }; | 
| 351 |  | 
| 352 | } } // namespace JSC::Yarr | 
| 353 |  | 
| 354 | #endif | 
| 355 |  | 
| 356 | #endif // RegexPattern_h | 
| 357 |  |