| 1 | /**************************************************************************** | 
| 2 | ** | 
| 3 | ** Copyright (C) 2016 The Qt Company Ltd. | 
| 4 | ** Contact: https://www.qt.io/licensing/ | 
| 5 | ** | 
| 6 | ** This file is part of the QtXmlPatterns module of the Qt Toolkit. | 
| 7 | ** | 
| 8 | ** $QT_BEGIN_LICENSE:LGPL$ | 
| 9 | ** Commercial License Usage | 
| 10 | ** Licensees holding valid commercial Qt licenses may use this file in | 
| 11 | ** accordance with the commercial license agreement provided with the | 
| 12 | ** Software or, alternatively, in accordance with the terms contained in | 
| 13 | ** a written agreement between you and The Qt Company. For licensing terms | 
| 14 | ** and conditions see https://www.qt.io/terms-conditions. For further | 
| 15 | ** information use the contact form at https://www.qt.io/contact-us. | 
| 16 | ** | 
| 17 | ** GNU Lesser General Public License Usage | 
| 18 | ** Alternatively, this file may be used under the terms of the GNU Lesser | 
| 19 | ** General Public License version 3 as published by the Free Software | 
| 20 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the | 
| 21 | ** packaging of this file. Please review the following information to | 
| 22 | ** ensure the GNU Lesser General Public License version 3 requirements | 
| 23 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. | 
| 24 | ** | 
| 25 | ** GNU General Public License Usage | 
| 26 | ** Alternatively, this file may be used under the terms of the GNU | 
| 27 | ** General Public License version 2.0 or (at your option) the GNU General | 
| 28 | ** Public license version 3 or any later version approved by the KDE Free | 
| 29 | ** Qt Foundation. The licenses are as published by the Free Software | 
| 30 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 | 
| 31 | ** included in the packaging of this file. Please review the following | 
| 32 | ** information to ensure the GNU General Public License requirements will | 
| 33 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and | 
| 34 | ** https://www.gnu.org/licenses/gpl-3.0.html. | 
| 35 | ** | 
| 36 | ** $QT_END_LICENSE$ | 
| 37 | ** | 
| 38 | ****************************************************************************/ | 
| 39 |  | 
| 40 | // | 
| 41 | //  W A R N I N G | 
| 42 | //  ------------- | 
| 43 | // | 
| 44 | // This file is not part of the Qt API.  It exists purely as an | 
| 45 | // implementation detail.  This header file may change from version to | 
| 46 | // version without notice, or even be removed. | 
| 47 | // | 
| 48 | // We mean it. | 
| 49 | #ifndef Patternist_XQueryTokenizer_H | 
| 50 | #define Patternist_XQueryTokenizer_H | 
| 51 |  | 
| 52 | #include <QHash> | 
| 53 | #include <QSet> | 
| 54 | #include <QStack> | 
| 55 | #include <QString> | 
| 56 | #include <QUrl> | 
| 57 |  | 
| 58 | #include <private/qtokenizer_p.h> | 
| 59 |  | 
| 60 | QT_BEGIN_NAMESPACE | 
| 61 |  | 
| 62 | namespace QPatternist | 
| 63 | { | 
| 64 |     struct TokenMap; | 
| 65 |  | 
| 66 |     /** | 
| 67 |      * @short A hand-written tokenizer which tokenizes XQuery 1.0 & XPath 2.0, | 
| 68 |      * and delivers tokens to the Bison generated parser. | 
| 69 |      * | 
| 70 |      * @author Frans Englich <frans.englich@nokia.com> | 
| 71 |      */ | 
| 72 |     class XQueryTokenizer : public Tokenizer | 
| 73 |     { | 
| 74 |     public: | 
| 75 |         /** | 
| 76 |          * Tokenizer states. Organized alphabetically. | 
| 77 |          */ | 
| 78 |         enum State | 
| 79 |         { | 
| 80 |             AfterAxisSeparator, | 
| 81 |             AposAttributeContent, | 
| 82 |             Axis, | 
| 83 |             Default, | 
| 84 |             ElementContent, | 
| 85 |             EndTag, | 
| 86 |             ItemType, | 
| 87 |             KindTest, | 
| 88 |             KindTestForPI, | 
| 89 |             NamespaceDecl, | 
| 90 |             NamespaceKeyword, | 
| 91 |             OccurrenceIndicator, | 
| 92 |             Operator, | 
| 93 |             Pragma, | 
| 94 |             PragmaContent, | 
| 95 |             ProcessingInstructionContent, | 
| 96 |             ProcessingInstructionName, | 
| 97 |             QuotAttributeContent, | 
| 98 |             StartTag, | 
| 99 |             VarName, | 
| 100 |             , | 
| 101 |             XMLSpaceDecl, | 
| 102 |             XQueryVersion | 
| 103 |         }; | 
| 104 |  | 
| 105 |         XQueryTokenizer(const QString &query, | 
| 106 |                         const QUrl &location, | 
| 107 |                         const State startingState = Default); | 
| 108 |  | 
| 109 |         virtual Token nextToken(XPATHLTYPE *const sourceLocator); | 
| 110 |         virtual int commenceScanOnly(); | 
| 111 |         virtual void resumeTokenizationFrom(const int position); | 
| 112 |  | 
| 113 |         /** | 
| 114 |          * Does nothing. | 
| 115 |          */ | 
| 116 |         virtual void setParserContext(const ParserContext::Ptr &parseInfo); | 
| 117 |  | 
| 118 |     private: | 
| 119 |  | 
| 120 |         /** | 
| 121 |          * Returns the character corresponding to the builtin reference @p | 
| 122 |          * reference. For instance, passing @c gt will give you '>' in return. | 
| 123 |          * | 
| 124 |          * If @p reference is an invalid character reference, a null QChar is | 
| 125 |          * returned. | 
| 126 |          * | 
| 127 |          * @see QChar::isNull() | 
| 128 |          */ | 
| 129 |         QChar charForReference(const QString &reference); | 
| 130 |  | 
| 131 |         inline Token tokenAndChangeState(const TokenType code, | 
| 132 |                                          const State state, | 
| 133 |                                          const int advance = 1); | 
| 134 |         inline Token tokenAndChangeState(const TokenType code, | 
| 135 |                                          const QString &value, | 
| 136 |                                          const State state); | 
| 137 |         inline Token tokenAndAdvance(const TokenType code, | 
| 138 |                                      const int advance = 1); | 
| 139 |         QString tokenizeCharacterReference(); | 
| 140 |  | 
| 141 |         inline Token tokenizeStringLiteral(); | 
| 142 |         inline Token tokenizeNumberLiteral(); | 
| 143 |  | 
| 144 |         /** | 
| 145 |          * @returns the character @p length characters from the current | 
| 146 |          * position. | 
| 147 |          */ | 
| 148 |         inline char peekAhead(const int length = 1) const; | 
| 149 |  | 
| 150 |         /** | 
| 151 |          * @returns whether the stream, starting from @p offset from the | 
| 152 |          * current position, matches @p chs. The length of @p chs is @p len. | 
| 153 |          */ | 
| 154 |         inline bool aheadEquals(const char *const chs, | 
| 155 |                                 const int len, | 
| 156 |                                 const int offset = 1) const; | 
| 157 |  | 
| 158 |         inline Token tokenizeNCName(); | 
| 159 |         static inline bool isOperatorKeyword(const TokenType); | 
| 160 |  | 
| 161 |         static inline bool isDigit(const char ch); | 
| 162 |         static inline Token error(); | 
| 163 |         inline TokenType consumeWhitespace(); | 
| 164 |  | 
| 165 |         /** | 
| 166 |          * @short Returns the character at the current position, converted to | 
| 167 |          * @c ASCII. | 
| 168 |          * | 
| 169 |          * Equivalent to calling: | 
| 170 |          * | 
| 171 |          * @code | 
| 172 |          * current().toLatin1(); | 
| 173 |          * @endcode | 
| 174 |          */ | 
| 175 |         inline char peekCurrent() const; | 
| 176 |  | 
| 177 |         /** | 
| 178 |          * Disregarding encoding conversion, equivalent to calling: | 
| 179 |          * | 
| 180 |          * @code | 
| 181 |          * peekAhead(0); | 
| 182 |          * @endcode | 
| 183 |          */ | 
| 184 |         inline const QChar current() const; | 
| 185 |  | 
| 186 |         /** | 
| 187 |          * @p hadWhitespace is always set to a proper value. | 
| 188 |          * | 
| 189 |          * @returns the length of whitespace scanned before reaching "::", or | 
| 190 |          * -1 if something else was found. | 
| 191 |          */ | 
| 192 |         int peekForColonColon() const; | 
| 193 |  | 
| 194 |         static inline bool isNCNameStart(const QChar ch); | 
| 195 |         static inline bool isNCNameBody(const QChar ch); | 
| 196 |         static inline const TokenMap *lookupKeyword(const QString &keyword); | 
| 197 |         inline void popState(); | 
| 198 |         inline void pushState(const State state); | 
| 199 |         inline State state() const; | 
| 200 |         inline void setState(const State s); | 
| 201 |         static bool isTypeToken(const TokenType t); | 
| 202 |  | 
| 203 |         inline Token tokenizeNCNameOrQName(); | 
| 204 |         /** | 
| 205 |          * Advances m_pos until content is encountered. | 
| 206 |          * | 
| 207 |          * Returned is the length stretching from m_pos when starting, until | 
| 208 |          * @p content is encountered. @p content is not included in the length. | 
| 209 |          */ | 
| 210 |         int scanUntil(const char *const content); | 
| 211 |  | 
| 212 |         /** | 
| 213 |          * Same as calling: | 
| 214 |          * @code | 
| 215 |          * pushState(currentState()); | 
| 216 |          * @endcode | 
| 217 |          */ | 
| 218 |         inline void pushState(); | 
| 219 |  | 
| 220 |         /** | 
| 221 |          * Consumes only whitespace, in the traditional sense. The function exits | 
| 222 |          * if non-whitespace is encountered, such as the start of a comment. | 
| 223 |          * | 
| 224 |          * @returns @c true if the end was reached, otherwise @c false | 
| 225 |          */ | 
| 226 |         inline bool consumeRawWhitespace(); | 
| 227 |  | 
| 228 |         /** | 
| 229 |          * @short Parses comments: <tt>(: comment content :)</tt>. It recurses for | 
| 230 |          * parsing nested comments. | 
| 231 |          * | 
| 232 |          * It is assumed that the start token for the comment, "(:", has | 
| 233 |          * already been parsed. | 
| 234 |          * | 
| 235 |          * Typically, don't call this function, but ignoreWhitespace(). | 
| 236 |          * | 
| 237 |          * @see <a href="http://www.w3.org/TR/xpath20/#comments">XML Path Language (XPath) | 
| 238 |          * 2.0, 2.6 Comments</a> | 
| 239 |          * @returns | 
| 240 |          * - SUCCESS if everything went ok | 
| 241 |          * - ERROR if there was an error in parsing one or more comments | 
| 242 |          * - END_OF_FILE if the end was reached | 
| 243 |          */ | 
| 244 |         Tokenizer::TokenType (); | 
| 245 |  | 
| 246 |         /** | 
| 247 |          * Determines whether @p code is a keyword | 
| 248 |          * that is followed by a second keyword. For instance <tt>declare | 
| 249 |          * function</tt>. | 
| 250 |          */ | 
| 251 |         static inline bool isPhraseKeyword(const TokenType code); | 
| 252 |  | 
| 253 |         /** | 
| 254 |          * A set of indexes into a QString, the one being passed to | 
| 255 |          * normalizeEOL() whose characters shouldn't be normalized. */ | 
| 256 |         typedef QSet<int> CharacterSkips; | 
| 257 |  | 
| 258 |         /** | 
| 259 |          * Returns @p input, normalized according to | 
| 260 |          * <a href="http://www.w3.org/TR/xquery/#id-eol-handling">XQuery 1.0: | 
| 261 |          * An XML Query Language, A.2.3 End-of-Line Handling</a> | 
| 262 |          */ | 
| 263 |         static QString normalizeEOL(const QString &input, | 
| 264 |                                     const CharacterSkips &characterSkips); | 
| 265 |  | 
| 266 |         inline bool atEnd() const | 
| 267 |         { | 
| 268 |             return m_pos == m_length; | 
| 269 |         } | 
| 270 |  | 
| 271 |         Token nextToken(); | 
| 272 |         /** | 
| 273 |          * Instead of recognizing and tokenizing embedded expressions in | 
| 274 |          * direct attriute constructors, this function is essentially a mini | 
| 275 |          * recursive-descent parser that has the necessary logic to recognize | 
| 276 |          * embedded expressions and their potentially interfering string literals, in | 
| 277 |          * order to scan to the very end of the attribute value, and return the | 
| 278 |          * whole as a string. | 
| 279 |          * | 
| 280 |          * There is of course syntax errors this function will not detect, but | 
| 281 |          * that is ok since the attributes will be parsed once more. | 
| 282 |          * | 
| 283 |          * An inelegant solution, but which gets the job done. | 
| 284 |          * | 
| 285 |          * @see commenceScanOnly(), resumeTokenizationFrom() | 
| 286 |          */ | 
| 287 |         Token attributeAsRaw(const QChar separator, | 
| 288 |                              int &stack, | 
| 289 |                              const int startPos, | 
| 290 |                              const bool inLiteral, | 
| 291 |                              QString &result); | 
| 292 |  | 
| 293 |         const QString           m_data; | 
| 294 |         const int               m_length; | 
| 295 |         State                   m_state; | 
| 296 |         QStack<State>           m_stateStack; | 
| 297 |         int                     m_pos; | 
| 298 |  | 
| 299 |         /** | 
| 300 |          * The current line number. | 
| 301 |          * | 
| 302 |          * The line number and column number both starts at 1. | 
| 303 |          */ | 
| 304 |         int                     m_line; | 
| 305 |  | 
| 306 |         /** | 
| 307 |          * The offset into m_length for where | 
| 308 |          * the current column starts. So m_length - m_columnOffset | 
| 309 |          * is the current column. | 
| 310 |          * | 
| 311 |          * The line number and column number both starts at 1. | 
| 312 |          */ | 
| 313 |         int                     m_columnOffset; | 
| 314 |  | 
| 315 |         const NamePool::Ptr     m_namePool; | 
| 316 |         QStack<Token>           m_tokenStack; | 
| 317 |         QHash<QString, QChar>   m_charRefs; | 
| 318 |         bool                    m_scanOnly; | 
| 319 |  | 
| 320 |         Q_DISABLE_COPY(XQueryTokenizer) | 
| 321 |     }; | 
| 322 | } | 
| 323 |  | 
| 324 | QT_END_NAMESPACE | 
| 325 |  | 
| 326 | #endif | 
| 327 |  |