| 1 | /**************************************************************************** |
| 2 | ** |
| 3 | ** Copyright (C) 2016 The Qt Company Ltd. |
| 4 | ** Contact: https://www.qt.io/licensing/ |
| 5 | ** |
| 6 | ** This file is part of the QtXmlPatterns module of the Qt Toolkit. |
| 7 | ** |
| 8 | ** $QT_BEGIN_LICENSE:LGPL$ |
| 9 | ** Commercial License Usage |
| 10 | ** Licensees holding valid commercial Qt licenses may use this file in |
| 11 | ** accordance with the commercial license agreement provided with the |
| 12 | ** Software or, alternatively, in accordance with the terms contained in |
| 13 | ** a written agreement between you and The Qt Company. For licensing terms |
| 14 | ** and conditions see https://www.qt.io/terms-conditions. For further |
| 15 | ** information use the contact form at https://www.qt.io/contact-us. |
| 16 | ** |
| 17 | ** GNU Lesser General Public License Usage |
| 18 | ** Alternatively, this file may be used under the terms of the GNU Lesser |
| 19 | ** General Public License version 3 as published by the Free Software |
| 20 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the |
| 21 | ** packaging of this file. Please review the following information to |
| 22 | ** ensure the GNU Lesser General Public License version 3 requirements |
| 23 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. |
| 24 | ** |
| 25 | ** GNU General Public License Usage |
| 26 | ** Alternatively, this file may be used under the terms of the GNU |
| 27 | ** General Public License version 2.0 or (at your option) the GNU General |
| 28 | ** Public license version 3 or any later version approved by the KDE Free |
| 29 | ** Qt Foundation. The licenses are as published by the Free Software |
| 30 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 |
| 31 | ** included in the packaging of this file. Please review the following |
| 32 | ** information to ensure the GNU General Public License requirements will |
| 33 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and |
| 34 | ** https://www.gnu.org/licenses/gpl-3.0.html. |
| 35 | ** |
| 36 | ** $QT_END_LICENSE$ |
| 37 | ** |
| 38 | ****************************************************************************/ |
| 39 | |
| 40 | // |
| 41 | // W A R N I N G |
| 42 | // ------------- |
| 43 | // |
| 44 | // This file is not part of the Qt API. It exists purely as an |
| 45 | // implementation detail. This header file may change from version to |
| 46 | // version without notice, or even be removed. |
| 47 | // |
| 48 | // We mean it. |
| 49 | #ifndef Patternist_XQueryTokenizer_H |
| 50 | #define Patternist_XQueryTokenizer_H |
| 51 | |
| 52 | #include <QHash> |
| 53 | #include <QSet> |
| 54 | #include <QStack> |
| 55 | #include <QString> |
| 56 | #include <QUrl> |
| 57 | |
| 58 | #include <private/qtokenizer_p.h> |
| 59 | |
| 60 | QT_BEGIN_NAMESPACE |
| 61 | |
| 62 | namespace QPatternist |
| 63 | { |
| 64 | struct TokenMap; |
| 65 | |
| 66 | /** |
| 67 | * @short A hand-written tokenizer which tokenizes XQuery 1.0 & XPath 2.0, |
| 68 | * and delivers tokens to the Bison generated parser. |
| 69 | * |
| 70 | * @author Frans Englich <frans.englich@nokia.com> |
| 71 | */ |
| 72 | class XQueryTokenizer : public Tokenizer |
| 73 | { |
| 74 | public: |
| 75 | /** |
| 76 | * Tokenizer states. Organized alphabetically. |
| 77 | */ |
| 78 | enum State |
| 79 | { |
| 80 | AfterAxisSeparator, |
| 81 | AposAttributeContent, |
| 82 | Axis, |
| 83 | Default, |
| 84 | ElementContent, |
| 85 | EndTag, |
| 86 | ItemType, |
| 87 | KindTest, |
| 88 | KindTestForPI, |
| 89 | NamespaceDecl, |
| 90 | NamespaceKeyword, |
| 91 | OccurrenceIndicator, |
| 92 | Operator, |
| 93 | Pragma, |
| 94 | PragmaContent, |
| 95 | ProcessingInstructionContent, |
| 96 | ProcessingInstructionName, |
| 97 | QuotAttributeContent, |
| 98 | StartTag, |
| 99 | VarName, |
| 100 | , |
| 101 | XMLSpaceDecl, |
| 102 | XQueryVersion |
| 103 | }; |
| 104 | |
| 105 | XQueryTokenizer(const QString &query, |
| 106 | const QUrl &location, |
| 107 | const State startingState = Default); |
| 108 | |
| 109 | virtual Token nextToken(XPATHLTYPE *const sourceLocator); |
| 110 | virtual int commenceScanOnly(); |
| 111 | virtual void resumeTokenizationFrom(const int position); |
| 112 | |
| 113 | /** |
| 114 | * Does nothing. |
| 115 | */ |
| 116 | virtual void setParserContext(const ParserContext::Ptr &parseInfo); |
| 117 | |
| 118 | private: |
| 119 | |
| 120 | /** |
| 121 | * Returns the character corresponding to the builtin reference @p |
| 122 | * reference. For instance, passing @c gt will give you '>' in return. |
| 123 | * |
| 124 | * If @p reference is an invalid character reference, a null QChar is |
| 125 | * returned. |
| 126 | * |
| 127 | * @see QChar::isNull() |
| 128 | */ |
| 129 | QChar charForReference(const QString &reference); |
| 130 | |
| 131 | inline Token tokenAndChangeState(const TokenType code, |
| 132 | const State state, |
| 133 | const int advance = 1); |
| 134 | inline Token tokenAndChangeState(const TokenType code, |
| 135 | const QString &value, |
| 136 | const State state); |
| 137 | inline Token tokenAndAdvance(const TokenType code, |
| 138 | const int advance = 1); |
| 139 | QString tokenizeCharacterReference(); |
| 140 | |
| 141 | inline Token tokenizeStringLiteral(); |
| 142 | inline Token tokenizeNumberLiteral(); |
| 143 | |
| 144 | /** |
| 145 | * @returns the character @p length characters from the current |
| 146 | * position. |
| 147 | */ |
| 148 | inline char peekAhead(const int length = 1) const; |
| 149 | |
| 150 | /** |
| 151 | * @returns whether the stream, starting from @p offset from the |
| 152 | * current position, matches @p chs. The length of @p chs is @p len. |
| 153 | */ |
| 154 | inline bool aheadEquals(const char *const chs, |
| 155 | const int len, |
| 156 | const int offset = 1) const; |
| 157 | |
| 158 | inline Token tokenizeNCName(); |
| 159 | static inline bool isOperatorKeyword(const TokenType); |
| 160 | |
| 161 | static inline bool isDigit(const char ch); |
| 162 | static inline Token error(); |
| 163 | inline TokenType consumeWhitespace(); |
| 164 | |
| 165 | /** |
| 166 | * @short Returns the character at the current position, converted to |
| 167 | * @c ASCII. |
| 168 | * |
| 169 | * Equivalent to calling: |
| 170 | * |
| 171 | * @code |
| 172 | * current().toLatin1(); |
| 173 | * @endcode |
| 174 | */ |
| 175 | inline char peekCurrent() const; |
| 176 | |
| 177 | /** |
| 178 | * Disregarding encoding conversion, equivalent to calling: |
| 179 | * |
| 180 | * @code |
| 181 | * peekAhead(0); |
| 182 | * @endcode |
| 183 | */ |
| 184 | inline const QChar current() const; |
| 185 | |
| 186 | /** |
| 187 | * @p hadWhitespace is always set to a proper value. |
| 188 | * |
| 189 | * @returns the length of whitespace scanned before reaching "::", or |
| 190 | * -1 if something else was found. |
| 191 | */ |
| 192 | int peekForColonColon() const; |
| 193 | |
| 194 | static inline bool isNCNameStart(const QChar ch); |
| 195 | static inline bool isNCNameBody(const QChar ch); |
| 196 | static inline const TokenMap *lookupKeyword(const QString &keyword); |
| 197 | inline void popState(); |
| 198 | inline void pushState(const State state); |
| 199 | inline State state() const; |
| 200 | inline void setState(const State s); |
| 201 | static bool isTypeToken(const TokenType t); |
| 202 | |
| 203 | inline Token tokenizeNCNameOrQName(); |
| 204 | /** |
| 205 | * Advances m_pos until content is encountered. |
| 206 | * |
| 207 | * Returned is the length stretching from m_pos when starting, until |
| 208 | * @p content is encountered. @p content is not included in the length. |
| 209 | */ |
| 210 | int scanUntil(const char *const content); |
| 211 | |
| 212 | /** |
| 213 | * Same as calling: |
| 214 | * @code |
| 215 | * pushState(currentState()); |
| 216 | * @endcode |
| 217 | */ |
| 218 | inline void pushState(); |
| 219 | |
| 220 | /** |
| 221 | * Consumes only whitespace, in the traditional sense. The function exits |
| 222 | * if non-whitespace is encountered, such as the start of a comment. |
| 223 | * |
| 224 | * @returns @c true if the end was reached, otherwise @c false |
| 225 | */ |
| 226 | inline bool consumeRawWhitespace(); |
| 227 | |
| 228 | /** |
| 229 | * @short Parses comments: <tt>(: comment content :)</tt>. It recurses for |
| 230 | * parsing nested comments. |
| 231 | * |
| 232 | * It is assumed that the start token for the comment, "(:", has |
| 233 | * already been parsed. |
| 234 | * |
| 235 | * Typically, don't call this function, but ignoreWhitespace(). |
| 236 | * |
| 237 | * @see <a href="http://www.w3.org/TR/xpath20/#comments">XML Path Language (XPath) |
| 238 | * 2.0, 2.6 Comments</a> |
| 239 | * @returns |
| 240 | * - SUCCESS if everything went ok |
| 241 | * - ERROR if there was an error in parsing one or more comments |
| 242 | * - END_OF_FILE if the end was reached |
| 243 | */ |
| 244 | Tokenizer::TokenType (); |
| 245 | |
| 246 | /** |
| 247 | * Determines whether @p code is a keyword |
| 248 | * that is followed by a second keyword. For instance <tt>declare |
| 249 | * function</tt>. |
| 250 | */ |
| 251 | static inline bool isPhraseKeyword(const TokenType code); |
| 252 | |
| 253 | /** |
| 254 | * A set of indexes into a QString, the one being passed to |
| 255 | * normalizeEOL() whose characters shouldn't be normalized. */ |
| 256 | typedef QSet<int> CharacterSkips; |
| 257 | |
| 258 | /** |
| 259 | * Returns @p input, normalized according to |
| 260 | * <a href="http://www.w3.org/TR/xquery/#id-eol-handling">XQuery 1.0: |
| 261 | * An XML Query Language, A.2.3 End-of-Line Handling</a> |
| 262 | */ |
| 263 | static QString normalizeEOL(const QString &input, |
| 264 | const CharacterSkips &characterSkips); |
| 265 | |
| 266 | inline bool atEnd() const |
| 267 | { |
| 268 | return m_pos == m_length; |
| 269 | } |
| 270 | |
| 271 | Token nextToken(); |
| 272 | /** |
| 273 | * Instead of recognizing and tokenizing embedded expressions in |
| 274 | * direct attriute constructors, this function is essentially a mini |
| 275 | * recursive-descent parser that has the necessary logic to recognize |
| 276 | * embedded expressions and their potentially interfering string literals, in |
| 277 | * order to scan to the very end of the attribute value, and return the |
| 278 | * whole as a string. |
| 279 | * |
| 280 | * There is of course syntax errors this function will not detect, but |
| 281 | * that is ok since the attributes will be parsed once more. |
| 282 | * |
| 283 | * An inelegant solution, but which gets the job done. |
| 284 | * |
| 285 | * @see commenceScanOnly(), resumeTokenizationFrom() |
| 286 | */ |
| 287 | Token attributeAsRaw(const QChar separator, |
| 288 | int &stack, |
| 289 | const int startPos, |
| 290 | const bool inLiteral, |
| 291 | QString &result); |
| 292 | |
| 293 | const QString m_data; |
| 294 | const int m_length; |
| 295 | State m_state; |
| 296 | QStack<State> m_stateStack; |
| 297 | int m_pos; |
| 298 | |
| 299 | /** |
| 300 | * The current line number. |
| 301 | * |
| 302 | * The line number and column number both starts at 1. |
| 303 | */ |
| 304 | int m_line; |
| 305 | |
| 306 | /** |
| 307 | * The offset into m_length for where |
| 308 | * the current column starts. So m_length - m_columnOffset |
| 309 | * is the current column. |
| 310 | * |
| 311 | * The line number and column number both starts at 1. |
| 312 | */ |
| 313 | int m_columnOffset; |
| 314 | |
| 315 | const NamePool::Ptr m_namePool; |
| 316 | QStack<Token> m_tokenStack; |
| 317 | QHash<QString, QChar> m_charRefs; |
| 318 | bool m_scanOnly; |
| 319 | |
| 320 | Q_DISABLE_COPY(XQueryTokenizer) |
| 321 | }; |
| 322 | } |
| 323 | |
| 324 | QT_END_NAMESPACE |
| 325 | |
| 326 | #endif |
| 327 | |