1 | /**************************************************************************** |
2 | ** |
3 | ** Copyright (C) 2016 The Qt Company Ltd. |
4 | ** Contact: https://www.qt.io/licensing/ |
5 | ** |
6 | ** This file is part of the QtXmlPatterns module of the Qt Toolkit. |
7 | ** |
8 | ** $QT_BEGIN_LICENSE:LGPL$ |
9 | ** Commercial License Usage |
10 | ** Licensees holding valid commercial Qt licenses may use this file in |
11 | ** accordance with the commercial license agreement provided with the |
12 | ** Software or, alternatively, in accordance with the terms contained in |
13 | ** a written agreement between you and The Qt Company. For licensing terms |
14 | ** and conditions see https://www.qt.io/terms-conditions. For further |
15 | ** information use the contact form at https://www.qt.io/contact-us. |
16 | ** |
17 | ** GNU Lesser General Public License Usage |
18 | ** Alternatively, this file may be used under the terms of the GNU Lesser |
19 | ** General Public License version 3 as published by the Free Software |
20 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the |
21 | ** packaging of this file. Please review the following information to |
22 | ** ensure the GNU Lesser General Public License version 3 requirements |
23 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. |
24 | ** |
25 | ** GNU General Public License Usage |
26 | ** Alternatively, this file may be used under the terms of the GNU |
27 | ** General Public License version 2.0 or (at your option) the GNU General |
28 | ** Public license version 3 or any later version approved by the KDE Free |
29 | ** Qt Foundation. The licenses are as published by the Free Software |
30 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 |
31 | ** included in the packaging of this file. Please review the following |
32 | ** information to ensure the GNU General Public License requirements will |
33 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and |
34 | ** https://www.gnu.org/licenses/gpl-3.0.html. |
35 | ** |
36 | ** $QT_END_LICENSE$ |
37 | ** |
38 | ****************************************************************************/ |
39 | |
40 | // |
41 | // W A R N I N G |
42 | // ------------- |
43 | // |
44 | // This file is not part of the Qt API. It exists purely as an |
45 | // implementation detail. This header file may change from version to |
46 | // version without notice, or even be removed. |
47 | // |
48 | // We mean it. |
49 | #ifndef Patternist_XQueryTokenizer_H |
50 | #define Patternist_XQueryTokenizer_H |
51 | |
52 | #include <QHash> |
53 | #include <QSet> |
54 | #include <QStack> |
55 | #include <QString> |
56 | #include <QUrl> |
57 | |
58 | #include <private/qtokenizer_p.h> |
59 | |
60 | QT_BEGIN_NAMESPACE |
61 | |
62 | namespace QPatternist |
63 | { |
64 | struct TokenMap; |
65 | |
66 | /** |
67 | * @short A hand-written tokenizer which tokenizes XQuery 1.0 & XPath 2.0, |
68 | * and delivers tokens to the Bison generated parser. |
69 | * |
70 | * @author Frans Englich <frans.englich@nokia.com> |
71 | */ |
72 | class XQueryTokenizer : public Tokenizer |
73 | { |
74 | public: |
75 | /** |
76 | * Tokenizer states. Organized alphabetically. |
77 | */ |
78 | enum State |
79 | { |
80 | AfterAxisSeparator, |
81 | AposAttributeContent, |
82 | Axis, |
83 | Default, |
84 | ElementContent, |
85 | EndTag, |
86 | ItemType, |
87 | KindTest, |
88 | KindTestForPI, |
89 | NamespaceDecl, |
90 | NamespaceKeyword, |
91 | OccurrenceIndicator, |
92 | Operator, |
93 | Pragma, |
94 | PragmaContent, |
95 | ProcessingInstructionContent, |
96 | ProcessingInstructionName, |
97 | QuotAttributeContent, |
98 | StartTag, |
99 | VarName, |
100 | , |
101 | XMLSpaceDecl, |
102 | XQueryVersion |
103 | }; |
104 | |
105 | XQueryTokenizer(const QString &query, |
106 | const QUrl &location, |
107 | const State startingState = Default); |
108 | |
109 | virtual Token nextToken(XPATHLTYPE *const sourceLocator); |
110 | virtual int commenceScanOnly(); |
111 | virtual void resumeTokenizationFrom(const int position); |
112 | |
113 | /** |
114 | * Does nothing. |
115 | */ |
116 | virtual void setParserContext(const ParserContext::Ptr &parseInfo); |
117 | |
118 | private: |
119 | |
120 | /** |
121 | * Returns the character corresponding to the builtin reference @p |
122 | * reference. For instance, passing @c gt will give you '>' in return. |
123 | * |
124 | * If @p reference is an invalid character reference, a null QChar is |
125 | * returned. |
126 | * |
127 | * @see QChar::isNull() |
128 | */ |
129 | QChar charForReference(const QString &reference); |
130 | |
131 | inline Token tokenAndChangeState(const TokenType code, |
132 | const State state, |
133 | const int advance = 1); |
134 | inline Token tokenAndChangeState(const TokenType code, |
135 | const QString &value, |
136 | const State state); |
137 | inline Token tokenAndAdvance(const TokenType code, |
138 | const int advance = 1); |
139 | QString tokenizeCharacterReference(); |
140 | |
141 | inline Token tokenizeStringLiteral(); |
142 | inline Token tokenizeNumberLiteral(); |
143 | |
144 | /** |
145 | * @returns the character @p length characters from the current |
146 | * position. |
147 | */ |
148 | inline char peekAhead(const int length = 1) const; |
149 | |
150 | /** |
151 | * @returns whether the stream, starting from @p offset from the |
152 | * current position, matches @p chs. The length of @p chs is @p len. |
153 | */ |
154 | inline bool aheadEquals(const char *const chs, |
155 | const int len, |
156 | const int offset = 1) const; |
157 | |
158 | inline Token tokenizeNCName(); |
159 | static inline bool isOperatorKeyword(const TokenType); |
160 | |
161 | static inline bool isDigit(const char ch); |
162 | static inline Token error(); |
163 | inline TokenType consumeWhitespace(); |
164 | |
165 | /** |
166 | * @short Returns the character at the current position, converted to |
167 | * @c ASCII. |
168 | * |
169 | * Equivalent to calling: |
170 | * |
171 | * @code |
172 | * current().toLatin1(); |
173 | * @endcode |
174 | */ |
175 | inline char peekCurrent() const; |
176 | |
177 | /** |
178 | * Disregarding encoding conversion, equivalent to calling: |
179 | * |
180 | * @code |
181 | * peekAhead(0); |
182 | * @endcode |
183 | */ |
184 | inline const QChar current() const; |
185 | |
186 | /** |
187 | * @p hadWhitespace is always set to a proper value. |
188 | * |
189 | * @returns the length of whitespace scanned before reaching "::", or |
190 | * -1 if something else was found. |
191 | */ |
192 | int peekForColonColon() const; |
193 | |
194 | static inline bool isNCNameStart(const QChar ch); |
195 | static inline bool isNCNameBody(const QChar ch); |
196 | static inline const TokenMap *lookupKeyword(const QString &keyword); |
197 | inline void popState(); |
198 | inline void pushState(const State state); |
199 | inline State state() const; |
200 | inline void setState(const State s); |
201 | static bool isTypeToken(const TokenType t); |
202 | |
203 | inline Token tokenizeNCNameOrQName(); |
204 | /** |
205 | * Advances m_pos until content is encountered. |
206 | * |
207 | * Returned is the length stretching from m_pos when starting, until |
208 | * @p content is encountered. @p content is not included in the length. |
209 | */ |
210 | int scanUntil(const char *const content); |
211 | |
212 | /** |
213 | * Same as calling: |
214 | * @code |
215 | * pushState(currentState()); |
216 | * @endcode |
217 | */ |
218 | inline void pushState(); |
219 | |
220 | /** |
221 | * Consumes only whitespace, in the traditional sense. The function exits |
222 | * if non-whitespace is encountered, such as the start of a comment. |
223 | * |
224 | * @returns @c true if the end was reached, otherwise @c false |
225 | */ |
226 | inline bool consumeRawWhitespace(); |
227 | |
228 | /** |
229 | * @short Parses comments: <tt>(: comment content :)</tt>. It recurses for |
230 | * parsing nested comments. |
231 | * |
232 | * It is assumed that the start token for the comment, "(:", has |
233 | * already been parsed. |
234 | * |
235 | * Typically, don't call this function, but ignoreWhitespace(). |
236 | * |
237 | * @see <a href="http://www.w3.org/TR/xpath20/#comments">XML Path Language (XPath) |
238 | * 2.0, 2.6 Comments</a> |
239 | * @returns |
240 | * - SUCCESS if everything went ok |
241 | * - ERROR if there was an error in parsing one or more comments |
242 | * - END_OF_FILE if the end was reached |
243 | */ |
244 | Tokenizer::TokenType (); |
245 | |
246 | /** |
247 | * Determines whether @p code is a keyword |
248 | * that is followed by a second keyword. For instance <tt>declare |
249 | * function</tt>. |
250 | */ |
251 | static inline bool isPhraseKeyword(const TokenType code); |
252 | |
253 | /** |
254 | * A set of indexes into a QString, the one being passed to |
255 | * normalizeEOL() whose characters shouldn't be normalized. */ |
256 | typedef QSet<int> CharacterSkips; |
257 | |
258 | /** |
259 | * Returns @p input, normalized according to |
260 | * <a href="http://www.w3.org/TR/xquery/#id-eol-handling">XQuery 1.0: |
261 | * An XML Query Language, A.2.3 End-of-Line Handling</a> |
262 | */ |
263 | static QString normalizeEOL(const QString &input, |
264 | const CharacterSkips &characterSkips); |
265 | |
266 | inline bool atEnd() const |
267 | { |
268 | return m_pos == m_length; |
269 | } |
270 | |
271 | Token nextToken(); |
272 | /** |
273 | * Instead of recognizing and tokenizing embedded expressions in |
274 | * direct attriute constructors, this function is essentially a mini |
275 | * recursive-descent parser that has the necessary logic to recognize |
276 | * embedded expressions and their potentially interfering string literals, in |
277 | * order to scan to the very end of the attribute value, and return the |
278 | * whole as a string. |
279 | * |
280 | * There is of course syntax errors this function will not detect, but |
281 | * that is ok since the attributes will be parsed once more. |
282 | * |
283 | * An inelegant solution, but which gets the job done. |
284 | * |
285 | * @see commenceScanOnly(), resumeTokenizationFrom() |
286 | */ |
287 | Token attributeAsRaw(const QChar separator, |
288 | int &stack, |
289 | const int startPos, |
290 | const bool inLiteral, |
291 | QString &result); |
292 | |
293 | const QString m_data; |
294 | const int m_length; |
295 | State m_state; |
296 | QStack<State> m_stateStack; |
297 | int m_pos; |
298 | |
299 | /** |
300 | * The current line number. |
301 | * |
302 | * The line number and column number both starts at 1. |
303 | */ |
304 | int m_line; |
305 | |
306 | /** |
307 | * The offset into m_length for where |
308 | * the current column starts. So m_length - m_columnOffset |
309 | * is the current column. |
310 | * |
311 | * The line number and column number both starts at 1. |
312 | */ |
313 | int m_columnOffset; |
314 | |
315 | const NamePool::Ptr m_namePool; |
316 | QStack<Token> m_tokenStack; |
317 | QHash<QString, QChar> m_charRefs; |
318 | bool m_scanOnly; |
319 | |
320 | Q_DISABLE_COPY(XQueryTokenizer) |
321 | }; |
322 | } |
323 | |
324 | QT_END_NAMESPACE |
325 | |
326 | #endif |
327 | |