1/****************************************************************************
2**
3** Copyright (C) 2016 The Qt Company Ltd.
4** Contact: https://www.qt.io/licensing/
5**
6** This file is part of the QtXmlPatterns module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial License Usage
10** Licensees holding valid commercial Qt licenses may use this file in
11** accordance with the commercial license agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and The Qt Company. For licensing terms
14** and conditions see https://www.qt.io/terms-conditions. For further
15** information use the contact form at https://www.qt.io/contact-us.
16**
17** GNU Lesser General Public License Usage
18** Alternatively, this file may be used under the terms of the GNU Lesser
19** General Public License version 3 as published by the Free Software
20** Foundation and appearing in the file LICENSE.LGPL3 included in the
21** packaging of this file. Please review the following information to
22** ensure the GNU Lesser General Public License version 3 requirements
23** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24**
25** GNU General Public License Usage
26** Alternatively, this file may be used under the terms of the GNU
27** General Public License version 2.0 or (at your option) the GNU General
28** Public license version 3 or any later version approved by the KDE Free
29** Qt Foundation. The licenses are as published by the Free Software
30** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31** included in the packaging of this file. Please review the following
32** information to ensure the GNU General Public License requirements will
33** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34** https://www.gnu.org/licenses/gpl-3.0.html.
35**
36** $QT_END_LICENSE$
37**
38****************************************************************************/
39
40//
41// W A R N I N G
42// -------------
43//
44// This file is not part of the Qt API. It exists purely as an
45// implementation detail. This header file may change from version to
46// version without notice, or even be removed.
47//
48// We mean it.
49#ifndef Patternist_XQueryTokenizer_H
50#define Patternist_XQueryTokenizer_H
51
52#include <QHash>
53#include <QSet>
54#include <QStack>
55#include <QString>
56#include <QUrl>
57
58#include <private/qtokenizer_p.h>
59
60QT_BEGIN_NAMESPACE
61
62namespace QPatternist
63{
 /**
 * Forward declaration only; the definition is not part of this header.
 * It is used below as the pointee of lookupKeyword()'s return type.
 */
64 struct TokenMap;
65
66 /**
67 * @short A hand-written tokenizer which tokenizes XQuery 1.0 & XPath 2.0,
68 * and delivers tokens to the Bison generated parser.
69 *
 * The class derives from Tokenizer and is non-copyable (see the
 * Q_DISABLE_COPY at the end of the class). Only atEnd() is defined in
 * this header; every other member function is merely declared here.
 *
70 * @author Frans Englich <frans.englich@nokia.com>
71 */
72 class XQueryTokenizer : public Tokenizer
73 {
74 public:
75 /**
76 * Tokenizer states. Organized alphabetically.
 *
 * No enumerator has an explicit value, so each one's numeric value
 * follows from its position in this list.
77 */
78 enum State
79 {
80 AfterAxisSeparator,
81 AposAttributeContent,
82 Axis,
83 Default,
84 ElementContent,
85 EndTag,
86 ItemType,
87 KindTest,
88 KindTestForPI,
89 NamespaceDecl,
90 NamespaceKeyword,
91 OccurrenceIndicator,
92 Operator,
93 Pragma,
94 PragmaContent,
95 ProcessingInstructionContent,
96 ProcessingInstructionName,
97 QuotAttributeContent,
98 StartTag,
99 VarName,
100 XMLComment,
101 XMLSpaceDecl,
102 XQueryVersion
103 };
104
 /**
 * Creates a tokenizer.
 *
 * @param query NOTE(review): presumably the XQuery/XPath text to
 * tokenize — confirm against the definition.
 * @param location NOTE(review): presumably the URI that @p query
 * originates from — confirm against the definition.
 * @param startingState defaults to Default when omitted; presumably the
 * state tokenization begins in — confirm against the definition.
 */
105 XQueryTokenizer(const QString &query,
106 const QUrl &location,
107 const State startingState = Default);
108
 /**
 * Returns the next token.
 *
 * @param sourceLocator NOTE(review): presumably filled in with the
 * source position of the returned token; this cannot be verified from
 * the header — confirm.
 */
109 virtual Token nextToken(XPATHLTYPE *const sourceLocator);

 /**
 * NOTE(review): undocumented in the original. The documentation of
 * attributeAsRaw() refers to this function together with
 * resumeTokenizationFrom(), so the returned int is presumably a
 * position that is later handed to resumeTokenizationFrom() — confirm.
 */
110 virtual int commenceScanOnly();

 /**
 * NOTE(review): undocumented in the original; presumably continues
 * tokenizing at @p position — confirm against the definition.
 */
111 virtual void resumeTokenizationFrom(const int position);
112
113 /**
114 * Does nothing.
115 */
116 virtual void setParserContext(const ParserContext::Ptr &parseInfo);
117
118 private:
119
120 /**
121 * Returns the character corresponding to the builtin reference @p
122 * reference. For instance, passing @c gt will give you '>' in return.
123 *
124 * If @p reference is an invalid character reference, a null QChar is
125 * returned.
126 *
127 * @see QChar::isNull()
128 */
129 QChar charForReference(const QString &reference);
130
 /**
 * NOTE(review): the three helpers below are undocumented in the
 * original. Judging by their names and parameters they produce a token
 * of type @p code and, respectively, switch to @p state and/or move the
 * position forward by @p advance characters (default 1) — confirm
 * against the definitions.
 */
131 inline Token tokenAndChangeState(const TokenType code,
132 const State state,
133 const int advance = 1);
134 inline Token tokenAndChangeState(const TokenType code,
135 const QString &value,
136 const State state);
137 inline Token tokenAndAdvance(const TokenType code,
138 const int advance = 1);

 /**
 * NOTE(review): undocumented in the original; returns a QString,
 * presumably the text a character reference stands for — confirm.
 */
139 QString tokenizeCharacterReference();
140
 /**
 * Tokenizers for string and number literals. Both return a Token; their
 * definitions are not part of this header.
 */
141 inline Token tokenizeStringLiteral();
142 inline Token tokenizeNumberLiteral();
143
144 /**
145 * @returns the character @p length characters from the current
146 * position. Note that the result is a @c char, not a QChar. When
 * omitted, @p length is 1.
147 */
148 inline char peekAhead(const int length = 1) const;
149
150 /**
151 * @returns whether the stream, starting from @p offset from the
152 * current position, matches @p chs. The length of @p chs is @p len.
 * When omitted, @p offset is 1.
153 */
154 inline bool aheadEquals(const char *const chs,
155 const int len,
156 const int offset = 1) const;
157
 /**
 * NOTE(review): the helpers below, up to and including
 * consumeWhitespace(), are undocumented in the original and only
 * declared here; their exact contracts must be read from the
 * definitions.
 */
158 inline Token tokenizeNCName();
159 static inline bool isOperatorKeyword(const TokenType);
160
161 static inline bool isDigit(const char ch);
162 static inline Token error();
163 inline TokenType consumeWhitespace();
164
165 /**
166 * @short Returns the character at the current position as a @c char.
167 * The original text says "converted to @c ASCII"; the snippet below
 * uses toLatin1(), i.e. a Latin-1 conversion.
168 *
169 * Equivalent to calling:
170 *
171 * @code
172 * current().toLatin1();
173 * @endcode
174 */
175 inline char peekCurrent() const;
176
177 /**
178 * Disregarding encoding conversion, equivalent to calling:
179 *
180 * @code
181 * peekAhead(0);
182 * @endcode
183 */
184 inline const QChar current() const;
185
186 /**
187 * NOTE(review): the original text stated that "@p hadWhitespace is
 * always set to a proper value", but this declaration takes no
 * parameters; that sentence looks like a leftover from an earlier
 * signature.
188 *
189 * @returns the length of whitespace scanned before reaching "::", or
190 * -1 if something else was found.
191 */
192 int peekForColonColon() const;
193
 /**
 * NOTE(review): the predicates, the keyword lookup and the state
 * accessors below are undocumented in the original. m_state and
 * m_stateStack are presumably what state(), setState(), pushState() and
 * popState() operate on — confirm against the definitions.
 */
194 static inline bool isNCNameStart(const QChar ch);
195 static inline bool isNCNameBody(const QChar ch);
196 static inline const TokenMap *lookupKeyword(const QString &keyword);
197 inline void popState();
198 inline void pushState(const State state);
199 inline State state() const;
200 inline void setState(const State s);
201 static bool isTypeToken(const TokenType t);
202
203 inline Token tokenizeNCNameOrQName();
204 /**
205 * Advances m_pos until @p content is encountered.
206 *
207 * Returned is the length stretching from m_pos when starting, until
208 * @p content is encountered. @p content is not included in the length.
209 */
210 int scanUntil(const char *const content);
211
212 /**
213 * Same as calling:
214 * @code
215 * pushState(currentState());
216 * @endcode
 *
 * NOTE(review): no currentState() is declared in this class; state() is
 * presumably what the snippet means — confirm.
217 */
218 inline void pushState();
219
220 /**
221 * Consumes only whitespace, in the traditional sense. The function exits
222 * if non-whitespace is encountered, such as the start of a comment.
223 *
224 * @returns @c true if the end was reached, otherwise @c false
225 */
226 inline bool consumeRawWhitespace();
227
228 /**
229 * @short Parses comments: <tt>(: comment content :)</tt>. It recurses for
230 * parsing nested comments.
231 *
232 * It is assumed that the start token for the comment, "(:", has
233 * already been parsed.
234 *
235 * Typically, don't call this function, but ignoreWhitespace().
 * NOTE(review): no ignoreWhitespace() is declared in this class;
 * consumeWhitespace() is presumably the function meant — confirm.
236 *
237 * @see <a href="http://www.w3.org/TR/xpath20/#comments">XML Path Language (XPath)
238 * 2.0, 2.6 Comments</a>
239 * @returns
240 * - SUCCESS if everything went ok
241 * - ERROR if there was an error in parsing one or more comments
242 * - END_OF_FILE if the end was reached
243 */
244 Tokenizer::TokenType consumeComment();
245
246 /**
247 * Determines whether @p code is a keyword
248 * that is followed by a second keyword. For instance <tt>declare
249 * function</tt>.
250 */
251 static inline bool isPhraseKeyword(const TokenType code);
252
253 /**
254 * A set of indexes into a QString, the one being passed to
255 * normalizeEOL(), whose characters shouldn't be normalized. */
256 typedef QSet<int> CharacterSkips;
257
258 /**
259 * Returns @p input, normalized according to
260 * <a href="http://www.w3.org/TR/xquery/#id-eol-handling">XQuery 1.0:
261 * An XML Query Language, A.2.3 End-of-Line Handling</a>
 *
 * @param characterSkips indexes into @p input whose characters
 * shouldn't be normalized, see CharacterSkips.
262 */
263 static QString normalizeEOL(const QString &input,
264 const CharacterSkips &characterSkips);
265
 /**
 * @returns @c true when m_pos equals m_length, otherwise @c false.
 */
266 inline bool atEnd() const
267 {
268 return m_pos == m_length;
269 }
270
 /**
 * Private overload that takes no source locator.
 *
 * NOTE(review): presumably the worker behind the public
 * nextToken(XPATHLTYPE *const) — confirm against the definition.
 */
271 Token nextToken();
272 /**
273 * Instead of recognizing and tokenizing embedded expressions in
274 * direct attribute constructors, this function is essentially a mini
275 * recursive-descent parser that has the necessary logic to recognize
276 * embedded expressions and their potentially interfering string literals, in
277 * order to scan to the very end of the attribute value, and return the
278 * whole as a string.
279 *
280 * There are of course syntax errors this function will not detect, but
281 * that is ok since the attributes will be parsed once more.
282 *
283 * An inelegant solution, but which gets the job done.
284 *
 * NOTE(review): the parameters are undocumented in the original.
 * @p stack and @p result are non-const references, so the call is able
 * to modify the caller's variables through them.
 *
285 * @see commenceScanOnly(), resumeTokenizationFrom()
286 */
287 Token attributeAsRaw(const QChar separator,
288 int &stack,
289 const int startPos,
290 const bool inLiteral,
291 QString &result);
292
 /**
 * NOTE(review): presumably the text being tokenized and its length,
 * set once by the constructor (both are const) — confirm. atEnd()
 * compares m_pos against m_length.
 */
293 const QString m_data;
294 const int m_length;

 /**
 * A State value plus a stack of State values.
 *
 * NOTE(review): presumably the current state and the states saved by
 * pushState() and restored by popState() — confirm.
 */
295 State m_state;
296 QStack<State> m_stateStack;

 /**
 * The position that scanUntil() advances and that atEnd() compares
 * against m_length.
 */
297 int m_pos;
298
299 /**
300 * The current line number.
301 *
302 * The line number and column number both start at 1.
303 */
304 int m_line;
305
306 /**
307 * The offset for where the current column starts.
308 *
 * NOTE(review): the original text described this as an offset "into
 * m_length" and gave the current column as m_length - m_columnOffset.
309 * m_length is const, so m_pos - m_columnOffset was presumably meant;
 * confirm against the implementation.
310 *
311 * The line number and column number both start at 1.
312 */
313 int m_columnOffset;
314
 /**
 * NOTE(review): the four members below are undocumented in the
 * original. m_charRefs maps QString keys to QChar values and is
 * presumably the table behind charForReference(); m_scanOnly is
 * presumably related to commenceScanOnly() — confirm.
 */
315 const NamePool::Ptr m_namePool;
316 QStack<Token> m_tokenStack;
317 QHash<QString, QChar> m_charRefs;
318 bool m_scanOnly;
319
320 Q_DISABLE_COPY(XQueryTokenizer)
321 };
322}
323
324QT_END_NAMESPACE
325
326#endif
327

// Source code of qtxmlpatterns/src/xmlpatterns/parser/qxquerytokenizer_p.h