1 | /**************************************************************************** |
2 | ** |
3 | ** Copyright (C) 2016 The Qt Company Ltd. |
4 | ** Contact: https://www.qt.io/licensing/ |
5 | ** |
6 | ** This file is part of the QtXmlPatterns module of the Qt Toolkit. |
7 | ** |
8 | ** $QT_BEGIN_LICENSE:LGPL$ |
9 | ** Commercial License Usage |
10 | ** Licensees holding valid commercial Qt licenses may use this file in |
11 | ** accordance with the commercial license agreement provided with the |
12 | ** Software or, alternatively, in accordance with the terms contained in |
13 | ** a written agreement between you and The Qt Company. For licensing terms |
14 | ** and conditions see https://www.qt.io/terms-conditions. For further |
15 | ** information use the contact form at https://www.qt.io/contact-us. |
16 | ** |
17 | ** GNU Lesser General Public License Usage |
18 | ** Alternatively, this file may be used under the terms of the GNU Lesser |
19 | ** General Public License version 3 as published by the Free Software |
20 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the |
21 | ** packaging of this file. Please review the following information to |
22 | ** ensure the GNU Lesser General Public License version 3 requirements |
23 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. |
24 | ** |
25 | ** GNU General Public License Usage |
26 | ** Alternatively, this file may be used under the terms of the GNU |
27 | ** General Public License version 2.0 or (at your option) the GNU General |
28 | ** Public license version 3 or any later version approved by the KDE Free |
29 | ** Qt Foundation. The licenses are as published by the Free Software |
30 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 |
31 | ** included in the packaging of this file. Please review the following |
32 | ** information to ensure the GNU General Public License requirements will |
33 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and |
34 | ** https://www.gnu.org/licenses/gpl-3.0.html. |
35 | ** |
36 | ** $QT_END_LICENSE$ |
37 | ** |
38 | ****************************************************************************/ |
39 | |
40 | #include <QByteArray> |
41 | |
42 | #include "qparsercontext_p.h" |
43 | #include "qquerytransformparser_p.h" |
44 | |
45 | #include "qxquerytokenizer_p.h" |
46 | |
47 | #include "qtokenlookup.cpp" |
48 | |
49 | QT_BEGIN_NAMESPACE |
50 | |
51 | namespace QPatternist |
52 | { |
53 | |
/* Skips whitespace and XQuery comments at the current position by calling
 * consumeWhitespace(). When that call reports anything other than T_SUCCESS,
 * the enclosing function returns a Token carrying the reported code, so the
 * macro can only be used inside functions that return a Token. */
#define handleWhitespace()                                  \
    {                                                       \
        const TokenType wsResult = consumeWhitespace();     \
        if (wsResult != T_SUCCESS)                          \
            return Token(wsResult);                         \
    }
60 | |
61 | XQueryTokenizer::XQueryTokenizer(const QString &query, |
62 | const QUrl &location, |
63 | const State startingState) : Tokenizer(location) |
64 | , m_data(query) |
65 | , m_length(query.length()) |
66 | , m_state(startingState) |
67 | , m_pos(0) |
68 | , m_line(1) |
69 | , m_columnOffset(0) |
70 | , m_scanOnly(false) |
71 | { |
72 | Q_ASSERT(location.isValid() || location.isEmpty()); |
73 | } |
74 | |
75 | const QChar XQueryTokenizer::current() const |
76 | { |
77 | if (m_pos < m_length) |
78 | return m_data.at(i: m_pos); |
79 | else |
80 | return QChar(); |
81 | } |
82 | |
83 | char XQueryTokenizer::peekCurrent() const |
84 | { |
85 | return current().toLatin1(); |
86 | } |
87 | |
88 | int XQueryTokenizer::peekForColonColon() const |
89 | { |
90 | /* Note, we don't modify m_pos in this function, so we need to do offset |
91 | * calculations. */ |
92 | int pos = m_pos; |
93 | |
94 | while(pos < m_length) |
95 | { |
96 | switch(m_data.at(i: pos).toLatin1()) |
97 | { |
98 | /* Fallthrough these four. */ |
99 | case ' ': |
100 | case '\t': |
101 | case '\n': |
102 | case '\r': |
103 | break; |
104 | case ':': |
105 | { |
106 | if (peekAhead(length: (pos - m_pos) + 1) == ':') |
107 | return pos - m_pos; |
108 | Q_FALLTHROUGH(); |
109 | } |
110 | default: |
111 | return -1; |
112 | } |
113 | ++pos; |
114 | } |
115 | |
116 | return -1; |
117 | } |
118 | |
119 | Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code, |
120 | const State s, |
121 | const int advance) |
122 | { |
123 | Q_ASSERT(advance >= 0); |
124 | m_pos += advance; |
125 | setState(s); |
126 | return Token(code); |
127 | } |
128 | |
129 | Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code, |
130 | const QString &value, |
131 | const State s) |
132 | { |
133 | setState(s); |
134 | return Token(code, value); |
135 | } |
136 | |
137 | Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code, |
138 | const int advance) |
139 | { |
140 | Q_ASSERT(advance >= 0); |
141 | m_pos += advance; |
142 | return Token(code); |
143 | } |
144 | |
145 | QString XQueryTokenizer::normalizeEOL(const QString &input, |
146 | const CharacterSkips &characterSkips) |
147 | { |
148 | const int len = input.count(); |
149 | QString result; |
150 | |
151 | /* The likely hood is rather high it'll be the same content. */ |
152 | result.reserve(asize: len); |
153 | |
154 | for(int i = 0; i < len; ++i) |
155 | { |
156 | const QChar &at = input.at(i); |
157 | |
158 | if (characterSkips.contains(value: i)) |
159 | { |
160 | result.append(c: at); |
161 | continue; |
162 | } |
163 | switch(input.at(i).unicode()) |
164 | { |
165 | case '\r': |
166 | { |
167 | if (i + 1 < len && input.at(i: i + 1) == QLatin1Char('\n')) |
168 | ++i; |
169 | |
170 | Q_FALLTHROUGH(); |
171 | } |
172 | case '\n': |
173 | { |
174 | result.append(c: QLatin1Char('\n')); |
175 | continue; |
176 | } |
177 | default: |
178 | { |
179 | result.append(c: at); |
180 | } |
181 | } |
182 | } |
183 | |
184 | return result; |
185 | } |
186 | |
187 | Tokenizer::TokenType XQueryTokenizer::() |
188 | { |
189 | /* Below, we return ERROR instead of END_OF_FILE such that the parser |
190 | * sees an invalid comment. */ |
191 | while(m_pos < m_length) |
192 | { |
193 | switch(peekCurrent()) |
194 | { |
195 | case ':': |
196 | { |
197 | ++m_pos; /* Consume ':' */ |
198 | if (atEnd()) |
199 | return T_ERROR; |
200 | |
201 | if (peekCurrent() == ')') |
202 | { |
203 | ++m_pos; /* Consume ')' */ |
204 | return T_SUCCESS; /* The comment closed nicely. */ |
205 | } |
206 | continue; /* We don't want to increment m_pos twice. */ |
207 | } |
208 | case '(': |
209 | { /* It looks like the start of a comment. */ |
210 | ++m_pos; |
211 | |
212 | if (atEnd()) |
213 | return T_END_OF_FILE; |
214 | else if (peekCurrent() == ':') |
215 | { |
216 | /* And it is a nested comment -- parse it. */ |
217 | const TokenType retval = consumeComment(); |
218 | if (retval == T_SUCCESS) |
219 | continue; /* Continue with our "own" comment. */ |
220 | else |
221 | return retval; /* Return the error in the nested comment. */ |
222 | } |
223 | break; |
224 | } |
225 | case '\n': |
226 | case '\r': |
227 | { |
228 | /* We want to count \r\n as a single line break. */ |
229 | if (peekAhead() == '\n') |
230 | ++m_pos; |
231 | |
232 | m_columnOffset = m_pos; |
233 | ++m_line; |
234 | |
235 | break; |
236 | } |
237 | } |
238 | ++m_pos; |
239 | } |
240 | |
241 | return T_ERROR; /* Error: we reached the end while inside a comment. */ |
242 | } |
243 | |
244 | bool XQueryTokenizer::consumeRawWhitespace() |
245 | { |
246 | while(m_pos < m_length) |
247 | { |
248 | switch(peekCurrent()) |
249 | { |
250 | case ' ': |
251 | case '\t': |
252 | break; |
253 | case '\n': |
254 | case '\r': |
255 | { |
256 | if (peekAhead() == '\n') |
257 | ++m_pos; |
258 | |
259 | m_columnOffset = m_pos; |
260 | ++m_line; |
261 | |
262 | break; |
263 | } |
264 | default: |
265 | return false; |
266 | } |
267 | ++m_pos; |
268 | } |
269 | return true; |
270 | } |
271 | |
272 | Tokenizer::TokenType XQueryTokenizer::consumeWhitespace() |
273 | { |
274 | while(m_pos < m_length) |
275 | { |
276 | switch(peekCurrent()) |
277 | { |
278 | case ' ': |
279 | case '\t': |
280 | break; |
281 | case '\n': |
282 | case '\r': |
283 | { |
284 | /* We want to count \r\n as a single line break. */ |
285 | if (peekAhead() == '\n') |
286 | ++m_pos; |
287 | |
288 | m_columnOffset = m_pos; |
289 | ++m_line; |
290 | |
291 | break; |
292 | } |
293 | case '(': |
294 | { |
295 | if (peekAhead() == ':') |
296 | { |
297 | m_pos += 2; /* Consume "(:" */ |
298 | |
299 | const TokenType = consumeComment(); |
300 | if (comment == T_SUCCESS) |
301 | continue; |
302 | else |
303 | return comment; |
304 | } |
305 | Q_FALLTHROUGH(); |
306 | } |
307 | default: |
308 | return T_SUCCESS; |
309 | } |
310 | ++m_pos; |
311 | } |
312 | |
313 | return T_END_OF_FILE; |
314 | } |
315 | |
316 | char XQueryTokenizer::peekAhead(const int length) const |
317 | { |
318 | if (m_pos + length < m_length) |
319 | return m_data.at(i: m_pos + length).toLatin1(); |
320 | else |
321 | return 0; |
322 | } |
323 | |
324 | Tokenizer::Token XQueryTokenizer::error() |
325 | { |
326 | return Token(T_ERROR); |
327 | } |
328 | |
329 | bool XQueryTokenizer::isDigit(const char ch) |
330 | { |
331 | return ch >= '0' && ch <= '9'; |
332 | } |
333 | |
334 | /* Replace with function in QXmlUtils. Write test cases for this. */ |
335 | bool XQueryTokenizer::isNCNameStart(const QChar ch) |
336 | { |
337 | if (ch == QLatin1Char('_')) |
338 | return true; |
339 | |
340 | switch(ch.category()) |
341 | { |
342 | case QChar::Letter_Lowercase: |
343 | case QChar::Letter_Uppercase: |
344 | case QChar::Letter_Other: |
345 | case QChar::Letter_Titlecase: |
346 | case QChar::Number_Letter: |
347 | return true; |
348 | default: |
349 | return false; |
350 | } |
351 | } |
352 | |
353 | bool XQueryTokenizer::isNCNameBody(const QChar ch) |
354 | { |
355 | switch(ch.unicode()) |
356 | { |
357 | case '.': |
358 | case '_': |
359 | case '-': |
360 | return true; |
361 | } |
362 | |
363 | switch(ch.category()) |
364 | { |
365 | case QChar::Letter_Lowercase: |
366 | case QChar::Letter_Uppercase: |
367 | case QChar::Letter_Other: |
368 | case QChar::Letter_Titlecase: |
369 | case QChar::Number_Letter: |
370 | case QChar::Mark_SpacingCombining: |
371 | case QChar::Mark_Enclosing: |
372 | case QChar::Mark_NonSpacing: |
373 | case QChar::Letter_Modifier: |
374 | case QChar::Number_DecimalDigit: |
375 | return true; |
376 | default: |
377 | return false; |
378 | } |
379 | } |
380 | |
381 | bool XQueryTokenizer::isPhraseKeyword(const TokenType code) |
382 | { |
383 | switch(code) |
384 | { |
385 | /* Fallthrough all these. */ |
386 | case T_CASTABLE: |
387 | case T_CAST: |
388 | case T_COPY_NAMESPACES: |
389 | case T_DECLARE: |
390 | case T_EMPTY: |
391 | case T_MODULE: |
392 | case T_IMPORT: |
393 | case T_INSTANCE: |
394 | case T_ORDER: |
395 | case T_ORDERING: |
396 | case T_XQUERY: |
397 | case T_STABLE: |
398 | case T_TREAT: |
399 | return true; |
400 | default: |
401 | return false; |
402 | } |
403 | } |
404 | |
405 | bool XQueryTokenizer::isOperatorKeyword(const TokenType code) |
406 | { |
407 | switch(code) |
408 | { |
409 | /* Fallthrough all these. */ |
410 | case T_AS: |
411 | case T_ASCENDING: |
412 | case T_AT: |
413 | case T_CASE: |
414 | case T_CAST: |
415 | case T_CASTABLE: |
416 | case T_EQ: |
417 | case T_EXTERNAL: |
418 | case T_GE: |
419 | case T_G_EQ: |
420 | case T_G_GT: |
421 | case T_G_LT: |
422 | case T_G_NE: |
423 | case T_GT: |
424 | case T_IN: |
425 | case T_INHERIT: |
426 | case T_INSTANCE: |
427 | case T_IS: |
428 | case T_ITEM: |
429 | case T_LE: |
430 | case T_LT: |
431 | case T_NE: |
432 | case T_NO_INHERIT: |
433 | case T_NO_PRESERVE: |
434 | case T_OF: |
435 | case T_PRESERVE: |
436 | case T_RETURN: |
437 | case T_STABLE: |
438 | case T_TO: |
439 | case T_TREAT: |
440 | return true; |
441 | default: |
442 | return false; |
443 | }; |
444 | } |
445 | |
446 | bool XQueryTokenizer::isTypeToken(const TokenType t) |
447 | { |
448 | switch(t) |
449 | { |
450 | /* Fallthrough all these. */ |
451 | case T_ATTRIBUTE: |
452 | case T_COMMENT: |
453 | case T_DOCUMENT: |
454 | case T_DOCUMENT_NODE: |
455 | case T_ELEMENT: |
456 | case T_ITEM: |
457 | case T_NODE: |
458 | case T_PROCESSING_INSTRUCTION: |
459 | case T_SCHEMA_ATTRIBUTE: |
460 | case T_SCHEMA_ELEMENT: |
461 | case T_TEXT: |
462 | return true; |
463 | default: |
464 | return false; |
465 | } |
466 | } |
467 | |
468 | Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName() |
469 | { |
470 | const int start = m_pos; |
471 | |
472 | const Token t1 = tokenizeNCName(); |
473 | if (t1.hasError()) |
474 | return t1; |
475 | |
476 | if (peekCurrent() != ':' || peekAhead() == '=') |
477 | return t1; |
478 | |
479 | ++m_pos; |
480 | |
481 | const Token t2 = tokenizeNCName(); |
482 | if (t2.hasError()) |
483 | return t2; |
484 | else |
485 | return Token(T_QNAME, m_data.mid(position: start, n: m_pos - start)); |
486 | } |
487 | |
488 | Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral() |
489 | { |
490 | setState(Operator); |
491 | const int startPos = m_pos; |
492 | bool hasDot = false; |
493 | bool isXPath20 = false; |
494 | |
495 | for(; m_pos < m_length; ++m_pos) |
496 | { |
497 | QChar ch(current()); |
498 | |
499 | char cell = ch.cell(); |
500 | |
501 | if (cell == 'e' || cell == 'E') |
502 | { |
503 | isXPath20 = true; |
504 | ++m_pos; |
505 | ch = current(); |
506 | |
507 | if (ch.row() != 0) |
508 | break; |
509 | |
510 | cell = ch.cell(); |
511 | |
512 | if (cell == '+' || cell == '-') |
513 | continue; |
514 | } |
515 | |
516 | if (isNCNameStart(ch)) |
517 | return error(); |
518 | |
519 | if (cell < '0' || cell > '9') |
520 | { |
521 | if (cell == '.' && !hasDot) |
522 | hasDot = true; |
523 | else |
524 | break; |
525 | } |
526 | } |
527 | |
528 | return Token(isXPath20 ? T_XPATH2_NUMBER : T_NUMBER, m_data.mid(position: startPos, n: m_pos - startPos)); |
529 | } |
530 | |
531 | QString XQueryTokenizer::tokenizeCharacterReference() |
532 | { |
533 | Q_ASSERT(peekCurrent() == '&'); |
534 | |
535 | const int theEnd = m_data.indexOf(c: QLatin1Char(';'), from: m_pos + 1); |
536 | |
537 | if (theEnd == -1) /* No ';' found, a syntax error. i18n. */ |
538 | return QString(); |
539 | |
540 | QString content(m_data.mid(position: m_pos + 1, n: (theEnd - m_pos) - 1)); |
541 | m_pos = theEnd; |
542 | |
543 | const QChar charRef(charForReference(reference: content)); |
544 | |
545 | if (!charRef.isNull()) |
546 | return charRef; |
547 | else if (content.startsWith(c: QLatin1Char('#'))) |
548 | { |
549 | int base; |
550 | |
551 | /* It is only '#' or '#x'. */ |
552 | if (content.length() < 2) |
553 | return QString(); |
554 | |
555 | /* We got a hex number if it starts with 'x', otherwise it's a decimal. */ |
556 | if (content.at(i: 1) == QLatin1Char('x')) |
557 | { |
558 | base = 16; |
559 | content = content.mid(position: 2); /* Remove "#x". */ |
560 | } |
561 | else |
562 | { |
563 | base = 10; |
564 | content = content.mid(position: 1); /* Remove "#". */ |
565 | } |
566 | |
567 | bool conversionOK = false; |
568 | const int codepoint = content.toInt(ok: &conversionOK, base); |
569 | |
570 | if (conversionOK) |
571 | { |
572 | const QChar ch(codepoint); |
573 | |
574 | if (ch.isNull()) |
575 | { |
576 | /* We likely have something which require surrogate pairs. */ |
577 | QString result; |
578 | result += QChar(QChar::highSurrogate(ucs4: codepoint)); |
579 | result += QChar(QChar::lowSurrogate(ucs4: codepoint)); |
580 | return result; |
581 | } |
582 | else |
583 | return ch; |
584 | } |
585 | else |
586 | return QString(); |
587 | } |
588 | else |
589 | return QString(); |
590 | } |
591 | |
592 | int XQueryTokenizer::scanUntil(const char *const content) |
593 | { |
594 | const int end = m_data.indexOf(s: QString::fromLatin1(str: content), from: m_pos); |
595 | |
596 | if (end == -1) |
597 | return -1; |
598 | else |
599 | { |
600 | const int len = end - m_pos; |
601 | m_pos += len; |
602 | return len; |
603 | } |
604 | } |
605 | |
606 | QChar XQueryTokenizer::charForReference(const QString &reference) |
607 | { |
608 | if (m_charRefs.isEmpty()) |
609 | { |
610 | /* Initialize. */ |
611 | m_charRefs.reserve(asize: 5); |
612 | m_charRefs.insert(akey: QLatin1String("lt" ), avalue: QLatin1Char('<')); |
613 | m_charRefs.insert(akey: QLatin1String("gt" ), avalue: QLatin1Char('>')); |
614 | m_charRefs.insert(akey: QLatin1String("amp" ), avalue: QLatin1Char('&')); |
615 | m_charRefs.insert(akey: QLatin1String("quot" ), avalue: QLatin1Char('"')); |
616 | m_charRefs.insert(akey: QLatin1String("apos" ), avalue: QLatin1Char('\'')); |
617 | } |
618 | |
619 | return m_charRefs.value(akey: reference); |
620 | } |
621 | |
622 | Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral() |
623 | { |
624 | const QChar delimiter(current()); |
625 | /* We cannot unfortunately just scan and then do mid(), |
626 | * since we can encounter character references. */ |
627 | QString result; |
628 | |
629 | /* This is more likely than QString's default allocation. */ |
630 | result.reserve(asize: 8); |
631 | |
632 | CharacterSkips skipEOLNormalization; |
633 | |
634 | /* Advance over the initial quote character. */ |
635 | ++m_pos; |
636 | |
637 | for(; m_pos < m_length; ++m_pos) |
638 | { |
639 | const QChar c(current()); |
640 | |
641 | if (c == QLatin1Char('&')) |
642 | { |
643 | const QString charRef(tokenizeCharacterReference()); |
644 | |
645 | if (charRef.isNull()) |
646 | return error(); |
647 | else |
648 | { |
649 | skipEOLNormalization.insert(value: result.count()); |
650 | result.append(s: charRef); |
651 | } |
652 | |
653 | } |
654 | else if (c == delimiter) |
655 | { |
656 | /* Maybe the escaping mechanism is used. For instance, "s""s" |
657 | * has the value `s"s'. */ |
658 | ++m_pos; |
659 | |
660 | if (current() == delimiter) /* Double quote. */ |
661 | result += delimiter; |
662 | else |
663 | return Token(T_STRING_LITERAL, normalizeEOL(input: result, characterSkips: skipEOLNormalization)); |
664 | } |
665 | else |
666 | result += c; |
667 | } |
668 | |
669 | return error(); |
670 | } |
671 | |
672 | Tokenizer::Token XQueryTokenizer::tokenizeNCName() |
673 | { |
674 | const int startPos = m_pos; |
675 | |
676 | if (m_pos < m_length && isNCNameStart(ch: current())) |
677 | { |
678 | ++m_pos; |
679 | |
680 | for(; m_pos < m_length; ++m_pos) |
681 | { |
682 | if (!isNCNameBody(ch: current())) |
683 | break; |
684 | } |
685 | |
686 | return Token(T_NCNAME, m_data.mid(position: startPos, n: m_pos - startPos)); |
687 | } |
688 | else |
689 | return error(); |
690 | } |
691 | |
692 | bool XQueryTokenizer::aheadEquals(const char *const chs, |
693 | const int len, |
694 | const int offset) const |
695 | { |
696 | Q_ASSERT(len > 0); |
697 | Q_ASSERT(qstrlen(chs) == uint(len)); |
698 | |
699 | if (m_pos + len >= m_length) |
700 | return false; |
701 | |
702 | for(int i = offset; i < (len + offset); ++i) |
703 | { |
704 | if (m_data.at(i: m_pos + i).toLatin1() != chs[i - offset]) |
705 | return false; |
706 | } |
707 | |
708 | return true; |
709 | } |
710 | |
711 | const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword) |
712 | { |
713 | return TokenLookup::value(str: keyword.toLatin1().constData(), len: keyword.length()); |
714 | } |
715 | |
716 | XQueryTokenizer::State XQueryTokenizer::state() const |
717 | { |
718 | return m_state; |
719 | } |
720 | |
721 | void XQueryTokenizer::setState(const State s) |
722 | { |
723 | m_state = s; |
724 | } |
725 | |
726 | void XQueryTokenizer::pushState(const State s) |
727 | { |
728 | m_stateStack.push(t: s); |
729 | } |
730 | |
731 | void XQueryTokenizer::pushState() |
732 | { |
733 | m_stateStack.push(t: m_state); |
734 | } |
735 | |
736 | void XQueryTokenizer::popState() |
737 | { |
738 | /* QStack::pop() asserts if it's empty, so we need to check |
739 | * it, since we might receive unbalanced curlies. */ |
740 | if (!m_stateStack.isEmpty()) |
741 | m_state = m_stateStack.pop(); |
742 | } |
743 | |
744 | Tokenizer::Token XQueryTokenizer::nextToken() |
745 | { |
746 | switch(state()) |
747 | { |
748 | /* We want to skip or do special whitespace handling for these |
749 | * states. So fallthrough all of the following. */ |
750 | case AposAttributeContent: |
751 | case Axis: |
752 | case ElementContent: |
753 | case EndTag: |
754 | case Pragma: |
755 | case PragmaContent: |
756 | case ProcessingInstructionName: |
757 | case QuotAttributeContent: |
758 | case StartTag: |
759 | case XMLComment: |
760 | break; |
761 | default: |
762 | handleWhitespace(); |
763 | } |
764 | |
765 | switch(state()) |
766 | { |
767 | case XMLSpaceDecl: |
768 | case NamespaceKeyword: |
769 | { |
770 | switch(peekCurrent()) |
771 | { |
772 | case ',': |
773 | return tokenAndAdvance(code: T_COMMA); |
774 | case '"': |
775 | case '\'': |
776 | { |
777 | setState(NamespaceDecl); |
778 | return tokenizeStringLiteral(); |
779 | } |
780 | } |
781 | |
782 | const Token id(tokenizeNCName()); |
783 | |
784 | if (id.type != T_NCNAME) |
785 | return id; |
786 | |
787 | const TokenMap *const keyword = lookupKeyword(keyword: id.value); |
788 | if (keyword) |
789 | { |
790 | switch(keyword->token) |
791 | { |
792 | case T_INHERIT: |
793 | case T_NO_INHERIT: |
794 | { |
795 | setState(Default); |
796 | break; |
797 | } |
798 | case T_NAMESPACE: |
799 | { |
800 | setState(NamespaceDecl); |
801 | break; |
802 | } |
803 | case T_ORDERED: |
804 | case T_UNORDERED: |
805 | case T_STRIP: |
806 | { |
807 | setState(Default); |
808 | break; |
809 | } |
810 | case T_PRESERVE: |
811 | { |
812 | if (state() != NamespaceKeyword) |
813 | setState(Default); |
814 | break; |
815 | } |
816 | default: |
817 | break; |
818 | } |
819 | |
820 | return Token(keyword->token); |
821 | } |
822 | else |
823 | return id; |
824 | } |
825 | case NamespaceDecl: |
826 | { |
827 | switch(peekCurrent()) |
828 | { |
829 | case '=': |
830 | return tokenAndAdvance(code: T_G_EQ); |
831 | case ';': |
832 | return tokenAndChangeState(code: T_SEMI_COLON, s: Default); |
833 | case '\'': |
834 | case '\"': |
835 | return tokenizeStringLiteral(); |
836 | } |
837 | |
838 | const Token nc(tokenizeNCName()); |
839 | |
840 | handleWhitespace(); |
841 | |
842 | const char pc = peekCurrent(); |
843 | const TokenMap* const t = lookupKeyword(keyword: nc.value); |
844 | |
845 | if (pc == '\'' || (pc == '"' && t)) |
846 | return tokenAndChangeState(code: t->token, s: Default, advance: 0); |
847 | else |
848 | return nc; |
849 | } |
850 | case Axis: |
851 | { |
852 | if (peekCurrent() == ':') |
853 | { |
854 | Q_ASSERT(peekAhead() == ':'); |
855 | m_pos += 2; |
856 | setState(AfterAxisSeparator); |
857 | return Token(T_COLONCOLON); |
858 | } |
859 | Q_FALLTHROUGH(); |
860 | } |
861 | case AfterAxisSeparator: |
862 | case Default: |
863 | /* State Operator and state Default have a lot of tokens in common except |
864 | * for minor differences. So we treat them the same way, and sprinkles logic |
865 | * here and there to handle the small differences. */ |
866 | Q_FALLTHROUGH(); |
867 | case Operator: |
868 | { |
869 | switch(peekCurrent()) |
870 | { |
871 | case '=': |
872 | return tokenAndChangeState(code: T_G_EQ, s: Default); |
873 | case '-': |
874 | return tokenAndChangeState(code: T_MINUS, s: Default); |
875 | case '+': |
876 | return tokenAndChangeState(code: T_PLUS, s: Default); |
877 | case '[': |
878 | return tokenAndChangeState(code: T_LBRACKET, s: Default); |
879 | case ']': |
880 | return tokenAndChangeState(code: T_RBRACKET, s: Operator); |
881 | case ',': |
882 | return tokenAndChangeState(code: T_COMMA, s: Default); |
883 | case ';': |
884 | return tokenAndChangeState(code: T_SEMI_COLON, s: Default); |
885 | case '$': |
886 | return tokenAndChangeState(code: T_DOLLAR, s: VarName); |
887 | case '|': |
888 | return tokenAndChangeState(code: T_BAR, s: Default); |
889 | case '?': |
890 | return tokenAndChangeState(code: T_QUESTION, s: Operator); |
891 | case ')': |
892 | return tokenAndChangeState(code: T_RPAREN, s: Operator); |
893 | case '@': |
894 | return tokenAndChangeState(code: T_AT_SIGN, s: Default); |
895 | /* Fallthrough all these. */ |
896 | case '1': |
897 | case '2': |
898 | case '3': |
899 | case '4': |
900 | case '5': |
901 | case '6': |
902 | case '7': |
903 | case '8': |
904 | case '9': |
905 | case '0': |
906 | return tokenizeNumberLiteral(); |
907 | case '.': |
908 | { |
909 | const char next = peekAhead(); |
910 | if (next == '.') |
911 | return tokenAndChangeState(code: T_DOTDOT, s: Operator, advance: 2); |
912 | /* .5 is allowed, as short form for 0.5: |
913 | * <tt>[142] DecimalLiteral ::= ("." Digits) | (Digits "." [0-9]*)</tt> |
914 | */ |
915 | else if (isDigit(ch: next)) |
916 | return tokenizeNumberLiteral(); |
917 | else |
918 | return tokenAndChangeState(code: T_DOT, s: Operator); |
919 | } |
920 | case '\'': |
921 | case '"': |
922 | { |
923 | setState(Operator); |
924 | return tokenizeStringLiteral(); |
925 | |
926 | } |
927 | case '(': |
928 | { |
929 | if (peekAhead() == '#') |
930 | return tokenAndChangeState(code: T_PRAGMA_START, s: Pragma, advance: 2); |
931 | else |
932 | return tokenAndChangeState(code: T_LPAREN, s: Default); |
933 | } |
934 | case '*': |
935 | { |
936 | if (peekAhead() == ':') |
937 | { |
938 | m_pos += 2; /* Consume *:. */ |
939 | const Token nc = tokenizeNCName(); |
940 | |
941 | if (nc.hasError()) |
942 | return error(); |
943 | else |
944 | return tokenAndChangeState(code: T_ANY_PREFIX, value: nc.value, s: Operator); |
945 | } |
946 | else |
947 | return tokenAndChangeState(code: T_STAR, s: state() == Default ? Operator : Default); |
948 | } |
949 | case ':': |
950 | { |
951 | switch(peekAhead()) |
952 | { |
953 | case '=': |
954 | return tokenAndChangeState(code: T_ASSIGN, s: Default, advance: 2); |
955 | case ':': |
956 | return tokenAndChangeState(code: T_COLONCOLON, s: Default, advance: 2); |
957 | default: |
958 | return error(); |
959 | } |
960 | } |
961 | case '!': |
962 | { |
963 | if (peekAhead() == '=') |
964 | return tokenAndChangeState(code: T_G_NE, s: Default, advance: 2); |
965 | else |
966 | return error(); |
967 | } |
968 | case '<': |
969 | { |
970 | switch(peekAhead()) |
971 | { |
972 | case '=': |
973 | return tokenAndChangeState(code: T_G_LE, s: Default, advance: 2); |
974 | case '<': |
975 | return tokenAndChangeState(code: T_PRECEDES, s: Default, advance: 2); |
976 | case '?': |
977 | { |
978 | pushState(s: Operator); |
979 | return tokenAndChangeState(code: T_PI_START, s: ProcessingInstructionName, advance: 2); |
980 | } |
981 | case '!': |
982 | { |
983 | if (aheadEquals(chs: "!--" , len: 3)) |
984 | { |
985 | m_pos += 3; /* Consume "!--". */ |
986 | pushState(s: Operator); |
987 | return tokenAndChangeState(code: T_COMMENT_START, s: XMLComment); |
988 | } |
989 | /* Fallthrough. It's a syntax error, and this is a good way to report it. */ |
990 | Q_FALLTHROUGH(); |
991 | } |
992 | default: |
993 | { |
994 | if ((m_pos + 1) < m_length && isNCNameStart(ch: m_data.at(i: m_pos + 1))) |
995 | { |
996 | /* We assume it's an element constructor. */ |
997 | pushState(s: Operator); |
998 | } |
999 | |
1000 | return tokenAndChangeState(code: T_G_LT, s: state() == Operator ? Default : StartTag); |
1001 | } |
1002 | } |
1003 | } |
1004 | case '>': |
1005 | { |
1006 | switch(peekAhead()) |
1007 | { |
1008 | case '=': |
1009 | return tokenAndChangeState(code: T_G_GE, s: Default, advance: 2); |
1010 | case '>': |
1011 | return tokenAndChangeState(code: T_FOLLOWS, s: Default, advance: 2); |
1012 | default: |
1013 | return tokenAndChangeState(code: T_G_GT, s: Default); |
1014 | } |
1015 | } |
1016 | case '/': |
1017 | { |
1018 | if (peekAhead() == '/') |
1019 | return tokenAndChangeState(code: T_SLASHSLASH, s: Default, advance: 2); |
1020 | else |
1021 | return tokenAndChangeState(code: T_SLASH, s: Default); |
1022 | } |
1023 | case '{': |
1024 | { |
1025 | pushState(s: Operator); |
1026 | return tokenAndChangeState(code: T_CURLY_LBRACE, s: Default); |
1027 | } |
1028 | case '}': |
1029 | { |
1030 | popState(); |
1031 | |
1032 | return tokenAndAdvance(code: T_CURLY_RBRACE); |
1033 | } |
1034 | } |
1035 | |
1036 | /* Ok. We're in state Default or Operator, and it wasn't a simple |
1037 | * character. */ |
1038 | |
1039 | const Token id(tokenizeNCName()); |
1040 | |
1041 | if (id.type != T_NCNAME) |
1042 | return id; |
1043 | |
1044 | const TokenMap *const keyword = lookupKeyword(keyword: id.value); |
1045 | |
1046 | if (state() == Operator) |
1047 | { |
1048 | if (keyword) |
1049 | { |
1050 | if (keyword->token == T_DEFAULT || keyword->token == T_ASCENDING || keyword->token == T_DESCENDING) |
1051 | setState(Operator); |
1052 | else if (keyword->token == T_RETURN) |
1053 | setState(Default); |
1054 | else if (isPhraseKeyword(code: keyword->token)) |
1055 | { |
1056 | const TokenType ws = consumeWhitespace(); |
1057 | if (ws == T_ERROR) |
1058 | return error(); |
1059 | |
1060 | const Token id2(tokenizeNCName()); |
1061 | const TokenMap *const keyword2 = lookupKeyword(keyword: id2.value); |
1062 | |
1063 | if (keyword2) |
1064 | { |
1065 | if (keyword->token == T_TREAT && keyword2->token == T_AS) |
1066 | setState(ItemType); |
1067 | else if (keyword->token == T_CAST || (keyword->token == T_CASTABLE && keyword2->token == T_AS) || keyword2->token == T_BY) |
1068 | setState(Default); |
1069 | |
1070 | m_tokenStack.push(t: Token(keyword2->token)); |
1071 | } |
1072 | else |
1073 | m_tokenStack.push(t: id2); |
1074 | |
1075 | return Token(keyword->token); |
1076 | } |
1077 | else |
1078 | { |
1079 | /* Such that we tokenize the second token in "empty greatest". */ |
1080 | if (keyword->token != T_EMPTY) |
1081 | setState(Default); |
1082 | } |
1083 | |
1084 | if (keyword->token == T_AS || keyword->token == T_CASE) |
1085 | setState(ItemType); |
1086 | |
1087 | return Token(keyword->token); |
1088 | } |
1089 | else |
1090 | return id; |
1091 | } |
1092 | |
1093 | Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator); |
1094 | |
1095 | /* |
1096 | * This is hard. Consider this: |
1097 | * |
1098 | * Valid: child ::nameTest |
1099 | * Valid: child:: nameTest |
1100 | * Syntax Error: child :localName |
1101 | * Syntax Error: child: localName |
1102 | * |
1103 | * Consider "child ::name". Right now, we're here: |
1104 | * ^ |
1105 | * We don't know whether "child" is a prefix and hence the whitespace is invalid, |
1106 | * or whether it's an axis and hence skippable. */ |
1107 | { |
1108 | const int wsLength = peekForColonColon(); |
1109 | /* We cannot call handleWhitespace() because it returns on |
1110 | * END_OF_FILE, and we have parsed up keyword, and we need to |
1111 | * deal with that. |
1112 | * |
1113 | * If we have a colon colon, which means the whitespace is |
1114 | * allowed, we skip it. */ |
1115 | if (wsLength != -1) |
1116 | m_pos += wsLength; |
1117 | } |
1118 | |
1119 | /* Handle name tests. */ |
1120 | if (peekCurrent() == ':') |
1121 | { |
1122 | switch(peekAhead()) |
1123 | { |
1124 | case '=': |
1125 | return id; |
1126 | case '*': |
1127 | { |
1128 | m_pos += 2; |
1129 | return tokenAndChangeState(code: T_ANY_LOCAL_NAME, value: id.value, s: Operator); |
1130 | } |
1131 | case ':': |
1132 | { |
1133 | /* We have an axis. */ |
1134 | setState(Axis); |
1135 | return keyword ? Token(keyword->token) : id; |
1136 | } |
1137 | default: |
1138 | { |
1139 | /* It's a QName. */ |
1140 | ++m_pos; /* Consume the colon. */ |
1141 | |
1142 | const Token id2(tokenizeNCName()); |
1143 | |
1144 | if (id2.type != T_NCNAME) |
1145 | { |
1146 | --m_pos; |
1147 | return id; |
1148 | } |
1149 | |
1150 | setState(Operator); |
1151 | const int qNameLen = id.value.length() + id2.value.length() + 1; |
1152 | return Token(T_QNAME, m_data.mid(position: m_pos - qNameLen, n: qNameLen)); |
1153 | } |
1154 | } |
1155 | } |
1156 | |
1157 | if (!keyword || isOperatorKeyword(code: keyword->token)) |
1158 | { |
1159 | setState(Operator); |
1160 | return id; |
1161 | } |
1162 | |
1163 | const TokenType ws = consumeWhitespace(); |
1164 | if (ws == T_ERROR) // TODO this should test for success. Write test. |
1165 | return Token(T_ERROR); |
1166 | |
1167 | if (atEnd()) |
1168 | { |
1169 | setState(Operator); |
1170 | return id; |
1171 | } |
1172 | |
1173 | /* Let the if-body apply for constructors, and node type tests. */ |
1174 | if (isTypeToken(t: keyword->token) || |
1175 | keyword->token == T_TYPESWITCH || |
1176 | keyword->token == T_ORDERED || |
1177 | keyword->token == T_UNORDERED || |
1178 | keyword->token == T_IF) |
1179 | { |
1180 | switch(peekCurrent()) |
1181 | { |
1182 | case '(': |
1183 | { |
1184 | // TODO See if we can remove DOCUMENT from isTypeToken. |
1185 | if (isTypeToken(t: keyword->token) && keyword->token != T_DOCUMENT) |
1186 | { |
1187 | m_tokenStack.push(t: Token(T_LPAREN)); |
1188 | ++m_pos; /* Consume '('. */ |
1189 | pushState(s: Operator); |
1190 | |
1191 | if (keyword->token == T_PROCESSING_INSTRUCTION) |
1192 | setState(KindTestForPI); |
1193 | else |
1194 | setState(KindTest); |
1195 | |
1196 | return Token(keyword->token); |
1197 | } |
1198 | else if (keyword->token == T_TYPESWITCH || keyword->token == T_IF) |
1199 | return Token(keyword->token); |
1200 | else /* It's a function call. */ |
1201 | return id; |
1202 | } |
1203 | case '{': |
1204 | { |
1205 | m_tokenStack.push(t: Token(T_CURLY_LBRACE)); |
1206 | ++m_pos; /* Consume '{'. */ |
1207 | pushState(s: Operator); |
1208 | /* Stay in state Default. */ |
1209 | return Token(keyword->token); |
1210 | } |
1211 | default: |
1212 | { |
1213 | /* We have read in a token which is for instance |
1214 | * "return", and now it can be an element |
1215 | * test("element") a node kind test("element()"), or a |
1216 | * computed element constructor("element name {..."). |
1217 | * We need to do a two-token lookahead here, because |
1218 | * "element return" can be an element test followed by |
1219 | * the return keyword, but it can also be an element |
1220 | * constructor("element return {"). */ |
1221 | if (isNCNameStart(ch: current())) |
1222 | { |
1223 | const int currentPos = m_pos; |
1224 | const Token token2 = tokenizeNCNameOrQName(); |
1225 | |
1226 | if (token2.hasError()) |
1227 | return token2; |
1228 | |
1229 | handleWhitespace(); |
1230 | |
1231 | if (peekCurrent() == '{') |
1232 | { |
1233 | /* An element constructor. */ |
1234 | m_tokenStack.push(t: token2); |
1235 | return Token(keyword->token); |
1236 | } |
1237 | |
1238 | /* We jump back in the stream, we need to tokenize token2 according |
1239 | * to the state. */ |
1240 | m_pos = currentPos; |
1241 | setState(Operator); |
1242 | return Token(T_NCNAME, QLatin1String(keyword->name)); |
1243 | } |
1244 | } |
1245 | } |
1246 | } |
1247 | |
1248 | if (peekCurrent() == '$') |
1249 | { |
1250 | setState(VarName); |
1251 | return Token(keyword->token); |
1252 | } |
1253 | |
1254 | /* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */ |
1255 | if (peekCurrent() == '(') |
1256 | return id; |
1257 | else if (peekCurrent() == '{' && keyword->token == T_VALIDATE) |
1258 | return Token(keyword->token); |
1259 | |
1260 | if (!isNCNameStart(ch: current())) |
1261 | { |
1262 | setState(Operator); |
1263 | return id; |
1264 | } |
1265 | |
1266 | const Token id2(tokenizeNCName()); |
1267 | const TokenMap *const keyword2 = lookupKeyword(keyword: id2.value); |
1268 | |
1269 | if (!keyword2) |
1270 | { |
1271 | /* It's a syntax error. All cases of two subsequent ncnames are keywords(e.g, declarations). */ |
1272 | setState(Operator); |
1273 | return id; |
1274 | } |
1275 | |
1276 | switch(keyword->token) |
1277 | { |
1278 | case T_DECLARE: |
1279 | { |
1280 | switch(keyword2->token) |
1281 | { |
1282 | case T_VARIABLE: |
1283 | case T_FUNCTION: |
1284 | { |
1285 | m_tokenStack.push(t: Token(keyword2->token)); |
1286 | setState(Default); |
1287 | return Token(keyword->token); |
1288 | } |
1289 | case T_OPTION: |
1290 | { |
1291 | m_tokenStack.push(t: Token(keyword2->token)); |
1292 | setState(Default); |
1293 | return Token(keyword->token); |
1294 | } |
1295 | case T_COPY_NAMESPACES: |
1296 | case T_ORDERING: |
1297 | { |
1298 | m_tokenStack.push(t: Token(keyword2->token)); |
1299 | setState(NamespaceKeyword); |
1300 | return Token(keyword->token); |
1301 | } |
1302 | case T_CONSTRUCTION: |
1303 | { |
1304 | // TODO identical to CONSTRUCTION? |
1305 | m_tokenStack.push(t: Token(keyword2->token)); |
1306 | setState(Operator); |
1307 | return Token(keyword->token); |
1308 | } |
1309 | case T_NAMESPACE: |
1310 | case T_BASEURI: |
1311 | { |
1312 | m_tokenStack.push(t: Token(keyword2->token)); |
1313 | setState(NamespaceDecl); |
1314 | return Token(keyword->token); |
1315 | } |
1316 | case T_BOUNDARY_SPACE: |
1317 | { |
1318 | m_tokenStack.push(t: Token(keyword2->token)); |
1319 | setState(XMLSpaceDecl); |
1320 | return Token(keyword->token); |
1321 | } |
1322 | case T_DEFAULT: |
1323 | { |
1324 | m_tokenStack.push(t: Token(keyword2->token)); |
1325 | |
1326 | const TokenType ws2 = consumeWhitespace(); |
1327 | if (ws2 != T_SUCCESS) |
1328 | { |
1329 | m_tokenStack.prepend(t: Token(ws2)); |
1330 | return Token(keyword->token); |
1331 | } |
1332 | |
1333 | const Token id3(tokenizeNCName()); |
1334 | |
1335 | if (id3.type != T_NCNAME) |
1336 | { |
1337 | m_tokenStack.prepend(t: id3); |
1338 | return Token(keyword->token); |
1339 | } |
1340 | |
1341 | const TokenMap *const keyword3 = lookupKeyword(keyword: id3.value); |
1342 | if (!keyword3) |
1343 | { |
1344 | m_tokenStack.prepend(t: id3); |
1345 | return Token(keyword->token); |
1346 | } |
1347 | else |
1348 | { |
1349 | m_tokenStack.prepend(t: Token(keyword3->token)); |
1350 | |
1351 | if (keyword3->token == T_ORDER) |
1352 | setState(Operator); |
1353 | else |
1354 | setState(NamespaceDecl); |
1355 | } |
1356 | |
1357 | return Token(keyword->token); |
1358 | } |
1359 | default: |
1360 | { |
1361 | m_tokenStack.push(t: Token(keyword2->token)); |
1362 | setState(Default); |
1363 | return id; |
1364 | } |
1365 | } |
1366 | } |
1367 | case T_XQUERY: |
1368 | { |
1369 | m_tokenStack.push(t: Token(keyword2->token)); |
1370 | |
1371 | if (keyword2->token == T_VERSION) |
1372 | { |
1373 | setState(NamespaceDecl); |
1374 | return Token(keyword->token); |
1375 | } |
1376 | else |
1377 | { |
1378 | setState(Operator); |
1379 | return id; |
1380 | } |
1381 | } |
1382 | case T_IMPORT: |
1383 | { |
1384 | m_tokenStack.push(t: Token(keyword2->token)); |
1385 | |
1386 | switch(keyword2->token) |
1387 | { |
1388 | case T_SCHEMA: |
1389 | case T_MODULE: |
1390 | { |
1391 | setState(NamespaceKeyword); |
1392 | return Token(keyword->token); |
1393 | } |
1394 | default: |
1395 | { |
1396 | setState(Operator); |
1397 | return id; |
1398 | } |
1399 | } |
1400 | } |
1401 | case T_VALIDATE: |
1402 | { |
1403 | m_tokenStack.push(t: Token(keyword2->token)); |
1404 | |
1405 | switch(keyword2->token) |
1406 | { |
1407 | case T_LAX: |
1408 | case T_STRICT: |
1409 | { |
1410 | pushState(s: Operator); |
1411 | return Token(keyword->token); |
1412 | } |
1413 | default: |
1414 | { |
1415 | setState(Operator); |
1416 | return id; |
1417 | } |
1418 | } |
1419 | } |
1420 | default: |
1421 | { |
1422 | m_tokenStack.push(t: Token(keyword2->token)); |
1423 | setState(Operator); |
1424 | return id; |
1425 | } |
1426 | } |
1427 | } |
1428 | case VarName: |
1429 | { |
1430 | if (peekCurrent() == '$') |
1431 | return tokenAndAdvance(code: T_DOLLAR); |
1432 | |
1433 | setState(Operator); |
1434 | return tokenizeNCNameOrQName(); |
1435 | } |
1436 | case ItemType: |
1437 | { |
1438 | switch(peekCurrent()) |
1439 | { |
1440 | case '(': |
1441 | return tokenAndChangeState(code: T_LPAREN, s: KindTest); |
1442 | case '$': |
1443 | return tokenAndChangeState(code: T_DOLLAR, s: VarName); |
1444 | } |
1445 | |
1446 | const Token name(tokenizeNCNameOrQName()); |
1447 | |
1448 | if (name.hasError()) |
1449 | return error(); |
1450 | |
1451 | else if (name.type == T_QNAME) |
1452 | { |
1453 | setState(OccurrenceIndicator); |
1454 | return name; |
1455 | } |
1456 | else |
1457 | { |
1458 | const TokenMap *const keyword = lookupKeyword(keyword: name.value); |
1459 | |
1460 | if (keyword) |
1461 | { |
1462 | pushState(s: OccurrenceIndicator); |
1463 | return Token(keyword->token); |
1464 | } |
1465 | else |
1466 | { |
1467 | setState(Default); |
1468 | return name; |
1469 | } |
1470 | } |
1471 | } |
1472 | case KindTest: |
1473 | { |
1474 | switch(peekCurrent()) |
1475 | { |
1476 | case ')': |
1477 | { |
1478 | popState(); |
1479 | return tokenAndAdvance(code: T_RPAREN); |
1480 | } |
1481 | case '(': |
1482 | return tokenAndAdvance(code: T_LPAREN); |
1483 | case ',': |
1484 | return tokenAndAdvance(code: T_COMMA); |
1485 | case '*': |
1486 | return tokenAndAdvance(code: T_STAR); |
1487 | case '?': |
1488 | return tokenAndAdvance(code: T_QUESTION); |
1489 | case '\'': |
1490 | case '"': |
1491 | return tokenizeStringLiteral(); |
1492 | } |
1493 | |
1494 | const Token nc(tokenizeNCNameOrQName()); |
1495 | if (nc.hasError()) |
1496 | return nc; |
1497 | |
1498 | const TokenType ws = consumeWhitespace(); |
1499 | if (ws == T_ERROR) |
1500 | return error(); |
1501 | |
1502 | if (peekCurrent() == '(') |
1503 | { |
1504 | const TokenMap *const keyword = lookupKeyword(keyword: nc.value); |
1505 | if (keyword) |
1506 | { |
1507 | pushState(s: KindTest); |
1508 | return Token(keyword->token); |
1509 | } |
1510 | else |
1511 | return nc; |
1512 | } |
1513 | else |
1514 | return nc; |
1515 | } |
1516 | case KindTestForPI: |
1517 | { |
1518 | switch(peekCurrent()) |
1519 | { |
1520 | case ')': |
1521 | { |
1522 | popState(); |
1523 | return tokenAndAdvance(code: T_RPAREN); |
1524 | } |
1525 | case '\'': |
1526 | case '"': |
1527 | return tokenizeStringLiteral(); |
1528 | default: |
1529 | return tokenizeNCName(); |
1530 | } |
1531 | } |
1532 | case OccurrenceIndicator: |
1533 | { |
1534 | switch(peekCurrent()) |
1535 | { |
1536 | case '?': |
1537 | return tokenAndChangeState(code: T_QUESTION, s: Operator); |
1538 | case '*': |
1539 | return tokenAndChangeState(code: T_STAR, s: Operator); |
1540 | case '+': |
1541 | return tokenAndChangeState(code: T_PLUS, s: Operator); |
1542 | default: |
1543 | { |
1544 | setState(Operator); |
1545 | return nextToken(); |
1546 | } |
1547 | } |
1548 | } |
1549 | case XQueryVersion: |
1550 | { |
1551 | switch(peekCurrent()) |
1552 | { |
1553 | case '\'': |
1554 | case '"': |
1555 | return tokenizeStringLiteral(); |
1556 | case ';': |
1557 | return tokenAndChangeState(code: T_SEMI_COLON, s: Default); |
1558 | } |
1559 | |
1560 | const Token id(tokenizeNCName()); |
1561 | |
1562 | if (id.type != T_NCNAME) |
1563 | return id; |
1564 | |
1565 | const TokenMap *const keyword = lookupKeyword(keyword: id.value); |
1566 | if (keyword) |
1567 | return tokenAndChangeState(code: keyword->token, s: Default); |
1568 | else |
1569 | return id; |
1570 | } |
1571 | case StartTag: |
1572 | { |
1573 | if (peekAhead(length: -1) == '<') |
1574 | { |
1575 | if (current().isSpace()) |
1576 | return Token(T_ERROR); |
1577 | } |
1578 | else |
1579 | { |
1580 | if (consumeRawWhitespace()) |
1581 | return Token(T_END_OF_FILE); |
1582 | } |
1583 | |
1584 | switch(peekCurrent()) |
1585 | { |
1586 | case '/': |
1587 | { |
1588 | if (peekAhead() == '>') |
1589 | { |
1590 | m_pos += 2; |
1591 | |
1592 | if (m_scanOnly) |
1593 | return Token(T_POSITION_SET); |
1594 | else |
1595 | { |
1596 | popState(); |
1597 | return Token(T_QUICK_TAG_END); |
1598 | } |
1599 | } |
1600 | else |
1601 | return error(); |
1602 | } |
1603 | case '>': |
1604 | { |
1605 | if (m_scanOnly) |
1606 | return tokenAndChangeState(code: T_POSITION_SET, s: StartTag); |
1607 | else |
1608 | return tokenAndChangeState(code: T_G_GT, s: ElementContent); |
1609 | } |
1610 | case '=': |
1611 | return tokenAndAdvance(code: T_G_EQ); |
1612 | case '\'': |
1613 | return tokenAndChangeState(code: T_APOS, s: AposAttributeContent); |
1614 | case '"': |
1615 | return tokenAndChangeState(code: T_QUOTE, s: QuotAttributeContent); |
1616 | default: |
1617 | return tokenizeNCNameOrQName(); |
1618 | } |
1619 | } |
1620 | case AposAttributeContent: |
1621 | case QuotAttributeContent: |
1622 | { |
1623 | const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"')); |
1624 | QString result; |
1625 | result.reserve(asize: 20); |
1626 | |
1627 | if (m_scanOnly) |
1628 | { |
1629 | int stack = 0; |
1630 | return attributeAsRaw(separator: sep, stack, startPos: m_pos, inLiteral: true, result); |
1631 | } |
1632 | |
1633 | Q_ASSERT(!m_scanOnly); |
1634 | while(true) |
1635 | { |
1636 | if (atEnd()) |
1637 | { |
1638 | /* In the case that the XSL-T tokenizer invokes us with |
1639 | * default state QuotAttributeContent, we need to be able |
1640 | * to return a single string, in case that is all we have |
1641 | * accumulated. */ |
1642 | if (result.isEmpty()) |
1643 | return Token(T_END_OF_FILE); |
1644 | else |
1645 | return Token(T_STRING_LITERAL, result); |
1646 | } |
1647 | |
1648 | const QChar curr(current()); |
1649 | |
1650 | if (curr == sep) |
1651 | { |
1652 | if (m_pos + 1 == m_length) |
1653 | return Token(T_END_OF_FILE); |
1654 | |
1655 | if (m_data.at(i: m_pos + 1) == sep) |
1656 | { |
1657 | /* The quoting mechanism was used. */ |
1658 | m_pos += 2; |
1659 | result.append(c: sep); |
1660 | continue; |
1661 | } |
1662 | |
1663 | const QChar next(m_data.at(i: m_pos + 1)); |
1664 | if (!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>')) |
1665 | return Token(T_ERROR); // i18n Space must separate attributes |
1666 | |
1667 | if (result.isEmpty()) |
1668 | { |
1669 | return tokenAndChangeState(code: state() == AposAttributeContent ? T_APOS : T_QUOTE, |
1670 | s: StartTag, advance: 1); |
1671 | } |
1672 | |
1673 | /* Don't consume the sep, but leave it so we next time return a token for it. */ |
1674 | return Token(T_STRING_LITERAL, result); |
1675 | } |
1676 | else if (curr == QLatin1Char('{')) |
1677 | { |
1678 | if (m_pos + 1 == m_length) |
1679 | return Token(T_END_OF_FILE); |
1680 | else if (peekAhead() == '{') |
1681 | { |
1682 | ++m_pos; |
1683 | result.append(c: QLatin1Char('{')); |
1684 | } |
1685 | else |
1686 | { |
1687 | if (result.isEmpty()) |
1688 | { |
1689 | /* The Attribute Value Template appeared directly in the attribute. */ |
1690 | pushState(); |
1691 | return tokenAndChangeState(code: T_CURLY_LBRACE, s: Default); |
1692 | } |
1693 | else |
1694 | { |
1695 | /* We don't advance, keep '{' as next token. */ |
1696 | return Token(T_STRING_LITERAL, result); |
1697 | } |
1698 | } |
1699 | } |
1700 | else if (curr == QLatin1Char('}')) |
1701 | { |
1702 | if (m_pos + 1 == m_length) |
1703 | return Token(T_END_OF_FILE); |
1704 | else if (peekAhead() == '}') |
1705 | { |
1706 | ++m_pos; |
1707 | result.append(c: QLatin1Char('}')); |
1708 | } |
1709 | else |
1710 | return Token(T_ERROR); |
1711 | } |
1712 | else if (curr == QLatin1Char('&')) |
1713 | { |
1714 | const QString ret(tokenizeCharacterReference()); |
1715 | if (ret.isNull()) |
1716 | return Token(T_ERROR); |
1717 | else |
1718 | result.append(s: ret); |
1719 | } |
1720 | else if (curr == QLatin1Char('<')) |
1721 | return Token(T_STRING_LITERAL, result); |
1722 | else |
1723 | { |
1724 | /* See Extensible Markup Language (XML) 1.0 (Fourth Edition), |
1725 | * 3.3.3 Attribute-Value Normalization. |
1726 | * |
1727 | * However, it is complicated a bit by that AVN is defined on top of |
1728 | * EOL normalization and we do those two in one go here. */ |
1729 | switch(curr.unicode()) |
1730 | { |
1731 | case 0xD: |
1732 | { |
1733 | if (peekAhead() == '\n') |
1734 | { |
1735 | result.append(c: QLatin1Char(' ')); |
1736 | ++m_pos; |
1737 | break; |
1738 | } |
1739 | Q_FALLTHROUGH(); |
1740 | } |
1741 | case 0xA: |
1742 | case 0x9: |
1743 | { |
1744 | result.append(c: QLatin1Char(' ')); |
1745 | break; |
1746 | } |
1747 | default: |
1748 | result.append(c: curr); |
1749 | } |
1750 | } |
1751 | |
1752 | ++m_pos; |
1753 | } |
1754 | } |
1755 | case ElementContent: |
1756 | { |
1757 | QString result; |
1758 | result.reserve(asize: 20); |
1759 | |
1760 | /* Whether the text node, result, may be whitespace only. Character references |
1761 | * and CDATA sections disables that. */ |
1762 | bool mayBeWS = true; |
1763 | |
1764 | CharacterSkips skipEOLNormalization; |
1765 | |
1766 | while(true) |
1767 | { |
1768 | if (atEnd()) |
1769 | return Token(T_END_OF_FILE); |
1770 | |
1771 | switch(peekCurrent()) |
1772 | { |
1773 | case '<': |
1774 | { |
1775 | if (!result.isEmpty() && peekAhead(length: 2) != '[') |
1776 | { |
1777 | /* We encountered the end, and it was not a CDATA section. */ |
1778 | /* We don't advance. Next time we'll handle the <... stuff. */ |
1779 | return Token(mayBeWS ? T_STRING_LITERAL : T_NON_BOUNDARY_WS, normalizeEOL(input: result, characterSkips: skipEOLNormalization)); |
1780 | } |
1781 | |
1782 | ++m_pos; |
1783 | if (atEnd()) |
1784 | return Token(T_END_OF_FILE); |
1785 | |
1786 | const QChar ahead(current()); |
1787 | if (ahead.isSpace()) |
1788 | return error(); |
1789 | else if (ahead == QLatin1Char('/')) |
1790 | { |
1791 | if (m_pos + 1 == m_length) |
1792 | return Token(T_END_OF_FILE); |
1793 | else if (m_data.at(i: m_pos + 1).isSpace()) |
1794 | return error(); |
1795 | else |
1796 | return tokenAndChangeState(code: T_BEGIN_END_TAG, s: EndTag); |
1797 | } |
1798 | else if (isNCNameStart(ch: ahead)) |
1799 | { |
1800 | pushState(); |
1801 | return tokenAndChangeState(code: T_G_LT, s: StartTag, advance: 0); |
1802 | } |
1803 | else if (aheadEquals(chs: "!--" , len: 3, offset: 0)) |
1804 | { |
1805 | pushState(); |
1806 | m_pos += 3; |
1807 | return tokenAndChangeState(code: T_COMMENT_START, s: XMLComment, advance: 0); |
1808 | } |
1809 | else if (aheadEquals(chs: "![CDATA[" , len: 8, offset: 0)) |
1810 | { |
1811 | mayBeWS = false; |
1812 | m_pos += 8; |
1813 | const int start = m_pos; |
1814 | const int len = scanUntil(content: "]]>" ); |
1815 | |
1816 | if (len == -1) |
1817 | return Token(T_END_OF_FILE); |
1818 | |
1819 | m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */ |
1820 | result.append(s: m_data.mid(position: start, n: len)); |
1821 | break; |
1822 | } |
1823 | else if (ahead == QLatin1Char('?')) |
1824 | { |
1825 | pushState(); |
1826 | return tokenAndChangeState(code: T_PI_START, s: ProcessingInstructionName); |
1827 | } |
1828 | else |
1829 | return Token(T_G_LT); |
1830 | } |
1831 | case '&': |
1832 | { |
1833 | const QString ret(tokenizeCharacterReference()); |
1834 | if (ret.isNull()) |
1835 | return Token(T_ERROR); |
1836 | else |
1837 | { |
1838 | skipEOLNormalization.insert(value: result.count()); |
1839 | result.append(s: ret); |
1840 | mayBeWS = false; |
1841 | break; |
1842 | } |
1843 | } |
1844 | case '{': |
1845 | { |
1846 | // TODO remove this check, also below. |
1847 | if (m_pos + 1 == m_length) |
1848 | return Token(T_END_OF_FILE); |
1849 | else if (peekAhead() == '{') |
1850 | { |
1851 | ++m_pos; |
1852 | result.append(c: QLatin1Char('{')); |
1853 | } |
1854 | else |
1855 | { |
1856 | if (result.isEmpty()) |
1857 | { |
1858 | pushState(); |
1859 | return tokenAndChangeState(code: T_CURLY_LBRACE, s: Default); |
1860 | } |
1861 | else |
1862 | { |
1863 | /* We don't advance here. */ |
1864 | return Token(mayBeWS ? T_STRING_LITERAL : T_NON_BOUNDARY_WS, normalizeEOL(input: result, characterSkips: skipEOLNormalization)); |
1865 | } |
1866 | } |
1867 | break; |
1868 | } |
1869 | case '}': |
1870 | { |
1871 | if (m_pos + 1 == m_length) |
1872 | return Token(T_END_OF_FILE); |
1873 | else if (peekAhead() == '}') |
1874 | { |
1875 | ++m_pos; |
1876 | result.append(c: QLatin1Char('}')); |
1877 | } |
1878 | else |
1879 | { |
1880 | /* This is a parse error, and the grammar won't be able |
1881 | * to reduce this CURLY_RBRACE. */ |
1882 | return tokenAndChangeState(code: T_CURLY_RBRACE, s: Default); |
1883 | } |
1884 | break; |
1885 | } |
1886 | case '\n': |
1887 | { |
1888 | /* We want to translate \r\n into \n. */ |
1889 | if (peekAhead(length: -1) == '\r') |
1890 | break; |
1891 | Q_FALLTHROUGH(); |
1892 | } |
1893 | case '\r': |
1894 | { |
1895 | result.append(c: QLatin1Char('\n')); |
1896 | break; |
1897 | } |
1898 | default: |
1899 | { |
1900 | result.append(c: current()); |
1901 | break; |
1902 | } |
1903 | } |
1904 | ++m_pos; |
1905 | } |
1906 | } |
1907 | case ProcessingInstructionName: |
1908 | { |
1909 | const int start = m_pos; |
1910 | |
1911 | while(true) |
1912 | { |
1913 | ++m_pos; |
1914 | if (m_pos >= m_length) |
1915 | return Token(T_END_OF_FILE); |
1916 | |
1917 | const QChar next(current()); |
1918 | if (next.isSpace() || next == QLatin1Char('?')) |
1919 | { |
1920 | return tokenAndChangeState(code: T_PI_TARGET, value: m_data.mid(position: start, n: m_pos - start), |
1921 | s: ProcessingInstructionContent); |
1922 | } |
1923 | } |
1924 | } |
1925 | case ProcessingInstructionContent: |
1926 | { |
1927 | /* Consume whitespace between the name and the content. */ |
1928 | if (consumeRawWhitespace()) |
1929 | return Token(T_END_OF_FILE); |
1930 | |
1931 | const int start = m_pos; |
1932 | const int len = scanUntil(content: "?>" ); |
1933 | |
1934 | if (len == -1) |
1935 | return Token(T_END_OF_FILE); |
1936 | else |
1937 | { |
1938 | m_pos += 2; /* Consume "?>" */ |
1939 | popState(); |
1940 | return Token(T_PI_CONTENT, normalizeEOL(input: m_data.mid(position: start, n: len), characterSkips: CharacterSkips())); |
1941 | } |
1942 | } |
1943 | case EndTag: |
1944 | { |
1945 | if (consumeRawWhitespace()) |
1946 | return T_END_OF_FILE; |
1947 | |
1948 | if (peekCurrent() == '>') |
1949 | { |
1950 | popState(); |
1951 | return tokenAndAdvance(code: T_G_GT); |
1952 | } |
1953 | else |
1954 | return tokenizeNCNameOrQName(); |
1955 | } |
1956 | case XMLComment: |
1957 | { |
1958 | const int start = m_pos; |
1959 | const int len = scanUntil(content: "--" ); |
1960 | |
1961 | if (len == -1) |
1962 | return T_END_OF_FILE; |
1963 | else |
1964 | { |
1965 | m_pos += 2; /* Consume "--". */ |
1966 | popState(); |
1967 | |
1968 | if (peekCurrent() == '>') |
1969 | { |
1970 | ++m_pos; |
1971 | return Token(T_COMMENT_CONTENT, normalizeEOL(input: m_data.mid(position: start, n: len), characterSkips: CharacterSkips())); |
1972 | } |
1973 | else |
1974 | return error(); |
1975 | } |
1976 | } |
1977 | case Pragma: |
1978 | { |
1979 | /* Consume whitespace. */ |
1980 | if (consumeRawWhitespace()) |
1981 | return Token(T_END_OF_FILE); |
1982 | |
1983 | setState(PragmaContent); |
1984 | return tokenizeNCNameOrQName(); |
1985 | } |
1986 | case PragmaContent: |
1987 | { |
1988 | QString result; |
1989 | result.reserve(asize: 20); |
1990 | |
1991 | const bool hasWS = m_pos < m_length && current().isSpace(); |
1992 | |
1993 | /* Consume all whitespace up to the pragma content(if any). */ |
1994 | if (consumeRawWhitespace()) |
1995 | return Token(T_END_OF_FILE); |
1996 | |
1997 | if (peekCurrent() == '#' && peekAhead() == ')') |
1998 | { |
1999 | /* We reached the end, and there's no pragma content. */ |
2000 | return tokenAndChangeState(code: T_PRAGMA_END, s: Default, advance: 2); |
2001 | } |
2002 | else if (!hasWS) |
2003 | { |
2004 | /* A separating space is required if there's pragma content. */ |
2005 | return error(); /* i18n */ |
2006 | } |
2007 | |
2008 | const int start = m_pos; |
2009 | const int len = scanUntil(content: "#)" ); |
2010 | if (len == -1) |
2011 | return Token(T_END_OF_FILE); |
2012 | |
2013 | return Token(T_STRING_LITERAL, m_data.mid(position: start, n: len)); |
2014 | Q_ASSERT(false); |
2015 | } |
2016 | } |
2017 | |
2018 | Q_ASSERT(false); |
2019 | return error(); |
2020 | } |
2021 | |
2022 | Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep, |
2023 | int &sepStack, |
2024 | const int startPos, |
2025 | const bool aInLiteral, |
2026 | QString &result) |
2027 | { |
2028 | bool inLiteral = aInLiteral; |
2029 | const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"'); |
2030 | |
2031 | while(true) |
2032 | { |
2033 | if (atEnd()) |
2034 | return T_END_OF_FILE; |
2035 | |
2036 | if (peekCurrent() == sep.unicode()) |
2037 | { |
2038 | if (inLiteral) |
2039 | inLiteral = false; |
2040 | else |
2041 | inLiteral = true; |
2042 | |
2043 | if (peekAhead() == sep.unicode()) |
2044 | { |
2045 | /* The quoting mechanism was used. */ |
2046 | result.append(c: current()); |
2047 | m_pos += 2; |
2048 | continue; |
2049 | } |
2050 | else |
2051 | { |
2052 | /* Don't consume the separator, such that we |
2053 | * return a token for it next time. */ |
2054 | if (m_pos == startPos) |
2055 | { |
2056 | ++m_pos; |
2057 | setState(StartTag); |
2058 | return Token(sep == QLatin1Char('"') ? T_QUOTE : T_APOS); |
2059 | } |
2060 | |
2061 | |
2062 | if (sepStack == 0) |
2063 | { |
2064 | return Token(T_STRING_LITERAL, result); |
2065 | } |
2066 | else |
2067 | { |
2068 | result.append(c: current()); |
2069 | ++m_pos; |
2070 | continue; |
2071 | } |
2072 | } |
2073 | } |
2074 | else if (peekCurrent() == '&') |
2075 | { |
2076 | const QString ret(tokenizeCharacterReference()); |
2077 | if (ret.isNull()) |
2078 | return Token(T_ERROR); |
2079 | else |
2080 | { |
2081 | result.append(s: ret); |
2082 | ++m_pos; |
2083 | continue; |
2084 | } |
2085 | } |
2086 | else if (peekCurrent() == otherSep) |
2087 | { |
2088 | result.append(c: current()); |
2089 | ++m_pos; |
2090 | |
2091 | if (peekCurrent() == otherSep) |
2092 | ++m_pos; |
2093 | |
2094 | if (inLiteral) |
2095 | inLiteral = false; |
2096 | else |
2097 | inLiteral = true; |
2098 | |
2099 | continue; |
2100 | } |
2101 | else if (peekCurrent() == '{') |
2102 | { |
2103 | result.append(c: current()); |
2104 | |
2105 | if (peekAhead() == '{') |
2106 | { |
2107 | m_pos += 2; |
2108 | continue; |
2109 | } |
2110 | else |
2111 | { |
2112 | ++m_pos; |
2113 | ++sepStack; |
2114 | const Token t(attributeAsRaw(sep, sepStack, startPos, aInLiteral: false, result)); |
2115 | if (t.type != T_SUCCESS) |
2116 | return t; |
2117 | } |
2118 | |
2119 | } |
2120 | else if (peekCurrent() == '}') |
2121 | { |
2122 | if (inLiteral && peekAhead() == '}') |
2123 | { |
2124 | result.append(c: current()); |
2125 | m_pos += 2; |
2126 | continue; |
2127 | } |
2128 | else |
2129 | { |
2130 | ++m_pos; |
2131 | --sepStack; |
2132 | return Token(T_SUCCESS); /* The return value is arbitrary. */ |
2133 | } |
2134 | } |
2135 | else |
2136 | { |
2137 | result.append(c: current()); |
2138 | ++m_pos; |
2139 | } |
2140 | } |
2141 | } |
2142 | |
2143 | Tokenizer::Token XQueryTokenizer::nextToken(XPATHLTYPE *const sourceLocator) |
2144 | { |
2145 | sourceLocator->first_line = m_line; |
2146 | sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */ |
2147 | |
2148 | if (m_tokenStack.isEmpty()) |
2149 | return nextToken(); |
2150 | else |
2151 | { |
2152 | const Token retval(m_tokenStack.pop()); |
2153 | |
2154 | switch(retval.type) |
2155 | { |
2156 | case T_MODULE: |
2157 | case T_SCHEMA: |
2158 | case T_COPY_NAMESPACES: |
2159 | { |
2160 | setState(NamespaceKeyword); |
2161 | break; |
2162 | } |
2163 | case T_VERSION: |
2164 | { |
2165 | setState(XQueryVersion); |
2166 | break; |
2167 | } |
2168 | case T_AS: |
2169 | case T_OF: |
2170 | { |
2171 | setState(ItemType); |
2172 | break; |
2173 | } |
2174 | default: |
2175 | { |
2176 | if (isOperatorKeyword(code: retval.type)) |
2177 | setState(Default); |
2178 | |
2179 | break; |
2180 | } |
2181 | }; |
2182 | |
2183 | return retval; |
2184 | } |
2185 | } |
2186 | |
2187 | int XQueryTokenizer::commenceScanOnly() |
2188 | { |
2189 | m_scanOnly = true; |
2190 | return m_pos; |
2191 | } |
2192 | |
2193 | void XQueryTokenizer::resumeTokenizationFrom(const int pos) |
2194 | { |
2195 | m_scanOnly = false; |
2196 | m_pos = pos; |
2197 | } |
2198 | |
/**
 * Intentionally does nothing: the passed parser context is ignored, which
 * is why the parameter is left unnamed.
 */
void XQueryTokenizer::setParserContext(const ParserContext::Ptr &)
{
}
2202 | |
2203 | #undef handleWhitespace |
2204 | |
2205 | } // namespace QPatternist |
2206 | |
2207 | QT_END_NAMESPACE |
2208 | |