1/****************************************************************************
2**
3** Copyright (C) 2016 The Qt Company Ltd.
4** Contact: https://www.qt.io/licensing/
5**
6** This file is part of the QtXmlPatterns module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial License Usage
10** Licensees holding valid commercial Qt licenses may use this file in
11** accordance with the commercial license agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and The Qt Company. For licensing terms
14** and conditions see https://www.qt.io/terms-conditions. For further
15** information use the contact form at https://www.qt.io/contact-us.
16**
17** GNU Lesser General Public License Usage
18** Alternatively, this file may be used under the terms of the GNU Lesser
19** General Public License version 3 as published by the Free Software
20** Foundation and appearing in the file LICENSE.LGPL3 included in the
21** packaging of this file. Please review the following information to
22** ensure the GNU Lesser General Public License version 3 requirements
23** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24**
25** GNU General Public License Usage
26** Alternatively, this file may be used under the terms of the GNU
27** General Public License version 2.0 or (at your option) the GNU General
28** Public license version 3 or any later version approved by the KDE Free
29** Qt Foundation. The licenses are as published by the Free Software
30** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31** included in the packaging of this file. Please review the following
32** information to ensure the GNU General Public License requirements will
33** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34** https://www.gnu.org/licenses/gpl-3.0.html.
35**
36** $QT_END_LICENSE$
37**
38****************************************************************************/
39
40#include <QByteArray>
41
42#include "qparsercontext_p.h"
43#include "qquerytransformparser_p.h"
44
45#include "qxquerytokenizer_p.h"
46
47#include "qtokenlookup.cpp"
48
49QT_BEGIN_NAMESPACE
50
51namespace QPatternist
52{
53
/* Runs consumeWhitespace() and, unless it reports T_SUCCESS, makes the
 * enclosing function return a Token carrying the reported type. It can
 * therefore only be used inside functions that return a Token. */
#define handleWhitespace()                                      \
{                                                               \
    const TokenType whitespaceResult = consumeWhitespace();     \
    if (whitespaceResult != T_SUCCESS)                          \
        return Token(whitespaceResult);                         \
}
60
61XQueryTokenizer::XQueryTokenizer(const QString &query,
62 const QUrl &location,
63 const State startingState) : Tokenizer(location)
64 , m_data(query)
65 , m_length(query.length())
66 , m_state(startingState)
67 , m_pos(0)
68 , m_line(1)
69 , m_columnOffset(0)
70 , m_scanOnly(false)
71{
72 Q_ASSERT(location.isValid() || location.isEmpty());
73}
74
75const QChar XQueryTokenizer::current() const
76{
77 if (m_pos < m_length)
78 return m_data.at(i: m_pos);
79 else
80 return QChar();
81}
82
83char XQueryTokenizer::peekCurrent() const
84{
85 return current().toLatin1();
86}
87
88int XQueryTokenizer::peekForColonColon() const
89{
90 /* Note, we don't modify m_pos in this function, so we need to do offset
91 * calculations. */
92 int pos = m_pos;
93
94 while(pos < m_length)
95 {
96 switch(m_data.at(i: pos).toLatin1())
97 {
98 /* Fallthrough these four. */
99 case ' ':
100 case '\t':
101 case '\n':
102 case '\r':
103 break;
104 case ':':
105 {
106 if (peekAhead(length: (pos - m_pos) + 1) == ':')
107 return pos - m_pos;
108 Q_FALLTHROUGH();
109 }
110 default:
111 return -1;
112 }
113 ++pos;
114 }
115
116 return -1;
117}
118
119Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
120 const State s,
121 const int advance)
122{
123 Q_ASSERT(advance >= 0);
124 m_pos += advance;
125 setState(s);
126 return Token(code);
127}
128
129Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code,
130 const QString &value,
131 const State s)
132{
133 setState(s);
134 return Token(code, value);
135}
136
137Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code,
138 const int advance)
139{
140 Q_ASSERT(advance >= 0);
141 m_pos += advance;
142 return Token(code);
143}
144
145QString XQueryTokenizer::normalizeEOL(const QString &input,
146 const CharacterSkips &characterSkips)
147{
148 const int len = input.count();
149 QString result;
150
151 /* The likely hood is rather high it'll be the same content. */
152 result.reserve(asize: len);
153
154 for(int i = 0; i < len; ++i)
155 {
156 const QChar &at = input.at(i);
157
158 if (characterSkips.contains(value: i))
159 {
160 result.append(c: at);
161 continue;
162 }
163 switch(input.at(i).unicode())
164 {
165 case '\r':
166 {
167 if (i + 1 < len && input.at(i: i + 1) == QLatin1Char('\n'))
168 ++i;
169
170 Q_FALLTHROUGH();
171 }
172 case '\n':
173 {
174 result.append(c: QLatin1Char('\n'));
175 continue;
176 }
177 default:
178 {
179 result.append(c: at);
180 }
181 }
182 }
183
184 return result;
185}
186
187Tokenizer::TokenType XQueryTokenizer::consumeComment()
188{
189 /* Below, we return ERROR instead of END_OF_FILE such that the parser
190 * sees an invalid comment. */
191 while(m_pos < m_length)
192 {
193 switch(peekCurrent())
194 {
195 case ':':
196 {
197 ++m_pos; /* Consume ':' */
198 if (atEnd())
199 return T_ERROR;
200
201 if (peekCurrent() == ')')
202 {
203 ++m_pos; /* Consume ')' */
204 return T_SUCCESS; /* The comment closed nicely. */
205 }
206 continue; /* We don't want to increment m_pos twice. */
207 }
208 case '(':
209 { /* It looks like the start of a comment. */
210 ++m_pos;
211
212 if (atEnd())
213 return T_END_OF_FILE;
214 else if (peekCurrent() == ':')
215 {
216 /* And it is a nested comment -- parse it. */
217 const TokenType retval = consumeComment();
218 if (retval == T_SUCCESS)
219 continue; /* Continue with our "own" comment. */
220 else
221 return retval; /* Return the error in the nested comment. */
222 }
223 break;
224 }
225 case '\n':
226 case '\r':
227 {
228 /* We want to count \r\n as a single line break. */
229 if (peekAhead() == '\n')
230 ++m_pos;
231
232 m_columnOffset = m_pos;
233 ++m_line;
234
235 break;
236 }
237 }
238 ++m_pos;
239 }
240
241 return T_ERROR; /* Error: we reached the end while inside a comment. */
242}
243
244bool XQueryTokenizer::consumeRawWhitespace()
245{
246 while(m_pos < m_length)
247 {
248 switch(peekCurrent())
249 {
250 case ' ':
251 case '\t':
252 break;
253 case '\n':
254 case '\r':
255 {
256 if (peekAhead() == '\n')
257 ++m_pos;
258
259 m_columnOffset = m_pos;
260 ++m_line;
261
262 break;
263 }
264 default:
265 return false;
266 }
267 ++m_pos;
268 }
269 return true;
270}
271
272Tokenizer::TokenType XQueryTokenizer::consumeWhitespace()
273{
274 while(m_pos < m_length)
275 {
276 switch(peekCurrent())
277 {
278 case ' ':
279 case '\t':
280 break;
281 case '\n':
282 case '\r':
283 {
284 /* We want to count \r\n as a single line break. */
285 if (peekAhead() == '\n')
286 ++m_pos;
287
288 m_columnOffset = m_pos;
289 ++m_line;
290
291 break;
292 }
293 case '(':
294 {
295 if (peekAhead() == ':')
296 {
297 m_pos += 2; /* Consume "(:" */
298
299 const TokenType comment = consumeComment();
300 if (comment == T_SUCCESS)
301 continue;
302 else
303 return comment;
304 }
305 Q_FALLTHROUGH();
306 }
307 default:
308 return T_SUCCESS;
309 }
310 ++m_pos;
311 }
312
313 return T_END_OF_FILE;
314}
315
316char XQueryTokenizer::peekAhead(const int length) const
317{
318 if (m_pos + length < m_length)
319 return m_data.at(i: m_pos + length).toLatin1();
320 else
321 return 0;
322}
323
324Tokenizer::Token XQueryTokenizer::error()
325{
326 return Token(T_ERROR);
327}
328
329bool XQueryTokenizer::isDigit(const char ch)
330{
331 return ch >= '0' && ch <= '9';
332}
333
334/* Replace with function in QXmlUtils. Write test cases for this. */
335bool XQueryTokenizer::isNCNameStart(const QChar ch)
336{
337 if (ch == QLatin1Char('_'))
338 return true;
339
340 switch(ch.category())
341 {
342 case QChar::Letter_Lowercase:
343 case QChar::Letter_Uppercase:
344 case QChar::Letter_Other:
345 case QChar::Letter_Titlecase:
346 case QChar::Number_Letter:
347 return true;
348 default:
349 return false;
350 }
351}
352
353bool XQueryTokenizer::isNCNameBody(const QChar ch)
354{
355 switch(ch.unicode())
356 {
357 case '.':
358 case '_':
359 case '-':
360 return true;
361 }
362
363 switch(ch.category())
364 {
365 case QChar::Letter_Lowercase:
366 case QChar::Letter_Uppercase:
367 case QChar::Letter_Other:
368 case QChar::Letter_Titlecase:
369 case QChar::Number_Letter:
370 case QChar::Mark_SpacingCombining:
371 case QChar::Mark_Enclosing:
372 case QChar::Mark_NonSpacing:
373 case QChar::Letter_Modifier:
374 case QChar::Number_DecimalDigit:
375 return true;
376 default:
377 return false;
378 }
379}
380
381bool XQueryTokenizer::isPhraseKeyword(const TokenType code)
382{
383 switch(code)
384 {
385 /* Fallthrough all these. */
386 case T_CASTABLE:
387 case T_CAST:
388 case T_COPY_NAMESPACES:
389 case T_DECLARE:
390 case T_EMPTY:
391 case T_MODULE:
392 case T_IMPORT:
393 case T_INSTANCE:
394 case T_ORDER:
395 case T_ORDERING:
396 case T_XQUERY:
397 case T_STABLE:
398 case T_TREAT:
399 return true;
400 default:
401 return false;
402 }
403}
404
405bool XQueryTokenizer::isOperatorKeyword(const TokenType code)
406{
407 switch(code)
408 {
409 /* Fallthrough all these. */
410 case T_AS:
411 case T_ASCENDING:
412 case T_AT:
413 case T_CASE:
414 case T_CAST:
415 case T_CASTABLE:
416 case T_EQ:
417 case T_EXTERNAL:
418 case T_GE:
419 case T_G_EQ:
420 case T_G_GT:
421 case T_G_LT:
422 case T_G_NE:
423 case T_GT:
424 case T_IN:
425 case T_INHERIT:
426 case T_INSTANCE:
427 case T_IS:
428 case T_ITEM:
429 case T_LE:
430 case T_LT:
431 case T_NE:
432 case T_NO_INHERIT:
433 case T_NO_PRESERVE:
434 case T_OF:
435 case T_PRESERVE:
436 case T_RETURN:
437 case T_STABLE:
438 case T_TO:
439 case T_TREAT:
440 return true;
441 default:
442 return false;
443 };
444}
445
446bool XQueryTokenizer::isTypeToken(const TokenType t)
447{
448 switch(t)
449 {
450 /* Fallthrough all these. */
451 case T_ATTRIBUTE:
452 case T_COMMENT:
453 case T_DOCUMENT:
454 case T_DOCUMENT_NODE:
455 case T_ELEMENT:
456 case T_ITEM:
457 case T_NODE:
458 case T_PROCESSING_INSTRUCTION:
459 case T_SCHEMA_ATTRIBUTE:
460 case T_SCHEMA_ELEMENT:
461 case T_TEXT:
462 return true;
463 default:
464 return false;
465 }
466}
467
468Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName()
469{
470 const int start = m_pos;
471
472 const Token t1 = tokenizeNCName();
473 if (t1.hasError())
474 return t1;
475
476 if (peekCurrent() != ':' || peekAhead() == '=')
477 return t1;
478
479 ++m_pos;
480
481 const Token t2 = tokenizeNCName();
482 if (t2.hasError())
483 return t2;
484 else
485 return Token(T_QNAME, m_data.mid(position: start, n: m_pos - start));
486}
487
488Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral()
489{
490 setState(Operator);
491 const int startPos = m_pos;
492 bool hasDot = false;
493 bool isXPath20 = false;
494
495 for(; m_pos < m_length; ++m_pos)
496 {
497 QChar ch(current());
498
499 char cell = ch.cell();
500
501 if (cell == 'e' || cell == 'E')
502 {
503 isXPath20 = true;
504 ++m_pos;
505 ch = current();
506
507 if (ch.row() != 0)
508 break;
509
510 cell = ch.cell();
511
512 if (cell == '+' || cell == '-')
513 continue;
514 }
515
516 if (isNCNameStart(ch))
517 return error();
518
519 if (cell < '0' || cell > '9')
520 {
521 if (cell == '.' && !hasDot)
522 hasDot = true;
523 else
524 break;
525 }
526 }
527
528 return Token(isXPath20 ? T_XPATH2_NUMBER : T_NUMBER, m_data.mid(position: startPos, n: m_pos - startPos));
529}
530
531QString XQueryTokenizer::tokenizeCharacterReference()
532{
533 Q_ASSERT(peekCurrent() == '&');
534
535 const int theEnd = m_data.indexOf(c: QLatin1Char(';'), from: m_pos + 1);
536
537 if (theEnd == -1) /* No ';' found, a syntax error. i18n. */
538 return QString();
539
540 QString content(m_data.mid(position: m_pos + 1, n: (theEnd - m_pos) - 1));
541 m_pos = theEnd;
542
543 const QChar charRef(charForReference(reference: content));
544
545 if (!charRef.isNull())
546 return charRef;
547 else if (content.startsWith(c: QLatin1Char('#')))
548 {
549 int base;
550
551 /* It is only '#' or '#x'. */
552 if (content.length() < 2)
553 return QString();
554
555 /* We got a hex number if it starts with 'x', otherwise it's a decimal. */
556 if (content.at(i: 1) == QLatin1Char('x'))
557 {
558 base = 16;
559 content = content.mid(position: 2); /* Remove "#x". */
560 }
561 else
562 {
563 base = 10;
564 content = content.mid(position: 1); /* Remove "#". */
565 }
566
567 bool conversionOK = false;
568 const int codepoint = content.toInt(ok: &conversionOK, base);
569
570 if (conversionOK)
571 {
572 const QChar ch(codepoint);
573
574 if (ch.isNull())
575 {
576 /* We likely have something which require surrogate pairs. */
577 QString result;
578 result += QChar(QChar::highSurrogate(ucs4: codepoint));
579 result += QChar(QChar::lowSurrogate(ucs4: codepoint));
580 return result;
581 }
582 else
583 return ch;
584 }
585 else
586 return QString();
587 }
588 else
589 return QString();
590}
591
592int XQueryTokenizer::scanUntil(const char *const content)
593{
594 const int end = m_data.indexOf(s: QString::fromLatin1(str: content), from: m_pos);
595
596 if (end == -1)
597 return -1;
598 else
599 {
600 const int len = end - m_pos;
601 m_pos += len;
602 return len;
603 }
604}
605
606QChar XQueryTokenizer::charForReference(const QString &reference)
607{
608 if (m_charRefs.isEmpty())
609 {
610 /* Initialize. */
611 m_charRefs.reserve(asize: 5);
612 m_charRefs.insert(akey: QLatin1String("lt"), avalue: QLatin1Char('<'));
613 m_charRefs.insert(akey: QLatin1String("gt"), avalue: QLatin1Char('>'));
614 m_charRefs.insert(akey: QLatin1String("amp"), avalue: QLatin1Char('&'));
615 m_charRefs.insert(akey: QLatin1String("quot"), avalue: QLatin1Char('"'));
616 m_charRefs.insert(akey: QLatin1String("apos"), avalue: QLatin1Char('\''));
617 }
618
619 return m_charRefs.value(akey: reference);
620}
621
622Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral()
623{
624 const QChar delimiter(current());
625 /* We cannot unfortunately just scan and then do mid(),
626 * since we can encounter character references. */
627 QString result;
628
629 /* This is more likely than QString's default allocation. */
630 result.reserve(asize: 8);
631
632 CharacterSkips skipEOLNormalization;
633
634 /* Advance over the initial quote character. */
635 ++m_pos;
636
637 for(; m_pos < m_length; ++m_pos)
638 {
639 const QChar c(current());
640
641 if (c == QLatin1Char('&'))
642 {
643 const QString charRef(tokenizeCharacterReference());
644
645 if (charRef.isNull())
646 return error();
647 else
648 {
649 skipEOLNormalization.insert(value: result.count());
650 result.append(s: charRef);
651 }
652
653 }
654 else if (c == delimiter)
655 {
656 /* Maybe the escaping mechanism is used. For instance, "s""s"
657 * has the value `s"s'. */
658 ++m_pos;
659
660 if (current() == delimiter) /* Double quote. */
661 result += delimiter;
662 else
663 return Token(T_STRING_LITERAL, normalizeEOL(input: result, characterSkips: skipEOLNormalization));
664 }
665 else
666 result += c;
667 }
668
669 return error();
670}
671
672Tokenizer::Token XQueryTokenizer::tokenizeNCName()
673{
674 const int startPos = m_pos;
675
676 if (m_pos < m_length && isNCNameStart(ch: current()))
677 {
678 ++m_pos;
679
680 for(; m_pos < m_length; ++m_pos)
681 {
682 if (!isNCNameBody(ch: current()))
683 break;
684 }
685
686 return Token(T_NCNAME, m_data.mid(position: startPos, n: m_pos - startPos));
687 }
688 else
689 return error();
690}
691
692bool XQueryTokenizer::aheadEquals(const char *const chs,
693 const int len,
694 const int offset) const
695{
696 Q_ASSERT(len > 0);
697 Q_ASSERT(qstrlen(chs) == uint(len));
698
699 if (m_pos + len >= m_length)
700 return false;
701
702 for(int i = offset; i < (len + offset); ++i)
703 {
704 if (m_data.at(i: m_pos + i).toLatin1() != chs[i - offset])
705 return false;
706 }
707
708 return true;
709}
710
711const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword)
712{
713 return TokenLookup::value(str: keyword.toLatin1().constData(), len: keyword.length());
714}
715
716XQueryTokenizer::State XQueryTokenizer::state() const
717{
718 return m_state;
719}
720
721void XQueryTokenizer::setState(const State s)
722{
723 m_state = s;
724}
725
726void XQueryTokenizer::pushState(const State s)
727{
728 m_stateStack.push(t: s);
729}
730
731void XQueryTokenizer::pushState()
732{
733 m_stateStack.push(t: m_state);
734}
735
736void XQueryTokenizer::popState()
737{
738 /* QStack::pop() asserts if it's empty, so we need to check
739 * it, since we might receive unbalanced curlies. */
740 if (!m_stateStack.isEmpty())
741 m_state = m_stateStack.pop();
742}
743
744Tokenizer::Token XQueryTokenizer::nextToken()
745{
746 switch(state())
747 {
748 /* We want to skip or do special whitespace handling for these
749 * states. So fallthrough all of the following. */
750 case AposAttributeContent:
751 case Axis:
752 case ElementContent:
753 case EndTag:
754 case Pragma:
755 case PragmaContent:
756 case ProcessingInstructionName:
757 case QuotAttributeContent:
758 case StartTag:
759 case XMLComment:
760 break;
761 default:
762 handleWhitespace();
763 }
764
765 switch(state())
766 {
767 case XMLSpaceDecl:
768 case NamespaceKeyword:
769 {
770 switch(peekCurrent())
771 {
772 case ',':
773 return tokenAndAdvance(code: T_COMMA);
774 case '"':
775 case '\'':
776 {
777 setState(NamespaceDecl);
778 return tokenizeStringLiteral();
779 }
780 }
781
782 const Token id(tokenizeNCName());
783
784 if (id.type != T_NCNAME)
785 return id;
786
787 const TokenMap *const keyword = lookupKeyword(keyword: id.value);
788 if (keyword)
789 {
790 switch(keyword->token)
791 {
792 case T_INHERIT:
793 case T_NO_INHERIT:
794 {
795 setState(Default);
796 break;
797 }
798 case T_NAMESPACE:
799 {
800 setState(NamespaceDecl);
801 break;
802 }
803 case T_ORDERED:
804 case T_UNORDERED:
805 case T_STRIP:
806 {
807 setState(Default);
808 break;
809 }
810 case T_PRESERVE:
811 {
812 if (state() != NamespaceKeyword)
813 setState(Default);
814 break;
815 }
816 default:
817 break;
818 }
819
820 return Token(keyword->token);
821 }
822 else
823 return id;
824 }
825 case NamespaceDecl:
826 {
827 switch(peekCurrent())
828 {
829 case '=':
830 return tokenAndAdvance(code: T_G_EQ);
831 case ';':
832 return tokenAndChangeState(code: T_SEMI_COLON, s: Default);
833 case '\'':
834 case '\"':
835 return tokenizeStringLiteral();
836 }
837
838 const Token nc(tokenizeNCName());
839
840 handleWhitespace();
841
842 const char pc = peekCurrent();
843 const TokenMap* const t = lookupKeyword(keyword: nc.value);
844
845 if (pc == '\'' || (pc == '"' && t))
846 return tokenAndChangeState(code: t->token, s: Default, advance: 0);
847 else
848 return nc;
849 }
850 case Axis:
851 {
852 if (peekCurrent() == ':')
853 {
854 Q_ASSERT(peekAhead() == ':');
855 m_pos += 2;
856 setState(AfterAxisSeparator);
857 return Token(T_COLONCOLON);
858 }
859 Q_FALLTHROUGH();
860 }
861 case AfterAxisSeparator:
862 case Default:
863 /* State Operator and state Default have a lot of tokens in common except
864 * for minor differences. So we treat them the same way, and sprinkles logic
865 * here and there to handle the small differences. */
866 Q_FALLTHROUGH();
867 case Operator:
868 {
869 switch(peekCurrent())
870 {
871 case '=':
872 return tokenAndChangeState(code: T_G_EQ, s: Default);
873 case '-':
874 return tokenAndChangeState(code: T_MINUS, s: Default);
875 case '+':
876 return tokenAndChangeState(code: T_PLUS, s: Default);
877 case '[':
878 return tokenAndChangeState(code: T_LBRACKET, s: Default);
879 case ']':
880 return tokenAndChangeState(code: T_RBRACKET, s: Operator);
881 case ',':
882 return tokenAndChangeState(code: T_COMMA, s: Default);
883 case ';':
884 return tokenAndChangeState(code: T_SEMI_COLON, s: Default);
885 case '$':
886 return tokenAndChangeState(code: T_DOLLAR, s: VarName);
887 case '|':
888 return tokenAndChangeState(code: T_BAR, s: Default);
889 case '?':
890 return tokenAndChangeState(code: T_QUESTION, s: Operator);
891 case ')':
892 return tokenAndChangeState(code: T_RPAREN, s: Operator);
893 case '@':
894 return tokenAndChangeState(code: T_AT_SIGN, s: Default);
895 /* Fallthrough all these. */
896 case '1':
897 case '2':
898 case '3':
899 case '4':
900 case '5':
901 case '6':
902 case '7':
903 case '8':
904 case '9':
905 case '0':
906 return tokenizeNumberLiteral();
907 case '.':
908 {
909 const char next = peekAhead();
910 if (next == '.')
911 return tokenAndChangeState(code: T_DOTDOT, s: Operator, advance: 2);
912 /* .5 is allowed, as short form for 0.5:
913 * <tt>[142] DecimalLiteral ::= ("." Digits) | (Digits "." [0-9]*)</tt>
914 */
915 else if (isDigit(ch: next))
916 return tokenizeNumberLiteral();
917 else
918 return tokenAndChangeState(code: T_DOT, s: Operator);
919 }
920 case '\'':
921 case '"':
922 {
923 setState(Operator);
924 return tokenizeStringLiteral();
925
926 }
927 case '(':
928 {
929 if (peekAhead() == '#')
930 return tokenAndChangeState(code: T_PRAGMA_START, s: Pragma, advance: 2);
931 else
932 return tokenAndChangeState(code: T_LPAREN, s: Default);
933 }
934 case '*':
935 {
936 if (peekAhead() == ':')
937 {
938 m_pos += 2; /* Consume *:. */
939 const Token nc = tokenizeNCName();
940
941 if (nc.hasError())
942 return error();
943 else
944 return tokenAndChangeState(code: T_ANY_PREFIX, value: nc.value, s: Operator);
945 }
946 else
947 return tokenAndChangeState(code: T_STAR, s: state() == Default ? Operator : Default);
948 }
949 case ':':
950 {
951 switch(peekAhead())
952 {
953 case '=':
954 return tokenAndChangeState(code: T_ASSIGN, s: Default, advance: 2);
955 case ':':
956 return tokenAndChangeState(code: T_COLONCOLON, s: Default, advance: 2);
957 default:
958 return error();
959 }
960 }
961 case '!':
962 {
963 if (peekAhead() == '=')
964 return tokenAndChangeState(code: T_G_NE, s: Default, advance: 2);
965 else
966 return error();
967 }
968 case '<':
969 {
970 switch(peekAhead())
971 {
972 case '=':
973 return tokenAndChangeState(code: T_G_LE, s: Default, advance: 2);
974 case '<':
975 return tokenAndChangeState(code: T_PRECEDES, s: Default, advance: 2);
976 case '?':
977 {
978 pushState(s: Operator);
979 return tokenAndChangeState(code: T_PI_START, s: ProcessingInstructionName, advance: 2);
980 }
981 case '!':
982 {
983 if (aheadEquals(chs: "!--", len: 3))
984 {
985 m_pos += 3; /* Consume "!--". */
986 pushState(s: Operator);
987 return tokenAndChangeState(code: T_COMMENT_START, s: XMLComment);
988 }
989 /* Fallthrough. It's a syntax error, and this is a good way to report it. */
990 Q_FALLTHROUGH();
991 }
992 default:
993 {
994 if ((m_pos + 1) < m_length && isNCNameStart(ch: m_data.at(i: m_pos + 1)))
995 {
996 /* We assume it's an element constructor. */
997 pushState(s: Operator);
998 }
999
1000 return tokenAndChangeState(code: T_G_LT, s: state() == Operator ? Default : StartTag);
1001 }
1002 }
1003 }
1004 case '>':
1005 {
1006 switch(peekAhead())
1007 {
1008 case '=':
1009 return tokenAndChangeState(code: T_G_GE, s: Default, advance: 2);
1010 case '>':
1011 return tokenAndChangeState(code: T_FOLLOWS, s: Default, advance: 2);
1012 default:
1013 return tokenAndChangeState(code: T_G_GT, s: Default);
1014 }
1015 }
1016 case '/':
1017 {
1018 if (peekAhead() == '/')
1019 return tokenAndChangeState(code: T_SLASHSLASH, s: Default, advance: 2);
1020 else
1021 return tokenAndChangeState(code: T_SLASH, s: Default);
1022 }
1023 case '{':
1024 {
1025 pushState(s: Operator);
1026 return tokenAndChangeState(code: T_CURLY_LBRACE, s: Default);
1027 }
1028 case '}':
1029 {
1030 popState();
1031
1032 return tokenAndAdvance(code: T_CURLY_RBRACE);
1033 }
1034 }
1035
1036 /* Ok. We're in state Default or Operator, and it wasn't a simple
1037 * character. */
1038
1039 const Token id(tokenizeNCName());
1040
1041 if (id.type != T_NCNAME)
1042 return id;
1043
1044 const TokenMap *const keyword = lookupKeyword(keyword: id.value);
1045
1046 if (state() == Operator)
1047 {
1048 if (keyword)
1049 {
1050 if (keyword->token == T_DEFAULT || keyword->token == T_ASCENDING || keyword->token == T_DESCENDING)
1051 setState(Operator);
1052 else if (keyword->token == T_RETURN)
1053 setState(Default);
1054 else if (isPhraseKeyword(code: keyword->token))
1055 {
1056 const TokenType ws = consumeWhitespace();
1057 if (ws == T_ERROR)
1058 return error();
1059
1060 const Token id2(tokenizeNCName());
1061 const TokenMap *const keyword2 = lookupKeyword(keyword: id2.value);
1062
1063 if (keyword2)
1064 {
1065 if (keyword->token == T_TREAT && keyword2->token == T_AS)
1066 setState(ItemType);
1067 else if (keyword->token == T_CAST || (keyword->token == T_CASTABLE && keyword2->token == T_AS) || keyword2->token == T_BY)
1068 setState(Default);
1069
1070 m_tokenStack.push(t: Token(keyword2->token));
1071 }
1072 else
1073 m_tokenStack.push(t: id2);
1074
1075 return Token(keyword->token);
1076 }
1077 else
1078 {
1079 /* Such that we tokenize the second token in "empty greatest". */
1080 if (keyword->token != T_EMPTY)
1081 setState(Default);
1082 }
1083
1084 if (keyword->token == T_AS || keyword->token == T_CASE)
1085 setState(ItemType);
1086
1087 return Token(keyword->token);
1088 }
1089 else
1090 return id;
1091 }
1092
1093 Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator);
1094
1095 /*
1096 * This is hard. Consider this:
1097 *
1098 * Valid: child ::nameTest
1099 * Valid: child:: nameTest
1100 * Syntax Error: child :localName
1101 * Syntax Error: child: localName
1102 *
1103 * Consider "child ::name". Right now, we're here:
1104 * ^
1105 * We don't know whether "child" is a prefix and hence the whitespace is invalid,
1106 * or whether it's an axis and hence skippable. */
1107 {
1108 const int wsLength = peekForColonColon();
1109 /* We cannot call handleWhitespace() because it returns on
1110 * END_OF_FILE, and we have parsed up keyword, and we need to
1111 * deal with that.
1112 *
1113 * If we have a colon colon, which means the whitespace is
1114 * allowed, we skip it. */
1115 if (wsLength != -1)
1116 m_pos += wsLength;
1117 }
1118
1119 /* Handle name tests. */
1120 if (peekCurrent() == ':')
1121 {
1122 switch(peekAhead())
1123 {
1124 case '=':
1125 return id;
1126 case '*':
1127 {
1128 m_pos += 2;
1129 return tokenAndChangeState(code: T_ANY_LOCAL_NAME, value: id.value, s: Operator);
1130 }
1131 case ':':
1132 {
1133 /* We have an axis. */
1134 setState(Axis);
1135 return keyword ? Token(keyword->token) : id;
1136 }
1137 default:
1138 {
1139 /* It's a QName. */
1140 ++m_pos; /* Consume the colon. */
1141
1142 const Token id2(tokenizeNCName());
1143
1144 if (id2.type != T_NCNAME)
1145 {
1146 --m_pos;
1147 return id;
1148 }
1149
1150 setState(Operator);
1151 const int qNameLen = id.value.length() + id2.value.length() + 1;
1152 return Token(T_QNAME, m_data.mid(position: m_pos - qNameLen, n: qNameLen));
1153 }
1154 }
1155 }
1156
1157 if (!keyword || isOperatorKeyword(code: keyword->token))
1158 {
1159 setState(Operator);
1160 return id;
1161 }
1162
1163 const TokenType ws = consumeWhitespace();
1164 if (ws == T_ERROR) // TODO this should test for success. Write test.
1165 return Token(T_ERROR);
1166
1167 if (atEnd())
1168 {
1169 setState(Operator);
1170 return id;
1171 }
1172
1173 /* Let the if-body apply for constructors, and node type tests. */
1174 if (isTypeToken(t: keyword->token) ||
1175 keyword->token == T_TYPESWITCH ||
1176 keyword->token == T_ORDERED ||
1177 keyword->token == T_UNORDERED ||
1178 keyword->token == T_IF)
1179 {
1180 switch(peekCurrent())
1181 {
1182 case '(':
1183 {
1184 // TODO See if we can remove DOCUMENT from isTypeToken.
1185 if (isTypeToken(t: keyword->token) && keyword->token != T_DOCUMENT)
1186 {
1187 m_tokenStack.push(t: Token(T_LPAREN));
1188 ++m_pos; /* Consume '('. */
1189 pushState(s: Operator);
1190
1191 if (keyword->token == T_PROCESSING_INSTRUCTION)
1192 setState(KindTestForPI);
1193 else
1194 setState(KindTest);
1195
1196 return Token(keyword->token);
1197 }
1198 else if (keyword->token == T_TYPESWITCH || keyword->token == T_IF)
1199 return Token(keyword->token);
1200 else /* It's a function call. */
1201 return id;
1202 }
1203 case '{':
1204 {
1205 m_tokenStack.push(t: Token(T_CURLY_LBRACE));
1206 ++m_pos; /* Consume '{'. */
1207 pushState(s: Operator);
1208 /* Stay in state Default. */
1209 return Token(keyword->token);
1210 }
1211 default:
1212 {
1213 /* We have read in a token which is for instance
1214 * "return", and now it can be an element
1215 * test("element") a node kind test("element()"), or a
1216 * computed element constructor("element name {...").
1217 * We need to do a two-token lookahead here, because
1218 * "element return" can be an element test followed by
1219 * the return keyword, but it can also be an element
1220 * constructor("element return {"). */
1221 if (isNCNameStart(ch: current()))
1222 {
1223 const int currentPos = m_pos;
1224 const Token token2 = tokenizeNCNameOrQName();
1225
1226 if (token2.hasError())
1227 return token2;
1228
1229 handleWhitespace();
1230
1231 if (peekCurrent() == '{')
1232 {
1233 /* An element constructor. */
1234 m_tokenStack.push(t: token2);
1235 return Token(keyword->token);
1236 }
1237
1238 /* We jump back in the stream, we need to tokenize token2 according
1239 * to the state. */
1240 m_pos = currentPos;
1241 setState(Operator);
1242 return Token(T_NCNAME, QLatin1String(keyword->name));
1243 }
1244 }
1245 }
1246 }
1247
1248 if (peekCurrent() == '$')
1249 {
1250 setState(VarName);
1251 return Token(keyword->token);
1252 }
1253
1254 /* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */
1255 if (peekCurrent() == '(')
1256 return id;
1257 else if (peekCurrent() == '{' && keyword->token == T_VALIDATE)
1258 return Token(keyword->token);
1259
1260 if (!isNCNameStart(ch: current()))
1261 {
1262 setState(Operator);
1263 return id;
1264 }
1265
1266 const Token id2(tokenizeNCName());
1267 const TokenMap *const keyword2 = lookupKeyword(keyword: id2.value);
1268
1269 if (!keyword2)
1270 {
1271 /* It's a syntax error. All cases of two subsequent ncnames are keywords(e.g, declarations). */
1272 setState(Operator);
1273 return id;
1274 }
1275
1276 switch(keyword->token)
1277 {
1278 case T_DECLARE:
1279 {
1280 switch(keyword2->token)
1281 {
1282 case T_VARIABLE:
1283 case T_FUNCTION:
1284 {
1285 m_tokenStack.push(t: Token(keyword2->token));
1286 setState(Default);
1287 return Token(keyword->token);
1288 }
1289 case T_OPTION:
1290 {
1291 m_tokenStack.push(t: Token(keyword2->token));
1292 setState(Default);
1293 return Token(keyword->token);
1294 }
1295 case T_COPY_NAMESPACES:
1296 case T_ORDERING:
1297 {
1298 m_tokenStack.push(t: Token(keyword2->token));
1299 setState(NamespaceKeyword);
1300 return Token(keyword->token);
1301 }
1302 case T_CONSTRUCTION:
1303 {
1304 // TODO identical to CONSTRUCTION?
1305 m_tokenStack.push(t: Token(keyword2->token));
1306 setState(Operator);
1307 return Token(keyword->token);
1308 }
1309 case T_NAMESPACE:
1310 case T_BASEURI:
1311 {
1312 m_tokenStack.push(t: Token(keyword2->token));
1313 setState(NamespaceDecl);
1314 return Token(keyword->token);
1315 }
1316 case T_BOUNDARY_SPACE:
1317 {
1318 m_tokenStack.push(t: Token(keyword2->token));
1319 setState(XMLSpaceDecl);
1320 return Token(keyword->token);
1321 }
1322 case T_DEFAULT:
1323 {
1324 m_tokenStack.push(t: Token(keyword2->token));
1325
1326 const TokenType ws2 = consumeWhitespace();
1327 if (ws2 != T_SUCCESS)
1328 {
1329 m_tokenStack.prepend(t: Token(ws2));
1330 return Token(keyword->token);
1331 }
1332
1333 const Token id3(tokenizeNCName());
1334
1335 if (id3.type != T_NCNAME)
1336 {
1337 m_tokenStack.prepend(t: id3);
1338 return Token(keyword->token);
1339 }
1340
1341 const TokenMap *const keyword3 = lookupKeyword(keyword: id3.value);
1342 if (!keyword3)
1343 {
1344 m_tokenStack.prepend(t: id3);
1345 return Token(keyword->token);
1346 }
1347 else
1348 {
1349 m_tokenStack.prepend(t: Token(keyword3->token));
1350
1351 if (keyword3->token == T_ORDER)
1352 setState(Operator);
1353 else
1354 setState(NamespaceDecl);
1355 }
1356
1357 return Token(keyword->token);
1358 }
1359 default:
1360 {
1361 m_tokenStack.push(t: Token(keyword2->token));
1362 setState(Default);
1363 return id;
1364 }
1365 }
1366 }
1367 case T_XQUERY:
1368 {
1369 m_tokenStack.push(t: Token(keyword2->token));
1370
1371 if (keyword2->token == T_VERSION)
1372 {
1373 setState(NamespaceDecl);
1374 return Token(keyword->token);
1375 }
1376 else
1377 {
1378 setState(Operator);
1379 return id;
1380 }
1381 }
1382 case T_IMPORT:
1383 {
1384 m_tokenStack.push(t: Token(keyword2->token));
1385
1386 switch(keyword2->token)
1387 {
1388 case T_SCHEMA:
1389 case T_MODULE:
1390 {
1391 setState(NamespaceKeyword);
1392 return Token(keyword->token);
1393 }
1394 default:
1395 {
1396 setState(Operator);
1397 return id;
1398 }
1399 }
1400 }
1401 case T_VALIDATE:
1402 {
1403 m_tokenStack.push(t: Token(keyword2->token));
1404
1405 switch(keyword2->token)
1406 {
1407 case T_LAX:
1408 case T_STRICT:
1409 {
1410 pushState(s: Operator);
1411 return Token(keyword->token);
1412 }
1413 default:
1414 {
1415 setState(Operator);
1416 return id;
1417 }
1418 }
1419 }
1420 default:
1421 {
1422 m_tokenStack.push(t: Token(keyword2->token));
1423 setState(Operator);
1424 return id;
1425 }
1426 }
1427 }
1428 case VarName:
1429 {
1430 if (peekCurrent() == '$')
1431 return tokenAndAdvance(code: T_DOLLAR);
1432
1433 setState(Operator);
1434 return tokenizeNCNameOrQName();
1435 }
1436 case ItemType:
1437 {
1438 switch(peekCurrent())
1439 {
1440 case '(':
1441 return tokenAndChangeState(code: T_LPAREN, s: KindTest);
1442 case '$':
1443 return tokenAndChangeState(code: T_DOLLAR, s: VarName);
1444 }
1445
1446 const Token name(tokenizeNCNameOrQName());
1447
1448 if (name.hasError())
1449 return error();
1450
1451 else if (name.type == T_QNAME)
1452 {
1453 setState(OccurrenceIndicator);
1454 return name;
1455 }
1456 else
1457 {
1458 const TokenMap *const keyword = lookupKeyword(keyword: name.value);
1459
1460 if (keyword)
1461 {
1462 pushState(s: OccurrenceIndicator);
1463 return Token(keyword->token);
1464 }
1465 else
1466 {
1467 setState(Default);
1468 return name;
1469 }
1470 }
1471 }
1472 case KindTest:
1473 {
1474 switch(peekCurrent())
1475 {
1476 case ')':
1477 {
1478 popState();
1479 return tokenAndAdvance(code: T_RPAREN);
1480 }
1481 case '(':
1482 return tokenAndAdvance(code: T_LPAREN);
1483 case ',':
1484 return tokenAndAdvance(code: T_COMMA);
1485 case '*':
1486 return tokenAndAdvance(code: T_STAR);
1487 case '?':
1488 return tokenAndAdvance(code: T_QUESTION);
1489 case '\'':
1490 case '"':
1491 return tokenizeStringLiteral();
1492 }
1493
1494 const Token nc(tokenizeNCNameOrQName());
1495 if (nc.hasError())
1496 return nc;
1497
1498 const TokenType ws = consumeWhitespace();
1499 if (ws == T_ERROR)
1500 return error();
1501
1502 if (peekCurrent() == '(')
1503 {
1504 const TokenMap *const keyword = lookupKeyword(keyword: nc.value);
1505 if (keyword)
1506 {
1507 pushState(s: KindTest);
1508 return Token(keyword->token);
1509 }
1510 else
1511 return nc;
1512 }
1513 else
1514 return nc;
1515 }
1516 case KindTestForPI:
1517 {
1518 switch(peekCurrent())
1519 {
1520 case ')':
1521 {
1522 popState();
1523 return tokenAndAdvance(code: T_RPAREN);
1524 }
1525 case '\'':
1526 case '"':
1527 return tokenizeStringLiteral();
1528 default:
1529 return tokenizeNCName();
1530 }
1531 }
1532 case OccurrenceIndicator:
1533 {
1534 switch(peekCurrent())
1535 {
1536 case '?':
1537 return tokenAndChangeState(code: T_QUESTION, s: Operator);
1538 case '*':
1539 return tokenAndChangeState(code: T_STAR, s: Operator);
1540 case '+':
1541 return tokenAndChangeState(code: T_PLUS, s: Operator);
1542 default:
1543 {
1544 setState(Operator);
1545 return nextToken();
1546 }
1547 }
1548 }
1549 case XQueryVersion:
1550 {
1551 switch(peekCurrent())
1552 {
1553 case '\'':
1554 case '"':
1555 return tokenizeStringLiteral();
1556 case ';':
1557 return tokenAndChangeState(code: T_SEMI_COLON, s: Default);
1558 }
1559
1560 const Token id(tokenizeNCName());
1561
1562 if (id.type != T_NCNAME)
1563 return id;
1564
1565 const TokenMap *const keyword = lookupKeyword(keyword: id.value);
1566 if (keyword)
1567 return tokenAndChangeState(code: keyword->token, s: Default);
1568 else
1569 return id;
1570 }
1571 case StartTag:
1572 {
1573 if (peekAhead(length: -1) == '<')
1574 {
1575 if (current().isSpace())
1576 return Token(T_ERROR);
1577 }
1578 else
1579 {
1580 if (consumeRawWhitespace())
1581 return Token(T_END_OF_FILE);
1582 }
1583
1584 switch(peekCurrent())
1585 {
1586 case '/':
1587 {
1588 if (peekAhead() == '>')
1589 {
1590 m_pos += 2;
1591
1592 if (m_scanOnly)
1593 return Token(T_POSITION_SET);
1594 else
1595 {
1596 popState();
1597 return Token(T_QUICK_TAG_END);
1598 }
1599 }
1600 else
1601 return error();
1602 }
1603 case '>':
1604 {
1605 if (m_scanOnly)
1606 return tokenAndChangeState(code: T_POSITION_SET, s: StartTag);
1607 else
1608 return tokenAndChangeState(code: T_G_GT, s: ElementContent);
1609 }
1610 case '=':
1611 return tokenAndAdvance(code: T_G_EQ);
1612 case '\'':
1613 return tokenAndChangeState(code: T_APOS, s: AposAttributeContent);
1614 case '"':
1615 return tokenAndChangeState(code: T_QUOTE, s: QuotAttributeContent);
1616 default:
1617 return tokenizeNCNameOrQName();
1618 }
1619 }
1620 case AposAttributeContent:
1621 case QuotAttributeContent:
1622 {
1623 const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"'));
1624 QString result;
1625 result.reserve(asize: 20);
1626
1627 if (m_scanOnly)
1628 {
1629 int stack = 0;
1630 return attributeAsRaw(separator: sep, stack, startPos: m_pos, inLiteral: true, result);
1631 }
1632
1633 Q_ASSERT(!m_scanOnly);
1634 while(true)
1635 {
1636 if (atEnd())
1637 {
1638 /* In the case that the XSL-T tokenizer invokes us with
1639 * default state QuotAttributeContent, we need to be able
1640 * to return a single string, in case that is all we have
1641 * accumulated. */
1642 if (result.isEmpty())
1643 return Token(T_END_OF_FILE);
1644 else
1645 return Token(T_STRING_LITERAL, result);
1646 }
1647
1648 const QChar curr(current());
1649
1650 if (curr == sep)
1651 {
1652 if (m_pos + 1 == m_length)
1653 return Token(T_END_OF_FILE);
1654
1655 if (m_data.at(i: m_pos + 1) == sep)
1656 {
1657 /* The quoting mechanism was used. */
1658 m_pos += 2;
1659 result.append(c: sep);
1660 continue;
1661 }
1662
1663 const QChar next(m_data.at(i: m_pos + 1));
1664 if (!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>'))
1665 return Token(T_ERROR); // i18n Space must separate attributes
1666
1667 if (result.isEmpty())
1668 {
1669 return tokenAndChangeState(code: state() == AposAttributeContent ? T_APOS : T_QUOTE,
1670 s: StartTag, advance: 1);
1671 }
1672
1673 /* Don't consume the sep, but leave it so we next time return a token for it. */
1674 return Token(T_STRING_LITERAL, result);
1675 }
1676 else if (curr == QLatin1Char('{'))
1677 {
1678 if (m_pos + 1 == m_length)
1679 return Token(T_END_OF_FILE);
1680 else if (peekAhead() == '{')
1681 {
1682 ++m_pos;
1683 result.append(c: QLatin1Char('{'));
1684 }
1685 else
1686 {
1687 if (result.isEmpty())
1688 {
1689 /* The Attribute Value Template appeared directly in the attribute. */
1690 pushState();
1691 return tokenAndChangeState(code: T_CURLY_LBRACE, s: Default);
1692 }
1693 else
1694 {
1695 /* We don't advance, keep '{' as next token. */
1696 return Token(T_STRING_LITERAL, result);
1697 }
1698 }
1699 }
1700 else if (curr == QLatin1Char('}'))
1701 {
1702 if (m_pos + 1 == m_length)
1703 return Token(T_END_OF_FILE);
1704 else if (peekAhead() == '}')
1705 {
1706 ++m_pos;
1707 result.append(c: QLatin1Char('}'));
1708 }
1709 else
1710 return Token(T_ERROR);
1711 }
1712 else if (curr == QLatin1Char('&'))
1713 {
1714 const QString ret(tokenizeCharacterReference());
1715 if (ret.isNull())
1716 return Token(T_ERROR);
1717 else
1718 result.append(s: ret);
1719 }
1720 else if (curr == QLatin1Char('<'))
1721 return Token(T_STRING_LITERAL, result);
1722 else
1723 {
1724 /* See Extensible Markup Language (XML) 1.0 (Fourth Edition),
1725 * 3.3.3 Attribute-Value Normalization.
1726 *
1727 * However, it is complicated a bit by that AVN is defined on top of
1728 * EOL normalization and we do those two in one go here. */
1729 switch(curr.unicode())
1730 {
1731 case 0xD:
1732 {
1733 if (peekAhead() == '\n')
1734 {
1735 result.append(c: QLatin1Char(' '));
1736 ++m_pos;
1737 break;
1738 }
1739 Q_FALLTHROUGH();
1740 }
1741 case 0xA:
1742 case 0x9:
1743 {
1744 result.append(c: QLatin1Char(' '));
1745 break;
1746 }
1747 default:
1748 result.append(c: curr);
1749 }
1750 }
1751
1752 ++m_pos;
1753 }
1754 }
1755 case ElementContent:
1756 {
1757 QString result;
1758 result.reserve(asize: 20);
1759
1760 /* Whether the text node, result, may be whitespace only. Character references
1761 * and CDATA sections disables that. */
1762 bool mayBeWS = true;
1763
1764 CharacterSkips skipEOLNormalization;
1765
1766 while(true)
1767 {
1768 if (atEnd())
1769 return Token(T_END_OF_FILE);
1770
1771 switch(peekCurrent())
1772 {
1773 case '<':
1774 {
1775 if (!result.isEmpty() && peekAhead(length: 2) != '[')
1776 {
1777 /* We encountered the end, and it was not a CDATA section. */
1778 /* We don't advance. Next time we'll handle the <... stuff. */
1779 return Token(mayBeWS ? T_STRING_LITERAL : T_NON_BOUNDARY_WS, normalizeEOL(input: result, characterSkips: skipEOLNormalization));
1780 }
1781
1782 ++m_pos;
1783 if (atEnd())
1784 return Token(T_END_OF_FILE);
1785
1786 const QChar ahead(current());
1787 if (ahead.isSpace())
1788 return error();
1789 else if (ahead == QLatin1Char('/'))
1790 {
1791 if (m_pos + 1 == m_length)
1792 return Token(T_END_OF_FILE);
1793 else if (m_data.at(i: m_pos + 1).isSpace())
1794 return error();
1795 else
1796 return tokenAndChangeState(code: T_BEGIN_END_TAG, s: EndTag);
1797 }
1798 else if (isNCNameStart(ch: ahead))
1799 {
1800 pushState();
1801 return tokenAndChangeState(code: T_G_LT, s: StartTag, advance: 0);
1802 }
1803 else if (aheadEquals(chs: "!--", len: 3, offset: 0))
1804 {
1805 pushState();
1806 m_pos += 3;
1807 return tokenAndChangeState(code: T_COMMENT_START, s: XMLComment, advance: 0);
1808 }
1809 else if (aheadEquals(chs: "![CDATA[", len: 8, offset: 0))
1810 {
1811 mayBeWS = false;
1812 m_pos += 8;
1813 const int start = m_pos;
1814 const int len = scanUntil(content: "]]>");
1815
1816 if (len == -1)
1817 return Token(T_END_OF_FILE);
1818
1819 m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */
1820 result.append(s: m_data.mid(position: start, n: len));
1821 break;
1822 }
1823 else if (ahead == QLatin1Char('?'))
1824 {
1825 pushState();
1826 return tokenAndChangeState(code: T_PI_START, s: ProcessingInstructionName);
1827 }
1828 else
1829 return Token(T_G_LT);
1830 }
1831 case '&':
1832 {
1833 const QString ret(tokenizeCharacterReference());
1834 if (ret.isNull())
1835 return Token(T_ERROR);
1836 else
1837 {
1838 skipEOLNormalization.insert(value: result.count());
1839 result.append(s: ret);
1840 mayBeWS = false;
1841 break;
1842 }
1843 }
1844 case '{':
1845 {
1846 // TODO remove this check, also below.
1847 if (m_pos + 1 == m_length)
1848 return Token(T_END_OF_FILE);
1849 else if (peekAhead() == '{')
1850 {
1851 ++m_pos;
1852 result.append(c: QLatin1Char('{'));
1853 }
1854 else
1855 {
1856 if (result.isEmpty())
1857 {
1858 pushState();
1859 return tokenAndChangeState(code: T_CURLY_LBRACE, s: Default);
1860 }
1861 else
1862 {
1863 /* We don't advance here. */
1864 return Token(mayBeWS ? T_STRING_LITERAL : T_NON_BOUNDARY_WS, normalizeEOL(input: result, characterSkips: skipEOLNormalization));
1865 }
1866 }
1867 break;
1868 }
1869 case '}':
1870 {
1871 if (m_pos + 1 == m_length)
1872 return Token(T_END_OF_FILE);
1873 else if (peekAhead() == '}')
1874 {
1875 ++m_pos;
1876 result.append(c: QLatin1Char('}'));
1877 }
1878 else
1879 {
1880 /* This is a parse error, and the grammar won't be able
1881 * to reduce this CURLY_RBRACE. */
1882 return tokenAndChangeState(code: T_CURLY_RBRACE, s: Default);
1883 }
1884 break;
1885 }
1886 case '\n':
1887 {
1888 /* We want to translate \r\n into \n. */
1889 if (peekAhead(length: -1) == '\r')
1890 break;
1891 Q_FALLTHROUGH();
1892 }
1893 case '\r':
1894 {
1895 result.append(c: QLatin1Char('\n'));
1896 break;
1897 }
1898 default:
1899 {
1900 result.append(c: current());
1901 break;
1902 }
1903 }
1904 ++m_pos;
1905 }
1906 }
1907 case ProcessingInstructionName:
1908 {
1909 const int start = m_pos;
1910
1911 while(true)
1912 {
1913 ++m_pos;
1914 if (m_pos >= m_length)
1915 return Token(T_END_OF_FILE);
1916
1917 const QChar next(current());
1918 if (next.isSpace() || next == QLatin1Char('?'))
1919 {
1920 return tokenAndChangeState(code: T_PI_TARGET, value: m_data.mid(position: start, n: m_pos - start),
1921 s: ProcessingInstructionContent);
1922 }
1923 }
1924 }
1925 case ProcessingInstructionContent:
1926 {
1927 /* Consume whitespace between the name and the content. */
1928 if (consumeRawWhitespace())
1929 return Token(T_END_OF_FILE);
1930
1931 const int start = m_pos;
1932 const int len = scanUntil(content: "?>");
1933
1934 if (len == -1)
1935 return Token(T_END_OF_FILE);
1936 else
1937 {
1938 m_pos += 2; /* Consume "?>" */
1939 popState();
1940 return Token(T_PI_CONTENT, normalizeEOL(input: m_data.mid(position: start, n: len), characterSkips: CharacterSkips()));
1941 }
1942 }
1943 case EndTag:
1944 {
1945 if (consumeRawWhitespace())
1946 return T_END_OF_FILE;
1947
1948 if (peekCurrent() == '>')
1949 {
1950 popState();
1951 return tokenAndAdvance(code: T_G_GT);
1952 }
1953 else
1954 return tokenizeNCNameOrQName();
1955 }
1956 case XMLComment:
1957 {
1958 const int start = m_pos;
1959 const int len = scanUntil(content: "--");
1960
1961 if (len == -1)
1962 return T_END_OF_FILE;
1963 else
1964 {
1965 m_pos += 2; /* Consume "--". */
1966 popState();
1967
1968 if (peekCurrent() == '>')
1969 {
1970 ++m_pos;
1971 return Token(T_COMMENT_CONTENT, normalizeEOL(input: m_data.mid(position: start, n: len), characterSkips: CharacterSkips()));
1972 }
1973 else
1974 return error();
1975 }
1976 }
1977 case Pragma:
1978 {
1979 /* Consume whitespace. */
1980 if (consumeRawWhitespace())
1981 return Token(T_END_OF_FILE);
1982
1983 setState(PragmaContent);
1984 return tokenizeNCNameOrQName();
1985 }
1986 case PragmaContent:
1987 {
1988 QString result;
1989 result.reserve(asize: 20);
1990
1991 const bool hasWS = m_pos < m_length && current().isSpace();
1992
1993 /* Consume all whitespace up to the pragma content(if any). */
1994 if (consumeRawWhitespace())
1995 return Token(T_END_OF_FILE);
1996
1997 if (peekCurrent() == '#' && peekAhead() == ')')
1998 {
1999 /* We reached the end, and there's no pragma content. */
2000 return tokenAndChangeState(code: T_PRAGMA_END, s: Default, advance: 2);
2001 }
2002 else if (!hasWS)
2003 {
2004 /* A separating space is required if there's pragma content. */
2005 return error(); /* i18n */
2006 }
2007
2008 const int start = m_pos;
2009 const int len = scanUntil(content: "#)");
2010 if (len == -1)
2011 return Token(T_END_OF_FILE);
2012
2013 return Token(T_STRING_LITERAL, m_data.mid(position: start, n: len));
2014 Q_ASSERT(false);
2015 }
2016 }
2017
2018 Q_ASSERT(false);
2019 return error();
2020}
2021
2022Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep,
2023 int &sepStack,
2024 const int startPos,
2025 const bool aInLiteral,
2026 QString &result)
2027{
2028 bool inLiteral = aInLiteral;
2029 const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"');
2030
2031 while(true)
2032 {
2033 if (atEnd())
2034 return T_END_OF_FILE;
2035
2036 if (peekCurrent() == sep.unicode())
2037 {
2038 if (inLiteral)
2039 inLiteral = false;
2040 else
2041 inLiteral = true;
2042
2043 if (peekAhead() == sep.unicode())
2044 {
2045 /* The quoting mechanism was used. */
2046 result.append(c: current());
2047 m_pos += 2;
2048 continue;
2049 }
2050 else
2051 {
2052 /* Don't consume the separator, such that we
2053 * return a token for it next time. */
2054 if (m_pos == startPos)
2055 {
2056 ++m_pos;
2057 setState(StartTag);
2058 return Token(sep == QLatin1Char('"') ? T_QUOTE : T_APOS);
2059 }
2060
2061
2062 if (sepStack == 0)
2063 {
2064 return Token(T_STRING_LITERAL, result);
2065 }
2066 else
2067 {
2068 result.append(c: current());
2069 ++m_pos;
2070 continue;
2071 }
2072 }
2073 }
2074 else if (peekCurrent() == '&')
2075 {
2076 const QString ret(tokenizeCharacterReference());
2077 if (ret.isNull())
2078 return Token(T_ERROR);
2079 else
2080 {
2081 result.append(s: ret);
2082 ++m_pos;
2083 continue;
2084 }
2085 }
2086 else if (peekCurrent() == otherSep)
2087 {
2088 result.append(c: current());
2089 ++m_pos;
2090
2091 if (peekCurrent() == otherSep)
2092 ++m_pos;
2093
2094 if (inLiteral)
2095 inLiteral = false;
2096 else
2097 inLiteral = true;
2098
2099 continue;
2100 }
2101 else if (peekCurrent() == '{')
2102 {
2103 result.append(c: current());
2104
2105 if (peekAhead() == '{')
2106 {
2107 m_pos += 2;
2108 continue;
2109 }
2110 else
2111 {
2112 ++m_pos;
2113 ++sepStack;
2114 const Token t(attributeAsRaw(sep, sepStack, startPos, aInLiteral: false, result));
2115 if (t.type != T_SUCCESS)
2116 return t;
2117 }
2118
2119 }
2120 else if (peekCurrent() == '}')
2121 {
2122 if (inLiteral && peekAhead() == '}')
2123 {
2124 result.append(c: current());
2125 m_pos += 2;
2126 continue;
2127 }
2128 else
2129 {
2130 ++m_pos;
2131 --sepStack;
2132 return Token(T_SUCCESS); /* The return value is arbitrary. */
2133 }
2134 }
2135 else
2136 {
2137 result.append(c: current());
2138 ++m_pos;
2139 }
2140 }
2141}
2142
2143Tokenizer::Token XQueryTokenizer::nextToken(XPATHLTYPE *const sourceLocator)
2144{
2145 sourceLocator->first_line = m_line;
2146 sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */
2147
2148 if (m_tokenStack.isEmpty())
2149 return nextToken();
2150 else
2151 {
2152 const Token retval(m_tokenStack.pop());
2153
2154 switch(retval.type)
2155 {
2156 case T_MODULE:
2157 case T_SCHEMA:
2158 case T_COPY_NAMESPACES:
2159 {
2160 setState(NamespaceKeyword);
2161 break;
2162 }
2163 case T_VERSION:
2164 {
2165 setState(XQueryVersion);
2166 break;
2167 }
2168 case T_AS:
2169 case T_OF:
2170 {
2171 setState(ItemType);
2172 break;
2173 }
2174 default:
2175 {
2176 if (isOperatorKeyword(code: retval.type))
2177 setState(Default);
2178
2179 break;
2180 }
2181 };
2182
2183 return retval;
2184 }
2185}
2186
2187int XQueryTokenizer::commenceScanOnly()
2188{
2189 m_scanOnly = true;
2190 return m_pos;
2191}
2192
2193void XQueryTokenizer::resumeTokenizationFrom(const int pos)
2194{
2195 m_scanOnly = false;
2196 m_pos = pos;
2197}
2198
2199void XQueryTokenizer::setParserContext(const ParserContext::Ptr &)
2200{
2201}
2202
2203#undef handleWhitespace
2204
2205} // namespace QPatternist
2206
2207QT_END_NAMESPACE
2208

source code of qtxmlpatterns/src/xmlpatterns/parser/qxquerytokenizer.cpp