| 1 | /**************************************************************************** | 
| 2 | ** | 
| 3 | ** Copyright (C) 2016 The Qt Company Ltd. | 
| 4 | ** Contact: https://www.qt.io/licensing/ | 
| 5 | ** | 
| 6 | ** This file is part of the QtXmlPatterns module of the Qt Toolkit. | 
| 7 | ** | 
| 8 | ** $QT_BEGIN_LICENSE:LGPL$ | 
| 9 | ** Commercial License Usage | 
| 10 | ** Licensees holding valid commercial Qt licenses may use this file in | 
| 11 | ** accordance with the commercial license agreement provided with the | 
| 12 | ** Software or, alternatively, in accordance with the terms contained in | 
| 13 | ** a written agreement between you and The Qt Company. For licensing terms | 
| 14 | ** and conditions see https://www.qt.io/terms-conditions. For further | 
| 15 | ** information use the contact form at https://www.qt.io/contact-us. | 
| 16 | ** | 
| 17 | ** GNU Lesser General Public License Usage | 
| 18 | ** Alternatively, this file may be used under the terms of the GNU Lesser | 
| 19 | ** General Public License version 3 as published by the Free Software | 
| 20 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the | 
| 21 | ** packaging of this file. Please review the following information to | 
| 22 | ** ensure the GNU Lesser General Public License version 3 requirements | 
| 23 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. | 
| 24 | ** | 
| 25 | ** GNU General Public License Usage | 
| 26 | ** Alternatively, this file may be used under the terms of the GNU | 
| 27 | ** General Public License version 2.0 or (at your option) the GNU General | 
| 28 | ** Public license version 3 or any later version approved by the KDE Free | 
| 29 | ** Qt Foundation. The licenses are as published by the Free Software | 
| 30 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 | 
| 31 | ** included in the packaging of this file. Please review the following | 
| 32 | ** information to ensure the GNU General Public License requirements will | 
| 33 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and | 
| 34 | ** https://www.gnu.org/licenses/gpl-3.0.html. | 
| 35 | ** | 
| 36 | ** $QT_END_LICENSE$ | 
| 37 | ** | 
| 38 | ****************************************************************************/ | 
| 39 |  | 
| 40 | #include <QByteArray> | 
| 41 |  | 
| 42 | #include "qparsercontext_p.h" | 
| 43 | #include "qquerytransformparser_p.h" | 
| 44 |  | 
| 45 | #include "qxquerytokenizer_p.h" | 
| 46 |  | 
| 47 | #include "qtokenlookup.cpp" | 
| 48 |  | 
| 49 | QT_BEGIN_NAMESPACE | 
| 50 |  | 
| 51 | namespace QPatternist | 
| 52 | { | 
| 53 |  | 
/* Skips whitespace and comments from within a tokenizer member function.
 * When consumeWhitespace() reports anything other than T_SUCCESS, the
 * enclosing function returns a Token carrying that code. */
#define handleWhitespace()                                      \
{                                                               \
    const TokenType whitespaceResult = consumeWhitespace();     \
    if (whitespaceResult != T_SUCCESS)                          \
        return Token(whitespaceResult);                         \
}
| 60 |  | 
| 61 | XQueryTokenizer::XQueryTokenizer(const QString &query, | 
| 62 |                                  const QUrl &location, | 
| 63 |                                  const State startingState) : Tokenizer(location) | 
| 64 |                                                             , m_data(query) | 
| 65 |                                                             , m_length(query.length()) | 
| 66 |                                                             , m_state(startingState) | 
| 67 |                                                             , m_pos(0) | 
| 68 |                                                             , m_line(1) | 
| 69 |                                                             , m_columnOffset(0) | 
| 70 |                                                             , m_scanOnly(false) | 
| 71 | { | 
| 72 |     Q_ASSERT(location.isValid() || location.isEmpty()); | 
| 73 | } | 
| 74 |  | 
| 75 | const QChar XQueryTokenizer::current() const | 
| 76 | { | 
| 77 |     if (m_pos < m_length) | 
| 78 |         return m_data.at(i: m_pos); | 
| 79 |     else | 
| 80 |         return QChar(); | 
| 81 | } | 
| 82 |  | 
| 83 | char XQueryTokenizer::peekCurrent() const | 
| 84 | { | 
| 85 |     return current().toLatin1(); | 
| 86 | } | 
| 87 |  | 
| 88 | int XQueryTokenizer::peekForColonColon() const | 
| 89 | { | 
| 90 |     /* Note, we don't modify m_pos in this function, so we need to do offset | 
| 91 |      * calculations. */ | 
| 92 |     int pos = m_pos; | 
| 93 |  | 
| 94 |     while(pos < m_length) | 
| 95 |     { | 
| 96 |         switch(m_data.at(i: pos).toLatin1()) | 
| 97 |         { | 
| 98 |             /* Fallthrough these four. */ | 
| 99 |             case ' ': | 
| 100 |             case '\t': | 
| 101 |             case '\n': | 
| 102 |             case '\r': | 
| 103 |                 break; | 
| 104 |             case ':': | 
| 105 |             { | 
| 106 |                 if (peekAhead(length: (pos - m_pos) + 1) == ':') | 
| 107 |                     return pos - m_pos; | 
| 108 |                 Q_FALLTHROUGH(); | 
| 109 |             } | 
| 110 |             default: | 
| 111 |                 return -1; | 
| 112 |         } | 
| 113 |         ++pos; | 
| 114 |     } | 
| 115 |  | 
| 116 |     return -1; | 
| 117 | } | 
| 118 |  | 
| 119 | Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code, | 
| 120 |                                                       const State s, | 
| 121 |                                                       const int advance) | 
| 122 | { | 
| 123 |     Q_ASSERT(advance >= 0); | 
| 124 |     m_pos += advance; | 
| 125 |     setState(s); | 
| 126 |     return Token(code); | 
| 127 | } | 
| 128 |  | 
| 129 | Tokenizer::Token XQueryTokenizer::tokenAndChangeState(const TokenType code, | 
| 130 |                                                       const QString &value, | 
| 131 |                                                       const State s) | 
| 132 | { | 
| 133 |     setState(s); | 
| 134 |     return Token(code, value); | 
| 135 | } | 
| 136 |  | 
| 137 | Tokenizer::Token XQueryTokenizer::tokenAndAdvance(const TokenType code, | 
| 138 |                                                   const int advance) | 
| 139 | { | 
| 140 |     Q_ASSERT(advance >= 0); | 
| 141 |     m_pos += advance; | 
| 142 |     return Token(code); | 
| 143 | } | 
| 144 |  | 
| 145 | QString XQueryTokenizer::normalizeEOL(const QString &input, | 
| 146 |                                       const CharacterSkips &characterSkips) | 
| 147 | { | 
| 148 |     const int len = input.count(); | 
| 149 |     QString result; | 
| 150 |  | 
| 151 |     /* The likely hood is rather high it'll be the same content. */ | 
| 152 |     result.reserve(asize: len); | 
| 153 |  | 
| 154 |     for(int i = 0; i < len; ++i) | 
| 155 |     { | 
| 156 |         const QChar &at = input.at(i); | 
| 157 |  | 
| 158 |         if (characterSkips.contains(value: i)) | 
| 159 |         { | 
| 160 |             result.append(c: at); | 
| 161 |             continue; | 
| 162 |         } | 
| 163 |         switch(input.at(i).unicode()) | 
| 164 |         { | 
| 165 |             case '\r': | 
| 166 |             { | 
| 167 |                 if (i + 1 < len && input.at(i: i + 1) == QLatin1Char('\n')) | 
| 168 |                     ++i; | 
| 169 |  | 
| 170 |                 Q_FALLTHROUGH(); | 
| 171 |             } | 
| 172 |             case '\n': | 
| 173 |             { | 
| 174 |                 result.append(c: QLatin1Char('\n')); | 
| 175 |                 continue; | 
| 176 |             } | 
| 177 |             default: | 
| 178 |             { | 
| 179 |                 result.append(c: at); | 
| 180 |             } | 
| 181 |         } | 
| 182 |     } | 
| 183 |  | 
| 184 |     return result; | 
| 185 | } | 
| 186 |  | 
| 187 | Tokenizer::TokenType XQueryTokenizer::() | 
| 188 | { | 
| 189 |     /* Below, we return ERROR instead of END_OF_FILE such that the parser | 
| 190 |      * sees an invalid comment. */ | 
| 191 |     while(m_pos < m_length) | 
| 192 |     { | 
| 193 |         switch(peekCurrent()) | 
| 194 |         { | 
| 195 |             case ':': | 
| 196 |             { | 
| 197 |                 ++m_pos; /* Consume ':' */ | 
| 198 |                 if (atEnd()) | 
| 199 |                     return T_ERROR; | 
| 200 |  | 
| 201 |                 if (peekCurrent() == ')') | 
| 202 |                 { | 
| 203 |                     ++m_pos; /* Consume ')' */ | 
| 204 |                     return T_SUCCESS; /* The comment closed nicely. */ | 
| 205 |                 } | 
| 206 |                 continue; /* We don't want to increment m_pos twice. */ | 
| 207 |             } | 
| 208 |             case '(': | 
| 209 |             { /* It looks like the start of a comment. */ | 
| 210 |                 ++m_pos; | 
| 211 |  | 
| 212 |                 if (atEnd()) | 
| 213 |                     return T_END_OF_FILE; | 
| 214 |                 else if (peekCurrent() == ':') | 
| 215 |                 { | 
| 216 |                     /* And it is a nested comment -- parse it. */ | 
| 217 |                     const TokenType retval = consumeComment(); | 
| 218 |                     if (retval == T_SUCCESS) | 
| 219 |                         continue; /* Continue with our "own" comment. */ | 
| 220 |                     else | 
| 221 |                         return retval; /* Return the error in the nested comment. */ | 
| 222 |                 } | 
| 223 |                 break; | 
| 224 |             } | 
| 225 |             case '\n': | 
| 226 |             case '\r': | 
| 227 |             { | 
| 228 |                 /* We want to count \r\n as a single line break. */ | 
| 229 |                 if (peekAhead() == '\n') | 
| 230 |                     ++m_pos; | 
| 231 |  | 
| 232 |                 m_columnOffset = m_pos; | 
| 233 |                 ++m_line; | 
| 234 |  | 
| 235 |                 break; | 
| 236 |             } | 
| 237 |         } | 
| 238 |         ++m_pos; | 
| 239 |     } | 
| 240 |  | 
| 241 |     return T_ERROR; /* Error: we reached the end while inside a comment. */ | 
| 242 | } | 
| 243 |  | 
| 244 | bool XQueryTokenizer::consumeRawWhitespace() | 
| 245 | { | 
| 246 |     while(m_pos < m_length) | 
| 247 |     { | 
| 248 |         switch(peekCurrent()) | 
| 249 |         { | 
| 250 |             case ' ': | 
| 251 |             case '\t': | 
| 252 |                 break; | 
| 253 |             case '\n': | 
| 254 |             case '\r': | 
| 255 |             { | 
| 256 |                 if (peekAhead() == '\n') | 
| 257 |                     ++m_pos; | 
| 258 |  | 
| 259 |                 m_columnOffset = m_pos; | 
| 260 |                 ++m_line; | 
| 261 |  | 
| 262 |                 break; | 
| 263 |             } | 
| 264 |             default: | 
| 265 |                 return false; | 
| 266 |         } | 
| 267 |         ++m_pos; | 
| 268 |     } | 
| 269 |     return true; | 
| 270 | } | 
| 271 |  | 
| 272 | Tokenizer::TokenType XQueryTokenizer::consumeWhitespace() | 
| 273 | { | 
| 274 |     while(m_pos < m_length) | 
| 275 |     { | 
| 276 |         switch(peekCurrent()) | 
| 277 |         { | 
| 278 |             case ' ': | 
| 279 |             case '\t': | 
| 280 |                 break; | 
| 281 |             case '\n': | 
| 282 |             case '\r': | 
| 283 |             { | 
| 284 |                 /* We want to count \r\n as a single line break. */ | 
| 285 |                 if (peekAhead() == '\n') | 
| 286 |                     ++m_pos; | 
| 287 |  | 
| 288 |                 m_columnOffset = m_pos; | 
| 289 |                 ++m_line; | 
| 290 |  | 
| 291 |                 break; | 
| 292 |             } | 
| 293 |             case '(': | 
| 294 |             { | 
| 295 |                 if (peekAhead() == ':') | 
| 296 |                 { | 
| 297 |                     m_pos += 2; /* Consume "(:" */ | 
| 298 |  | 
| 299 |                     const TokenType  = consumeComment(); | 
| 300 |                     if (comment == T_SUCCESS) | 
| 301 |                         continue; | 
| 302 |                     else | 
| 303 |                         return comment; | 
| 304 |                 } | 
| 305 |                 Q_FALLTHROUGH(); | 
| 306 |             } | 
| 307 |             default: | 
| 308 |                 return T_SUCCESS; | 
| 309 |         } | 
| 310 |         ++m_pos; | 
| 311 |     } | 
| 312 |  | 
| 313 |     return T_END_OF_FILE; | 
| 314 | } | 
| 315 |  | 
| 316 | char XQueryTokenizer::peekAhead(const int length) const | 
| 317 | { | 
| 318 |     if (m_pos + length < m_length) | 
| 319 |         return m_data.at(i: m_pos + length).toLatin1(); | 
| 320 |     else | 
| 321 |         return 0; | 
| 322 | } | 
| 323 |  | 
| 324 | Tokenizer::Token XQueryTokenizer::error() | 
| 325 | { | 
| 326 |     return Token(T_ERROR); | 
| 327 | } | 
| 328 |  | 
| 329 | bool XQueryTokenizer::isDigit(const char ch) | 
| 330 | { | 
| 331 |     return ch >= '0' && ch <= '9'; | 
| 332 | } | 
| 333 |  | 
| 334 | /* Replace with function in QXmlUtils. Write test cases for this. */ | 
| 335 | bool XQueryTokenizer::isNCNameStart(const QChar ch) | 
| 336 | { | 
| 337 |     if (ch == QLatin1Char('_')) | 
| 338 |         return true; | 
| 339 |  | 
| 340 |     switch(ch.category()) | 
| 341 |     { | 
| 342 |         case QChar::Letter_Lowercase: | 
| 343 |         case QChar::Letter_Uppercase: | 
| 344 |         case QChar::Letter_Other: | 
| 345 |         case QChar::Letter_Titlecase: | 
| 346 |         case QChar::Number_Letter: | 
| 347 |             return true; | 
| 348 |         default: | 
| 349 |             return false; | 
| 350 |     } | 
| 351 | } | 
| 352 |  | 
| 353 | bool XQueryTokenizer::isNCNameBody(const QChar ch) | 
| 354 | { | 
| 355 |     switch(ch.unicode()) | 
| 356 |     { | 
| 357 |         case '.': | 
| 358 |         case '_': | 
| 359 |         case '-': | 
| 360 |             return true; | 
| 361 |     } | 
| 362 |  | 
| 363 |     switch(ch.category()) | 
| 364 |     { | 
| 365 |         case QChar::Letter_Lowercase: | 
| 366 |         case QChar::Letter_Uppercase: | 
| 367 |         case QChar::Letter_Other: | 
| 368 |         case QChar::Letter_Titlecase: | 
| 369 |         case QChar::Number_Letter: | 
| 370 |         case QChar::Mark_SpacingCombining: | 
| 371 |         case QChar::Mark_Enclosing: | 
| 372 |         case QChar::Mark_NonSpacing: | 
| 373 |         case QChar::Letter_Modifier: | 
| 374 |         case QChar::Number_DecimalDigit: | 
| 375 |             return true; | 
| 376 |         default: | 
| 377 |             return false; | 
| 378 |     } | 
| 379 | } | 
| 380 |  | 
| 381 | bool XQueryTokenizer::isPhraseKeyword(const TokenType code) | 
| 382 | { | 
| 383 |     switch(code) | 
| 384 |     { | 
| 385 |         /* Fallthrough all these. */ | 
| 386 |         case T_CASTABLE: | 
| 387 |         case T_CAST: | 
| 388 |         case T_COPY_NAMESPACES: | 
| 389 |         case T_DECLARE: | 
| 390 |         case T_EMPTY: | 
| 391 |         case T_MODULE: | 
| 392 |         case T_IMPORT: | 
| 393 |         case T_INSTANCE: | 
| 394 |         case T_ORDER: | 
| 395 |         case T_ORDERING: | 
| 396 |         case T_XQUERY: | 
| 397 |         case T_STABLE: | 
| 398 |         case T_TREAT: | 
| 399 |             return true; | 
| 400 |         default: | 
| 401 |             return false; | 
| 402 |     } | 
| 403 | } | 
| 404 |  | 
| 405 | bool XQueryTokenizer::isOperatorKeyword(const TokenType code) | 
| 406 | { | 
| 407 |     switch(code) | 
| 408 |     { | 
| 409 |         /* Fallthrough all these. */ | 
| 410 |         case T_AS: | 
| 411 |         case T_ASCENDING: | 
| 412 |         case T_AT: | 
| 413 |         case T_CASE: | 
| 414 |         case T_CAST: | 
| 415 |         case T_CASTABLE: | 
| 416 |         case T_EQ: | 
| 417 |         case T_EXTERNAL: | 
| 418 |         case T_GE: | 
| 419 |         case T_G_EQ: | 
| 420 |         case T_G_GT: | 
| 421 |         case T_G_LT: | 
| 422 |         case T_G_NE: | 
| 423 |         case T_GT: | 
| 424 |         case T_IN: | 
| 425 |         case T_INHERIT: | 
| 426 |         case T_INSTANCE: | 
| 427 |         case T_IS: | 
| 428 |         case T_ITEM: | 
| 429 |         case T_LE: | 
| 430 |         case T_LT: | 
| 431 |         case T_NE: | 
| 432 |         case T_NO_INHERIT: | 
| 433 |         case T_NO_PRESERVE: | 
| 434 |         case T_OF: | 
| 435 |         case T_PRESERVE: | 
| 436 |         case T_RETURN: | 
| 437 |         case T_STABLE: | 
| 438 |         case T_TO: | 
| 439 |         case T_TREAT: | 
| 440 |             return true; | 
| 441 |         default: | 
| 442 |             return false; | 
| 443 |     }; | 
| 444 | } | 
| 445 |  | 
| 446 | bool XQueryTokenizer::isTypeToken(const TokenType t) | 
| 447 | { | 
| 448 |     switch(t) | 
| 449 |     { | 
| 450 |         /* Fallthrough all these. */ | 
| 451 |         case T_ATTRIBUTE: | 
| 452 |         case T_COMMENT: | 
| 453 |         case T_DOCUMENT: | 
| 454 |         case T_DOCUMENT_NODE: | 
| 455 |         case T_ELEMENT: | 
| 456 |         case T_ITEM: | 
| 457 |         case T_NODE: | 
| 458 |         case T_PROCESSING_INSTRUCTION: | 
| 459 |         case T_SCHEMA_ATTRIBUTE: | 
| 460 |         case T_SCHEMA_ELEMENT: | 
| 461 |         case T_TEXT: | 
| 462 |             return true; | 
| 463 |         default: | 
| 464 |             return false; | 
| 465 |     } | 
| 466 | } | 
| 467 |  | 
| 468 | Tokenizer::Token XQueryTokenizer::tokenizeNCNameOrQName() | 
| 469 | { | 
| 470 |     const int start = m_pos; | 
| 471 |  | 
| 472 |     const Token t1 = tokenizeNCName(); | 
| 473 |     if (t1.hasError()) | 
| 474 |         return t1; | 
| 475 |  | 
| 476 |     if (peekCurrent() != ':' || peekAhead() == '=') | 
| 477 |         return t1; | 
| 478 |  | 
| 479 |     ++m_pos; | 
| 480 |  | 
| 481 |     const Token t2 = tokenizeNCName(); | 
| 482 |     if (t2.hasError()) | 
| 483 |         return t2; | 
| 484 |     else | 
| 485 |         return Token(T_QNAME, m_data.mid(position: start, n: m_pos - start)); | 
| 486 | } | 
| 487 |  | 
| 488 | Tokenizer::Token XQueryTokenizer::tokenizeNumberLiteral() | 
| 489 | { | 
| 490 |     setState(Operator); | 
| 491 |     const int startPos = m_pos; | 
| 492 |     bool hasDot = false; | 
| 493 |     bool isXPath20 = false; | 
| 494 |  | 
| 495 |     for(; m_pos < m_length; ++m_pos) | 
| 496 |     { | 
| 497 |         QChar ch(current()); | 
| 498 |  | 
| 499 |         char cell = ch.cell(); | 
| 500 |  | 
| 501 |         if (cell == 'e' || cell == 'E') | 
| 502 |         { | 
| 503 |             isXPath20 = true; | 
| 504 |             ++m_pos; | 
| 505 |             ch = current(); | 
| 506 |  | 
| 507 |             if (ch.row() != 0) | 
| 508 |                 break; | 
| 509 |  | 
| 510 |             cell = ch.cell(); | 
| 511 |  | 
| 512 |             if (cell == '+' || cell == '-') | 
| 513 |                 continue; | 
| 514 |         } | 
| 515 |  | 
| 516 |         if (isNCNameStart(ch)) | 
| 517 |             return error(); | 
| 518 |  | 
| 519 |         if (cell < '0' || cell > '9') | 
| 520 |         { | 
| 521 |             if (cell == '.' && !hasDot) | 
| 522 |                 hasDot = true; | 
| 523 |             else | 
| 524 |                 break; | 
| 525 |         } | 
| 526 |     } | 
| 527 |  | 
| 528 |     return Token(isXPath20 ? T_XPATH2_NUMBER : T_NUMBER, m_data.mid(position: startPos, n: m_pos - startPos)); | 
| 529 | } | 
| 530 |  | 
| 531 | QString XQueryTokenizer::tokenizeCharacterReference() | 
| 532 | { | 
| 533 |     Q_ASSERT(peekCurrent() == '&'); | 
| 534 |  | 
| 535 |     const int theEnd = m_data.indexOf(c: QLatin1Char(';'), from: m_pos + 1); | 
| 536 |  | 
| 537 |     if (theEnd == -1) /* No ';' found, a syntax error. i18n. */ | 
| 538 |         return QString(); | 
| 539 |  | 
| 540 |     QString content(m_data.mid(position: m_pos + 1, n: (theEnd - m_pos) - 1)); | 
| 541 |     m_pos = theEnd; | 
| 542 |  | 
| 543 |     const QChar charRef(charForReference(reference: content)); | 
| 544 |  | 
| 545 |     if (!charRef.isNull()) | 
| 546 |         return charRef; | 
| 547 |     else if (content.startsWith(c: QLatin1Char('#'))) | 
| 548 |     { | 
| 549 |         int base; | 
| 550 |  | 
| 551 |         /* It is only '#' or '#x'. */ | 
| 552 |         if (content.length() < 2) | 
| 553 |             return QString(); | 
| 554 |  | 
| 555 |         /* We got a hex number if it starts with 'x', otherwise it's a decimal. */ | 
| 556 |         if (content.at(i: 1) == QLatin1Char('x')) | 
| 557 |         { | 
| 558 |             base = 16; | 
| 559 |             content = content.mid(position: 2); /* Remove "#x". */ | 
| 560 |         } | 
| 561 |         else | 
| 562 |         { | 
| 563 |             base = 10; | 
| 564 |             content = content.mid(position: 1); /* Remove "#". */ | 
| 565 |         } | 
| 566 |  | 
| 567 |         bool conversionOK = false; | 
| 568 |         const int codepoint = content.toInt(ok: &conversionOK, base); | 
| 569 |  | 
| 570 |         if (conversionOK) | 
| 571 |         { | 
| 572 |             const QChar ch(codepoint); | 
| 573 |  | 
| 574 |             if (ch.isNull()) | 
| 575 |             { | 
| 576 |                 /* We likely have something which require surrogate pairs. */ | 
| 577 |                 QString result; | 
| 578 |                 result += QChar(QChar::highSurrogate(ucs4: codepoint)); | 
| 579 |                 result += QChar(QChar::lowSurrogate(ucs4: codepoint)); | 
| 580 |                 return result; | 
| 581 |             } | 
| 582 |             else | 
| 583 |                 return ch; | 
| 584 |         } | 
| 585 |         else | 
| 586 |             return QString(); | 
| 587 |     } | 
| 588 |     else | 
| 589 |         return QString(); | 
| 590 | } | 
| 591 |  | 
| 592 | int XQueryTokenizer::scanUntil(const char *const content) | 
| 593 | { | 
| 594 |     const int end = m_data.indexOf(s: QString::fromLatin1(str: content), from: m_pos); | 
| 595 |  | 
| 596 |     if (end == -1) | 
| 597 |         return -1; | 
| 598 |     else | 
| 599 |     { | 
| 600 |         const int len = end - m_pos; | 
| 601 |         m_pos += len; | 
| 602 |         return len; | 
| 603 |     } | 
| 604 | } | 
| 605 |  | 
| 606 | QChar XQueryTokenizer::charForReference(const QString &reference) | 
| 607 | { | 
| 608 |     if (m_charRefs.isEmpty()) | 
| 609 |     { | 
| 610 |         /* Initialize. */ | 
| 611 |         m_charRefs.reserve(asize: 5); | 
| 612 |         m_charRefs.insert(akey: QLatin1String("lt" ),     avalue: QLatin1Char('<')); | 
| 613 |         m_charRefs.insert(akey: QLatin1String("gt" ),     avalue: QLatin1Char('>')); | 
| 614 |         m_charRefs.insert(akey: QLatin1String("amp" ),    avalue: QLatin1Char('&')); | 
| 615 |         m_charRefs.insert(akey: QLatin1String("quot" ),   avalue: QLatin1Char('"')); | 
| 616 |         m_charRefs.insert(akey: QLatin1String("apos" ),   avalue: QLatin1Char('\'')); | 
| 617 |     } | 
| 618 |  | 
| 619 |     return m_charRefs.value(akey: reference); | 
| 620 | } | 
| 621 |  | 
| 622 | Tokenizer::Token XQueryTokenizer::tokenizeStringLiteral() | 
| 623 | { | 
| 624 |     const QChar delimiter(current()); | 
| 625 |     /* We cannot unfortunately just scan and then do mid(), | 
| 626 |      * since we can encounter character references. */ | 
| 627 |     QString result; | 
| 628 |  | 
| 629 |     /* This is more likely than QString's default allocation. */ | 
| 630 |     result.reserve(asize: 8); | 
| 631 |  | 
| 632 |     CharacterSkips skipEOLNormalization; | 
| 633 |  | 
| 634 |     /* Advance over the initial quote character. */ | 
| 635 |     ++m_pos; | 
| 636 |  | 
| 637 |     for(; m_pos < m_length; ++m_pos) | 
| 638 |     { | 
| 639 |         const QChar c(current()); | 
| 640 |  | 
| 641 |         if (c == QLatin1Char('&')) | 
| 642 |         { | 
| 643 |             const QString charRef(tokenizeCharacterReference()); | 
| 644 |  | 
| 645 |             if (charRef.isNull()) | 
| 646 |                 return error(); | 
| 647 |             else | 
| 648 |             { | 
| 649 |                 skipEOLNormalization.insert(value: result.count()); | 
| 650 |                 result.append(s: charRef); | 
| 651 |             } | 
| 652 |  | 
| 653 |         } | 
| 654 |         else if (c == delimiter) | 
| 655 |         { | 
| 656 |             /* Maybe the escaping mechanism is used. For instance, "s""s" | 
| 657 |              * has the value `s"s'. */ | 
| 658 |             ++m_pos; | 
| 659 |  | 
| 660 |             if (current() == delimiter) /* Double quote. */ | 
| 661 |                 result += delimiter; | 
| 662 |             else | 
| 663 |                 return Token(T_STRING_LITERAL, normalizeEOL(input: result, characterSkips: skipEOLNormalization)); | 
| 664 |         } | 
| 665 |         else | 
| 666 |             result += c; | 
| 667 |     } | 
| 668 |  | 
| 669 |     return error(); | 
| 670 | } | 
| 671 |  | 
| 672 | Tokenizer::Token XQueryTokenizer::tokenizeNCName() | 
| 673 | { | 
| 674 |     const int startPos = m_pos; | 
| 675 |  | 
| 676 |     if (m_pos < m_length && isNCNameStart(ch: current())) | 
| 677 |     { | 
| 678 |         ++m_pos; | 
| 679 |  | 
| 680 |         for(; m_pos < m_length; ++m_pos) | 
| 681 |         { | 
| 682 |             if (!isNCNameBody(ch: current())) | 
| 683 |                 break; | 
| 684 |         } | 
| 685 |  | 
| 686 |         return Token(T_NCNAME, m_data.mid(position: startPos, n: m_pos - startPos)); | 
| 687 |     } | 
| 688 |     else | 
| 689 |         return error(); | 
| 690 | } | 
| 691 |  | 
| 692 | bool XQueryTokenizer::aheadEquals(const char *const chs, | 
| 693 |                                   const int len, | 
| 694 |                                   const int offset) const | 
| 695 | { | 
| 696 |     Q_ASSERT(len > 0); | 
| 697 |     Q_ASSERT(qstrlen(chs) == uint(len)); | 
| 698 |  | 
| 699 |     if (m_pos + len >= m_length) | 
| 700 |         return false; | 
| 701 |  | 
| 702 |     for(int i = offset; i < (len + offset); ++i) | 
| 703 |     { | 
| 704 |         if (m_data.at(i: m_pos + i).toLatin1() != chs[i - offset]) | 
| 705 |             return false; | 
| 706 |     } | 
| 707 |  | 
| 708 |     return true; | 
| 709 | } | 
| 710 |  | 
| 711 | const TokenMap *XQueryTokenizer::lookupKeyword(const QString &keyword) | 
| 712 | { | 
| 713 |     return TokenLookup::value(str: keyword.toLatin1().constData(), len: keyword.length()); | 
| 714 | } | 
| 715 |  | 
| 716 | XQueryTokenizer::State XQueryTokenizer::state() const | 
| 717 | { | 
| 718 |     return m_state; | 
| 719 | } | 
| 720 |  | 
| 721 | void XQueryTokenizer::setState(const State s) | 
| 722 | { | 
| 723 |     m_state = s; | 
| 724 | } | 
| 725 |  | 
| 726 | void XQueryTokenizer::pushState(const State s) | 
| 727 | { | 
| 728 |     m_stateStack.push(t: s); | 
| 729 | } | 
| 730 |  | 
| 731 | void XQueryTokenizer::pushState() | 
| 732 | { | 
| 733 |     m_stateStack.push(t: m_state); | 
| 734 | } | 
| 735 |  | 
| 736 | void XQueryTokenizer::popState() | 
| 737 | { | 
| 738 |     /* QStack::pop() asserts if it's empty, so we need to check | 
| 739 |      * it, since we might receive unbalanced curlies. */ | 
| 740 |     if (!m_stateStack.isEmpty()) | 
| 741 |         m_state = m_stateStack.pop(); | 
| 742 | } | 
| 743 |  | 
| 744 | Tokenizer::Token XQueryTokenizer::nextToken() | 
| 745 | { | 
| 746 |     switch(state()) | 
| 747 |     { | 
| 748 |         /* We want to skip or do special whitespace handling for these | 
| 749 |          * states. So fallthrough all of the following. */ | 
| 750 |         case AposAttributeContent: | 
| 751 |         case Axis: | 
| 752 |         case ElementContent: | 
| 753 |         case EndTag: | 
| 754 |         case Pragma: | 
| 755 |         case PragmaContent: | 
| 756 |         case ProcessingInstructionName: | 
| 757 |         case QuotAttributeContent: | 
| 758 |         case StartTag: | 
| 759 |         case XMLComment: | 
| 760 |             break; | 
| 761 |         default: | 
| 762 |             handleWhitespace(); | 
| 763 |     } | 
| 764 |  | 
| 765 |     switch(state()) | 
| 766 |     { | 
| 767 |         case XMLSpaceDecl: | 
| 768 |         case NamespaceKeyword: | 
| 769 |         { | 
| 770 |             switch(peekCurrent()) | 
| 771 |             { | 
| 772 |                 case ',': | 
| 773 |                     return tokenAndAdvance(code: T_COMMA); | 
| 774 |                 case '"': | 
| 775 |                 case '\'': | 
| 776 |                 { | 
| 777 |                     setState(NamespaceDecl); | 
| 778 |                     return tokenizeStringLiteral(); | 
| 779 |                 } | 
| 780 |             } | 
| 781 |  | 
| 782 |             const Token id(tokenizeNCName()); | 
| 783 |  | 
| 784 |             if (id.type != T_NCNAME) | 
| 785 |                 return id; | 
| 786 |  | 
| 787 |             const TokenMap *const keyword = lookupKeyword(keyword: id.value); | 
| 788 |             if (keyword) | 
| 789 |             { | 
| 790 |                 switch(keyword->token) | 
| 791 |                 { | 
| 792 |                     case T_INHERIT: | 
| 793 |                     case T_NO_INHERIT: | 
| 794 |                     { | 
| 795 |                         setState(Default); | 
| 796 |                         break; | 
| 797 |                     } | 
| 798 |                     case T_NAMESPACE: | 
| 799 |                     { | 
| 800 |                         setState(NamespaceDecl); | 
| 801 |                         break; | 
| 802 |                     } | 
| 803 |                     case T_ORDERED: | 
| 804 |                     case T_UNORDERED: | 
| 805 |                     case T_STRIP: | 
| 806 |                     { | 
| 807 |                         setState(Default); | 
| 808 |                         break; | 
| 809 |                     } | 
| 810 |                     case T_PRESERVE: | 
| 811 |                     { | 
| 812 |                         if (state() != NamespaceKeyword) | 
| 813 |                             setState(Default); | 
| 814 |                         break; | 
| 815 |                     } | 
| 816 |                     default: | 
| 817 |                         break; | 
| 818 |                 } | 
| 819 |  | 
| 820 |                 return Token(keyword->token); | 
| 821 |             } | 
| 822 |             else | 
| 823 |                 return id; | 
| 824 |         } | 
| 825 |         case NamespaceDecl: | 
| 826 |         { | 
| 827 |             switch(peekCurrent()) | 
| 828 |             { | 
| 829 |                 case '=': | 
| 830 |                     return tokenAndAdvance(code: T_G_EQ); | 
| 831 |                 case ';': | 
| 832 |                     return tokenAndChangeState(code: T_SEMI_COLON, s: Default); | 
| 833 |                 case '\'': | 
| 834 |                 case '\"': | 
| 835 |                     return tokenizeStringLiteral(); | 
| 836 |             } | 
| 837 |  | 
| 838 |             const Token nc(tokenizeNCName()); | 
| 839 |  | 
| 840 |             handleWhitespace(); | 
| 841 |  | 
| 842 |             const char pc = peekCurrent(); | 
| 843 |             const TokenMap* const t = lookupKeyword(keyword: nc.value); | 
| 844 |  | 
| 845 |             if (pc == '\'' || (pc == '"' && t)) | 
| 846 |                 return tokenAndChangeState(code: t->token, s: Default, advance: 0); | 
| 847 |             else | 
| 848 |                 return nc; | 
| 849 |         } | 
| 850 |         case Axis: | 
| 851 |         { | 
| 852 |             if (peekCurrent() == ':') | 
| 853 |             { | 
| 854 |                 Q_ASSERT(peekAhead() == ':'); | 
| 855 |                 m_pos += 2; | 
| 856 |                 setState(AfterAxisSeparator); | 
| 857 |                 return Token(T_COLONCOLON); | 
| 858 |             } | 
| 859 |             Q_FALLTHROUGH(); | 
| 860 |         } | 
| 861 |         case AfterAxisSeparator: | 
| 862 |         case Default: | 
| 863 |            /* State Operator and state Default have a lot of tokens in common except | 
| 864 |             * for minor differences. So we treat them the same way, and sprinkle logic | 
| 865 |             * here and there to handle the small differences. */ | 
| 866 |             Q_FALLTHROUGH(); | 
| 867 |         case Operator: | 
| 868 |         { | 
| 869 |             switch(peekCurrent()) | 
| 870 |             { | 
| 871 |                 case '=': | 
| 872 |                     return tokenAndChangeState(code: T_G_EQ, s: Default); | 
| 873 |                 case '-': | 
| 874 |                     return tokenAndChangeState(code: T_MINUS, s: Default); | 
| 875 |                 case '+': | 
| 876 |                     return tokenAndChangeState(code: T_PLUS, s: Default); | 
| 877 |                 case '[': | 
| 878 |                     return tokenAndChangeState(code: T_LBRACKET, s: Default); | 
| 879 |                 case ']': | 
| 880 |                     return tokenAndChangeState(code: T_RBRACKET, s: Operator); | 
| 881 |                 case ',': | 
| 882 |                     return tokenAndChangeState(code: T_COMMA, s: Default); | 
| 883 |                 case ';': | 
| 884 |                     return tokenAndChangeState(code: T_SEMI_COLON, s: Default); | 
| 885 |                 case '$': | 
| 886 |                     return tokenAndChangeState(code: T_DOLLAR, s: VarName); | 
| 887 |                 case '|': | 
| 888 |                     return tokenAndChangeState(code: T_BAR, s: Default); | 
| 889 |                 case '?': | 
| 890 |                     return tokenAndChangeState(code: T_QUESTION, s: Operator); | 
| 891 |                 case ')': | 
| 892 |                     return tokenAndChangeState(code: T_RPAREN, s: Operator); | 
| 893 |                 case '@': | 
| 894 |                     return tokenAndChangeState(code: T_AT_SIGN, s: Default); | 
| 895 |                 /* Fallthrough all these. */ | 
| 896 |                 case '1': | 
| 897 |                 case '2': | 
| 898 |                 case '3': | 
| 899 |                 case '4': | 
| 900 |                 case '5': | 
| 901 |                 case '6': | 
| 902 |                 case '7': | 
| 903 |                 case '8': | 
| 904 |                 case '9': | 
| 905 |                 case '0': | 
| 906 |                     return tokenizeNumberLiteral(); | 
| 907 |                 case '.': | 
| 908 |                 { | 
| 909 |                     const char next = peekAhead(); | 
| 910 |                     if (next == '.') | 
| 911 |                         return tokenAndChangeState(code: T_DOTDOT, s: Operator, advance: 2); | 
| 912 |                     /* .5 is allowed, as short form for 0.5: | 
| 913 |                      * <tt>[142]     DecimalLiteral     ::=     ("." Digits) | (Digits "." [0-9]*)</tt> | 
| 914 |                      */ | 
| 915 |                     else if (isDigit(ch: next)) | 
| 916 |                         return tokenizeNumberLiteral(); | 
| 917 |                     else | 
| 918 |                         return tokenAndChangeState(code: T_DOT, s: Operator); | 
| 919 |                 } | 
| 920 |                 case '\'': | 
| 921 |                 case '"': | 
| 922 |                 { | 
| 923 |                     setState(Operator); | 
| 924 |                     return tokenizeStringLiteral(); | 
| 925 |  | 
| 926 |                 } | 
| 927 |                 case '(': | 
| 928 |                 { | 
| 929 |                     if (peekAhead() == '#') | 
| 930 |                         return tokenAndChangeState(code: T_PRAGMA_START, s: Pragma, advance: 2); | 
| 931 |                     else | 
| 932 |                         return tokenAndChangeState(code: T_LPAREN, s: Default); | 
| 933 |                 } | 
| 934 |                 case '*': | 
| 935 |                 { | 
| 936 |                     if (peekAhead() == ':') | 
| 937 |                     { | 
| 938 |                         m_pos += 2; /* Consume *:. */ | 
| 939 |                         const Token nc = tokenizeNCName(); | 
| 940 |  | 
| 941 |                         if (nc.hasError()) | 
| 942 |                             return error(); | 
| 943 |                         else | 
| 944 |                             return tokenAndChangeState(code: T_ANY_PREFIX, value: nc.value, s: Operator); | 
| 945 |                     } | 
| 946 |                     else | 
| 947 |                         return tokenAndChangeState(code: T_STAR, s: state() == Default ? Operator : Default); | 
| 948 |                 } | 
| 949 |                 case ':': | 
| 950 |                 { | 
| 951 |                     switch(peekAhead()) | 
| 952 |                     { | 
| 953 |                         case '=': | 
| 954 |                             return tokenAndChangeState(code: T_ASSIGN, s: Default, advance: 2); | 
| 955 |                         case ':': | 
| 956 |                             return tokenAndChangeState(code: T_COLONCOLON, s: Default, advance: 2); | 
| 957 |                         default: | 
| 958 |                             return error(); | 
| 959 |                     } | 
| 960 |                 } | 
| 961 |                 case '!': | 
| 962 |                 { | 
| 963 |                     if (peekAhead() == '=') | 
| 964 |                         return tokenAndChangeState(code: T_G_NE, s: Default, advance: 2); | 
| 965 |                     else | 
| 966 |                         return error(); | 
| 967 |                 } | 
| 968 |                 case '<': | 
| 969 |                 { | 
| 970 |                     switch(peekAhead()) | 
| 971 |                     { | 
| 972 |                         case '=': | 
| 973 |                             return tokenAndChangeState(code: T_G_LE, s: Default, advance: 2); | 
| 974 |                         case '<': | 
| 975 |                             return tokenAndChangeState(code: T_PRECEDES, s: Default, advance: 2); | 
| 976 |                         case '?': | 
| 977 |                         { | 
| 978 |                             pushState(s: Operator); | 
| 979 |                             return tokenAndChangeState(code: T_PI_START, s: ProcessingInstructionName, advance: 2); | 
| 980 |                         } | 
| 981 |                         case '!': | 
| 982 |                         { | 
| 983 |                             if (aheadEquals(chs: "!--" , len: 3)) | 
| 984 |                             { | 
| 985 |                                 m_pos += 3; /* Consume "!--". */ | 
| 986 |                                 pushState(s: Operator); | 
| 987 |                                 return tokenAndChangeState(code: T_COMMENT_START, s: XMLComment); | 
| 988 |                             } | 
| 989 |                             /* Fallthrough. It's a syntax error, and this is a good way to report it. */ | 
| 990 |                             Q_FALLTHROUGH(); | 
| 991 |                         } | 
| 992 |                         default: | 
| 993 |                         { | 
| 994 |                             if ((m_pos + 1) < m_length && isNCNameStart(ch: m_data.at(i: m_pos + 1))) | 
| 995 |                             { | 
| 996 |                                 /* We assume it's an element constructor. */ | 
| 997 |                                 pushState(s: Operator); | 
| 998 |                             } | 
| 999 |  | 
| 1000 |                             return tokenAndChangeState(code: T_G_LT, s: state() == Operator ? Default : StartTag); | 
| 1001 |                         } | 
| 1002 |                     } | 
| 1003 |                 } | 
| 1004 |                 case '>': | 
| 1005 |                 { | 
| 1006 |                     switch(peekAhead()) | 
| 1007 |                     { | 
| 1008 |                         case '=': | 
| 1009 |                             return tokenAndChangeState(code: T_G_GE, s: Default, advance: 2); | 
| 1010 |                         case '>': | 
| 1011 |                             return tokenAndChangeState(code: T_FOLLOWS, s: Default, advance: 2); | 
| 1012 |                         default: | 
| 1013 |                             return tokenAndChangeState(code: T_G_GT, s: Default); | 
| 1014 |                     } | 
| 1015 |                 } | 
| 1016 |                 case '/': | 
| 1017 |                 { | 
| 1018 |                     if (peekAhead() == '/') | 
| 1019 |                         return tokenAndChangeState(code: T_SLASHSLASH, s: Default, advance: 2); | 
| 1020 |                     else | 
| 1021 |                         return tokenAndChangeState(code: T_SLASH, s: Default); | 
| 1022 |                 } | 
| 1023 |                 case '{': | 
| 1024 |                 { | 
| 1025 |                     pushState(s: Operator); | 
| 1026 |                     return tokenAndChangeState(code: T_CURLY_LBRACE, s: Default); | 
| 1027 |                 } | 
| 1028 |                 case '}': | 
| 1029 |                 { | 
| 1030 |                     popState(); | 
| 1031 |  | 
| 1032 |                     return tokenAndAdvance(code: T_CURLY_RBRACE); | 
| 1033 |                 } | 
| 1034 |             } | 
| 1035 |  | 
| 1036 |             /* Ok. We're in state Default or Operator, and it wasn't a simple | 
| 1037 |              * character. */ | 
| 1038 |  | 
| 1039 |             const Token id(tokenizeNCName()); | 
| 1040 |  | 
| 1041 |             if (id.type != T_NCNAME) | 
| 1042 |                 return id; | 
| 1043 |  | 
| 1044 |             const TokenMap *const keyword = lookupKeyword(keyword: id.value); | 
| 1045 |  | 
| 1046 |             if (state() == Operator) | 
| 1047 |             { | 
| 1048 |                 if (keyword) | 
| 1049 |                 { | 
| 1050 |                     if (keyword->token == T_DEFAULT || keyword->token == T_ASCENDING || keyword->token == T_DESCENDING) | 
| 1051 |                         setState(Operator); | 
| 1052 |                     else if (keyword->token == T_RETURN) | 
| 1053 |                         setState(Default); | 
| 1054 |                     else if (isPhraseKeyword(code: keyword->token)) | 
| 1055 |                     { | 
| 1056 |                         const TokenType ws = consumeWhitespace(); | 
| 1057 |                         if (ws == T_ERROR) | 
| 1058 |                             return error(); | 
| 1059 |  | 
| 1060 |                         const Token id2(tokenizeNCName()); | 
| 1061 |                         const TokenMap *const keyword2 = lookupKeyword(keyword: id2.value); | 
| 1062 |  | 
| 1063 |                         if (keyword2) | 
| 1064 |                         { | 
| 1065 |                             if (keyword->token == T_TREAT && keyword2->token == T_AS) | 
| 1066 |                                 setState(ItemType); | 
| 1067 |                             else if (keyword->token == T_CAST || (keyword->token == T_CASTABLE && keyword2->token == T_AS) || keyword2->token == T_BY) | 
| 1068 |                                 setState(Default); | 
| 1069 |  | 
| 1070 |                             m_tokenStack.push(t: Token(keyword2->token)); | 
| 1071 |                         } | 
| 1072 |                         else | 
| 1073 |                             m_tokenStack.push(t: id2); | 
| 1074 |  | 
| 1075 |                         return Token(keyword->token); | 
| 1076 |                     } | 
| 1077 |                     else | 
| 1078 |                     { | 
| 1079 |                         /* Such that we tokenize the second token in "empty greatest". */ | 
| 1080 |                         if (keyword->token != T_EMPTY) | 
| 1081 |                             setState(Default); | 
| 1082 |                     } | 
| 1083 |  | 
| 1084 |                     if (keyword->token == T_AS || keyword->token == T_CASE) | 
| 1085 |                         setState(ItemType); | 
| 1086 |  | 
| 1087 |                     return Token(keyword->token); | 
| 1088 |                 } | 
| 1089 |                 else | 
| 1090 |                     return id; | 
| 1091 |             } | 
| 1092 |  | 
| 1093 |             Q_ASSERT(state() == Default || state() == Axis || state() == AfterAxisSeparator); | 
| 1094 |  | 
| 1095 |             /* | 
| 1096 |              * This is hard. Consider this: | 
| 1097 |              * | 
| 1098 |              * Valid:           child       ::nameTest | 
| 1099 |              * Valid:           child::     nameTest | 
| 1100 |              * Syntax Error:    child       :localName | 
| 1101 |              * Syntax Error:    child:      localName | 
| 1102 |              * | 
| 1103 |              * Consider "child ::name". Right now, we're here: | 
| 1104 |              *                ^ | 
| 1105 |              * We don't know whether "child" is a prefix and hence the whitespace is invalid, | 
| 1106 |              * or whether it's an axis and hence skippable. */ | 
| 1107 |             { | 
| 1108 |                 const int wsLength = peekForColonColon(); | 
| 1109 |                 /* We cannot call handleWhitespace() because it returns on | 
| 1110 |                  * END_OF_FILE, and we have parsed up keyword, and we need to | 
| 1111 |                  * deal with that. | 
| 1112 |                  * | 
| 1113 |                  * If we have a colon colon, which means the whitespace is | 
| 1114 |                  * allowed, we skip it. */ | 
| 1115 |                 if (wsLength != -1) | 
| 1116 |                     m_pos += wsLength; | 
| 1117 |             } | 
| 1118 |  | 
| 1119 |             /* Handle name tests. */ | 
| 1120 |             if (peekCurrent() == ':') | 
| 1121 |             { | 
| 1122 |                 switch(peekAhead()) | 
| 1123 |                 { | 
| 1124 |                     case '=': | 
| 1125 |                         return id; | 
| 1126 |                     case '*': | 
| 1127 |                     { | 
| 1128 |                         m_pos += 2; | 
| 1129 |                         return tokenAndChangeState(code: T_ANY_LOCAL_NAME, value: id.value, s: Operator); | 
| 1130 |                     } | 
| 1131 |                     case ':': | 
| 1132 |                     { | 
| 1133 |                         /* We have an axis. */ | 
| 1134 |                         setState(Axis); | 
| 1135 |                         return keyword ? Token(keyword->token) : id; | 
| 1136 |                     } | 
| 1137 |                     default: | 
| 1138 |                     { | 
| 1139 |                         /* It's a QName. */ | 
| 1140 |                         ++m_pos; /* Consume the colon. */ | 
| 1141 |  | 
| 1142 |                         const Token id2(tokenizeNCName()); | 
| 1143 |  | 
| 1144 |                         if (id2.type != T_NCNAME) | 
| 1145 |                         { | 
| 1146 |                             --m_pos; | 
| 1147 |                             return id; | 
| 1148 |                         } | 
| 1149 |  | 
| 1150 |                         setState(Operator); | 
| 1151 |                         const int qNameLen = id.value.length() + id2.value.length() + 1; | 
| 1152 |                         return Token(T_QNAME, m_data.mid(position: m_pos - qNameLen, n: qNameLen)); | 
| 1153 |                     } | 
| 1154 |                 } | 
| 1155 |             } | 
| 1156 |  | 
| 1157 |             if (!keyword || isOperatorKeyword(code: keyword->token)) | 
| 1158 |             { | 
| 1159 |                 setState(Operator); | 
| 1160 |                 return id; | 
| 1161 |             } | 
| 1162 |  | 
| 1163 |             const TokenType ws = consumeWhitespace(); | 
| 1164 |             if (ws == T_ERROR) // TODO this should test for success. Write test. | 
| 1165 |                 return Token(T_ERROR); | 
| 1166 |  | 
| 1167 |             if (atEnd()) | 
| 1168 |             { | 
| 1169 |                 setState(Operator); | 
| 1170 |                 return id; | 
| 1171 |             } | 
| 1172 |  | 
| 1173 |             /* Let the if-body apply for constructors, and node type tests. */ | 
| 1174 |             if (isTypeToken(t: keyword->token) || | 
| 1175 |                keyword->token == T_TYPESWITCH || | 
| 1176 |                keyword->token == T_ORDERED || | 
| 1177 |                keyword->token == T_UNORDERED || | 
| 1178 |                keyword->token == T_IF) | 
| 1179 |             { | 
| 1180 |                 switch(peekCurrent()) | 
| 1181 |                 { | 
| 1182 |                     case '(': | 
| 1183 |                     { | 
| 1184 |                         // TODO See if we can remove DOCUMENT from isTypeToken. | 
| 1185 |                         if (isTypeToken(t: keyword->token) && keyword->token != T_DOCUMENT) | 
| 1186 |                         { | 
| 1187 |                             m_tokenStack.push(t: Token(T_LPAREN)); | 
| 1188 |                             ++m_pos; /* Consume '('. */ | 
| 1189 |                             pushState(s: Operator); | 
| 1190 |  | 
| 1191 |                             if (keyword->token == T_PROCESSING_INSTRUCTION) | 
| 1192 |                                 setState(KindTestForPI); | 
| 1193 |                             else | 
| 1194 |                                 setState(KindTest); | 
| 1195 |  | 
| 1196 |                             return Token(keyword->token); | 
| 1197 |                         } | 
| 1198 |                         else if (keyword->token == T_TYPESWITCH || keyword->token == T_IF) | 
| 1199 |                             return Token(keyword->token); | 
| 1200 |                         else /* It's a function call. */ | 
| 1201 |                             return id; | 
| 1202 |                     } | 
| 1203 |                     case '{': | 
| 1204 |                     { | 
| 1205 |                         m_tokenStack.push(t: Token(T_CURLY_LBRACE)); | 
| 1206 |                         ++m_pos; /* Consume '{'. */ | 
| 1207 |                         pushState(s: Operator); | 
| 1208 |                         /* Stay in state Default. */ | 
| 1209 |                         return Token(keyword->token); | 
| 1210 |                     } | 
| 1211 |                     default: | 
| 1212 |                     { | 
| 1213 |                         /* We have read in a token which is for instance | 
| 1214 |                          * "return", and now it can be an element | 
| 1215 |                          * test("element"), a node kind test("element()"), or a | 
| 1216 |                          * computed element constructor("element name {..."). | 
| 1217 |                          * We need to do a two-token lookahead here, because | 
| 1218 |                          * "element return" can be an element test followed by | 
| 1219 |                          * the return keyword, but it can also be an element | 
| 1220 |                          * constructor("element return {"). */ | 
| 1221 |                         if (isNCNameStart(ch: current())) | 
| 1222 |                         { | 
| 1223 |                             const int currentPos = m_pos; | 
| 1224 |                             const Token token2 = tokenizeNCNameOrQName(); | 
| 1225 |  | 
| 1226 |                             if (token2.hasError()) | 
| 1227 |                                 return token2; | 
| 1228 |  | 
| 1229 |                             handleWhitespace(); | 
| 1230 |  | 
| 1231 |                             if (peekCurrent() == '{') | 
| 1232 |                             { | 
| 1233 |                                 /* An element constructor. */ | 
| 1234 |                                 m_tokenStack.push(t: token2); | 
| 1235 |                                 return Token(keyword->token); | 
| 1236 |                             } | 
| 1237 |  | 
| 1238 |                             /* We jump back in the stream, we need to tokenize token2 according | 
| 1239 |                              * to the state. */ | 
| 1240 |                             m_pos = currentPos; | 
| 1241 |                             setState(Operator); | 
| 1242 |                             return Token(T_NCNAME, QLatin1String(keyword->name)); | 
| 1243 |                         } | 
| 1244 |                     } | 
| 1245 |                 } | 
| 1246 |             } | 
| 1247 |  | 
| 1248 |             if (peekCurrent() == '$') | 
| 1249 |             { | 
| 1250 |                 setState(VarName); | 
| 1251 |                 return Token(keyword->token); | 
| 1252 |             } | 
| 1253 |  | 
| 1254 |             /* It's not a node type, it's not the typeswitch expression, but it is a function callsite. */ | 
| 1255 |             if (peekCurrent() == '(') | 
| 1256 |                 return id; | 
| 1257 |             else if (peekCurrent() == '{' && keyword->token == T_VALIDATE) | 
| 1258 |                 return Token(keyword->token); | 
| 1259 |  | 
| 1260 |             if (!isNCNameStart(ch: current())) | 
| 1261 |             { | 
| 1262 |                 setState(Operator); | 
| 1263 |                 return id; | 
| 1264 |             } | 
| 1265 |  | 
| 1266 |             const Token id2(tokenizeNCName()); | 
| 1267 |             const TokenMap *const keyword2 = lookupKeyword(keyword: id2.value); | 
| 1268 |  | 
| 1269 |             if (!keyword2) | 
| 1270 |             { | 
| 1271 |                 /* It's a syntax error. All cases of two subsequent ncnames are keywords (e.g., declarations). */ | 
| 1272 |                 setState(Operator); | 
| 1273 |                 return id; | 
| 1274 |             } | 
| 1275 |  | 
| 1276 |             switch(keyword->token) | 
| 1277 |             { | 
| 1278 |                 case T_DECLARE: | 
| 1279 |                 { | 
| 1280 |                     switch(keyword2->token) | 
| 1281 |                     { | 
| 1282 |                         case T_VARIABLE: | 
| 1283 |                         case T_FUNCTION: | 
| 1284 |                         { | 
| 1285 |                             m_tokenStack.push(t: Token(keyword2->token)); | 
| 1286 |                             setState(Default); | 
| 1287 |                             return Token(keyword->token); | 
| 1288 |                         } | 
| 1289 |                         case T_OPTION: | 
| 1290 |                         { | 
| 1291 |                             m_tokenStack.push(t: Token(keyword2->token)); | 
| 1292 |                             setState(Default); | 
| 1293 |                             return Token(keyword->token); | 
| 1294 |                         } | 
| 1295 |                         case T_COPY_NAMESPACES: | 
| 1296 |                         case T_ORDERING: | 
| 1297 |                         { | 
| 1298 |                             m_tokenStack.push(t: Token(keyword2->token)); | 
| 1299 |                             setState(NamespaceKeyword); | 
| 1300 |                             return Token(keyword->token); | 
| 1301 |                         } | 
| 1302 |                         case T_CONSTRUCTION: | 
| 1303 |                         { | 
| 1304 |                             // TODO identical to CONSTRUCTION? | 
| 1305 |                             m_tokenStack.push(t: Token(keyword2->token)); | 
| 1306 |                             setState(Operator); | 
| 1307 |                             return Token(keyword->token); | 
| 1308 |                         } | 
| 1309 |                         case T_NAMESPACE: | 
| 1310 |                         case T_BASEURI: | 
| 1311 |                         { | 
| 1312 |                             m_tokenStack.push(t: Token(keyword2->token)); | 
| 1313 |                             setState(NamespaceDecl); | 
| 1314 |                             return Token(keyword->token); | 
| 1315 |                         } | 
| 1316 |                         case T_BOUNDARY_SPACE: | 
| 1317 |                         { | 
| 1318 |                             m_tokenStack.push(t: Token(keyword2->token)); | 
| 1319 |                             setState(XMLSpaceDecl); | 
| 1320 |                             return Token(keyword->token); | 
| 1321 |                         } | 
| 1322 |                         case T_DEFAULT: | 
| 1323 |                         { | 
| 1324 |                             m_tokenStack.push(t: Token(keyword2->token)); | 
| 1325 |  | 
| 1326 |                             const TokenType ws2 = consumeWhitespace(); | 
| 1327 |                             if (ws2 != T_SUCCESS) | 
| 1328 |                             { | 
| 1329 |                                 m_tokenStack.prepend(t: Token(ws2)); | 
| 1330 |                                 return Token(keyword->token); | 
| 1331 |                             } | 
| 1332 |  | 
| 1333 |                             const Token id3(tokenizeNCName()); | 
| 1334 |  | 
| 1335 |                             if (id3.type != T_NCNAME) | 
| 1336 |                             { | 
| 1337 |                                 m_tokenStack.prepend(t: id3); | 
| 1338 |                                 return Token(keyword->token); | 
| 1339 |                             } | 
| 1340 |  | 
| 1341 |                             const TokenMap *const keyword3 = lookupKeyword(keyword: id3.value); | 
| 1342 |                             if (!keyword3) | 
| 1343 |                             { | 
| 1344 |                                 m_tokenStack.prepend(t: id3); | 
| 1345 |                                 return Token(keyword->token); | 
| 1346 |                             } | 
| 1347 |                             else | 
| 1348 |                             { | 
| 1349 |                                 m_tokenStack.prepend(t: Token(keyword3->token)); | 
| 1350 |  | 
| 1351 |                                 if (keyword3->token == T_ORDER) | 
| 1352 |                                     setState(Operator); | 
| 1353 |                                 else | 
| 1354 |                                     setState(NamespaceDecl); | 
| 1355 |                             } | 
| 1356 |  | 
| 1357 |                             return Token(keyword->token); | 
| 1358 |                         } | 
| 1359 |                         default: | 
| 1360 |                         { | 
| 1361 |                             m_tokenStack.push(t: Token(keyword2->token)); | 
| 1362 |                             setState(Default); | 
| 1363 |                             return id; | 
| 1364 |                         } | 
| 1365 |                     } | 
| 1366 |                 } | 
| 1367 |                 case T_XQUERY: | 
| 1368 |                 { | 
| 1369 |                     m_tokenStack.push(t: Token(keyword2->token)); | 
| 1370 |  | 
| 1371 |                     if (keyword2->token == T_VERSION) | 
| 1372 |                     { | 
| 1373 |                         setState(NamespaceDecl); | 
| 1374 |                         return Token(keyword->token); | 
| 1375 |                     } | 
| 1376 |                     else | 
| 1377 |                     { | 
| 1378 |                         setState(Operator); | 
| 1379 |                         return id; | 
| 1380 |                     } | 
| 1381 |                 } | 
| 1382 |                 case T_IMPORT: | 
| 1383 |                 { | 
| 1384 |                     m_tokenStack.push(t: Token(keyword2->token)); | 
| 1385 |  | 
| 1386 |                     switch(keyword2->token) | 
| 1387 |                     { | 
| 1388 |                         case T_SCHEMA: | 
| 1389 |                         case T_MODULE: | 
| 1390 |                         { | 
| 1391 |                             setState(NamespaceKeyword); | 
| 1392 |                             return Token(keyword->token); | 
| 1393 |                         } | 
| 1394 |                         default: | 
| 1395 |                         { | 
| 1396 |                             setState(Operator); | 
| 1397 |                             return id; | 
| 1398 |                         } | 
| 1399 |                     } | 
| 1400 |                 } | 
| 1401 |                 case T_VALIDATE: | 
| 1402 |                 { | 
| 1403 |                     m_tokenStack.push(t: Token(keyword2->token)); | 
| 1404 |  | 
| 1405 |                     switch(keyword2->token) | 
| 1406 |                     { | 
| 1407 |                         case T_LAX: | 
| 1408 |                         case T_STRICT: | 
| 1409 |                         { | 
| 1410 |                             pushState(s: Operator); | 
| 1411 |                             return Token(keyword->token); | 
| 1412 |                         } | 
| 1413 |                         default: | 
| 1414 |                         { | 
| 1415 |                             setState(Operator); | 
| 1416 |                             return id; | 
| 1417 |                         } | 
| 1418 |                     } | 
| 1419 |                 } | 
| 1420 |                 default: | 
| 1421 |                 { | 
| 1422 |                     m_tokenStack.push(t: Token(keyword2->token)); | 
| 1423 |                     setState(Operator); | 
| 1424 |                     return id; | 
| 1425 |                 } | 
| 1426 |             } | 
| 1427 |         } | 
| 1428 |         case VarName: | 
| 1429 |         { | 
| 1430 |             if (peekCurrent() == '$') | 
| 1431 |                 return tokenAndAdvance(code: T_DOLLAR); | 
| 1432 |  | 
| 1433 |             setState(Operator); | 
| 1434 |             return tokenizeNCNameOrQName(); | 
| 1435 |         } | 
| 1436 |         case ItemType: | 
| 1437 |         { | 
| 1438 |             switch(peekCurrent()) | 
| 1439 |             { | 
| 1440 |                 case '(': | 
| 1441 |                     return tokenAndChangeState(code: T_LPAREN, s: KindTest); | 
| 1442 |                 case '$': | 
| 1443 |                     return tokenAndChangeState(code: T_DOLLAR, s: VarName); | 
| 1444 |             } | 
| 1445 |  | 
| 1446 |             const Token name(tokenizeNCNameOrQName()); | 
| 1447 |  | 
| 1448 |             if (name.hasError()) | 
| 1449 |                 return error(); | 
| 1450 |  | 
| 1451 |             else if (name.type == T_QNAME) | 
| 1452 |             { | 
| 1453 |                 setState(OccurrenceIndicator); | 
| 1454 |                 return name; | 
| 1455 |             } | 
| 1456 |             else | 
| 1457 |             { | 
| 1458 |                 const TokenMap *const keyword = lookupKeyword(keyword: name.value); | 
| 1459 |  | 
| 1460 |                 if (keyword) | 
| 1461 |                 { | 
| 1462 |                     pushState(s: OccurrenceIndicator); | 
| 1463 |                     return Token(keyword->token); | 
| 1464 |                 } | 
| 1465 |                 else | 
| 1466 |                 { | 
| 1467 |                     setState(Default); | 
| 1468 |                     return name; | 
| 1469 |                 } | 
| 1470 |             } | 
| 1471 |         } | 
| 1472 |         case KindTest: | 
| 1473 |         { | 
| 1474 |             switch(peekCurrent()) | 
| 1475 |             { | 
| 1476 |                 case ')': | 
| 1477 |                 { | 
| 1478 |                     popState(); | 
| 1479 |                     return tokenAndAdvance(code: T_RPAREN); | 
| 1480 |                 } | 
| 1481 |                 case '(': | 
| 1482 |                     return tokenAndAdvance(code: T_LPAREN); | 
| 1483 |                 case ',': | 
| 1484 |                     return tokenAndAdvance(code: T_COMMA); | 
| 1485 |                 case '*': | 
| 1486 |                     return tokenAndAdvance(code: T_STAR); | 
| 1487 |                 case '?': | 
| 1488 |                     return tokenAndAdvance(code: T_QUESTION); | 
| 1489 |                 case '\'': | 
| 1490 |                 case '"': | 
| 1491 |                     return tokenizeStringLiteral(); | 
| 1492 |             } | 
| 1493 |  | 
| 1494 |             const Token nc(tokenizeNCNameOrQName()); | 
| 1495 |             if (nc.hasError()) | 
| 1496 |                 return nc; | 
| 1497 |  | 
| 1498 |             const TokenType ws = consumeWhitespace(); | 
| 1499 |             if (ws == T_ERROR) | 
| 1500 |                 return error(); | 
| 1501 |  | 
| 1502 |             if (peekCurrent() == '(') | 
| 1503 |             { | 
| 1504 |                 const TokenMap *const keyword = lookupKeyword(keyword: nc.value); | 
| 1505 |                 if (keyword) | 
| 1506 |                 { | 
| 1507 |                     pushState(s: KindTest); | 
| 1508 |                     return Token(keyword->token); | 
| 1509 |                 } | 
| 1510 |                 else | 
| 1511 |                     return nc; | 
| 1512 |             } | 
| 1513 |             else | 
| 1514 |                 return nc; | 
| 1515 |         } | 
| 1516 |         case KindTestForPI: | 
| 1517 |         { | 
| 1518 |             switch(peekCurrent()) | 
| 1519 |             { | 
| 1520 |                 case ')': | 
| 1521 |                 { | 
| 1522 |                     popState(); | 
| 1523 |                     return tokenAndAdvance(code: T_RPAREN); | 
| 1524 |                 } | 
| 1525 |                 case '\'': | 
| 1526 |                 case '"': | 
| 1527 |                     return tokenizeStringLiteral(); | 
| 1528 |                 default: | 
| 1529 |                     return tokenizeNCName(); | 
| 1530 |             } | 
| 1531 |         } | 
| 1532 |         case OccurrenceIndicator: | 
| 1533 |         { | 
| 1534 |             switch(peekCurrent()) | 
| 1535 |             { | 
| 1536 |                 case '?': | 
| 1537 |                     return tokenAndChangeState(code: T_QUESTION, s: Operator); | 
| 1538 |                 case '*': | 
| 1539 |                     return tokenAndChangeState(code: T_STAR, s: Operator); | 
| 1540 |                 case '+': | 
| 1541 |                     return tokenAndChangeState(code: T_PLUS, s: Operator); | 
| 1542 |                 default: | 
| 1543 |                 { | 
| 1544 |                     setState(Operator); | 
| 1545 |                     return nextToken(); | 
| 1546 |                 } | 
| 1547 |             } | 
| 1548 |         } | 
| 1549 |         case XQueryVersion: | 
| 1550 |         { | 
| 1551 |             switch(peekCurrent()) | 
| 1552 |             { | 
| 1553 |                 case '\'': | 
| 1554 |                 case '"': | 
| 1555 |                     return tokenizeStringLiteral(); | 
| 1556 |                 case ';': | 
| 1557 |                     return tokenAndChangeState(code: T_SEMI_COLON, s: Default); | 
| 1558 |             } | 
| 1559 |  | 
| 1560 |             const Token id(tokenizeNCName()); | 
| 1561 |  | 
| 1562 |             if (id.type != T_NCNAME) | 
| 1563 |                 return id; | 
| 1564 |  | 
| 1565 |             const TokenMap *const keyword = lookupKeyword(keyword: id.value); | 
| 1566 |             if (keyword) | 
| 1567 |                 return tokenAndChangeState(code: keyword->token, s: Default); | 
| 1568 |             else | 
| 1569 |                 return id; | 
| 1570 |         } | 
| 1571 |         case StartTag: | 
| 1572 |         { | 
| 1573 |             if (peekAhead(length: -1) == '<') | 
| 1574 |             { | 
| 1575 |                 if (current().isSpace()) | 
| 1576 |                     return Token(T_ERROR); | 
| 1577 |             } | 
| 1578 |             else | 
| 1579 |             { | 
| 1580 |                 if (consumeRawWhitespace()) | 
| 1581 |                     return Token(T_END_OF_FILE); | 
| 1582 |             } | 
| 1583 |  | 
| 1584 |             switch(peekCurrent()) | 
| 1585 |             { | 
| 1586 |                 case '/': | 
| 1587 |                 { | 
| 1588 |                     if (peekAhead() == '>') | 
| 1589 |                     { | 
| 1590 |                         m_pos += 2; | 
| 1591 |  | 
| 1592 |                         if (m_scanOnly) | 
| 1593 |                             return Token(T_POSITION_SET); | 
| 1594 |                         else | 
| 1595 |                         { | 
| 1596 |                             popState(); | 
| 1597 |                             return Token(T_QUICK_TAG_END); | 
| 1598 |                         } | 
| 1599 |                     } | 
| 1600 |                     else | 
| 1601 |                         return error(); | 
| 1602 |                 } | 
| 1603 |                 case '>': | 
| 1604 |                 { | 
| 1605 |                     if (m_scanOnly) | 
| 1606 |                         return tokenAndChangeState(code: T_POSITION_SET, s: StartTag); | 
| 1607 |                     else | 
| 1608 |                         return tokenAndChangeState(code: T_G_GT, s: ElementContent); | 
| 1609 |                 } | 
| 1610 |                 case '=': | 
| 1611 |                     return tokenAndAdvance(code: T_G_EQ); | 
| 1612 |                 case '\'': | 
| 1613 |                     return tokenAndChangeState(code: T_APOS, s: AposAttributeContent); | 
| 1614 |                 case '"': | 
| 1615 |                     return tokenAndChangeState(code: T_QUOTE, s: QuotAttributeContent); | 
| 1616 |                 default: | 
| 1617 |                     return tokenizeNCNameOrQName(); | 
| 1618 |             } | 
| 1619 |         } | 
| 1620 |         case AposAttributeContent: | 
| 1621 |         case QuotAttributeContent: | 
| 1622 |         { | 
| 1623 |             const QChar sep(state() == AposAttributeContent ? QLatin1Char('\'') : QLatin1Char('"')); | 
| 1624 |             QString result; | 
| 1625 |             result.reserve(asize: 20); | 
| 1626 |  | 
| 1627 |             if (m_scanOnly) | 
| 1628 |             { | 
| 1629 |                 int stack = 0; | 
| 1630 |                 return attributeAsRaw(separator: sep, stack, startPos: m_pos, inLiteral: true, result); | 
| 1631 |             } | 
| 1632 |  | 
| 1633 |             Q_ASSERT(!m_scanOnly); | 
| 1634 |             while(true) | 
| 1635 |             { | 
| 1636 |                 if (atEnd()) | 
| 1637 |                 { | 
| 1638 |                     /* In the case that the XSL-T tokenizer invokes us with | 
| 1639 |                      * default state QuotAttributeContent, we need to be able | 
| 1640 |                      * to return a single string, in case that is all we have | 
| 1641 |                      * accumulated. */ | 
| 1642 |                     if (result.isEmpty()) | 
| 1643 |                         return Token(T_END_OF_FILE); | 
| 1644 |                     else | 
| 1645 |                         return Token(T_STRING_LITERAL, result); | 
| 1646 |                 } | 
| 1647 |  | 
| 1648 |                 const QChar curr(current()); | 
| 1649 |  | 
| 1650 |                 if (curr == sep) | 
| 1651 |                 { | 
| 1652 |                     if (m_pos + 1 == m_length) | 
| 1653 |                         return Token(T_END_OF_FILE); | 
| 1654 |  | 
| 1655 |                     if (m_data.at(i: m_pos + 1) == sep) | 
| 1656 |                     { | 
| 1657 |                         /* The quoting mechanism was used. */ | 
| 1658 |                         m_pos += 2; | 
| 1659 |                         result.append(c: sep); | 
| 1660 |                         continue; | 
| 1661 |                     } | 
| 1662 |  | 
| 1663 |                     const QChar next(m_data.at(i: m_pos + 1)); | 
| 1664 |                     if (!next.isSpace() && next != QLatin1Char('/') && next != QLatin1Char('>')) | 
| 1665 |                         return Token(T_ERROR); // i18n Space must separate attributes | 
| 1666 |  | 
| 1667 |                     if (result.isEmpty()) | 
| 1668 |                     { | 
| 1669 |                         return tokenAndChangeState(code: state() == AposAttributeContent ? T_APOS : T_QUOTE, | 
| 1670 |                                                    s: StartTag, advance: 1); | 
| 1671 |                     } | 
| 1672 |  | 
| 1673 |                     /* Don't consume the sep, but leave it so we next time return a token for it. */ | 
| 1674 |                     return Token(T_STRING_LITERAL, result); | 
| 1675 |                 } | 
| 1676 |                 else if (curr == QLatin1Char('{')) | 
| 1677 |                 { | 
| 1678 |                     if (m_pos + 1 == m_length) | 
| 1679 |                         return Token(T_END_OF_FILE); | 
| 1680 |                     else if (peekAhead() == '{') | 
| 1681 |                     { | 
| 1682 |                         ++m_pos; | 
| 1683 |                         result.append(c: QLatin1Char('{')); | 
| 1684 |                     } | 
| 1685 |                     else | 
| 1686 |                     { | 
| 1687 |                         if (result.isEmpty()) | 
| 1688 |                         { | 
| 1689 |                             /* The Attribute Value Template appeared directly in the attribute. */ | 
| 1690 |                             pushState(); | 
| 1691 |                             return tokenAndChangeState(code: T_CURLY_LBRACE, s: Default); | 
| 1692 |                         } | 
| 1693 |                         else | 
| 1694 |                         { | 
| 1695 |                             /* We don't advance, keep '{' as next token. */ | 
| 1696 |                             return Token(T_STRING_LITERAL, result); | 
| 1697 |                         } | 
| 1698 |                     } | 
| 1699 |                 } | 
| 1700 |                 else if (curr == QLatin1Char('}')) | 
| 1701 |                 { | 
| 1702 |                     if (m_pos + 1 == m_length) | 
| 1703 |                         return Token(T_END_OF_FILE); | 
| 1704 |                     else if (peekAhead() == '}') | 
| 1705 |                     { | 
| 1706 |                         ++m_pos; | 
| 1707 |                         result.append(c: QLatin1Char('}')); | 
| 1708 |                     } | 
| 1709 |                     else | 
| 1710 |                         return Token(T_ERROR); | 
| 1711 |                 } | 
| 1712 |                 else if (curr == QLatin1Char('&')) | 
| 1713 |                 { | 
| 1714 |                     const QString ret(tokenizeCharacterReference()); | 
| 1715 |                     if (ret.isNull()) | 
| 1716 |                         return Token(T_ERROR); | 
| 1717 |                     else | 
| 1718 |                         result.append(s: ret); | 
| 1719 |                 } | 
| 1720 |                 else if (curr == QLatin1Char('<')) | 
| 1721 |                     return Token(T_STRING_LITERAL, result); | 
| 1722 |                 else | 
| 1723 |                 { | 
| 1724 |                     /* See Extensible Markup Language (XML) 1.0 (Fourth Edition), | 
| 1725 |                      * 3.3.3 Attribute-Value Normalization. | 
| 1726 |                      * | 
| 1727 |                      * However, it is complicated a bit by that AVN is defined on top of | 
| 1728 |                      * EOL normalization and we do those two in one go here. */ | 
| 1729 |                     switch(curr.unicode()) | 
| 1730 |                     { | 
| 1731 |                         case 0xD: | 
| 1732 |                         { | 
| 1733 |                             if (peekAhead() == '\n') | 
| 1734 |                             { | 
| 1735 |                                 result.append(c: QLatin1Char(' ')); | 
| 1736 |                                 ++m_pos; | 
| 1737 |                                 break; | 
| 1738 |                             } | 
| 1739 |                             Q_FALLTHROUGH(); | 
| 1740 |                         } | 
| 1741 |                         case 0xA: | 
| 1742 |                         case 0x9: | 
| 1743 |                         { | 
| 1744 |                             result.append(c: QLatin1Char(' ')); | 
| 1745 |                             break; | 
| 1746 |                         } | 
| 1747 |                         default: | 
| 1748 |                             result.append(c: curr); | 
| 1749 |                     } | 
| 1750 |                 } | 
| 1751 |  | 
| 1752 |                 ++m_pos; | 
| 1753 |             } | 
| 1754 |         } | 
| 1755 |         case ElementContent: | 
| 1756 |         { | 
| 1757 |             QString result; | 
| 1758 |             result.reserve(asize: 20); | 
| 1759 |  | 
| 1760 |             /* Whether the text node, result, may be whitespace only. Character references | 
| 1761 |              * and CDATA sections disables that. */ | 
| 1762 |             bool mayBeWS = true; | 
| 1763 |  | 
| 1764 |             CharacterSkips skipEOLNormalization; | 
| 1765 |  | 
| 1766 |             while(true) | 
| 1767 |             { | 
| 1768 |                 if (atEnd()) | 
| 1769 |                     return Token(T_END_OF_FILE); | 
| 1770 |  | 
| 1771 |                 switch(peekCurrent()) | 
| 1772 |                 { | 
| 1773 |                     case '<': | 
| 1774 |                     { | 
| 1775 |                         if (!result.isEmpty() && peekAhead(length: 2) != '[') | 
| 1776 |                         { | 
| 1777 |                             /* We encountered the end, and it was not a CDATA section. */ | 
| 1778 |                             /* We don't advance. Next time we'll handle the <... stuff. */ | 
| 1779 |                             return Token(mayBeWS ? T_STRING_LITERAL : T_NON_BOUNDARY_WS, normalizeEOL(input: result, characterSkips: skipEOLNormalization)); | 
| 1780 |                         } | 
| 1781 |  | 
| 1782 |                         ++m_pos; | 
| 1783 |                         if (atEnd()) | 
| 1784 |                             return Token(T_END_OF_FILE); | 
| 1785 |  | 
| 1786 |                         const QChar ahead(current()); | 
| 1787 |                         if (ahead.isSpace()) | 
| 1788 |                             return error(); | 
| 1789 |                         else if (ahead == QLatin1Char('/')) | 
| 1790 |                         { | 
| 1791 |                             if (m_pos + 1 == m_length) | 
| 1792 |                                 return Token(T_END_OF_FILE); | 
| 1793 |                             else if (m_data.at(i: m_pos + 1).isSpace()) | 
| 1794 |                                 return error(); | 
| 1795 |                             else | 
| 1796 |                                 return tokenAndChangeState(code: T_BEGIN_END_TAG, s: EndTag); | 
| 1797 |                         } | 
| 1798 |                         else if (isNCNameStart(ch: ahead)) | 
| 1799 |                         { | 
| 1800 |                             pushState(); | 
| 1801 |                             return tokenAndChangeState(code: T_G_LT, s: StartTag, advance: 0); | 
| 1802 |                         } | 
| 1803 |                         else if (aheadEquals(chs: "!--" , len: 3, offset: 0)) | 
| 1804 |                         { | 
| 1805 |                             pushState(); | 
| 1806 |                             m_pos += 3; | 
| 1807 |                             return tokenAndChangeState(code: T_COMMENT_START, s: XMLComment, advance: 0); | 
| 1808 |                         } | 
| 1809 |                         else if (aheadEquals(chs: "![CDATA[" , len: 8, offset: 0)) | 
| 1810 |                         { | 
| 1811 |                             mayBeWS = false; | 
| 1812 |                             m_pos += 8; | 
| 1813 |                             const int start = m_pos; | 
| 1814 |                             const int len = scanUntil(content: "]]>" ); | 
| 1815 |  | 
| 1816 |                             if (len == -1) | 
| 1817 |                                 return Token(T_END_OF_FILE); | 
| 1818 |  | 
| 1819 |                             m_pos += 2; /* Consume "]]>". Note that m_pos is on '!'. */ | 
| 1820 |                             result.append(s: m_data.mid(position: start, n: len)); | 
| 1821 |                             break; | 
| 1822 |                         } | 
| 1823 |                         else if (ahead == QLatin1Char('?')) | 
| 1824 |                         { | 
| 1825 |                             pushState(); | 
| 1826 |                             return tokenAndChangeState(code: T_PI_START, s: ProcessingInstructionName); | 
| 1827 |                         } | 
| 1828 |                         else | 
| 1829 |                             return Token(T_G_LT); | 
| 1830 |                     } | 
| 1831 |                     case '&': | 
| 1832 |                     { | 
| 1833 |                         const QString ret(tokenizeCharacterReference()); | 
| 1834 |                         if (ret.isNull()) | 
| 1835 |                             return Token(T_ERROR); | 
| 1836 |                         else | 
| 1837 |                         { | 
| 1838 |                             skipEOLNormalization.insert(value: result.count()); | 
| 1839 |                             result.append(s: ret); | 
| 1840 |                             mayBeWS = false; | 
| 1841 |                             break; | 
| 1842 |                         } | 
| 1843 |                     } | 
| 1844 |                     case '{': | 
| 1845 |                     { | 
| 1846 |                         // TODO remove this check, also below. | 
| 1847 |                         if (m_pos + 1 == m_length) | 
| 1848 |                             return Token(T_END_OF_FILE); | 
| 1849 |                         else if (peekAhead() == '{') | 
| 1850 |                         { | 
| 1851 |                             ++m_pos; | 
| 1852 |                             result.append(c: QLatin1Char('{')); | 
| 1853 |                         } | 
| 1854 |                         else | 
| 1855 |                         { | 
| 1856 |                             if (result.isEmpty()) | 
| 1857 |                             { | 
| 1858 |                                 pushState(); | 
| 1859 |                                 return tokenAndChangeState(code: T_CURLY_LBRACE, s: Default); | 
| 1860 |                             } | 
| 1861 |                             else | 
| 1862 |                             { | 
| 1863 |                                 /* We don't advance here. */ | 
| 1864 |                                 return Token(mayBeWS ? T_STRING_LITERAL : T_NON_BOUNDARY_WS, normalizeEOL(input: result, characterSkips: skipEOLNormalization)); | 
| 1865 |                             } | 
| 1866 |                         } | 
| 1867 |                         break; | 
| 1868 |                     } | 
| 1869 |                     case '}': | 
| 1870 |                     { | 
| 1871 |                         if (m_pos + 1 == m_length) | 
| 1872 |                             return Token(T_END_OF_FILE); | 
| 1873 |                         else if (peekAhead() == '}') | 
| 1874 |                         { | 
| 1875 |                             ++m_pos; | 
| 1876 |                             result.append(c: QLatin1Char('}')); | 
| 1877 |                         } | 
| 1878 |                         else | 
| 1879 |                         { | 
| 1880 |                             /* This is a parse error, and the grammar won't be able | 
| 1881 |                              * to reduce this CURLY_RBRACE. */ | 
| 1882 |                             return tokenAndChangeState(code: T_CURLY_RBRACE, s: Default); | 
| 1883 |                         } | 
| 1884 |                         break; | 
| 1885 |                     } | 
| 1886 |                     case '\n': | 
| 1887 |                     { | 
| 1888 |                         /* We want to translate \r\n into \n. */ | 
| 1889 |                         if (peekAhead(length: -1) == '\r') | 
| 1890 |                             break; | 
| 1891 |                         Q_FALLTHROUGH(); | 
| 1892 |                     } | 
| 1893 |                     case '\r': | 
| 1894 |                     { | 
| 1895 |                         result.append(c: QLatin1Char('\n')); | 
| 1896 |                         break; | 
| 1897 |                     } | 
| 1898 |                     default: | 
| 1899 |                     { | 
| 1900 |                         result.append(c: current()); | 
| 1901 |                         break; | 
| 1902 |                     } | 
| 1903 |                 } | 
| 1904 |                 ++m_pos; | 
| 1905 |             } | 
| 1906 |         } | 
| 1907 |         case ProcessingInstructionName: | 
| 1908 |         { | 
| 1909 |             const int start = m_pos; | 
| 1910 |  | 
| 1911 |             while(true) | 
| 1912 |             { | 
| 1913 |                 ++m_pos; | 
| 1914 |                 if (m_pos >= m_length) | 
| 1915 |                     return Token(T_END_OF_FILE); | 
| 1916 |  | 
| 1917 |                 const QChar next(current()); | 
| 1918 |                 if (next.isSpace() || next == QLatin1Char('?')) | 
| 1919 |                 { | 
| 1920 |                     return tokenAndChangeState(code: T_PI_TARGET, value: m_data.mid(position: start, n: m_pos - start), | 
| 1921 |                                                s: ProcessingInstructionContent); | 
| 1922 |                 } | 
| 1923 |             } | 
| 1924 |         } | 
| 1925 |         case ProcessingInstructionContent: | 
| 1926 |         { | 
| 1927 |             /* Consume whitespace between the name and the content. */ | 
| 1928 |             if (consumeRawWhitespace()) | 
| 1929 |                 return Token(T_END_OF_FILE); | 
| 1930 |  | 
| 1931 |             const int start = m_pos; | 
| 1932 |             const int len = scanUntil(content: "?>" ); | 
| 1933 |  | 
| 1934 |             if (len == -1) | 
| 1935 |                 return Token(T_END_OF_FILE); | 
| 1936 |             else | 
| 1937 |             { | 
| 1938 |                 m_pos += 2; /* Consume "?>" */ | 
| 1939 |                 popState(); | 
| 1940 |                 return Token(T_PI_CONTENT, normalizeEOL(input: m_data.mid(position: start, n: len), characterSkips: CharacterSkips())); | 
| 1941 |             } | 
| 1942 |         } | 
| 1943 |         case EndTag: | 
| 1944 |         { | 
| 1945 |             if (consumeRawWhitespace()) | 
| 1946 |                 return T_END_OF_FILE; | 
| 1947 |  | 
| 1948 |             if (peekCurrent() == '>') | 
| 1949 |             { | 
| 1950 |                 popState(); | 
| 1951 |                 return tokenAndAdvance(code: T_G_GT); | 
| 1952 |             } | 
| 1953 |             else | 
| 1954 |                 return tokenizeNCNameOrQName(); | 
| 1955 |         } | 
| 1956 |         case XMLComment: | 
| 1957 |         { | 
| 1958 |             const int start = m_pos; | 
| 1959 |             const int len = scanUntil(content: "--" ); | 
| 1960 |  | 
| 1961 |             if (len == -1) | 
| 1962 |                 return T_END_OF_FILE; | 
| 1963 |             else | 
| 1964 |             { | 
| 1965 |                 m_pos += 2; /* Consume "--". */ | 
| 1966 |                 popState(); | 
| 1967 |  | 
| 1968 |                 if (peekCurrent() == '>') | 
| 1969 |                 { | 
| 1970 |                     ++m_pos; | 
| 1971 |                     return Token(T_COMMENT_CONTENT, normalizeEOL(input: m_data.mid(position: start, n: len), characterSkips: CharacterSkips())); | 
| 1972 |                 } | 
| 1973 |                 else | 
| 1974 |                     return error(); | 
| 1975 |             } | 
| 1976 |         } | 
| 1977 |         case Pragma: | 
| 1978 |         { | 
| 1979 |             /* Consume whitespace. */ | 
| 1980 |             if (consumeRawWhitespace()) | 
| 1981 |                 return Token(T_END_OF_FILE); | 
| 1982 |  | 
| 1983 |             setState(PragmaContent); | 
| 1984 |             return tokenizeNCNameOrQName(); | 
| 1985 |         } | 
| 1986 |         case PragmaContent: | 
| 1987 |         { | 
| 1988 |             QString result; | 
| 1989 |             result.reserve(asize: 20); | 
| 1990 |  | 
| 1991 |             const bool hasWS = m_pos < m_length && current().isSpace(); | 
| 1992 |  | 
| 1993 |             /* Consume all whitespace up to the pragma content(if any). */ | 
| 1994 |             if (consumeRawWhitespace()) | 
| 1995 |                 return Token(T_END_OF_FILE); | 
| 1996 |  | 
| 1997 |             if (peekCurrent() == '#' && peekAhead() == ')') | 
| 1998 |             { | 
| 1999 |                 /* We reached the end, and there's no pragma content. */ | 
| 2000 |                 return tokenAndChangeState(code: T_PRAGMA_END, s: Default, advance: 2); | 
| 2001 |             } | 
| 2002 |             else if (!hasWS) | 
| 2003 |             { | 
| 2004 |                 /* A separating space is required if there's pragma content. */ | 
| 2005 |                 return error(); /* i18n */ | 
| 2006 |             } | 
| 2007 |  | 
| 2008 |             const int start = m_pos; | 
| 2009 |             const int len = scanUntil(content: "#)" ); | 
| 2010 |             if (len == -1) | 
| 2011 |                 return Token(T_END_OF_FILE); | 
| 2012 |  | 
| 2013 |             return Token(T_STRING_LITERAL, m_data.mid(position: start, n: len)); | 
| 2014 |             Q_ASSERT(false); | 
| 2015 |         } | 
| 2016 |     } | 
| 2017 |  | 
| 2018 |     Q_ASSERT(false); | 
| 2019 |     return error(); | 
| 2020 | } | 
| 2021 |  | 
| 2022 | Tokenizer::Token XQueryTokenizer::attributeAsRaw(const QChar sep, | 
| 2023 |                                                  int &sepStack, | 
| 2024 |                                                  const int startPos, | 
| 2025 |                                                  const bool aInLiteral, | 
| 2026 |                                                  QString &result) | 
| 2027 | { | 
| 2028 |     bool inLiteral = aInLiteral; | 
| 2029 |     const char otherSep = (sep == QLatin1Char('"') ? '\'' : '"'); | 
| 2030 |  | 
| 2031 |     while(true) | 
| 2032 |     { | 
| 2033 |         if (atEnd()) | 
| 2034 |             return T_END_OF_FILE; | 
| 2035 |  | 
| 2036 |         if (peekCurrent() == sep.unicode()) | 
| 2037 |         { | 
| 2038 |             if (inLiteral) | 
| 2039 |                 inLiteral = false; | 
| 2040 |             else | 
| 2041 |                 inLiteral = true; | 
| 2042 |  | 
| 2043 |             if (peekAhead() == sep.unicode()) | 
| 2044 |             { | 
| 2045 |                 /* The quoting mechanism was used. */ | 
| 2046 |                 result.append(c: current()); | 
| 2047 |                 m_pos += 2; | 
| 2048 |                 continue; | 
| 2049 |             } | 
| 2050 |             else | 
| 2051 |             { | 
| 2052 |                 /* Don't consume the separator, such that we | 
| 2053 |                  * return a token for it next time. */ | 
| 2054 |                 if (m_pos == startPos) | 
| 2055 |                 { | 
| 2056 |                     ++m_pos; | 
| 2057 |                     setState(StartTag); | 
| 2058 |                     return Token(sep == QLatin1Char('"') ? T_QUOTE : T_APOS); | 
| 2059 |                 } | 
| 2060 |  | 
| 2061 |  | 
| 2062 |                 if (sepStack == 0) | 
| 2063 |                 { | 
| 2064 |                     return Token(T_STRING_LITERAL, result); | 
| 2065 |                 } | 
| 2066 |                 else | 
| 2067 |                 { | 
| 2068 |                     result.append(c: current()); | 
| 2069 |                     ++m_pos; | 
| 2070 |                     continue; | 
| 2071 |                 } | 
| 2072 |             } | 
| 2073 |         } | 
| 2074 |         else if (peekCurrent() == '&') | 
| 2075 |         { | 
| 2076 |             const QString ret(tokenizeCharacterReference()); | 
| 2077 |             if (ret.isNull()) | 
| 2078 |                 return Token(T_ERROR); | 
| 2079 |             else | 
| 2080 |             { | 
| 2081 |                 result.append(s: ret); | 
| 2082 |                 ++m_pos; | 
| 2083 |                 continue; | 
| 2084 |             } | 
| 2085 |         } | 
| 2086 |         else if (peekCurrent() == otherSep) | 
| 2087 |         { | 
| 2088 |             result.append(c: current()); | 
| 2089 |             ++m_pos; | 
| 2090 |  | 
| 2091 |             if (peekCurrent() == otherSep) | 
| 2092 |                 ++m_pos; | 
| 2093 |  | 
| 2094 |             if (inLiteral) | 
| 2095 |                 inLiteral = false; | 
| 2096 |             else | 
| 2097 |                 inLiteral = true; | 
| 2098 |  | 
| 2099 |             continue; | 
| 2100 |         } | 
| 2101 |         else if (peekCurrent() == '{') | 
| 2102 |         { | 
| 2103 |             result.append(c: current()); | 
| 2104 |  | 
| 2105 |             if (peekAhead() == '{') | 
| 2106 |             { | 
| 2107 |                 m_pos += 2; | 
| 2108 |                 continue; | 
| 2109 |             } | 
| 2110 |             else | 
| 2111 |             { | 
| 2112 |                 ++m_pos; | 
| 2113 |                 ++sepStack; | 
| 2114 |                 const Token t(attributeAsRaw(sep, sepStack, startPos, aInLiteral: false, result)); | 
| 2115 |                 if (t.type != T_SUCCESS) | 
| 2116 |                     return t; | 
| 2117 |             } | 
| 2118 |  | 
| 2119 |         } | 
| 2120 |         else if (peekCurrent() == '}') | 
| 2121 |         { | 
| 2122 |             if (inLiteral && peekAhead() == '}') | 
| 2123 |             { | 
| 2124 |                 result.append(c: current()); | 
| 2125 |                 m_pos += 2; | 
| 2126 |                 continue; | 
| 2127 |             } | 
| 2128 |             else | 
| 2129 |             { | 
| 2130 |                 ++m_pos; | 
| 2131 |                 --sepStack; | 
| 2132 |                 return Token(T_SUCCESS); /* The return value is arbitrary. */ | 
| 2133 |             } | 
| 2134 |         } | 
| 2135 |         else | 
| 2136 |         { | 
| 2137 |             result.append(c: current()); | 
| 2138 |             ++m_pos; | 
| 2139 |         } | 
| 2140 |     } | 
| 2141 | } | 
| 2142 |  | 
| 2143 | Tokenizer::Token XQueryTokenizer::nextToken(XPATHLTYPE *const sourceLocator) | 
| 2144 | { | 
| 2145 |     sourceLocator->first_line = m_line; | 
| 2146 |     sourceLocator->first_column = m_pos - m_columnOffset + 1; /* Plus 1, since m_pos is 0-based. */ | 
| 2147 |  | 
| 2148 |     if (m_tokenStack.isEmpty()) | 
| 2149 |         return nextToken(); | 
| 2150 |     else | 
| 2151 |     { | 
| 2152 |         const Token retval(m_tokenStack.pop()); | 
| 2153 |  | 
| 2154 |         switch(retval.type) | 
| 2155 |         { | 
| 2156 |             case T_MODULE: | 
| 2157 |             case T_SCHEMA: | 
| 2158 |             case T_COPY_NAMESPACES: | 
| 2159 |             { | 
| 2160 |                 setState(NamespaceKeyword); | 
| 2161 |                 break; | 
| 2162 |             } | 
| 2163 |             case T_VERSION: | 
| 2164 |             { | 
| 2165 |                 setState(XQueryVersion); | 
| 2166 |                 break; | 
| 2167 |             } | 
| 2168 |             case T_AS: | 
| 2169 |             case T_OF: | 
| 2170 |             { | 
| 2171 |                 setState(ItemType); | 
| 2172 |                 break; | 
| 2173 |             } | 
| 2174 |             default: | 
| 2175 |             { | 
| 2176 |                 if (isOperatorKeyword(code: retval.type)) | 
| 2177 |                     setState(Default); | 
| 2178 |  | 
| 2179 |                 break; | 
| 2180 |             } | 
| 2181 |         }; | 
| 2182 |  | 
| 2183 |         return retval; | 
| 2184 |     } | 
| 2185 | } | 
| 2186 |  | 
| 2187 | int XQueryTokenizer::commenceScanOnly() | 
| 2188 | { | 
| 2189 |     m_scanOnly = true; | 
| 2190 |     return m_pos; | 
| 2191 | } | 
| 2192 |  | 
| 2193 | void XQueryTokenizer::resumeTokenizationFrom(const int pos) | 
| 2194 | { | 
| 2195 |     m_scanOnly = false; | 
| 2196 |     m_pos = pos; | 
| 2197 | } | 
| 2198 |  | 
| 2199 | void XQueryTokenizer::setParserContext(const ParserContext::Ptr &) | 
| 2200 | { | 
| 2201 | } | 
| 2202 |  | 
| 2203 | #undef handleWhitespace | 
| 2204 |  | 
| 2205 | } // namespace QPatternist | 
| 2206 |  | 
| 2207 | QT_END_NAMESPACE | 
| 2208 |  |