1// Copyright (C) 2002-2007 Detlev Offenbach <detlev@die-offenbachs.de>
2// Copyright (C) 2021 The Qt Company Ltd.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0
4
5#include <translator.h>
6#include "lupdate.h"
7
8#include <QtCore/qhash.h>
9#include <QtCore/qstring.h>
10#include <QtCore/qtextstream.h>
11#include <QtCore/qstack.h>
12
13#include <cctype>
14#include <cerrno>
15#include <cstdio>
16#include <cstring>
17
18QT_BEGIN_NAMESPACE
19
// Prefix that introduces a "magic" translator comment. Kept as a char array
// (not a pointer) because users take sizeof() of it to get the prefix length.
static constexpr char PythonMagicComment[] = "TRANSLATOR ";
21
22/*
23 The first part of this source file is the Python tokenizer. We skip
24 most of Python; the only tokens that interest us are defined here.
25*/
26
// Tokens produced by the tokenizer. The enumerator order (and therefore the
// numeric values) is kept exactly as before.
enum Token {
    Tok_Eof,        // end of input
    Tok_class,      // keyword "class"
    Tok_def,        // keyword "def"
    Tok_return,     // keyword "return"
    Tok_tr,         // tr()-like function name
    Tok_trUtf8,     // trUtf8()-like function name
    Tok_translate,  // translate()-like function name
    Tok_Ident,      // any other identifier
    Tok_Comment,    // comment text
    Tok_Dot,        // '.'
    Tok_String,     // string literal
    Tok_LeftParen,  // '('
    Tok_RightParen, // ')'
    Tok_Comma,      // ','
    Tok_None,       // keyword "None"
    Tok_Integer     // integer literal
};
32
// Kind of string literal being scanned, selected by the prefix letter that
// precedes the opening quote.
enum class StringType {
    NoString,     // no prefix information available (default)
    String,       // plain string
    FormatString, // f"..." literal
    RawString     // r"..." literal
};
40
41/*
42 The tokenizer maintains the following global variables. The names
43 should be self-explanatory.
44*/
45static QString yyFileName;
46static int yyCh;
47static QByteArray yyIdent;
48static char yyComment[65536];
49static size_t yyCommentLen;
50static char yyString[65536];
51static size_t yyStringLen;
52static int yyParenDepth;
53static int yyLineNo;
54static int yyCurLineNo;
55
56static QByteArray extraComment;
57static QByteArray id;
58
59QHash<QByteArray, Token> tokens = {
60 {"None", Tok_None},
61 {"class", Tok_class},
62 {"def", Tok_def},
63 {"return", Tok_return},
64 {"__tr", Tok_tr}, // Legacy?
65 {"__trUtf8", Tok_trUtf8}
66};
67
68// the file to read from (if reading from a file)
69static FILE *yyInFile;
70
71// the string to read from and current position in the string (otherwise)
72static int yyInPos;
73static int buf;
74
75static int (*getChar)();
76static int (*peekChar)();
77
78static int yyIndentationSize;
79static int yyContinuousSpaceCount;
80static bool yyCountingIndentation;
81
82// (Context, indentation level) pair.
83using ContextPair = QPair<QByteArray, int>;
84// Stack of (Context, indentation level) pairs.
85using ContextStack = QStack<ContextPair>;
86static ContextStack yyContextStack;
87
88static int getCharFromFile()
89{
90 int c;
91
92 if (buf < 0) {
93 c = getc(stream: yyInFile);
94 } else {
95 c = buf;
96 buf = -1;
97 }
98 if (c == '\n') {
99 yyCurLineNo++;
100 yyCountingIndentation = true;
101 yyContinuousSpaceCount = 0;
102 } else if (yyCountingIndentation && (c == 32 || c == 9)) {
103 yyContinuousSpaceCount++;
104 } else {
105 yyCountingIndentation = false;
106 }
107 return c;
108}
109
110static int peekCharFromFile()
111{
112 int c = getc(stream: yyInFile);
113 buf = c;
114 return c;
115}
116
117static void startTokenizer(const QString &fileName, int (*getCharFunc)(),
118 int (*peekCharFunc)())
119{
120 yyInPos = 0;
121 buf = -1;
122 getChar = getCharFunc;
123 peekChar = peekCharFunc;
124
125 yyFileName = fileName;
126 yyCh = getChar();
127 yyParenDepth = 0;
128 yyCurLineNo = 1;
129
130 yyIndentationSize = -1;
131 yyContinuousSpaceCount = 0;
132 yyCountingIndentation = false;
133 yyContextStack.clear();
134}
135
136static bool parseStringEscape(int quoteChar, StringType stringType)
137{
138 static const char tab[] = "abfnrtv";
139 static const char backTab[] = "\a\b\f\n\r\t\v";
140
141 yyCh = getChar();
142 if (yyCh == EOF)
143 return false;
144
145 if (stringType == StringType::RawString) {
146 if (yyCh != quoteChar) // Only quotes can be escaped in raw strings
147 yyString[yyStringLen++] = '\\';
148 yyString[yyStringLen++] = yyCh;
149 yyCh = getChar();
150 return true;
151 }
152
153 if (yyCh == 'x') {
154 QByteArray hex = "0";
155 yyCh = getChar();
156 if (yyCh == EOF)
157 return false;
158 while (std::isxdigit(yyCh)) {
159 hex += char(yyCh);
160 yyCh = getChar();
161 if (yyCh == EOF)
162 return false;
163 }
164 uint n;
165#ifdef Q_CC_MSVC
166 sscanf_s(hex, "%x", &n);
167#else
168 std::sscanf(s: hex, format: "%x", &n);
169#endif
170 if (yyStringLen < sizeof(yyString) - 1)
171 yyString[yyStringLen++] = char(n);
172 return true;
173 }
174
175 if (yyCh >= '0' && yyCh < '8') {
176 QByteArray oct;
177 int n = 0;
178 do {
179 oct += char(yyCh);
180 ++n;
181 yyCh = getChar();
182 if (yyCh == EOF)
183 return false;
184 } while (yyCh >= '0' && yyCh < '8' && n < 3);
185#ifdef Q_CC_MSVC
186 sscanf_s(oct, "%o", &n);
187#else
188 std::sscanf(s: oct, format: "%o", &n);
189#endif
190 if (yyStringLen < sizeof(yyString) - 1)
191 yyString[yyStringLen++] = char(n);
192 return true;
193 }
194
195 const char *p = std::strchr(s: tab, c: yyCh);
196 if (yyStringLen < sizeof(yyString) - 1) {
197 yyString[yyStringLen++] = p == nullptr
198 ? char(yyCh) : backTab[p - tab];
199 }
200 yyCh = getChar();
201 return true;
202}
203
204static Token parseString(StringType stringType = StringType::NoString)
205{
206 int quoteChar = yyCh;
207 bool tripleQuote = false;
208 bool singleQuote = true;
209 bool in = false;
210
211 yyCh = getChar();
212
213 while (yyCh != EOF) {
214 if (singleQuote && (yyCh == '\n' || (in && yyCh == quoteChar)))
215 break;
216
217 if (yyCh == quoteChar) {
218 if (peekChar() == quoteChar) {
219 yyCh = getChar();
220 if (!tripleQuote) {
221 tripleQuote = true;
222 singleQuote = false;
223 in = true;
224 yyCh = getChar();
225 } else {
226 yyCh = getChar();
227 if (yyCh == quoteChar) {
228 tripleQuote = false;
229 break;
230 }
231 }
232 } else if (tripleQuote) {
233 if (yyStringLen < sizeof(yyString) - 1)
234 yyString[yyStringLen++] = char(yyCh);
235 yyCh = getChar();
236 continue;
237 } else {
238 break;
239 }
240 } else {
241 in = true;
242 }
243
244 if (yyCh == '\\') {
245 if (!parseStringEscape(quoteChar, stringType))
246 return Tok_Eof;
247 } else {
248 char *yStart = yyString + yyStringLen;
249 char *yp = yStart;
250 while (yyCh != EOF && (tripleQuote || yyCh != '\n') && yyCh != quoteChar
251 && yyCh != '\\') {
252 *yp++ = char(yyCh);
253 yyCh = getChar();
254 }
255 yyStringLen += yp - yStart;
256 }
257 }
258 yyString[yyStringLen] = '\0';
259
260 if (yyCh != quoteChar) {
261 printf(format: "%c\n", yyCh);
262
263 qWarning(msg: "%s:%d: Unterminated string",
264 qPrintable(yyFileName), yyLineNo);
265 }
266
267 if (yyCh == EOF)
268 return Tok_Eof;
269 yyCh = getChar();
270 return Tok_String;
271}
272
273static QByteArray readLine()
274{
275 QByteArray result;
276 while (true) {
277 yyCh = getChar();
278 if (yyCh == EOF || yyCh == '\n')
279 break;
280 result.append(c: char(yyCh));
281 }
282 return result;
283}
284
285static Token getToken(StringType stringType = StringType::NoString)
286{
287 yyIdent.clear();
288 yyCommentLen = 0;
289 yyStringLen = 0;
290 while (yyCh != EOF) {
291 yyLineNo = yyCurLineNo;
292
293 if (std::isalpha(yyCh) || yyCh == '_') {
294 do {
295 yyIdent.append(c: char(yyCh));
296 yyCh = getChar();
297 } while (std::isalnum(yyCh) || yyCh == '_');
298
299 return tokens.value(key: yyIdent, defaultValue: Tok_Ident);
300 }
301 switch (yyCh) {
302 case '#':
303 switch (getChar()) {
304 case ':':
305 extraComment = readLine().trimmed();
306 break;
307 case '=':
308 id = readLine().trimmed();
309 break;
310 case EOF:
311 return Tok_Eof;
312 case '\n':
313 break;
314 default:
315 do {
316 yyCh = getChar();
317 } while (yyCh != EOF && yyCh != '\n');
318 break;
319 }
320 break;
321 case '"':
322 case '\'':
323 return parseString(stringType);
324 case '(':
325 yyParenDepth++;
326 yyCh = getChar();
327 return Tok_LeftParen;
328 case ')':
329 yyParenDepth--;
330 yyCh = getChar();
331 return Tok_RightParen;
332 case ',':
333 yyCh = getChar();
334 return Tok_Comma;
335 case '.':
336 yyCh = getChar();
337 return Tok_Dot;
338 case '0':
339 case '1':
340 case '2':
341 case '3':
342 case '4':
343 case '5':
344 case '6':
345 case '7':
346 case '8':
347 case '9': {
348 QByteArray ba;
349 ba += char(yyCh);
350 yyCh = getChar();
351 const bool hex = yyCh == 'x';
352 if (hex) {
353 ba += char(yyCh);
354 yyCh = getChar();
355 }
356 while ((hex ? std::isxdigit(yyCh) : std::isdigit(yyCh))) {
357 ba += char(yyCh);
358 yyCh = getChar();
359 }
360 bool ok;
361 auto v = ba.toLongLong(ok: &ok);
362 Q_UNUSED(v);
363 if (ok)
364 return Tok_Integer;
365 break;
366 }
367 default:
368 yyCh = getChar();
369 }
370 }
371 return Tok_Eof;
372}
373
374/*
375 The second part of this source file is the parser. It accomplishes
376 a very easy task: It finds all strings inside a tr() or translate()
377 call, and possibly finds out the context of the call. It supports
378 three cases:
379 (1) the context is specified, as in FunnyDialog.tr("Hello") or
380 translate("FunnyDialog", "Hello");
381 (2) the call appears within an inlined function;
382 (3) the call appears within a function defined outside the class definition.
383*/
384
385static Token yyTok;
386
387static bool match(Token t)
388{
389 const bool matches = (yyTok == t);
390 if (matches)
391 yyTok = getToken();
392 return matches;
393}
394
395static bool matchStringStart()
396{
397 if (yyTok == Tok_String)
398 return true;
399 // Check for f"bla{var}" and raw strings r"bla".
400 if (yyTok == Tok_Ident && yyIdent.size() == 1) {
401 switch (yyIdent.at(i: 0)) {
402 case 'r':
403 yyTok = getToken(stringType: StringType::RawString);
404 return yyTok == Tok_String;
405 case 'f':
406 yyTok = getToken(stringType: StringType::FormatString);
407 return yyTok == Tok_String;
408 }
409 }
410 return false;
411}
412
413static bool matchString(QByteArray *s)
414{
415 s->clear();
416 bool ok = false;
417 while (matchStringStart()) {
418 *s += yyString;
419 yyTok = getToken();
420 ok = true;
421 }
422 return ok;
423}
424
425static bool matchEncoding(bool *utf8)
426{
427 // Remove any leading module paths.
428 if (yyTok == Tok_Ident && std::strcmp(s1: yyIdent, s2: "PySide6") == 0) {
429 yyTok = getToken();
430
431 if (yyTok != Tok_Dot)
432 return false;
433
434 yyTok = getToken();
435 }
436
437 if (yyTok == Tok_Ident && (std::strcmp(s1: yyIdent, s2: "QtGui") == 0
438 || std::strcmp(s1: yyIdent, s2: "QtCore") == 0)) {
439 yyTok = getToken();
440
441 if (yyTok != Tok_Dot)
442 return false;
443
444 yyTok = getToken();
445 }
446
447 if (yyTok == Tok_Ident) {
448 if (std::strcmp(s1: yyIdent, s2: "QApplication") == 0
449 || std::strcmp(s1: yyIdent, s2: "QGuiApplication") == 0
450 || std::strcmp(s1: yyIdent, s2: "QCoreApplication") == 0) {
451 yyTok = getToken();
452
453 if (yyTok == Tok_Dot)
454 yyTok = getToken();
455 }
456
457 *utf8 = QByteArray(yyIdent).endsWith(bv: "UTF8");
458 yyTok = getToken();
459 return true;
460 }
461 return false;
462}
463
464static bool matchStringOrNone(QByteArray *s)
465{
466 bool matches = matchString(s);
467
468 if (!matches)
469 matches = match(t: Tok_None);
470
471 return matches;
472}
473
474/*
475 * match any expression that can return a number, which can be
476 * 1. Literal number (e.g. '11')
477 * 2. simple identifier (e.g. 'm_count')
478 * 3. simple function call (e.g. 'size()')
479 * 4. function call on an object (e.g. 'list.size()')
480 *
481 * Other cases:
482 * size(2,4)
483 * list().size()
484 * list(a,b).size(2,4)
485 * etc...
486 */
487static bool matchExpression()
488{
489 if (match(t: Tok_Integer))
490 return true;
491
492 int parenlevel = 0;
493 while (match(t: Tok_Ident) || parenlevel > 0) {
494 if (yyTok == Tok_RightParen) {
495 if (parenlevel == 0)
496 break;
497 --parenlevel;
498 yyTok = getToken();
499 } else if (yyTok == Tok_LeftParen) {
500 yyTok = getToken();
501 if (yyTok == Tok_RightParen) {
502 yyTok = getToken();
503 } else {
504 ++parenlevel;
505 }
506 } else if (yyTok == Tok_Ident) {
507 continue;
508 } else if (parenlevel == 0) {
509 return false;
510 }
511 }
512 return true;
513}
514
515static bool parseTranslate(QByteArray *text, QByteArray *context, QByteArray *comment,
516 bool *utf8, bool *plural)
517{
518 text->clear();
519 context->clear();
520 comment->clear();
521 *utf8 = false;
522 *plural = false;
523
524 yyTok = getToken();
525 if (!match(t: Tok_LeftParen) || !matchString(s: context) || !match(t: Tok_Comma)
526 || !matchString(s: text)) {
527 return false;
528 }
529
530 if (match(t: Tok_RightParen))
531 return true;
532
533 // not a comma or a right paren, illegal syntax
534 if (!match(t: Tok_Comma))
535 return false;
536
537 // python accepts trailing commas within parenthesis, so allow a comma with nothing after
538 if (match(t: Tok_RightParen))
539 return true;
540
541 // check for comment
542 if (!matchStringOrNone(s: comment))
543 return false; // not a comment, or a trailing comma... something is wrong
544
545 if (match(t: Tok_RightParen))
546 return true;
547
548 // not a comma or a right paren, illegal syntax
549 if (!match(t: Tok_Comma))
550 return false;
551
552 // python accepts trailing commas within parenthesis, so allow a comma with nothing after
553 if (match(t: Tok_RightParen))
554 return true;
555
556 // look for optional encoding information
557 if (matchEncoding(utf8)) {
558 if (match(t: Tok_RightParen))
559 return true;
560
561 // not a comma or a right paren, illegal syntax
562 if (!match(t: Tok_Comma))
563 return false;
564
565 // python accepts trailing commas within parenthesis, so allow a comma with nothing after
566 if (match(t: Tok_RightParen))
567 return true;
568 }
569
570 // Must be a plural expression
571 if (!matchExpression())
572 return false;
573
574 *plural = true;
575
576 // Ignore any trailing comma here
577 match(t: Tok_Comma);
578
579 // This must be the end, or there are too many parameters
580 if (match(t: Tok_RightParen))
581 return true;
582
583 return false;
584}
585
586static inline void setMessageParameters(TranslatorMessage *message)
587{
588 if (!extraComment.isEmpty()) {
589 message->setExtraComment(QString::fromUtf8(ba: extraComment));
590 extraComment.clear();
591 }
592 if (!id.isEmpty()) {
593 message->setId(QString::fromUtf8(ba: id));
594 id.clear();
595 }
596}
597
598static void parse(Translator &tor, ConversionData &cd,
599 const QByteArray &initialContext = {},
600 const QByteArray &defaultContext = {})
601{
602 QByteArray context;
603 QByteArray text;
604 QByteArray comment;
605 QByteArray prefix;
606 bool utf8 = false;
607
608 yyTok = getToken();
609 while (yyTok != Tok_Eof) {
610
611 switch (yyTok) {
612 case Tok_class: {
613 if (yyIndentationSize < 0 && yyContinuousSpaceCount > 0)
614 yyIndentationSize = yyContinuousSpaceCount; // First indented "class"
615 const int indent = yyIndentationSize > 0
616 ? yyContinuousSpaceCount / yyIndentationSize : 0;
617 while (!yyContextStack.isEmpty() && yyContextStack.top().second >= indent)
618 yyContextStack.pop();
619 yyTok = getToken();
620 yyContextStack.push(t: {yyIdent, indent});
621 yyTok = getToken();
622 }
623 break;
624 case Tok_def:
625 if (yyIndentationSize < 0 && yyContinuousSpaceCount > 0)
626 yyIndentationSize = yyContinuousSpaceCount; // First indented "def"
627 if (!yyContextStack.isEmpty()) {
628 // Pop classes if the function is further outdented than the class on the top
629 // (end of a nested class).
630 const int classIndent = yyIndentationSize > 0
631 ? yyContinuousSpaceCount / yyIndentationSize - 1 : 0;
632 while (!yyContextStack.isEmpty() && yyContextStack.top().second > classIndent)
633 yyContextStack.pop();
634 }
635 yyTok = getToken();
636 break;
637 case Tok_tr:
638 case Tok_trUtf8:
639 utf8 = true;
640 yyTok = getToken();
641 if (match(t: Tok_LeftParen) && matchString(s: &text)) {
642 comment.clear();
643 bool plural = false;
644
645 if (match(t: Tok_RightParen)) {
646 // There is no comment or plural arguments.
647 } else if (match(t: Tok_Comma) && matchStringOrNone(s: &comment)) {
648 // There is a comment argument.
649 if (match(t: Tok_RightParen)) {
650 // There is no plural argument.
651 } else if (match(t: Tok_Comma)) {
652 // There is a plural argument.
653 plural = true;
654 }
655 }
656
657 if (prefix.isEmpty())
658 context = defaultContext;
659 else if (prefix == "self")
660 context = yyContextStack.isEmpty()
661 ? initialContext : yyContextStack.top().first;
662 else
663 context = prefix;
664
665 prefix.clear();
666
667 if (!text.isEmpty()) {
668 TranslatorMessage message(QString::fromUtf8(ba: context),
669 QString::fromUtf8(ba: text),
670 QString::fromUtf8(ba: comment),
671 {}, yyFileName, yyLineNo,
672 {}, TranslatorMessage::Unfinished, plural);
673 setMessageParameters(&message);
674 tor.extend(msg: message, cd);
675 }
676 }
677 break;
678 case Tok_translate: {
679 bool plural{};
680 if (parseTranslate(text: &text, context: &context, comment: &comment, utf8: &utf8, plural: &plural)
681 && !text.isEmpty()) {
682 TranslatorMessage message(QString::fromUtf8(ba: context),
683 QString::fromUtf8(ba: text),
684 QString::fromUtf8(ba: comment),
685 {}, yyFileName, yyLineNo,
686 {}, TranslatorMessage::Unfinished, plural);
687 setMessageParameters(&message);
688 tor.extend(msg: message, cd);
689 }
690 }
691 break;
692 case Tok_Ident:
693 if (!prefix.isEmpty())
694 prefix += '.';
695 prefix += yyIdent;
696 yyTok = getToken();
697 if (yyTok != Tok_Dot)
698 prefix.clear();
699 break;
700 case Tok_Comment:
701 comment = yyComment;
702 comment = comment.simplified();
703 if (comment.left(len: sizeof(PythonMagicComment) - 1) == PythonMagicComment) {
704 comment.remove(index: 0, len: sizeof(PythonMagicComment) - 1);
705 int k = comment.indexOf(c: ' ');
706 if (k == -1) {
707 context = comment;
708 } else {
709 context = comment.left(len: k);
710 comment.remove( index: 0, len: k + 1);
711 TranslatorMessage message(QString::fromUtf8(ba: context),
712 {}, QString::fromUtf8(ba: comment), {},
713 yyFileName, yyLineNo, {});
714 tor.extend(msg: message, cd);
715 }
716 }
717 yyTok = getToken();
718 break;
719 default:
720 yyTok = getToken();
721 }
722 }
723
724 if (yyParenDepth != 0) {
725 qWarning(msg: "%s: Unbalanced parentheses in Python code",
726 qPrintable(yyFileName));
727 }
728}
729
730bool loadPython(Translator &translator, const QString &fileName, ConversionData &cd)
731{
732 // Match the function aliases to our tokens
733 static bool firstTime = true;
734 if (firstTime) {
735 firstTime = false;
736 const auto &nameMap = trFunctionAliasManager.nameToTrFunctionMap();
737 for (auto it = nameMap.cbegin(), end = nameMap.cend(); it != end; ++it) {
738 switch (it.value()) {
739 case TrFunctionAliasManager::Function_tr:
740 case TrFunctionAliasManager::Function_QT_TR_NOOP:
741 tokens.insert(key: it.key().toUtf8(), value: Tok_tr);
742 break;
743 case TrFunctionAliasManager::Function_trUtf8:
744 tokens.insert(key: it.key().toUtf8(), value: Tok_trUtf8);
745 break;
746 case TrFunctionAliasManager::Function_translate:
747 case TrFunctionAliasManager::Function_QT_TRANSLATE_NOOP:
748 // QTranslator::findMessage() has the same parameters as QApplication::translate().
749 case TrFunctionAliasManager::Function_findMessage:
750 tokens.insert(key: it.key().toUtf8(), value: Tok_translate);
751 break;
752 default:
753 break;
754 }
755 }
756 }
757
758#ifdef Q_CC_MSVC
759 const auto *fileNameC = reinterpret_cast<const wchar_t *>(fileName.utf16());
760 const bool ok = _wfopen_s(&yyInFile, fileNameC, L"r") == 0;
761#else
762 const QByteArray fileNameC = QFile::encodeName(fileName);
763 yyInFile = std::fopen( filename: fileNameC.constData(), modes: "r");
764 const bool ok = yyInFile != nullptr;
765#endif
766 if (!ok) {
767 cd.appendError(QStringLiteral("Cannot open %1").arg(a: fileName));
768 return false;
769 }
770
771 startTokenizer(fileName, getCharFunc: getCharFromFile, peekCharFunc: peekCharFromFile);
772 parse(tor&: translator, cd);
773 std::fclose(stream: yyInFile);
774 return true;
775}
776
777QT_END_NAMESPACE
778

// Source: qttools/src/linguist/lupdate/python.cpp