1 | // Copyright (C) 2021 The Qt Company Ltd. |
2 | // SPDX-License-Identifier: LicenseRef-Qt-Commercial OR GPL-3.0-only WITH Qt-GPL-exception-1.0 |
3 | |
4 | #ifndef TOKENIZER_H |
5 | #define TOKENIZER_H |
6 | |
7 | #include "location.h" |
8 | |
9 | #include <QtCore/qfile.h> |
10 | #include <QtCore/qstack.h> |
11 | #include <QtCore/qstring.h> |
12 | |
13 | QT_BEGIN_NAMESPACE |
14 | |
15 | /* |
16 | Here come the C++ tokens we support. The first part contains |
17 | all-purpose tokens; then come keywords. |
18 | |
19 | If you add a keyword, make sure to modify the keyword array in |
20 | tokenizer.cpp as well, and possibly adjust Tok_FirstKeyword and |
21 | Tok_LastKeyword. |
22 | */ |
enum {
    // General-purpose tokens (punctuation, literals, documentation, ...).
    Tok_Eoi,
    Tok_Ampersand,
    Tok_Aster,
    Tok_Caret,
    Tok_LeftParen,
    Tok_RightParen,
    Tok_LeftParenAster,
    Tok_Equal,
    Tok_LeftBrace,
    Tok_RightBrace,
    Tok_Semicolon,
    Tok_Colon,
    Tok_LeftAngle,
    Tok_RightAngle,
    Tok_Comma,
    Tok_Ellipsis,
    Tok_Gulbrandsen,
    Tok_LeftBracket,
    Tok_RightBracket,
    Tok_Tilde,
    Tok_SomeOperator,
    Tok_Number,
    Tok_String,
    Tok_Doc,
    // NOTE(review): this slot previously held only a bare `,` (an empty
    // enumerator, which does not compile). The enumerator name is restored
    // as Tok_Comment, which keeps the numeric values of all following
    // enumerators unchanged -- confirm the name against tokenizer.cpp.
    Tok_Comment,
    Tok_Ident,
    Tok_At,

    // Keyword tokens. They must stay contiguous: Tok_FirstKeyword and
    // Tok_LastKeyword below delimit this range.
    Tok_char,
    Tok_class,
    Tok_const,
    Tok_double,
    Tok_int,
    Tok_long,
    Tok_operator,
    Tok_short,
    Tok_signed,
    Tok_typename,
    Tok_unsigned,
    Tok_void,
    Tok_volatile,
    Tok_int64,
    Tok_QPrivateSignal,

    // Aliases for the first and last keyword enumerators; update them when
    // a keyword is added before Tok_char or after Tok_QPrivateSignal.
    Tok_FirstKeyword = Tok_char,
    Tok_LastKeyword = Tok_QPrivateSignal
};
69 | |
70 | /* |
71 | The Tokenizer class implements lexical analysis of C++ source |
72 | files. |
73 | |
74 | Not every operator or keyword of C++ is recognized; only those |
75 | that are interesting to us. Some Qt keywords or macros are also |
76 | recognized. |
77 | */ |
78 | |
79 | class Tokenizer |
80 | { |
81 | public: |
82 | Tokenizer(const Location &loc, QByteArray in); |
83 | Tokenizer(const Location &loc, QFile &file); |
84 | |
85 | ~Tokenizer(); |
86 | |
87 | int getToken(); |
88 | void setParsingFnOrMacro(bool macro) { m_parsingMacro = macro; } |
89 | |
90 | [[nodiscard]] const Location &location() const { return m_tokLoc; } |
91 | [[nodiscard]] QString previousLexeme() const; |
92 | [[nodiscard]] QString lexeme() const; |
93 | [[nodiscard]] QString version() const { return m_version; } |
94 | [[nodiscard]] int parenDepth() const { return m_parenDepth; } |
95 | [[nodiscard]] int bracketDepth() const { return m_bracketDepth; } |
96 | |
97 | static void initialize(); |
98 | static void terminate(); |
99 | static bool isTrue(const QString &condition); |
100 | |
101 | private: |
102 | void init(); |
103 | void start(const Location &loc); |
104 | /* |
105 | Represents the maximum amount of characters that a token can be composed |
106 | of. |
107 | |
108 | When a token with more characters than the maximum amount is encountered, a |
109 | warning is issued and parsing continues, discarding all characters from the |
110 | currently parsed token that don't fit into the buffer. |
111 | */ |
112 | enum { yyLexBufSize = 1048576 }; |
113 | |
114 | int getch() { return m_pos == m_in.size() ? EOF : m_in[m_pos++]; } |
115 | |
116 | inline int getChar() |
117 | { |
118 | using namespace Qt::StringLiterals; |
119 | |
120 | if (m_ch == EOF) |
121 | return EOF; |
122 | if (m_lexLen < yyLexBufSize - 1) { |
123 | m_lex[m_lexLen++] = (char)m_ch; |
124 | m_lex[m_lexLen] = '\0'; |
125 | } else if (!token_too_long_warning_was_issued) { |
126 | location().warning( |
127 | message: u"The content is too long.\n"_s , |
128 | details: u"The maximum amount of characters for this content is %1.\n"_s .arg(a: yyLexBufSize) + |
129 | "Consider splitting it or reducing its size." |
130 | ); |
131 | |
132 | token_too_long_warning_was_issued = true; |
133 | } |
134 | m_curLoc.advance(ch: QChar(m_ch)); |
135 | int ch = getch(); |
136 | if (ch == EOF) |
137 | return EOF; |
138 | // cast explicitly to make sure the value of ch |
139 | // is in range [0..255] to avoid assert messages |
140 | // when using debug CRT that checks its input. |
141 | return int(uint(uchar(ch))); |
142 | } |
143 | |
144 | int getTokenAfterPreprocessor(); |
145 | void pushSkipping(bool skip); |
146 | bool popSkipping(); |
147 | |
148 | Location m_tokLoc; |
149 | Location m_curLoc; |
150 | char *m_lexBuf1 { nullptr }; |
151 | char *m_lexBuf2 { nullptr }; |
152 | char *m_prevLex { nullptr }; |
153 | char *m_lex { nullptr }; |
154 | size_t m_lexLen {}; |
155 | QStack<bool> m_preprocessorSkipping; |
156 | int m_numPreprocessorSkipping {}; |
157 | int m_braceDepth {}; |
158 | int m_parenDepth {}; |
159 | int m_bracketDepth {}; |
160 | int m_ch {}; |
161 | |
162 | QString m_version {}; |
163 | bool m_parsingMacro {}; |
164 | |
165 | // Used to ensure that the warning that is issued when a token is |
166 | // too long to fit into our fixed sized buffer is not repeated for each |
167 | // character of that token after the last saved one. |
168 | // The flag is reset whenever a new token is requested, so as to allow |
169 | // reporting all such tokens that are too long during a single execution. |
170 | bool token_too_long_warning_was_issued{false}; |
171 | |
172 | protected: |
173 | QByteArray m_in {}; |
174 | int m_pos {}; |
175 | }; |
176 | |
177 | QT_END_NAMESPACE |
178 | |
179 | #endif |
180 | |