1 | //===-- lib/Parser/token-sequence.cpp -------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "flang/Parser/token-sequence.h" |
10 | |
11 | #include "prescan.h" |
12 | #include "flang/Parser/characters.h" |
13 | #include "flang/Parser/message.h" |
14 | #include "llvm/Support/raw_ostream.h" |
15 | |
16 | namespace Fortran::parser { |
17 | |
// Move assignment: discard this sequence's current contents, then take
// over those of 'that' via swap; 'that' is left holding the cleared state.
TokenSequence &TokenSequence::operator=(TokenSequence &&that) {
  clear();
  swap(that);
  return *this;
}
23 | |
24 | void TokenSequence::clear() { |
25 | start_.clear(); |
26 | nextStart_ = 0; |
27 | char_.clear(); |
28 | provenances_.clear(); |
29 | } |
30 | |
// Removes the last complete token from the sequence, discarding its
// characters and the provenance mappings that covered them.
void TokenSequence::pop_back() {
  CHECK(!start_.empty());
  CHECK(nextStart_ > start_.back());
  // Number of characters owned by the final token.
  std::size_t bytes{nextStart_ - start_.back()};
  // Order matters: read start_.back() before popping it.
  nextStart_ = start_.back();
  start_.pop_back();
  char_.resize(nextStart_);
  provenances_.RemoveLastBytes(bytes);
}
40 | |
41 | void TokenSequence::shrink_to_fit() { |
42 | start_.shrink_to_fit(); |
43 | char_.shrink_to_fit(); |
44 | provenances_.shrink_to_fit(); |
45 | } |
46 | |
47 | void TokenSequence::swap(TokenSequence &that) { |
48 | start_.swap(that.start_); |
49 | std::swap(nextStart_, that.nextStart_); |
50 | char_.swap(that.char_); |
51 | provenances_.swap(that.provenances_); |
52 | } |
53 | |
54 | std::size_t TokenSequence::SkipBlanks(std::size_t at) const { |
55 | std::size_t tokens{start_.size()}; |
56 | for (; at < tokens; ++at) { |
57 | if (!TokenAt(at).IsBlank()) { |
58 | return at; |
59 | } |
60 | } |
61 | return tokens; // even if at > tokens |
62 | } |
63 | |
// C-style /*comments*/ are removed from preprocessing directive
// token sequences by the prescanner, but not C++ or Fortran
// free-form line-ending comments (//... and !...) because
// ignoring them is directive-specific.
// Returns true iff any significant (non-blank, non-comment) text
// remains at or after token 'at'.
bool TokenSequence::IsAnythingLeft(std::size_t at) const {
  std::size_t tokens{start_.size()};
  for (; at < tokens; ++at) {
    auto tok{TokenAt(at)};
    const char *end{tok.end()};
    for (const char *p{tok.begin()}; p < end; ++p) {
      switch (*p) {
      case '/':
        // A lone '/' is significant; "//" begins a C++-style comment,
        // so nothing meaningful can follow it.
        return p + 1 >= end || p[1] != '/';
      case '!':
        // Fortran free-form comment: the rest is ignorable.
        return false;
      case ' ':
        break;
      default:
        return true;
      }
    }
  }
  return false;
}
88 | |
89 | void TokenSequence::Put(const TokenSequence &that) { |
90 | if (nextStart_ < char_.size()) { |
91 | start_.push_back(nextStart_); |
92 | } |
93 | int offset = char_.size(); |
94 | for (int st : that.start_) { |
95 | start_.push_back(st + offset); |
96 | } |
97 | char_.insert(char_.end(), that.char_.begin(), that.char_.end()); |
98 | nextStart_ = char_.size(); |
99 | provenances_.Put(that.provenances_); |
100 | } |
101 | |
// Appends the tokens of 'that', assigning them provenance from
// successive offsets into 'range'.  'range' must cover exactly the
// number of characters copied.
void TokenSequence::Put(const TokenSequence &that, ProvenanceRange range) {
  std::size_t offset{0};
  std::size_t tokens{that.SizeInTokens()};
  for (std::size_t j{0}; j < tokens; ++j) {
    CharBlock tok{that.TokenAt(j)};
    Put(tok, range.OffsetMember(offset));
    offset += tok.size();
  }
  CHECK(offset == range.size());
}
112 | |
// Appends 'tokens' tokens of 'that' starting at token 'at', copying
// their original provenance mappings character by character.
void TokenSequence::Put(
    const TokenSequence &that, std::size_t at, std::size_t tokens) {
  ProvenanceRange provenance;
  std::size_t offset{0};
  for (; tokens-- > 0; ++at) {
    CharBlock tok{that.TokenAt(at)};
    std::size_t tokBytes{tok.size()};
    for (std::size_t j{0}; j < tokBytes; ++j) {
      // When the current provenance chunk is exhausted, look up the
      // chunk containing the next character.
      if (offset == provenance.size()) {
        provenance = that.provenances_.Map(that.start_[at] + j);
        offset = 0;
      }
      PutNextTokenChar(tok[j], provenance.OffsetMember(offset++));
    }
    CloseToken();
  }
}
130 | |
131 | void TokenSequence::Put( |
132 | const char *s, std::size_t bytes, Provenance provenance) { |
133 | for (std::size_t j{0}; j < bytes; ++j) { |
134 | PutNextTokenChar(s[j], provenance + j); |
135 | } |
136 | CloseToken(); |
137 | } |
138 | |
139 | void TokenSequence::Put(const CharBlock &t, Provenance provenance) { |
140 | // Avoid t[0] if t is empty: it would create a reference to nullptr, |
141 | // which is UB. |
142 | const char *addr{t.size() ? &t[0] : nullptr}; |
143 | Put(addr, t.size(), provenance); |
144 | } |
145 | |
146 | void TokenSequence::Put(const std::string &s, Provenance provenance) { |
147 | Put(s.data(), s.size(), provenance); |
148 | } |
149 | |
150 | void TokenSequence::Put(llvm::raw_string_ostream &ss, Provenance provenance) { |
151 | Put(ss.str(), provenance); |
152 | } |
153 | |
// Converts letters to lower case in place, token by token, taking care
// not to rewrite the significant parts of Hollerith constants and
// character literals.
TokenSequence &TokenSequence::ToLowerCase() {
  std::size_t tokens{start_.size()};
  std::size_t chars{char_.size()};
  std::size_t atToken{0};
  for (std::size_t j{0}; j < chars;) {
    // Byte offset just past the current token.
    std::size_t nextStart{atToken + 1 < tokens ? start_[++atToken] : chars};
    char *p{&char_[j]};
    char const *limit{char_.data() + nextStart};
    const char *lastChar{limit - 1};
    j = nextStart;
    // Skip leading whitespaces
    while (p < limit - 1 && *p == ' ') {
      ++p;
    }
    // Find last non-whitespace char
    while (lastChar > p + 1 && *lastChar == ' ') {
      --lastChar;
    }
    if (IsDecimalDigit(*p)) {
      // Token starts with digits: may be a Hollerith constant, a
      // kind-prefixed literal, or a numeric constant with an exponent.
      while (p < limit && IsDecimalDigit(*p)) {
        ++p;
      }
      if (p >= limit) {
        // Nothing but digits: leave as is.
      } else if (*p == 'h' || *p == 'H') {
        // Hollerith
        *p = 'h';
      } else if (*p == '_') {
        // kind-prefixed character literal (e.g., 1_"ABC")
      } else {
        // exponent
        for (; p < limit; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      }
    } else if (*lastChar == '\'' || *lastChar == '"') {
      // Token ends with a quote: some form of character literal.
      if (*p == *lastChar) {
        // Character literal without prefix
      } else if (p[1] == *lastChar) {
        // BOZX-prefixed constant
        for (; p < limit; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      } else {
        // Literal with kind-param prefix name (e.g., K_"ABC").
        // Lower-case only the prefix, not the literal's contents.
        for (; *p != *lastChar; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      }
    } else {
      // Ordinary token: lower-case it entirely.
      for (; p < limit; ++p) {
        *p = ToLowerCaseLetter(*p);
      }
    }
  }
  return *this;
}
210 | |
211 | bool TokenSequence::HasBlanks(std::size_t firstChar) const { |
212 | std::size_t tokens{SizeInTokens()}; |
213 | for (std::size_t j{0}; j < tokens; ++j) { |
214 | if (start_[j] >= firstChar && TokenAt(j).IsBlank()) { |
215 | return true; |
216 | } |
217 | } |
218 | return false; |
219 | } |
220 | |
221 | bool TokenSequence::HasRedundantBlanks(std::size_t firstChar) const { |
222 | std::size_t tokens{SizeInTokens()}; |
223 | bool lastWasBlank{false}; |
224 | for (std::size_t j{0}; j < tokens; ++j) { |
225 | bool isBlank{TokenAt(j).IsBlank()}; |
226 | if (isBlank && lastWasBlank && start_[j] >= firstChar) { |
227 | return true; |
228 | } |
229 | lastWasBlank = isBlank; |
230 | } |
231 | return false; |
232 | } |
233 | |
234 | TokenSequence &TokenSequence::RemoveBlanks(std::size_t firstChar) { |
235 | std::size_t tokens{SizeInTokens()}; |
236 | TokenSequence result; |
237 | for (std::size_t j{0}; j < tokens; ++j) { |
238 | if (!TokenAt(j).IsBlank() || start_[j] < firstChar) { |
239 | result.Put(*this, j); |
240 | } |
241 | } |
242 | swap(result); |
243 | return *this; |
244 | } |
245 | |
246 | TokenSequence &TokenSequence::RemoveRedundantBlanks(std::size_t firstChar) { |
247 | std::size_t tokens{SizeInTokens()}; |
248 | TokenSequence result; |
249 | bool lastWasBlank{false}; |
250 | for (std::size_t j{0}; j < tokens; ++j) { |
251 | bool isBlank{TokenAt(j).IsBlank()}; |
252 | if (!isBlank || !lastWasBlank || start_[j] < firstChar) { |
253 | result.Put(*this, j); |
254 | } |
255 | lastWasBlank = isBlank; |
256 | } |
257 | swap(result); |
258 | return *this; |
259 | } |
260 | |
// Truncates the sequence at a trailing "!" comment, unless that comment
// is an active compiler directive sentinel (e.g. "!dir$").  When
// 'skipFirst' is set, the first comment encountered is retained.
TokenSequence &TokenSequence::ClipComment(
    const Prescanner &prescanner, bool skipFirst) {
  std::size_t tokens{SizeInTokens()};
  for (std::size_t j{0}; j < tokens; ++j) {
    CharBlock tok{TokenAt(j)};
    if (std::size_t blanks{tok.CountLeadingBlanks()};
        blanks < tok.size() && tok[blanks] == '!') {
      // Retain active compiler directive sentinels (e.g. "!dir$")
      // Extend 'tok' over adjacent tokens until it is long enough to
      // hold '!' plus a four-character sentinel name.
      for (std::size_t k{j + 1}; k < tokens && tok.size() < blanks + 5; ++k) {
        if (tok.begin() + tok.size() == TokenAt(k).begin()) {
          tok.ExtendToCover(TokenAt(k));
        } else {
          break;
        }
      }
      bool isSentinel{false};
      if (tok.size() == blanks + 5) {
        char sentinel[4];
        for (int k{0}; k < 4; ++k) {
          sentinel[k] = ToLowerCaseLetter(tok[blanks + k + 1]);
        }
        isSentinel = prescanner.IsCompilerDirectiveSentinel(sentinel, 4);
      }
      if (isSentinel) {
        // Keep compiler directives intact.
      } else if (skipFirst) {
        skipFirst = false;
      } else {
        // Replace the sequence with everything before the comment.
        // NOTE(review): Put(*this, 0, j - 1) copies j-1 tokens, i.e. it
        // also drops the token immediately preceding the comment --
        // confirm this count is intended and not an off-by-one.
        TokenSequence result;
        if (j > 0) {
          result.Put(*this, 0, j - 1);
        }
        swap(result);
        return *this;
      }
    }
  }
  return *this;
}
299 | |
300 | void TokenSequence::Emit(CookedSource &cooked) const { |
301 | if (auto n{char_.size()}) { |
302 | cooked.Put(&char_[0], n); |
303 | cooked.PutProvenanceMappings(provenances_); |
304 | } |
305 | } |
306 | |
307 | llvm::raw_ostream &TokenSequence::Dump(llvm::raw_ostream &o) const { |
308 | o << "TokenSequence has " << char_.size() << " chars; nextStart_ " |
309 | << nextStart_ << '\n'; |
310 | for (std::size_t j{0}; j < start_.size(); ++j) { |
311 | o << '[' << j << "] @ " << start_[j] << " '" << TokenAt(j).ToString() |
312 | << "'\n" ; |
313 | } |
314 | return o; |
315 | } |
316 | |
317 | Provenance TokenSequence::GetCharProvenance(std::size_t offset) const { |
318 | ProvenanceRange range{provenances_.Map(offset)}; |
319 | return range.start(); |
320 | } |
321 | |
322 | Provenance TokenSequence::GetTokenProvenance( |
323 | std::size_t token, std::size_t offset) const { |
324 | return GetCharProvenance(start_[token] + offset); |
325 | } |
326 | |
327 | ProvenanceRange TokenSequence::GetTokenProvenanceRange( |
328 | std::size_t token, std::size_t offset) const { |
329 | ProvenanceRange range{provenances_.Map(start_[token] + offset)}; |
330 | return range.Prefix(TokenBytes(token) - offset); |
331 | } |
332 | |
// Returns one provenance range covering 'tokens' tokens starting at
// 'token'; the range is extended only while each successive token's
// provenance immediately follows the previous one.
ProvenanceRange TokenSequence::GetIntervalProvenanceRange(
    std::size_t token, std::size_t tokens) const {
  if (tokens == 0) {
    return {};
  }
  ProvenanceRange range{provenances_.Map(start_[token])};
  // Loop body is empty: the work (annexation) happens in the condition.
  while (--tokens > 0 &&
      range.AnnexIfPredecessor(provenances_.Map(start_[++token]))) {
  }
  return range;
}
344 | |
345 | ProvenanceRange TokenSequence::GetProvenanceRange() const { |
346 | return GetIntervalProvenanceRange(0, start_.size()); |
347 | } |
348 | |
// Emits an error for each token whose first non-blank character is not
// valid in Fortran source, excepting compiler directive sentinels
// ("!dir$" &c., whether one token or two).
const TokenSequence &TokenSequence::CheckBadFortranCharacters(
    Messages &messages, const Prescanner &prescanner) const {
  std::size_t tokens{SizeInTokens()};
  for (std::size_t j{0}; j < tokens; ++j) {
    CharBlock token{TokenAt(j)};
    char ch{token.FirstNonBlank()};
    if (ch != ' ' && !IsValidFortranTokenCharacter(ch)) {
      if (ch == '!') {
        if (prescanner.IsCompilerDirectiveSentinel(token)) {
          continue;
        } else if (j + 1 < tokens &&
            prescanner.IsCompilerDirectiveSentinel(
                TokenAt(j + 1))) { // !dir$, &c.
          ++j;
          continue;
        }
      }
      // Show unprintable characters in hex, printable ones literally.
      if (ch < ' ' || ch >= '\x7f') {
        messages.Say(GetTokenProvenanceRange(j),
            "bad character (0x%02x) in Fortran token"_err_en_US, ch & 0xff);
      } else {
        messages.Say(GetTokenProvenanceRange(j),
            "bad character ('%c') in Fortran token"_err_en_US, ch);
      }
    }
  }
  return *this;
}
377 | |
// Verifies that parentheses balance across the sequence, emitting an
// error at the first offending parenthesis when they do not.
const TokenSequence &TokenSequence::CheckBadParentheses(
    Messages &messages) const {
  // First, a quick pass with no allocation for the common case
  int nesting{0};
  std::size_t tokens{SizeInTokens()};
  for (std::size_t j{0}; j < tokens; ++j) {
    CharBlock token{TokenAt(j)};
    char ch{token.OnlyNonBlank()};
    if (ch == '(') {
      ++nesting;
    } else if (ch == ')') {
      // An unmatched ')' drives nesting negative; stop scanning.
      if (nesting-- == 0) {
        break;
      }
    }
  }
  if (nesting != 0) {
    // There's an error; diagnose it
    // Second pass keeps a stack of '(' positions so that the message
    // can point at the exact unmatched parenthesis.
    std::vector<std::size_t> stack;
    for (std::size_t j{0}; j < tokens; ++j) {
      CharBlock token{TokenAt(j)};
      char ch{token.OnlyNonBlank()};
      if (ch == '(') {
        stack.push_back(j);
      } else if (ch == ')') {
        if (stack.empty()) {
          messages.Say(GetTokenProvenanceRange(j), "Unmatched ')'"_err_en_US);
          return *this;
        }
        stack.pop_back();
      }
    }
    CHECK(!stack.empty());
    messages.Say(
        GetTokenProvenanceRange(stack.back()), "Unmatched '('"_err_en_US);
  }
  return *this;
}
416 | } // namespace Fortran::parser |
417 | |