1 | //===-- lib/Parser/token-sequence.cpp -------------------------------------===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #include "flang/Parser/token-sequence.h" |
10 | |
11 | #include "prescan.h" |
12 | #include "flang/Parser/characters.h" |
13 | #include "flang/Parser/message.h" |
14 | #include "llvm/Support/raw_ostream.h" |
15 | |
16 | namespace Fortran::parser { |
17 | |
18 | TokenSequence &TokenSequence::operator=(TokenSequence &&that) { |
19 | clear(); |
20 | swap(that); |
21 | return *this; |
22 | } |
23 | |
24 | void TokenSequence::clear() { |
25 | start_.clear(); |
26 | nextStart_ = 0; |
27 | char_.clear(); |
28 | provenances_.clear(); |
29 | } |
30 | |
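// Discards the last complete token along with its characters and
// provenance bytes; the sequence must not be empty.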
31 | void TokenSequence::pop_back() { |
32 | CHECK(!start_.empty()); |
33 | CHECK(nextStart_ > start_.back()); |
34 | std::size_t bytes{nextStart_ - start_.back()}; |
35 | nextStart_ = start_.back(); |
36 | start_.pop_back(); |
37 | char_.resize(nextStart_); |
38 | provenances_.RemoveLastBytes(bytes); |
39 | } |
40 | |
41 | void TokenSequence::shrink_to_fit() { |
42 | start_.shrink_to_fit(); |
43 | char_.shrink_to_fit(); |
44 | provenances_.shrink_to_fit(); |
45 | } |
46 | |
47 | void TokenSequence::swap(TokenSequence &that) { |
48 | start_.swap(that.start_); |
49 | std::swap(nextStart_, that.nextStart_); |
50 | char_.swap(that.char_); |
51 | provenances_.swap(that.provenances_); |
52 | } |
53 | |
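// Returns the index of the first non-blank token at or after "at",
// or the total token count if only blanks remain.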
54 | std::size_t TokenSequence::SkipBlanks(std::size_t at) const { |
55 | std::size_t tokens{start_.size()}; |
56 | for (; at < tokens; ++at) { |
57 | if (!TokenAt(at).IsBlank()) { |
58 | return at; |
59 | } |
60 | } |
61 | return tokens; // even if at > tokens |
62 | } |
63 | |
64 | std::optional<std::size_t> TokenSequence::SkipBlanksBackwards( |
65 | std::size_t at) const { |
66 | while (at-- > 0) { |
67 | if (!TokenAt(at).IsBlank()) { |
68 | return at; |
69 | } |
70 | } |
71 | return std::nullopt; |
72 | } |
73 | |
74 | // C-style /*comments*/ are removed from preprocessing directive |
75 | // token sequences by the prescanner, but not C++ or Fortran |
76 | // free-form line-ending comments (//... and !...) because |
77 | // ignoring them is directive-specific. |
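// For instance, a remaining "// text" or "! text" makes this return
// false, while a lone "/" (e.g., division) makes it return true.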
78 | bool TokenSequence::IsAnythingLeft(std::size_t at) const { |
79 | std::size_t tokens{start_.size()}; |
80 | for (; at < tokens; ++at) { |
81 | auto tok{TokenAt(at)}; |
82 | const char *end{tok.end()}; |
83 | for (const char *p{tok.begin()}; p < end; ++p) { |
84 | switch (*p) { |
85 | case '/': |
86 | return p + 1 >= end || p[1] != '/'; |
87 | case '!': |
88 | return false; |
89 | case ' ': |
90 | break; |
91 | default: |
92 | return true; |
93 | } |
94 | } |
95 | } |
96 | return false; |
97 | } |
98 | |
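// Appends all of "that", closing any token currently under construction
// and rebasing the copied token offsets past the existing characters.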
99 | void TokenSequence::CopyAll(const TokenSequence &that) { |
100 | if (nextStart_ < char_.size()) { |
101 | start_.push_back(nextStart_); |
102 | } |
  std::size_t offset{char_.size()};
  for (std::size_t st : that.start_) {
    start_.push_back(st + offset);
  }
107 | char_.insert(char_.end(), that.char_.begin(), that.char_.end()); |
108 | nextStart_ = char_.size(); |
109 | provenances_.Put(that.provenances_); |
110 | } |
111 | |
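// Copies the tokens of "that", attributing their characters to the given
// provenance range rather than to their original provenances (useful,
// e.g., when the copied tokens stand for macro replacement text).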
112 | void TokenSequence::CopyWithProvenance( |
113 | const TokenSequence &that, ProvenanceRange range) { |
114 | std::size_t offset{0}; |
115 | std::size_t tokens{that.SizeInTokens()}; |
116 | for (std::size_t j{0}; j < tokens; ++j) { |
117 | CharBlock tok{that.TokenAt(j)}; |
118 | Put(tok, range.OffsetMember(offset)); |
119 | offset += tok.size(); |
120 | } |
121 | CHECK(offset == range.size()); |
122 | } |
123 | |
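// Appends "tokens" tokens of "that" starting at token "at", copying
// their original provenances one contiguous chunk at a time.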
124 | void TokenSequence::AppendRange( |
125 | const TokenSequence &that, std::size_t at, std::size_t tokens) { |
126 | ProvenanceRange provenance; |
127 | std::size_t offset{0}; |
128 | for (; tokens-- > 0; ++at) { |
129 | CharBlock tok{that.TokenAt(at)}; |
130 | std::size_t tokBytes{tok.size()}; |
131 | for (std::size_t j{0}; j < tokBytes; ++j) { |
132 | if (offset == provenance.size()) { |
133 | provenance = that.provenances_.Map(that.start_[at] + j); |
134 | offset = 0; |
135 | } |
136 | PutNextTokenChar(tok[j], provenance.OffsetMember(offset++)); |
137 | } |
138 | CloseToken(); |
139 | } |
140 | } |
141 | |
142 | void TokenSequence::Put( |
143 | const char *s, std::size_t bytes, Provenance provenance) { |
144 | for (std::size_t j{0}; j < bytes; ++j) { |
145 | PutNextTokenChar(s[j], provenance + j); |
146 | } |
147 | CloseToken(); |
148 | } |
149 | |
150 | void TokenSequence::Put(const CharBlock &t, Provenance provenance) { |
151 | // Avoid t[0] if t is empty: it would create a reference to nullptr, |
152 | // which is UB. |
153 | const char *addr{t.size() ? &t[0] : nullptr}; |
154 | Put(addr, t.size(), provenance); |
155 | } |
156 | |
157 | void TokenSequence::Put(const std::string &s, Provenance provenance) { |
158 | Put(s.data(), s.size(), provenance); |
159 | } |
160 | |
161 | void TokenSequence::Put(llvm::raw_string_ostream &ss, Provenance provenance) { |
162 | Put(ss.str(), provenance); |
163 | } |
164 | |
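// Lowercases letters outside the payloads of character and Hollerith
// literals; e.g., "3.5E2" becomes "3.5e2", "3HABC" becomes "3hABC", and
// K_"ABC" becomes k_"ABC" with the quoted text left intact.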
165 | TokenSequence &TokenSequence::ToLowerCase() { |
166 | std::size_t tokens{start_.size()}; |
167 | std::size_t chars{char_.size()}; |
168 | std::size_t atToken{0}; |
169 | for (std::size_t j{0}; j < chars;) { |
170 | std::size_t nextStart{atToken + 1 < tokens ? start_[++atToken] : chars}; |
171 | char *p{&char_[j]}; |
172 | char const *limit{char_.data() + nextStart}; |
173 | const char *lastChar{limit - 1}; |
174 | j = nextStart; |
    // Skip leading whitespace
176 | while (p < limit - 1 && *p == ' ') { |
177 | ++p; |
178 | } |
179 | // Find last non-whitespace char |
180 | while (lastChar > p + 1 && *lastChar == ' ') { |
181 | --lastChar; |
182 | } |
183 | if (IsDecimalDigit(*p)) { |
184 | while (p < limit && IsDecimalDigit(*p)) { |
185 | ++p; |
186 | } |
187 | if (p >= limit) { |
188 | } else if (*p == 'h' || *p == 'H') { |
189 | // Hollerith |
190 | *p = 'h'; |
191 | } else if (*p == '_' && p + 1 < limit && (p[1] == '"' || p[1] == '\'')) { |
192 | // kind-prefixed character literal (e.g., 1_"ABC") |
193 | } else { |
194 | // exponent |
195 | for (; p < limit; ++p) { |
196 | *p = ToLowerCaseLetter(*p); |
197 | } |
198 | } |
199 | } else if (*lastChar == '\'' || *lastChar == '"') { |
200 | if (*p == *lastChar) { |
201 | // Character literal without prefix |
202 | } else if (p[1] == *lastChar) { |
203 | // BOZX-prefixed constant |
204 | for (; p < limit; ++p) { |
205 | *p = ToLowerCaseLetter(*p); |
206 | } |
207 | } else { |
208 | // Literal with kind-param prefix name (e.g., K_"ABC"). |
209 | for (; *p != *lastChar; ++p) { |
210 | *p = ToLowerCaseLetter(*p); |
211 | } |
212 | } |
213 | } else { |
214 | for (; p < limit; ++p) { |
215 | *p = ToLowerCaseLetter(*p); |
216 | } |
217 | } |
218 | } |
219 | return *this; |
220 | } |
221 | |
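// In the blank-related queries and filters below, "firstChar" restricts
// attention to tokens whose characters start at or after that offset.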
222 | bool TokenSequence::HasBlanks(std::size_t firstChar) const { |
223 | std::size_t tokens{SizeInTokens()}; |
224 | for (std::size_t j{0}; j < tokens; ++j) { |
225 | if (start_[j] >= firstChar && TokenAt(j).IsBlank()) { |
226 | return true; |
227 | } |
228 | } |
229 | return false; |
230 | } |
231 | |
232 | bool TokenSequence::HasRedundantBlanks(std::size_t firstChar) const { |
233 | std::size_t tokens{SizeInTokens()}; |
234 | bool lastWasBlank{false}; |
235 | for (std::size_t j{0}; j < tokens; ++j) { |
236 | bool isBlank{TokenAt(j).IsBlank()}; |
237 | if (isBlank && lastWasBlank && start_[j] >= firstChar) { |
238 | return true; |
239 | } |
240 | lastWasBlank = isBlank; |
241 | } |
242 | return false; |
243 | } |
244 | |
245 | TokenSequence &TokenSequence::RemoveBlanks(std::size_t firstChar) { |
246 | std::size_t tokens{SizeInTokens()}; |
247 | TokenSequence result; |
248 | for (std::size_t j{0}; j < tokens; ++j) { |
249 | if (!TokenAt(j).IsBlank() || start_[j] < firstChar) { |
250 | result.AppendRange(*this, j); |
251 | } |
252 | } |
253 | swap(result); |
254 | return *this; |
255 | } |
256 | |
257 | TokenSequence &TokenSequence::RemoveRedundantBlanks(std::size_t firstChar) { |
258 | std::size_t tokens{SizeInTokens()}; |
259 | TokenSequence result; |
260 | bool lastWasBlank{false}; |
261 | for (std::size_t j{0}; j < tokens; ++j) { |
262 | bool isBlank{TokenAt(j).IsBlank()}; |
263 | if (!isBlank || !lastWasBlank || start_[j] < firstChar) { |
264 | result.AppendRange(*this, j); |
265 | } |
266 | lastWasBlank = isBlank; |
267 | } |
268 | swap(result); |
269 | return *this; |
270 | } |
271 | |
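// Truncates the sequence at the first '!' comment that is not a
// recognized compiler directive sentinel; when "skipFirst" is set, the
// first such comment is exempted.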
272 | TokenSequence &TokenSequence::ClipComment( |
273 | const Prescanner &prescanner, bool skipFirst) { |
274 | std::size_t tokens{SizeInTokens()}; |
275 | for (std::size_t j{0}; j < tokens; ++j) { |
276 | CharBlock tok{TokenAt(j)}; |
277 | if (std::size_t blanks{tok.CountLeadingBlanks()}; |
278 | blanks < tok.size() && tok[blanks] == '!') { |
279 | // Retain active compiler directive sentinels (e.g. "!dir$") |
280 | for (std::size_t k{j + 1}; k < tokens && tok.size() <= blanks + 5; ++k) { |
281 | if (tok.begin() + tok.size() == TokenAt(k).begin()) { |
282 | tok.ExtendToCover(TokenAt(k)); |
283 | } else { |
284 | break; |
285 | } |
286 | } |
287 | bool isSentinel{false}; |
288 | if (tok.size() > blanks + 5) { |
289 | isSentinel = prescanner.IsCompilerDirectiveSentinel(&tok[blanks + 1]) |
290 | .has_value(); |
291 | } |
292 | if (isSentinel) { |
293 | } else if (skipFirst) { |
294 | skipFirst = false; |
295 | } else { |
296 | TokenSequence result; |
297 | if (j > 0) { |
298 | result.AppendRange(*this, 0, j - 1); |
299 | } |
300 | swap(result); |
301 | return *this; |
302 | } |
303 | } |
304 | } |
305 | return *this; |
306 | } |
307 | |
308 | void TokenSequence::Emit(CookedSource &cooked) const { |
309 | if (auto n{char_.size()}) { |
310 | cooked.Put(&char_[0], n); |
311 | cooked.PutProvenanceMappings(provenances_); |
312 | } |
313 | } |
314 | |
315 | llvm::raw_ostream &TokenSequence::Dump(llvm::raw_ostream &o) const { |
316 | o << "TokenSequence has " << char_.size() << " chars; nextStart_ " |
317 | << nextStart_ << '\n'; |
318 | for (std::size_t j{0}; j < start_.size(); ++j) { |
    o << '[' << j << "] @ " << start_[j] << " '" << TokenAt(j).ToString()
      << "'\n";
321 | } |
  provenances_.Dump(o << "provenances_:\n");
323 | return o; |
324 | } |
325 | |
326 | Provenance TokenSequence::GetCharProvenance(std::size_t offset) const { |
327 | ProvenanceRange range{provenances_.Map(offset)}; |
328 | return range.start(); |
329 | } |
330 | |
331 | Provenance TokenSequence::GetTokenProvenance( |
332 | std::size_t token, std::size_t offset) const { |
333 | return GetCharProvenance(start_[token] + offset); |
334 | } |
335 | |
336 | ProvenanceRange TokenSequence::GetTokenProvenanceRange( |
337 | std::size_t token, std::size_t offset) const { |
338 | ProvenanceRange range{provenances_.Map(start_[token] + offset)}; |
339 | return range.Prefix(TokenBytes(token) - offset); |
340 | } |
341 | |
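// Merges the provenance ranges of successive tokens for as long as each
// token's range immediately follows its predecessor's.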
342 | ProvenanceRange TokenSequence::GetIntervalProvenanceRange( |
343 | std::size_t token, std::size_t tokens) const { |
344 | if (tokens == 0) { |
345 | return {}; |
346 | } |
347 | ProvenanceRange range{provenances_.Map(start_[token])}; |
348 | while (--tokens > 0 && |
349 | range.AnnexIfPredecessor(provenances_.Map(start_[++token]))) { |
350 | } |
351 | return range; |
352 | } |
353 | |
354 | ProvenanceRange TokenSequence::GetProvenanceRange() const { |
355 | return GetIntervalProvenanceRange(0, start_.size()); |
356 | } |
357 | |
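// Diagnoses characters that are invalid in Fortran tokens, excusing
// compiler directive sentinels and, in preprocessing-only mode, stray
// '!' and '&' characters.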
358 | const TokenSequence &TokenSequence::CheckBadFortranCharacters( |
359 | Messages &messages, const Prescanner &prescanner, |
360 | bool preprocessingOnly) const { |
361 | std::size_t tokens{SizeInTokens()}; |
362 | for (std::size_t j{0}; j < tokens; ++j) { |
363 | CharBlock token{TokenAt(j)}; |
364 | char ch{token.FirstNonBlank()}; |
365 | if (ch != ' ' && !IsValidFortranTokenCharacter(ch)) { |
366 | if (ch == '!') { |
367 | if (prescanner.IsCompilerDirectiveSentinel(token)) { |
368 | continue; |
369 | } else if (j + 1 < tokens && |
370 | prescanner.IsCompilerDirectiveSentinel( |
371 | TokenAt(j + 1))) { // !dir$, &c. |
372 | ++j; |
373 | continue; |
374 | } else if (preprocessingOnly) { |
375 | continue; |
376 | } |
377 | } else if (ch == '&' && preprocessingOnly) { |
378 | continue; |
379 | } |
380 | if (ch < ' ' || ch >= '\x7f') { |
        messages.Say(GetTokenProvenanceRange(j),
            "bad character (0x%02x) in Fortran token"_err_en_US, ch & 0xff);
      } else {
        messages.Say(GetTokenProvenanceRange(j),
            "bad character ('%c') in Fortran token"_err_en_US, ch);
386 | } |
387 | } |
388 | } |
389 | return *this; |
390 | } |
391 | |
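// Returns true when parentheses are unbalanced or when a ')' appears
// before any matching '(' has been seen.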
392 | bool TokenSequence::BadlyNestedParentheses() const { |
393 | int nesting{0}; |
394 | std::size_t tokens{SizeInTokens()}; |
395 | for (std::size_t j{0}; j < tokens; ++j) { |
396 | CharBlock token{TokenAt(j)}; |
397 | char ch{token.OnlyNonBlank()}; |
398 | if (ch == '(') { |
399 | ++nesting; |
400 | } else if (ch == ')') { |
401 | if (nesting-- == 0) { |
402 | break; |
403 | } |
404 | } |
405 | } |
406 | return nesting != 0; |
407 | } |
408 | |
409 | const TokenSequence &TokenSequence::CheckBadParentheses( |
410 | Messages &messages) const { |
411 | if (BadlyNestedParentheses()) { |
412 | // There's an error; diagnose it |
413 | std::size_t tokens{SizeInTokens()}; |
414 | std::vector<std::size_t> stack; |
415 | for (std::size_t j{0}; j < tokens; ++j) { |
416 | CharBlock token{TokenAt(j)}; |
417 | char ch{token.OnlyNonBlank()}; |
418 | if (ch == '(') { |
419 | stack.push_back(j); |
420 | } else if (ch == ')') { |
421 | if (stack.empty()) { |
422 | messages.Say(GetTokenProvenanceRange(j), "Unmatched ')'"_err_en_US ); |
423 | return *this; |
424 | } |
425 | stack.pop_back(); |
426 | } |
427 | } |
428 | CHECK(!stack.empty()); |
    messages.Say(
        GetTokenProvenanceRange(stack.back()), "Unmatched '('"_err_en_US);
431 | } |
432 | return *this; |
433 | } |
434 | } // namespace Fortran::parser |
435 | |