| 1 | //===-- lib/Parser/token-sequence.cpp -------------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "flang/Parser/token-sequence.h" |
| 10 | |
| 11 | #include "prescan.h" |
| 12 | #include "flang/Parser/characters.h" |
| 13 | #include "flang/Parser/message.h" |
| 14 | #include "llvm/Support/raw_ostream.h" |
| 15 | |
| 16 | namespace Fortran::parser { |
| 17 | |
| 18 | TokenSequence &TokenSequence::operator=(TokenSequence &&that) { |
| 19 | clear(); |
| 20 | swap(that); |
| 21 | return *this; |
| 22 | } |
| 23 | |
| 24 | void TokenSequence::clear() { |
| 25 | start_.clear(); |
| 26 | nextStart_ = 0; |
| 27 | char_.clear(); |
| 28 | provenances_.clear(); |
| 29 | } |
| 30 | |
// Removes the last token, discarding its characters and the
// corresponding provenance bytes.
void TokenSequence::pop_back() {
  CHECK(!start_.empty());
  CHECK(nextStart_ > start_.back());
  // Number of characters belonging to the final token
  std::size_t bytes{nextStart_ - start_.back()};
  nextStart_ = start_.back();
  start_.pop_back();
  char_.resize(nextStart_);
  provenances_.RemoveLastBytes(bytes);
}
| 40 | |
| 41 | void TokenSequence::shrink_to_fit() { |
| 42 | start_.shrink_to_fit(); |
| 43 | char_.shrink_to_fit(); |
| 44 | provenances_.shrink_to_fit(); |
| 45 | } |
| 46 | |
| 47 | void TokenSequence::swap(TokenSequence &that) { |
| 48 | start_.swap(that.start_); |
| 49 | std::swap(nextStart_, that.nextStart_); |
| 50 | char_.swap(that.char_); |
| 51 | provenances_.swap(that.provenances_); |
| 52 | } |
| 53 | |
| 54 | std::size_t TokenSequence::SkipBlanks(std::size_t at) const { |
| 55 | std::size_t tokens{start_.size()}; |
| 56 | for (; at < tokens; ++at) { |
| 57 | if (!TokenAt(at).IsBlank()) { |
| 58 | return at; |
| 59 | } |
| 60 | } |
| 61 | return tokens; // even if at > tokens |
| 62 | } |
| 63 | |
| 64 | std::optional<std::size_t> TokenSequence::SkipBlanksBackwards( |
| 65 | std::size_t at) const { |
| 66 | while (at-- > 0) { |
| 67 | if (!TokenAt(at).IsBlank()) { |
| 68 | return at; |
| 69 | } |
| 70 | } |
| 71 | return std::nullopt; |
| 72 | } |
| 73 | |
| 74 | // C-style /*comments*/ are removed from preprocessing directive |
| 75 | // token sequences by the prescanner, but not C++ or Fortran |
| 76 | // free-form line-ending comments (//... and !...) because |
| 77 | // ignoring them is directive-specific. |
| 78 | bool TokenSequence::IsAnythingLeft(std::size_t at) const { |
| 79 | std::size_t tokens{start_.size()}; |
| 80 | for (; at < tokens; ++at) { |
| 81 | auto tok{TokenAt(at)}; |
| 82 | const char *end{tok.end()}; |
| 83 | for (const char *p{tok.begin()}; p < end; ++p) { |
| 84 | switch (*p) { |
| 85 | case '/': |
| 86 | return p + 1 >= end || p[1] != '/'; |
| 87 | case '!': |
| 88 | return false; |
| 89 | case ' ': |
| 90 | break; |
| 91 | default: |
| 92 | return true; |
| 93 | } |
| 94 | } |
| 95 | } |
| 96 | return false; |
| 97 | } |
| 98 | |
| 99 | void TokenSequence::CopyAll(const TokenSequence &that) { |
| 100 | if (nextStart_ < char_.size()) { |
| 101 | start_.push_back(nextStart_); |
| 102 | } |
| 103 | int offset = char_.size(); |
| 104 | for (int st : that.start_) { |
| 105 | start_.push_back(st + offset); |
| 106 | } |
| 107 | char_.insert(char_.end(), that.char_.begin(), that.char_.end()); |
| 108 | nextStart_ = char_.size(); |
| 109 | provenances_.Put(that.provenances_); |
| 110 | } |
| 111 | |
| 112 | void TokenSequence::CopyWithProvenance( |
| 113 | const TokenSequence &that, ProvenanceRange range) { |
| 114 | std::size_t offset{0}; |
| 115 | std::size_t tokens{that.SizeInTokens()}; |
| 116 | for (std::size_t j{0}; j < tokens; ++j) { |
| 117 | CharBlock tok{that.TokenAt(j)}; |
| 118 | Put(tok, range.OffsetMember(offset)); |
| 119 | offset += tok.size(); |
| 120 | } |
| 121 | CHECK(offset == range.size()); |
| 122 | } |
| 123 | |
// Appends "tokens" tokens of "that" starting at token index "at",
// rebuilding each token character by character so that the associated
// provenance mappings are carried over faithfully.
void TokenSequence::AppendRange(
    const TokenSequence &that, std::size_t at, std::size_t tokens) {
  ProvenanceRange provenance;
  std::size_t offset{0};
  for (; tokens-- > 0; ++at) {
    CharBlock tok{that.TokenAt(at)};
    std::size_t tokBytes{tok.size()};
    for (std::size_t j{0}; j < tokBytes; ++j) {
      // Fetch a fresh provenance interval whenever the current one is
      // exhausted (initially 0 == 0, so the first character maps too).
      if (offset == provenance.size()) {
        provenance = that.provenances_.Map(that.start_[at] + j);
        offset = 0;
      }
      PutNextTokenChar(tok[j], provenance.OffsetMember(offset++));
    }
    CloseToken();
  }
}
| 141 | |
| 142 | void TokenSequence::Put( |
| 143 | const char *s, std::size_t bytes, Provenance provenance) { |
| 144 | for (std::size_t j{0}; j < bytes; ++j) { |
| 145 | PutNextTokenChar(s[j], provenance + j); |
| 146 | } |
| 147 | CloseToken(); |
| 148 | } |
| 149 | |
| 150 | void TokenSequence::Put(const CharBlock &t, Provenance provenance) { |
| 151 | // Avoid t[0] if t is empty: it would create a reference to nullptr, |
| 152 | // which is UB. |
| 153 | const char *addr{t.size() ? &t[0] : nullptr}; |
| 154 | Put(addr, t.size(), provenance); |
| 155 | } |
| 156 | |
| 157 | void TokenSequence::Put(const std::string &s, Provenance provenance) { |
| 158 | Put(s.data(), s.size(), provenance); |
| 159 | } |
| 160 | |
| 161 | void TokenSequence::Put(llvm::raw_string_ostream &ss, Provenance provenance) { |
| 162 | Put(ss.str(), provenance); |
| 163 | } |
| 164 | |
// Lowers the case of letters in every token in place, with carve-outs
// for Hollerith constants, character literals, and kind-prefixed
// literals, whose payload must keep its original case.
TokenSequence &TokenSequence::ToLowerCase() {
  std::size_t tokens{start_.size()};
  std::size_t chars{char_.size()};
  std::size_t atToken{0};
  for (std::size_t j{0}; j < chars;) {
    // Character offset just past the current token
    std::size_t nextStart{atToken + 1 < tokens ? start_[++atToken] : chars};
    char *p{&char_[j]};
    char const *limit{char_.data() + nextStart};
    const char *lastChar{limit - 1};
    j = nextStart;
    // Skip leading whitespaces
    while (p < limit - 1 && *p == ' ') {
      ++p;
    }
    // Find last non-whitespace char
    while (lastChar > p + 1 && *lastChar == ' ') {
      --lastChar;
    }
    if (IsDecimalDigit(*p)) {
      // Token begins with a digit: numeric literal, Hollerith, or
      // kind-prefixed character literal.
      while (p < limit && IsDecimalDigit(*p)) {
        ++p;
      }
      if (p >= limit) {
        // All digits: nothing to do.
      } else if (*p == 'h' || *p == 'H') {
        // Hollerith
        *p = 'h';
      } else if (*p == '_' && p + 1 < limit && (p[1] == '"' || p[1] == '\'')) {
        // kind-prefixed character literal (e.g., 1_"ABC")
      } else {
        // exponent
        for (; p < limit; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      }
    } else if (*lastChar == '\'' || *lastChar == '"') {
      // Token ends with a quote: some form of character literal.
      if (*p == *lastChar) {
        // Character literal without prefix
      } else if (p[1] == *lastChar) {
        // BOZX-prefixed constant
        for (; p < limit; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      } else {
        // Literal with kind-param prefix name (e.g., K_"ABC").
        // Lower only the prefix, up to (not including) the quote.
        for (; *p != *lastChar; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      }
    } else {
      // Ordinary token (identifier, keyword, operator): lower all of it.
      for (; p < limit; ++p) {
        *p = ToLowerCaseLetter(*p);
      }
    }
  }
  return *this;
}
| 221 | |
| 222 | bool TokenSequence::HasBlanks(std::size_t firstChar) const { |
| 223 | std::size_t tokens{SizeInTokens()}; |
| 224 | for (std::size_t j{0}; j < tokens; ++j) { |
| 225 | if (start_[j] >= firstChar && TokenAt(j).IsBlank()) { |
| 226 | return true; |
| 227 | } |
| 228 | } |
| 229 | return false; |
| 230 | } |
| 231 | |
| 232 | bool TokenSequence::HasRedundantBlanks(std::size_t firstChar) const { |
| 233 | std::size_t tokens{SizeInTokens()}; |
| 234 | bool lastWasBlank{false}; |
| 235 | for (std::size_t j{0}; j < tokens; ++j) { |
| 236 | bool isBlank{TokenAt(j).IsBlank()}; |
| 237 | if (isBlank && lastWasBlank && start_[j] >= firstChar) { |
| 238 | return true; |
| 239 | } |
| 240 | lastWasBlank = isBlank; |
| 241 | } |
| 242 | return false; |
| 243 | } |
| 244 | |
| 245 | TokenSequence &TokenSequence::RemoveBlanks(std::size_t firstChar) { |
| 246 | std::size_t tokens{SizeInTokens()}; |
| 247 | TokenSequence result; |
| 248 | for (std::size_t j{0}; j < tokens; ++j) { |
| 249 | if (!TokenAt(j).IsBlank() || start_[j] < firstChar) { |
| 250 | result.AppendRange(*this, j); |
| 251 | } |
| 252 | } |
| 253 | swap(result); |
| 254 | return *this; |
| 255 | } |
| 256 | |
| 257 | TokenSequence &TokenSequence::RemoveRedundantBlanks(std::size_t firstChar) { |
| 258 | std::size_t tokens{SizeInTokens()}; |
| 259 | TokenSequence result; |
| 260 | bool lastWasBlank{false}; |
| 261 | for (std::size_t j{0}; j < tokens; ++j) { |
| 262 | bool isBlank{TokenAt(j).IsBlank()}; |
| 263 | if (!isBlank || !lastWasBlank || start_[j] < firstChar) { |
| 264 | result.AppendRange(*this, j); |
| 265 | } |
| 266 | lastWasBlank = isBlank; |
| 267 | } |
| 268 | swap(result); |
| 269 | return *this; |
| 270 | } |
| 271 | |
// Truncates the sequence at a '!' comment, unless the "comment" is an
// active compiler directive sentinel (e.g. "!dir$").  When skipFirst
// is set, the first comment encountered is retained (used when the
// sequence itself begins with a comment that should survive).
TokenSequence &TokenSequence::ClipComment(
    const Prescanner &prescanner, bool skipFirst) {
  std::size_t tokens{SizeInTokens()};
  for (std::size_t j{0}; j < tokens; ++j) {
    CharBlock tok{TokenAt(j)};
    if (std::size_t blanks{tok.CountLeadingBlanks()};
        blanks < tok.size() && tok[blanks] == '!') {
      // Retain active compiler directive sentinels (e.g. "!dir$")
      // Extend "tok" over immediately-adjacent tokens until it is long
      // enough to hold a sentinel candidate.
      for (std::size_t k{j + 1}; k < tokens && tok.size() <= blanks + 5; ++k) {
        if (tok.begin() + tok.size() == TokenAt(k).begin()) {
          tok.ExtendToCover(TokenAt(k));
        } else {
          break;
        }
      }
      bool isSentinel{false};
      if (tok.size() > blanks + 5) {
        isSentinel = prescanner.IsCompilerDirectiveSentinel(&tok[blanks + 1])
                         .has_value();
      }
      if (isSentinel) {
        // Keep directive sentinels; continue scanning.
      } else if (skipFirst) {
        skipFirst = false;
      } else {
        // Genuine comment: keep only the tokens that precede it.
        TokenSequence result;
        if (j > 0) {
          // NOTE(review): this appends j-1 tokens (indices 0..j-2), so it
          // also drops the token immediately preceding the comment token;
          // confirm that is intended rather than AppendRange(*this, 0, j).
          result.AppendRange(*this, 0, j - 1);
        }
        swap(result);
        return *this;
      }
    }
  }
  return *this;
}
| 307 | |
| 308 | void TokenSequence::Emit(CookedSource &cooked) const { |
| 309 | if (auto n{char_.size()}) { |
| 310 | cooked.Put(&char_[0], n); |
| 311 | cooked.PutProvenanceMappings(provenances_); |
| 312 | } |
| 313 | } |
| 314 | |
| 315 | llvm::raw_ostream &TokenSequence::Dump(llvm::raw_ostream &o) const { |
| 316 | o << "TokenSequence has " << char_.size() << " chars; nextStart_ " |
| 317 | << nextStart_ << '\n'; |
| 318 | for (std::size_t j{0}; j < start_.size(); ++j) { |
| 319 | o << '[' << j << "] @ " << start_[j] << " '" << TokenAt(j).ToString() |
| 320 | << "'\n" ; |
| 321 | } |
| 322 | provenances_.Dump(o << "provenances_:\n" ); |
| 323 | return o; |
| 324 | } |
| 325 | |
| 326 | Provenance TokenSequence::GetCharProvenance(std::size_t offset) const { |
| 327 | ProvenanceRange range{provenances_.Map(offset)}; |
| 328 | return range.start(); |
| 329 | } |
| 330 | |
| 331 | Provenance TokenSequence::GetTokenProvenance( |
| 332 | std::size_t token, std::size_t offset) const { |
| 333 | return GetCharProvenance(start_[token] + offset); |
| 334 | } |
| 335 | |
| 336 | ProvenanceRange TokenSequence::GetTokenProvenanceRange( |
| 337 | std::size_t token, std::size_t offset) const { |
| 338 | ProvenanceRange range{provenances_.Map(start_[token] + offset)}; |
| 339 | return range.Prefix(TokenBytes(token) - offset); |
| 340 | } |
| 341 | |
// Provenance of a run of "tokens" tokens starting at index "token".
// Consecutive tokens whose mapped provenances are adjacent are
// coalesced into one range; the scan ends at the first discontinuity.
ProvenanceRange TokenSequence::GetIntervalProvenanceRange(
    std::size_t token, std::size_t tokens) const {
  if (tokens == 0) {
    return {};
  }
  ProvenanceRange range{provenances_.Map(start_[token])};
  // AnnexIfPredecessor() grows "range" when the next token's provenance
  // immediately follows it; a false result stops the loop early.
  while (--tokens > 0 &&
      range.AnnexIfPredecessor(provenances_.Map(start_[++token]))) {
  }
  return range;
}
| 353 | |
| 354 | ProvenanceRange TokenSequence::GetProvenanceRange() const { |
| 355 | return GetIntervalProvenanceRange(0, start_.size()); |
| 356 | } |
| 357 | |
// Emits diagnostics for characters that cannot appear in a Fortran
// token.  Compiler directive sentinels are tolerated, as are '!' and
// '&' when only preprocessing.
const TokenSequence &TokenSequence::CheckBadFortranCharacters(
    Messages &messages, const Prescanner &prescanner,
    bool preprocessingOnly) const {
  std::size_t tokens{SizeInTokens()};
  for (std::size_t j{0}; j < tokens; ++j) {
    CharBlock token{TokenAt(j)};
    char ch{token.FirstNonBlank()};
    if (ch != ' ' && !IsValidFortranTokenCharacter(ch)) {
      if (ch == '!') {
        if (prescanner.IsCompilerDirectiveSentinel(token)) {
          // Whole sentinel in one token
          continue;
        } else if (j + 1 < tokens &&
            prescanner.IsCompilerDirectiveSentinel(
                TokenAt(j + 1))) { // !dir$, &c.
          ++j; // also skip over the sentinel name token
          continue;
        } else if (preprocessingOnly) {
          continue;
        }
      } else if (ch == '&' && preprocessingOnly) {
        // Free-form continuation is acceptable while preprocessing
        continue;
      }
      if (ch < ' ' || ch >= '\x7f') {
        // Unprintable character: report it in hexadecimal
        messages.Say(GetTokenProvenanceRange(j),
            "bad character (0x%02x) in Fortran token"_err_en_US , ch & 0xff);
      } else {
        messages.Say(GetTokenProvenanceRange(j),
            "bad character ('%c') in Fortran token"_err_en_US , ch);
      }
    }
  }
  return *this;
}
| 391 | |
| 392 | bool TokenSequence::BadlyNestedParentheses() const { |
| 393 | int nesting{0}; |
| 394 | std::size_t tokens{SizeInTokens()}; |
| 395 | for (std::size_t j{0}; j < tokens; ++j) { |
| 396 | CharBlock token{TokenAt(j)}; |
| 397 | char ch{token.OnlyNonBlank()}; |
| 398 | if (ch == '(') { |
| 399 | ++nesting; |
| 400 | } else if (ch == ')') { |
| 401 | if (nesting-- == 0) { |
| 402 | break; |
| 403 | } |
| 404 | } |
| 405 | } |
| 406 | return nesting != 0; |
| 407 | } |
| 408 | |
| 409 | const TokenSequence &TokenSequence::CheckBadParentheses( |
| 410 | Messages &messages) const { |
| 411 | if (BadlyNestedParentheses()) { |
| 412 | // There's an error; diagnose it |
| 413 | std::size_t tokens{SizeInTokens()}; |
| 414 | std::vector<std::size_t> stack; |
| 415 | for (std::size_t j{0}; j < tokens; ++j) { |
| 416 | CharBlock token{TokenAt(j)}; |
| 417 | char ch{token.OnlyNonBlank()}; |
| 418 | if (ch == '(') { |
| 419 | stack.push_back(j); |
| 420 | } else if (ch == ')') { |
| 421 | if (stack.empty()) { |
| 422 | messages.Say(GetTokenProvenanceRange(j), "Unmatched ')'"_err_en_US ); |
| 423 | return *this; |
| 424 | } |
| 425 | stack.pop_back(); |
| 426 | } |
| 427 | } |
| 428 | CHECK(!stack.empty()); |
| 429 | messages.Say( |
| 430 | GetTokenProvenanceRange(stack.back()), "Unmatched '('"_err_en_US ); |
| 431 | } |
| 432 | return *this; |
| 433 | } |
| 434 | } // namespace Fortran::parser |
| 435 | |