| 1 | //===-- lib/Parser/token-sequence.cpp -------------------------------------===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | |
| 9 | #include "flang/Parser/token-sequence.h" |
| 10 | |
| 11 | #include "prescan.h" |
| 12 | #include "flang/Parser/characters.h" |
| 13 | #include "flang/Parser/message.h" |
| 14 | #include "llvm/Support/raw_ostream.h" |
| 15 | |
| 16 | namespace Fortran::parser { |
| 17 | |
// Move assignment: discard this sequence's current contents, then take
// over 'that' by swapping state into the now-empty *this.  (A self-move
// therefore leaves the sequence empty, which is valid-but-unspecified.)
TokenSequence &TokenSequence::operator=(TokenSequence &&that) {
  clear();
  swap(that);
  return *this;
}
| 23 | |
// Reset to an empty sequence.  Per the vector::clear contract, the
// underlying capacity is not necessarily released (see shrink_to_fit).
void TokenSequence::clear() {
  start_.clear(); // character offsets at which each token begins
  nextStart_ = 0; // offset at which the next token's characters begin
  char_.clear(); // the token characters themselves
  provenances_.clear(); // origin mapping for each character
}
| 30 | |
| 31 | void TokenSequence::pop_back() { |
| 32 | CHECK(!start_.empty()); |
| 33 | // If the last token is empty then `nextStart_ == start_.back()`. |
| 34 | CHECK(nextStart_ >= start_.back()); |
| 35 | std::size_t bytes{nextStart_ - start_.back()}; |
| 36 | nextStart_ = start_.back(); |
| 37 | start_.pop_back(); |
| 38 | char_.resize(nextStart_); |
| 39 | provenances_.RemoveLastBytes(bytes); |
| 40 | } |
| 41 | |
// Ask each underlying container to release any excess capacity.
void TokenSequence::shrink_to_fit() {
  start_.shrink_to_fit();
  char_.shrink_to_fit();
  provenances_.shrink_to_fit();
}
| 47 | |
// Exchange all state with 'that'; used by move assignment and by the
// rebuild-then-swap editing pattern elsewhere in this file.
void TokenSequence::swap(TokenSequence &that) {
  start_.swap(that.start_);
  std::swap(nextStart_, that.nextStart_);
  char_.swap(that.char_);
  provenances_.swap(that.provenances_);
}
| 54 | |
| 55 | std::size_t TokenSequence::SkipBlanks(std::size_t at) const { |
| 56 | std::size_t tokens{start_.size()}; |
| 57 | for (; at < tokens; ++at) { |
| 58 | if (!TokenAt(at).IsBlank()) { |
| 59 | return at; |
| 60 | } |
| 61 | } |
| 62 | return tokens; // even if at > tokens |
| 63 | } |
| 64 | |
| 65 | std::optional<std::size_t> TokenSequence::SkipBlanksBackwards( |
| 66 | std::size_t at) const { |
| 67 | while (at-- > 0) { |
| 68 | if (!TokenAt(at).IsBlank()) { |
| 69 | return at; |
| 70 | } |
| 71 | } |
| 72 | return std::nullopt; |
| 73 | } |
| 74 | |
| 75 | // C-style /*comments*/ are removed from preprocessing directive |
| 76 | // token sequences by the prescanner, but not C++ or Fortran |
| 77 | // free-form line-ending comments (//... and !...) because |
| 78 | // ignoring them is directive-specific. |
| 79 | bool TokenSequence::IsAnythingLeft(std::size_t at) const { |
| 80 | std::size_t tokens{start_.size()}; |
| 81 | for (; at < tokens; ++at) { |
| 82 | auto tok{TokenAt(at)}; |
| 83 | const char *end{tok.end()}; |
| 84 | for (const char *p{tok.begin()}; p < end; ++p) { |
| 85 | switch (*p) { |
| 86 | case '/': |
| 87 | return p + 1 >= end || p[1] != '/'; |
| 88 | case '!': |
| 89 | return false; |
| 90 | case ' ': |
| 91 | break; |
| 92 | default: |
| 93 | return true; |
| 94 | } |
| 95 | } |
| 96 | } |
| 97 | return false; |
| 98 | } |
| 99 | |
| 100 | void TokenSequence::CopyAll(const TokenSequence &that) { |
| 101 | if (nextStart_ < char_.size()) { |
| 102 | start_.push_back(nextStart_); |
| 103 | } |
| 104 | int offset = char_.size(); |
| 105 | for (int st : that.start_) { |
| 106 | start_.push_back(st + offset); |
| 107 | } |
| 108 | char_.insert(char_.end(), that.char_.begin(), that.char_.end()); |
| 109 | nextStart_ = char_.size(); |
| 110 | provenances_.Put(that.provenances_); |
| 111 | } |
| 112 | |
| 113 | void TokenSequence::CopyWithProvenance( |
| 114 | const TokenSequence &that, ProvenanceRange range) { |
| 115 | std::size_t offset{0}; |
| 116 | std::size_t tokens{that.SizeInTokens()}; |
| 117 | for (std::size_t j{0}; j < tokens; ++j) { |
| 118 | CharBlock tok{that.TokenAt(j)}; |
| 119 | Put(tok, range.OffsetMember(offset)); |
| 120 | offset += tok.size(); |
| 121 | } |
| 122 | CHECK(offset == range.size()); |
| 123 | } |
| 124 | |
// Append 'tokens' tokens of 'that', starting at token index 'at',
// copying each character together with its original provenance.
void TokenSequence::AppendRange(
    const TokenSequence &that, std::size_t at, std::size_t tokens) {
  ProvenanceRange provenance;
  std::size_t offset{0};
  for (; tokens-- > 0; ++at) {
    CharBlock tok{that.TokenAt(at)};
    std::size_t tokBytes{tok.size()};
    for (std::size_t j{0}; j < tokBytes; ++j) {
      // 'provenance' caches one contiguous interval of the source map;
      // refill it whenever the cached interval has been fully consumed.
      if (offset == provenance.size()) {
        provenance = that.provenances_.Map(that.start_[at] + j);
        offset = 0;
      }
      PutNextTokenChar(tok[j], provenance.OffsetMember(offset++));
    }
    CloseToken(); // preserve the original token boundaries
  }
}
| 142 | |
| 143 | void TokenSequence::Put( |
| 144 | const char *s, std::size_t bytes, Provenance provenance) { |
| 145 | for (std::size_t j{0}; j < bytes; ++j) { |
| 146 | PutNextTokenChar(s[j], provenance + j); |
| 147 | } |
| 148 | CloseToken(); |
| 149 | } |
| 150 | |
| 151 | void TokenSequence::Put(const CharBlock &t, Provenance provenance) { |
| 152 | // Avoid t[0] if t is empty: it would create a reference to nullptr, |
| 153 | // which is UB. |
| 154 | const char *addr{t.size() ? &t[0] : nullptr}; |
| 155 | Put(addr, t.size(), provenance); |
| 156 | } |
| 157 | |
// Append the contents of a std::string as one token.
void TokenSequence::Put(const std::string &s, Provenance provenance) {
  Put(s.data(), s.size(), provenance);
}
| 161 | |
// Append the text accumulated in a raw_string_ostream as one token.
void TokenSequence::Put(llvm::raw_string_ostream &ss, Provenance provenance) {
  Put(ss.str(), provenance);
}
| 165 | |
// Lower-case the sequence in place, token by token, while preserving the
// case of character and Hollerith literal payloads.
TokenSequence &TokenSequence::ToLowerCase() {
  std::size_t tokens{start_.size()};
  std::size_t chars{char_.size()};
  std::size_t atToken{0};
  for (std::size_t j{0}; j < chars;) {
    // [j, nextStart) delimits the current token's characters.
    std::size_t nextStart{atToken + 1 < tokens ? start_[++atToken] : chars};
    char *p{&char_[j]};
    char const *limit{char_.data() + nextStart};
    const char *lastChar{limit - 1};
    j = nextStart;
    // Skip leading whitespaces
    while (p < limit - 1 && *p == ' ') {
      ++p;
    }
    // Find last non-whitespace char
    while (lastChar > p + 1 && *lastChar == ' ') {
      --lastChar;
    }
    if (IsDecimalDigit(*p)) {
      // Token begins with digits: integer/real literal, Hollerith, or a
      // numeric-kind-prefixed character literal.
      while (p < limit && IsDecimalDigit(*p)) {
        ++p;
      }
      if (p >= limit) {
        // all digits: nothing to lower-case
      } else if (*p == 'h' || *p == 'H') {
        // Hollerith
        *p = 'h';
      } else if (*p == '_' && p + 1 < limit && (p[1] == '"' || p[1] == '\'')) {
        // kind-prefixed character literal (e.g., 1_"ABC")
      } else {
        // exponent
        for (; p < limit; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      }
    } else if (*lastChar == '\'' || *lastChar == '"') {
      // Token ends with a quote: some form of character/BOZ literal.
      if (*p == *lastChar) {
        // Character literal without prefix
      } else if (p[1] == *lastChar) {
        // BOZX-prefixed constant
        for (; p < limit; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      } else {
        // Literal with kind-param prefix name (e.g., K_"ABC").
        // Lower-case only the prefix, stopping at the opening quote.
        for (; *p != *lastChar; ++p) {
          *p = ToLowerCaseLetter(*p);
        }
      }
    } else {
      // Ordinary token (identifier, keyword, operator): lower it all.
      for (; p < limit; ++p) {
        *p = ToLowerCaseLetter(*p);
      }
    }
  }
  return *this;
}
| 222 | |
| 223 | bool TokenSequence::HasBlanks(std::size_t firstChar) const { |
| 224 | std::size_t tokens{SizeInTokens()}; |
| 225 | for (std::size_t j{0}; j < tokens; ++j) { |
| 226 | if (start_[j] >= firstChar && TokenAt(j).IsBlank()) { |
| 227 | return true; |
| 228 | } |
| 229 | } |
| 230 | return false; |
| 231 | } |
| 232 | |
| 233 | bool TokenSequence::HasRedundantBlanks(std::size_t firstChar) const { |
| 234 | std::size_t tokens{SizeInTokens()}; |
| 235 | bool lastWasBlank{false}; |
| 236 | for (std::size_t j{0}; j < tokens; ++j) { |
| 237 | bool isBlank{TokenAt(j).IsBlank()}; |
| 238 | if (isBlank && lastWasBlank && start_[j] >= firstChar) { |
| 239 | return true; |
| 240 | } |
| 241 | lastWasBlank = isBlank; |
| 242 | } |
| 243 | return false; |
| 244 | } |
| 245 | |
| 246 | TokenSequence &TokenSequence::RemoveBlanks(std::size_t firstChar) { |
| 247 | std::size_t tokens{SizeInTokens()}; |
| 248 | TokenSequence result; |
| 249 | for (std::size_t j{0}; j < tokens; ++j) { |
| 250 | if (!TokenAt(j).IsBlank() || start_[j] < firstChar) { |
| 251 | result.AppendRange(*this, j); |
| 252 | } |
| 253 | } |
| 254 | swap(result); |
| 255 | return *this; |
| 256 | } |
| 257 | |
| 258 | TokenSequence &TokenSequence::RemoveRedundantBlanks(std::size_t firstChar) { |
| 259 | std::size_t tokens{SizeInTokens()}; |
| 260 | TokenSequence result; |
| 261 | bool lastWasBlank{false}; |
| 262 | for (std::size_t j{0}; j < tokens; ++j) { |
| 263 | bool isBlank{TokenAt(j).IsBlank()}; |
| 264 | if (!isBlank || !lastWasBlank || start_[j] < firstChar) { |
| 265 | result.AppendRange(*this, j); |
| 266 | } |
| 267 | lastWasBlank = isBlank; |
| 268 | } |
| 269 | swap(result); |
| 270 | return *this; |
| 271 | } |
| 272 | |
// Truncate the sequence at a '!' comment, unless the comment is an
// active compiler directive sentinel (e.g. "!dir$").  When 'skipFirst'
// is set, the first non-sentinel comment encountered is retained.
TokenSequence &TokenSequence::ClipComment(
    const Prescanner &prescanner, bool skipFirst) {
  std::size_t tokens{SizeInTokens()};
  for (std::size_t j{0}; j < tokens; ++j) {
    CharBlock tok{TokenAt(j)};
    if (std::size_t blanks{tok.CountLeadingBlanks()};
        blanks < tok.size() && tok[blanks] == '!') {
      // Retain active compiler directive sentinels (e.g. "!dir$")
      // Extend 'tok' over immediately adjacent tokens so that a sentinel
      // split across several tokens can still be recognized.
      for (std::size_t k{j + 1}; k < tokens && tok.size() <= blanks + 5; ++k) {
        if (tok.begin() + tok.size() == TokenAt(k).begin()) {
          tok.ExtendToCover(TokenAt(k));
        } else {
          break;
        }
      }
      bool isSentinel{false};
      if (tok.size() > blanks + 5) {
        // Probe the text after the '!' for a recognized sentinel.
        isSentinel = prescanner.IsCompilerDirectiveSentinel(&tok[blanks + 1])
                         .has_value();
      }
      if (isSentinel) {
      } else if (skipFirst) {
        skipFirst = false;
      } else {
        // Truncate just before the comment.
        // NOTE(review): this keeps j-1 tokens (indices 0..j-2), so the
        // token immediately preceding the comment is dropped as well --
        // confirm that it is always a blank separator.
        TokenSequence result;
        if (j > 0) {
          result.AppendRange(*this, 0, j - 1);
        }
        swap(result);
        return *this;
      }
    }
  }
  return *this;
}
| 308 | |
| 309 | void TokenSequence::Emit(CookedSource &cooked) const { |
| 310 | if (auto n{char_.size()}) { |
| 311 | cooked.Put(&char_[0], n); |
| 312 | cooked.PutProvenanceMappings(provenances_); |
| 313 | } |
| 314 | } |
| 315 | |
| 316 | llvm::raw_ostream &TokenSequence::Dump(llvm::raw_ostream &o) const { |
| 317 | o << "TokenSequence has " << char_.size() << " chars; nextStart_ " |
| 318 | << nextStart_ << '\n'; |
| 319 | for (std::size_t j{0}; j < start_.size(); ++j) { |
| 320 | o << '[' << j << "] @ " << start_[j] << " '" << TokenAt(j).ToString() |
| 321 | << "'\n" ; |
| 322 | } |
| 323 | provenances_.Dump(o << "provenances_:\n" ); |
| 324 | return o; |
| 325 | } |
| 326 | |
| 327 | Provenance TokenSequence::GetCharProvenance(std::size_t offset) const { |
| 328 | ProvenanceRange range{provenances_.Map(offset)}; |
| 329 | return range.start(); |
| 330 | } |
| 331 | |
// Provenance of the character at 'offset' within token number 'token'.
Provenance TokenSequence::GetTokenProvenance(
    std::size_t token, std::size_t offset) const {
  return GetCharProvenance(start_[token] + offset);
}
| 336 | |
// Provenance range covering token number 'token' starting at 'offset',
// clipped to the remaining length of the token.
ProvenanceRange TokenSequence::GetTokenProvenanceRange(
    std::size_t token, std::size_t offset) const {
  ProvenanceRange range{provenances_.Map(start_[token] + offset)};
  return range.Prefix(TokenBytes(token) - offset);
}
| 342 | |
// Provenance range of 'tokens' consecutive tokens beginning at 'token'.
// Successive tokens are annexed only while their provenances are
// contiguous; the range ends at the first discontinuity.
ProvenanceRange TokenSequence::GetIntervalProvenanceRange(
    std::size_t token, std::size_t tokens) const {
  if (tokens == 0) {
    return {};
  }
  ProvenanceRange range{provenances_.Map(start_[token])};
  // Loop body is empty: AnnexIfPredecessor both tests contiguity and
  // extends 'range' as a side effect.
  while (--tokens > 0 &&
      range.AnnexIfPredecessor(provenances_.Map(start_[++token]))) {
  }
  return range;
}
| 354 | |
// Provenance range of the whole sequence (subject to the contiguity
// clipping performed by GetIntervalProvenanceRange).
ProvenanceRange TokenSequence::GetProvenanceRange() const {
  return GetIntervalProvenanceRange(0, start_.size());
}
| 358 | |
// Scan every token for characters that are not valid in Fortran source
// and report each offender via 'messages'.  Compiler directive sentinels
// are exempt, as are '!' and '&' when only preprocessing.
const TokenSequence &TokenSequence::CheckBadFortranCharacters(
    Messages &messages, const Prescanner &prescanner,
    bool preprocessingOnly) const {
  std::size_t tokens{SizeInTokens()};
  for (std::size_t j{0}; j < tokens; ++j) {
    CharBlock token{TokenAt(j)};
    char ch{token.FirstNonBlank()};
    if (ch != ' ' && !IsValidFortranTokenCharacter(ch)) {
      if (ch == '!') {
        if (prescanner.IsCompilerDirectiveSentinel(token)) {
          // The token is itself a complete sentinel.
          continue;
        } else if (j + 1 < tokens &&
            prescanner.IsCompilerDirectiveSentinel(
                TokenAt(j + 1))) { // !dir$, &c.
          ++j; // also skip the sentinel token that follows the '!'
          continue;
        } else if (preprocessingOnly) {
          continue;
        }
      } else if (ch == '&' && preprocessingOnly) {
        // Free-form continuation marker is acceptable here.
        continue;
      }
      if (ch < ' ' || ch >= '\x7f') {
        // Not printable ASCII: report the code point in hex.
        messages.Say(GetTokenProvenanceRange(j),
            "bad character (0x%02x) in Fortran token"_err_en_US, ch & 0xff);
      } else {
        messages.Say(GetTokenProvenanceRange(j),
            "bad character ('%c') in Fortran token"_err_en_US, ch);
      }
    }
  }
  return *this;
}
| 392 | |
| 393 | bool TokenSequence::BadlyNestedParentheses() const { |
| 394 | int nesting{0}; |
| 395 | std::size_t tokens{SizeInTokens()}; |
| 396 | for (std::size_t j{0}; j < tokens; ++j) { |
| 397 | CharBlock token{TokenAt(j)}; |
| 398 | char ch{token.OnlyNonBlank()}; |
| 399 | if (ch == '(') { |
| 400 | ++nesting; |
| 401 | } else if (ch == ')') { |
| 402 | if (nesting-- == 0) { |
| 403 | break; |
| 404 | } |
| 405 | } |
| 406 | } |
| 407 | return nesting != 0; |
| 408 | } |
| 409 | |
// If parentheses are unbalanced, emit an error pinpointing either the
// first unmatched ')' or the last unmatched '('.
const TokenSequence &TokenSequence::CheckBadParentheses(
    Messages &messages) const {
  if (BadlyNestedParentheses()) {
    // There's an error; diagnose it
    std::size_t tokens{SizeInTokens()};
    std::vector<std::size_t> stack; // indices of currently-open '(' tokens
    for (std::size_t j{0}; j < tokens; ++j) {
      CharBlock token{TokenAt(j)};
      char ch{token.OnlyNonBlank()};
      if (ch == '(') {
        stack.push_back(j);
      } else if (ch == ')') {
        if (stack.empty()) {
          messages.Say(GetTokenProvenanceRange(j), "Unmatched ')'"_err_en_US);
          return *this;
        }
        stack.pop_back();
      }
    }
    // No unmatched ')' was found, so a '(' must remain open.
    CHECK(!stack.empty());
    messages.Say(
        GetTokenProvenanceRange(stack.back()), "Unmatched '('"_err_en_US);
  }
  return *this;
}
| 435 | } // namespace Fortran::parser |
| 436 | |