1//===-- lib/Parser/token-parsers.h ------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef FORTRAN_PARSER_TOKEN_PARSERS_H_
10#define FORTRAN_PARSER_TOKEN_PARSERS_H_
11
12// These parsers are driven by the parsers of the Fortran grammar to consume
13// the prescanned character stream and recognize context-sensitive tokens.
14
15#include "basic-parsers.h"
16#include "type-parsers.h"
17#include "flang/Common/idioms.h"
18#include "flang/Parser/char-set.h"
19#include "flang/Parser/characters.h"
20#include "flang/Parser/instrumented-parser.h"
21#include "flang/Parser/provenance.h"
22#include <cstddef>
23#include <cstring>
24#include <functional>
25#include <limits>
26#include <list>
27#include <optional>
28#include <string>
29
30namespace Fortran::parser {
31
32// "xyz"_ch matches one instance of the characters x, y, or z without skipping
33// any spaces before or after. The parser returns the location of the character
34// on success.
35class AnyOfChars {
36public:
37 using resultType = const char *;
38 constexpr AnyOfChars(const AnyOfChars &) = default;
39 constexpr AnyOfChars(SetOfChars set) : set_{set} {}
40 std::optional<const char *> Parse(ParseState &state) const {
41 if (std::optional<const char *> at{state.PeekAtNextChar()}) {
42 if (set_.Has(**at)) {
43 state.UncheckedAdvance();
44 state.set_anyTokenMatched();
45 return at;
46 }
47 }
48 state.Say(MessageExpectedText{set_});
49 return std::nullopt;
50 }
51
52private:
53 const SetOfChars set_;
54};
55
56constexpr AnyOfChars operator""_ch(const char str[], std::size_t n) {
57 return AnyOfChars{SetOfChars(str, n)};
58}
59
60constexpr auto letter{"abcdefghijklmnopqrstuvwxyz"_ch};
61constexpr auto digit{"0123456789"_ch};
62
63// Skips over optional spaces. Always succeeds.
64struct Space {
65 using resultType = Success;
66 constexpr Space() {}
67 static std::optional<Success> Parse(ParseState &state) {
68 while (std::optional<const char *> p{state.PeekAtNextChar()}) {
69 if (**p != ' ') {
70 break;
71 }
72 state.UncheckedAdvance();
73 }
74 return {Success{}};
75 }
76};
77constexpr Space space;
78
79// Skips a space that in free form requires a warning if it precedes a
80// character that could begin an identifier or keyword. Always succeeds.
81inline void MissingSpace(ParseState &state) {
82 if (!state.inFixedForm()) {
83 state.Nonstandard(
84 LanguageFeature::OptionalFreeFormSpace, "missing space"_port_en_US);
85 }
86}
87
88struct SpaceCheck {
89 using resultType = Success;
90 constexpr SpaceCheck() {}
91 static std::optional<Success> Parse(ParseState &state) {
92 if (std::optional<const char *> p{state.PeekAtNextChar()}) {
93 char ch{**p};
94 if (ch == ' ') {
95 state.UncheckedAdvance();
96 return space.Parse(state);
97 }
98 if (IsLegalInIdentifier(ch)) {
99 MissingSpace(state);
100 }
101 }
102 return {Success{}};
103 }
104};
105constexpr SpaceCheck spaceCheck;
106
107// Matches a token string. Spaces in the token string denote where
108// spaces may appear in the source; they can be made mandatory for
109// some free form keyword sequences. Missing mandatory spaces in free
110// form elicit a warning; they are not necessary for recognition.
111// Spaces before and after the token are also skipped.
112//
113// Token strings appear in the grammar as C++ user-defined literals
114// like "BIND ( C )"_tok and "SYNC ALL"_sptok. The _tok suffix is implied
115// when a string literal appears before the sequencing operator >> or
116// after the sequencing operator /. The literal "..."_id parses a
117// token that cannot be a prefix of a longer identifier.
118template <bool MandatoryFreeFormSpace = false, bool MustBeComplete = false>
119class TokenStringMatch {
120public:
121 using resultType = Success;
122 constexpr TokenStringMatch(const TokenStringMatch &) = default;
123 constexpr TokenStringMatch(const char *str, std::size_t n)
124 : str_{str}, bytes_{n} {}
125 explicit constexpr TokenStringMatch(const char *str) : str_{str} {}
126 std::optional<Success> Parse(ParseState &state) const {
127 space.Parse(state);
128 const char *start{state.GetLocation()};
129 const char *p{str_};
130 std::optional<const char *> at; // initially empty
131 for (std::size_t j{0}; j < bytes_ && *p != '\0'; ++j, ++p) {
132 bool spaceSkipping{*p == ' '};
133 if (spaceSkipping) {
134 if (j + 1 == bytes_ || p[1] == ' ' || p[1] == '\0') {
135 continue; // redundant; ignore
136 }
137 }
138 if (!at) {
139 at = nextCh.Parse(state);
140 if (!at) {
141 return std::nullopt;
142 }
143 }
144 if (spaceSkipping) {
145 if (**at == ' ') {
146 at = nextCh.Parse(state);
147 if (!at) {
148 return std::nullopt;
149 }
150 } else if constexpr (MandatoryFreeFormSpace) {
151 MissingSpace(state);
152 }
153 // 'at' remains full for next iteration
154 } else if (**at == ToLowerCaseLetter(*p)) {
155 at.reset();
156 } else {
157 state.Say(start, MessageExpectedText{str_, bytes_});
158 return std::nullopt;
159 }
160 }
161 if constexpr (MustBeComplete) {
162 if (auto after{state.PeekAtNextChar()}) {
163 if (IsLegalInIdentifier(**after)) {
164 state.Say(start, MessageExpectedText{str_, bytes_});
165 return std::nullopt;
166 }
167 }
168 }
169 state.set_anyTokenMatched();
170 if (IsLegalInIdentifier(p[-1])) {
171 return spaceCheck.Parse(state);
172 } else {
173 return space.Parse(state);
174 }
175 }
176
177private:
178 const char *const str_;
179 const std::size_t bytes_{std::string::npos};
180};
181
182constexpr TokenStringMatch<> operator""_tok(const char str[], std::size_t n) {
183 return {str, n};
184}
185
186constexpr TokenStringMatch<true> operator""_sptok(
187 const char str[], std::size_t n) {
188 return {str, n};
189}
190
191constexpr TokenStringMatch<false, true> operator""_id(
192 const char str[], std::size_t n) {
193 return {str, n};
194}
195
196template <class PA>
197inline constexpr std::enable_if_t<std::is_class_v<PA>,
198 SequenceParser<TokenStringMatch<>, PA>>
199operator>>(const char *str, const PA &p) {
200 return SequenceParser<TokenStringMatch<>, PA>{TokenStringMatch<>{str}, p};
201}
202
203template <class PA>
204inline constexpr std::enable_if_t<std::is_class_v<PA>,
205 FollowParser<PA, TokenStringMatch<>>>
206operator/(const PA &p, const char *str) {
207 return FollowParser<PA, TokenStringMatch<>>{p, TokenStringMatch<>{str}};
208}
209
210template <class PA> inline constexpr auto parenthesized(const PA &p) {
211 return "(" >> p / ")";
212}
213
214template <class PA> inline constexpr auto bracketed(const PA &p) {
215 return "[" >> p / "]";
216}
217
218// Quoted character literal constants.
219struct CharLiteralChar {
220 using resultType = std::pair<char, bool /* was escaped */>;
221 static std::optional<resultType> Parse(ParseState &state) {
222 auto at{state.GetLocation()};
223 if (std::optional<const char *> cp{nextCh.Parse(state)}) {
224 char ch{**cp};
225 if (ch == '\n') {
226 state.Say(CharBlock{at, state.GetLocation()},
227 "Unclosed character constant"_err_en_US);
228 return std::nullopt;
229 }
230 if (ch == '\\') {
231 // Most escape sequences in character literals are processed later,
232 // but we have to look for quotes here so that doubled quotes work.
233 if (std::optional<const char *> next{state.PeekAtNextChar()}) {
234 char escaped{**next};
235 if (escaped == '\'' || escaped == '"' || escaped == '\\') {
236 state.UncheckedAdvance();
237 return std::make_pair(x&: escaped, y: true);
238 }
239 }
240 }
241 return std::make_pair(x&: ch, y: false);
242 }
243 return std::nullopt;
244 }
245};
246
247template <char quote> struct CharLiteral {
248 using resultType = std::string;
249 static std::optional<std::string> Parse(ParseState &state) {
250 std::string str;
251 static constexpr auto nextch{attempt(CharLiteralChar{})};
252 while (auto ch{nextch.Parse(state)}) {
253 if (ch->second) {
254 str += '\\';
255 } else if (ch->first == quote) {
256 static constexpr auto doubled{attempt(AnyOfChars{SetOfChars{quote}})};
257 if (!doubled.Parse(state)) {
258 return str;
259 }
260 }
261 str += ch->first;
262 }
263 return std::nullopt;
264 }
265};
266
267// Parse "BOZ" binary literal quoted constants.
268// As extensions, support X as an alternate hexadecimal marker, and allow
269// BOZX markers to appear as suffixes.
270struct BOZLiteral {
271 using resultType = std::string;
272 static std::optional<resultType> Parse(ParseState &state) {
273 char base{'\0'};
274 auto baseChar{[&base](char ch) -> bool {
275 switch (ch) {
276 case 'b':
277 case 'o':
278 case 'z':
279 base = ch;
280 return true;
281 case 'x':
282 base = 'z';
283 return true;
284 default:
285 return false;
286 }
287 }};
288
289 space.Parse(state);
290 const char *start{state.GetLocation()};
291 std::optional<const char *> at{nextCh.Parse(state)};
292 if (!at) {
293 return std::nullopt;
294 }
295 if (**at == 'x' &&
296 !state.IsNonstandardOk(LanguageFeature::BOZExtensions,
297 "nonstandard BOZ literal"_port_en_US)) {
298 return std::nullopt;
299 }
300 if (baseChar(**at)) {
301 at = nextCh.Parse(state);
302 if (!at) {
303 return std::nullopt;
304 }
305 }
306
307 char quote = **at;
308 if (quote != '\'' && quote != '"') {
309 return std::nullopt;
310 }
311
312 std::string content;
313 while (true) {
314 at = nextCh.Parse(state);
315 if (!at) {
316 return std::nullopt;
317 }
318 if (**at == quote) {
319 break;
320 }
321 if (**at == ' ') {
322 continue;
323 }
324 if (!IsHexadecimalDigit(**at)) {
325 return std::nullopt;
326 }
327 content += ToLowerCaseLetter(**at);
328 }
329
330 if (!base) {
331 // extension: base allowed to appear as suffix, too
332 if (!(at = nextCh.Parse(state)) || !baseChar(**at) ||
333 !state.IsNonstandardOk(LanguageFeature::BOZExtensions,
334 "nonstandard BOZ literal"_port_en_US)) {
335 return std::nullopt;
336 }
337 spaceCheck.Parse(state);
338 }
339
340 if (content.empty()) {
341 state.Say(start, "no digit in BOZ literal"_err_en_US);
342 return std::nullopt;
343 }
344 return {std::string{base} + '"' + content + '"'};
345 }
346};
347
348// R711 digit-string -> digit [digit]...
349// N.B. not a token -- no space is skipped
350struct DigitString {
351 using resultType = CharBlock;
352 static std::optional<resultType> Parse(ParseState &state) {
353 if (std::optional<const char *> ch1{state.PeekAtNextChar()}) {
354 if (IsDecimalDigit(**ch1)) {
355 state.UncheckedAdvance();
356 while (std::optional<const char *> p{state.PeekAtNextChar()}) {
357 if (!IsDecimalDigit(**p)) {
358 break;
359 }
360 state.UncheckedAdvance();
361 }
362 return CharBlock{*ch1, state.GetLocation()};
363 }
364 }
365 return std::nullopt;
366 }
367};
368constexpr DigitString digitString;
369
370struct SignedIntLiteralConstantWithoutKind {
371 using resultType = CharBlock;
372 static std::optional<resultType> Parse(ParseState &state) {
373 resultType result{state.GetLocation()};
374 static constexpr auto sign{maybe("+-"_ch / space)};
375 if (sign.Parse(state)) {
376 if (auto digits{digitString.Parse(state)}) {
377 result.ExtendToCover(*digits);
378 return result;
379 }
380 }
381 return std::nullopt;
382 }
383};
384
385struct DigitString64 {
386 using resultType = std::uint64_t;
387 static std::optional<std::uint64_t> Parse(ParseState &state) {
388 std::optional<const char *> firstDigit{digit.Parse(state)};
389 if (!firstDigit) {
390 return std::nullopt;
391 }
392 std::uint64_t value = **firstDigit - '0';
393 bool overflow{false};
394 static constexpr auto getDigit{attempt(digit)};
395 while (auto nextDigit{getDigit.Parse(state)}) {
396 if (value > std::numeric_limits<std::uint64_t>::max() / 10) {
397 overflow = true;
398 }
399 value *= 10;
400 int digitValue = **nextDigit - '0';
401 if (value > std::numeric_limits<std::uint64_t>::max() - digitValue) {
402 overflow = true;
403 }
404 value += digitValue;
405 }
406 if (overflow) {
407 state.Say(*firstDigit, "overflow in decimal literal"_err_en_US);
408 }
409 return {value};
410 }
411};
412constexpr DigitString64 digitString64;
413
414// R707 signed-int-literal-constant -> [sign] int-literal-constant
415// N.B. Spaces are consumed before and after the sign, since the sign
416// and the int-literal-constant are distinct tokens. Does not
417// handle a trailing kind parameter.
418static std::optional<std::int64_t> SignedInteger(
419 const std::optional<std::uint64_t> &x, Location at, bool negate,
420 ParseState &state) {
421 if (!x) {
422 return std::nullopt;
423 }
424 std::uint64_t limit{std::numeric_limits<std::int64_t>::max()};
425 if (negate) {
426 limit = -(limit + 1);
427 }
428 if (*x > limit) {
429 state.Say(at, "overflow in signed decimal literal"_err_en_US);
430 }
431 std::int64_t value = *x;
432 return std::make_optional<std::int64_t>(t: negate ? -value : value);
433}
434
435// R710 signed-digit-string -> [sign] digit-string
436// N.B. Not a complete token -- no space is skipped.
437// Used only in the exponent parts of real literal constants.
438struct SignedDigitString {
439 using resultType = std::int64_t;
440 static std::optional<std::int64_t> Parse(ParseState &state) {
441 std::optional<const char *> sign{state.PeekAtNextChar()};
442 if (!sign) {
443 return std::nullopt;
444 }
445 bool negate{**sign == '-'};
446 if (negate || **sign == '+') {
447 state.UncheckedAdvance();
448 }
449 return SignedInteger(digitString64.Parse(state), *sign, negate, state);
450 }
451};
452
453// Variants of the above for use in FORMAT specifications, where spaces
454// must be ignored.
455struct DigitStringIgnoreSpaces {
456 using resultType = std::uint64_t;
457 static std::optional<std::uint64_t> Parse(ParseState &state) {
458 static constexpr auto getFirstDigit{space >> digit};
459 std::optional<const char *> firstDigit{getFirstDigit.Parse(state)};
460 if (!firstDigit) {
461 return std::nullopt;
462 }
463 std::uint64_t value = **firstDigit - '0';
464 bool overflow{false};
465 static constexpr auto getDigit{space >> attempt(digit)};
466 while (auto nextDigit{getDigit.Parse(state)}) {
467 if (value > std::numeric_limits<std::uint64_t>::max() / 10) {
468 overflow = true;
469 }
470 value *= 10;
471 int digitValue = **nextDigit - '0';
472 if (value > std::numeric_limits<std::uint64_t>::max() - digitValue) {
473 overflow = true;
474 }
475 value += digitValue;
476 }
477 if (overflow) {
478 state.Say(*firstDigit, "overflow in decimal literal"_err_en_US);
479 }
480 return value;
481 }
482};
483
484struct PositiveDigitStringIgnoreSpaces {
485 using resultType = std::int64_t;
486 static std::optional<std::int64_t> Parse(ParseState &state) {
487 Location at{state.GetLocation()};
488 return SignedInteger(
489 DigitStringIgnoreSpaces{}.Parse(state), at, false /*positive*/, state);
490 }
491};
492
493struct SignedDigitStringIgnoreSpaces {
494 using resultType = std::int64_t;
495 static std::optional<std::int64_t> Parse(ParseState &state) {
496 static constexpr auto getSign{space >> attempt("+-"_ch)};
497 bool negate{false};
498 if (std::optional<const char *> sign{getSign.Parse(state)}) {
499 negate = **sign == '-';
500 }
501 Location at{state.GetLocation()};
502 return SignedInteger(
503 DigitStringIgnoreSpaces{}.Parse(state), at, negate, state);
504 }
505};
506
507// Legacy feature: Hollerith literal constants
508struct HollerithLiteral {
509 using resultType = std::string;
510 static std::optional<std::string> Parse(ParseState &state) {
511 space.Parse(state);
512 const char *start{state.GetLocation()};
513 std::optional<std::uint64_t> charCount{
514 DigitStringIgnoreSpaces{}.Parse(state)};
515 if (!charCount || *charCount < 1) {
516 return std::nullopt;
517 }
518 static constexpr auto letterH{"h"_ch};
519 std::optional<const char *> h{letterH.Parse(state)};
520 if (!h) {
521 return std::nullopt;
522 }
523 std::string content;
524 for (auto j{*charCount}; j-- > 0;) {
525 int chBytes{UTF_8CharacterBytes(state.GetLocation())};
526 for (int bytes{chBytes}; bytes > 0; --bytes) {
527 if (std::optional<const char *> at{nextCh.Parse(state)}) {
528 if (chBytes == 1 && !IsPrintable(**at)) {
529 state.Say(start, "Bad character in Hollerith"_err_en_US);
530 return std::nullopt;
531 }
532 content += **at;
533 } else {
534 state.Say(start, "Insufficient characters in Hollerith"_err_en_US);
535 return std::nullopt;
536 }
537 }
538 }
539 return content;
540 }
541};
542
543struct ConsumedAllInputParser {
544 using resultType = Success;
545 constexpr ConsumedAllInputParser() {}
546 static inline std::optional<Success> Parse(ParseState &state) {
547 if (state.IsAtEnd()) {
548 return {Success{}};
549 }
550 return std::nullopt;
551 }
552};
553constexpr ConsumedAllInputParser consumedAllInput;
554
555template <char goal> struct SkipPast {
556 using resultType = Success;
557 constexpr SkipPast() {}
558 constexpr SkipPast(const SkipPast &) {}
559 static std::optional<Success> Parse(ParseState &state) {
560 while (std::optional<const char *> p{state.GetNextChar()}) {
561 if (**p == goal) {
562 return {Success{}};
563 }
564 }
565 return std::nullopt;
566 }
567};
568
569template <char goal> struct SkipTo {
570 using resultType = Success;
571 constexpr SkipTo() {}
572 constexpr SkipTo(const SkipTo &) {}
573 static std::optional<Success> Parse(ParseState &state) {
574 while (std::optional<const char *> p{state.PeekAtNextChar()}) {
575 if (**p == goal) {
576 return {Success{}};
577 }
578 state.UncheckedAdvance();
579 }
580 return std::nullopt;
581 }
582};
583
584// A common idiom in the Fortran grammar is an optional item (usually
585// a nonempty comma-separated list) that, if present, must follow a comma
586// and precede a doubled colon. When the item is absent, the comma must
587// not appear, and the doubled colons are optional.
588// [[, xyz] ::] is optionalBeforeColons(xyz)
589// [[, xyz]... ::] is optionalBeforeColons(nonemptyList(xyz))
590template <typename PA> inline constexpr auto optionalBeforeColons(const PA &p) {
591 using resultType = std::optional<typename PA::resultType>;
592 return "," >> construct<resultType>(p) / "::" ||
593 ("::"_tok || !","_tok) >> pure<resultType>();
594}
595template <typename PA>
596inline constexpr auto optionalListBeforeColons(const PA &p) {
597 using resultType = std::list<typename PA::resultType>;
598 return "," >> nonemptyList(p) / "::" ||
599 ("::"_tok || !","_tok) >> pure<resultType>();
600}
601
602// Skip over empty lines, leading spaces, and some compiler directives (viz.,
603// the ones that specify the source form) that might appear before the
604// next statement. Skip over empty statements (bare semicolons) when
605// not in strict standard conformance mode. Always succeeds.
606struct SkipStuffBeforeStatement {
607 using resultType = Success;
608 static std::optional<Success> Parse(ParseState &state) {
609 if (UserState * ustate{state.userState()}) {
610 if (ParsingLog * log{ustate->log()}) {
611 // Save memory: vacate the parsing log before each statement unless
612 // we're logging the whole parse for debugging.
613 if (!ustate->instrumentedParse()) {
614 log->clear();
615 }
616 }
617 }
618 while (std::optional<const char *> at{state.PeekAtNextChar()}) {
619 if (**at == '\n' || **at == ' ') {
620 state.UncheckedAdvance();
621 } else if (**at == '!') {
622 static const char fixed[] = "!dir$ fixed\n", free[] = "!dir$ free\n";
623 static constexpr std::size_t fixedBytes{sizeof fixed - 1};
624 static constexpr std::size_t freeBytes{sizeof free - 1};
625 std::size_t remain{state.BytesRemaining()};
626 if (remain >= fixedBytes && std::memcmp(s1: *at, s2: fixed, n: fixedBytes) == 0) {
627 state.set_inFixedForm(true).UncheckedAdvance(fixedBytes);
628 } else if (remain >= freeBytes &&
629 std::memcmp(s1: *at, s2: free, n: freeBytes) == 0) {
630 state.set_inFixedForm(false).UncheckedAdvance(freeBytes);
631 } else {
632 break;
633 }
634 } else if (**at == ';' &&
635 state.IsNonstandardOk(
636 LanguageFeature::EmptyStatement, "empty statement"_port_en_US)) {
637 state.UncheckedAdvance();
638 } else {
639 break;
640 }
641 }
642 return {Success{}};
643 }
644};
645constexpr SkipStuffBeforeStatement skipStuffBeforeStatement;
646
647// R602 underscore -> _
648constexpr auto underscore{"_"_ch};
649
650// Characters besides letters and digits that may appear in names.
651// N.B. Don't accept an underscore if it is immediately followed by a
652// quotation mark, so that kindParam_"character literal" is parsed properly.
653// PGI and ifort accept '$' in identifiers, even as the initial character.
654// Cray and gfortran accept '$', but not as the first character.
655// Cray accepts '@' as well.
656constexpr auto otherIdChar{underscore / !"'\""_ch ||
657 extension<LanguageFeature::PunctuationInNames>(
658 "nonstandard usage: punctuation in name"_port_en_US, "$@"_ch)};
659
660constexpr auto logicalTRUE{
661 (".TRUE."_tok ||
662 extension<LanguageFeature::LogicalAbbreviations>(
663 "nonstandard usage: .T. spelling of .TRUE."_port_en_US,
664 ".T."_tok)) >>
665 pure(true)};
666constexpr auto logicalFALSE{
667 (".FALSE."_tok ||
668 extension<LanguageFeature::LogicalAbbreviations>(
669 "nonstandard usage: .F. spelling of .FALSE."_port_en_US,
670 ".F."_tok)) >>
671 pure(false)};
672
673// deprecated: Hollerith literals
674constexpr auto rawHollerithLiteral{
675 deprecated<LanguageFeature::Hollerith>(HollerithLiteral{})};
676
677template <typename A> constexpr decltype(auto) verbatim(A x) {
678 return sourced(construct<Verbatim>(x));
679}
680
681} // namespace Fortran::parser
682#endif // FORTRAN_PARSER_TOKEN_PARSERS_H_
683

source code of flang/lib/Parser/token-parsers.h