token-parsers.h source code [flang/lib/Parser/token-parsers.h]

1	//===-- lib/Parser/token-parsers.h ------------------------------- C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#ifndef FORTRAN_PARSER_TOKEN_PARSERS_H_
10	#define FORTRAN_PARSER_TOKEN_PARSERS_H_
11
12	// These parsers are driven by the parsers of the Fortran grammar to consume
13	// the prescanned character stream and recognize context-sensitive tokens.
14
15	#include "basic-parsers.h"
16	#include "type-parsers.h"
17	#include "flang/Common/idioms.h"
18	#include "flang/Parser/char-set.h"
19	#include "flang/Parser/characters.h"
20	#include "flang/Parser/instrumented-parser.h"
21	#include "flang/Parser/provenance.h"
22	#include <cstddef>
23	#include <cstring>
24	#include <functional>
25	#include <limits>
26	#include <list>
27	#include <optional>
28	#include <string>
29
30	namespace Fortran::parser {
31
32	// "xyz"_ch matches one instance of the characters x, y, or z without skipping
33	// any spaces before or after. The parser returns the location of the character
34	// on success.
35	class AnyOfChars {
36	public:
37	using resultType = const char *;
38	constexpr AnyOfChars(const AnyOfChars &) = default;
39	constexpr AnyOfChars(SetOfChars set) : set_{set} {}
40	std::optional<const char > Parse(ParseState &state) const* {
41	if (std::optional<const char *> at{state.PeekAtNextChar()}) {
42	if (set_.Has(**at)) {
43	state.UncheckedAdvance();
44	state.set_anyTokenMatched();
45	return at;
46	}
47	}
48	state.Say(MessageExpectedText{set_});
49	return std::nullopt;
50	}
51
52	private:
53	const SetOfChars set_;
54	};
55
56	constexpr AnyOfChars operator""_ch(const char str[], std::size_t n) {
57	return AnyOfChars{SetOfChars(str, n)};
58	}
59
60	constexpr auto letter{"abcdefghijklmnopqrstuvwxyz"_ch};
61	constexpr auto digit{"0123456789"_ch};
62
63	// Skips over optional spaces. Always succeeds.
64	struct Space {
65	using resultType = Success;
66	constexpr Space() {}
67	static std::optional<Success> Parse(ParseState &state) {
68	while (std::optional<const char *> p{state.PeekAtNextChar()}) {
69	if (**p != `' '`) {
70	break;
71	}
72	state.UncheckedAdvance();
73	}
74	return {Success{}};
75	}
76	};
77	constexpr Space space;
78
79	// Skips a space that in free form requires a warning if it precedes a
80	// character that could begin an identifier or keyword. Always succeeds.
81	inline void MissingSpace(ParseState &state) {
82	if (!state.inFixedForm()) {
83	state.Nonstandard(
84	LanguageFeature::OptionalFreeFormSpace, "missing space"_port_en_US);
85	}
86	}
87
88	struct SpaceCheck {
89	using resultType = Success;
90	constexpr SpaceCheck() {}
91	static std::optional<Success> Parse(ParseState &state) {
92	if (std::optional<const char *> p{state.PeekAtNextChar()}) {
93	char ch{**p};
94	if (ch == `' '`) {
95	state.UncheckedAdvance();
96	return space.Parse(state);
97	}
98	if (IsLegalInIdentifier(ch)) {
99	MissingSpace(state);
100	}
101	}
102	return {Success{}};
103	}
104	};
105	constexpr SpaceCheck spaceCheck;
106
107	// Matches a token string. Spaces in the token string denote where
108	// spaces may appear in the source; they can be made mandatory for
109	// some free form keyword sequences. Missing mandatory spaces in free
110	// form elicit a warning; they are not necessary for recognition.
111	// Spaces before and after the token are also skipped.
112	//
113	// Token strings appear in the grammar as C++ user-defined literals
114	// like "BIND ( C )"_tok and "SYNC ALL"_sptok. The _tok suffix is implied
115	// when a string literal appears before the sequencing operator >> or
116	// after the sequencing operator /. The literal "..."_id parses a
117	// token that cannot be a prefix of a longer identifier.
118	template <bool MandatoryFreeFormSpace = false, bool MustBeComplete = false>
119	class TokenStringMatch {
120	public:
121	using resultType = Success;
122	constexpr TokenStringMatch(const TokenStringMatch &) = default;
123	constexpr TokenStringMatch(const char *str, std::size_t n)
124	: str_{str}, bytes_{n} {}
125	explicit constexpr TokenStringMatch(const char *str) : str_{str} {}
126	std::optional<Success> Parse(ParseState &state) const {
127	space.Parse(state);
128	const char *start{state.GetLocation()};
129	const char *p{str_};
130	std::optional<const char > at; // initially empty*
131	for (std::size_t j{`0`}; j < bytes_ && *p != `'\0'`; ++j, ++p) {
132	bool spaceSkipping{*p == `' '`};
133	if (spaceSkipping) {
134	if (j + `1` == bytes_ \|\| p[`1`] == `' '` \|\| p[`1`] == `'\0'`) {
135	continue; // redundant; ignore
136	}
137	}
138	if (!at) {
139	at = nextCh.Parse(state);
140	if (!at) {
141	return std::nullopt;
142	}
143	}
144	if (spaceSkipping) {
145	if (**at == `' '`) {
146	at = nextCh.Parse(state);
147	if (!at) {
148	return std::nullopt;
149	}
150	} else if constexpr (MandatoryFreeFormSpace) {
151	MissingSpace(state);
152	}
153	// 'at' remains full for next iteration
154	} else if (*at == ToLowerCaseLetter(p)) {
155	at.reset();
156	} else {
157	state.Say(start, MessageExpectedText{str_, bytes_});
158	return std::nullopt;
159	}
160	}
161	if constexpr (MustBeComplete) {
162	if (auto after{state.PeekAtNextChar()}) {
163	if (IsLegalInIdentifier(**after)) {
164	state.Say(start, MessageExpectedText{str_, bytes_});
165	return std::nullopt;
166	}
167	}
168	}
169	state.set_anyTokenMatched();
170	if (IsLegalInIdentifier(p[-`1`])) {
171	return spaceCheck.Parse(state);
172	} else {
173	return space.Parse(state);
174	}
175	}
176
177	private:
178	const char *const str_;
179	const std::size_t bytes_{std::string::npos};
180	};
181
182	constexpr TokenStringMatch<> operator""_tok(const char str[], std::size_t n) {
183	return {str, n};
184	}
185
186	constexpr TokenStringMatch<true> operator""_sptok(
187	const char str[], std::size_t n) {
188	return {str, n};
189	}
190
191	constexpr TokenStringMatch<false, true> operator""_id(
192	const char str[], std::size_t n) {
193	return {str, n};
194	}
195
196	template <class PA>
197	inline constexpr std::enable_if_t<std::is_class_v<PA>,
198	SequenceParser<TokenStringMatch<>, PA>>
199	operator>>(const char str, const* PA &p) {
200	return SequenceParser<TokenStringMatch<>, PA>{TokenStringMatch<>{str}, p};
201	}
202
203	template <class PA>
204	inline constexpr std::enable_if_t<std::is_class_v<PA>,
205	FollowParser<PA, TokenStringMatch<>>>
206	operator/(const PA &p, const char *str) {
207	return FollowParser<PA, TokenStringMatch<>>{p, TokenStringMatch<>{str}};
208	}
209
210	template <class PA> inline constexpr auto parenthesized(const PA &p) {
211	return "(" >> p / ")";
212	}
213
214	template <class PA> inline constexpr auto bracketed(const PA &p) {
215	return "[" >> p / "]";
216	}
217
218	// Quoted character literal constants.
219	struct CharLiteralChar {
220	using resultType = std::pair<char, bool / was escaped />;
221	static std::optional<resultType> Parse(ParseState &state) {
222	auto at{state.GetLocation()};
223	if (std::optional<const char *> cp{nextCh.Parse(state)}) {
224	char ch{**cp};
225	if (ch == `'\n'`) {
226	state.Say(CharBlock{at, state.GetLocation()},
227	"Unclosed character constant"_err_en_US);
228	return std::nullopt;
229	}
230	if (ch == `'\\'`) {
231	// Most escape sequences in character literals are processed later,
232	// but we have to look for quotes here so that doubled quotes work.
233	if (std::optional<const char *> next{state.PeekAtNextChar()}) {
234	char escaped{**next};
235	if (escaped == `'\''` \|\| escaped == `'"'` \|\| escaped == `'\\'`) {
236	state.UncheckedAdvance();
237	return std::make_pair(x&: escaped, y: true);
238	}
239	}
240	}
241	return std::make_pair(x&: ch, y: false);
242	}
243	return std::nullopt;
244	}
245	};
246
247	template <char quote> struct CharLiteral {
248	using resultType = std::string;
249	static std::optional<std::string> Parse(ParseState &state) {
250	std::string str;
251	static constexpr auto nextch{attempt(CharLiteralChar{})};
252	while (auto ch{nextch.Parse(state)}) {
253	if (ch->second) {
254	str += `'\\'`;
255	} else if (ch->first == quote) {
256	static constexpr auto doubled{attempt(AnyOfChars{SetOfChars{quote}})};
257	if (!doubled.Parse(state)) {
258	return str;
259	}
260	}
261	str += ch->first;
262	}
263	return std::nullopt;
264	}
265	};
266
267	// Parse "BOZ" binary literal quoted constants.
268	// As extensions, support X as an alternate hexadecimal marker, and allow
269	// BOZX markers to appear as suffixes.
270	struct BOZLiteral {
271	using resultType = std::string;
272	static std::optional<resultType> Parse(ParseState &state) {
273	char base{`'\0'`};
274	auto baseChar{[&base](char ch) -> bool {
275	switch (ch) {
276	case `'b'`:
277	case `'o'`:
278	case `'z'`:
279	base = ch;
280	return true;
281	case `'x'`:
282	base = `'z'`;
283	return true;
284	default:
285	return false;
286	}
287	}};
288
289	space.Parse(state);
290	const char *start{state.GetLocation()};
291	std::optional<const char *> at{nextCh.Parse(state)};
292	if (!at) {
293	return std::nullopt;
294	}
295	if (**at == `'x'` &&
296	!state.IsNonstandardOk(LanguageFeature::BOZExtensions,
297	"nonstandard BOZ literal"_port_en_US)) {
298	return std::nullopt;
299	}
300	if (baseChar(**at)) {
301	at = nextCh.Parse(state);
302	if (!at) {
303	return std::nullopt;
304	}
305	}
306
307	char quote = **at;
308	if (quote != `'\''` && quote != `'"'`) {
309	return std::nullopt;
310	}
311
312	std::string content;
313	while (true) {
314	at = nextCh.Parse(state);
315	if (!at) {
316	return std::nullopt;
317	}
318	if (**at == quote) {
319	break;
320	}
321	if (**at == `' '`) {
322	continue;
323	}
324	if (!IsHexadecimalDigit(**at)) {
325	return std::nullopt;
326	}
327	content += ToLowerCaseLetter(**at);
328	}
329
330	if (!base) {
331	// extension: base allowed to appear as suffix, too
332	if (!(at = nextCh.Parse(state)) \|\| !baseChar(**at) \|\|
333	!state.IsNonstandardOk(LanguageFeature::BOZExtensions,
334	"nonstandard BOZ literal"_port_en_US)) {
335	return std::nullopt;
336	}
337	spaceCheck.Parse(state);
338	}
339
340	if (content.empty()) {
341	state.Say(start, "no digit in BOZ literal"_err_en_US);
342	return std::nullopt;
343	}
344	return {std::string {base} + `'"'` + content + `'"'`};
345	}
346	};
347
348	// R711 digit-string -> digit [digit]...
349	// N.B. not a token -- no space is skipped
350	struct DigitString {
351	using resultType = CharBlock;
352	static std::optional<resultType> Parse(ParseState &state) {
353	if (std::optional<const char *> ch1{state.PeekAtNextChar()}) {
354	if (IsDecimalDigit(**ch1)) {
355	state.UncheckedAdvance();
356	while (std::optional<const char *> p{state.PeekAtNextChar()}) {
357	if (!IsDecimalDigit(**p)) {
358	break;
359	}
360	state.UncheckedAdvance();
361	}
362	return CharBlock{*ch1, state.GetLocation()};
363	}
364	}
365	return std::nullopt;
366	}
367	};
368	constexpr DigitString digitString;
369
370	struct SignedIntLiteralConstantWithoutKind {
371	using resultType = CharBlock;
372	static std::optional<resultType> Parse(ParseState &state) {
373	resultType result{state.GetLocation()};
374	static constexpr auto sign{maybe("+-"_ch / space)};
375	if (sign.Parse(state)) {
376	if (auto digits{digitString.Parse(state)}) {
377	result.ExtendToCover(*digits);
378	return result;
379	}
380	}
381	return std::nullopt;
382	}
383	};
384
385	struct DigitString64 {
386	using resultType = std::uint64_t;
387	static std::optional<std::uint64_t> Parse(ParseState &state) {
388	std::optional<const char *> firstDigit{digit.Parse(state)};
389	if (!firstDigit) {
390	return std::nullopt;
391	}
392	std::uint64_t value = **firstDigit - `'0'`;
393	bool overflow{false};
394	static constexpr auto getDigit{attempt(digit)};
395	while (auto nextDigit{getDigit.Parse(state)}) {
396	if (value > std::numeric_limits<std::uint64_t>::max() / `10`) {
397	overflow = true;
398	}
399	value *= `10`;
400	int digitValue = **nextDigit - `'0'`;
401	if (value > std::numeric_limits<std::uint64_t>::max() - digitValue) {
402	overflow = true;
403	}
404	value += digitValue;
405	}
406	if (overflow) {
407	state.Say(*firstDigit, "overflow in decimal literal"_err_en_US);
408	}
409	return {value};
410	}
411	};
412	constexpr DigitString64 digitString64;
413
414	// R707 signed-int-literal-constant -> [sign] int-literal-constant
415	// N.B. Spaces are consumed before and after the sign, since the sign
416	// and the int-literal-constant are distinct tokens. Does not
417	// handle a trailing kind parameter.
418	static std::optional<std::int64_t> SignedInteger(
419	const std::optional<std::uint64_t> &x, Location at, bool negate,
420	ParseState &state) {
421	if (!x) {
422	return std::nullopt;
423	}
424	std::uint64_t limit{std::numeric_limits<std::int64_t>::max()};
425	if (negate) {
426	limit = -(limit + `1`);
427	}
428	if (*x > limit) {
429	state.Say(at, "overflow in signed decimal literal"_err_en_US);
430	}
431	std::int64_t value = *x;
432	return std::make_optional<std::int64_t>(t: negate ? -value : value);
433	}
434
435	// R710 signed-digit-string -> [sign] digit-string
436	// N.B. Not a complete token -- no space is skipped.
437	// Used only in the exponent parts of real literal constants.
438	struct SignedDigitString {
439	using resultType = std::int64_t;
440	static std::optional<std::int64_t> Parse(ParseState &state) {
441	std::optional<const char *> sign{state.PeekAtNextChar()};
442	if (!sign) {
443	return std::nullopt;
444	}
445	bool negate{**sign == `'-'`};
446	if (negate \|\| **sign == `'+'`) {
447	state.UncheckedAdvance();
448	}
449	return SignedInteger(digitString64.Parse(state), *sign, negate, state);
450	}
451	};
452
453	// Variants of the above for use in FORMAT specifications, where spaces
454	// must be ignored.
455	struct DigitStringIgnoreSpaces {
456	using resultType = std::uint64_t;
457	static std::optional<std::uint64_t> Parse(ParseState &state) {
458	static constexpr auto getFirstDigit{space >> digit};
459	std::optional<const char *> firstDigit{getFirstDigit.Parse(state)};
460	if (!firstDigit) {
461	return std::nullopt;
462	}
463	std::uint64_t value = **firstDigit - `'0'`;
464	bool overflow{false};
465	static constexpr auto getDigit{space >> attempt(digit)};
466	while (auto nextDigit{getDigit.Parse(state)}) {
467	if (value > std::numeric_limits<std::uint64_t>::max() / `10`) {
468	overflow = true;
469	}
470	value *= `10`;
471	int digitValue = **nextDigit - `'0'`;
472	if (value > std::numeric_limits<std::uint64_t>::max() - digitValue) {
473	overflow = true;
474	}
475	value += digitValue;
476	}
477	if (overflow) {
478	state.Say(*firstDigit, "overflow in decimal literal"_err_en_US);
479	}
480	return value;
481	}
482	};
483
484	struct PositiveDigitStringIgnoreSpaces {
485	using resultType = std::int64_t;
486	static std::optional<std::int64_t> Parse(ParseState &state) {
487	Location at{state.GetLocation()};
488	return SignedInteger(
489	DigitStringIgnoreSpaces{}.Parse(state), at, false /positive/, state);
490	}
491	};
492
493	struct SignedDigitStringIgnoreSpaces {
494	using resultType = std::int64_t;
495	static std::optional<std::int64_t> Parse(ParseState &state) {
496	static constexpr auto getSign{space >> attempt("+-"_ch)};
497	bool negate{false};
498	if (std::optional<const char *> sign{getSign.Parse(state)}) {
499	negate = **sign == `'-'`;
500	}
501	Location at{state.GetLocation()};
502	return SignedInteger(
503	DigitStringIgnoreSpaces{}.Parse(state), at, negate, state);
504	}
505	};
506
507	// Legacy feature: Hollerith literal constants
508	struct HollerithLiteral {
509	using resultType = std::string;
510	static std::optional<std::string> Parse(ParseState &state) {
511	space.Parse(state);
512	const char *start{state.GetLocation()};
513	std::optional<std::uint64_t> charCount{
514	DigitStringIgnoreSpaces{}.Parse(state)};
515	if (!charCount \|\| *charCount < `1`) {
516	return std::nullopt;
517	}
518	static constexpr auto letterH{"h"_ch};
519	std::optional<const char *> h{letterH.Parse(state)};
520	if (!h) {
521	return std::nullopt;
522	}
523	std::string content;
524	for (auto j{*charCount}; j-- > `0`;) {
525	int chBytes{UTF_8CharacterBytes(state.GetLocation())};
526	for (int bytes{chBytes}; bytes > `0`; --bytes) {
527	if (std::optional<const char *> at{nextCh.Parse(state)}) {
528	if (chBytes == `1` && !IsPrintable(**at)) {
529	state.Say(start, "Bad character in Hollerith"_err_en_US);
530	return std::nullopt;
531	}
532	content += **at;
533	} else {
534	state.Say(start, "Insufficient characters in Hollerith"_err_en_US);
535	return std::nullopt;
536	}
537	}
538	}
539	return content;
540	}
541	};
542
543	struct ConsumedAllInputParser {
544	using resultType = Success;
545	constexpr ConsumedAllInputParser() {}
546	static inline std::optional<Success> Parse(ParseState &state) {
547	if (state.IsAtEnd()) {
548	return {Success{}};
549	}
550	return std::nullopt;
551	}
552	};
553	constexpr ConsumedAllInputParser consumedAllInput;
554
555	template <char goal> struct SkipPast {
556	using resultType = Success;
557	constexpr SkipPast() {}
558	constexpr SkipPast(const SkipPast &) {}
559	static std::optional<Success> Parse(ParseState &state) {
560	while (std::optional<const char *> p{state.GetNextChar()}) {
561	if (**p == goal) {
562	return {Success{}};
563	}
564	}
565	return std::nullopt;
566	}
567	};
568
569	template <char goal> struct SkipTo {
570	using resultType = Success;
571	constexpr SkipTo() {}
572	constexpr SkipTo(const SkipTo &) {}
573	static std::optional<Success> Parse(ParseState &state) {
574	while (std::optional<const char *> p{state.PeekAtNextChar()}) {
575	if (**p == goal) {
576	return {Success{}};
577	}
578	state.UncheckedAdvance();
579	}
580	return std::nullopt;
581	}
582	};
583
584	// A common idiom in the Fortran grammar is an optional item (usually
585	// a nonempty comma-separated list) that, if present, must follow a comma
586	// and precede a doubled colon. When the item is absent, the comma must
587	// not appear, and the doubled colons are optional.
588	// [[, xyz] ::] is optionalBeforeColons(xyz)
589	// [[, xyz]... ::] is optionalBeforeColons(nonemptyList(xyz))
590	template <typename PA> inline constexpr auto optionalBeforeColons(const PA &p) {
591	using resultType = std::optional<typename PA::resultType>;
592	return "," >> construct<resultType>(p) / "::" \|\|
593	("::"_tok \|\| !","_tok) >> pure<resultType>();
594	}
595	template <typename PA>
596	inline constexpr auto optionalListBeforeColons(const PA &p) {
597	using resultType = std::list<typename PA::resultType>;
598	return "," >> nonemptyList(p) / "::" \|\|
599	("::"_tok \|\| !","_tok) >> pure<resultType>();
600	}
601
602	// Skip over empty lines, leading spaces, and some compiler directives (viz.,
603	// the ones that specify the source form) that might appear before the
604	// next statement. Skip over empty statements (bare semicolons) when
605	// not in strict standard conformance mode. Always succeeds.
606	struct SkipStuffBeforeStatement {
607	using resultType = Success;
608	static std::optional<Success> Parse(ParseState &state) {
609	if (UserState * ustate{state.userState()}) {
610	if (ParsingLog * log{ustate->log()}) {
611	// Save memory: vacate the parsing log before each statement unless
612	// we're logging the whole parse for debugging.
613	if (!ustate->instrumentedParse()) {
614	log->clear();
615	}
616	}
617	}
618	while (std::optional<const char *> at{state.PeekAtNextChar()}) {
619	if (at == `'\n'` \|\| at == `' '`) {
620	state.UncheckedAdvance();
621	} else if (**at == `'!'`) {
622	static const char fixed[] = "!dir$ fixed\n", free[] = "!dir$ free\n";
623	static constexpr std::size_t fixedBytes{sizeof fixed - `1`};
624	static constexpr std::size_t freeBytes{sizeof free - `1`};
625	std::size_t remain{state.BytesRemaining()};
626	if (remain >= fixedBytes && std::memcmp(s1: *at, s2: fixed, n: fixedBytes) == `0`) {
627	state.set_inFixedForm(true).UncheckedAdvance(fixedBytes);
628	} else if (remain >= freeBytes &&
629	std::memcmp(s1: *at, s2: free, n: freeBytes) == `0`) {
630	state.set_inFixedForm(false).UncheckedAdvance(freeBytes);
631	} else {
632	break;
633	}
634	} else if (**at == `';'` &&
635	state.IsNonstandardOk(
636	LanguageFeature::EmptyStatement, "empty statement"_port_en_US)) {
637	state.UncheckedAdvance();
638	} else {
639	break;
640	}
641	}
642	return {Success{}};
643	}
644	};
645	constexpr SkipStuffBeforeStatement skipStuffBeforeStatement;
646
647	// R602 underscore -> _
648	constexpr auto underscore{"_"_ch};
649
650	// Characters besides letters and digits that may appear in names.
651	// N.B. Don't accept an underscore if it is immediately followed by a
652	// quotation mark, so that kindParam_"character literal" is parsed properly.
653	// PGI and ifort accept '$' in identifiers, even as the initial character.
654	// Cray and gfortran accept '$', but not as the first character.
655	// Cray accepts '@' as well.
656	constexpr auto otherIdChar{underscore / !"'\""_ch \|\|
657	extension<LanguageFeature::PunctuationInNames>(
658	"nonstandard usage: punctuation in name"_port_en_US, "$@"_ch)};
659
660	constexpr auto logicalTRUE{
661	(".TRUE."_tok \|\|
662	extension<LanguageFeature::LogicalAbbreviations>(
663	"nonstandard usage: .T. spelling of .TRUE."_port_en_US,
664	".T."_tok)) >>
665	pure(true)};
666	constexpr auto logicalFALSE{
667	(".FALSE."_tok \|\|
668	extension<LanguageFeature::LogicalAbbreviations>(
669	"nonstandard usage: .F. spelling of .FALSE."_port_en_US,
670	".F."_tok)) >>
671	pure(false)};
672
673	// deprecated: Hollerith literals
674	constexpr auto rawHollerithLiteral{
675	deprecated<LanguageFeature::Hollerith>(HollerithLiteral{})};
676
677	template <typename A> constexpr decltype(auto) verbatim(A x) {
678	return sourced(construct<Verbatim>(x));
679	}
680
681	} // namespace Fortran::parser
682	#endif // FORTRAN_PARSER_TOKEN_PARSERS_H_
683

source code of flang/lib/Parser/token-parsers.h