characters.h source code [flang/include/flang/Parser/characters.h]

Warning: This file is not a C or C++ file. It does not have highlighting.

1	//===-- include/flang/Parser/characters.h ------------------------ C++ --===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#ifndef FORTRAN_PARSER_CHARACTERS_H_
10	#define FORTRAN_PARSER_CHARACTERS_H_
11
12	// Define some character classification predicates and
13	// conversions here to avoid dependences upon <cctype> and
14	// also to accommodate Fortran tokenization.
15
16	#include <cstddef>
17	#include <cstdint>
18	#include <optional>
19	#include <string>
20
21	namespace Fortran::parser {
22
// Global switch consulted by EmitQuotedChar when backslash escapes are
// enabled: when true, bytes that need a numeric escape are written as \x
// followed by two hexadecimal digits, and code points above 0x7f are written
// as \u followed by hexadecimal digits; when false, such bytes are written
// as three-digit octal escapes.  Only declared here; defined elsewhere.
23	extern bool useHexadecimalEscapeSequences;
24
25	// Fortran program source is easily supported in any character set
26	// whose first 128 code points correspond to ASCII codes 0-127 (ISO/IEC 646).
27	// The specific encodings that can be handled are:
28	// LATIN_1: ISO 8859-1 Latin-1
29	// UTF_8: Multi-byte encoding of Unicode (ISO/IEC 10646)
30	enum class Encoding { LATIN_1, UTF_8 };
31
// True exactly when ch is one of the ASCII capital letters 'A' through 'Z'.
inline constexpr bool IsUpperCaseLetter(char ch) {
  return !(ch < 'A' || ch > 'Z');
}
35
// True exactly when ch is one of the ASCII small letters 'a' through 'z'.
inline constexpr bool IsLowerCaseLetter(char ch) {
  return !(ch < 'a' || ch > 'z');
}
39
40	inline constexpr bool IsLetter(char ch) {
41	return IsUpperCaseLetter(ch) \|\| IsLowerCaseLetter(ch);
42	}
43
// True when ch is one of the ASCII decimal digits '0' through '9'.
inline constexpr bool IsDecimalDigit(char ch) {
  return '0' <= ch && ch <= '9';
}
45
// True when ch is a hexadecimal digit: '0'-'9', 'A'-'F', or 'a'-'f'.
inline constexpr bool IsHexadecimalDigit(char ch) {
  return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') ||
      (ch >= 'a' && ch <= 'f');
}
50
// True when ch is one of the ASCII octal digits '0' through '7'.
inline constexpr bool IsOctalDigit(char ch) {
  return '0' <= ch && ch <= '7';
}
52
53	inline constexpr bool IsLegalIdentifierStart(char ch) {
54	return IsLetter(ch) \|\| ch == '_' \|\| ch == '@' \|\| ch == '$';
55	}
56
57	inline constexpr bool IsLegalInIdentifier(char ch) {
58	return IsLegalIdentifierStart(ch) \|\| IsDecimalDigit(ch);
59	}
60
// True when ch lies in the printable ASCII range, from space (' ')
// through tilde ('~') inclusive.
inline constexpr bool IsPrintable(char ch) {
  return !(ch < ' ' || ch > '~');
}
62
// True when ch is a space, horizontal tab, newline, vertical tab,
// form feed, or carriage return.
inline constexpr bool IsWhiteSpace(char ch) {
  switch (ch) {
  case ' ':
  case '\t':
  case '\n':
  case '\v':
  case '\f':
  case '\r':
    return true;
  default:
    return false;
  }
}
67
68	inline constexpr char ToLowerCaseLetter(char ch) {
69	return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
70	}
71
72	inline constexpr char ToLowerCaseLetter(char &&ch) {
73	return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
74	}
75
76	inline std::string ToLowerCaseLetters(std::string_view str) {
77	std::string lowered{str};
78	for (char &ch : lowered) {
79	ch = ToLowerCaseLetter(ch);
80	}
81	return lowered;
82	}
83
84	inline constexpr char ToUpperCaseLetter(char ch) {
85	return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
86	}
87
88	inline constexpr char ToUpperCaseLetter(char &&ch) {
89	return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
90	}
91
92	inline std::string ToUpperCaseLetters(std::string_view str) {
93	std::string raised{str};
94	for (char &ch : raised) {
95	ch = ToUpperCaseLetter(ch);
96	}
97	return raised;
98	}
99
100	inline constexpr bool IsSameApartFromCase(char x, char y) {
101	return ToLowerCaseLetter(x) == ToLowerCaseLetter(y);
102	}
103
// Numeric value of a decimal digit character, computed as the distance
// from '0'.  The argument is not checked to be a digit.
inline constexpr char DecimalDigitValue(char ch) {
  return static_cast<char>(ch - '0');
}
105
106	inline constexpr char HexadecimalDigitValue(char ch) {
107	return IsUpperCaseLetter(ch) ? ch - 'A' + 10
108	: IsLowerCaseLetter(ch) ? ch - 'a' + 10
109	: DecimalDigitValue(ch);
110	}
111
// Given the character that follows a backslash in an escape sequence,
// returns the character that the sequence denotes, or std::nullopt when the
// letter is not a recognized single-character escape.
inline constexpr std::optional<char> BackslashEscapeValue(char ch) {
  switch (ch) {
  // Quotes and the backslash stand for themselves.
  case '\\':
  case '\'':
  case '"':
    return ch;
  case 'b':
    return '\b';
  case 't':
    return '\t';
  case 'n':
    return '\n';
  case 'v':
    return '\v';
  case 'f':
    return '\f';
  case 'r':
    return '\r';
  case 'a': // '\a' is deliberately not recognized; PGF90 doesn't know \a
  default:
    return std::nullopt;
  }
}
136
// Inverse of the escape-letter mapping: given a character value, returns
// the character to write after a backslash in order to denote it, or
// std::nullopt when the value has no single-character escape.
inline constexpr std::optional<char> BackslashEscapeChar(char ch) {
  switch (ch) {
  // Quotes and the backslash are escaped by themselves.
  case '\\':
  case '\'':
  case '"':
    return ch;
  case '\b':
    return 'b';
  case '\t':
    return 't';
  case '\n':
    return 'n';
  case '\v':
    return 'v';
  case '\f':
    return 'f';
  case '\r':
    return 'r';
  case '\a': // deliberately has no escape letter; PGF90 doesn't know \a
  default:
    return std::nullopt;
  }
}
161
162	// Does not include spaces or line ending characters.
163	inline constexpr bool IsValidFortranTokenCharacter(char ch) {
164	switch (ch) {
165	case '"':
166	case '%':
167	case '\'':
168	case '(':
169	case ')':
170	case '*':
171	case '+':
172	case ',':
173	case '-':
174	case '.':
175	case '/':
176	case ':':
177	case ';':
178	case '<':
179	case '=':
180	case '>':
181	case '[':
182	case ']':
183	return true;
184	default:
185	return IsLegalIdentifierStart(ch) \|\| IsDecimalDigit(ch);
186	}
187	}
188
// The bytes produced by encoding one character.
struct EncodedCharacter {
  // Capacity of the buffer, in bytes.
  static constexpr int maxEncodingBytes{6};
  // Encoded bytes; consumers read only the first "bytes" entries.  The
  // array is value-initialized so that a default-constructed object never
  // holds indeterminate bytes.
  char buffer[maxEncodingBytes]{};
  // Number of valid bytes in buffer.
  int bytes{0};
};
194
// Produces the EncodedCharacter for the code point "ucs" in the encoding
// chosen at compile time by the template argument.  Explicit
// specializations are declared for LATIN_1 and UTF_8; their definitions
// are not part of this header.
195	template <Encoding ENCODING> EncodedCharacter EncodeCharacter(char32_t ucs);
196	template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t);
197	template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t);
198
// Overload that takes the encoding as a run-time argument; this is the form
// that EmitQuotedChar calls.  Defined outside this header.
199	EncodedCharacter EncodeCharacter(Encoding, char32_t ucs);
200
// Converts a string of type STRING to a std::string for the encoding given
// by the template argument.  Two instantiations are declared as provided
// elsewhere (extern template): LATIN_1 from std::string and UTF_8 from
// std::u32string.
// NOTE(review): presumably encodes each element of the argument in turn;
// the definition is not visible here -- confirm.
201	template <Encoding ENCODING, typename STRING>
202	std::string EncodeString(const STRING &);
203	extern template std::string EncodeString<Encoding::LATIN_1, std::string>(
204	const std::string &);
205	extern template std::string EncodeString<Encoding::UTF_8, std::u32string>(
206	const std::u32string &);
207
208	// EmitQuotedChar drives callbacks "emit" and "insert" to output the
209	// bytes of an encoding for a codepoint.
210	template <typename NORMAL, typename INSERTED>
211	void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
212	bool backslashEscapes = true, Encoding encoding = Encoding::UTF_8) {
213	auto emitOneByte{[&](std::uint8_t ch) {
214	if (backslashEscapes && (ch < ' ' \|\| ch >= 0x7f \|\| ch == '\\')) {
215	if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
216	insert('\\');
217	emit(*escape);
218	} else if (useHexadecimalEscapeSequences) {
219	insert('\\');
220	insert('x');
221	int top{ch >> 4}, bottom{ch & 0xf};
222	insert(top > 9 ? 'a' + top - 10 : '0' + top);
223	insert(bottom > 9 ? 'a' + bottom - 10 : '0' + bottom);
224	} else {
225	// octal escape sequence; always emit 3 digits to avoid ambiguity
226	insert('\\');
227	insert('0' + (ch >> 6));
228	insert('0' + ((ch >> 3) & 7));
229	insert('0' + (ch & 7));
230	}
231	} else if (ch == '\n') { // always escape newlines
232	insert('\\');
233	insert('n');
234	} else {
235	emit(ch);
236	}
237	}};
238	if (ch <= 0x7f) {
239	emitOneByte(ch);
240	} else if (backslashEscapes && useHexadecimalEscapeSequences) {
241	insert('\\');
242	insert('u');
243	if (ch > 0xffff) {
244	unsigned c1{(ch >> 28) & 0xf}, c2{(ch >> 24) & 0xf}, c3{(ch >> 20) & 0xf},
245	c4{(ch >> 16) & 0xf};
246	insert(c1 > 9 ? 'a' + c1 - 10 : '0' + c1);
247	insert(c2 > 9 ? 'a' + c2 - 10 : '0' + c2);
248	insert(c3 > 9 ? 'a' + c3 - 10 : '0' + c3);
249	insert(c4 > 9 ? 'a' + c4 - 10 : '0' + c4);
250	}
251	unsigned c1{(ch >> 12) & 0xf}, c2{(ch >> 8) & 0xf}, c3{(ch >> 4) & 0xf},
252	c4{ch & 0xf};
253	insert(c1 > 9 ? 'a' + c1 - 10 : '0' + c1);
254	insert(c2 > 9 ? 'a' + c2 - 10 : '0' + c2);
255	insert(c3 > 9 ? 'a' + c3 - 10 : '0' + c3);
256	insert(c4 > 9 ? 'a' + c4 - 10 : '0' + c4);
257	} else {
258	EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
259	for (int j{0}; j < encoded.bytes; ++j) {
260	emitOneByte(encoded.buffer[j]);
261	}
262	}
263	}
264
// Overloads for std::string, std::u16string, and std::u32string that return
// a std::string.  backslashEscapes defaults to true; the encoding defaults
// to LATIN_1 for std::string and to UTF_8 for the two wider string types.
// NOTE(review): presumably returns the argument rendered as a quoted
// character literal; the definitions are not visible here -- confirm.
265	std::string QuoteCharacterLiteral(const std::string &,
266	bool backslashEscapes = true, Encoding = Encoding::LATIN_1);
267	std::string QuoteCharacterLiteral(const std::u16string &,
268	bool backslashEscapes = true, Encoding = Encoding::UTF_8);
269	std::string QuoteCharacterLiteral(const std::u32string &,
270	bool backslashEscapes = true, Encoding = Encoding::UTF_8);
271
// NOTE(review): presumably returns the length in bytes of the UTF-8
// sequence that starts at the given pointer; the definition is not visible
// here -- confirm, including what is returned for malformed input.
272	int UTF_8CharacterBytes(const char *);
273
// Result of decoding one character from a byte sequence.
struct DecodedCharacter {
  // The decoded code point.
  char32_t codepoint = 0;
  // A value of zero (the default) signifies failure.
  int bytes = 0;
};
278
// Decodes one character from a buffer given as a pointer and a std::size_t
// (presumably the number of bytes available -- confirm), using the encoding
// chosen by the template argument.  The result's "bytes" member is zero on
// failure (see DecodedCharacter).  The name suggests that backslash escapes
// are not interpreted at this level; the definition is not visible here.
279	template <Encoding ENCODING>
280	DecodedCharacter DecodeRawCharacter(const char *, std::size_t);
// Explicit specialization for ISO 8859-1 Latin-1 input.
281	template <>
282	DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
283	const char *, std::size_t);
284
// Explicit specialization of DecodeRawCharacter for UTF-8 input; defined
// outside this header.
285	template <>
286	DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(const char *, std::size_t);
287
288	// DecodeCharacter optionally handles backslash escape sequences, too.
// The encoding is chosen at compile time by the template argument, and the
// "backslashEscapes" flag selects whether escape sequences are handled.
// Instantiations for LATIN_1 and UTF_8 are declared as provided elsewhere
// (extern template).
289	template <Encoding ENCODING>
290	DecodedCharacter DecodeCharacter(
291	const char *, std::size_t, bool backslashEscapes);
292	extern template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
293	const char *, std::size_t, bool);
294	extern template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
295	const char *, std::size_t, bool);
296
// Overload of DecodeCharacter that takes the encoding as a run-time
// argument instead of a template argument; defined outside this header.
297	DecodedCharacter DecodeCharacter(
298	Encoding, const char *, std::size_t, bool backslashEscapes);
299
// Converts a std::string to a string of type RESULT, with the source
// encoding chosen by the ENCODING template argument and backslash escape
// handling selected by "backslashEscapes".  Three instantiations are
// declared as provided elsewhere (extern template): std::string from
// LATIN_1, and std::u16string and std::u32string from UTF_8.
// NOTE(review): the definition is not visible here -- confirm how
// undecodable input is treated.
300	template <typename RESULT, Encoding ENCODING>
301	RESULT DecodeString(const std::string &, bool backslashEscapes);
302	extern template std::string DecodeString<std::string, Encoding::LATIN_1>(
303	const std::string &, bool);
304	extern template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
305	const std::string &, bool);
306	extern template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
307	const std::string &, bool);
308	} // namespace Fortran::parser
309	#endif // FORTRAN_PARSER_CHARACTERS_H_
310

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of flang/include/flang/Parser/characters.h