Warning: This file is not a C or C++ file. It does not have highlighting.
1 | //===-- include/flang/Parser/characters.h -----------------------*- C++ -*-===// |
---|---|
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | |
9 | #ifndef FORTRAN_PARSER_CHARACTERS_H_ |
10 | #define FORTRAN_PARSER_CHARACTERS_H_ |
11 | |
12 | // Define some character classification predicates and |
13 | // conversions here to avoid dependences upon <cctype> and |
14 | // also to accommodate Fortran tokenization. |
15 | |
16 | #include <cstddef> |
17 | #include <cstdint> |
18 | #include <optional> |
19 | #include <string> |
20 | |
21 | namespace Fortran::parser { |
22 | |
// Global flag, declared here and defined outside this header.
// EmitQuotedChar reads it: when true, bytes that need a numeric escape are
// written as \x followed by two hexadecimal digits instead of three octal
// digits, and (with backslash escapes enabled) code points above 0x7f are
// written as \u escapes.
extern bool useHexadecimalEscapeSequences;
24 | |
// We can easily support Fortran program source in any character
// set whose first 128 code points correspond to ASCII codes 0-127
// (ISO/IEC 646).
// The specific encodings that we can handle include:
//   LATIN_1: ISO 8859-1 Latin-1
//   UTF_8: Multi-byte encoding of Unicode (ISO/IEC 10646)
enum class Encoding { LATIN_1, UTF_8 };
31 | |
// True when ch lies in the range 'A' through 'Z' (inclusive).
inline constexpr bool IsUpperCaseLetter(char ch) {
  return 'A' <= ch && ch <= 'Z';
}
35 | |
// True when ch lies in the range 'a' through 'z' (inclusive).
inline constexpr bool IsLowerCaseLetter(char ch) {
  return 'a' <= ch && ch <= 'z';
}
39 | |
40 | inline constexpr bool IsLetter(char ch) { |
41 | return IsUpperCaseLetter(ch) || IsLowerCaseLetter(ch); |
42 | } |
43 | |
// True when ch lies in the range '0' through '9' (inclusive).
inline constexpr bool IsDecimalDigit(char ch) {
  return '0' <= ch && ch <= '9';
}
45 | |
// True when ch is a hexadecimal digit: '0'-'9', 'A'-'F', or 'a'-'f'.
inline constexpr bool IsHexadecimalDigit(char ch) {
  if ('0' <= ch && ch <= '9') {
    return true;
  }
  if ('A' <= ch && ch <= 'F') {
    return true;
  }
  return 'a' <= ch && ch <= 'f';
}
50 | |
// True when ch lies in the range '0' through '7' (inclusive).
inline constexpr bool IsOctalDigit(char ch) {
  return '0' <= ch && ch <= '7';
}
52 | |
53 | inline constexpr bool IsLegalIdentifierStart(char ch) { |
54 | return IsLetter(ch) || ch == '_' || ch == '@' || ch == '$'; |
55 | } |
56 | |
57 | inline constexpr bool IsLegalInIdentifier(char ch) { |
58 | return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch); |
59 | } |
60 | |
// True when ch lies in the range ' ' (space) through '~' (inclusive).
inline constexpr bool IsPrintable(char ch) {
  return !(ch < ' ' || ch > '~');
}
62 | |
// True when ch is a space, horizontal tab, newline, vertical tab,
// form feed, or carriage return.
inline constexpr bool IsWhiteSpace(char ch) {
  switch (ch) {
  case ' ':
  case '\t':
  case '\n':
  case '\v':
  case '\f':
  case '\r':
    return true;
  default:
    return false;
  }
}
67 | |
68 | inline constexpr char ToLowerCaseLetter(char ch) { |
69 | return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch; |
70 | } |
71 | |
72 | inline constexpr char ToLowerCaseLetter(char &&ch) { |
73 | return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch; |
74 | } |
75 | |
76 | inline std::string ToLowerCaseLetters(std::string_view str) { |
77 | std::string lowered{str}; |
78 | for (char &ch : lowered) { |
79 | ch = ToLowerCaseLetter(ch); |
80 | } |
81 | return lowered; |
82 | } |
83 | |
84 | inline constexpr char ToUpperCaseLetter(char ch) { |
85 | return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch; |
86 | } |
87 | |
88 | inline constexpr char ToUpperCaseLetter(char &&ch) { |
89 | return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch; |
90 | } |
91 | |
92 | inline std::string ToUpperCaseLetters(std::string_view str) { |
93 | std::string raised{str}; |
94 | for (char &ch : raised) { |
95 | ch = ToUpperCaseLetter(ch); |
96 | } |
97 | return raised; |
98 | } |
99 | |
100 | inline constexpr bool IsSameApartFromCase(char x, char y) { |
101 | return ToLowerCaseLetter(x) == ToLowerCaseLetter(y); |
102 | } |
103 | |
// Returns the offset of ch from '0'.  No range check is made, so the
// result is only a digit value when ch is a decimal digit.
inline constexpr char DecimalDigitValue(char ch) {
  return static_cast<char>(ch - '0');
}
105 | |
106 | inline constexpr char HexadecimalDigitValue(char ch) { |
107 | return IsUpperCaseLetter(ch) ? ch - 'A' + 10 |
108 | : IsLowerCaseLetter(ch) ? ch - 'a' + 10 |
109 | : DecimalDigitValue(ch); |
110 | } |
111 | |
// Given the character that follows a backslash, returns the character the
// escape sequence stands for, or std::nullopt when the letter is not a
// supported escape.  Quote marks and the backslash stand for themselves.
inline constexpr std::optional<char> BackslashEscapeValue(char ch) {
  if (ch == '"' || ch == '\'' || ch == '\\') {
    return ch;
  }
  switch (ch) {
  case 'b':
    return '\b';
  case 'f':
    return '\f';
  case 'n':
    return '\n';
  case 'r':
    return '\r';
  case 't':
    return '\t';
  case 'v':
    return '\v';
  default:
    // This includes 'a': '\a' is deliberately unsupported because
    // PGF90 doesn't know \a.
    return std::nullopt;
  }
}
136 | |
// Given a character value, returns the character to write after a
// backslash in order to denote it, or std::nullopt when the value has no
// single-character escape.  Quote marks and the backslash map to themselves.
inline constexpr std::optional<char> BackslashEscapeChar(char ch) {
  if (ch == '"' || ch == '\'' || ch == '\\') {
    return ch;
  }
  switch (ch) {
  case '\b':
    return 'b';
  case '\f':
    return 'f';
  case '\n':
    return 'n';
  case '\r':
    return 'r';
  case '\t':
    return 't';
  case '\v':
    return 'v';
  default:
    // This includes '\a': no 'a' escape is produced because
    // PGF90 doesn't know \a.
    return std::nullopt;
  }
}
161 | |
162 | // Does not include spaces or line ending characters. |
163 | inline constexpr bool IsValidFortranTokenCharacter(char ch) { |
164 | switch (ch) { |
165 | case '"': |
166 | case '%': |
167 | case '\'': |
168 | case '(': |
169 | case ')': |
170 | case '*': |
171 | case '+': |
172 | case ',': |
173 | case '-': |
174 | case '.': |
175 | case '/': |
176 | case ':': |
177 | case ';': |
178 | case '<': |
179 | case '=': |
180 | case '>': |
181 | case '[': |
182 | case ']': |
183 | return true; |
184 | default: |
185 | return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch); |
186 | } |
187 | } |
188 | |
// Result of encoding a single code point: a small fixed-size byte buffer
// together with the count of bytes in use.
struct EncodedCharacter {
  // Capacity of buffer, in bytes.
  static constexpr int maxEncodingBytes{6};
  // Encoded bytes.  There is no initializer here; EmitQuotedChar reads
  // only elements [0, bytes).
  char buffer[maxEncodingBytes];
  // Count of bytes of buffer in use; defaults to 0.
  int bytes{0};
};
194 | |
// Encodes the code point ucs into an EncodedCharacter, with the encoding
// selected at compile time by the ENCODING template argument.  Only
// declarations appear here: the primary template and explicit
// specializations for LATIN_1 and UTF_8, whose definitions are outside
// this header.
template <Encoding ENCODING> EncodedCharacter EncodeCharacter(char32_t ucs);
template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t);
template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t);

// Overload that takes the encoding as a run-time argument (used by
// EmitQuotedChar in this header).
EncodedCharacter EncodeCharacter(Encoding, char32_t ucs);
200 | |
// Produces a std::string from a STRING argument under the ENCODING template
// argument (presumably by encoding each element in turn -- confirm against
// the definition, which is not in this header).
// The two "extern template" lines declare explicit instantiations that
// are provided by another translation unit: LATIN_1 from std::string and
// UTF_8 from std::u32string.
template <Encoding ENCODING, typename STRING>
std::string EncodeString(const STRING &);
extern template std::string EncodeString<Encoding::LATIN_1, std::string>(
    const std::string &);
extern template std::string EncodeString<Encoding::UTF_8, std::u32string>(
    const std::u32string &);
207 | |
// EmitQuotedChar drives callbacks "emit" and "insert" to output the
// bytes of an encoding for a codepoint.
//
// Parameters:
//   ch               - the code point to output
//   emit             - called with each byte that is passed through as-is,
//                      and with the letter of a single-character escape
//   insert           - called with every other character of an escape
//                      sequence (the backslash, 'x', 'u', 'n' for a forced
//                      newline escape, and hex/octal digits)
//   backslashEscapes - when true, control bytes, bytes >= 0x7f, and the
//                      backslash itself are written as escape sequences
//   encoding         - encoding used for code points above 0x7f that are
//                      not written as \u escapes
//
// The digit expressions passed to insert have integer type (int or
// unsigned), not char, so callbacks see whatever type each call site
// produces.
template <typename NORMAL, typename INSERTED>
void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
    bool backslashEscapes = true, Encoding encoding = Encoding::UTF_8) {
  // Outputs one byte, escaping it when required.  Note that the lambda's
  // parameter shadows the outer code point ch.
  auto emitOneByte{[&](std::uint8_t ch) {
    if (backslashEscapes && (ch < ' ' || ch >= 0x7f || ch == '\\')) {
      if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
        // single-character escape such as \n or \\ 
        insert('\\');
        emit(*escape);
      } else if (useHexadecimalEscapeSequences) {
        // \x followed by exactly two lower case hexadecimal digits
        insert('\\');
        insert('x');
        int top{ch >> 4}, bottom{ch & 0xf};
        insert(top > 9 ? 'a' + top - 10 : '0' + top);
        insert(bottom > 9 ? 'a' + bottom - 10 : '0' + bottom);
      } else {
        // octal escape sequence; always emit 3 digits to avoid ambiguity
        insert('\\');
        insert('0' + (ch >> 6));
        insert('0' + ((ch >> 3) & 7));
        insert('0' + (ch & 7));
      }
    } else if (ch == '\n') { // always escape newlines
      // Reached only when backslashEscapes is false; here both characters
      // of the escape go through insert.
      insert('\\');
      insert('n');
    } else {
      emit(ch);
    }
  }};
  if (ch <= 0x7f) {
    // 7-bit code points are a single byte in either encoding.
    emitOneByte(ch);
  } else if (backslashEscapes && useHexadecimalEscapeSequences) {
    // \u followed by four hexadecimal digits, preceded by four more when
    // the code point does not fit in 16 bits.
    // NOTE(review): the eight-digit form still uses a lower case 'u';
    // confirm that whatever reads these escapes back accepts that.
    insert('\\');
    insert('u');
    if (ch > 0xffff) {
      unsigned c1{(ch >> 28) & 0xf}, c2{(ch >> 24) & 0xf}, c3{(ch >> 20) & 0xf},
          c4{(ch >> 16) & 0xf};
      insert(c1 > 9 ? 'a' + c1 - 10 : '0' + c1);
      insert(c2 > 9 ? 'a' + c2 - 10 : '0' + c2);
      insert(c3 > 9 ? 'a' + c3 - 10 : '0' + c3);
      insert(c4 > 9 ? 'a' + c4 - 10 : '0' + c4);
    }
    unsigned c1{(ch >> 12) & 0xf}, c2{(ch >> 8) & 0xf}, c3{(ch >> 4) & 0xf},
        c4{ch & 0xf};
    insert(c1 > 9 ? 'a' + c1 - 10 : '0' + c1);
    insert(c2 > 9 ? 'a' + c2 - 10 : '0' + c2);
    insert(c3 > 9 ? 'a' + c3 - 10 : '0' + c3);
    insert(c4 > 9 ? 'a' + c4 - 10 : '0' + c4);
  } else {
    // Encode the code point, then run every resulting byte through the
    // same per-byte escaping logic.
    EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
    for (int j{0}; j < encoded.bytes; ++j) {
      emitOneByte(encoded.buffer[j]);
    }
  }
}
264 | |
// Overloads that build a std::string from narrow, UTF-16, or UTF-32
// character data (presumably a quoted character literal, going by the
// name -- confirm against the definitions, which are not in this header).
// backslashEscapes defaults to true in all three.  The default encoding
// differs by overload: LATIN_1 for std::string, UTF_8 for std::u16string
// and std::u32string.
std::string QuoteCharacterLiteral(const std::string &,
    bool backslashEscapes = true, Encoding = Encoding::LATIN_1);
std::string QuoteCharacterLiteral(const std::u16string &,
    bool backslashEscapes = true, Encoding = Encoding::UTF_8);
std::string QuoteCharacterLiteral(const std::u32string &,
    bool backslashEscapes = true, Encoding = Encoding::UTF_8);
271 | |
// Declared here, defined elsewhere.  Presumably returns the length in
// bytes of the UTF-8 sequence that starts at the given pointer -- TODO
// confirm, including what is returned for malformed input.
int UTF_8CharacterBytes(const char *);
273 | |
// Result type of the decoding functions declared below.
struct DecodedCharacter {
  // The decoded code point; defaults to 0.
  char32_t codepoint{0};
  // Defaults to 0, which signifies failure.  Presumably the count of input
  // bytes consumed on success -- confirm against the definitions.
  int bytes{0}; // signifying failure
};
278 | |
// Decodes one character from a byte pointer and a std::size_t (presumably
// the number of bytes available -- confirm), with the encoding selected by
// the ENCODING template argument.  Going by the comment on DecodeCharacter
// below, this form does not interpret backslash escape sequences.
// Only the primary template and the LATIN_1 and UTF_8 explicit
// specializations are declared here; they are defined elsewhere.
template <Encoding ENCODING>
DecodedCharacter DecodeRawCharacter(const char *, std::size_t);
template <>
DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
    const char *, std::size_t);

template <>
DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(const char *, std::size_t);
287 | |
// DecodeCharacter optionally handles backslash escape sequences, too.
// The backslashEscapes argument selects that handling.  The two
// "extern template" lines declare explicit instantiations for LATIN_1 and
// UTF_8 that are provided by another translation unit.
template <Encoding ENCODING>
DecodedCharacter DecodeCharacter(
    const char *, std::size_t, bool backslashEscapes);
extern template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
    const char *, std::size_t, bool);
extern template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
    const char *, std::size_t, bool);

// Overload that takes the encoding as a run-time argument.
DecodedCharacter DecodeCharacter(
    Encoding, const char *, std::size_t, bool backslashEscapes);
299 | |
// Converts a std::string into a RESULT string under the ENCODING template
// argument; backslashEscapes is presumably forwarded to the per-character
// decoder -- confirm against the definition, which is not in this header.
// Explicit instantiations provided by another translation unit:
//   std::string    from LATIN_1
//   std::u16string from UTF_8
//   std::u32string from UTF_8
template <typename RESULT, Encoding ENCODING>
RESULT DecodeString(const std::string &, bool backslashEscapes);
extern template std::string DecodeString<std::string, Encoding::LATIN_1>(
    const std::string &, bool);
extern template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
    const std::string &, bool);
extern template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
    const std::string &, bool);
308 | } // namespace Fortran::parser |
309 | #endif // FORTRAN_PARSER_CHARACTERS_H_ |
310 |
Warning: This file is not a C or C++ file. It does not have highlighting.