Warning: This file is not a C or C++ file. It does not have highlighting.

1//===-- include/flang/Parser/characters.h -----------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#ifndef FORTRAN_PARSER_CHARACTERS_H_
10#define FORTRAN_PARSER_CHARACTERS_H_
11
12// Define some character classification predicates and
13// conversions here to avoid dependences upon <cctype> and
14// also to accommodate Fortran tokenization.
15
16#include <cstddef>
17#include <cstdint>
18#include <optional>
19#include <string>
20
21namespace Fortran::parser {
22
// Run-time switch whose definition lives outside this header.
// EmitQuotedChar reads it: when it is true (and backslash escapes are
// enabled), bytes that have no named escape are written as \xhh and code
// points above 0x7f as \u followed by hexadecimal digits; when it is false,
// such bytes are written as three-digit octal escapes instead.
23extern bool useHexadecimalEscapeSequences;
24
// Fortran program source can be supported in any character set whose first
// 128 code points coincide with ASCII codes 0-127 (ISO/IEC 646).
// The specific encodings that are handled are enumerated here.
enum class Encoding {
  LATIN_1, // ISO 8859-1 Latin-1
  UTF_8, // multi-byte encoding of Unicode (ISO/IEC 10646)
};
31
// True exactly when ch lies in the ASCII range 'A' through 'Z'.
inline constexpr bool IsUpperCaseLetter(char ch) {
  return !(ch < 'A' || ch > 'Z');
}
35
// True exactly when ch lies in the ASCII range 'a' through 'z'.
inline constexpr bool IsLowerCaseLetter(char ch) {
  return !(ch < 'a' || ch > 'z');
}
39
40inline constexpr bool IsLetter(char ch) {
41 return IsUpperCaseLetter(ch) || IsLowerCaseLetter(ch);
42}
43
// True exactly when ch is one of the characters '0' through '9'.
inline constexpr bool IsDecimalDigit(char ch) {
  return !(ch < '0' || ch > '9');
}
45
// True when ch is a decimal digit or one of the letters A-F in either case.
inline constexpr bool IsHexadecimalDigit(char ch) {
  if ('0' <= ch && ch <= '9') {
    return true;
  }
  if ('A' <= ch && ch <= 'F') {
    return true;
  }
  return 'a' <= ch && ch <= 'f';
}
50
// True exactly when ch is one of the characters '0' through '7'.
inline constexpr bool IsOctalDigit(char ch) {
  return !(ch < '0' || ch > '7');
}
52
53inline constexpr bool IsLegalIdentifierStart(char ch) {
54 return IsLetter(ch) || ch == '_' || ch == '@' || ch == '$';
55}
56
57inline constexpr bool IsLegalInIdentifier(char ch) {
58 return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch);
59}
60
// True for the characters from space (' ') through tilde ('~') inclusive.
inline constexpr bool IsPrintable(char ch) {
  return !(ch < ' ' || ch > '~');
}
62
// True for space, horizontal tab, newline, vertical tab, form feed, and
// carriage return; false for everything else.
inline constexpr bool IsWhiteSpace(char ch) {
  switch (ch) {
  case ' ':
  case '\t':
  case '\n':
  case '\v':
  case '\f':
  case '\r':
    return true;
  default:
    return false;
  }
}
67
68inline constexpr char ToLowerCaseLetter(char ch) {
69 return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
70}
71
72inline constexpr char ToLowerCaseLetter(char &&ch) {
73 return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
74}
75
76inline std::string ToLowerCaseLetters(std::string_view str) {
77 std::string lowered{str};
78 for (char &ch : lowered) {
79 ch = ToLowerCaseLetter(ch);
80 }
81 return lowered;
82}
83
84inline constexpr char ToUpperCaseLetter(char ch) {
85 return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
86}
87
88inline constexpr char ToUpperCaseLetter(char &&ch) {
89 return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
90}
91
92inline std::string ToUpperCaseLetters(std::string_view str) {
93 std::string raised{str};
94 for (char &ch : raised) {
95 ch = ToUpperCaseLetter(ch);
96 }
97 return raised;
98}
99
100inline constexpr bool IsSameApartFromCase(char x, char y) {
101 return ToLowerCaseLetter(x) == ToLowerCaseLetter(y);
102}
103
// Returns the offset of ch from '0'.  No check is made that ch is actually a
// decimal digit; callers are expected to have verified that already.
inline constexpr char DecimalDigitValue(char ch) {
  return static_cast<char>(ch - '0');
}
105
106inline constexpr char HexadecimalDigitValue(char ch) {
107 return IsUpperCaseLetter(ch) ? ch - 'A' + 10
108 : IsLowerCaseLetter(ch) ? ch - 'a' + 10
109 : DecimalDigitValue(ch);
110}
111
// Given the character that follows a backslash in an escape sequence,
// returns the character that the sequence denotes, or std::nullopt when the
// letter is not a recognized escape.  Quotes and the backslash denote
// themselves.
inline constexpr std::optional<char> BackslashEscapeValue(char ch) {
  switch (ch) {
  case '"':
  case '\'':
  case '\\':
    return ch;
  case 'b':
    return '\b';
  case 'f':
    return '\f';
  case 'n':
    return '\n';
  case 'r':
    return '\r';
  case 't':
    return '\t';
  case 'v':
    return '\v';
  case 'a': // would be '\a', but PGF90 doesn't know \a
  default:
    return std::nullopt;
  }
}
136
// Inverse of BackslashEscapeValue: given a character value, returns the
// character to write after a backslash in order to denote it, or
// std::nullopt when the value has no named escape.  Quotes and the backslash
// are written as themselves.
inline constexpr std::optional<char> BackslashEscapeChar(char ch) {
  switch (ch) {
  case '"':
  case '\'':
  case '\\':
    return ch;
  case '\b':
    return 'b';
  case '\f':
    return 'f';
  case '\n':
    return 'n';
  case '\r':
    return 'r';
  case '\t':
    return 't';
  case '\v':
    return 'v';
  case '\a': // would be 'a', but PGF90 doesn't know \a
  default:
    return std::nullopt;
  }
}
161
162// Does not include spaces or line ending characters.
163inline constexpr bool IsValidFortranTokenCharacter(char ch) {
164 switch (ch) {
165 case '"':
166 case '%':
167 case '\'':
168 case '(':
169 case ')':
170 case '*':
171 case '+':
172 case ',':
173 case '-':
174 case '.':
175 case '/':
176 case ':':
177 case ';':
178 case '<':
179 case '=':
180 case '>':
181 case '[':
182 case ']':
183 return true;
184 default:
185 return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch);
186 }
187}
188
// The bytes produced by encoding a single code point.
// Only the first "bytes" elements of "buffer" are meaningful; consumers such
// as EmitQuotedChar iterate over exactly that many.
struct EncodedCharacter {
  static constexpr int maxEncodingBytes{6}; // capacity of buffer
  // Zero-initialized so that a default-constructed object never exposes
  // indeterminate bytes and can be created in a constant expression.
  char buffer[maxEncodingBytes]{};
  int bytes{0}; // count of valid bytes in buffer
};
194
// Conversion of one code point (ucs) to an EncodedCharacter, with the
// encoding selected at compile time.  Only declarations appear here: the
// primary template and an explicit specialization for each Encoding value;
// the definitions are not part of this header.
// NOTE(review): presumably the result carries the bytes of ucs in ENCODING
// together with their count -- confirm against the implementation.
195template <Encoding ENCODING> EncodedCharacter EncodeCharacter(char32_t ucs);
196template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t);
197template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t);
198
// Overload that receives the Encoding as a run-time argument rather than as
// a template argument; this is the form that EmitQuotedChar calls.
// Declared only; the definition is not part of this header.
199EncodedCharacter EncodeCharacter(Encoding, char32_t ucs);
200
// Produces a std::string from a STRING under the encoding ENCODING.
// The two "extern template" lines are explicit instantiation declarations:
// <LATIN_1, std::string> and <UTF_8, std::u32string> are instantiated in
// some other translation unit rather than implicitly by each includer.
// NOTE(review): presumably each element of the argument is encoded in turn
// and the bytes concatenated -- confirm against the implementation.
201template <Encoding ENCODING, typename STRING>
202std::string EncodeString(const STRING &);
203extern template std::string EncodeString<Encoding::LATIN_1, std::string>(
204 const std::string &);
205extern template std::string EncodeString<Encoding::UTF_8, std::u32string>(
206 const std::u32string &);
207
208// EmitQuotedChar drives callbacks "emit" and "insert" to output the
209// bytes of an encoding for a codepoint.
210template <typename NORMAL, typename INSERTED>
211void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
212 bool backslashEscapes = true, Encoding encoding = Encoding::UTF_8) {
213 auto emitOneByte{[&](std::uint8_t ch) {
214 if (backslashEscapes && (ch < ' ' || ch >= 0x7f || ch == '\\')) {
215 if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
216 insert('\\');
217 emit(*escape);
218 } else if (useHexadecimalEscapeSequences) {
219 insert('\\');
220 insert('x');
221 int top{ch >> 4}, bottom{ch & 0xf};
222 insert(top > 9 ? 'a' + top - 10 : '0' + top);
223 insert(bottom > 9 ? 'a' + bottom - 10 : '0' + bottom);
224 } else {
225 // octal escape sequence; always emit 3 digits to avoid ambiguity
226 insert('\\');
227 insert('0' + (ch >> 6));
228 insert('0' + ((ch >> 3) & 7));
229 insert('0' + (ch & 7));
230 }
231 } else if (ch == '\n') { // always escape newlines
232 insert('\\');
233 insert('n');
234 } else {
235 emit(ch);
236 }
237 }};
238 if (ch <= 0x7f) {
239 emitOneByte(ch);
240 } else if (backslashEscapes && useHexadecimalEscapeSequences) {
241 insert('\\');
242 insert('u');
243 if (ch > 0xffff) {
244 unsigned c1{(ch >> 28) & 0xf}, c2{(ch >> 24) & 0xf}, c3{(ch >> 20) & 0xf},
245 c4{(ch >> 16) & 0xf};
246 insert(c1 > 9 ? 'a' + c1 - 10 : '0' + c1);
247 insert(c2 > 9 ? 'a' + c2 - 10 : '0' + c2);
248 insert(c3 > 9 ? 'a' + c3 - 10 : '0' + c3);
249 insert(c4 > 9 ? 'a' + c4 - 10 : '0' + c4);
250 }
251 unsigned c1{(ch >> 12) & 0xf}, c2{(ch >> 8) & 0xf}, c3{(ch >> 4) & 0xf},
252 c4{ch & 0xf};
253 insert(c1 > 9 ? 'a' + c1 - 10 : '0' + c1);
254 insert(c2 > 9 ? 'a' + c2 - 10 : '0' + c2);
255 insert(c3 > 9 ? 'a' + c3 - 10 : '0' + c3);
256 insert(c4 > 9 ? 'a' + c4 - 10 : '0' + c4);
257 } else {
258 EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
259 for (int j{0}; j < encoded.bytes; ++j) {
260 emitOneByte(encoded.buffer[j]);
261 }
262 }
263}
264
// Three overloads, one per string width (std::string, std::u16string,
// std::u32string), each returning a std::string.  backslashEscapes defaults
// to true in all of them; the default Encoding is LATIN_1 for std::string
// and UTF_8 for the two wider string types.  Declared only; the definitions
// are not part of this header.
// NOTE(review): presumably the result is the quoted character-literal text
// for the argument, built with EmitQuotedChar -- confirm against the
// implementation.
265std::string QuoteCharacterLiteral(const std::string &,
266 bool backslashEscapes = true, Encoding = Encoding::LATIN_1);
267std::string QuoteCharacterLiteral(const std::u16string &,
268 bool backslashEscapes = true, Encoding = Encoding::UTF_8);
269std::string QuoteCharacterLiteral(const std::u32string &,
270 bool backslashEscapes = true, Encoding = Encoding::UTF_8);
271
// Declared only; the definition is not part of this header.
// NOTE(review): judging by the name, this returns the length in bytes of the
// UTF-8 sequence that begins at the given pointer -- confirm, including what
// is returned for an invalid leading byte.
272int UTF_8CharacterBytes(const char *);
273
// Value returned by the Decode*Character functions declared in this header:
// a code point together with a byte count.  A default-constructed object has
// both members equal to zero.
struct DecodedCharacter {
  char32_t codepoint{};
  int bytes{}; // zero signifies failure
};
278
// Decoding of one character from a byte pointer and a size, with the
// encoding selected at compile time.  Only declarations appear here: the
// primary template and an explicit specialization for each Encoding value;
// the definitions are not part of this header.
// NOTE(review): presumably the std::size_t is the number of bytes available
// at the pointer, and "raw" means backslash escapes are not interpreted
// (compare DecodeCharacter) -- confirm against the implementation.
279template <Encoding ENCODING>
280DecodedCharacter DecodeRawCharacter(const char *, std::size_t);
281template <>
282DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
283 const char *, std::size_t);
284
285template <>
286DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(const char *, std::size_t);
287
288// DecodeCharacter optionally handles backslash escape sequences, too.
// The encoding is a template argument here.  The two "extern template"
// lines are explicit instantiation declarations, so the LATIN_1 and UTF_8
// versions are instantiated in some other translation unit.
// NOTE(review): presumably the std::size_t is the number of bytes available
// at the pointer -- confirm against the implementation.
289template <Encoding ENCODING>
290DecodedCharacter DecodeCharacter(
291 const char *, std::size_t, bool backslashEscapes);
292extern template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
293 const char *, std::size_t, bool);
294extern template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
295 const char *, std::size_t, bool);
296
// Overload that receives the Encoding as a run-time argument rather than as
// a template argument.  Declared only; the definition is not part of this
// header.
297DecodedCharacter DecodeCharacter(
298 Encoding, const char *, std::size_t, bool backslashEscapes);
299
// Produces a RESULT string from a std::string of bytes under ENCODING.
// The "extern template" lines are explicit instantiation declarations for
// the three supported combinations: std::string with LATIN_1, and
// std::u16string / std::u32string with UTF_8.
// NOTE(review): presumably backslashEscapes selects whether escape sequences
// in the input are interpreted, as in DecodeCharacter -- confirm against the
// implementation.
300template <typename RESULT, Encoding ENCODING>
301RESULT DecodeString(const std::string &, bool backslashEscapes);
302extern template std::string DecodeString<std::string, Encoding::LATIN_1>(
303 const std::string &, bool);
304extern template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
305 const std::string &, bool);
306extern template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
307 const std::string &, bool);
308} // namespace Fortran::parser
309#endif // FORTRAN_PARSER_CHARACTERS_H_
310

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of flang/include/flang/Parser/characters.h