characters.cpp source code [flang/lib/Parser/characters.cpp]

1	//===-- lib/Parser/characters.cpp -----------------------------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8
9	#include "flang/Parser/characters.h"
10	#include "flang/Common/idioms.h"
11	#include <algorithm>
12	#include <cstddef>
13	#include <optional>
14	#include <type_traits>
15
16	namespace Fortran::parser {
17
// Namespace-scope flag, initialized to false.  Nothing in this file reads it.
// NOTE(review): presumably consulted by the quoting helpers declared in
// flang/Parser/characters.h to select hexadecimal escapes -- confirm there.
18	bool useHexadecimalEscapeSequences{false};
19
// Reports how many bytes (1 through 6) a UTF-8 sequence occupies, judged only
// from the high-order bits of its first byte.  Only *p is read; the bytes
// that follow are neither examined nor validated.  A byte matching none of
// the first five lead-byte patterns -- which includes a stray continuation
// byte (10xxxxxx) -- is reported as 6.
int UTF_8CharacterBytes(const char *p) {
  // The masks keep only low-order bits of the promoted value, so the tests
  // give the same answer whether plain char is signed or unsigned.
  if ((*p & 0x80) == 0) {
    return 1; // 0xxxxxxx
  } else if ((*p & 0xe0) == 0xc0) {
    return 2; // 110xxxxx
  } else if ((*p & 0xf0) == 0xe0) {
    return 3; // 1110xxxx
  } else if ((*p & 0xf8) == 0xf0) {
    return 4; // 11110xxx
  } else if ((*p & 0xfc) == 0xf8) {
    return 5; // 111110xx
  } else {
    return 6; // everything else
  }
}
35
36	template <typename STRING>
37	std::string QuoteCharacterLiteralHelper(
38	const STRING &str, bool backslashEscapes, Encoding encoding) {
39	std::string result{`'"'`};
40	const auto emit{[&](char ch) { result += ch; }};
41	for (auto ch : str) {
42	using CharT = std::decay_t<decltype(ch)>;
43	char32_t ch32{static_cast<std::make_unsigned_t<CharT>>(ch)};
44	if (ch32 == static_cast<unsigned char>(`'"'`)) {
45	emit(`'"'`); // double the " when it appears in the text
46	}
47	EmitQuotedChar(ch32, emit, emit, backslashEscapes, encoding);
48	}
49	result += `'"'`;
50	return result;
51	}
52
53	std::string QuoteCharacterLiteral(
54	const std::string &str, bool backslashEscapes, Encoding encoding) {
55	return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
56	}
57
58	std::string QuoteCharacterLiteral(
59	const std::u16string &str, bool backslashEscapes, Encoding encoding) {
60	return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
61	}
62
63	std::string QuoteCharacterLiteral(
64	const std::u32string &str, bool backslashEscapes, Encoding encoding) {
65	return QuoteCharacterLiteralHelper(str, backslashEscapes, encoding);
66	}
67
68	template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t ucs) {
69	CHECK(ucs <= `0xff`);
70	EncodedCharacter result;
71	result.buffer[`0`] = ucs;
72	result.bytes = `1`;
73	return result;
74	}
75
76	template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t ucs) {
77	// N.B. char32_t is unsigned
78	EncodedCharacter result;
79	if (ucs <= `0x7f`) {
80	result.buffer[`0`] = ucs;
81	result.bytes = `1`;
82	} else if (ucs <= `0x7ff`) {
83	result.buffer[`0`] = `0xc0` \| (ucs >> `6`);
84	result.buffer[`1`] = `0x80` \| (ucs & `0x3f`);
85	result.bytes = `2`;
86	} else if (ucs <= `0xffff`) {
87	result.buffer[`0`] = `0xe0` \| (ucs >> `12`);
88	result.buffer[`1`] = `0x80` \| ((ucs >> `6`) & `0x3f`);
89	result.buffer[`2`] = `0x80` \| (ucs & `0x3f`);
90	result.bytes = `3`;
91	} else if (ucs <= `0x1fffff`) {
92	// UCS actually only goes up to 0x10ffff, but the
93	// UTF-8 encoding can handle 32 bits.
94	result.buffer[`0`] = `0xf0` \| (ucs >> `18`);
95	result.buffer[`1`] = `0x80` \| ((ucs >> `12`) & `0x3f`);
96	result.buffer[`2`] = `0x80` \| ((ucs >> `6`) & `0x3f`);
97	result.buffer[`3`] = `0x80` \| (ucs & `0x3f`);
98	result.bytes = `4`;
99	} else if (ucs <= `0x3ffffff`) {
100	result.buffer[`0`] = `0xf8` \| (ucs >> `24`);
101	result.buffer[`1`] = `0x80` \| ((ucs >> `18`) & `0x3f`);
102	result.buffer[`2`] = `0x80` \| ((ucs >> `12`) & `0x3f`);
103	result.buffer[`3`] = `0x80` \| ((ucs >> `6`) & `0x3f`);
104	result.buffer[`4`] = `0x80` \| (ucs & `0x3f`);
105	result.bytes = `5`;
106	} else {
107	result.buffer[`0`] = `0xfc` \| (ucs >> `30`);
108	result.buffer[`1`] = `0x80` \| ((ucs >> `24`) & `0x3f`);
109	result.buffer[`2`] = `0x80` \| ((ucs >> `18`) & `0x3f`);
110	result.buffer[`3`] = `0x80` \| ((ucs >> `12`) & `0x3f`);
111	result.buffer[`4`] = `0x80` \| ((ucs >> `6`) & `0x3f`);
112	result.buffer[`5`] = `0x80` \| (ucs & `0x3f`);
113	result.bytes = `6`;
114	}
115	return result;
116	}
117
118	EncodedCharacter EncodeCharacter(Encoding encoding, char32_t ucs) {
119	switch (encoding) {
120	SWITCH_COVERS_ALL_CASES
121	case Encoding::LATIN_1:
122	return EncodeCharacter<Encoding::LATIN_1>(ucs);
123	case Encoding::UTF_8:
124	return EncodeCharacter<Encoding::UTF_8>(ucs);
125	}
126	}
127
128	template <Encoding ENCODING, typename STRING>
129	std::string EncodeString(const STRING &str) {
130	std::string result;
131	for (auto ch : str) {
132	char32_t uch{static_cast<std::make_unsigned_t<decltype(ch)>>(ch)};
133	EncodedCharacter encoded{EncodeCharacter<ENCODING>(uch)};
134	result.append(encoded.buffer, static_cast<std::size_t>(encoded.bytes));
135	}
136	return result;
137	}
138
139	template std::string EncodeString<Encoding::LATIN_1, std::string>(
140	const std::string &);
141	template std::string EncodeString<Encoding::UTF_8, std::u16string>(
142	const std::u16string &);
143	template std::string EncodeString<Encoding::UTF_8, std::u32string>(
144	const std::u32string &);
145
146	template <>
147	DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
148	const char *cp, std::size_t bytes) {
149	if (bytes >= `1`) {
150	return {*reinterpret_cast<const std::uint8_t *>(cp), `1`};
151	} else {
152	return {};
153	}
154	}
155
156	template <>
157	DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(
158	const char *cp, std::size_t bytes) {
159	auto p{reinterpret_cast<const std::uint8_t *>(cp)};
160	char32_t ch{*p};
161	if (ch <= `0x7f`) {
162	return {ch, `1`};
163	} else if ((ch & `0xf8`) == `0xf0` && bytes >= `4` && ch > `0xf0` &&
164	((p[`1`] \| p[`2`] \| p[`3`]) & `0xc0`) == `0x80`) {
165	ch = ((ch & `7`) << `6`) \| (p[`1`] & `0x3f`);
166	ch = (ch << `6`) \| (p[`2`] & `0x3f`);
167	ch = (ch << `6`) \| (p[`3`] & `0x3f`);
168	return {ch, `4`};
169	} else if ((ch & `0xf0`) == `0xe0` && bytes >= `3` && ch > `0xe0` &&
170	((p[`1`] \| p[`2`]) & `0xc0`) == `0x80`) {
171	ch = ((ch & `0xf`) << `6`) \| (p[`1`] & `0x3f`);
172	ch = (ch << `6`) \| (p[`2`] & `0x3f`);
173	return {ch, `3`};
174	} else if ((ch & `0xe0`) == `0xc0` && bytes >= `2` && ch > `0xc0` &&
175	(p[`1`] & `0xc0`) == `0x80`) {
176	ch = ((ch & `0x1f`) << `6`) \| (p[`1`] & `0x3f`);
177	return {ch, `2`};
178	} else {
179	return {}; // not valid UTF-8
180	}
181	}
182
183	static DecodedCharacter DecodeEscapedCharacter(
184	const char *cp, std::size_t bytes) {
185	if (cp[`0`] == `'\\'` && bytes >= `2`) {
186	if (std::optional<char> escChar{BackslashEscapeValue(cp[`1`])}) {
187	return {static_cast<unsigned char>(*escChar), `2`};
188	} else if (IsOctalDigit(cp[`1`])) {
189	std::size_t maxLen{std::min(a: std::size_t{`4`}, b: bytes)};
190	char32_t code{static_cast<char32_t>(DecimalDigitValue(cp[`1`]))};
191	std::size_t len{`2`}; // so far
192	for (; code <= `037` && len < maxLen && IsOctalDigit(cp[len]); ++len) {
193	code = `8` * code + DecimalDigitValue(cp[len]);
194	}
195	return {code, static_cast<int>(len)};
196	} else if (bytes >= `4` && ToLowerCaseLetter(cp[`1`]) == `'x'` &&
197	IsHexadecimalDigit(cp[`2`]) && IsHexadecimalDigit(cp[`3`])) {
198	return {static_cast<char32_t>(`16` * HexadecimalDigitValue(cp[`2`]) +
199	HexadecimalDigitValue(cp[`3`])),
200	`4`};
201	} else if (IsLetter(cp[`1`])) {
202	// Unknown escape - ignore the '\' (PGI compatibility)
203	return {static_cast<unsigned char>(cp[`1`]), `2`};
204	} else {
205	// Not an escape character.
206	return {`'\\'`, `1`};
207	}
208	}
209	return {static_cast<unsigned char>(cp[`0`]), `1`};
210	}
211
212	template <Encoding ENCODING>
213	static DecodedCharacter DecodeEscapedCharacters(
214	const char *cp, std::size_t bytes) {
215	char buffer[EncodedCharacter::maxEncodingBytes];
216	int count[EncodedCharacter::maxEncodingBytes];
217	std::size_t at{`0`}, len{`0`};
218	for (; len < EncodedCharacter::maxEncodingBytes && at < bytes; ++len) {
219	DecodedCharacter code{DecodeEscapedCharacter(cp + at, bytes - at)};
220	buffer[len] = code.codepoint;
221	at += code.bytes;
222	count[len] = at;
223	}
224	DecodedCharacter code{DecodeCharacter<ENCODING>(buffer, len, false)};
225	if (code.bytes > `0`) {
226	code.bytes = count[code.bytes - `1`];
227	} else {
228	code.codepoint = buffer[`0`] & `0xff`;
229	code.bytes = count[`0`];
230	}
231	return code;
232	}
233
234	template <Encoding ENCODING>
235	DecodedCharacter DecodeCharacter(
236	const char cp, std::size_t bytes, bool* backslashEscapes) {
237	if (backslashEscapes && bytes >= `2` && *cp == `'\\'`) {
238	if (ENCODING == Encoding::UTF_8 && bytes >= `6` &&
239	ToLowerCaseLetter(cp[`1`]) == `'u'` && IsHexadecimalDigit(cp[`2`]) &&
240	IsHexadecimalDigit(cp[`3`]) && IsHexadecimalDigit(cp[`4`]) &&
241	IsHexadecimalDigit(cp[`5`])) {
242	char32_t ch{
243	static_cast<char32_t>(`4096` * HexadecimalDigitValue(cp[`2`]) +
244	`256` * HexadecimalDigitValue(cp[`3`]) +
245	`16` * HexadecimalDigitValue(cp[`4`]) + HexadecimalDigitValue(cp[`5`])),
246	};
247	if (bytes >= `10` && IsHexadecimalDigit(cp[`6`]) &&
248	IsHexadecimalDigit(cp[`7`]) && IsHexadecimalDigit(cp[`8`]) &&
249	IsHexadecimalDigit(cp[`9`])) {
250	return {(ch << `16`) \|
251	(`4096` * HexadecimalDigitValue(cp[`6`]) +
252	`256` * HexadecimalDigitValue(cp[`7`]) +
253	`16` * HexadecimalDigitValue(cp[`8`]) +
254	HexadecimalDigitValue(cp[`9`])),
255	`10`};
256	} else {
257	return {ch, `6`};
258	}
259	} else {
260	return DecodeEscapedCharacters<ENCODING>(cp, bytes);
261	}
262	} else {
263	return DecodeRawCharacter<ENCODING>(cp, bytes);
264	}
265	}
266
267	template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
268	const char , std::size_t, bool*);
269	template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
270	const char , std::size_t, bool*);
271
272	DecodedCharacter DecodeCharacter(Encoding encoding, const char *cp,
273	std::size_t bytes, bool backslashEscapes) {
274	switch (encoding) {
275	SWITCH_COVERS_ALL_CASES
276	case Encoding::LATIN_1:
277	return DecodeCharacter<Encoding::LATIN_1>(cp, bytes, backslashEscapes);
278	case Encoding::UTF_8:
279	return DecodeCharacter<Encoding::UTF_8>(cp, bytes, backslashEscapes);
280	}
281	}
282
283	template <typename RESULT, Encoding ENCODING>
284	RESULT DecodeString(const std::string &s, bool backslashEscapes) {
285	RESULT result;
286	const char *p{s.c_str()};
287	for (auto bytes{s.size()}; bytes != `0`;) {
288	DecodedCharacter decoded{
289	DecodeCharacter<ENCODING>(p, bytes, backslashEscapes)};
290	if (decoded.bytes > `0`) {
291	if (static_cast<std::size_t>(decoded.bytes) <= bytes) {
292	result.append(`1`, decoded.codepoint);
293	bytes -= decoded.bytes;
294	p += decoded.bytes;
295	continue;
296	}
297	}
298	result.append(`1`, static_cast<uint8_t>(*p));
299	++p;
300	--bytes;
301	}
302	return result;
303	}
304
305	template std::string DecodeString<std::string, Encoding::LATIN_1>(
306	const std::string &, bool);
307	template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
308	const std::string &, bool);
309	template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
310	const std::string &, bool);
311	} // namespace Fortran::parser
312

Provided by KDAB

Improve your Profiling and Debugging skills

Find out more

Definitions

source code of flang/lib/Parser/characters.cpp