charset.cc source code [libcpp/charset.cc]

/* CPP Library - charsets
   Copyright (C) 1998-2024 Free Software Foundation, Inc.

   Broken out of c-lex.cc Apr 2003, adding valid C99 UCN ranges.

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
19
20	#include "config.h"
21	#include "system.h"
22	#include "cpplib.h"
23	#include "internal.h"
24
/* Character set handling for C-family languages.

   Terminological note: In what follows, "charset" or "character set"
   will be taken to mean both an abstract set of characters and an
   encoding for that set.

   The C99 standard discusses two character sets: source and execution.
   The source character set is used for internal processing in translation
   phases 1 through 4; the execution character set is used thereafter.
   Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
   character encodings (see 3.7.2, 3.7.3 for the standardese meanings
   of these terms).  Furthermore, the "basic character set" (listed in
   5.2.1p3) is to be encoded in each with values one byte wide, and is
   to appear in the initial shift state.

   It is not explicitly mentioned, but there is also a "wide execution
   character set" used to encode wide character constants and wide
   string literals; this is supposed to be the result of applying the
   standard library function mbstowcs() to an equivalent narrow string
   (6.4.5p5).  However, the behavior of hexadecimal and octal
   \-escapes is at odds with this; they are supposed to be translated
   directly to wchar_t values (6.4.4.4p5,6).

   The source character set is not necessarily the character set used
   to encode physical source files on disk; translation phase 1 converts
   from whatever that encoding is to the source character set.

   The presence of universal character names in C99 (6.4.3 et seq.)
   forces the source character set to be isomorphic to ISO 10646,
   that is, Unicode.  There is no such constraint on the execution
   character set; note also that the conversion from source to
   execution character set does not occur for identifiers (5.1.1.2p1#5).

   For convenience of implementation, the source character set's
   encoding of the basic character set should be identical to the
   execution character set OF THE HOST SYSTEM's encoding of the basic
   character set, and it should not be a state-dependent encoding.

   cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
   depending on whether the host is based on ASCII or EBCDIC (see
   respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
   Technical Report #16).  With limited exceptions, it relies on the
   system library's iconv() primitive to do charset conversion
   (specified in SUSv2).  */
69
#if !HAVE_ICONV
/* Make certain that the uses of iconv(), iconv_open(), iconv_close()
   below, which are guarded only by if statements with compile-time
   constant conditions, do not cause link errors.  Each stub fails the
   way the real call would: errno is set to EINVAL and the error value
   of the corresponding function is produced.  */
#define iconv_open(x, y)      (errno = EINVAL, (iconv_t)-1)
#define iconv(a,b,c,d,e)      (errno = EINVAL, (size_t)-1)
#define iconv_close(x)        (void)0
#define ICONV_CONST
#endif

/* Name of the source character set, and the largest byte value that
   cpp_host_to_exec_charset accepts as possibly belonging to the basic
   source character set, both chosen by the host's basic charset.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
#define SOURCE_CHARSET "UTF-8"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
#define SOURCE_CHARSET "UTF-EBCDIC"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
#error "Unrecognized basic host character set"
#endif

/* Hosts without EILSEQ report ill-formed input as EINVAL instead.  */
#ifndef EILSEQ
#define EILSEQ EINVAL
#endif
93
94	/ This structure is used for a resizable string buffer throughout. /
95	/ Don't call it strbuf, as that conflicts with unistd.h on systems*
96	such as DYNIX/ptx where unistd.h includes stropts.h. /*
97	struct _cpp_strbuf
98	{
99	uchar *text;
100	size_t asize;
101	size_t len;
102	};
103
/* This is enough to hold any string that fits on a single 80-column
   line, even if iconv quadruples its size (e.g. conversion from
   ASCII to UTF-32) rounded up to a power of two.  It is also the
   increment by which output buffers are grown.  */
#define OUTBUF_BLOCK_SIZE 256
108
/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
   logic.  This is because a depressing number of systems lack iconv,
   or have iconv libraries that do not do these conversions, so
   we need a fallback implementation for them.  To ensure the fallback
   doesn't break due to neglect, it is used on all systems.

   UTF-32 encoding is nice and simple: a four-byte binary number,
   constrained to the range 00000000-7FFFFFFF to avoid questions of
   signedness.  We do have to cope with big- and little-endian
   variants.

   UTF-16 encoding uses two-byte binary numbers, again in big- and
   little-endian variants, for all values in the 00000000-0000FFFF
   range.  Values in the 00010000-0010FFFF range are encoded as pairs
   of two-byte numbers, called "surrogate pairs": given a number S in
   this range, it is mapped to a pair (H, L) as follows:

     H = (S - 0x10000) / 0x400 + 0xD800
     L = (S - 0x10000) % 0x400 + 0xDC00

   Two-byte values in the D800...DFFF range are ill-formed except as a
   component of a surrogate pair.  Even if the encoding within a
   two-byte value is little-endian, the H member of the surrogate pair
   comes first.

   There is no way to encode values in the 00110000-7FFFFFFF range,
   which is not currently a problem as there are no assigned code
   points in that range; however, the author expects that it will
   eventually become necessary to abandon UTF-16 due to this
   limitation.  Note also that, because of these pairs, UTF-16 does
   not meet the requirements of the C standard for a wide character
   encoding (see 3.7.3 and 6.4.4.4p11).

   UTF-8 encoding looks like this:

   value range         encoded as
   00000000-0000007F   0xxxxxxx
   00000080-000007FF   110xxxxx 10xxxxxx
   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
   never occur.  Note also that any value that can be encoded by a
   given row of the table can also be encoded by all successive rows,
   but this is not done; only the shortest possible encoding for any
   given value is valid.  For instance, the character 07C0 could be
   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
   FC 80 80 80 9F 80.  Only the first is valid.

   An implementation note: the transformation from UTF-16 to UTF-8, or
   vice versa, is easiest done by using UTF-32 as an intermediary.  */
163
164	/ Internal primitives which go from an UTF-8 byte stream to native-endian*
165	UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
166	operation in several places below. /*
167	static inline int
168	one_utf8_to_cppchar (const uchar *inbufp, size_t inbytesleftp,
169	cppchar_t *cp)
170	{
171	static const uchar masks[`6`] = { `0x7F`, `0x1F`, `0x0F`, `0x07`, `0x03`, `0x01` };
172	static const uchar patns[`6`] = { `0x00`, `0xC0`, `0xE0`, `0xF0`, `0xF8`, `0xFC` };
173
174	cppchar_t c;
175	const uchar inbuf = inbufp;
176	size_t nbytes, i;
177
178	if (*inbytesleftp < `1`)
179	return EINVAL;
180
181	c = *inbuf;
182	if (c < `0x80`)
183	{
184	*cp = c;
185	*inbytesleftp -= `1`;
186	*inbufp += `1`;
187	return `0`;
188	}
189
190	/ The number of leading 1-bits in the first byte indicates how many*
191	bytes follow. /*
192	for (nbytes = `2`; nbytes < `7`; nbytes++)
193	if ((c & ~masks[nbytes-`1`]) == patns[nbytes-`1`])
194	goto found;
195	return EILSEQ;
196	found:
197
198	if (*inbytesleftp < nbytes)
199	return EINVAL;
200
201	c = (c & masks[nbytes-`1`]);
202	inbuf++;
203	for (i = `1`; i < nbytes; i++)
204	{
205	cppchar_t n = *inbuf++;
206	if ((n & `0xC0`) != `0x80`)
207	return EILSEQ;
208	c = ((c << `6`) + (n & `0x3F`));
209	}
210
211	/ Make sure the shortest possible encoding was used. /
212	if (c <= `0x7F` && nbytes > `1`) return EILSEQ;
213	if (c <= `0x7FF` && nbytes > `2`) return EILSEQ;
214	if (c <= `0xFFFF` && nbytes > `3`) return EILSEQ;
215	if (c <= `0x1FFFFF` && nbytes > `4`) return EILSEQ;
216	if (c <= `0x3FFFFFF` && nbytes > `5`) return EILSEQ;
217
218	/ Make sure the character is valid. /
219	if (c > `0x7FFFFFFF` \|\| (c >= `0xD800` && c <= `0xDFFF`)) return EILSEQ;
220
221	*cp = c;
222	*inbufp = inbuf;
223	*inbytesleftp -= nbytes;
224	return `0`;
225	}
226
227	static inline int
228	one_cppchar_to_utf8 (cppchar_t c, uchar *outbufp, size_t outbytesleftp)
229	{
230	static const uchar masks[`6`] = { `0x00`, `0xC0`, `0xE0`, `0xF0`, `0xF8`, `0xFC` };
231	static const uchar limits[`6`] = { `0x80`, `0xE0`, `0xF0`, `0xF8`, `0xFC`, `0xFE` };
232	size_t nbytes;
233	uchar buf[`6`], *p = &buf[`6`];
234	uchar outbuf = outbufp;
235
236	nbytes = `1`;
237	if (c < `0x80`)
238	*--p = c;
239	else
240	{
241	do
242	{
243	*--p = ((c & `0x3F`) \| `0x80`);
244	c >>= `6`;
245	nbytes++;
246	}
247	while (c >= `0x3F` \|\| (c & limits[nbytes-`1`]));
248	*--p = (c \| masks[nbytes-`1`]);
249	}
250
251	if (*outbytesleftp < nbytes)
252	return E2BIG;
253
254	while (p < &buf[`6`])
255	outbuf++ = p++;
256	*outbytesleftp -= nbytes;
257	*outbufp = outbuf;
258	return `0`;
259	}
260
/* The following four functions transform one character between the two
   encodings named in the function name.  All have the signature
   int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
           uchar **outbufp, size_t *outbytesleftp)

   BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
   interpreted as a boolean indicating whether big-endian or
   little-endian encoding is to be used for the member of the pair
   that is not UTF-8.

   INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
   do for iconv.

   The return value is either 0 for success, or an errno value for
   failure, which may be E2BIG (need more space), EILSEQ (ill-formed
   input sequence), or EINVAL (incomplete input sequence).  */
277
278	static inline int
279	one_utf8_to_utf32 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
280	uchar *outbufp, size_t outbytesleftp)
281	{
282	uchar *outbuf;
283	cppchar_t s = `0`;
284	int rval;
285
286	/ Check for space first, since we know exactly how much we need. /
287	if (*outbytesleftp < `4`)
288	return E2BIG;
289
290	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, cp: &s);
291	if (rval)
292	return rval;
293
294	outbuf = *outbufp;
295	outbuf[bigend ? `3` : `0`] = (s & `0x000000FF`);
296	outbuf[bigend ? `2` : `1`] = (s & `0x0000FF00`) >> `8`;
297	outbuf[bigend ? `1` : `2`] = (s & `0x00FF0000`) >> `16`;
298	outbuf[bigend ? `0` : `3`] = (s & `0xFF000000`) >> `24`;
299
300	*outbufp += `4`;
301	*outbytesleftp -= `4`;
302	return `0`;
303	}
304
305	static inline int
306	one_utf32_to_utf8 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
307	uchar *outbufp, size_t outbytesleftp)
308	{
309	cppchar_t s;
310	int rval;
311	const uchar *inbuf;
312
313	if (*inbytesleftp < `4`)
314	return EINVAL;
315
316	inbuf = *inbufp;
317
318	s = inbuf[bigend ? `0` : `3`] << `24`;
319	s += inbuf[bigend ? `1` : `2`] << `16`;
320	s += inbuf[bigend ? `2` : `1`] << `8`;
321	s += inbuf[bigend ? `3` : `0`];
322
323	if (s >= `0x7FFFFFFF` \|\| (s >= `0xD800` && s <= `0xDFFF`))
324	return EILSEQ;
325
326	rval = one_cppchar_to_utf8 (c: s, outbufp, outbytesleftp);
327	if (rval)
328	return rval;
329
330	*inbufp += `4`;
331	*inbytesleftp -= `4`;
332	return `0`;
333	}
334
335	static inline int
336	one_utf8_to_utf16 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
337	uchar *outbufp, size_t outbytesleftp)
338	{
339	int rval;
340	cppchar_t s = `0`;
341	const uchar save_inbuf = inbufp;
342	size_t save_inbytesleft = *inbytesleftp;
343	uchar outbuf = outbufp;
344
345	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, cp: &s);
346	if (rval)
347	return rval;
348
349	if (s > `0x0010FFFF`)
350	{
351	*inbufp = save_inbuf;
352	*inbytesleftp = save_inbytesleft;
353	return EILSEQ;
354	}
355
356	if (s <= `0xFFFF`)
357	{
358	if (*outbytesleftp < `2`)
359	{
360	*inbufp = save_inbuf;
361	*inbytesleftp = save_inbytesleft;
362	return E2BIG;
363	}
364	outbuf[bigend ? `1` : `0`] = (s & `0x00FF`);
365	outbuf[bigend ? `0` : `1`] = (s & `0xFF00`) >> `8`;
366
367	*outbufp += `2`;
368	*outbytesleftp -= `2`;
369	return `0`;
370	}
371	else
372	{
373	cppchar_t hi, lo;
374
375	if (*outbytesleftp < `4`)
376	{
377	*inbufp = save_inbuf;
378	*inbytesleftp = save_inbytesleft;
379	return E2BIG;
380	}
381
382	hi = (s - `0x10000`) / `0x400` + `0xD800`;
383	lo = (s - `0x10000`) % `0x400` + `0xDC00`;
384
385	/ Even if we are little-endian, put the high surrogate first.*
386	??? Matches practice? /*
387	outbuf[bigend ? `1` : `0`] = (hi & `0x00FF`);
388	outbuf[bigend ? `0` : `1`] = (hi & `0xFF00`) >> `8`;
389	outbuf[bigend ? `3` : `2`] = (lo & `0x00FF`);
390	outbuf[bigend ? `2` : `3`] = (lo & `0xFF00`) >> `8`;
391
392	*outbufp += `4`;
393	*outbytesleftp -= `4`;
394	return `0`;
395	}
396	}
397
398	static inline int
399	one_utf16_to_utf8 (iconv_t bigend, const uchar *inbufp, size_t inbytesleftp,
400	uchar *outbufp, size_t outbytesleftp)
401	{
402	cppchar_t s;
403	const uchar inbuf = inbufp;
404	int rval;
405
406	if (*inbytesleftp < `2`)
407	return EINVAL;
408	s = inbuf[bigend ? `0` : `1`] << `8`;
409	s += inbuf[bigend ? `1` : `0`];
410
411	/ Low surrogate without immediately preceding high surrogate is invalid. /
412	if (s >= `0xDC00` && s <= `0xDFFF`)
413	return EILSEQ;
414	/ High surrogate must have a following low surrogate. /
415	else if (s >= `0xD800` && s <= `0xDBFF`)
416	{
417	cppchar_t hi = s, lo;
418	if (*inbytesleftp < `4`)
419	return EINVAL;
420
421	lo = inbuf[bigend ? `2` : `3`] << `8`;
422	lo += inbuf[bigend ? `3` : `2`];
423
424	if (lo < `0xDC00` \|\| lo > `0xDFFF`)
425	return EILSEQ;
426
427	s = (hi - `0xD800`) * `0x400` + (lo - `0xDC00`) + `0x10000`;
428	}
429
430	rval = one_cppchar_to_utf8 (c: s, outbufp, outbytesleftp);
431	if (rval)
432	return rval;
433
434	/ Success - update the input pointers (one_cppchar_to_utf8 has done*
435	the output pointers for us). /*
436	if (s <= `0xFFFF`)
437	{
438	*inbufp += `2`;
439	*inbytesleftp -= `2`;
440	}
441	else
442	{
443	*inbufp += `4`;
444	*inbytesleftp -= `4`;
445	}
446	return `0`;
447	}
448
449
450	/ Special routine which just counts number of characters in the*
451	string, what exactly is stored into the output doesn't matter
452	as long as it is one uchar per character. /*
453
454	static inline int
455	one_count_chars (iconv_t, const uchar *inbufp, size_t inbytesleftp,
456	uchar *outbufp, size_t outbytesleftp)
457	{
458	cppchar_t s = `0`;
459	int rval;
460
461	/ Check for space first, since we know exactly how much we need. /
462	if (*outbytesleftp < `1`)
463	return E2BIG;
464
465	#if HOST_CHARSET == HOST_CHARSET_ASCII
466	rval = one_utf8_to_cppchar (inbufp, inbytesleftp, cp: &s);
467	if (rval)
468	return rval;
469	#else
470	if (*inbytesleftp < `1`)
471	return EINVAL;
472	static const uchar utf_ebcdic_map[`256`] = {
473	/ See table 4 in http://unicode.org/reports/tr16/tr16-7.2.html /
474	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
475	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
476	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
477	`0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0`,
478	`1`, `9`, `9`, `9`, `9`, `9`, `9`, `9`, `9`, `9`, `9`, `1`, `1`, `1`, `1`, `1`,
479	`1`, `9`, `9`, `9`, `9`, `9`, `9`, `9`, `9`, `9`, `1`, `1`, `1`, `1`, `1`, `1`,
480	`1`, `1`, `9`, `9`, `9`, `9`, `9`, `9`, `9`, `9`, `9`, `1`, `1`, `1`, `1`, `1`,
481	`9`, `9`, `9`, `9`, `2`, `2`, `2`, `2`, `2`, `1`, `1`, `1`, `1`, `1`, `1`, `1`,
482	`2`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `2`, `2`, `2`, `2`, `2`, `2`,
483	`2`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `2`, `2`, `2`, `2`, `2`, `2`,
484	`2`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `2`, `2`, `2`, `1`, `2`, `2`,
485	`2`, `2`, `2`, `2`, `2`, `2`, `2`, `3`, `3`, `3`, `3`, `3`, `3`, `1`, `3`, `3`,
486	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `3`, `3`, `3`, `3`, `3`, `3`,
487	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `3`, `3`, `4`, `4`, `4`, `4`,
488	`1`, `4`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `4`, `4`, `4`, `5`, `5`, `5`,
489	`1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `1`, `5`, `6`, `6`, `7`, `7`, `0`
490	};
491	rval = utf_ebcdic_map[**inbufp];
492	if (rval == `9`)
493	return EILSEQ;
494	if (rval == `0`)
495	rval = `1`;
496	if (rval >= `2`)
497	{
498	if (*inbytesleftp < rval)
499	return EINVAL;
500	for (int i = `1`; i < rval; ++i)
501	if (utf_ebcdic_map[(*inbufp)[i]] != `9`)
502	return EILSEQ;
503	}
504	*inbytesleftp -= rval;
505	*inbufp += rval;
506	#endif
507
508	**outbufp = `' '`;
509
510	*outbufp += `1`;
511	*outbytesleftp -= `1`;
512	return `0`;
513	}
514
515
516	/ Helper routine for the next few functions. The 'const' on*
517	one_conversion means that we promise not to modify what function is
518	pointed to, which lets the inliner see through it. /*
519
520	static inline bool
521	conversion_loop (int (*const one_conversion)(iconv_t, const uchar *, size_t ,
522	uchar *, size_t ),
523	iconv_t cd, const uchar from, size_t flen, struct* _cpp_strbuf *to)
524	{
525	const uchar *inbuf;
526	uchar *outbuf;
527	size_t inbytesleft, outbytesleft;
528	int rval;
529
530	inbuf = from;
531	inbytesleft = flen;
532	outbuf = to->text + to->len;
533	outbytesleft = to->asize - to->len;
534
535	for (;;)
536	{
537	do
538	rval = one_conversion (cd, &inbuf, &inbytesleft,
539	&outbuf, &outbytesleft);
540	while (inbytesleft && !rval);
541
542	if (__builtin_expect (inbytesleft == `0`, `1`))
543	{
544	to->len = to->asize - outbytesleft;
545	return true;
546	}
547	if (rval != E2BIG)
548	{
549	errno = rval;
550	return false;
551	}
552
553	outbytesleft += OUTBUF_BLOCK_SIZE;
554	to->asize += OUTBUF_BLOCK_SIZE;
555	to->text = XRESIZEVEC (uchar, to->text, to->asize);
556	outbuf = to->text + to->asize - outbytesleft;
557	}
558	}
559
560
/* These functions convert entire strings between character sets.
   They all have the signature

   bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);

   The input string FROM is converted as specified by the function
   name plus the iconv descriptor CD (which may be fake), and the
   result appended to TO.  On any error, false is returned, otherwise true.  */
569
570	/ These four use the custom conversion code above. /
571	static bool
572	convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
573	struct _cpp_strbuf *to)
574	{
575	return conversion_loop (one_conversion: one_utf8_to_utf16, cd, from, flen, to);
576	}
577
578	static bool
579	convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
580	struct _cpp_strbuf *to)
581	{
582	return conversion_loop (one_conversion: one_utf8_to_utf32, cd, from, flen, to);
583	}
584
585	static bool
586	convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
587	struct _cpp_strbuf *to)
588	{
589	return conversion_loop (one_conversion: one_utf16_to_utf8, cd, from, flen, to);
590	}
591
592	static bool
593	convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
594	struct _cpp_strbuf *to)
595	{
596	return conversion_loop (one_conversion: one_utf32_to_utf8, cd, from, flen, to);
597	}
598
599	/ Magic conversion which just counts characters from input, so*
600	only to->len is significant. /*
601	static bool
602	convert_count_chars (iconv_t cd, const uchar *from,
603	size_t flen, struct _cpp_strbuf *to)
604	{
605	return conversion_loop (one_conversion: one_count_chars, cd, from, flen, to);
606	}
607
608	/ Identity conversion, used when we have no alternative. /
609	static bool
610	convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
611	const uchar from, size_t flen, struct* _cpp_strbuf *to)
612	{
613	if (to->len + flen > to->asize)
614	{
615	to->asize = to->len + flen;
616	to->asize += to->asize / `4`;
617	to->text = XRESIZEVEC (uchar, to->text, to->asize);
618	}
619	memcpy (dest: to->text + to->len, src: from, n: flen);
620	to->len += flen;
621	return true;
622	}
623
/* And this one uses the system iconv primitive.  It's a little
   different, since iconv's interface is a little different.  */
#if HAVE_ICONV

/* Enlarge TO by OUTBUF_BLOCK_SIZE and re-point OUTBUF into the
   (possibly moved) storage.  Relies on the locals TO, OUTBUF and
   OUTBYTESLEFT of convert_using_iconv.  */
#define CONVERT_ICONV_GROW_BUFFER \
  do { \
      outbytesleft += OUTBUF_BLOCK_SIZE; \
      to->asize += OUTBUF_BLOCK_SIZE; \
      to->text = XRESIZEVEC (uchar, to->text, to->asize); \
      outbuf = (char *)to->text + to->asize - outbytesleft; \
  } while (0)

/* Convert the FLEN bytes at FROM with the iconv descriptor CD and
   append the result to TO, growing TO as needed.  Returns true on
   success with TO->len updated.  Returns false if CD cannot be reset,
   or if iconv fails for a reason other than E2BIG; errno is then as
   iconv left it.  */
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
                     struct _cpp_strbuf *to)
{
  ICONV_CONST char *inbuf;
  char *outbuf;
  size_t inbytesleft, outbytesleft;

  /* Reset conversion descriptor and check that it is valid.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
    return false;

  inbuf = (ICONV_CONST char *)from;
  inbytesleft = flen;
  outbuf = (char *)to->text + to->len;
  outbytesleft = to->asize - to->len;

  for (;;)
    {
      iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
      if (__builtin_expect (inbytesleft == 0, 1))
        {
          /* Close out any shift states, returning to the initial state.  */
          if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
            {
              if (errno != E2BIG)
                return false;

              /* One retry after growing the buffer; a second failure
                 is reported to the caller.  */
              CONVERT_ICONV_GROW_BUFFER;
              if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
                return false;
            }

          to->len = to->asize - outbytesleft;
          return true;
        }
      if (errno != E2BIG)
        return false;

      CONVERT_ICONV_GROW_BUFFER;
    }
}
#else
#define convert_using_iconv 0 /* prevent undefined symbol error below */
#endif
681
682	/ Arrange for the above custom conversion logic to be used automatically*
683	when conversion between a suitable pair of character sets is requested. /*
684
685	#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
686	CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
687
688	struct cpp_conversion
689	{
690	const char *pair;
691	convert_f func;
692	iconv_t fake_cd;
693	};
694	static const struct cpp_conversion conversion_tab[] = {
695	{ .pair: "UTF-8/UTF-32LE", .func: convert_utf8_utf32, .fake_cd: (iconv_t)`0` },
696	{ .pair: "UTF-8/UTF-32BE", .func: convert_utf8_utf32, .fake_cd: (iconv_t)`1` },
697	{ .pair: "UTF-8/UTF-16LE", .func: convert_utf8_utf16, .fake_cd: (iconv_t)`0` },
698	{ .pair: "UTF-8/UTF-16BE", .func: convert_utf8_utf16, .fake_cd: (iconv_t)`1` },
699	{ .pair: "UTF-32LE/UTF-8", .func: convert_utf32_utf8, .fake_cd: (iconv_t)`0` },
700	{ .pair: "UTF-32BE/UTF-8", .func: convert_utf32_utf8, .fake_cd: (iconv_t)`1` },
701	{ .pair: "UTF-16LE/UTF-8", .func: convert_utf16_utf8, .fake_cd: (iconv_t)`0` },
702	{ .pair: "UTF-16BE/UTF-8", .func: convert_utf16_utf8, .fake_cd: (iconv_t)`1` },
703	};
704
705	/ Subroutine of cpp_init_iconv: initialize and return a*
706	cset_converter structure for conversion from FROM to TO. If
707	iconv_open() fails, issue an error and return an identity
708	converter. Silently return an identity converter if FROM and TO
709	are identical.
710
711	PFILE is only used for generating diagnostics; setting it to NULL
712	suppresses diagnostics. /*
713
714	static struct cset_converter
715	init_iconv_desc (cpp_reader pfile, const* char to, const* char *from)
716	{
717	struct cset_converter ret;
718	char *pair;
719	size_t i;
720
721	ret.to = to;
722	ret.from = from;
723
724	if (!strcasecmp (s1: to, s2: from))
725	{
726	ret.func = convert_no_conversion;
727	ret.cd = (iconv_t) -`1`;
728	ret.width = -`1`;
729	return ret;
730	}
731
732	pair = (char *) alloca(strlen(to) + strlen(from) + `2`);
733
734	strcpy(dest: pair, src: from);
735	strcat(dest: pair, src: "/");
736	strcat(dest: pair, src: to);
737	for (i = `0`; i < ARRAY_SIZE (conversion_tab); i++)
738	if (!strcasecmp (s1: pair, s2: conversion_tab[i].pair))
739	{
740	ret.func = conversion_tab[i].func;
741	ret.cd = conversion_tab[i].fake_cd;
742	ret.width = -`1`;
743	return ret;
744	}
745
746	/ No custom converter - try iconv. /
747	if (HAVE_ICONV)
748	{
749	ret.func = convert_using_iconv;
750	ret.cd = iconv_open (to, from);
751	ret.width = -`1`;
752
753	if (ret.cd == (iconv_t) -`1`)
754	{
755	if (pfile)
756	{
757	if (errno == EINVAL)
758	cpp_error (pfile, CPP_DL_ERROR, / FIXME should be DL_SORRY /
759	msgid: "conversion from %s to %s not supported by iconv",
760	from, to);
761	else
762	cpp_errno (pfile, CPP_DL_ERROR, msgid: "iconv_open");
763	}
764	ret.func = convert_no_conversion;
765	}
766	}
767	else
768	{
769	if (pfile)
770	{
771	cpp_error (pfile, CPP_DL_ERROR, / FIXME: should be DL_SORRY /
772	msgid: "no iconv implementation, cannot convert from %s to %s",
773	from, to);
774	}
775	ret.func = convert_no_conversion;
776	ret.cd = (iconv_t) -`1`;
777	ret.width = -`1`;
778	}
779
780	return ret;
781	}
782
783	/ If charset conversion is requested, initialize iconv(3) descriptors*
784	for conversion from the source character set to the execution
785	character sets. If iconv is not present in the C library, and
786	conversion is requested, issue an error. /*
787
788	void
789	cpp_init_iconv (cpp_reader *pfile)
790	{
791	const char *ncset = CPP_OPTION (pfile, narrow_charset);
792	const char *wcset = CPP_OPTION (pfile, wide_charset);
793	const char *default_wcset;
794
795	bool be = CPP_OPTION (pfile, bytes_big_endian);
796
797	if (CPP_OPTION (pfile, wchar_precision) >= `32`)
798	default_wcset = be ? "UTF-32BE" : "UTF-32LE";
799	else if (CPP_OPTION (pfile, wchar_precision) >= `16`)
800	default_wcset = be ? "UTF-16BE" : "UTF-16LE";
801	else
802	/ This effectively means that wide strings are not supported,*
803	so don't do any conversion at all. /*
804	default_wcset = SOURCE_CHARSET;
805
806	if (!ncset)
807	ncset = SOURCE_CHARSET;
808	if (!wcset)
809	wcset = default_wcset;
810
811	pfile->narrow_cset_desc = init_iconv_desc (pfile, to: ncset, SOURCE_CHARSET);
812	pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
813	pfile->utf8_cset_desc = init_iconv_desc (pfile, to: "UTF-8", SOURCE_CHARSET);
814	pfile->utf8_cset_desc.width = CPP_OPTION (pfile, char_precision);
815	pfile->char16_cset_desc = init_iconv_desc (pfile,
816	to: be ? "UTF-16BE" : "UTF-16LE",
817	SOURCE_CHARSET);
818	pfile->char16_cset_desc.width = `16`;
819	pfile->char32_cset_desc = init_iconv_desc (pfile,
820	to: be ? "UTF-32BE" : "UTF-32LE",
821	SOURCE_CHARSET);
822	pfile->char32_cset_desc.width = `32`;
823	pfile->wide_cset_desc = init_iconv_desc (pfile, to: wcset, SOURCE_CHARSET);
824	pfile->wide_cset_desc.width = CPP_OPTION (pfile, wchar_precision);
825	}
826
827	/ Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. /
828	void
829	_cpp_destroy_iconv (cpp_reader *pfile)
830	{
831	if (HAVE_ICONV)
832	{
833	if (pfile->narrow_cset_desc.func == convert_using_iconv)
834	iconv_close (pfile->narrow_cset_desc.cd);
835	if (pfile->utf8_cset_desc.func == convert_using_iconv)
836	iconv_close (pfile->utf8_cset_desc.cd);
837	if (pfile->char16_cset_desc.func == convert_using_iconv)
838	iconv_close (pfile->char16_cset_desc.cd);
839	if (pfile->char32_cset_desc.func == convert_using_iconv)
840	iconv_close (pfile->char32_cset_desc.cd);
841	if (pfile->wide_cset_desc.func == convert_using_iconv)
842	iconv_close (pfile->wide_cset_desc.cd);
843	}
844	}
845
846	/ Utility routine for use by a full compiler. C is a character taken*
847	from the basic* source character set, encoded in the host's*
848	execution encoding. Convert it to (the target's) execution
849	encoding, and return that value.
850
851	Issues an internal error if C's representation in the narrow
852	execution character set fails to be a single-byte value (C99
853	5.2.1p3: "The representation of each member of the source and
854	execution character sets shall fit in a byte.") May also issue an
855	internal error if C fails to be a member of the basic source
856	character set (testing this exactly is too hard, especially when
857	the host character set is EBCDIC). /*
858	cppchar_t
859	cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
860	{
861	uchar sbuf[`1`];
862	struct _cpp_strbuf tbuf;
863
864	/ This test is merely an approximation, but it suffices to catch*
865	the most important thing, which is that we don't get handed a
866	character outside the unibyte range of the host character set. /*
867	if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
868	{
869	cpp_error (pfile, CPP_DL_ICE,
870	msgid: "character 0x%lx is not in the basic source character set\n",
871	(unsigned long)c);
872	return `0`;
873	}
874
875	/ Being a character in the unibyte range of the host character set,*
876	we can safely splat it into a one-byte buffer and trust that that
877	is a well-formed string. /*
878	sbuf[`0`] = c;
879
880	/ This should never need to reallocate, but just in case... /
881	tbuf.asize = `1`;
882	tbuf.text = XNEWVEC (uchar, tbuf.asize);
883	tbuf.len = `0`;
884
885	if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, `1`, &tbuf))
886	{
887	cpp_errno (pfile, CPP_DL_ICE, msgid: "converting to execution character set");
888	return `0`;
889	}
890	if (tbuf.len != `1`)
891	{
892	cpp_error (pfile, CPP_DL_ICE,
893	msgid: "character 0x%lx is not unibyte in execution character set",
894	(unsigned long)c);
895	return `0`;
896	}
897	c = tbuf.text[`0`];
898	free(ptr: tbuf.text);
899	return c;
900	}
901
902
903
904	/ cpp_substring_ranges's constructor. /
905
906	cpp_substring_ranges::cpp_substring_ranges () :
907	m_ranges (NULL),
908	m_num_ranges (`0`),
909	m_alloc_ranges (`8`)
910	{
911	m_ranges = XNEWVEC (source_range, m_alloc_ranges);
912	}
913
914	/ cpp_substring_ranges's destructor. /
915
916	cpp_substring_ranges::~cpp_substring_ranges ()
917	{
918	free (ptr: m_ranges);
919	}
920
921	/ Add RANGE to the vector of source_range information. /
922
923	void
924	cpp_substring_ranges::add_range (source_range range)
925	{
926	if (m_num_ranges >= m_alloc_ranges)
927	{
928	m_alloc_ranges *= `2`;
929	m_ranges
930	= (source_range *)xrealloc (m_ranges,
931	sizeof (source_range) * m_alloc_ranges);
932	}
933	m_ranges[m_num_ranges++] = range;
934	}
935
936	/ Read NUM ranges from LOC_READER, adding them to the vector of source_range*
937	information. /*
938
939	void
940	cpp_substring_ranges::add_n_ranges (int num,
941	cpp_string_location_reader &loc_reader)
942	{
943	for (int i = `0`; i < num; i++)
944	add_range (range: loc_reader.get_next ());
945	}
946
947
948
949	/ Utility routine that computes a mask of the form 0000...111... with*
950	WIDTH 1-bits. /*
951	static inline size_t
952	width_to_mask (size_t width)
953	{
954	width = MIN (width, BITS_PER_CPPCHAR_T);
955	if (width >= CHAR_BIT * sizeof (size_t))
956	return ~(size_t) `0`;
957	else
958	return ((size_t) `1` << width) - `1`;
959	}
960
/* A large table of unicode character information.  The enumerators
   below are single-bit flags, combined in ucnrange::flags.  */
enum {
  /* Valid in a C99 identifier?  */
  C99 = 1,
  /* Valid in a C99 identifier, but not as the first character?  */
  N99 = 2,
  /* Valid in a C++ identifier?  */
  CXX = 4,
  /* Valid in a C11/C++11 identifier?  */
  C11 = 8,
  /* Valid in a C11/C++11 identifier, but not as the first character?  */
  N11 = 16,
  /* Valid in a C++23 identifier?  */
  CXX23 = 32,
  /* Valid in a C++23 identifier, but not as the first character?  */
  NXX23 = 64,
  /* NFC representation is not valid in an identifier?  */
  CID = 128,
  /* Might be valid NFC form?  */
  NFC = 256,
  /* Might be valid NFKC form?  */
  NKC = 512,
  /* Certain preceding characters might make it not valid NFC/NFKC form?  */
  CTX = 1024
};
986
987	struct ucnrange {
988	/ Bitmap of flags above. /
989	unsigned short flags;
990	/ Combining class of the character. /
991	unsigned char combine;
992	/ Last character in the range described by this entry. /
993	unsigned int end;
994	};
995	#include "ucnid.h"
996
/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.  */
#define UCS_LIMIT 0x10FFFF
999
1000	#include "uname2c.h"
1001
/* Short names of the Hangul jamo, stored as three consecutive groups
   (L, V and T).  Every entry is at most three characters plus the
   terminating NUL.  */
static const char hangul_syllables[][4] = {
  /* L */
  "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "",
  "J", "JJ", "C", "K", "T", "P", "H",
  /* V */
  "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE",
  "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I",
  /* T */
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB",
  "LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C",
  "K", "T", "P", "H"
};

/* Number of entries in the L, V and T groups of hangul_syllables, in
   that order.  Only the first three elements are initialized explicitly;
   the remainder are zero.  */
static const short hangul_count[6] = { 19, 21, 28 };
1016
/* Used for Unicode loose matching rule UAX44-LM2 matching.  */

struct uname2c_data
{
  /* Where _cpp_uname2c writes the next part of the matched name as it
     appears in the tree keys.  */
  char *canon_name;
  /* Last character of the previously matched key; consulted when a key
     starts with a hyphen to decide whether that hyphen is medial.  */
  char prev_char;
};
1024
1025	/ Map NAME, a Unicode character name or correction/control/alternate*
1026	alias, to a Unicode codepoint, or return (cppchar_t) -1 if
1027	not found. This uses a space optimized radix tree precomputed
1028	by the makeuname2c utility, with binary format documented in its
1029	source makeuname2c.cc. /*
1030
1031	static cppchar_t
1032	_cpp_uname2c (const char name, size_t len, const* unsigned char *n,
1033	struct uname2c_data *data)
1034	{
1035	do
1036	{
1037	char k;
1038	const char *key;
1039	size_t key_len, len_adj;
1040	bool has_value = *n & `0x40`;
1041	bool has_children, no_sibling = false;
1042	cppchar_t codepoint = -`1`;
1043	const unsigned char *child = NULL;
1044	int ret;
1045
1046	if (*n & `0x80`)
1047	{
1048	k = `' '` + (*n++ & `0x3f`);
1049	key = &k;
1050	key_len = `1`;
1051	}
1052	else
1053	{
1054	key_len = *n++ & `0x3f`;
1055	key = &uname2c_dict[*n++];
1056	key += (*n++ << `8`);
1057	}
1058	if (has_value)
1059	{
1060	codepoint = *n + (n[`1`] << `8`) + ((n[`2`] & `0x1f`) << `16`);
1061	has_children = n[`2`] & `0x80`;
1062	no_sibling = n[`2`] & `0x40`;
1063	n += `3`;
1064	}
1065	else
1066	has_children = true;
1067	if (has_children)
1068	{
1069	unsigned int shift = `0`;
1070	size_t child_off = `0`;
1071
1072	do
1073	{
1074	child_off \|= (*n & `0x7f`) << shift;
1075	shift += `7`;
1076	}
1077	while ((*n++ & `0x80`) != `0`);
1078	child = n + child_off;
1079	}
1080	if (__builtin_expect (data == NULL, `1`))
1081	{
1082	ret = memcmp (s1: name, s2: key, n: len > key_len ? key_len : len);
1083	len_adj = key_len;
1084	}
1085	else
1086	{
1087	const char p = name, q = key;
1088
1089	while (`1`)
1090	{
1091	if ((size_t) (p - name) == len \|\| (size_t) (q - key) == key_len)
1092	break;
1093	if (*q == `' '`)
1094	{
1095	++q;
1096	continue;
1097	}
1098	if (*q == `'-'`)
1099	{
1100	/ This is the hard case. Only medial hyphens*
1101	should be removed, where medial means preceded
1102	and followed by alnum. /*
1103	if (ISALNUM (q == key ? data->prev_char : q[-`1`]))
1104	{
1105	if (q + `1` == key + key_len)
1106	{
1107	/ We don't know what the next letter will be.*
1108	It could be ISALNUM, then we are supposed
1109	to omit it, or it could be a space and then
1110	we should not omit it and need to compare it.
1111	Fortunately the only 3 names with hyphen
1112	followed by non-letter are
1113	U+0F0A TIBETAN MARK BKA- SHOG YIG MGO
1114	U+0FD0 TIBETAN MARK BKA- SHOG GI MGO RGYAN
1115	U+0FD0 TIBETAN MARK BSKA- SHOG GI MGO RGYAN
1116	Furthermore, prefixes of NR2 generated
1117	ranges all end with a hyphen, but the generated
1118	part is then followed by alpha-numeric.
1119	So, let's just assume that - at the end of
1120	key is always followed by alphanumeric and
1121	so should be omitted.
1122	makeuname2c.cc verifies that this is true. /*
1123	++q;
1124	continue;
1125	}
1126	else if (ISALNUM (q[`1`]))
1127	{
1128	++q;
1129	continue;
1130	}
1131	}
1132	}
1133	if (p != q)
1134	break;
1135	++p;
1136	++q;
1137	}
1138	len_adj = p - name;
1139	/ If we don't consume the whole key, signal a mismatch,*
1140	but always with ret = 1, so that we keep looking through
1141	siblings. /*
1142	ret = q < key + key_len;
1143	}
1144	if (ret < `0`)
1145	return -`1`;
1146	else if (ret == `0`)
1147	{
1148	if (len < len_adj)
1149	return -`1`;
1150	else if (codepoint >= `0xd800`
1151	&& codepoint < `0xd800` + ARRAY_SIZE (uname2c_generated))
1152	{
1153	name += len_adj;
1154	len -= len_adj;
1155	if (codepoint == `0xd800`)
1156	{
1157	/ NR1 - Hangul syllables. /
1158	size_t start = `0`, end, i, j;
1159	int this_len, max_len;
1160	char winner[`3`];
1161
1162	for (i = `0`; i < `3`; ++i)
1163	{
1164	end = start + hangul_count[i];
1165	max_len = -`1`;
1166	winner[i] = -`1`;
1167	for (j = start; j < end; j++)
1168	{
1169	this_len = strlen (s: hangul_syllables[j]);
1170	if (len >= (size_t) this_len
1171	&& this_len > max_len
1172	&& memcmp (s1: name, s2: hangul_syllables[j],
1173	n: this_len) == `0`)
1174	{
1175	max_len = this_len;
1176	winner[i] = j - start;
1177	}
1178	}
1179	if (max_len == -`1`)
1180	return -`1`;
1181	name += max_len;
1182	len -= max_len;
1183	start = end;
1184	}
1185	if (__builtin_expect (data != NULL, `0`))
1186	{
1187	memcpy (dest: data->canon_name, src: key, n: key_len);
1188	data->canon_name[key_len] = `'\0'`;
1189	for (i = `0`, start = `0`; i < `3`; ++i)
1190	{
1191	strcat (dest: data->canon_name,
1192	src: hangul_syllables[start + winner[i]]);
1193	start += hangul_count[i];
1194	}
1195	}
1196	return (`0xac00` + `21` * `28` * winner[`0`]
1197	+ `28` * winner[`1`] + winner[`2`]);
1198	}
1199	else
1200	{
1201	/ NR2 - prefix followed by hexadecimal codepoint. /
1202	const cppchar_t *p;
1203	size_t i;
1204
1205	if (len < `4` \|\| len > `5`)
1206	return -`1`;
1207	p = uname2c_pairs + uname2c_generated[codepoint - `0xd800`];
1208	codepoint = `0`;
1209	for (i = `0`; i < len; ++i)
1210	{
1211	codepoint <<= `4`;
1212	if (!ISXDIGIT (name[i]))
1213	return -`1`;
1214	codepoint += hex_value (name[i]);
1215	}
1216	for (; *p; p += `2`)
1217	if (codepoint < *p)
1218	return -`1`;
1219	else if (codepoint <= p[`1`])
1220	{
1221	if (__builtin_expect (data != NULL, `0`))
1222	{
1223	memcpy (dest: data->canon_name, src: key, n: key_len);
1224	memcpy (dest: data->canon_name + key_len, src: name, n: len);
1225	data->canon_name[key_len + len] = `'\0'`;
1226	}
1227	return codepoint;
1228	}
1229	return -`1`;
1230	}
1231	}
1232	else if (__builtin_expect (data != NULL, `0`))
1233	{
1234	if (len == len_adj)
1235	{
1236	memcpy (dest: data->canon_name, src: key, n: key_len);
1237	data->canon_name[key_len] = `'\0'`;
1238	return codepoint;
1239	}
1240	if (has_children)
1241	{
1242	struct uname2c_data save = *data;
1243	memcpy (dest: data->canon_name, src: key, n: key_len);
1244	data->canon_name += key_len;
1245	data->prev_char = key[key_len - `1`];
1246	codepoint = _cpp_uname2c (name: name + len_adj, len: len - len_adj,
1247	n: child, data);
1248	if (codepoint != (cppchar_t) -`1`)
1249	return codepoint;
1250	*data = save;
1251	}
1252	}
1253	else if (len == len_adj)
1254	return codepoint;
1255	else if (!has_children)
1256	return -`1`;
1257	else
1258	{
1259	name += len_adj;
1260	len -= len_adj;
1261	n = child;
1262	continue;
1263	}
1264	}
1265	if (no_sibling \|\| (!has_value && *n == `0xff`))
1266	break;
1267	}
1268	while (`1`);
1269	return -`1`;
1270	}
1271
1272	/ Try to do a loose name lookup according to Unicode loose matching rule*
1273	UAX44-LM2. First ignore medial hyphens, whitespace, underscore
1274	characters and convert to upper case. /*
1275
1276	static cppchar_t
1277	_cpp_uname2c_uax44_lm2 (const char name, size_t len, char* *canon_name)
1278	{
1279	char name_after_uax44_lm2[uname2c_max_name_len];
1280	char *q = name_after_uax44_lm2;
1281	const char *p;
1282
1283	for (p = name; p < name + len; p++)
1284	if (p == `'_'` \|\| p == `' '`)
1285	continue;
1286	else if (*p == `'-'` && p != name && ISALNUM (p[-`1`]) && ISALNUM (p[`1`]))
1287	continue;
1288	else if (q == name_after_uax44_lm2 + uname2c_max_name_len)
1289	return -`1`;
1290	else if (ISLOWER (*p))
1291	q++ = TOUPPER (p);
1292	else
1293	q++ = p;
1294
1295	struct uname2c_data data;
1296	data.canon_name = canon_name;
1297	data.prev_char = `' '`;
1298	/ Hangul Jungseong O- E after UAX44-LM2 should be HANGULJUNGSEONGO-E*
1299	and so should match U+1180. /*
1300	if (q - name_after_uax44_lm2 == sizeof ("HANGULJUNGSEONGO-E") - `1`
1301	&& memcmp (s1: name_after_uax44_lm2, s2: "HANGULJUNGSEONGO-E",
1302	n: sizeof ("HANGULJUNGSEONGO-E") - `1`) == `0`)
1303	{
1304	name_after_uax44_lm2[sizeof ("HANGULJUNGSEONGO") - `1`] = `'E'`;
1305	--q;
1306	}
1307	cppchar_t result
1308	= _cpp_uname2c (name: name_after_uax44_lm2, len: q - name_after_uax44_lm2,
1309	n: uname2c_tree, data: &data);
1310
1311	/ Unicode UAX44-LM2 exception:*
1312	U+116C HANGUL JUNGSEONG OE
1313	U+1180 HANGUL JUNGSEONG O-E
1314	We remove all medial hyphens when we shouldn't remote the U+1180 one.
1315	The U+1180 entry sorts before U+116C lexicographilly, so we get U+1180
1316	in both cases. Thus, if result is U+1180, check if user's name doesn't
1317	have a hyphen there and adjust. /*
1318	if (result == `0x1180`)
1319	{
1320	while (p[-`1`] == `' '` \|\| p[-`1`] == `'_'`)
1321	--p;
1322	gcc_assert (TOUPPER (p[-`1`]) == `'E'`);
1323	--p;
1324	while (p[-`1`] == `' '` \|\| p[-`1`] == `'_'`)
1325	--p;
1326	if (p[-`1`] != `'-'`)
1327	{
1328	result = `0x116c`;
1329	memcpy (dest: canon_name + sizeof ("HANGUL JUNGSEONG O") - `1`, src: "E", n: `2`);
1330	}
1331	}
1332	return result;
1333	}
1334
1335	/ Returns flags representing the XID properties of the given codepoint. /
1336	unsigned int
1337	cpp_check_xid_property (cppchar_t c)
1338	{
1339	// fast path for ASCII
1340	if (c < `0x80`)
1341	{
1342	if ((`'A'` <= c && c <= `'Z'`) \|\| (`'a'` <= c && c <= `'z'`))
1343	return CPP_XID_START \| CPP_XID_CONTINUE;
1344	if ((`'0'` <= c && c <= `'9'`) \|\| c == `'_'`)
1345	return CPP_XID_CONTINUE;
1346	}
1347
1348	if (c > UCS_LIMIT)
1349	return `0`;
1350
1351	int mn, mx, md;
1352	mn = `0`;
1353	mx = ARRAY_SIZE (ucnranges) - `1`;
1354	while (mx != mn)
1355	{
1356	md = (mn + mx) / `2`;
1357	if (c <= ucnranges[md].end)
1358	mx = md;
1359	else
1360	mn = md + `1`;
1361	}
1362
1363	unsigned short flags = ucnranges[mn].flags;
1364
1365	if (flags & CXX23)
1366	return CPP_XID_START \| CPP_XID_CONTINUE;
1367	if (flags & NXX23)
1368	return CPP_XID_CONTINUE;
1369	return `0`;
1370	}
1371
1372	/ Returns 1 if C is valid in an identifier, 2 if C is valid except at*
1373	the start of an identifier, and 0 if C is not valid in an
1374	identifier. We assume C has already gone through the checks of
1375	_cpp_valid_ucn. Also update NST for C if returning nonzero. The
1376	algorithm is a simple binary search on the table defined in
1377	ucnid.h. /*
1378
1379	static int
1380	ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
1381	struct normalize_state *nst)
1382	{
1383	int mn, mx, md;
1384	unsigned short valid_flags, invalid_start_flags;
1385
1386	if (c > UCS_LIMIT)
1387	return `0`;
1388
1389	mn = `0`;
1390	mx = ARRAY_SIZE (ucnranges) - `1`;
1391	while (mx != mn)
1392	{
1393	md = (mn + mx) / `2`;
1394	if (c <= ucnranges[md].end)
1395	mx = md;
1396	else
1397	mn = md + `1`;
1398	}
1399
1400	/ When -pedantic, we require the character to have been listed by*
1401	the standard for the current language. Otherwise, we accept the
1402	union of the acceptable sets for all supported language versions. /*
1403	valid_flags = C99 \| CXX \| C11 \| CXX23;
1404	if (CPP_PEDANTIC (pfile))
1405	{
1406	if (CPP_OPTION (pfile, xid_identifiers))
1407	valid_flags = CXX23;
1408	else if (CPP_OPTION (pfile, c11_identifiers))
1409	valid_flags = C11;
1410	else if (CPP_OPTION (pfile, c99))
1411	valid_flags = C99;
1412	}
1413	if (! (ucnranges[mn].flags & valid_flags))
1414	return `0`;
1415
1416	/ Update NST. /
1417	if (ucnranges[mn].combine != `0` && ucnranges[mn].combine < nst->prev_class)
1418	nst->level = normalized_none;
1419	else if (ucnranges[mn].flags & CTX)
1420	{
1421	bool safe;
1422	cppchar_t p = nst->previous;
1423
1424	/ For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,*
1425	and are combined algorithmically from a sequence of the form
1426	1100-1112 1161-1175 11A8-11C2
1427	(if the third is not present, it is treated as 11A7, which is not
1428	really a valid character).
1429	Unfortunately, C99 allows (only) the NFC form, but C++ allows
1430	only the combining characters. /*
1431	if (c >= `0x1161` && c <= `0x1175`)
1432	safe = p < `0x1100` \|\| p > `0x1112`;
1433	else if (c >= `0x11A8` && c <= `0x11C2`)
1434	safe = (p < `0xAC00` \|\| p > `0xD7A3` \|\| (p - `0xAC00`) % `28` != `0`);
1435	else
1436	safe = check_nfc (pfile, c, p);
1437	if (!safe)
1438	{
1439	if ((c >= `0x1161` && c <= `0x1175`) \|\| (c >= `0x11A8` && c <= `0x11C2`))
1440	nst->level = MAX (nst->level, normalized_identifier_C);
1441	else
1442	nst->level = normalized_none;
1443	}
1444	}
1445	else if (ucnranges[mn].flags & NKC)
1446	;
1447	else if (ucnranges[mn].flags & NFC)
1448	nst->level = MAX (nst->level, normalized_C);
1449	else if (ucnranges[mn].flags & CID)
1450	nst->level = MAX (nst->level, normalized_identifier_C);
1451	else
1452	nst->level = normalized_none;
1453	if (ucnranges[mn].combine == `0`)
1454	nst->previous = c;
1455	nst->prev_class = ucnranges[mn].combine;
1456
1457	if (!CPP_PEDANTIC (pfile))
1458	{
1459	/ If not -pedantic, accept as character that may*
1460	begin an identifier a union of characters allowed
1461	at that position in each of the character sets. /*
1462	if ((ucnranges[mn].flags & (C99 \| N99)) == C99
1463	\|\| (ucnranges[mn].flags & CXX) != `0`
1464	\|\| (ucnranges[mn].flags & (C11 \| N11)) == C11
1465	\|\| (ucnranges[mn].flags & (CXX23 \| NXX23)) == CXX23)
1466	return `1`;
1467	return `2`;
1468	}
1469
1470	if (CPP_OPTION (pfile, xid_identifiers))
1471	invalid_start_flags = NXX23;
1472	else if (CPP_OPTION (pfile, c11_identifiers))
1473	invalid_start_flags = N11;
1474	else if (CPP_OPTION (pfile, c99))
1475	invalid_start_flags = N99;
1476	else
1477	invalid_start_flags = `0`;
1478
1479	/ In C99, UCN digits may not begin identifiers. In C11 and C++11,*
1480	UCN combining characters may not begin identifiers. /*
1481	if (ucnranges[mn].flags & invalid_start_flags)
1482	return `2`;
1483
1484	return `1`;
1485	}
1486
1487	/ Increment char_range->m_finish by a single character. /
1488
1489	static void
1490	extend_char_range (source_range *char_range,
1491	cpp_string_location_reader *loc_reader)
1492	{
1493	if (loc_reader)
1494	{
1495	gcc_assert (char_range);
1496	char_range->m_finish = loc_reader->get_next ().m_finish;
1497	}
1498	}
1499
1500	/ [lex.charset]: The character designated by the universal character*
1501	name \UNNNNNNNN is that character whose character short name in
1502	ISO/IEC 10646 is NNNNNNNN; the character designated by the
1503	universal character name \uNNNN is that character whose character
1504	short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
1505	for a universal character name corresponds to a surrogate code point
1506	(in the range 0xD800-0xDFFF, inclusive), the program is ill-formed.
1507	Additionally, if the hexadecimal value for a universal-character-name
1508	outside a character or string literal corresponds to a control character
1509	(in either of the ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a
1510	character in the basic source character set, the program is ill-formed.
1511
1512	C99 6.4.3: A universal character name shall not specify a character
1513	whose short identifier is less than 00A0 other than 0024 ($), 0040 (@),
1514	or 0060 (`), nor one in the range D800 through DFFF inclusive.
1515
1516	If the hexadecimal value is larger than the upper bound of the UCS
1517	codespace specified in ISO/IEC 10646, a pedantic warning is issued
1518	in all versions of C and in the C++20 or later versions of C++.
1519
1520	*PSTR must be preceded by "\u" or "\U"; it is assumed that the
1521	buffer end is delimited by a non-hex digit. Returns false if the
1522	UCN has not been consumed, true otherwise.
1523
1524	The value of the UCN, whether valid or invalid, is returned in CP.*
1525	Diagnostics are emitted for invalid values. PSTR is updated to point
1526	one beyond the UCN, or to the syntactically invalid character.
1527
1528	IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
1529	an identifier, or 2 otherwise.
1530
1531	If LOC_READER is non-NULL, then position information is
1532	read from LOC_READER and CHAR_RANGE->m_finish is updated accordingly. /
1533
1534	bool
1535	_cpp_valid_ucn (cpp_reader pfile, const* uchar **pstr,
1536	const uchar limit, int* identifier_pos,
1537	struct normalize_state nst, cppchar_t cp,
1538	source_range *char_range,
1539	cpp_string_location_reader *loc_reader)
1540	{
1541	cppchar_t result, c;
1542	unsigned int length;
1543	const uchar str = pstr;
1544	const uchar *base = str - `2`;
1545	bool delimited = false, named = false;
1546
1547	if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
1548	cpp_error (pfile, CPP_DL_WARNING,
1549	msgid: "universal character names are only valid in C++ and C99");
1550	else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > `0`
1551	&& !CPP_OPTION (pfile, cplusplus))
1552	cpp_error (pfile, CPP_DL_WARNING,
1553	msgid: "C99's universal character names are incompatible with C90");
1554	else if (CPP_WTRADITIONAL (pfile) && identifier_pos == `0`)
1555	cpp_warning (pfile, CPP_W_TRADITIONAL,
1556	msgid: "the meaning of '\\%c' is different in traditional C",
1557	(int) str[-`1`]);
1558
1559	result = `0`;
1560	if (str[-`1`] == `'u'`)
1561	{
1562	length = `4`;
1563	if (str < limit
1564	&& *str == `'{'`
1565	&& (!identifier_pos
1566	\|\| CPP_OPTION (pfile, delimited_escape_seqs)
1567	\|\| !CPP_OPTION (pfile, std)))
1568	{
1569	str++;
1570	/ Magic value to indicate no digits seen. /
1571	length = `32`;
1572	delimited = true;
1573	extend_char_range (char_range, loc_reader);
1574	}
1575	}
1576	else if (str[-`1`] == `'U'`)
1577	length = `8`;
1578	else if (str[-`1`] == `'N'`)
1579	{
1580	length = `4`;
1581	if (identifier_pos
1582	&& !CPP_OPTION (pfile, delimited_escape_seqs)
1583	&& CPP_OPTION (pfile, std))
1584	{
1585	*cp = `0`;
1586	return false;
1587	}
1588	if (str == limit \|\| *str != `'{'`)
1589	{
1590	if (identifier_pos)
1591	{
1592	*cp = `0`;
1593	return false;
1594	}
1595	cpp_error (pfile, CPP_DL_ERROR, msgid: "'\\N' not followed by '{'");
1596	}
1597	else
1598	{
1599	str++;
1600	named = true;
1601	extend_char_range (char_range, loc_reader);
1602	length = `0`;
1603	const uchar *name = str;
1604	bool strict = true;
1605
1606	do
1607	{
1608	if (str == limit)
1609	break;
1610	c = *str;
1611	if (!ISIDNUM (c) && c != `' '` && c != `'-'`)
1612	break;
1613	if (ISLOWER (c) \|\| c == `'_'`)
1614	strict = false;
1615	str++;
1616	extend_char_range (char_range, loc_reader);
1617	}
1618	while (`1`);
1619
1620	if (str < limit && *str == `'}'`)
1621	{
1622	if (identifier_pos && name == str)
1623	{
1624	cpp_warning (pfile, CPP_W_UNICODE,
1625	msgid: "empty named universal character escape "
1626	"sequence; treating it as separate tokens");
1627	*cp = `0`;
1628	return false;
1629	}
1630	if (name == str)
1631	cpp_error (pfile, CPP_DL_ERROR,
1632	msgid: "empty named universal character escape sequence");
1633	else if ((!identifier_pos \|\| strict)
1634	&& !CPP_OPTION (pfile, delimited_escape_seqs)
1635	&& CPP_OPTION (pfile, cpp_pedantic))
1636	cpp_error (pfile, CPP_DL_PEDWARN,
1637	msgid: "named universal character escapes are only valid "
1638	"in C++23");
1639	if (name == str)
1640	result = `0x40`;
1641	else
1642	{
1643	/ If the name is longer than maximum length of a Unicode*
1644	name, it can't be strictly valid. /*
1645	if ((size_t) (str - name) > uname2c_max_name_len \|\| !strict)
1646	result = -`1`;
1647	else
1648	result = _cpp_uname2c (name: (const char *) name, len: str - name,
1649	n: uname2c_tree, NULL);
1650	if (result == (cppchar_t) -`1`)
1651	{
1652	bool ret = true;
1653	if (identifier_pos
1654	&& (!CPP_OPTION (pfile, delimited_escape_seqs)
1655	\|\| !strict))
1656	ret = cpp_warning (pfile, CPP_W_UNICODE,
1657	msgid: "\\N{%.*s} is not a valid "
1658	"universal character; treating it "
1659	"as separate tokens",
1660	(int) (str - name), name);
1661	else
1662	cpp_error (pfile, CPP_DL_ERROR,
1663	msgid: "\\N{%.*s} is not a valid universal "
1664	"character", (int) (str - name), name);
1665
1666	/ Try to do a loose name lookup according to*
1667	Unicode loose matching rule UAX44-LM2. /*
1668	char canon_name[uname2c_max_name_len + `1`];
1669	result = _cpp_uname2c_uax44_lm2 (name: (const char *) name,
1670	len: str - name, canon_name);
1671	if (result != (cppchar_t) -`1` && ret)
1672	cpp_error (pfile, CPP_DL_NOTE,
1673	msgid: "did you mean \\N{%s}?", canon_name);
1674	else
1675	result = `0xC0`;
1676	if (identifier_pos
1677	&& (!CPP_OPTION (pfile, delimited_escape_seqs)
1678	\|\| !strict))
1679	{
1680	*cp = `0`;
1681	return false;
1682	}
1683	}
1684	}
1685	str++;
1686	extend_char_range (char_range, loc_reader);
1687	}
1688	else if (identifier_pos)
1689	{
1690	cpp_warning (pfile, CPP_W_UNICODE,
1691	msgid: "'\\N{' not terminated with '}' after %.*s; "
1692	"treating it as separate tokens",
1693	(int) (str - base), base);
1694	*cp = `0`;
1695	return false;
1696	}
1697	else
1698	{
1699	cpp_error (pfile, CPP_DL_ERROR,
1700	msgid: "'\\N{' not terminated with '}' after %.*s",
1701	(int) (str - base), base);
1702	result = `1`;
1703	}
1704	}
1705	}
1706	else
1707	{
1708	cpp_error (pfile, CPP_DL_ICE, msgid: "In _cpp_valid_ucn but not a UCN");
1709	length = `4`;
1710	}
1711
1712	if (!named)
1713	do
1714	{
1715	if (str == limit)
1716	break;
1717	c = *str;
1718	if (!ISXDIGIT (c))
1719	break;
1720	str++;
1721	extend_char_range (char_range, loc_reader);
1722	if (delimited)
1723	{
1724	if (!result)
1725	/ Accept arbitrary number of leading zeros.*
1726	16 is another magic value, smaller than 32 above
1727	and bigger than 8, so that upon encountering first
1728	non-zero digit we can count 8 digits and after that
1729	or in overflow bit and ensure length doesn't decrease
1730	to 0, as delimited escape sequence doesn't have upper
1731	bound on the number of hex digits. /*
1732	length = `16`;
1733	else if (length == `16` - `8`)
1734	{
1735	/ Make sure we detect overflows. /
1736	result \|= `0x8000000`;
1737	++length;
1738	}
1739	}
1740
1741	result = (result << `4`) + hex_value (c);
1742	}
1743	while (--length);
1744
1745	if (delimited && str < limit && *str == `'}'`)
1746	{
1747	if (length == `32` && identifier_pos)
1748	{
1749	cpp_warning (pfile, CPP_W_UNICODE,
1750	msgid: "empty delimited escape sequence; "
1751	"treating it as separate tokens");
1752	*cp = `0`;
1753	return false;
1754	}
1755	else if (length == `32`)
1756	cpp_error (pfile, CPP_DL_ERROR,
1757	msgid: "empty delimited escape sequence");
1758	else if (!CPP_OPTION (pfile, delimited_escape_seqs)
1759	&& CPP_OPTION (pfile, cpp_pedantic))
1760	cpp_error (pfile, CPP_DL_PEDWARN,
1761	msgid: "delimited escape sequences are only valid in C++23");
1762	str++;
1763	length = `0`;
1764	delimited = false;
1765	extend_char_range (char_range, loc_reader);
1766	}
1767
1768	/ Partial UCNs are not valid in strings, but decompose into*
1769	multiple tokens in identifiers, so we can't give a helpful
1770	error message in that case. /*
1771	if (length && identifier_pos)
1772	{
1773	if (delimited)
1774	cpp_warning (pfile, CPP_W_UNICODE,
1775	msgid: "'\\u{' not terminated with '}' after %.*s; "
1776	"treating it as separate tokens",
1777	(int) (str - base), base);
1778	*cp = `0`;
1779	return false;
1780	}
1781
1782	*pstr = str;
1783	if (length)
1784	{
1785	if (!delimited)
1786	cpp_error (pfile, CPP_DL_ERROR,
1787	msgid: "incomplete universal character name %.*s",
1788	(int) (str - base), base);
1789	else
1790	cpp_error (pfile, CPP_DL_ERROR,
1791	msgid: "'\\u{' not terminated with '}' after %.*s",
1792	(int) (str - base), base);
1793	result = `1`;
1794	}
1795	/ The C99 standard permits $, @ and ` to be specified as UCNs. We use*
1796	hex escapes so that this also works with EBCDIC hosts.
1797	C++0x permits everything below 0xa0 within literals;
1798	ucn_valid_in_identifier will complain about identifiers. /*
1799	else if ((result < `0xa0`
1800	&& !CPP_OPTION (pfile, cplusplus)
1801	&& (result != `0x24` && result != `0x40` && result != `0x60`))
1802	\|\| (result & `0x80000000`)
1803	\|\| (result >= `0xD800` && result <= `0xDFFF`))
1804	{
1805	cpp_error (pfile, CPP_DL_ERROR,
1806	msgid: "%.*s is not a valid universal character",
1807	(int) (str - base), base);
1808	result = `1`;
1809	}
1810	else if (identifier_pos && result == `0x24`
1811	&& CPP_OPTION (pfile, dollars_in_ident))
1812	{
1813	if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1814	{
1815	CPP_OPTION (pfile, warn_dollars) = `0`;
1816	cpp_error (pfile, CPP_DL_PEDWARN, msgid: "'$' in identifier or number");
1817	}
1818	NORMALIZE_STATE_UPDATE_IDNUM (nst, result);
1819	}
1820	else if (identifier_pos)
1821	{
1822	int validity = ucn_valid_in_identifier (pfile, c: result, nst);
1823
1824	if (validity == `0`)
1825	cpp_error (pfile, CPP_DL_ERROR,
1826	msgid: "universal character %.*s is not valid in an identifier",
1827	(int) (str - base), base);
1828	else if (validity == `2` && identifier_pos == `1`)
1829	cpp_error (pfile, CPP_DL_ERROR,
1830	msgid: "universal character %.*s is not valid at the start of an identifier",
1831	(int) (str - base), base);
1832	}
1833	else if (result > UCS_LIMIT
1834	&& (!CPP_OPTION (pfile, cplusplus)
1835	\|\| CPP_OPTION (pfile, lang) > CLK_CXX17))
1836	cpp_error (pfile, CPP_DL_PEDWARN,
1837	msgid: "%.*s is outside the UCS codespace",
1838	(int) (str - base), base);
1839
1840	*cp = result;
1841	return true;
1842	}
1843
1844	/ Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate*
1845	it to the execution character set and write the result into TBUF,
1846	if TBUF is non-NULL.
1847	An advanced pointer is returned. Issues all relevant diagnostics.
1848	If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
1849	contains the location of the character so far: location information
1850	is read from LOC_READER, and RANGES is updated accordingly. /*
1851	static const uchar *
1852	convert_ucn (cpp_reader pfile, const* uchar from, const* uchar *limit,
1853	struct _cpp_strbuf tbuf, struct* cset_converter cvt,
1854	source_range char_range,
1855	cpp_string_location_reader *loc_reader,
1856	cpp_substring_ranges *ranges)
1857	{
1858	cppchar_t ucn;
1859	uchar buf[`6`];
1860	uchar *bufp = buf;
1861	size_t bytesleft = `6`;
1862	int rval;
1863	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1864
1865	/ loc_reader and ranges must either be both NULL, or both be non-NULL. /
1866	gcc_assert ((loc_reader != NULL) == (ranges != NULL));
1867
1868	from++; / Skip u/U/N. /
1869
1870	/ The u/U is part of the spelling of this character. /
1871	extend_char_range (char_range: &char_range, loc_reader);
1872
1873	_cpp_valid_ucn (pfile, pstr: &from, limit, identifier_pos: `0`, nst: &nst,
1874	cp: &ucn, char_range: &char_range, loc_reader);
1875
1876	rval = one_cppchar_to_utf8 (c: ucn, outbufp: &bufp, outbytesleftp: &bytesleft);
1877	if (rval)
1878	{
1879	errno = rval;
1880	cpp_errno (pfile, CPP_DL_ERROR,
1881	msgid: "converting UCN to source character set");
1882	}
1883	else
1884	{
1885	if (tbuf)
1886	if (!APPLY_CONVERSION (cvt, buf, `6` - bytesleft, tbuf))
1887	cpp_errno (pfile, CPP_DL_ERROR,
1888	msgid: "converting UCN to execution character set");
1889
1890	if (loc_reader)
1891	{
1892	int num_encoded_bytes = `6` - bytesleft;
1893	for (int i = `0`; i < num_encoded_bytes; i++)
1894	ranges->add_range (range: char_range);
1895	}
1896	}
1897
1898	return from;
1899	}
1900
1901	/ Performs a similar task as _cpp_valid_ucn, but parses UTF-8-encoded*
1902	extended characters rather than UCNs. If the return value is TRUE, then a
1903	character was successfully decoded and stored in CP; PSTR has been
1904	updated to point one past the valid UTF-8 sequence. Diagnostics may have
1905	been emitted if the character parsed is not allowed in the current context.
1906	If the return value is FALSE, then PSTR has not been modified and CP may
1907	equal 0, to indicate that PSTR does not form a valid UTF-8 sequence, or it*
1908	may, when processing an identifier in C mode, equal a codepoint that was
1909	validly encoded but is not allowed to appear in an identifier. In either
1910	case, no diagnostic is emitted, and the return value of FALSE should cause
1911	a new token to be formed.
1912
1913	_cpp_valid_utf8 can be called when lexing a potential identifier, or a
1914	CPP_OTHER token or for the purposes of -Winvalid-utf8 warning in string or
1915	character literals. NST is unused when not in a potential identifier.
1916
1917	As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for
1918	the start of an identifier, or 2 otherwise. /*
1919
1920	extern bool
1921	_cpp_valid_utf8 (cpp_reader *pfile,
1922	const uchar **pstr,
1923	const uchar *limit,
1924	int identifier_pos,
1925	struct normalize_state *nst,
1926	cppchar_t *cp)
1927	{
1928	const uchar base = pstr;
1929	size_t inbytesleft = limit - base;
1930	if (one_utf8_to_cppchar (inbufp: pstr, inbytesleftp: &inbytesleft, cp))
1931	{
1932	/ No diagnostic here as this byte will rather become a*
1933	new token. /*
1934	*cp = `0`;
1935	return false;
1936	}
1937
1938	if (identifier_pos)
1939	{
1940	switch (ucn_valid_in_identifier (pfile, c: *cp, nst))
1941	{
1942
1943	case `0`:
1944	/ In C++, this is an error for invalid character in an identifier*
1945	because logically, the UTF-8 was converted to a UCN during
1946	translation phase 1 (even though we don't physically do it that
1947	way). In C, this byte rather becomes grammatically a separate
1948	token. /*
1949
1950	if (CPP_OPTION (pfile, cplusplus))
1951	cpp_error (pfile, CPP_DL_ERROR,
1952	msgid: "extended character %.*s is not valid in an identifier",
1953	(int) (*pstr - base), base);
1954	else
1955	{
1956	*pstr = base;
1957	return false;
1958	}
1959
1960	break;
1961
1962	case `2`:
1963	if (identifier_pos == `1`)
1964	{
1965	/ This is treated the same way in C++ or C99 -- lexed as an*
1966	identifier which is then invalid because an identifier is
1967	not allowed to start with this character. /*
1968	cpp_error (pfile, CPP_DL_ERROR,
1969	msgid: "extended character %.*s is not valid at the start of an identifier",
1970	(int) (*pstr - base), base);
1971	}
1972	break;
1973	}
1974	}
1975
1976	return true;
1977	}
1978
1979	/ Return true iff BUFFER of size NUM_BYTES is validly-encoded UTF-8. /
1980
1981	extern bool
1982	cpp_valid_utf8_p (const char *buffer, size_t num_bytes)
1983	{
1984	const uchar iter = (const* uchar *)buffer;
1985	size_t bytesleft = num_bytes;
1986	while (bytesleft > `0`)
1987	{
1988	/ one_utf8_to_cppchar implements 5-byte and 6 byte sequences as per*
1989	RFC 2279, but this has been superceded by RFC 3629, which
1990	restricts UTF-8 to 1-byte through 4-byte sequences, and
1991	states "the octet values C0, C1, F5 to FF never appear".
1992
1993	Reject such values. /*
1994	if (*iter >= `0xf4`)
1995	return false;
1996
1997	cppchar_t cp;
1998	int err = one_utf8_to_cppchar (inbufp: &iter, inbytesleftp: &bytesleft, cp: &cp);
1999	if (err)
2000	return false;
2001
2002	/ Additionally, Unicode declares that all codepoints above 0010FFFF are*
2003	invalid because they cannot be represented in UTF-16.
2004
2005	Reject such values./*
2006	if (cp > UCS_LIMIT)
2007	return false;
2008	}
2009	/ No problems encountered. /
2010	return true;
2011	}
2012
2013	/ Subroutine of convert_hex and convert_oct. N is the representation*
2014	in the execution character set of a numeric escape; write it into the
2015	string buffer TBUF and update the end-of-string pointer therein. WIDE
2016	is true if it's a wide string that's being assembled in TBUF. This
2017	function issues no diagnostics and never fails. /*
2018	static void
2019	emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
2020	struct _cpp_strbuf tbuf, struct* cset_converter cvt)
2021	{
2022	size_t width = cvt.width;
2023
2024	if (width != CPP_OPTION (pfile, char_precision))
2025	{
2026	/ We have to render this into the target byte order, which may not*
2027	be our byte order. /*
2028	bool bigend = CPP_OPTION (pfile, bytes_big_endian);
2029	size_t cwidth = CPP_OPTION (pfile, char_precision);
2030	size_t cmask = width_to_mask (width: cwidth);
2031	size_t nbwc = width / cwidth;
2032	size_t i;
2033	size_t off = tbuf->len;
2034	cppchar_t c;
2035
2036	if (tbuf->len + nbwc > tbuf->asize)
2037	{
2038	tbuf->asize += OUTBUF_BLOCK_SIZE;
2039	tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
2040	}
2041
2042	for (i = `0`; i < nbwc; i++)
2043	{
2044	c = n & cmask;
2045	n >>= cwidth;
2046	tbuf->text[off + (bigend ? nbwc - i - `1` : i)] = c;
2047	}
2048	tbuf->len += nbwc;
2049	}
2050	else
2051	{
2052	/ Note: this code does not handle the case where the target*
2053	and host have a different number of bits in a byte. /*
2054	if (tbuf->len + `1` > tbuf->asize)
2055	{
2056	tbuf->asize += OUTBUF_BLOCK_SIZE;
2057	tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
2058	}
2059	tbuf->text[tbuf->len++] = n;
2060	}
2061	}
2062
2063	/ Convert a hexadecimal escape, pointed to by FROM, to the execution*
2064	character set and write it into the string buffer TBUF (if non-NULL).
2065	Returns an advanced pointer, and issues diagnostics as necessary.
2066	No character set translation occurs; this routine always produces the
2067	execution-set character with numeric value equal to the given hex
2068	number. You can, e.g. generate surrogate pairs this way.
2069	If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
2070	contains the location of the character so far: location information
2071	is read from LOC_READER, and RANGES is updated accordingly. /*
2072	static const uchar *
2073	convert_hex (cpp_reader pfile, const* uchar from, const* uchar *limit,
2074	struct _cpp_strbuf tbuf, struct* cset_converter cvt,
2075	source_range char_range,
2076	cpp_string_location_reader *loc_reader,
2077	cpp_substring_ranges *ranges)
2078	{
2079	cppchar_t c, n = `0`, overflow = `0`;
2080	int digits_found = `0`;
2081	size_t width = cvt.width;
2082	size_t mask = width_to_mask (width);
2083	bool delimited = false;
2084	const uchar *base = from - `1`;
2085
2086	/ loc_reader and ranges must either be both NULL, or both be non-NULL. /
2087	gcc_assert ((loc_reader != NULL) == (ranges != NULL));
2088
2089	if (CPP_WTRADITIONAL (pfile))
2090	cpp_warning (pfile, CPP_W_TRADITIONAL,
2091	msgid: "the meaning of '\\x' is different in traditional C");
2092
2093	/ Skip 'x'. /
2094	from++;
2095
2096	/ The 'x' is part of the spelling of this character. /
2097	extend_char_range (char_range: &char_range, loc_reader);
2098
2099	if (from < limit && *from == `'{'`)
2100	{
2101	delimited = true;
2102	from++;
2103	extend_char_range (char_range: &char_range, loc_reader);
2104	}
2105
2106	while (from < limit)
2107	{
2108	c = *from;
2109	if (! hex_p (c))
2110	break;
2111	from++;
2112	extend_char_range (char_range: &char_range, loc_reader);
2113	overflow \|= n ^ (n << `4` >> `4`);
2114	n = (n << `4`) + hex_value (c);
2115	digits_found = `1`;
2116	}
2117
2118	if (delimited && from < limit && *from == `'}'`)
2119	{
2120	from++;
2121	if (!digits_found)
2122	{
2123	cpp_error (pfile, CPP_DL_ERROR,
2124	msgid: "empty delimited escape sequence");
2125	return from;
2126	}
2127	else if (!CPP_OPTION (pfile, delimited_escape_seqs)
2128	&& CPP_OPTION (pfile, cpp_pedantic))
2129	cpp_error (pfile, CPP_DL_PEDWARN,
2130	msgid: "delimited escape sequences are only valid in C++23");
2131	delimited = false;
2132	extend_char_range (char_range: &char_range, loc_reader);
2133	}
2134
2135	if (!digits_found)
2136	{
2137	cpp_error (pfile, CPP_DL_ERROR,
2138	msgid: "\\x used with no following hex digits");
2139	return from;
2140	}
2141	else if (delimited)
2142	{
2143	cpp_error (pfile, CPP_DL_ERROR,
2144	msgid: "'\\x{' not terminated with '}' after %.*s",
2145	(int) (from - base), base);
2146	return from;
2147	}
2148
2149	if (overflow \| (n != (n & mask)))
2150	{
2151	cpp_error (pfile, CPP_DL_PEDWARN,
2152	msgid: "hex escape sequence out of range");
2153	n &= mask;
2154	}
2155
2156	if (tbuf)
2157	emit_numeric_escape (pfile, n, tbuf, cvt);
2158	if (ranges)
2159	ranges->add_range (range: char_range);
2160
2161	return from;
2162	}
2163
2164	/ Convert an octal escape, pointed to by FROM, to the execution*
2165	character set and write it into the string buffer TBUF. Returns an
2166	advanced pointer, and issues diagnostics as necessary.
2167	No character set translation occurs; this routine always produces the
2168	execution-set character with numeric value equal to the given octal
2169	number.
2170	If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
2171	contains the location of the character so far: location information
2172	is read from LOC_READER, and RANGES is updated accordingly. /*
2173	static const uchar *
2174	convert_oct (cpp_reader pfile, const* uchar from, const* uchar *limit,
2175	struct _cpp_strbuf tbuf, struct* cset_converter cvt,
2176	source_range char_range,
2177	cpp_string_location_reader *loc_reader,
2178	cpp_substring_ranges *ranges)
2179	{
2180	size_t count = `0`;
2181	cppchar_t c, n = `0`, overflow = `0`;
2182	size_t width = cvt.width;
2183	size_t mask = width_to_mask (width);
2184	bool delimited = false;
2185	const uchar *base = from - `1`;
2186
2187	/ loc_reader and ranges must either be both NULL, or both be non-NULL. /
2188	gcc_assert ((loc_reader != NULL) == (ranges != NULL));
2189
2190	if (from < limit && *from == `'o'`)
2191	{
2192	from++;
2193	extend_char_range (char_range: &char_range, loc_reader);
2194	if (from == limit \|\| *from != `'{'`)
2195	cpp_error (pfile, CPP_DL_ERROR, msgid: "'\\o' not followed by '{'");
2196	else
2197	{
2198	from++;
2199	extend_char_range (char_range: &char_range, loc_reader);
2200	delimited = true;
2201	}
2202	}
2203
2204	while (from < limit && count++ < `3`)
2205	{
2206	c = *from;
2207	if (c < `'0'` \|\| c > `'7'`)
2208	break;
2209	from++;
2210	extend_char_range (char_range: &char_range, loc_reader);
2211	if (delimited)
2212	{
2213	count = `2`;
2214	overflow \|= n ^ (n << `3` >> `3`);
2215	}
2216	n = (n << `3`) + c - `'0'`;
2217	}
2218
2219	if (delimited)
2220	{
2221	if (from < limit && *from == `'}'`)
2222	{
2223	from++;
2224	if (count == `1`)
2225	{
2226	cpp_error (pfile, CPP_DL_ERROR,
2227	msgid: "empty delimited escape sequence");
2228	return from;
2229	}
2230	else if (!CPP_OPTION (pfile, delimited_escape_seqs)
2231	&& CPP_OPTION (pfile, cpp_pedantic))
2232	cpp_error (pfile, CPP_DL_PEDWARN,
2233	msgid: "delimited escape sequences are only valid in C++23");
2234	extend_char_range (char_range: &char_range, loc_reader);
2235	}
2236	else
2237	{
2238	cpp_error (pfile, CPP_DL_ERROR,
2239	msgid: "'\\o{' not terminated with '}' after %.*s",
2240	(int) (from - base), base);
2241	return from;
2242	}
2243	}
2244
2245	if (overflow \| (n != (n & mask)))
2246	{
2247	cpp_error (pfile, CPP_DL_PEDWARN,
2248	msgid: "octal escape sequence out of range");
2249	n &= mask;
2250	}
2251
2252	if (tbuf)
2253	emit_numeric_escape (pfile, n, tbuf, cvt);
2254	if (ranges)
2255	ranges->add_range (range: char_range);
2256
2257	return from;
2258	}
2259
2260	/ Convert an escape sequence (pointed to by FROM) to its value on*
2261	the target, and to the execution character set. Do not scan past
2262	LIMIT. Write the converted value into TBUF, if TBUF is non-NULL.
2263	Returns an advanced pointer. Handles all relevant diagnostics.
2264	If LOC_READER is non-NULL, then RANGES must be non-NULL: location
2265	information is read from LOC_READER, and RANGES is updated
2266	accordingly. /*
2267	static const uchar *
2268	convert_escape (cpp_reader pfile, const* uchar from, const* uchar *limit,
2269	struct _cpp_strbuf tbuf, struct* cset_converter cvt,
2270	cpp_string_location_reader *loc_reader,
2271	cpp_substring_ranges ranges, bool* uneval)
2272	{
2273	/ Values of \a \b \e \f \n \r \t \v respectively. /
2274	#if HOST_CHARSET == HOST_CHARSET_ASCII
2275	static const uchar charconsts[] = { `7`, `8`, `27`, `12`, `10`, `13`, `9`, `11` };
2276	#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
2277	static const uchar charconsts[] = { `47`, `22`, `39`, `12`, `21`, `13`, `5`, `11` };
2278	#else
2279	#error "unknown host character set"
2280	#endif
2281
2282	uchar c;
2283
2284	/ Record the location of the backslash. /
2285	source_range char_range;
2286	if (loc_reader)
2287	char_range = loc_reader->get_next ();
2288
2289	c = *from;
2290	switch (c)
2291	{
2292	/ UCNs, hex escapes, and octal escapes are processed separately. /
2293	case `'u'`: case `'U'`: case `'N'`:
2294	return convert_ucn (pfile, from, limit, tbuf, cvt,
2295	char_range, loc_reader, ranges);
2296
2297	case `'x'`:
2298	if (uneval && CPP_PEDANTIC (pfile))
2299	cpp_error (pfile, CPP_DL_PEDWARN,
2300	msgid: "numeric escape sequence in unevaluated string: "
2301	"'\\%c'", (int) c);
2302	return convert_hex (pfile, from, limit, tbuf, cvt,
2303	char_range, loc_reader, ranges);
2304
2305	case `'0'`: case `'1'`: case `'2'`: case `'3'`:
2306	case `'4'`: case `'5'`: case `'6'`: case `'7'`:
2307	case `'o'`:
2308	if (uneval && CPP_PEDANTIC (pfile))
2309	cpp_error (pfile, CPP_DL_PEDWARN,
2310	msgid: "numeric escape sequence in unevaluated string: "
2311	"'\\%c'", (int) c);
2312	return convert_oct (pfile, from, limit, tbuf, cvt,
2313	char_range, loc_reader, ranges);
2314
2315	/ Various letter escapes. Get the appropriate host-charset*
2316	value into C. /*
2317	case `'\\'`: case `'\''`: case `'"'`: case `'?'`: break;
2318
2319	case `'('`: case `'{'`: case `'['`: case `'%'`:
2320	/ '\(', etc, can be used at the beginning of a line in a long*
2321	string split onto multiple lines with \-newline, to prevent
2322	Emacs or other text editors from getting confused. '\%' can
2323	be used to prevent SCCS from mangling printf format strings. /*
2324	if (CPP_PEDANTIC (pfile))
2325	goto unknown;
2326	break;
2327
2328	case `'b'`: c = charconsts[`1`]; break;
2329	case `'f'`: c = charconsts[`3`]; break;
2330	case `'n'`: c = charconsts[`4`]; break;
2331	case `'r'`: c = charconsts[`5`]; break;
2332	case `'t'`: c = charconsts[`6`]; break;
2333	case `'v'`: c = charconsts[`7`]; break;
2334
2335	case `'a'`:
2336	if (CPP_WTRADITIONAL (pfile))
2337	cpp_warning (pfile, CPP_W_TRADITIONAL,
2338	msgid: "the meaning of '\\a' is different in traditional C");
2339	c = charconsts[`0`];
2340	break;
2341
2342	case `'e'`: case `'E'`:
2343	if (CPP_PEDANTIC (pfile))
2344	cpp_error (pfile, CPP_DL_PEDWARN,
2345	msgid: "non-ISO-standard escape sequence, '\\%c'", (int) c);
2346	c = charconsts[`2`];
2347	break;
2348
2349	default:
2350	unknown:
2351	if (ISGRAPH (c))
2352	cpp_error (pfile, CPP_DL_PEDWARN,
2353	msgid: "unknown escape sequence: '\\%c'", (int) c);
2354	else
2355	{
2356	encoding_rich_location rich_loc (pfile);
2357
2358	/ diagnostic.cc does not support "%03o". When it does, this*
2359	code can use %03o directly in the diagnostic again. /*
2360	char buf[`32`];
2361	sprintf(s: buf, format: "%03o", (int) c);
2362	cpp_error_at (pfile, CPP_DL_PEDWARN, richloc: &rich_loc,
2363	msgid: "unknown escape sequence: '\\%s'", buf);
2364	}
2365	}
2366
2367	if (tbuf)
2368	/ Now convert what we have to the execution character set. /
2369	if (!APPLY_CONVERSION (cvt, &c, `1`, tbuf))
2370	cpp_errno (pfile, CPP_DL_ERROR,
2371	msgid: "converting escape sequence to execution character set");
2372
2373	if (loc_reader)
2374	{
2375	char_range.m_finish = loc_reader->get_next ().m_finish;
2376	ranges->add_range (range: char_range);
2377	}
2378
2379	return from + `1`;
2380	}
2381
2382	/ TYPE is a token type. The return value is the conversion needed to*
2383	convert from source to execution character set for the given type. /*
2384	static struct cset_converter
2385	converter_for_type (cpp_reader pfile, enum* cpp_ttype type)
2386	{
2387	switch (type)
2388	{
2389	default:
2390	return pfile->narrow_cset_desc;
2391	case CPP_UTF8CHAR:
2392	case CPP_UTF8STRING:
2393	return pfile->utf8_cset_desc;
2394	case CPP_CHAR16:
2395	case CPP_STRING16:
2396	return pfile->char16_cset_desc;
2397	case CPP_CHAR32:
2398	case CPP_STRING32:
2399	return pfile->char32_cset_desc;
2400	case CPP_WCHAR:
2401	case CPP_WSTRING:
2402	return pfile->wide_cset_desc;
2403	}
2404	}
2405
2406	/ FROM is an array of cpp_string structures of length COUNT. These*
2407	are to be converted from the source to the execution character set,
2408	escape sequences translated, and finally all are to be
2409	concatenated. WIDE indicates whether or not to produce a wide
2410	string. If TO is non-NULL, the result is written into TO.
2411	If LOC_READERS and OUT are non-NULL, then location information
2412	is read from LOC_READERS (which must be an array of length COUNT),
2413	and location information is written to RANGES.*
2414
2415	Returns true for success, false for failure. /*
2416
2417	static bool
2418	cpp_interpret_string_1 (cpp_reader pfile, const* cpp_string *from, size_t count,
2419	cpp_string to, enum* cpp_ttype type,
2420	cpp_string_location_reader *loc_readers,
2421	cpp_substring_ranges *out)
2422	{
2423	struct _cpp_strbuf tbuf;
2424	const uchar p, base, *limit;
2425	size_t i;
2426	struct cset_converter cvt = converter_for_type (pfile, type);
2427
2428	/ loc_readers and out must either be both NULL, or both be non-NULL. /
2429	gcc_assert ((loc_readers != NULL) == (out != NULL));
2430
2431	if (to)
2432	{
2433	tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
2434	tbuf.text = XNEWVEC (uchar, tbuf.asize);
2435	tbuf.len = `0`;
2436	}
2437
2438	cpp_string_location_reader *loc_reader = NULL;
2439	for (i = `0`; i < count; i++)
2440	{
2441	if (loc_readers)
2442	loc_reader = &loc_readers[i];
2443
2444	p = from[i].text;
2445	if (*p == `'u'`)
2446	{
2447	p++;
2448	if (loc_reader)
2449	loc_reader->get_next ();
2450	if (*p == `'8'`)
2451	{
2452	p++;
2453	if (loc_reader)
2454	loc_reader->get_next ();
2455	}
2456	}
2457	else if (p == `'L'` \|\| p == `'U'`) p++;
2458	if (*p == `'R'`)
2459	{
2460	const uchar *prefix;
2461
2462	/ Skip over 'R"'. /
2463	p += `2`;
2464	if (loc_reader)
2465	{
2466	loc_reader->get_next ();
2467	loc_reader->get_next ();
2468	}
2469	prefix = p;
2470	while (*p != `'('`)
2471	{
2472	p++;
2473	if (loc_reader)
2474	loc_reader->get_next ();
2475	}
2476	p++;
2477	if (loc_reader)
2478	loc_reader->get_next ();
2479	limit = from[i].text + from[i].len;
2480	if (limit >= p + (p - prefix) + `1`)
2481	limit -= (p - prefix) + `1`;
2482
2483	/ Raw strings are all normal characters; these can be fed*
2484	directly to convert_cset. /*
2485	if (to)
2486	if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf))
2487	goto fail;
2488
2489	if (loc_reader)
2490	{
2491	/ If generating source ranges, assume we have a 1:1*
2492	correspondence between bytes in the source encoding and bytes
2493	in the execution encoding (e.g. if we have a UTF-8 to UTF-8
2494	conversion), so that this run of bytes in the source file
2495	corresponds to a run of bytes in the execution string.
2496	This requirement is guaranteed by an early-reject in
2497	cpp_interpret_string_ranges. /*
2498	gcc_assert (cvt.func == convert_no_conversion);
2499	out->add_n_ranges (num: limit - p, loc_reader&: *loc_reader);
2500	}
2501
2502	continue;
2503	}
2504
2505	/ If we don't now have a leading quote, something has gone wrong.*
2506	This can occur if cpp_interpret_string_ranges is handling a
2507	stringified macro argument, but should not be possible otherwise. /*
2508	if (p != `'"'` && p != `'\''`)
2509	{
2510	gcc_assert (out != NULL);
2511	cpp_error (pfile, CPP_DL_ERROR, msgid: "missing open quote");
2512	if (to)
2513	free (ptr: tbuf.text);
2514	return false;
2515	}
2516
2517	/ Skip leading quote. /
2518	p++;
2519	if (loc_reader)
2520	loc_reader->get_next ();
2521
2522	limit = from[i].text + from[i].len - `1`; / Skip trailing quote. /
2523
2524	for (;;)
2525	{
2526	base = p;
2527	while (p < limit && *p != `'\\'`)
2528	p++;
2529	if (p > base)
2530	{
2531	/ We have a run of normal characters; these can be fed*
2532	directly to convert_cset. /*
2533	if (to)
2534	if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
2535	goto fail;
2536	/ Similar to above: assumes we have a 1:1 correspondence*
2537	between bytes in the source encoding and bytes in the
2538	execution encoding. /*
2539	if (loc_reader)
2540	{
2541	gcc_assert (cvt.func == convert_no_conversion);
2542	out->add_n_ranges (num: p - base, loc_reader&: *loc_reader);
2543	}
2544	}
2545	if (p >= limit)
2546	break;
2547
2548	struct _cpp_strbuf *tbuf_ptr = to ? &tbuf : NULL;
2549	p = convert_escape (pfile, from: p + `1`, limit, tbuf: tbuf_ptr, cvt,
2550	loc_reader, ranges: out, uneval: type == CPP_UNEVAL_STRING);
2551	}
2552	}
2553
2554	if (to)
2555	{
2556	/ NUL-terminate the 'to' buffer and translate it to a cpp_string*
2557	structure. /*
2558	emit_numeric_escape (pfile, n: `0`, tbuf: &tbuf, cvt);
2559	tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
2560	to->text = tbuf.text;
2561	to->len = tbuf.len;
2562	}
2563	/ Use the location of the trailing quote as the location of the*
2564	NUL-terminator. /*
2565	if (loc_reader)
2566	{
2567	source_range range = loc_reader->get_next ();
2568	out->add_range (range);
2569	}
2570
2571	return true;
2572
2573	fail:
2574	cpp_errno (pfile, CPP_DL_ERROR, msgid: "converting to execution character set");
2575	if (to)
2576	free (ptr: tbuf.text);
2577	return false;
2578	}
2579
2580	/ FROM is an array of cpp_string structures of length COUNT. These*
2581	are to be converted from the source to the execution character set,
2582	escape sequences translated, and finally all are to be
2583	concatenated. WIDE indicates whether or not to produce a wide
2584	string. The result is written into TO. Returns true for success,
2585	false for failure. /*
2586	bool
2587	cpp_interpret_string (cpp_reader pfile, const* cpp_string *from, size_t count,
2588	cpp_string to, enum* cpp_ttype type)
2589	{
2590	return cpp_interpret_string_1 (pfile, from, count, to, type, NULL, NULL);
2591	}
2592
2593	/ A "do nothing" diagnostic-handling callback for use by*
2594	cpp_interpret_string_ranges, so that it can temporarily suppress
2595	diagnostic-handling. /*
2596
2597	static bool
2598	noop_diagnostic_cb (cpp_reader , enum* cpp_diagnostic_level,
2599	enum cpp_warning_reason, rich_location *,
2600	const char , va_list )
2601	{
2602	/ no-op. /
2603	return true;
2604	}
2605
2606	/ This function mimics the behavior of cpp_interpret_string, but*
2607	rather than generating a string in the execution character set,
2608	*OUT is written to with the source code ranges of the characters
2609	in such a string.
2610	FROM and LOC_READERS should both be arrays of length COUNT.
2611	Returns NULL for success, or an error message for failure. /*
2612
2613	const char *
2614	cpp_interpret_string_ranges (cpp_reader pfile, const* cpp_string *from,
2615	cpp_string_location_reader *loc_readers,
2616	size_t count,
2617	cpp_substring_ranges *out,
2618	enum cpp_ttype type)
2619	{
2620	/ There are a couple of cases in the range-handling in*
2621	cpp_interpret_string_1 that rely on there being a 1:1 correspondence
2622	between bytes in the source encoding and bytes in the execution
2623	encoding, so that each byte in the execution string can correspond
2624	to the location of a byte in the source string.
2625
2626	This holds for the typical case of a UTF-8 to UTF-8 conversion.
2627	Enforce this requirement by only attempting to track substring
2628	locations if we have source encoding == execution encoding.
2629
2630	This is a stronger condition than we need, since we could e.g.
2631	have ASCII to EBCDIC (with 1 byte per character before and after),
2632	but it seems to be a reasonable restriction. /*
2633	struct cset_converter cvt = converter_for_type (pfile, type);
2634	if (cvt.func != convert_no_conversion)
2635	return "execution character set != source character set";
2636
2637	/ For on-demand strings we have already lexed the strings, so there*
2638	should be no diagnostics. However, if we have bogus source location
2639	data (or stringified macro arguments), the attempt to lex the
2640	strings could fail with an diagnostic. Temporarily install an
2641	diagnostic-handler to catch the diagnostic, so that it can lead to this call
2642	failing, rather than being emitted as a user-visible diagnostic.
2643	If an diagnostic does occur, we should see it via the return value of
2644	cpp_interpret_string_1. /*
2645	bool (saved_diagnostic_handler) (cpp_reader , enum cpp_diagnostic_level,
2646	enum cpp_warning_reason, rich_location *,
2647	const char , va_list )
2648	ATTRIBUTE_FPTR_PRINTF(`5`,`0`);
2649
2650	saved_diagnostic_handler = pfile->cb.diagnostic;
2651	pfile->cb.diagnostic = noop_diagnostic_cb;
2652
2653	bool result = cpp_interpret_string_1 (pfile, from, count, NULL, type,
2654	loc_readers, out);
2655
2656	/ Restore the saved diagnostic-handler. /
2657	pfile->cb.diagnostic = saved_diagnostic_handler;
2658
2659	if (!result)
2660	return "cpp_interpret_string_1 failed";
2661
2662	/ Success. /
2663	return NULL;
2664	}
2665
2666	/ Subroutine of do_line and do_linemarker. Convert escape sequences*
2667	in a string, but do not perform character set conversion. /*
2668	bool
2669	cpp_interpret_string_notranslate (cpp_reader pfile, const* cpp_string *from,
2670	size_t count, cpp_string *to,
2671	enum cpp_ttype type)
2672	{
2673	struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
2674	bool retval;
2675
2676	pfile->narrow_cset_desc.func = convert_no_conversion;
2677	pfile->narrow_cset_desc.cd = (iconv_t) -`1`;
2678	pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
2679
2680	retval = cpp_interpret_string (pfile, from, count, to,
2681	type: type == CPP_UNEVAL_STRING
2682	? CPP_UNEVAL_STRING : CPP_STRING);
2683
2684	pfile->narrow_cset_desc = save_narrow_cset_desc;
2685	return retval;
2686	}
2687
2688
2689	/ Return number of source characters in STR. /
2690	static unsigned
2691	count_source_chars (cpp_reader *pfile, cpp_string str, cpp_ttype type)
2692	{
2693	cpp_string str2 = { .len: `0`, .text: `0` };
2694	bool (saved_diagnostic_handler) (cpp_reader , enum cpp_diagnostic_level,
2695	enum cpp_warning_reason, rich_location *,
2696	const char , va_list )
2697	ATTRIBUTE_FPTR_PRINTF(`5`,`0`);
2698	saved_diagnostic_handler = pfile->cb.diagnostic;
2699	pfile->cb.diagnostic = noop_diagnostic_cb;
2700	convert_f save_func = pfile->narrow_cset_desc.func;
2701	pfile->narrow_cset_desc.func = convert_count_chars;
2702	bool ret = cpp_interpret_string (pfile, from: &str, count: `1`, to: &str2, type);
2703	pfile->narrow_cset_desc.func = save_func;
2704	pfile->cb.diagnostic = saved_diagnostic_handler;
2705	if (ret)
2706	{
2707	if (str2.text != str.text)
2708	free (ptr: (void *)str2.text);
2709	return str2.len;
2710	}
2711	else
2712	return `0`;
2713	}
2714
2715	/ Subroutine of cpp_interpret_charconst which performs the conversion*
2716	to a number, for narrow strings. STR is the string structure returned
2717	by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
2718	cpp_interpret_charconst. TOKEN is the token. /*
2719	static cppchar_t
2720	narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
2721	unsigned int pchars_seen, int* *unsignedp,
2722	const cpp_token *token)
2723	{
2724	enum cpp_ttype type = token->type;
2725	size_t width = CPP_OPTION (pfile, char_precision);
2726	size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
2727	size_t mask = width_to_mask (width);
2728	size_t i;
2729	cppchar_t result, c;
2730	bool unsigned_p;
2731	bool diagnosed = false;
2732
2733	/ The value of a multi-character character constant, or a*
2734	single-character character constant whose representation in the
2735	execution character set is more than one byte long, is
2736	implementation defined. This implementation defines it to be the
2737	number formed by interpreting the byte sequence in memory as a
2738	big-endian binary number. If overflow occurs, the high bytes are
2739	lost, and a warning is issued.
2740
2741	We don't want to process the NUL terminator handed back by
2742	cpp_interpret_string. /*
2743	result = `0`;
2744	for (i = `0`; i < str.len - `1`; i++)
2745	{
2746	c = str.text[i] & mask;
2747	if (width < BITS_PER_CPPCHAR_T)
2748	result = (result << width) \| c;
2749	else
2750	result = c;
2751	}
2752
2753	if (type == CPP_UTF8CHAR)
2754	max_chars = `1`;
2755	else if (i > `1` && CPP_OPTION (pfile, cplusplus) && CPP_PEDANTIC (pfile))
2756	{
2757	/ C++ as a DR since*
2758	P1854R4 - Making non-encodable string literals ill-formed
2759	makes multi-character narrow character literals if any of the
2760	characters in the literal isn't encodable in char/unsigned char
2761	ill-formed. We need to count the number of c-chars and compare
2762	that to str.len. /*
2763	unsigned src_chars = count_source_chars (pfile, str: token->val.str, type);
2764
2765	if (src_chars)
2766	{
2767	if (str.len > src_chars)
2768	{
2769	if (src_chars <= `2`)
2770	diagnosed
2771	= cpp_error (pfile, CPP_DL_PEDWARN,
2772	msgid: "character not encodable in a single execution "
2773	"character code unit");
2774	else
2775	diagnosed
2776	= cpp_error (pfile, CPP_DL_PEDWARN,
2777	msgid: "at least one character in a multi-character "
2778	"literal not encodable in a single execution "
2779	"character code unit");
2780	if (diagnosed && i > max_chars)
2781	i = max_chars;
2782	}
2783	}
2784	}
2785	if (diagnosed)
2786	/ Already diagnosed above. /;
2787	else if (i > max_chars)
2788	{
2789	unsigned src_chars
2790	= count_source_chars (pfile, str: token->val.str,
2791	type: type == CPP_UTF8CHAR ? CPP_CHAR : type);
2792
2793	if (type != CPP_UTF8CHAR)
2794	cpp_error (pfile, CPP_DL_WARNING,
2795	msgid: "multi-character literal with %ld characters exceeds "
2796	"'int' size of %ld bytes", (long) i, (long) max_chars);
2797	else if (src_chars > `2`)
2798	cpp_error (pfile, CPP_DL_ERROR,
2799	msgid: "multi-character literal cannot have an encoding prefix");
2800	else
2801	cpp_error (pfile, CPP_DL_ERROR,
2802	msgid: "character not encodable in a single code unit");
2803	i = max_chars;
2804	}
2805	else if (i > `1` && CPP_OPTION (pfile, warn_multichar))
2806	cpp_warning (pfile, CPP_W_MULTICHAR, msgid: "multi-character character constant");
2807
2808	/ Multichar constants are of type int and therefore signed. /
2809	if (i > `1`)
2810	unsigned_p = `0`;
2811	else if (type == CPP_UTF8CHAR)
2812	unsigned_p = CPP_OPTION (pfile, unsigned_utf8char);
2813	else
2814	unsigned_p = CPP_OPTION (pfile, unsigned_char);
2815
2816	/ Truncate the constant to its natural width, and simultaneously*
2817	sign- or zero-extend to the full width of cppchar_t.
2818	For single-character constants, the value is WIDTH bits wide.
2819	For multi-character constants, the value is INT_PRECISION bits wide. /*
2820	if (i > `1`)
2821	width = CPP_OPTION (pfile, int_precision);
2822	if (width < BITS_PER_CPPCHAR_T)
2823	{
2824	mask = ((cppchar_t) `1` << width) - `1`;
2825	if (unsigned_p \|\| !(result & (`1` << (width - `1`))))
2826	result &= mask;
2827	else
2828	result \|= ~mask;
2829	}
2830	*pchars_seen = i;
2831	*unsignedp = unsigned_p;
2832	return result;
2833	}
2834
2835	/ Subroutine of cpp_interpret_charconst which performs the conversion*
2836	to a number, for wide strings. STR is the string structure returned
2837	by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
2838	cpp_interpret_charconst. TOKEN is the token. /*
2839	static cppchar_t
2840	wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
2841	unsigned int pchars_seen, int* *unsignedp,
2842	const cpp_token *token)
2843	{
2844	enum cpp_ttype type = token->type;
2845	bool bigend = CPP_OPTION (pfile, bytes_big_endian);
2846	size_t width = converter_for_type (pfile, type).width;
2847	size_t cwidth = CPP_OPTION (pfile, char_precision);
2848	size_t mask = width_to_mask (width);
2849	size_t cmask = width_to_mask (width: cwidth);
2850	size_t nbwc = width / cwidth;
2851	size_t off, i;
2852	cppchar_t result = `0`, c;
2853
2854	if (str.len <= nbwc)
2855	{
2856	/ Error recovery, if no errors have been diagnosed previously,*
2857	there should be at least two wide characters. Empty literals
2858	are diagnosed earlier and we can get just the zero terminator
2859	only if there were errors diagnosed during conversion. /*
2860	*pchars_seen = `0`;
2861	*unsignedp = `0`;
2862	return `0`;
2863	}
2864
2865	/ This is finicky because the string is in the target's byte order,*
2866	which may not be our byte order. Only the last character, ignoring
2867	the NUL terminator, is relevant. /*
2868	off = str.len - (nbwc * `2`);
2869	result = `0`;
2870	for (i = `0`; i < nbwc; i++)
2871	{
2872	c = bigend ? str.text[off + i] : str.text[off + nbwc - i - `1`];
2873	result = (result << cwidth) \| (c & cmask);
2874	}
2875
2876	/ Wide character constants have type wchar_t, and a single*
2877	character exactly fills a wchar_t, so a multi-character wide
2878	character constant is guaranteed to overflow. /*
2879	if (str.len > nbwc * `2`)
2880	{
2881	cpp_diagnostic_level level = CPP_DL_WARNING;
2882	unsigned src_chars
2883	= count_source_chars (pfile, str: token->val.str, type: CPP_CHAR);
2884
2885	if (CPP_OPTION (pfile, cplusplus)
2886	&& (type == CPP_CHAR16
2887	\|\| type == CPP_CHAR32
2888	/ In C++23 this is error even for L'ab'. /
2889	\|\| (type == CPP_WCHAR
2890	&& CPP_OPTION (pfile, size_t_literals))))
2891	level = CPP_DL_ERROR;
2892	if (src_chars > `2`)
2893	cpp_error (pfile, level,
2894	msgid: "multi-character literal cannot have an encoding prefix");
2895	else
2896	cpp_error (pfile, level,
2897	msgid: "character not encodable in a single code unit");
2898	}
2899
2900	/ Truncate the constant to its natural width, and simultaneously*
2901	sign- or zero-extend to the full width of cppchar_t. /*
2902	if (width < BITS_PER_CPPCHAR_T)
2903	{
2904	if (type == CPP_CHAR16 \|\| type == CPP_CHAR32
2905	\|\| CPP_OPTION (pfile, unsigned_wchar)
2906	\|\| !(result & (`1` << (width - `1`))))
2907	result &= mask;
2908	else
2909	result \|= ~mask;
2910	}
2911
2912	if (type == CPP_CHAR16 \|\| type == CPP_CHAR32
2913	\|\| CPP_OPTION (pfile, unsigned_wchar))
2914	*unsignedp = `1`;
2915	else
2916	*unsignedp = `0`;
2917
2918	*pchars_seen = `1`;
2919	return result;
2920	}
2921
2922	/ Interpret a (possibly wide) character constant in TOKEN.*
2923	PCHARS_SEEN points to a variable that is filled in with the number
2924	of characters seen, and UNSIGNEDP to a variable that indicates
2925	whether the result has signed type. /*
2926	cppchar_t
2927	cpp_interpret_charconst (cpp_reader pfile, const* cpp_token *token,
2928	unsigned int pchars_seen, int* *unsignedp)
2929	{
2930	cpp_string str = { .len: `0`, .text: `0` };
2931	bool wide = (token->type != CPP_CHAR && token->type != CPP_UTF8CHAR);
2932	int u8 = `2` * int(token->type == CPP_UTF8CHAR);
2933	cppchar_t result;
2934
2935	/ An empty constant will appear as L'', u'', U'', u8'', or '' /
2936	if (token->val.str.len == (size_t) (`2` + wide + u8))
2937	{
2938	cpp_error (pfile, CPP_DL_ERROR, msgid: "empty character constant");
2939	*pchars_seen = `0`;
2940	*unsignedp = `0`;
2941	return `0`;
2942	}
2943	else if (!cpp_interpret_string (pfile, from: &token->val.str, count: `1`, to: &str,
2944	type: token->type))
2945	{
2946	*pchars_seen = `0`;
2947	*unsignedp = `0`;
2948	return `0`;
2949	}
2950
2951	if (wide)
2952	result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
2953	token);
2954	else
2955	result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp,
2956	token);
2957
2958	if (str.text != token->val.str.text)
2959	free (ptr: (void *)str.text);
2960
2961	return result;
2962	}
2963
2964	/ Convert an identifier denoted by ID and LEN, which might contain*
2965	UCN escapes or UTF-8 multibyte chars, to the source character set,
2966	either UTF-8 or UTF-EBCDIC. Assumes that the identifier is actually
2967	a valid identifier. /*
2968	cpp_hashnode *
2969	_cpp_interpret_identifier (cpp_reader pfile, const* uchar *id, size_t len)
2970	{
2971	/ It turns out that a UCN escape always turns into fewer characters*
2972	than the escape itself, so we can allocate a temporary in advance. /*
2973	uchar * buf = (uchar *) alloca (len + `1`);
2974	uchar * bufp = buf;
2975	size_t idp;
2976
2977	for (idp = `0`; idp < len; idp++)
2978	if (id[idp] != `'\\'`)
2979	*bufp++ = id[idp];
2980	else
2981	{
2982	unsigned length = id[idp + `1`] == `'u'` ? `4` : `8`;
2983	cppchar_t value = `0`;
2984	size_t bufleft = len - (bufp - buf);
2985	int rval;
2986	bool delimited = false;
2987
2988	idp += `2`;
2989	if (id[idp - `1`] == `'N'` && id[idp] == `'{'`)
2990	{
2991	idp++;
2992	const uchar *name = &id[idp];
2993	while (idp < len
2994	&& (ISIDNUM (id[idp]) \|\| id[idp] == `' '` \|\| id[idp] == `'-'`))
2995	idp++;
2996	if (id[idp] == `'}'`)
2997	{
2998	value = _cpp_uname2c (name: (const char *) name, len: &id[idp] - name,
2999	n: uname2c_tree, NULL);
3000	if (value == (cppchar_t) -`1`)
3001	value = `1`;
3002	}
3003	else
3004	idp--;
3005	}
3006	else
3007	{
3008	if (length == `4` && id[idp] == `'{'`)
3009	{
3010	delimited = true;
3011	idp++;
3012	}
3013	while (length && idp < len && ISXDIGIT (id[idp]))
3014	{
3015	value = (value << `4`) + hex_value (id[idp]);
3016	idp++;
3017	if (!delimited)
3018	length--;
3019	}
3020	if (!delimited \|\| id[idp] != `'}'`)
3021	idp--;
3022	}
3023
3024	/ Special case for EBCDIC: if the identifier contains*
3025	a '$' specified using a UCN, translate it to EBCDIC. /*
3026	if (value == `0x24`)
3027	{
3028	*bufp++ = `'$'`;
3029	continue;
3030	}
3031
3032	rval = one_cppchar_to_utf8 (c: value, outbufp: &bufp, outbytesleftp: &bufleft);
3033	if (rval)
3034	{
3035	errno = rval;
3036	cpp_errno (pfile, CPP_DL_ERROR,
3037	msgid: "converting UCN to source character set");
3038	break;
3039	}
3040	}
3041
3042	return CPP_HASHNODE (ht_lookup (pfile->hash_table,
3043	buf, bufp - buf, HT_ALLOC));
3044	}
3045
3046
/* Utility to strip a UTF-8 byte order marking from the beginning
   of a buffer.  DATA points to the buffer and DATA_LENGTH is the
   number of bytes available in it.  Returns the number of bytes to
   skip, which currently will be either 0 or 3.  The check is only
   compiled in when the host character set is ASCII.  */
int
cpp_check_utf8_bom (const char *data, size_t data_length)
{
#if HOST_CHARSET == HOST_CHARSET_ASCII
  /* Compare as unsigned bytes so that the values above 0x7f match
     regardless of whether plain char is signed.  */
  const unsigned char *udata = (const unsigned char *) data;
  if (data_length >= 3 && udata[0] == 0xef && udata[1] == 0xbb
      && udata[2] == 0xbf)
    return 3;
#endif

  return 0;
}
3063
3064
3065	/ Convert an input buffer (containing the complete contents of one*
3066	source file) from INPUT_CHARSET to the source character set. INPUT
3067	points to the input buffer, SIZE is its allocated size, and LEN is
3068	the length of the meaningful data within the buffer. The
3069	translated buffer is returned, ST_SIZE is set to the length of*
3070	the meaningful data within the translated buffer, and BUFFER_START*
3071	is set to the start of the returned buffer. BUFFER_START may*
3072	differ from the return value in the case of a BOM or other ignored
3073	marker information.
3074
3075	INPUT is expected to have been allocated with xmalloc. This
3076	function will either set BUFFER_START to INPUT, or free it and set*
3077	*BUFFER_START to a pointer to another xmalloc-allocated block of
3078	memory.
3079
3080	PFILE is only used to generate diagnostics; setting it to NULL suppresses
3081	diagnostics, and causes a return of NULL if there was any error instead. /*
3082
3083	uchar *
3084	_cpp_convert_input (cpp_reader pfile, const* char *input_charset,
3085	uchar *input, size_t size, size_t len,
3086	const unsigned char *buffer_start, off_t st_size)
3087	{
3088	struct cset_converter input_cset;
3089	struct _cpp_strbuf to;
3090	unsigned char *buffer;
3091
3092	input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, from: input_charset);
3093	if (input_cset.func == convert_no_conversion)
3094	{
3095	to.text = input;
3096	to.asize = size;
3097	to.len = len;
3098	}
3099	else
3100	{
3101	to.asize = MAX (`65536`, len);
3102	to.text = XNEWVEC (uchar, to.asize);
3103	to.len = `0`;
3104
3105	const bool ok = APPLY_CONVERSION (input_cset, input, len, &to);
3106	free (ptr: input);
3107
3108	/ Clean up the mess. /
3109	if (input_cset.func == convert_using_iconv)
3110	iconv_close (input_cset.cd);
3111
3112	/ Handle conversion failure. /
3113	if (!ok)
3114	{
3115	if (!pfile)
3116	{
3117	XDELETEVEC (to.text);
3118	*buffer_start = NULL;
3119	*st_size = `0`;
3120	return NULL;
3121	}
3122	cpp_error (pfile, CPP_DL_ERROR, msgid: "failure to convert %s to %s",
3123	input_charset, SOURCE_CHARSET);
3124	}
3125	}
3126
3127	/ Resize buffer if we allocated substantially too much, or if we*
3128	haven't enough space for the \n-terminator or following
3129	15 bytes of padding (used to quiet warnings from valgrind or
3130	Address Sanitizer, when the optimized lexer accesses aligned
3131	16-byte memory chunks, including the bytes after the malloced,
3132	area, and stops lexing on '\n'). /*
3133	if (to.len + `4096` < to.asize \|\| to.len + `16` > to.asize)
3134	to.text = XRESIZEVEC (uchar, to.text, to.len + `16`);
3135
3136	memset (s: to.text + to.len, c: `'\0'`, n: `16`);
3137
3138	/ If the file is using old-school Mac line endings (\r only),*
3139	terminate with another \r, not an \n, so that we do not mistake
3140	the \r\n sequence for a single DOS line ending and erroneously
3141	issue the "No newline at end of file" diagnostic. /*
3142	if (to.len && to.text[to.len - `1`] == `'\r'`)
3143	to.text[to.len] = `'\r'`;
3144	else
3145	to.text[to.len] = `'\n'`;
3146
3147	buffer = to.text;
3148	*st_size = to.len;
3149
3150	/ Ignore a UTF-8 BOM if we see one and the source charset is UTF-8. Note*
3151	that glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
3152	BOM -- however, even if it did, we would still need this code due
3153	to the 'convert_no_conversion' case. /*
3154	const int bom_len = cpp_check_utf8_bom (data: (const char *) to.text, data_length: to.len);
3155	*st_size -= bom_len;
3156	buffer += bom_len;
3157
3158	*buffer_start = to.text;
3159	return buffer;
3160	}
3161
3162	/ Decide on the default encoding to assume for input files. /
3163	const char *
3164	_cpp_default_encoding (void)
3165	{
3166	const char *current_encoding = NULL;
3167
3168	/ We disable this because the default codeset is 7-bit ASCII on*
3169	most platforms, and this causes conversion failures on every
3170	file in GCC that happens to have one of the upper 128 characters
3171	in it -- most likely, as part of the name of a contributor.
3172	We should definitely recognize in-band markers of file encoding,
3173	like:
3174	- the appropriate Unicode byte-order mark (FE FF) to recognize
3175	UTF16 and UCS4 (in both big-endian and little-endian flavors)
3176	and UTF8
3177	- a "#i", "#d", "/ ", "//", " #p" or "#p" (for #pragma) to*
3178	distinguish ASCII and EBCDIC.
3179	- now we can parse something like "#pragma GCC encoding <xyz>
3180	on the first line, or even Emacs/VIM's mode line tags (there's
3181	a problem here in that VIM uses the last line, and Emacs has
3182	its more elaborate "local variables" convention).
3183	- investigate whether Java has another common convention, which
3184	would be friendly to support.
3185	(Zack Weinberg and Paolo Bonzini, May 20th 2004) /*
3186	#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
3187	setlocale (LC_CTYPE, "");
3188	current_encoding = nl_langinfo (CODESET);
3189	#endif
3190	if (current_encoding == NULL \|\| *current_encoding == `'\0'`)
3191	current_encoding = SOURCE_CHARSET;
3192
3193	return current_encoding;
3194	}
3195
3196	/ Check if the configured input charset requires no conversion, other than*
3197	possibly stripping a UTF-8 BOM. /*
3198	bool cpp_input_conversion_is_trivial (const char *input_charset)
3199	{
3200	return !strcasecmp (s1: input_charset, SOURCE_CHARSET);
3201	}
3202
3203	/ Implementation of class cpp_string_location_reader. /
3204
3205	/ Constructor for cpp_string_location_reader. /
3206
3207	cpp_string_location_reader::
3208	cpp_string_location_reader (location_t src_loc,
3209	line_maps *line_table)
3210	{
3211	src_loc = get_range_from_loc (set: line_table, loc: src_loc).m_start;
3212
3213	/ SRC_LOC might be a macro location. It only makes sense to do*
3214	column-by-column calculations on ordinary maps, so get the
3215	corresponding location in an ordinary map. /*
3216	m_loc
3217	= linemap_resolve_location (line_table, loc: src_loc,
3218	lrk: LRK_SPELLING_LOCATION, NULL);
3219
3220	const line_map_ordinary *map
3221	= linemap_check_ordinary (map: linemap_lookup (line_table, m_loc));
3222	m_offset_per_column = (`1` << map->m_range_bits);
3223	}
3224
3225	/ Get the range of the next source byte. /
3226
3227	source_range
3228	cpp_string_location_reader::get_next ()
3229	{
3230	source_range result;
3231	result.m_start = m_loc;
3232	result.m_finish = m_loc;
3233	if (m_loc <= LINE_MAP_MAX_LOCATION_WITH_COLS)
3234	m_loc += m_offset_per_column;
3235	return result;
3236	}
3237
3238	cpp_display_width_computation::
3239	cpp_display_width_computation (const char data, int* data_length,
3240	const cpp_char_column_policy &policy) :
3241	m_begin (data),
3242	m_next (m_begin),
3243	m_bytes_left (data_length),
3244	m_policy (policy),
3245	m_display_cols (`0`)
3246	{
3247	gcc_assert (policy.m_tabstop > `0`);
3248	gcc_assert (policy.m_width_cb);
3249	}
3250
3251
3252	/ The main implementation function for class cpp_display_width_computation.*
3253	m_next points on entry to the start of the UTF-8 encoding of the next
3254	character, and is updated to point just after the last byte of the encoding.
3255	m_bytes_left contains on entry the remaining size of the buffer into which
3256	m_next points, and this is also updated accordingly. If m_next does not
3257	point to a valid UTF-8-encoded sequence, then it will be treated as a single
3258	byte with display width 1. m_cur_display_col is the current display column,
3259	relative to which tab stops should be expanded. Returns the display width of
3260	the codepoint just processed.
3261	If OUT is non-NULL, it is populated. /*
3262
3263	int
3264	cpp_display_width_computation::process_next_codepoint (cpp_decoded_char *out)
3265	{
3266	cppchar_t c;
3267	int next_width;
3268
3269	if (out)
3270	out->m_start_byte = m_next;
3271
3272	if (*m_next == `'\t'`)
3273	{
3274	++m_next;
3275	--m_bytes_left;
3276	next_width = m_policy.m_tabstop - (m_display_cols % m_policy.m_tabstop);
3277	if (out)
3278	{
3279	out->m_ch = `'\t'`;
3280	out->m_valid_ch = true;
3281	}
3282	}
3283	else if (one_utf8_to_cppchar (inbufp: (const uchar **) &m_next, inbytesleftp: &m_bytes_left, cp: &c)
3284	!= `0`)
3285	{
3286	/ Input is not convertible to UTF-8. This could be fine, e.g. in a*
3287	string literal, so don't complain. Just treat it as if it has a width
3288	of one. /*
3289	++m_next;
3290	--m_bytes_left;
3291	next_width = m_policy.m_undecoded_byte_width;
3292	if (out)
3293	out->m_valid_ch = false;
3294	}
3295	else
3296	{
3297	/ one_utf8_to_cppchar() has updated m_next and m_bytes_left for us. /
3298	next_width = m_policy.m_width_cb (c);
3299	if (out)
3300	{
3301	out->m_ch = c;
3302	out->m_valid_ch = true;
3303	}
3304	}
3305
3306	if (out)
3307	out->m_next_byte = m_next;
3308
3309	m_display_cols += next_width;
3310	return next_width;
3311	}
3312
3313	/ Utility to advance the byte stream by the minimum amount needed to consume*
3314	N display columns. Returns the number of display columns that were
3315	actually skipped. This could be less than N, if there was not enough data,
3316	or more than N, if the last character to be skipped had a sufficiently large
3317	display width. /*
3318	int
3319	cpp_display_width_computation::advance_display_cols (int n)
3320	{
3321	const int start = m_display_cols;
3322	const int target = start + n;
3323	while (m_display_cols < target && !done ())
3324	process_next_codepoint (NULL);
3325	return m_display_cols - start;
3326	}
3327
3328	/ For the string of length DATA_LENGTH bytes that begins at DATA, compute*
3329	how many display columns are occupied by the first COLUMN bytes. COLUMN
3330	may exceed DATA_LENGTH, in which case the phantom bytes at the end are
3331	treated as if they have display width 1. Tabs are expanded to the next tab
3332	stop, relative to the start of DATA, and non-printable-ASCII characters
3333	will be escaped as per POLICY. /*
3334
3335	int
3336	cpp_byte_column_to_display_column (const char data, int* data_length,
3337	int column,
3338	const cpp_char_column_policy &policy)
3339	{
3340	const int offset = MAX (`0`, column - data_length);
3341	cpp_display_width_computation dw (data, column - offset, policy);
3342	while (!dw.done ())
3343	dw.process_next_codepoint (NULL);
3344	return dw.display_cols_processed () + offset;
3345	}
3346
3347	/ For the string of length DATA_LENGTH bytes that begins at DATA, compute*
3348	the least number of bytes that will result in at least DISPLAY_COL display
3349	columns. The return value may exceed DATA_LENGTH if the entire string does
3350	not occupy enough display columns. Non-printable-ASCII characters
3351	will be escaped as per POLICY. /*
3352
3353	int
3354	cpp_display_column_to_byte_column (const char data, int* data_length,
3355	int display_col,
3356	const cpp_char_column_policy &policy)
3357	{
3358	cpp_display_width_computation dw (data, data_length, policy);
3359	const int avail_display = dw.advance_display_cols (n: display_col);
3360	return dw.bytes_processed () + MAX (`0`, display_col - avail_display);
3361	}
3362
3363	template <typename PropertyType>
3364	PropertyType
3365	get_cppchar_property (cppchar_t c,
3366	const cppchar_t *range_ends,
3367	const PropertyType *range_values,
3368	size_t num_ranges,
3369	PropertyType default_value)
3370	{
3371	if (__builtin_expect (c <= range_ends[`0`], true))
3372	return range_values[`0`];
3373
3374	/ Binary search the tables. /
3375	int begin = `1`;
3376	static const int end = num_ranges;
3377	int len = end - begin;
3378	do
3379	{
3380	int half = len/`2`;
3381	int middle = begin + half;
3382	if (c > range_ends[middle])
3383	{
3384	begin = middle + `1`;
3385	len -= half + `1`;
3386	}
3387	else
3388	len = half;
3389	} while (len);
3390
3391	if (__builtin_expect (begin != end, true))
3392	return range_values[begin];
3393
3394	return default_value;
3395	}
3396
/* Our own version of wcwidth().  We don't use the actual wcwidth() in glibc,
   because that will inspect the user's locale, and in particular in an ASCII
   locale, it will not return anything useful for extended characters.  But GCC
   in other respects (see e.g. _cpp_default_encoding()) behaves as if
   everything is UTF-8.  We also make some tweaks that are useful for the way
   GCC needs to use this data, e.g. tabs and other control characters should be
   treated as having width 1.  The lookup tables are generated from
   contrib/unicode/gen_wcwidth.py and were made by simply calling glibc
   wcwidth() on all codepoints, then applying the small tweaks.  These tables
   are not highly optimized, but for the present purpose of outputting
   diagnostics, they are sufficient.  */
3408
3409	#include "generated_cpp_wcwidth.h"
3410
3411	int
3412	cpp_wcwidth (cppchar_t c)
3413	{
3414	const size_t num_ranges
3415	= sizeof wcwidth_range_ends / sizeof (*wcwidth_range_ends);
3416	return get_cppchar_property<unsigned char > (c,
3417	range_ends: &wcwidth_range_ends[`0`],
3418	range_values: &wcwidth_widths[`0`],
3419	num_ranges,
3420	default_value: `1`);
3421	}
3422
3423	#include "combining-chars.inc"
3424
3425	bool
3426	cpp_is_combining_char (cppchar_t c)
3427	{
3428	const size_t num_ranges
3429	= sizeof combining_range_ends / sizeof (*combining_range_ends);
3430	return get_cppchar_property<bool> (c,
3431	range_ends: &combining_range_ends[`0`],
3432	range_values: &is_combining[`0`],
3433	num_ranges,
3434	default_value: false);
3435	}
3436
3437	#include "printable-chars.inc"
3438
3439	bool
3440	cpp_is_printable_char (cppchar_t c)
3441	{
3442	const size_t num_ranges
3443	= sizeof printable_range_ends / sizeof (*printable_range_ends);
3444	return get_cppchar_property<bool> (c,
3445	range_ends: &printable_range_ends[`0`],
3446	range_values: &is_printable[`0`],
3447	num_ranges,
3448	default_value: false);
3449	}
3450

Provided by KDAB

Definitions

_cpp_strbuf
one_utf8_to_cppchar
one_cppchar_to_utf8
one_utf8_to_utf32
one_utf32_to_utf8
one_utf8_to_utf16
one_utf16_to_utf8
one_count_chars
conversion_loop
convert_utf8_utf16
convert_utf8_utf32
convert_utf16_utf8
convert_utf32_utf8
convert_count_chars
convert_no_conversion
cpp_conversion
conversion_tab
init_iconv_desc
cpp_init_iconv
_cpp_destroy_iconv
cpp_host_to_exec_charset
cpp_substring_ranges
~cpp_substring_ranges
add_range
add_n_ranges
width_to_mask
ucnrange
hangul_syllables
hangul_count
uname2c_data
_cpp_uname2c
_cpp_uname2c_uax44_lm2
cpp_check_xid_property
ucn_valid_in_identifier
extend_char_range
_cpp_valid_ucn
convert_ucn
_cpp_valid_utf8
cpp_valid_utf8_p
emit_numeric_escape
convert_hex
convert_oct
convert_escape
converter_for_type
cpp_interpret_string_1
cpp_interpret_string
noop_diagnostic_cb
cpp_interpret_string_ranges
cpp_interpret_string_notranslate
count_source_chars
narrow_str_to_charconst
wide_str_to_charconst
cpp_interpret_charconst
_cpp_interpret_identifier
cpp_check_utf8_bom
_cpp_convert_input
_cpp_default_encoding
cpp_input_conversion_is_trivial
cpp_string_location_reader
get_next
cpp_display_width_computation
process_next_codepoint
advance_display_cols
cpp_byte_column_to_display_column
cpp_display_column_to_byte_column
get_cppchar_property
cpp_wcwidth
cpp_is_combining_char

Learn to use CMake with our Intro Training

Find out more

Definitions

source code of libcpp/charset.cc