1/* CPP Library - charsets
2 Copyright (C) 1998-2026 Free Software Foundation, Inc.
3
4 Broken out of c-lex.cc Apr 2003, adding valid C99 UCN ranges.
5
6This program is free software; you can redistribute it and/or modify it
7under the terms of the GNU General Public License as published by the
8Free Software Foundation; either version 3, or (at your option) any
9later version.
10
11This program is distributed in the hope that it will be useful,
12but WITHOUT ANY WARRANTY; without even the implied warranty of
13MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14GNU General Public License for more details.
15
16You should have received a copy of the GNU General Public License
17along with this program; see the file COPYING3. If not see
18<http://www.gnu.org/licenses/>. */
19
20#include "config.h"
21#include "system.h"
22#include "cpplib.h"
23#include "internal.h"
24
25/* Character set handling for C-family languages.
26
27 Terminological note: In what follows, "charset" or "character set"
28 will be taken to mean both an abstract set of characters and an
29 encoding for that set.
30
31 The C99 standard discusses two character sets: source and execution.
32 The source character set is used for internal processing in translation
33 phases 1 through 4; the execution character set is used thereafter.
34 Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
35 character encodings (see 3.7.2, 3.7.3 for the standardese meanings
36 of these terms). Furthermore, the "basic character set" (listed in
37 5.2.1p3) is to be encoded in each with values one byte wide, and is
38 to appear in the initial shift state.
39
40 It is not explicitly mentioned, but there is also a "wide execution
41 character set" used to encode wide character constants and wide
42 string literals; this is supposed to be the result of applying the
43 standard library function mbstowcs() to an equivalent narrow string
44 (6.4.5p5). However, the behavior of hexadecimal and octal
45 \-escapes is at odds with this; they are supposed to be translated
46 directly to wchar_t values (6.4.4.4p5,6).
47
48 The source character set is not necessarily the character set used
49 to encode physical source files on disk; translation phase 1 converts
50 from whatever that encoding is to the source character set.
51
52 The presence of universal character names in C99 (6.4.3 et seq.)
53 forces the source character set to be isomorphic to ISO 10646,
54 that is, Unicode. There is no such constraint on the execution
55 character set; note also that the conversion from source to
56 execution character set does not occur for identifiers (5.1.1.2p1#5).
57
58 For convenience of implementation, the source character set's
59 encoding of the basic character set should be identical to the
60 execution character set OF THE HOST SYSTEM's encoding of the basic
61 character set, and it should not be a state-dependent encoding.
62
63 cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
64 depending on whether the host is based on ASCII or EBCDIC (see
65 respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
66 Technical Report #16). With limited exceptions, it relies on the
67 system library's iconv() primitive to do charset conversion
68 (specified in SUSv2). */
69
#if !HAVE_ICONV
/* Make certain that the uses of iconv(), iconv_open(), iconv_close()
   below, which are guarded only by if statements with compile-time
   constant conditions, do not cause link errors.  The stand-ins act
   as calls that always fail: iconv_open and iconv set errno to EINVAL
   and yield the respective error value; iconv_close does nothing.  */
#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
#define iconv_close(x) (void)0
#define ICONV_CONST
#endif

/* SOURCE_CHARSET names the internal source character set for this
   host.  LAST_POSSIBLY_BASIC_SOURCE_CHAR is the upper bound that
   cpp_host_to_exec_charset applies to its argument before treating it
   as a single-byte character.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
#define SOURCE_CHARSET "UTF-8"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
#define SOURCE_CHARSET "UTF-EBCDIC"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
#error "Unrecognized basic host character set"
#endif

/* Hosts whose errno.h lacks EILSEQ report ill-formed input as EINVAL,
   so there the two conditions cannot be told apart.  */
#ifndef EILSEQ
#define EILSEQ EINVAL
#endif
93
94/* This structure is used for a resizable string buffer throughout. */
95/* Don't call it strbuf, as that conflicts with unistd.h on systems
96 such as DYNIX/ptx where unistd.h includes stropts.h. */
struct _cpp_strbuf
{
  /* Start of the buffer; the converters below grow it with XRESIZEVEC.  */
  uchar *text;
  /* Number of bytes currently allocated at TEXT.  */
  size_t asize;
  /* Number of bytes of TEXT already in use; converters append at
     TEXT + LEN and update LEN on success.  */
  size_t len;
};
103
/* This is enough to hold any string that fits on a single 80-column
   line, even if iconv quadruples its size (e.g. conversion from
   ASCII to UTF-32) rounded up to a power of two.

   NOTE(review): 80 * 4 is 320, which is more than 256, so the claim
   above does not hold literally.  The value is only the increment by
   which the converters enlarge the output buffer when they run out of
   room, so it affects the number of reallocations, not correctness.  */
#define OUTBUF_BLOCK_SIZE 256
108
109/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
110 logic. This is because a depressing number of systems lack iconv,
   or have iconv libraries that do not do these conversions, so
112 we need a fallback implementation for them. To ensure the fallback
113 doesn't break due to neglect, it is used on all systems.
114
115 UTF-32 encoding is nice and simple: a four-byte binary number,
116 constrained to the range 00000000-7FFFFFFF to avoid questions of
117 signedness. We do have to cope with big- and little-endian
118 variants.
119
120 UTF-16 encoding uses two-byte binary numbers, again in big- and
121 little-endian variants, for all values in the 00000000-0000FFFF
122 range. Values in the 00010000-0010FFFF range are encoded as pairs
123 of two-byte numbers, called "surrogate pairs": given a number S in
124 this range, it is mapped to a pair (H, L) as follows:
125
126 H = (S - 0x10000) / 0x400 + 0xD800
127 L = (S - 0x10000) % 0x400 + 0xDC00
128
129 Two-byte values in the D800...DFFF range are ill-formed except as a
130 component of a surrogate pair. Even if the encoding within a
131 two-byte value is little-endian, the H member of the surrogate pair
132 comes first.
133
134 There is no way to encode values in the 00110000-7FFFFFFF range,
135 which is not currently a problem as there are no assigned code
136 points in that range; however, the author expects that it will
137 eventually become necessary to abandon UTF-16 due to this
138 limitation. Note also that, because of these pairs, UTF-16 does
139 not meet the requirements of the C standard for a wide character
140 encoding (see 3.7.3 and 6.4.4.4p11).
141
142 UTF-8 encoding looks like this:
143
144 value range encoded as
145 00000000-0000007F 0xxxxxxx
146 00000080-000007FF 110xxxxx 10xxxxxx
147 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
148 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
149 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
150 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
151
152 Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
153 which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
154 never occur. Note also that any value that can be encoded by a
155 given row of the table can also be encoded by all successive rows,
156 but this is not done; only the shortest possible encoding for any
157 given value is valid. For instance, the character 07C0 could be
158 encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
159 FC 80 80 80 9F 80. Only the first is valid.
160
161 An implementation note: the transformation from UTF-16 to UTF-8, or
162 vice versa, is easiest done by using UTF-32 as an intermediary. */
163
164/* Internal primitives which go from an UTF-8 byte stream to native-endian
165 UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
166 operation in several places below. */
167static inline int
168one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
169 cppchar_t *cp)
170{
171 static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
172 static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
173
174 cppchar_t c;
175 const uchar *inbuf = *inbufp;
176 size_t nbytes, i;
177
178 if (*inbytesleftp < 1)
179 return EINVAL;
180
181 c = *inbuf;
182 if (c < 0x80)
183 {
184 *cp = c;
185 *inbytesleftp -= 1;
186 *inbufp += 1;
187 return 0;
188 }
189
190 /* The number of leading 1-bits in the first byte indicates how many
191 bytes follow. */
192 for (nbytes = 2; nbytes < 7; nbytes++)
193 if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
194 goto found;
195 return EILSEQ;
196 found:
197
198 if (*inbytesleftp < nbytes)
199 return EINVAL;
200
201 c = (c & masks[nbytes-1]);
202 inbuf++;
203 for (i = 1; i < nbytes; i++)
204 {
205 cppchar_t n = *inbuf++;
206 if ((n & 0xC0) != 0x80)
207 return EILSEQ;
208 c = ((c << 6) + (n & 0x3F));
209 }
210
211 /* Make sure the shortest possible encoding was used. */
212 if (c <= 0x7F && nbytes > 1) return EILSEQ;
213 if (c <= 0x7FF && nbytes > 2) return EILSEQ;
214 if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
215 if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
216 if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
217
218 /* Make sure the character is valid. */
219 if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
220
221 *cp = c;
222 *inbufp = inbuf;
223 *inbytesleftp -= nbytes;
224 return 0;
225}
226
227static inline int
228one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
229{
230 static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
231 static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
232 size_t nbytes;
233 uchar buf[6], *p = &buf[6];
234 uchar *outbuf = *outbufp;
235
236 nbytes = 1;
237 if (c < 0x80)
238 *--p = c;
239 else
240 {
241 do
242 {
243 *--p = ((c & 0x3F) | 0x80);
244 c >>= 6;
245 nbytes++;
246 }
247 while (c >= 0x3F || (c & limits[nbytes-1]));
248 *--p = (c | masks[nbytes-1]);
249 }
250
251 if (*outbytesleftp < nbytes)
252 return E2BIG;
253
254 while (p < &buf[6])
255 *outbuf++ = *p++;
256 *outbytesleftp -= nbytes;
257 *outbufp = outbuf;
258 return 0;
259}
260
261/* The following four functions transform one character between the two
262 encodings named in the function name. All have the signature
263 int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
264 uchar **outbufp, size_t *outbytesleftp)
265
266 BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
267 interpreted as a boolean indicating whether big-endian or
268 little-endian encoding is to be used for the member of the pair
269 that is not UTF-8.
270
271 INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
272 do for iconv.
273
274 The return value is either 0 for success, or an errno value for
275 failure, which may be E2BIG (need more space), EILSEQ (ill-formed
   input sequence), or EINVAL (incomplete input sequence).  */
277
278static inline int
279one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
280 uchar **outbufp, size_t *outbytesleftp)
281{
282 uchar *outbuf;
283 cppchar_t s = 0;
284 int rval;
285
286 /* Check for space first, since we know exactly how much we need. */
287 if (*outbytesleftp < 4)
288 return E2BIG;
289
290 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, cp: &s);
291 if (rval)
292 return rval;
293
294 outbuf = *outbufp;
295 outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
296 outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
297 outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
298 outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
299
300 *outbufp += 4;
301 *outbytesleftp -= 4;
302 return 0;
303}
304
305static inline int
306one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
307 uchar **outbufp, size_t *outbytesleftp)
308{
309 cppchar_t s;
310 int rval;
311 const uchar *inbuf;
312
313 if (*inbytesleftp < 4)
314 return EINVAL;
315
316 inbuf = *inbufp;
317
318 s = inbuf[bigend ? 0 : 3] << 24;
319 s += inbuf[bigend ? 1 : 2] << 16;
320 s += inbuf[bigend ? 2 : 1] << 8;
321 s += inbuf[bigend ? 3 : 0];
322
323 if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
324 return EILSEQ;
325
326 rval = one_cppchar_to_utf8 (c: s, outbufp, outbytesleftp);
327 if (rval)
328 return rval;
329
330 *inbufp += 4;
331 *inbytesleftp -= 4;
332 return 0;
333}
334
335static inline int
336one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
337 uchar **outbufp, size_t *outbytesleftp)
338{
339 int rval;
340 cppchar_t s = 0;
341 const uchar *save_inbuf = *inbufp;
342 size_t save_inbytesleft = *inbytesleftp;
343 uchar *outbuf = *outbufp;
344
345 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, cp: &s);
346 if (rval)
347 return rval;
348
349 if (s > 0x0010FFFF)
350 {
351 *inbufp = save_inbuf;
352 *inbytesleftp = save_inbytesleft;
353 return EILSEQ;
354 }
355
356 if (s <= 0xFFFF)
357 {
358 if (*outbytesleftp < 2)
359 {
360 *inbufp = save_inbuf;
361 *inbytesleftp = save_inbytesleft;
362 return E2BIG;
363 }
364 outbuf[bigend ? 1 : 0] = (s & 0x00FF);
365 outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
366
367 *outbufp += 2;
368 *outbytesleftp -= 2;
369 return 0;
370 }
371 else
372 {
373 cppchar_t hi, lo;
374
375 if (*outbytesleftp < 4)
376 {
377 *inbufp = save_inbuf;
378 *inbytesleftp = save_inbytesleft;
379 return E2BIG;
380 }
381
382 hi = (s - 0x10000) / 0x400 + 0xD800;
383 lo = (s - 0x10000) % 0x400 + 0xDC00;
384
385 /* Even if we are little-endian, put the high surrogate first.
386 ??? Matches practice? */
387 outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
388 outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
389 outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
390 outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
391
392 *outbufp += 4;
393 *outbytesleftp -= 4;
394 return 0;
395 }
396}
397
398static inline int
399one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
400 uchar **outbufp, size_t *outbytesleftp)
401{
402 cppchar_t s;
403 const uchar *inbuf = *inbufp;
404 int rval;
405
406 if (*inbytesleftp < 2)
407 return EINVAL;
408 s = inbuf[bigend ? 0 : 1] << 8;
409 s += inbuf[bigend ? 1 : 0];
410
411 /* Low surrogate without immediately preceding high surrogate is invalid. */
412 if (s >= 0xDC00 && s <= 0xDFFF)
413 return EILSEQ;
414 /* High surrogate must have a following low surrogate. */
415 else if (s >= 0xD800 && s <= 0xDBFF)
416 {
417 cppchar_t hi = s, lo;
418 if (*inbytesleftp < 4)
419 return EINVAL;
420
421 lo = inbuf[bigend ? 2 : 3] << 8;
422 lo += inbuf[bigend ? 3 : 2];
423
424 if (lo < 0xDC00 || lo > 0xDFFF)
425 return EILSEQ;
426
427 s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
428 }
429
430 rval = one_cppchar_to_utf8 (c: s, outbufp, outbytesleftp);
431 if (rval)
432 return rval;
433
434 /* Success - update the input pointers (one_cppchar_to_utf8 has done
435 the output pointers for us). */
436 if (s <= 0xFFFF)
437 {
438 *inbufp += 2;
439 *inbytesleftp -= 2;
440 }
441 else
442 {
443 *inbufp += 4;
444 *inbytesleftp -= 4;
445 }
446 return 0;
447}
448
449
450/* Special routine which just counts number of characters in the
451 string, what exactly is stored into the output doesn't matter
452 as long as it is one uchar per character. */
453
454static inline int
455one_count_chars (iconv_t, const uchar **inbufp, size_t *inbytesleftp,
456 uchar **outbufp, size_t *outbytesleftp)
457{
458 cppchar_t s = 0;
459 int rval;
460
461 /* Check for space first, since we know exactly how much we need. */
462 if (*outbytesleftp < 1)
463 return E2BIG;
464
465#if HOST_CHARSET == HOST_CHARSET_ASCII
466 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, cp: &s);
467 if (rval)
468 return rval;
469#else
470 if (*inbytesleftp < 1)
471 return EINVAL;
472 static const uchar utf_ebcdic_map[256] = {
473 /* See table 4 in http://unicode.org/reports/tr16/tr16-7.2.html */
474 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
475 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
476 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
477 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
478 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1,
479 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1, 1,
480 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 1, 1, 1,
481 9, 9, 9, 9, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
482 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
483 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
484 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2,
485 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 3, 3,
486 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3,
487 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 4, 4, 4, 4,
488 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 5, 5, 5,
489 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 6, 6, 7, 7, 0
490 };
491 rval = utf_ebcdic_map[**inbufp];
492 if (rval == 9)
493 return EILSEQ;
494 if (rval == 0)
495 rval = 1;
496 if (rval >= 2)
497 {
498 if (*inbytesleftp < rval)
499 return EINVAL;
500 for (int i = 1; i < rval; ++i)
501 if (utf_ebcdic_map[(*inbufp)[i]] != 9)
502 return EILSEQ;
503 }
504 *inbytesleftp -= rval;
505 *inbufp += rval;
506#endif
507
508 **outbufp = ' ';
509
510 *outbufp += 1;
511 *outbytesleftp -= 1;
512 return 0;
513}
514
515
516/* Helper routine for the next few functions. The 'const' on
517 one_conversion means that we promise not to modify what function is
518 pointed to, which lets the inliner see through it. */
519
520static inline bool
521conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
522 uchar **, size_t *),
523 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
524{
525 const uchar *inbuf;
526 uchar *outbuf;
527 size_t inbytesleft, outbytesleft;
528 int rval;
529
530 inbuf = from;
531 inbytesleft = flen;
532 outbuf = to->text + to->len;
533 outbytesleft = to->asize - to->len;
534
535 for (;;)
536 {
537 do
538 rval = one_conversion (cd, &inbuf, &inbytesleft,
539 &outbuf, &outbytesleft);
540 while (inbytesleft && !rval);
541
542 if (__builtin_expect (inbytesleft == 0, 1))
543 {
544 to->len = to->asize - outbytesleft;
545 return true;
546 }
547 if (rval != E2BIG)
548 {
549 errno = rval;
550 return false;
551 }
552
553 outbytesleft += OUTBUF_BLOCK_SIZE;
554 to->asize += OUTBUF_BLOCK_SIZE;
555 to->text = XRESIZEVEC (uchar, to->text, to->asize);
556 outbuf = to->text + to->asize - outbytesleft;
557 }
558}
559
560
561/* These functions convert entire strings between character sets.
562 They all have the signature
563
564 bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
565
566 The input string FROM is converted as specified by the function
567 name plus the iconv descriptor CD (which may be fake), and the
568 result appended to TO. On any error, false is returned, otherwise true. */
569
570/* These four use the custom conversion code above. */
571static bool
572convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
573 struct _cpp_strbuf *to)
574{
575 return conversion_loop (one_conversion: one_utf8_to_utf16, cd, from, flen, to);
576}
577
578static bool
579convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
580 struct _cpp_strbuf *to)
581{
582 return conversion_loop (one_conversion: one_utf8_to_utf32, cd, from, flen, to);
583}
584
585static bool
586convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
587 struct _cpp_strbuf *to)
588{
589 return conversion_loop (one_conversion: one_utf16_to_utf8, cd, from, flen, to);
590}
591
592static bool
593convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
594 struct _cpp_strbuf *to)
595{
596 return conversion_loop (one_conversion: one_utf32_to_utf8, cd, from, flen, to);
597}
598
599/* Magic conversion which just counts characters from input, so
600 only to->len is significant. */
601static bool
602convert_count_chars (iconv_t cd, const uchar *from,
603 size_t flen, struct _cpp_strbuf *to)
604{
605 return conversion_loop (one_conversion: one_count_chars, cd, from, flen, to);
606}
607
608/* Identity conversion, used when we have no alternative. */
609static bool
610convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
611 const uchar *from, size_t flen, struct _cpp_strbuf *to)
612{
613 if (to->len + flen > to->asize)
614 {
615 to->asize = to->len + flen;
616 to->asize += to->asize / 4;
617 to->text = XRESIZEVEC (uchar, to->text, to->asize);
618 }
619 memcpy (dest: to->text + to->len, src: from, n: flen);
620 to->len += flen;
621 return true;
622}
623
624/* And this one uses the system iconv primitive. It's a little
625 different, since iconv's interface is a little different. */
#if HAVE_ICONV

/* Enlarge TO by one block and re-derive OUTBUF from the new base
   address.  Relies on the local names of convert_using_iconv.  */
#define CONVERT_ICONV_GROW_BUFFER \
  do { \
      outbytesleft += OUTBUF_BLOCK_SIZE; \
      to->asize += OUTBUF_BLOCK_SIZE; \
      to->text = XRESIZEVEC (uchar, to->text, to->asize); \
      outbuf = (char *)to->text + to->asize - outbytesleft; \
  } while (0)

/* Convert the FLEN bytes at FROM with the iconv descriptor CD and
   append the result to TO, growing TO as needed.  Returns true on
   success with TO->len updated.  Returns false if CD cannot be reset,
   if iconv fails for any reason other than lack of output space, or
   if the final shift sequence cannot be written.  */
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
                     struct _cpp_strbuf *to)
{
  ICONV_CONST char *src;
  size_t src_left;
  char *outbuf;
  size_t outbytesleft;

  /* Put CD back in its initial state; this also shows it is usable.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
    return false;

  src = (ICONV_CONST char *)from;
  src_left = flen;
  outbuf = (char *)to->text + to->len;
  outbytesleft = to->asize - to->len;

  for (;;)
    {
      iconv (cd, &src, &src_left, &outbuf, &outbytesleft);

      if (__builtin_expect (src_left != 0, 0))
        {
          /* Input remains: only a full output buffer is recoverable.  */
          if (errno != E2BIG)
            return false;

          CONVERT_ICONV_GROW_BUFFER;
          continue;
        }

      /* All input consumed.  Emit whatever is needed to return to the
         initial shift state, retrying once with a larger buffer.  */
      if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
        {
          if (errno != E2BIG)
            return false;

          CONVERT_ICONV_GROW_BUFFER;
          if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
            return false;
        }

      to->len = to->asize - outbytesleft;
      return true;
    }
}
#else
#define convert_using_iconv 0 /* prevent undefined symbol error below */
#endif
681
682/* Arrange for the above custom conversion logic to be used automatically
683 when conversion between a suitable pair of character sets is requested. */
684
/* Run CONVERTER's conversion function with CONVERTER's own descriptor
   on the FLEN bytes at FROM, appending to the _cpp_strbuf TO.  Yields
   the function's bool result.  CONVERTER is evaluated twice.  */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
687
/* One built-in conversion that is handled without calling iconv.  */
struct cpp_conversion
{
  /* "FROM/TO" charset names; init_iconv_desc compares this string
     case-insensitively against the requested pair.  */
  const char *pair;
  /* Whole-string converter to use for this pair.  */
  convert_f func;
  /* Value handed to FUNC in place of a real descriptor; the built-in
     converters read it as a big-endian flag.  */
  iconv_t fake_cd;
};
/* Conversions between UTF-8 and UTF-16/UTF-32 that init_iconv_desc
   prefers over iconv.  A fake descriptor of 1 selects the big-endian
   variant and 0 the little-endian one.  */
static const struct cpp_conversion conversion_tab[] = {
  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
};
704
705/* Subroutine of cpp_init_iconv: initialize and return a
706 cset_converter structure for conversion from FROM to TO. If
707 iconv_open() fails, issue an error and return an identity
708 converter. Silently return an identity converter if FROM and TO
709 are identical.
710
711 PFILE is only used for generating diagnostics; setting it to NULL
712 suppresses diagnostics. */
713
714static struct cset_converter
715init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
716{
717 struct cset_converter ret;
718 char *pair;
719 size_t i;
720
721 ret.to = to;
722 ret.from = from;
723
724 if (!strcasecmp (s1: to, s2: from))
725 {
726 ret.func = convert_no_conversion;
727 ret.cd = (iconv_t) -1;
728 ret.width = -1;
729 return ret;
730 }
731
732 pair = (char *) alloca(strlen(to) + strlen(from) + 2);
733
734 strcpy(dest: pair, src: from);
735 strcat(dest: pair, src: "/");
736 strcat(dest: pair, src: to);
737 for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
738 if (!strcasecmp (s1: pair, s2: conversion_tab[i].pair))
739 {
740 ret.func = conversion_tab[i].func;
741 ret.cd = conversion_tab[i].fake_cd;
742 ret.width = -1;
743 return ret;
744 }
745
746 /* No custom converter - try iconv. */
747 if (HAVE_ICONV)
748 {
749 ret.func = convert_using_iconv;
750 ret.cd = iconv_open (to, from);
751 ret.width = -1;
752
753 if (ret.cd == (iconv_t) -1)
754 {
755 if (pfile)
756 {
757 if (errno == EINVAL)
758 cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
759 msgid: "conversion from %s to %s not supported by iconv",
760 from, to);
761 else
762 cpp_errno (pfile, CPP_DL_ERROR, msgid: "iconv_open");
763 }
764 ret.func = convert_no_conversion;
765 }
766 }
767 else
768 {
769 if (pfile)
770 {
771 cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
772 msgid: "no iconv implementation, cannot convert from %s to %s",
773 from, to);
774 }
775 ret.func = convert_no_conversion;
776 ret.cd = (iconv_t) -1;
777 ret.width = -1;
778 }
779
780 return ret;
781}
782
783/* If charset conversion is requested, initialize iconv(3) descriptors
784 for conversion from the source character set to the execution
785 character sets. If iconv is not present in the C library, and
786 conversion is requested, issue an error. */
787
788void
789cpp_init_iconv (cpp_reader *pfile)
790{
791 const char *ncset = CPP_OPTION (pfile, narrow_charset);
792 const char *wcset = CPP_OPTION (pfile, wide_charset);
793 const char *default_wcset;
794
795 bool be = CPP_OPTION (pfile, bytes_big_endian);
796
797 if (CPP_OPTION (pfile, wchar_precision) >= 32)
798 default_wcset = be ? "UTF-32BE" : "UTF-32LE";
799 else if (CPP_OPTION (pfile, wchar_precision) >= 16)
800 default_wcset = be ? "UTF-16BE" : "UTF-16LE";
801 else
802 /* This effectively means that wide strings are not supported,
803 so don't do any conversion at all. */
804 default_wcset = SOURCE_CHARSET;
805
806 if (!ncset)
807 ncset = SOURCE_CHARSET;
808 if (!wcset)
809 wcset = default_wcset;
810
811 pfile->narrow_cset_desc = init_iconv_desc (pfile, to: ncset, SOURCE_CHARSET);
812 pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
813 pfile->utf8_cset_desc = init_iconv_desc (pfile, to: "UTF-8", SOURCE_CHARSET);
814 pfile->utf8_cset_desc.width = CPP_OPTION (pfile, char_precision);
815 pfile->char16_cset_desc = init_iconv_desc (pfile,
816 to: be ? "UTF-16BE" : "UTF-16LE",
817 SOURCE_CHARSET);
818 pfile->char16_cset_desc.width = 16;
819 pfile->char32_cset_desc = init_iconv_desc (pfile,
820 to: be ? "UTF-32BE" : "UTF-32LE",
821 SOURCE_CHARSET);
822 pfile->char32_cset_desc.width = 32;
823 pfile->wide_cset_desc = init_iconv_desc (pfile, to: wcset, SOURCE_CHARSET);
824 pfile->wide_cset_desc.width = CPP_OPTION (pfile, wchar_precision);
825}
826
827/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. */
828void
829_cpp_destroy_iconv (cpp_reader *pfile)
830{
831 if (HAVE_ICONV)
832 {
833 if (pfile->narrow_cset_desc.func == convert_using_iconv)
834 iconv_close (pfile->narrow_cset_desc.cd);
835 if (pfile->utf8_cset_desc.func == convert_using_iconv)
836 iconv_close (pfile->utf8_cset_desc.cd);
837 if (pfile->char16_cset_desc.func == convert_using_iconv)
838 iconv_close (pfile->char16_cset_desc.cd);
839 if (pfile->char32_cset_desc.func == convert_using_iconv)
840 iconv_close (pfile->char32_cset_desc.cd);
841 if (pfile->wide_cset_desc.func == convert_using_iconv)
842 iconv_close (pfile->wide_cset_desc.cd);
843 if (pfile->reverse_narrow_cset_desc.func == convert_using_iconv)
844 iconv_close (pfile->narrow_cset_desc.cd);
845 if (pfile->reverse_utf8_cset_desc.func == convert_using_iconv)
846 iconv_close (pfile->utf8_cset_desc.cd);
847 }
848}
849
850/* Utility routine for use by a full compiler. C is a character taken
851 from the *basic* source character set, encoded in the host's
852 execution encoding. Convert it to (the target's) execution
853 encoding, and return that value.
854
855 Issues an internal error if C's representation in the narrow
856 execution character set fails to be a single-byte value (C99
857 5.2.1p3: "The representation of each member of the source and
858 execution character sets shall fit in a byte.") May also issue an
859 internal error if C fails to be a member of the basic source
860 character set (testing this exactly is too hard, especially when
861 the host character set is EBCDIC). */
862cppchar_t
863cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
864{
865 uchar sbuf[1];
866 struct _cpp_strbuf tbuf;
867
868 /* This test is merely an approximation, but it suffices to catch
869 the most important thing, which is that we don't get handed a
870 character outside the unibyte range of the host character set. */
871 if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
872 {
873 cpp_error (pfile, CPP_DL_ICE,
874 msgid: "character 0x%lx is not in the basic source character set",
875 (unsigned long) c);
876 return 0;
877 }
878
879 /* Being a character in the unibyte range of the host character set,
880 we can safely splat it into a one-byte buffer and trust that that
881 is a well-formed string. */
882 sbuf[0] = c;
883
884 /* This should never need to reallocate, but just in case... */
885 tbuf.asize = 1;
886 tbuf.text = XNEWVEC (uchar, tbuf.asize);
887 tbuf.len = 0;
888
889 if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
890 {
891 cpp_errno (pfile, CPP_DL_ICE, msgid: "converting to execution character set");
892 return 0;
893 }
894 if (tbuf.len != 1)
895 {
896 cpp_error (pfile, CPP_DL_ICE,
897 msgid: "character 0x%lx is not unibyte in execution character set",
898 (unsigned long)c);
899 return 0;
900 }
901 c = tbuf.text[0];
902 free(ptr: tbuf.text);
903 return c;
904}
905
906
907
908/* cpp_substring_ranges's constructor. */
909
910cpp_substring_ranges::cpp_substring_ranges () :
911 m_ranges (NULL),
912 m_num_ranges (0),
913 m_alloc_ranges (8)
914{
915 m_ranges = XNEWVEC (source_range, m_alloc_ranges);
916}
917
918/* cpp_substring_ranges's destructor. */
919
920cpp_substring_ranges::~cpp_substring_ranges ()
921{
922 free (ptr: m_ranges);
923}
924
925/* Add RANGE to the vector of source_range information. */
926
927void
928cpp_substring_ranges::add_range (source_range range)
929{
930 if (m_num_ranges >= m_alloc_ranges)
931 {
932 m_alloc_ranges *= 2;
933 m_ranges
934 = (source_range *)xrealloc (m_ranges,
935 sizeof (source_range) * m_alloc_ranges);
936 }
937 m_ranges[m_num_ranges++] = range;
938}
939
940/* Read NUM ranges from LOC_READER, adding them to the vector of source_range
941 information. */
942
943void
944cpp_substring_ranges::add_n_ranges (int num,
945 cpp_string_location_reader &loc_reader)
946{
947 for (int i = 0; i < num; i++)
948 add_range (range: loc_reader.get_next ());
949}
950
951
952
953/* Utility routine that computes a mask of the form 0000...111... with
954 WIDTH 1-bits. */
955static inline size_t
956width_to_mask (size_t width)
957{
958 width = MIN (width, BITS_PER_CPPCHAR_T);
959 if (width >= CHAR_BIT * sizeof (size_t))
960 return ~(size_t) 0;
961 else
962 return ((size_t) 1 << width) - 1;
963}
964
965/* A large table of unicode character information. */
/* Flag bits stored in the flags field of struct ucnrange.  Each value
   is a distinct power of two so that they can be combined.  */
enum {
  /* Valid in a C99 identifier?  */
  C99 = 1,
  /* Valid in a C99 identifier, but not as the first character?  */
  N99 = 2,
  /* Valid in a C++ identifier?  */
  CXX = 4,
  /* Valid in a C11/C++11 identifier?  */
  C11 = 8,
  /* Valid in a C11/C++11 identifier, but not as the first character?  */
  N11 = 16,
  /* Valid in a C++23 identifier?  */
  CXX23 = 32,
  /* Valid in a C++23 identifier, but not as the first character?  */
  NXX23 = 64,
  /* NFC representation is not valid in an identifier?  */
  CID = 128,
  /* Might be valid NFC form?  */
  NFC = 256,
  /* Might be valid NFKC form?  */
  NKC = 512,
  /* Certain preceding characters might make it not valid NFC/NFKC form?  */
  CTX = 1024
};
990
/* Properties shared by a run of consecutive code points.  Only the
   last code point of the run is stored.
   NOTE(review): presumably the table from ucnid.h is sorted by END, so
   each run starts just after the previous entry's END - confirm
   against the generated header.  */
struct ucnrange {
  /* Bitmap of flags above.  */
  unsigned short flags;
  /* Combining class of the character.  */
  unsigned char combine;
  /* Last character in the range described by this entry.  */
  unsigned int end;
};
999#include "ucnid.h"
1000
/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.
   UCS_LIMIT is therefore the largest valid value itself, not a count.  */
#define UCS_LIMIT 0x10FFFF
1003
1004#include "uname2c.h"
1005
/* Short names of the components of Hangul syllables, stored as three
   consecutive groups of 19 (L), 21 (V) and 28 (T) entries.  Each entry
   is at most three letters long, so four chars hold it together with
   its terminating NUL.  The empty strings are entries with no name.  */
static const char hangul_syllables[][4] = {
  /* L */
  "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S", "SS", "",
  "J", "JJ", "C", "K", "T", "P", "H",
  /* V */
  "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", "WA", "WAE",
  "OE", "YO", "U", "WEO", "WE", "WI", "YU", "EU", "YI", "I",
  /* T */
  "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", "LB",
  "LS", "LT", "LP", "LH", "M", "B", "BS", "S", "SS", "NG", "J", "C",
  "K", "T", "P", "H"
};
1018
1019static const short hangul_count[6] = { 19, 21, 28 };
1020
1021 /* Used for Unicode loose matching rule UAX44-LM2 matching.  Passed to
   _cpp_uname2c only for loose lookups; a NULL pointer there selects the
   strict lookup.  */
1022
1023 struct uname2c_data
1024 {
/* Position in the caller's buffer at which the canonical spelling of
   the part of the name matched from here on is written.  */
1025 char *canon_name;
/* Last character of the key matched at the parent node; consulted to
   decide whether a hyphen that starts the current key is medial.  */
1026 char prev_char;
1027 };
1028
1029 /* Map NAME, a Unicode character name or correction/control/alternate
   alias, to a Unicode codepoint, or return (cppchar_t) -1 if
   not found. This uses a space optimized radix tree precomputed
   by the makeuname2c utility, with binary format documented in its
   source makeuname2c.cc.

   NAME points at LEN characters that still have to be matched (no
   terminating NUL is required).  N points at the first of a list of
   sibling nodes in the tree.  DATA is NULL for an exact lookup; when it
   is non-NULL the keys are compared loosely (spaces and medial hyphens
   in the keys are skipped) and on success the canonical name is written
   through DATA->canon_name.  */
1034
1035 static cppchar_t
1036 _cpp_uname2c (const char *name, size_t len, const unsigned char *n,
1037 struct uname2c_data *data)
1038 {
1039 do
1040 {
1041 char k;
1042 const char *key;
1043 size_t key_len, len_adj;
1044 bool has_value = *n & 0x40;
1045 bool has_children, no_sibling = false;
1046 cppchar_t codepoint = -1;
1047 const unsigned char *child = NULL;
1048 int ret;
1049
/* Decode the key of this node.  With bit 0x80 set the key is a single
   character held in the low 6 bits as an offset from ' '; otherwise
   the low 6 bits are the key length and the next two bytes are a
   little-endian offset into uname2c_dict.  */
1050 if (*n & 0x80)
1051 {
1052 k = ' ' + (*n++ & 0x3f);
1053 key = &k;
1054 key_len = 1;
1055 }
1056 else
1057 {
1058 key_len = *n++ & 0x3f;
1059 key = &uname2c_dict[*n++];
1060 key += (*n++ << 8);
1061 }
/* A value occupies three bytes: 21 bits of codepoint, little-endian,
   with bits 0x80 and 0x40 of the third byte flagging "has children"
   and "no sibling follows".  Nodes without a value always have
   children.  */
1062 if (has_value)
1063 {
1064 codepoint = *n + (n[1] << 8) + ((n[2] & 0x1f) << 16);
1065 has_children = n[2] & 0x80;
1066 no_sibling = n[2] & 0x40;
1067 n += 3;
1068 }
1069 else
1070 has_children = true;
1071 if (has_children)
1072 {
1073 unsigned int shift = 0;
1074 size_t child_off = 0;
1075
/* The child offset is stored 7 bits per byte, least significant group
   first, with bit 0x80 marking that another byte follows.  It is
   relative to the byte after the encoded offset.  */
1076 do
1077 {
1078 child_off |= (*n & 0x7f) << shift;
1079 shift += 7;
1080 }
1081 while ((*n++ & 0x80) != 0);
1082 child = n + child_off;
1083 }
/* Exact lookup: plain byte comparison against as much of the key as
   NAME still has characters for.  */
1084 if (__builtin_expect (data == NULL, 1))
1085 {
1086 ret = memcmp (s1: name, s2: key, n: len > key_len ? key_len : len);
1087 len_adj = key_len;
1088 }
1089 else
1090 {
/* Loose lookup: walk NAME (P) and the key (Q) in parallel, skipping
   the key characters that UAX44-LM2 ignores.  */
1091 const char *p = name, *q = key;
1092
1093 while (1)
1094 {
1095 if ((size_t) (p - name) == len || (size_t) (q - key) == key_len)
1096 break;
1097 if (*q == ' ')
1098 {
1099 ++q;
1100 continue;
1101 }
1102 if (*q == '-')
1103 {
1104 /* This is the hard case. Only medial hyphens
   should be removed, where medial means preceded
   and followed by alnum. */
1107 if (ISALNUM (q == key ? data->prev_char : q[-1]))
1108 {
1109 if (q + 1 == key + key_len)
1110 {
1111 /* We don't know what the next letter will be.
   It could be ISALNUM, then we are supposed
   to omit it, or it could be a space and then
   we should not omit it and need to compare it.
   Fortunately the only 3 names with hyphen
   followed by non-letter are
   U+0F0A TIBETAN MARK BKA- SHOG YIG MGO
   U+0FD0 TIBETAN MARK BKA- SHOG GI MGO RGYAN
   U+0FD0 TIBETAN MARK BSKA- SHOG GI MGO RGYAN
   Furthermore, prefixes of NR2 generated
   ranges all end with a hyphen, but the generated
   part is then followed by alpha-numeric.
   So, let's just assume that - at the end of
   key is always followed by alphanumeric and
   so should be omitted.
   makeuname2c.cc verifies that this is true. */
1127 ++q;
1128 continue;
1129 }
1130 else if (ISALNUM (q[1]))
1131 {
1132 ++q;
1133 continue;
1134 }
1135 }
1136 }
1137 if (*p != *q)
1138 break;
1139 ++p;
1140 ++q;
1141 }
1142 len_adj = p - name;
1143 /* If we don't consume the whole key, signal a mismatch,
   but always with ret = 1, so that we keep looking through
   siblings. */
1146 ret = q < key + key_len;
1147 }
/* NAME compares below this key, so the search gives up here instead of
   trying further siblings (presumably siblings are stored in
   increasing key order -- confirm against makeuname2c.cc).  */
1148 if (ret < 0)
1149 return -1;
1150 else if (ret == 0)
1151 {
1152 if (len < len_adj)
1153 return -1;
/* Values starting at 0xd800 are not real codepoints here; they select
   an entry of uname2c_generated, i.e. a family of algorithmically
   generated names.  */
1154 else if (codepoint >= 0xd800
1155 && codepoint < 0xd800 + ARRAY_SIZE (uname2c_generated))
1156 {
1157 name += len_adj;
1158 len -= len_adj;
1159 if (codepoint == 0xd800)
1160 {
1161 /* NR1 - Hangul syllables. */
1162 size_t start = 0, end, i, j;
1163 int this_len, max_len;
1164 char winner[3];
1165
/* For each of the L, V and T groups in turn pick the longest jamo name
   that is a prefix of what remains of NAME.  */
1166 for (i = 0; i < 3; ++i)
1167 {
1168 end = start + hangul_count[i];
1169 max_len = -1;
1170 winner[i] = -1;
1171 for (j = start; j < end; j++)
1172 {
1173 this_len = strlen (s: hangul_syllables[j]);
1174 if (len >= (size_t) this_len
1175 && this_len > max_len
1176 && memcmp (s1: name, s2: hangul_syllables[j],
1177 n: this_len) == 0)
1178 {
1179 max_len = this_len;
1180 winner[i] = j - start;
1181 }
1182 }
1183 if (max_len == -1)
1184 return -1;
1185 name += max_len;
1186 len -= max_len;
1187 start = end;
1188 }
1189 if (__builtin_expect (data != NULL, 0))
1190 {
1191 memcpy (dest: data->canon_name, src: key, n: key_len);
1192 data->canon_name[key_len] = '\0';
1193 for (i = 0, start = 0; i < 3; ++i)
1194 {
1195 strcat (dest: data->canon_name,
1196 src: hangul_syllables[start + winner[i]]);
1197 start += hangul_count[i];
1198 }
1199 }
/* Compose the syllable from the three group indices (21 vowels and 28
   trailing consonants per leading consonant).  */
1200 return (0xac00 + 21 * 28 * winner[0]
1201 + 28 * winner[1] + winner[2]);
1202 }
1203 else
1204 {
1205 /* NR2 - prefix followed by hexadecimal codepoint. */
1206 const cppchar_t *p;
1207 size_t i;
1208
1209 if (len < 4 || len > 5)
1210 return -1;
/* P walks a zero-terminated list of (first, last) codepoint pairs that
   are valid for this prefix.  */
1211 p = uname2c_pairs + uname2c_generated[codepoint - 0xd800];
1212 codepoint = 0;
1213 for (i = 0; i < len; ++i)
1214 {
1215 codepoint <<= 4;
1216 if (!ISXDIGIT (name[i]))
1217 return -1;
1218 codepoint += hex_value (name[i]);
1219 }
1220 for (; *p; p += 2)
1221 if (codepoint < *p)
1222 return -1;
1223 else if (codepoint <= p[1])
1224 {
1225 if (__builtin_expect (data != NULL, 0))
1226 {
1227 memcpy (dest: data->canon_name, src: key, n: key_len);
1228 memcpy (dest: data->canon_name + key_len, src: name, n: len);
1229 data->canon_name[key_len + len] = '\0';
1230 }
1231 return codepoint;
1232 }
1233 return -1;
1234 }
1235 }
/* Loose lookup: a later sibling may still match after this subtree
   fails, so recurse into the children and restore DATA on failure
   instead of iterating in place.  */
1236 else if (__builtin_expect (data != NULL, 0))
1237 {
1238 if (len == len_adj)
1239 {
1240 memcpy (dest: data->canon_name, src: key, n: key_len);
1241 data->canon_name[key_len] = '\0';
1242 return codepoint;
1243 }
1244 if (has_children)
1245 {
1246 struct uname2c_data save = *data;
1247 memcpy (dest: data->canon_name, src: key, n: key_len);
1248 data->canon_name += key_len;
1249 data->prev_char = key[key_len - 1];
1250 codepoint = _cpp_uname2c (name: name + len_adj, len: len - len_adj,
1251 n: child, data);
1252 if (codepoint != (cppchar_t) -1)
1253 return codepoint;
1254 *data = save;
1255 }
1256 }
/* Exact lookup: either NAME is used up (the answer is this node's
   value, which is -1 if it has none) or the walk descends into the
   children without recursion.  */
1257 else if (len == len_adj)
1258 return codepoint;
1259 else if (!has_children)
1260 return -1;
1261 else
1262 {
1263 name += len_adj;
1264 len -= len_adj;
1265 n = child;
1266 continue;
1267 }
1268 }
/* N now points just past this node.  Stop when the node was flagged as
   the last sibling, or, for a node without a value, when the next byte
   is the 0xff marker.  */
1269 if (no_sibling || (!has_value && *n == 0xff))
1270 break;
1271 }
1272 while (1);
1273 return -1;
1274 }
1275
1276 /* Try to do a loose name lookup according to Unicode loose matching rule
   UAX44-LM2. First ignore medial hyphens, whitespace, underscore
   characters and convert to upper case.

   NAME points at LEN characters.  On success the codepoint is returned
   and the canonical spelling of the name is stored, NUL-terminated, in
   CANON_NAME; otherwise (cppchar_t) -1 is returned.  */
1279
1280 static cppchar_t
1281 _cpp_uname2c_uax44_lm2 (const char *name, size_t len, char *canon_name)
1282 {
1283 char name_after_uax44_lm2[uname2c_max_name_len];
1284 char *q = name_after_uax44_lm2;
1285 const char *p;
1286
/* Build the folded copy of NAME.  A folded name that would not fit in
   the buffer cannot match anything.
   NOTE(review): the medial-hyphen test reads p[1], which is NAME[LEN]
   when the hyphen is the last character; the caller visible in this
   file passes a name that is followed by '}' -- confirm for any other
   caller.  */
1287 for (p = name; p < name + len; p++)
1288 if (*p == '_' || *p == ' ')
1289 continue;
1290 else if (*p == '-' && p != name && ISALNUM (p[-1]) && ISALNUM (p[1]))
1291 continue;
1292 else if (q == name_after_uax44_lm2 + uname2c_max_name_len)
1293 return -1;
1294 else if (ISLOWER (*p))
1295 *q++ = TOUPPER (*p);
1296 else
1297 *q++ = *p;
1298
1299 struct uname2c_data data;
1300 data.canon_name = canon_name;
1301 data.prev_char = ' ';
1302 /* Hangul Jungseong O- E after UAX44-LM2 should be HANGULJUNGSEONGO-E
   and so should match U+1180. */
1304 if (q - name_after_uax44_lm2 == sizeof ("HANGULJUNGSEONGO-E") - 1
1305 && memcmp (s1: name_after_uax44_lm2, s2: "HANGULJUNGSEONGO-E",
1306 n: sizeof ("HANGULJUNGSEONGO-E") - 1) == 0)
1307 {
/* Overwrite the hyphen with the final 'E' and shorten the name by one,
   i.e. drop the hyphen from the folded name.  */
1308 name_after_uax44_lm2[sizeof ("HANGULJUNGSEONGO") - 1] = 'E';
1309 --q;
1310 }
1311 cppchar_t result
1312 = _cpp_uname2c (name: name_after_uax44_lm2, len: q - name_after_uax44_lm2,
1313 n: uname2c_tree, data: &data);
1314
1315 /* Unicode UAX44-LM2 exception:
   U+116C HANGUL JUNGSEONG OE
   U+1180 HANGUL JUNGSEONG O-E
   We remove all medial hyphens when we shouldn't remove the U+1180 one.
   The U+1180 entry sorts before U+116C lexicographically, so we get U+1180
   in both cases. Thus, if result is U+1180, check if user's name doesn't
   have a hyphen there and adjust. */
1322 if (result == 0x1180)
1323 {
/* P is at the end of NAME after the folding loop; step back over
   ignorable characters, the final E and more ignorable characters to
   reach the character that is either the hyphen or the O.  */
1324 while (p[-1] == ' ' || p[-1] == '_')
1325 --p;
1326 gcc_assert (TOUPPER (p[-1]) == 'E');
1327 --p;
1328 while (p[-1] == ' ' || p[-1] == '_')
1329 --p;
1330 if (p[-1] != '-')
1331 {
1332 result = 0x116c;
/* Copies "E" together with its terminating NUL, turning the canonical
   name into "HANGUL JUNGSEONG OE".  */
1333 memcpy (dest: canon_name + sizeof ("HANGUL JUNGSEONG O") - 1, src: "E", n: 2);
1334 }
1335 }
1336 return result;
1337 }
1338
1339/* Returns flags representing the XID properties of the given codepoint. */
1340unsigned int
1341cpp_check_xid_property (cppchar_t c)
1342{
1343 // fast path for ASCII
1344 if (c < 0x80)
1345 {
1346 if (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z'))
1347 return CPP_XID_START | CPP_XID_CONTINUE;
1348 if (('0' <= c && c <= '9') || c == '_')
1349 return CPP_XID_CONTINUE;
1350 }
1351
1352 if (c > UCS_LIMIT)
1353 return 0;
1354
1355 int mn, mx, md;
1356 mn = 0;
1357 mx = ARRAY_SIZE (ucnranges) - 1;
1358 while (mx != mn)
1359 {
1360 md = (mn + mx) / 2;
1361 if (c <= ucnranges[md].end)
1362 mx = md;
1363 else
1364 mn = md + 1;
1365 }
1366
1367 unsigned short flags = ucnranges[mn].flags;
1368
1369 if (flags & CXX23)
1370 return CPP_XID_START | CPP_XID_CONTINUE;
1371 if (flags & NXX23)
1372 return CPP_XID_CONTINUE;
1373 return 0;
1374}
1375
1376/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
1377 the start of an identifier, and 0 if C is not valid in an
1378 identifier. We assume C has already gone through the checks of
1379 _cpp_valid_ucn. Also update NST for C if returning nonzero. The
1380 algorithm is a simple binary search on the table defined in
1381 ucnid.h. */
1382
1383static int
1384ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
1385 struct normalize_state *nst)
1386{
1387 int mn, mx, md;
1388 unsigned short valid_flags, invalid_start_flags;
1389
1390 if (c > UCS_LIMIT)
1391 return 0;
1392
1393 mn = 0;
1394 mx = ARRAY_SIZE (ucnranges) - 1;
1395 while (mx != mn)
1396 {
1397 md = (mn + mx) / 2;
1398 if (c <= ucnranges[md].end)
1399 mx = md;
1400 else
1401 mn = md + 1;
1402 }
1403
1404 /* When -pedantic, we require the character to have been listed by
1405 the standard for the current language. Otherwise, we accept the
1406 union of the acceptable sets for all supported language versions. */
1407 valid_flags = C99 | CXX | C11 | CXX23;
1408 if (CPP_PEDANTIC (pfile))
1409 {
1410 if (CPP_OPTION (pfile, xid_identifiers))
1411 valid_flags = CXX23;
1412 else if (CPP_OPTION (pfile, c11_identifiers))
1413 valid_flags = C11;
1414 else if (CPP_OPTION (pfile, c99))
1415 valid_flags = C99;
1416 }
1417 if (! (ucnranges[mn].flags & valid_flags))
1418 return 0;
1419
1420 /* Update NST. */
1421 if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
1422 nst->level = normalized_none;
1423 else if (ucnranges[mn].flags & CTX)
1424 {
1425 bool safe;
1426 cppchar_t p = nst->previous;
1427
1428 /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
1429 and are combined algorithmically from a sequence of the form
1430 1100-1112 1161-1175 11A8-11C2
1431 (if the third is not present, it is treated as 11A7, which is not
1432 really a valid character).
1433 Unfortunately, C99 allows (only) the NFC form, but C++ allows
1434 only the combining characters. */
1435 if (c >= 0x1161 && c <= 0x1175)
1436 safe = p < 0x1100 || p > 0x1112;
1437 else if (c >= 0x11A8 && c <= 0x11C2)
1438 safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
1439 else
1440 safe = check_nfc (pfile, c, p);
1441 if (!safe)
1442 {
1443 if ((c >= 0x1161 && c <= 0x1175) || (c >= 0x11A8 && c <= 0x11C2))
1444 nst->level = MAX (nst->level, normalized_identifier_C);
1445 else
1446 nst->level = normalized_none;
1447 }
1448 }
1449 else if (ucnranges[mn].flags & NKC)
1450 ;
1451 else if (ucnranges[mn].flags & NFC)
1452 nst->level = MAX (nst->level, normalized_C);
1453 else if (ucnranges[mn].flags & CID)
1454 nst->level = MAX (nst->level, normalized_identifier_C);
1455 else
1456 nst->level = normalized_none;
1457 if (ucnranges[mn].combine == 0)
1458 nst->previous = c;
1459 nst->prev_class = ucnranges[mn].combine;
1460
1461 if (!CPP_PEDANTIC (pfile))
1462 {
1463 /* If not -pedantic, accept as character that may
1464 begin an identifier a union of characters allowed
1465 at that position in each of the character sets. */
1466 if ((ucnranges[mn].flags & (C99 | N99)) == C99
1467 || (ucnranges[mn].flags & CXX) != 0
1468 || (ucnranges[mn].flags & (C11 | N11)) == C11
1469 || (ucnranges[mn].flags & (CXX23 | NXX23)) == CXX23)
1470 return 1;
1471 return 2;
1472 }
1473
1474 if (CPP_OPTION (pfile, xid_identifiers))
1475 invalid_start_flags = NXX23;
1476 else if (CPP_OPTION (pfile, c11_identifiers))
1477 invalid_start_flags = N11;
1478 else if (CPP_OPTION (pfile, c99))
1479 invalid_start_flags = N99;
1480 else
1481 invalid_start_flags = 0;
1482
1483 /* In C99, UCN digits may not begin identifiers. In C11 and C++11,
1484 UCN combining characters may not begin identifiers. */
1485 if (ucnranges[mn].flags & invalid_start_flags)
1486 return 2;
1487
1488 return 1;
1489}
1490
1491/* Increment char_range->m_finish by a single character. */
1492
1493static void
1494extend_char_range (source_range *char_range,
1495 cpp_string_location_reader *loc_reader)
1496{
1497 if (loc_reader)
1498 {
1499 gcc_assert (char_range);
1500 char_range->m_finish = loc_reader->get_next ().m_finish;
1501 }
1502}
1503
1504 /* [lex.charset]: The character designated by the universal character
   name \UNNNNNNNN is that character whose character short name in
   ISO/IEC 10646 is NNNNNNNN; the character designated by the
   universal character name \uNNNN is that character whose character
   short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
   for a universal character name corresponds to a surrogate code point
   (in the range 0xD800-0xDFFF, inclusive), the program is ill-formed.
   Additionally, if the hexadecimal value for a universal-character-name
   outside a character or string literal corresponds to a control character
   (in either of the ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a
   character in the basic source character set, the program is ill-formed.

   C99 6.4.3: A universal character name shall not specify a character
   whose short identifier is less than 00A0 other than 0024 ($), 0040 (@),
   or 0060 (`), nor one in the range D800 through DFFF inclusive.

   If the hexadecimal value is larger than the upper bound of the UCS
   codespace specified in ISO/IEC 10646, a pedantic warning is issued
   in all versions of C and in the C++20 or later versions of C++.

   *PSTR must be preceded by "\u", "\U" or "\N"; it is assumed that the
   buffer end is delimited by a non-hex digit. Returns false if the
   UCN has not been consumed, true otherwise.

   The value of the UCN, whether valid or invalid, is returned in *CP
   (0 is stored when false is returned).
   Diagnostics are emitted for invalid values. PSTR is updated to point
   one beyond the UCN, or to the syntactically invalid character; it is
   left untouched when false is returned.

   IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
   an identifier, or 2 otherwise.

   If LOC_READER is non-NULL, then position information is
   read from *LOC_READER and CHAR_RANGE->m_finish is updated accordingly. */
1537
1538 bool
1539 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
1540 const uchar *limit, int identifier_pos,
1541 struct normalize_state *nst, cppchar_t *cp,
1542 source_range *char_range,
1543 cpp_string_location_reader *loc_reader)
1544 {
1545 cppchar_t result, c;
1546 unsigned int length;
1547 const uchar *str = *pstr;
/* BASE points at the backslash, for use in diagnostics.  */
1548 const uchar *base = str - 2;
1549 bool delimited = false, named = false;
1550
1551 if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
1552 cpp_error (pfile, CPP_DL_WARNING,
1553 msgid: "universal character names are only valid in C++ and C99");
1554 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
1555 && !CPP_OPTION (pfile, cplusplus))
1556 cpp_error (pfile, CPP_DL_WARNING,
1557 msgid: "C99%'s universal character names are incompatible with C90");
1558 else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
1559 cpp_warning (pfile, CPP_W_TRADITIONAL,
1560 msgid: "the meaning of %<\\%c%> is different in traditional C",
1561 (int) str[-1]);
1562
/* LENGTH is the number of hex digits still expected: 4 for \u, 8 for
   \U, and a magic value for the delimited \u{...} form.  */
1563 result = 0;
1564 if (str[-1] == 'u')
1565 {
1566 length = 4;
1567 if (str < limit
1568 && *str == '{'
1569 && (!identifier_pos
1570 || CPP_OPTION (pfile, delimited_escape_seqs)
1571 || !CPP_OPTION (pfile, std)))
1572 {
1573 str++;
1574 /* Magic value to indicate no digits seen. */
1575 length = 32;
1576 delimited = true;
1577 extend_char_range (char_range, loc_reader);
1578 }
1579 }
1580 else if (str[-1] == 'U')
1581 length = 8;
1582 else if (str[-1] == 'N')
1583 {
1584 length = 4;
1585 if (identifier_pos
1586 && !CPP_OPTION (pfile, named_uc_escape_seqs)
1587 && CPP_OPTION (pfile, std))
1588 {
1589 *cp = 0;
1590 return false;
1591 }
1592 if (str == limit || *str != '{')
1593 {
1594 if (identifier_pos)
1595 {
1596 *cp = 0;
1597 return false;
1598 }
1599 cpp_error (pfile, CPP_DL_ERROR, msgid: "%<\\N%> not followed by %<{%>");
1600 }
1601 else
1602 {
1603 str++;
1604 named = true;
1605 extend_char_range (char_range, loc_reader);
1606 length = 0;
1607 const uchar *name = str;
/* STRICT stays true only while the name contains no lower case letter
   and no underscore, i.e. while it could be an exact Unicode name.  */
1608 bool strict = true;
1609
1610 do
1611 {
1612 if (str == limit)
1613 break;
1614 c = *str;
1615 if (!ISIDNUM (c) && c != ' ' && c != '-')
1616 break;
1617 if (ISLOWER (c) || c == '_')
1618 strict = false;
1619 str++;
1620 extend_char_range (char_range, loc_reader);
1621 }
1622 while (1);
1623
1624 if (str < limit && *str == '}')
1625 {
1626 if (identifier_pos && name == str)
1627 {
1628 cpp_warning (pfile, CPP_W_UNICODE,
1629 msgid: "empty named universal character escape "
1630 "sequence; treating it as separate tokens");
1631 *cp = 0;
1632 return false;
1633 }
1634 if (name == str)
1635 cpp_error (pfile, CPP_DL_ERROR,
1636 msgid: "empty named universal character escape sequence");
1637 else if ((!identifier_pos || strict)
1638 && !CPP_OPTION (pfile, named_uc_escape_seqs)
1639 && CPP_OPTION (pfile, cpp_pedantic))
1640 cpp_pedwarning (pfile,
1641 CPP_OPTION (pfile, cplusplus)
1642 ? CPP_W_CXX23_EXTENSIONS : CPP_W_PEDANTIC,
1643 msgid: "named universal character escapes are only "
1644 "valid in C++23");
/* The empty name was diagnosed above; 0x40 is one of the values that
   the low-UCN check further down exempts, so it causes no second
   diagnostic there.  */
1645 if (name == str)
1646 result = 0x40;
1647 else
1648 {
1649 /* If the name is longer than maximum length of a Unicode
   name, it can't be strictly valid. */
1651 if ((size_t) (str - name) > uname2c_max_name_len || !strict)
1652 result = -1;
1653 else
1654 result = _cpp_uname2c (name: (const char *) name, len: str - name,
1655 n: uname2c_tree, NULL);
1656 if (result == (cppchar_t) -1)
1657 {
/* RET records whether the warning was actually emitted, so that the
   "did you mean" note below is only attached to a visible diagnostic.  */
1658 bool ret = true;
1659 if (identifier_pos
1660 && (!CPP_OPTION (pfile, named_uc_escape_seqs)
1661 || !strict))
1662 ret = cpp_warning (pfile, CPP_W_UNICODE,
1663 msgid: "%<\\N{%.*s}%> is not a valid "
1664 "universal character; treating it "
1665 "as separate tokens",
1666 (int) (str - name), name);
1667 else
1668 cpp_error (pfile, CPP_DL_ERROR,
1669 msgid: "%<\\N{%.*s}%> is not a valid universal "
1670 "character", (int) (str - name), name);
1671
1672 /* Try to do a loose name lookup according to
   Unicode loose matching rule UAX44-LM2. */
1674 char canon_name[uname2c_max_name_len + 1];
1675 result = _cpp_uname2c_uax44_lm2 (name: (const char *) name,
1676 len: str - name, canon_name);
1677 if (result != (cppchar_t) -1 && ret)
1678 cpp_error (pfile, CPP_DL_NOTE,
1679 msgid: "did you mean %<\\N{%s}%>?", canon_name);
1680 else
/* Placeholder value after the error; being >= 0xa0 it does not trigger
   the low-UCN diagnostic further down.  */
1681 result = 0xC0;
1682 if (identifier_pos
1683 && (!CPP_OPTION (pfile, named_uc_escape_seqs)
1684 || !strict))
1685 {
1686 *cp = 0;
1687 return false;
1688 }
1689 }
1690 }
1691 str++;
1692 extend_char_range (char_range, loc_reader);
1693 }
1694 else if (identifier_pos)
1695 {
1696 cpp_warning (pfile, CPP_W_UNICODE,
1697 msgid: "%<\\N{%> not terminated with %<}%> after %.*s; "
1698 "treating it as separate tokens",
1699 (int) (str - base), base);
1700 *cp = 0;
1701 return false;
1702 }
1703 else
1704 {
1705 cpp_error (pfile, CPP_DL_ERROR,
1706 msgid: "%<\\N{%> not terminated with %<}%> after %.*s",
1707 (int) (str - base), base);
1708 result = 1;
1709 }
1710 }
1711 }
1712 else
1713 {
1714 cpp_error (pfile, CPP_DL_ICE, msgid: "in %<_cpp_valid_ucn%> but not a UCN");
1715 length = 4;
1716 }
1717
/* Accumulate hex digits for the \u, \U and \u{...} forms.  The loop
   ends at the first non-hex character, at LIMIT, or once LENGTH digits
   have been consumed.  */
1718 if (!named)
1719 do
1720 {
1721 if (str == limit)
1722 break;
1723 c = *str;
1724 if (!ISXDIGIT (c))
1725 break;
1726 str++;
1727 extend_char_range (char_range, loc_reader);
1728 if (delimited)
1729 {
1730 if (!result)
1731 /* Accept arbitrary number of leading zeros.
   16 is another magic value, smaller than 32 above
   and bigger than 8, so that upon encountering first
   non-zero digit we can count 8 digits and after that
   or in overflow bit and ensure length doesn't decrease
   to 0, as delimited escape sequence doesn't have upper
   bound on the number of hex digits. */
1738 length = 16;
1739 else if (length == 16 - 8)
1740 {
1741 /* Make sure we detect overflows.  The bit set here is moved
   to 0x80000000 by the shift below.  */
1742 result |= 0x8000000;
1743 ++length;
1744 }
1745 }
1746
1747 result = (result << 4) + hex_value (c);
1748 }
1749 while (--length);
1750
1751 if (delimited && str < limit && *str == '}')
1752 {
1753 bool warned = false;
1754 if (length == 32 && identifier_pos)
1755 {
1756 cpp_warning (pfile, CPP_W_UNICODE,
1757 msgid: "empty delimited escape sequence; "
1758 "treating it as separate tokens");
1759 *cp = 0;
1760 return false;
1761 }
1762 else if (length == 32)
1763 {
1764 cpp_error (pfile, CPP_DL_ERROR, msgid: "empty delimited escape sequence");
1765 warned = true;
1766 }
1767 else if (!CPP_OPTION (pfile, delimited_escape_seqs)
1768 && CPP_OPTION (pfile, cpp_pedantic))
1769 {
1770 if (CPP_OPTION (pfile, cplusplus))
1771 warned
1772 = cpp_pedwarning (pfile, CPP_W_CXX23_EXTENSIONS,
1773 msgid: "delimited escape sequences are only valid "
1774 "in C++23");
1775 else
1776 warned
1777 = cpp_pedwarning (pfile, CPP_W_PEDANTIC,
1778 msgid: "delimited escape sequences are only valid "
1779 "in C2Y");
1780 }
/* NOTE(review): the condition tests the C23/C2Y compatibility option
   while the warning is filed under CPP_W_C11_C23_COMPAT; confirm that
   this pairing is intended.  */
1781 if (!warned && CPP_OPTION (pfile, cpp_warn_c23_c2y_compat) > 0)
1782 cpp_warning (pfile, CPP_W_C11_C23_COMPAT,
1783 msgid: "delimited escape sequences are only valid in C2Y");
1784
/* The sequence is complete: clear LENGTH and DELIMITED so that the
   checks below treat it like a fully consumed \u or \U.  */
1785 str++;
1786 length = 0;
1787 delimited = false;
1788 extend_char_range (char_range, loc_reader);
1789 }
1790
1791 /* Partial UCNs are not valid in strings, but decompose into
   multiple tokens in identifiers, so we can't give a helpful
   error message in that case. */
1794 if (length && identifier_pos)
1795 {
1796 if (delimited)
1797 cpp_warning (pfile, CPP_W_UNICODE,
1798 msgid: "%<\\u{%> not terminated with %<}%> after %.*s; "
1799 "treating it as separate tokens",
1800 (int) (str - base), base);
1801 *cp = 0;
1802 return false;
1803 }
1804
/* From here on the UCN counts as consumed, whether or not it turns out
   to be valid.  */
1805 *pstr = str;
1806 if (length)
1807 {
1808 if (!delimited)
1809 cpp_error (pfile, CPP_DL_ERROR,
1810 msgid: "incomplete universal character name %.*s",
1811 (int) (str - base), base);
1812 else
1813 cpp_error (pfile, CPP_DL_ERROR,
1814 msgid: "%<\\u{%> not terminated with %<}%> after %.*s",
1815 (int) (str - base), base);
1816 result = 1;
1817 }
1818 else if ((result & 0x80000000)
1819 || (result >= 0xD800 && result <= 0xDFFF))
1820 {
1821 cpp_error (pfile, CPP_DL_ERROR,
1822 msgid: "%.*s is not a valid universal character",
1823 (int) (str - base), base);
1824 result = 1;
1825 }
1826 /* The C99 standard permits $, @ and ` to be specified as UCNs. We use
   hex escapes so that this also works with EBCDIC hosts.
   C++0x permits everything below 0xa0 within literals, as does C23;
   ucn_valid_in_identifier will complain about identifiers. */
1830 else if (result < 0xa0
1831 && !identifier_pos
1832 && !CPP_OPTION (pfile, cplusplus)
1833 && (result != 0x24 && result != 0x40 && result != 0x60))
1834 {
1835 bool warned = false;
1836 if (!CPP_OPTION (pfile, low_ucns) && CPP_OPTION (pfile, cpp_pedantic))
1837 warned = cpp_pedwarning (pfile, CPP_W_PEDANTIC,
1838 msgid: "%.*s is not a valid universal character"
1839 " name before C23", (int) (str - base), base);
1840 if (!warned && CPP_OPTION (pfile, cpp_warn_c11_c23_compat) > 0)
1841 warned = cpp_warning (pfile, CPP_W_C11_C23_COMPAT,
1842 msgid: "%.*s is not a valid universal character"
1843 " name before C23", (int) (str - base), base);
1844 }
1845 else if (identifier_pos && result == 0x24
1846 && CPP_OPTION (pfile, dollars_in_ident)
1847 /* In C++26 when dollars are allowed in identifiers,
   we should still reject \u0024 as $ is part of the basic
   character set. C23 also does not allow \u0024 in
   identifiers. */
1851 && !(CPP_OPTION (pfile, cplusplus)
1852 ? CPP_OPTION (pfile, lang) > CLK_CXX23
1853 : CPP_OPTION (pfile, low_ucns)))
1854 {
1855 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1856 {
/* The option is cleared so that this pedwarn is issued only once.  */
1857 CPP_OPTION (pfile, warn_dollars) = 0;
1858 cpp_error (pfile, CPP_DL_PEDWARN, msgid: "%<$%> in identifier or number");
1859 }
1860 NORMALIZE_STATE_UPDATE_IDNUM (nst, result);
1861 }
1862 else if (identifier_pos)
1863 {
1864 int validity = ucn_valid_in_identifier (pfile, c: result, nst);
1865
1866 if (validity == 0)
1867 cpp_error (pfile, CPP_DL_ERROR,
1868 msgid: "universal character %.*s is not valid in an identifier",
1869 (int) (str - base), base);
1870 else if (validity == 2 && identifier_pos == 1)
1871 cpp_error (pfile, CPP_DL_ERROR,
1872 msgid: "universal character %.*s is not valid at the start of an identifier",
1873 (int) (str - base), base);
1874 }
1875 else if (result > UCS_LIMIT
1876 && (!CPP_OPTION (pfile, cplusplus)
1877 || CPP_OPTION (pfile, lang) > CLK_CXX17))
1878 cpp_error (pfile, CPP_DL_PEDWARN,
1879 msgid: "%.*s is outside the UCS codespace",
1880 (int) (str - base), base);
1881
1882 *cp = result;
1883 return true;
1884 }
1885
1886/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
1887 it to the execution character set and write the result into TBUF,
1888 if TBUF is non-NULL.
1889 An advanced pointer is returned. Issues all relevant diagnostics.
1890 If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
1891 contains the location of the character so far: location information
1892 is read from *LOC_READER, and *RANGES is updated accordingly. */
1893static const uchar *
1894convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
1895 struct _cpp_strbuf *tbuf, struct cset_converter cvt,
1896 source_range char_range,
1897 cpp_string_location_reader *loc_reader,
1898 cpp_substring_ranges *ranges)
1899{
1900 cppchar_t ucn;
1901 uchar buf[6];
1902 uchar *bufp = buf;
1903 size_t bytesleft = 6;
1904 int rval;
1905 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1906
1907 /* loc_reader and ranges must either be both NULL, or both be non-NULL. */
1908 gcc_assert ((loc_reader != NULL) == (ranges != NULL));
1909
1910 from++; /* Skip u/U/N. */
1911
1912 /* The u/U is part of the spelling of this character. */
1913 extend_char_range (char_range: &char_range, loc_reader);
1914
1915 _cpp_valid_ucn (pfile, pstr: &from, limit, identifier_pos: 0, nst: &nst,
1916 cp: &ucn, char_range: &char_range, loc_reader);
1917
1918 rval = one_cppchar_to_utf8 (c: ucn, outbufp: &bufp, outbytesleftp: &bytesleft);
1919 if (rval)
1920 {
1921 errno = rval;
1922 cpp_errno (pfile, CPP_DL_ERROR,
1923 msgid: "converting UCN to source character set");
1924 }
1925 else
1926 {
1927 if (tbuf)
1928 if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
1929 cpp_errno (pfile, CPP_DL_ERROR,
1930 msgid: "converting UCN to execution character set");
1931
1932 if (loc_reader)
1933 {
1934 int num_encoded_bytes = 6 - bytesleft;
1935 for (int i = 0; i < num_encoded_bytes; i++)
1936 ranges->add_range (range: char_range);
1937 }
1938 }
1939
1940 return from;
1941}
1942
1943/* Performs a similar task as _cpp_valid_ucn, but parses UTF-8-encoded
1944 extended characters rather than UCNs. If the return value is TRUE, then a
1945 character was successfully decoded and stored in *CP; *PSTR has been
1946 updated to point one past the valid UTF-8 sequence. Diagnostics may have
1947 been emitted if the character parsed is not allowed in the current context.
1948 If the return value is FALSE, then *PSTR has not been modified and *CP may
1949 equal 0, to indicate that *PSTR does not form a valid UTF-8 sequence, or it
1950 may, when processing an identifier in C mode, equal a codepoint that was
1951 validly encoded but is not allowed to appear in an identifier. In either
1952 case, no diagnostic is emitted, and the return value of FALSE should cause
1953 a new token to be formed.
1954
1955 _cpp_valid_utf8 can be called when lexing a potential identifier, or a
1956 CPP_OTHER token or for the purposes of -Winvalid-utf8 warning in string or
1957 character literals. NST is unused when not in a potential identifier.
1958
1959 As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for
1960 the start of an identifier, or 2 otherwise. */
1961
1962extern bool
1963_cpp_valid_utf8 (cpp_reader *pfile,
1964 const uchar **pstr,
1965 const uchar *limit,
1966 int identifier_pos,
1967 struct normalize_state *nst,
1968 cppchar_t *cp)
1969{
1970 const uchar *base = *pstr;
1971 size_t inbytesleft = limit - base;
1972 if (one_utf8_to_cppchar (inbufp: pstr, inbytesleftp: &inbytesleft, cp))
1973 {
1974 /* No diagnostic here as this byte will rather become a
1975 new token. */
1976 *cp = 0;
1977 return false;
1978 }
1979
1980 if (identifier_pos)
1981 {
1982 switch (ucn_valid_in_identifier (pfile, c: *cp, nst))
1983 {
1984
1985 case 0:
1986 /* In C++, this is an error for invalid character in an identifier
1987 because logically, the UTF-8 was converted to a UCN during
1988 translation phase 1 (even though we don't physically do it that
1989 way). In C, this byte rather becomes grammatically a separate
1990 token. */
1991
1992 if (CPP_OPTION (pfile, cplusplus))
1993 cpp_error (pfile, CPP_DL_ERROR,
1994 msgid: "extended character %.*s is not valid in an identifier",
1995 (int) (*pstr - base), base);
1996 else
1997 {
1998 *pstr = base;
1999 return false;
2000 }
2001
2002 break;
2003
2004 case 2:
2005 if (identifier_pos == 1)
2006 {
2007 /* This is treated the same way in C++ or C99 -- lexed as an
2008 identifier which is then invalid because an identifier is
2009 not allowed to start with this character. */
2010 cpp_error (pfile, CPP_DL_ERROR,
2011 msgid: "extended character %.*s is not valid at the start of an identifier",
2012 (int) (*pstr - base), base);
2013 }
2014 break;
2015 }
2016 }
2017
2018 return true;
2019}
2020
2021/* Return true iff BUFFER of size NUM_BYTES is validly-encoded UTF-8. */
2022
2023extern bool
2024cpp_valid_utf8_p (const char *buffer, size_t num_bytes)
2025{
2026 const uchar *iter = (const uchar *)buffer;
2027 size_t bytesleft = num_bytes;
2028 while (bytesleft > 0)
2029 {
2030 /* one_utf8_to_cppchar implements 5-byte and 6 byte sequences as per
2031 RFC 2279, but this has been superceded by RFC 3629, which
2032 restricts UTF-8 to 1-byte through 4-byte sequences, and
2033 states "the octet values C0, C1, F5 to FF never appear".
2034
2035 Reject such values. */
2036 if (*iter >= 0xf4)
2037 return false;
2038
2039 cppchar_t cp;
2040 int err = one_utf8_to_cppchar (inbufp: &iter, inbytesleftp: &bytesleft, cp: &cp);
2041 if (err)
2042 return false;
2043
2044 /* Additionally, Unicode declares that all codepoints above 0010FFFF are
2045 invalid because they cannot be represented in UTF-16.
2046
2047 Reject such values.*/
2048 if (cp > UCS_LIMIT)
2049 return false;
2050 }
2051 /* No problems encountered. */
2052 return true;
2053}
2054
2055/* Subroutine of convert_hex and convert_oct. N is the representation
2056 in the execution character set of a numeric escape; write it into the
2057 string buffer TBUF and update the end-of-string pointer therein. WIDE
2058 is true if it's a wide string that's being assembled in TBUF. This
2059 function issues no diagnostics and never fails. */
2060static void
2061emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
2062 struct _cpp_strbuf *tbuf, struct cset_converter cvt)
2063{
2064 size_t width = cvt.width;
2065
2066 if (width != CPP_OPTION (pfile, char_precision))
2067 {
2068 /* We have to render this into the target byte order, which may not
2069 be our byte order. */
2070 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
2071 size_t cwidth = CPP_OPTION (pfile, char_precision);
2072 size_t cmask = width_to_mask (width: cwidth);
2073 size_t nbwc = width / cwidth;
2074 size_t i;
2075 size_t off = tbuf->len;
2076 cppchar_t c;
2077
2078 if (tbuf->len + nbwc > tbuf->asize)
2079 {
2080 tbuf->asize += OUTBUF_BLOCK_SIZE;
2081 tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
2082 }
2083
2084 for (i = 0; i < nbwc; i++)
2085 {
2086 c = n & cmask;
2087 n >>= cwidth;
2088 tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
2089 }
2090 tbuf->len += nbwc;
2091 }
2092 else
2093 {
2094 /* Note: this code does not handle the case where the target
2095 and host have a different number of bits in a byte. */
2096 if (tbuf->len + 1 > tbuf->asize)
2097 {
2098 tbuf->asize += OUTBUF_BLOCK_SIZE;
2099 tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
2100 }
2101 tbuf->text[tbuf->len++] = n;
2102 }
2103}
2104
2105/* Convert a hexadecimal escape, pointed to by FROM, to the execution
2106 character set and write it into the string buffer TBUF (if non-NULL).
2107 Returns an advanced pointer, and issues diagnostics as necessary.
2108 No character set translation occurs; this routine always produces the
2109 execution-set character with numeric value equal to the given hex
2110 number. You can, e.g. generate surrogate pairs this way.
2111 If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
2112 contains the location of the character so far: location information
2113 is read from *LOC_READER, and *RANGES is updated accordingly. */
2114static const uchar *
2115convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
2116 struct _cpp_strbuf *tbuf, struct cset_converter cvt,
2117 source_range char_range,
2118 cpp_string_location_reader *loc_reader,
2119 cpp_substring_ranges *ranges)
2120{
2121 cppchar_t c, n = 0, overflow = 0;
2122 int digits_found = 0;
2123 size_t width = cvt.width;
2124 size_t mask = width_to_mask (width);
2125 bool delimited = false;
2126 const uchar *base = from - 1;
2127
2128 /* loc_reader and ranges must either be both NULL, or both be non-NULL. */
2129 gcc_assert ((loc_reader != NULL) == (ranges != NULL));
2130
2131 if (CPP_WTRADITIONAL (pfile))
2132 cpp_warning (pfile, CPP_W_TRADITIONAL,
2133 msgid: "the meaning of %<\\x%> is different in traditional C");
2134
2135 /* Skip 'x'. */
2136 from++;
2137
2138 /* The 'x' is part of the spelling of this character. */
2139 extend_char_range (char_range: &char_range, loc_reader);
2140
2141 if (from < limit && *from == '{')
2142 {
2143 delimited = true;
2144 from++;
2145 extend_char_range (char_range: &char_range, loc_reader);
2146 }
2147
2148 while (from < limit)
2149 {
2150 c = *from;
2151 if (! hex_p (c))
2152 break;
2153 from++;
2154 extend_char_range (char_range: &char_range, loc_reader);
2155 overflow |= n ^ (n << 4 >> 4);
2156 n = (n << 4) + hex_value (c);
2157 digits_found = 1;
2158 }
2159
2160 if (delimited && from < limit && *from == '}')
2161 {
2162 bool warned = false;
2163 from++;
2164 if (!digits_found)
2165 {
2166 cpp_error (pfile, CPP_DL_ERROR,
2167 msgid: "empty delimited escape sequence");
2168 return from;
2169 }
2170 else if (!CPP_OPTION (pfile, delimited_escape_seqs)
2171 && CPP_OPTION (pfile, cpp_pedantic))
2172 {
2173 if (CPP_OPTION (pfile, cplusplus))
2174 warned
2175 = cpp_pedwarning (pfile, CPP_W_CXX23_EXTENSIONS,
2176 msgid: "delimited escape sequences are only valid "
2177 "in C++23");
2178 else
2179 warned
2180 = cpp_pedwarning (pfile, CPP_W_PEDANTIC,
2181 msgid: "delimited escape sequences are only valid "
2182 "in C2Y");
2183 }
2184 if (!warned && CPP_OPTION (pfile, cpp_warn_c23_c2y_compat) > 0)
2185 cpp_warning (pfile, CPP_W_C11_C23_COMPAT,
2186 msgid: "delimited escape sequences are only valid in C2Y");
2187 delimited = false;
2188 extend_char_range (char_range: &char_range, loc_reader);
2189 }
2190
2191 if (!digits_found)
2192 {
2193 cpp_error (pfile, CPP_DL_ERROR,
2194 msgid: "%<\\x%> used with no following hex digits");
2195 return from;
2196 }
2197 else if (delimited)
2198 {
2199 cpp_error (pfile, CPP_DL_ERROR,
2200 msgid: "%<\\x{%> not terminated with %<}%> after %.*s",
2201 (int) (from - base), base);
2202 return from;
2203 }
2204
2205 if (overflow | (n != (n & mask)))
2206 {
2207 cpp_error (pfile, CPP_DL_PEDWARN,
2208 msgid: "hex escape sequence out of range");
2209 n &= mask;
2210 }
2211
2212 if (tbuf)
2213 emit_numeric_escape (pfile, n, tbuf, cvt);
2214 if (ranges)
2215 ranges->add_range (range: char_range);
2216
2217 return from;
2218}
2219
2220/* Convert an octal escape, pointed to by FROM, to the execution
2221 character set and write it into the string buffer TBUF. Returns an
2222 advanced pointer, and issues diagnostics as necessary.
2223 No character set translation occurs; this routine always produces the
2224 execution-set character with numeric value equal to the given octal
2225 number.
2226 If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
2227 contains the location of the character so far: location information
2228 is read from *LOC_READER, and *RANGES is updated accordingly. */
2229static const uchar *
2230convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
2231 struct _cpp_strbuf *tbuf, struct cset_converter cvt,
2232 source_range char_range,
2233 cpp_string_location_reader *loc_reader,
2234 cpp_substring_ranges *ranges)
2235{
2236 size_t count = 0;
2237 cppchar_t c, n = 0, overflow = 0;
2238 size_t width = cvt.width;
2239 size_t mask = width_to_mask (width);
2240 bool delimited = false;
2241 const uchar *base = from - 1;
2242
2243 /* loc_reader and ranges must either be both NULL, or both be non-NULL. */
2244 gcc_assert ((loc_reader != NULL) == (ranges != NULL));
2245
2246 if (from < limit && *from == 'o')
2247 {
2248 from++;
2249 extend_char_range (char_range: &char_range, loc_reader);
2250 if (from == limit || *from != '{')
2251 cpp_error (pfile, CPP_DL_ERROR, msgid: "%<\\o%> not followed by %<{%>");
2252 else
2253 {
2254 from++;
2255 extend_char_range (char_range: &char_range, loc_reader);
2256 delimited = true;
2257 }
2258 }
2259
2260 while (from < limit && count++ < 3)
2261 {
2262 c = *from;
2263 if (c < '0' || c > '7')
2264 break;
2265 from++;
2266 extend_char_range (char_range: &char_range, loc_reader);
2267 if (delimited)
2268 {
2269 count = 2;
2270 overflow |= n ^ (n << 3 >> 3);
2271 }
2272 n = (n << 3) + c - '0';
2273 }
2274
2275 if (delimited)
2276 {
2277 if (from < limit && *from == '}')
2278 {
2279 bool warned = false;
2280 from++;
2281 if (count == 1)
2282 {
2283 cpp_error (pfile, CPP_DL_ERROR,
2284 msgid: "empty delimited escape sequence");
2285 return from;
2286 }
2287 else if (!CPP_OPTION (pfile, delimited_escape_seqs)
2288 && CPP_OPTION (pfile, cpp_pedantic))
2289 {
2290 if (CPP_OPTION (pfile, cplusplus))
2291 warned
2292 = cpp_pedwarning (pfile, CPP_W_CXX23_EXTENSIONS,
2293 msgid: "delimited escape sequences are only "
2294 "valid in C++23");
2295 else
2296 warned
2297 = cpp_pedwarning (pfile, CPP_W_PEDANTIC,
2298 msgid: "delimited escape sequences are only "
2299 "valid in C2Y");
2300 }
2301 if (!warned && CPP_OPTION (pfile, cpp_warn_c23_c2y_compat) > 0)
2302 cpp_warning (pfile, CPP_W_C11_C23_COMPAT,
2303 msgid: "delimited escape sequences are only valid in C2Y");
2304 extend_char_range (char_range: &char_range, loc_reader);
2305 }
2306 else
2307 {
2308 cpp_error (pfile, CPP_DL_ERROR,
2309 msgid: "%<\\o{%> not terminated with %<}%> after %.*s",
2310 (int) (from - base), base);
2311 return from;
2312 }
2313 }
2314
2315 if (overflow | (n != (n & mask)))
2316 {
2317 cpp_error (pfile, CPP_DL_PEDWARN,
2318 msgid: "octal escape sequence out of range");
2319 n &= mask;
2320 }
2321
2322 if (tbuf)
2323 emit_numeric_escape (pfile, n, tbuf, cvt);
2324 if (ranges)
2325 ranges->add_range (range: char_range);
2326
2327 return from;
2328}
2329
2330/* Convert an escape sequence (pointed to by FROM) to its value on
2331 the target, and to the execution character set. Do not scan past
2332 LIMIT. Write the converted value into TBUF, if TBUF is non-NULL.
2333 Returns an advanced pointer. Handles all relevant diagnostics.
2334 If LOC_READER is non-NULL, then RANGES must be non-NULL: location
2335 information is read from *LOC_READER, and *RANGES is updated
2336 accordingly. */
2337static const uchar *
2338convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
2339 struct _cpp_strbuf *tbuf, struct cset_converter cvt,
2340 cpp_string_location_reader *loc_reader,
2341 cpp_substring_ranges *ranges, bool uneval)
2342{
2343 /* Values of \a \b \e \f \n \r \t \v respectively. */
2344#if HOST_CHARSET == HOST_CHARSET_ASCII
2345 static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
2346#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
2347 static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
2348#else
2349#error "unknown host character set"
2350#endif
2351
2352 uchar c;
2353
2354 /* Record the location of the backslash. */
2355 source_range char_range;
2356 if (loc_reader)
2357 char_range = loc_reader->get_next ();
2358
2359 c = *from;
2360 switch (c)
2361 {
2362 /* UCNs, hex escapes, and octal escapes are processed separately. */
2363 case 'u': case 'U': case 'N':
2364 return convert_ucn (pfile, from, limit, tbuf, cvt,
2365 char_range, loc_reader, ranges);
2366
2367 case 'x':
2368 if (uneval)
2369 cpp_pedwarning (pfile, CPP_W_PEDANTIC,
2370 msgid: "numeric escape sequence in unevaluated string: "
2371 "%<\\%c%>", (int) c);
2372 return convert_hex (pfile, from, limit, tbuf, cvt,
2373 char_range, loc_reader, ranges);
2374
2375 case '0': case '1': case '2': case '3':
2376 case '4': case '5': case '6': case '7':
2377 case 'o':
2378 if (uneval)
2379 cpp_pedwarning (pfile, CPP_W_PEDANTIC,
2380 msgid: "numeric escape sequence in unevaluated string: "
2381 "%<\\%c%>", (int) c);
2382 return convert_oct (pfile, from, limit, tbuf, cvt,
2383 char_range, loc_reader, ranges);
2384
2385 /* Various letter escapes. Get the appropriate host-charset
2386 value into C. */
2387 case '\\': case '\'': case '"': case '?': break;
2388
2389 case '(': case '{': case '[': case '%':
2390 /* '\(', etc, can be used at the beginning of a line in a long
2391 string split onto multiple lines with \-newline, to prevent
2392 Emacs or other text editors from getting confused. '\%' can
2393 be used to prevent SCCS from mangling printf format strings. */
2394 if (CPP_PEDANTIC (pfile))
2395 goto unknown;
2396 break;
2397
2398 case 'b': c = charconsts[1]; break;
2399 case 'f': c = charconsts[3]; break;
2400 case 'n': c = charconsts[4]; break;
2401 case 'r': c = charconsts[5]; break;
2402 case 't': c = charconsts[6]; break;
2403 case 'v': c = charconsts[7]; break;
2404
2405 case 'a':
2406 if (CPP_WTRADITIONAL (pfile))
2407 cpp_warning (pfile, CPP_W_TRADITIONAL,
2408 msgid: "the meaning of %<\\a%> is different in traditional C");
2409 c = charconsts[0];
2410 break;
2411
2412 case 'e': case 'E':
2413 cpp_pedwarning (pfile, CPP_W_PEDANTIC,
2414 msgid: "non-ISO-standard escape sequence, %<\\%c%>", (int) c);
2415 c = charconsts[2];
2416 break;
2417
2418 default:
2419 unknown:
2420 if (ISGRAPH (c))
2421 cpp_error (pfile, CPP_DL_PEDWARN,
2422 msgid: "unknown escape sequence: %<\\%c%>", (int) c);
2423 else
2424 {
2425 encoding_rich_location rich_loc (pfile);
2426
2427 /* pretty-print.cc does not support "%03o". When it does, this
2428 code can use %03o directly in the diagnostic again. */
2429 char buf[32];
2430 sprintf(s: buf, format: "%03o", (int) c);
2431 cpp_error_at (pfile, CPP_DL_PEDWARN, richloc: &rich_loc,
2432 msgid: "unknown escape sequence: %<\\%s%>", buf);
2433 }
2434 }
2435
2436 if (tbuf)
2437 /* Now convert what we have to the execution character set. */
2438 if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
2439 cpp_errno (pfile, CPP_DL_ERROR,
2440 msgid: "converting escape sequence to execution character set");
2441
2442 if (loc_reader)
2443 {
2444 char_range.m_finish = loc_reader->get_next ().m_finish;
2445 ranges->add_range (range: char_range);
2446 }
2447
2448 return from + 1;
2449}
2450
2451/* TYPE is a token type. The return value is the conversion needed to
2452 convert from source to execution character set for the given type. */
2453static struct cset_converter
2454converter_for_type (cpp_reader *pfile, enum cpp_ttype type)
2455{
2456 switch (type)
2457 {
2458 default:
2459 return pfile->narrow_cset_desc;
2460 case CPP_UTF8CHAR:
2461 case CPP_UTF8STRING:
2462 return pfile->utf8_cset_desc;
2463 case CPP_CHAR16:
2464 case CPP_STRING16:
2465 return pfile->char16_cset_desc;
2466 case CPP_CHAR32:
2467 case CPP_STRING32:
2468 return pfile->char32_cset_desc;
2469 case CPP_WCHAR:
2470 case CPP_WSTRING:
2471 return pfile->wide_cset_desc;
2472 }
2473}
2474
2475/* FROM is an array of cpp_string structures of length COUNT. These
2476 are to be converted from the source to the execution character set,
2477 escape sequences translated, and finally all are to be
2478 concatenated. WIDE indicates whether or not to produce a wide
2479 string. If TO is non-NULL, the result is written into TO.
2480 If LOC_READERS and OUT are non-NULL, then location information
2481 is read from LOC_READERS (which must be an array of length COUNT),
2482 and location information is written to *RANGES.
2483
2484 Returns true for success, false for failure. */
2485
2486static bool
2487cpp_interpret_string_1 (cpp_reader *pfile, const cpp_string *from, size_t count,
2488 cpp_string *to, enum cpp_ttype type,
2489 cpp_string_location_reader *loc_readers,
2490 cpp_substring_ranges *out)
2491{
2492 struct _cpp_strbuf tbuf;
2493 const uchar *p, *base, *limit;
2494 size_t i;
2495 struct cset_converter cvt = converter_for_type (pfile, type);
2496
2497 /* loc_readers and out must either be both NULL, or both be non-NULL. */
2498 gcc_assert ((loc_readers != NULL) == (out != NULL));
2499
2500 if (to)
2501 {
2502 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
2503 tbuf.text = XNEWVEC (uchar, tbuf.asize);
2504 tbuf.len = 0;
2505 }
2506
2507 cpp_string_location_reader *loc_reader = NULL;
2508 for (i = 0; i < count; i++)
2509 {
2510 if (loc_readers)
2511 loc_reader = &loc_readers[i];
2512
2513 p = from[i].text;
2514 if (*p == 'u')
2515 {
2516 p++;
2517 if (loc_reader)
2518 loc_reader->get_next ();
2519 if (*p == '8')
2520 {
2521 p++;
2522 if (loc_reader)
2523 loc_reader->get_next ();
2524 }
2525 }
2526 else if (*p == 'L' || *p == 'U') p++;
2527 if (*p == 'R')
2528 {
2529 const uchar *prefix;
2530
2531 /* Skip over 'R"'. */
2532 p += 2;
2533 if (loc_reader)
2534 {
2535 loc_reader->get_next ();
2536 loc_reader->get_next ();
2537 }
2538 prefix = p;
2539 while (*p != '(')
2540 {
2541 p++;
2542 if (loc_reader)
2543 loc_reader->get_next ();
2544 }
2545 p++;
2546 if (loc_reader)
2547 loc_reader->get_next ();
2548 limit = from[i].text + from[i].len;
2549 if (limit >= p + (p - prefix) + 1)
2550 limit -= (p - prefix) + 1;
2551
2552 /* Raw strings are all normal characters; these can be fed
2553 directly to convert_cset. */
2554 if (to)
2555 if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf))
2556 goto fail;
2557
2558 if (loc_reader)
2559 {
2560 /* If generating source ranges, assume we have a 1:1
2561 correspondence between bytes in the source encoding and bytes
2562 in the execution encoding (e.g. if we have a UTF-8 to UTF-8
2563 conversion), so that this run of bytes in the source file
2564 corresponds to a run of bytes in the execution string.
2565 This requirement is guaranteed by an early-reject in
2566 cpp_interpret_string_ranges. */
2567 gcc_assert (cvt.func == convert_no_conversion);
2568 out->add_n_ranges (num: limit - p, loc_reader&: *loc_reader);
2569 }
2570
2571 continue;
2572 }
2573
2574 /* If we don't now have a leading quote, something has gone wrong.
2575 This can occur if cpp_interpret_string_ranges is handling a
2576 stringified macro argument, but should not be possible otherwise. */
2577 if (*p != '"' && *p != '\'')
2578 {
2579 gcc_assert (out != NULL);
2580 cpp_error (pfile, CPP_DL_ERROR, msgid: "missing open quote");
2581 if (to)
2582 free (ptr: tbuf.text);
2583 return false;
2584 }
2585
2586 /* Skip leading quote. */
2587 p++;
2588 if (loc_reader)
2589 loc_reader->get_next ();
2590
2591 limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
2592
2593 for (;;)
2594 {
2595 base = p;
2596 while (p < limit && *p != '\\')
2597 p++;
2598 if (p > base)
2599 {
2600 /* We have a run of normal characters; these can be fed
2601 directly to convert_cset. */
2602 if (to)
2603 if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
2604 goto fail;
2605 /* Similar to above: assumes we have a 1:1 correspondence
2606 between bytes in the source encoding and bytes in the
2607 execution encoding. */
2608 if (loc_reader)
2609 {
2610 gcc_assert (cvt.func == convert_no_conversion);
2611 out->add_n_ranges (num: p - base, loc_reader&: *loc_reader);
2612 }
2613 }
2614 if (p >= limit)
2615 break;
2616
2617 struct _cpp_strbuf *tbuf_ptr = to ? &tbuf : NULL;
2618 p = convert_escape (pfile, from: p + 1, limit, tbuf: tbuf_ptr, cvt,
2619 loc_reader, ranges: out, uneval: type == CPP_UNEVAL_STRING);
2620 }
2621 }
2622
2623 if (to)
2624 {
2625 /* NUL-terminate the 'to' buffer and translate it to a cpp_string
2626 structure. */
2627 emit_numeric_escape (pfile, n: 0, tbuf: &tbuf, cvt);
2628 tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
2629 to->text = tbuf.text;
2630 to->len = tbuf.len;
2631 }
2632 /* Use the location of the trailing quote as the location of the
2633 NUL-terminator. */
2634 if (loc_reader)
2635 {
2636 source_range range = loc_reader->get_next ();
2637 out->add_range (range);
2638 }
2639
2640 return true;
2641
2642 fail:
2643 cpp_errno (pfile, CPP_DL_ERROR, msgid: "converting to execution character set");
2644 if (to)
2645 free (ptr: tbuf.text);
2646 return false;
2647}
2648
2649/* FROM is an array of cpp_string structures of length COUNT. These
2650 are to be converted from the source to the execution character set,
2651 escape sequences translated, and finally all are to be
2652 concatenated. WIDE indicates whether or not to produce a wide
2653 string. The result is written into TO. Returns true for success,
2654 false for failure. */
2655bool
2656cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
2657 cpp_string *to, enum cpp_ttype type)
2658{
2659 return cpp_interpret_string_1 (pfile, from, count, to, type, NULL, NULL);
2660}
2661
2662/* This function mimics the behavior of cpp_interpret_string, but
2663 rather than generating a string in the execution character set,
2664 *OUT is written to with the source code ranges of the characters
2665 in such a string.
2666 FROM and LOC_READERS should both be arrays of length COUNT.
2667 Returns NULL for success, or an error message for failure. */
2668
2669const char *
2670cpp_interpret_string_ranges (cpp_reader *pfile, const cpp_string *from,
2671 cpp_string_location_reader *loc_readers,
2672 size_t count,
2673 cpp_substring_ranges *out,
2674 enum cpp_ttype type)
2675{
2676 /* There are a couple of cases in the range-handling in
2677 cpp_interpret_string_1 that rely on there being a 1:1 correspondence
2678 between bytes in the source encoding and bytes in the execution
2679 encoding, so that each byte in the execution string can correspond
2680 to the location of a byte in the source string.
2681
2682 This holds for the typical case of a UTF-8 to UTF-8 conversion.
2683 Enforce this requirement by only attempting to track substring
2684 locations if we have source encoding == execution encoding.
2685
2686 This is a stronger condition than we need, since we could e.g.
2687 have ASCII to EBCDIC (with 1 byte per character before and after),
2688 but it seems to be a reasonable restriction. */
2689 struct cset_converter cvt = converter_for_type (pfile, type);
2690 if (cvt.func != convert_no_conversion)
2691 return "execution character set != source character set";
2692
2693 /* For on-demand strings we have already lexed the strings, so there
2694 should be no diagnostics. However, if we have bogus source location
2695 data (or stringified macro arguments), the attempt to lex the
2696 strings could fail with an diagnostic. Temporarily install an
2697 diagnostic-handler to catch the diagnostic, so that it can lead to this call
2698 failing, rather than being emitted as a user-visible diagnostic.
2699 If an diagnostic does occur, we should see it via the return value of
2700 cpp_interpret_string_1. */
2701 cpp_auto_suppress_diagnostics suppress {pfile};
2702 bool result = cpp_interpret_string_1 (pfile, from, count, NULL, type,
2703 loc_readers, out);
2704 if (!result)
2705 return "cpp_interpret_string_1 failed";
2706
2707 /* Success. */
2708 return NULL;
2709}
2710
2711/* Subroutine of do_line and do_linemarker. Convert escape sequences
2712 in a string, but do not perform character set conversion. */
2713bool
2714cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
2715 size_t count, cpp_string *to,
2716 enum cpp_ttype type)
2717{
2718 struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
2719 bool retval;
2720
2721 pfile->narrow_cset_desc.func = convert_no_conversion;
2722 pfile->narrow_cset_desc.cd = (iconv_t) -1;
2723 pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
2724
2725 retval = cpp_interpret_string (pfile, from, count, to,
2726 type: type == CPP_UNEVAL_STRING
2727 ? CPP_UNEVAL_STRING : CPP_STRING);
2728
2729 pfile->narrow_cset_desc = save_narrow_cset_desc;
2730 return retval;
2731}
2732
2733/* Convert a string FROM to TO, without handling of any UCNs etc., just
2734 pure character set conversion. If !REVERSE, convert from SOURCE_CHARSET
2735 to execution charset corresponding to TYPE, if REVERSE, convert from the
2736 execution charset corresponding to TYPE to SOURCE_CHARSET. Return false
2737 on error. */
2738
2739bool
2740cpp_translate_string (cpp_reader *pfile, const cpp_string *from,
2741 cpp_string *to, enum cpp_ttype type, bool reverse)
2742{
2743 struct cset_converter cvt = converter_for_type (pfile, type);
2744 struct _cpp_strbuf tbuf;
2745 if (reverse)
2746 {
2747 struct cset_converter *pcvt;
2748 switch (type)
2749 {
2750 default:
2751 pcvt = &pfile->reverse_narrow_cset_desc;
2752 break;
2753 case CPP_UTF8CHAR:
2754 case CPP_UTF8STRING:
2755 pcvt = &pfile->reverse_utf8_cset_desc;
2756 break;
2757 case CPP_CHAR16:
2758 case CPP_STRING16:
2759 case CPP_CHAR32:
2760 case CPP_STRING32:
2761 case CPP_WCHAR:
2762 case CPP_WSTRING:
2763 return false;
2764 }
2765 if (pcvt->func == NULL)
2766 {
2767 *pcvt = init_iconv_desc (pfile, to: cvt.from, from: cvt.to);
2768 pcvt->width = cvt.width;
2769 }
2770 cvt = *pcvt;
2771 }
2772 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
2773 tbuf.text = XNEWVEC (uchar, tbuf.asize);
2774 tbuf.len = 0;
2775 if (!APPLY_CONVERSION (cvt, from->text, from->len, &tbuf))
2776 {
2777 XDELETEVEC (tbuf.text);
2778 return false;
2779 }
2780 tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
2781 to->text = tbuf.text;
2782 to->len = tbuf.len;
2783 return true;
2784}
2785
2786/* Return true if ID is a valid identifier, false otherwise. Without any
2787 diagnostics. */
2788
2789bool
2790cpp_valid_identifier (cpp_reader *pfile, const unsigned char *id)
2791{
2792 normalize_state nst = INITIAL_NORMALIZE_STATE;
2793 const unsigned char *p = id;
2794 if (*p == '\0')
2795 return false;
2796 const unsigned char *limit
2797 = (const unsigned char *) strchr (s: (const char *) p, c: '\0');
2798 static const cppchar_t utf8_signifier = 0xC0;
2799 if (ISIDST (*p))
2800 {
2801 NORMALIZE_STATE_UPDATE_IDNUM (&nst, *p);
2802 ++p;
2803 }
2804 while (*p)
2805 {
2806 if (p != id && ISIDNUM (*p))
2807 {
2808 while (ISIDNUM (*p))
2809 ++p;
2810 NORMALIZE_STATE_UPDATE_IDNUM (&nst, *(p - 1));
2811 continue;
2812 }
2813 if (CPP_OPTION (pfile, extended_identifiers) && *p >= utf8_signifier)
2814 {
2815 const unsigned char *base = p;
2816 size_t inbytesleft = limit - p;
2817 cppchar_t c;
2818 if (one_utf8_to_cppchar (inbufp: &p, inbytesleftp: &inbytesleft, cp: &c))
2819 return false;
2820 switch (ucn_valid_in_identifier (pfile, c, nst: &nst))
2821 {
2822 default:
2823 return false;
2824 case 1:
2825 continue;
2826 case 2:
2827 if (base == id)
2828 return false;
2829 continue;
2830 }
2831 }
2832 return false;
2833 }
2834 return true;
2835}
2836
2837
2838/* Return number of source characters in STR. */
2839static unsigned
2840count_source_chars (cpp_reader *pfile, cpp_string str, cpp_ttype type)
2841{
2842 cpp_string str2 = { .len: 0, .text: 0 };
2843 cpp_auto_suppress_diagnostics suppress {pfile};
2844 convert_f save_func = pfile->narrow_cset_desc.func;
2845 pfile->narrow_cset_desc.func = convert_count_chars;
2846 bool ret = cpp_interpret_string (pfile, from: &str, count: 1, to: &str2, type);
2847 pfile->narrow_cset_desc.func = save_func;
2848 if (ret)
2849 {
2850 if (str2.text != str.text)
2851 free (ptr: (void *)str2.text);
2852 return str2.len;
2853 }
2854 else
2855 return 0;
2856}
2857
2858/* Subroutine of cpp_interpret_charconst which performs the conversion
2859 to a number, for narrow strings. STR is the string structure returned
2860 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
2861 cpp_interpret_charconst. TOKEN is the token. */
2862static cppchar_t
2863narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
2864 unsigned int *pchars_seen, int *unsignedp,
2865 const cpp_token *token)
2866{
2867 enum cpp_ttype type = token->type;
2868 size_t width = CPP_OPTION (pfile, char_precision);
2869 size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
2870 size_t mask = width_to_mask (width);
2871 size_t i;
2872 cppchar_t result, c;
2873 bool unsigned_p;
2874 bool diagnosed = false;
2875
2876 /* The value of a multi-character character constant, or a
2877 single-character character constant whose representation in the
2878 execution character set is more than one byte long, is
2879 implementation defined. This implementation defines it to be the
2880 number formed by interpreting the byte sequence in memory as a
2881 big-endian binary number. If overflow occurs, the high bytes are
2882 lost, and a warning is issued.
2883
2884 We don't want to process the NUL terminator handed back by
2885 cpp_interpret_string. */
2886 result = 0;
2887 for (i = 0; i < str.len - 1; i++)
2888 {
2889 c = str.text[i] & mask;
2890 if (width < BITS_PER_CPPCHAR_T)
2891 result = (result << width) | c;
2892 else
2893 result = c;
2894 }
2895
2896 if (type == CPP_UTF8CHAR)
2897 max_chars = 1;
2898 else if (i > 1 && CPP_OPTION (pfile, cplusplus))
2899 {
2900 /* C++ as a DR since
2901 P1854R4 - Making non-encodable string literals ill-formed
2902 makes multi-character narrow character literals if any of the
2903 characters in the literal isn't encodable in char/unsigned char
2904 ill-formed. We need to count the number of c-chars and compare
2905 that to str.len. */
2906 unsigned src_chars = count_source_chars (pfile, str: token->val.str, type);
2907
2908 if (src_chars)
2909 {
2910 if (str.len > src_chars)
2911 {
2912 if (src_chars <= 2)
2913 diagnosed
2914 = cpp_pedwarning (pfile, CPP_W_PEDANTIC,
2915 msgid: "character not encodable in a single "
2916 "execution character code unit");
2917 else
2918 diagnosed
2919 = cpp_pedwarning (pfile, CPP_W_PEDANTIC,
2920 msgid: "at least one character in a multi-"
2921 "character literal not encodable in a "
2922 "single execution character code unit");
2923 if (diagnosed && i > max_chars)
2924 i = max_chars;
2925 }
2926 }
2927 }
2928 if (diagnosed)
2929 /* Already diagnosed above. */;
2930 else if (i > max_chars)
2931 {
2932 unsigned src_chars
2933 = count_source_chars (pfile, str: token->val.str,
2934 type: type == CPP_UTF8CHAR ? CPP_CHAR : type);
2935
2936 if (type != CPP_UTF8CHAR)
2937 cpp_error (pfile, CPP_DL_WARNING,
2938 msgid: "multi-character literal with %ld characters exceeds "
2939 "%<int%> size of %ld bytes", (long) i, (long) max_chars);
2940 else if (src_chars > 2)
2941 cpp_error (pfile, CPP_DL_ERROR,
2942 msgid: "multi-character literal cannot have an encoding prefix");
2943 else
2944 cpp_error (pfile, CPP_DL_ERROR,
2945 msgid: "character not encodable in a single code unit");
2946 i = max_chars;
2947 }
2948 else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
2949 cpp_warning (pfile, CPP_W_MULTICHAR, msgid: "multi-character character constant");
2950
2951 /* Multichar constants are of type int and therefore signed. */
2952 if (i > 1)
2953 unsigned_p = 0;
2954 else if (type == CPP_UTF8CHAR)
2955 unsigned_p = CPP_OPTION (pfile, unsigned_utf8char);
2956 else
2957 unsigned_p = CPP_OPTION (pfile, unsigned_char);
2958
2959 /* Truncate the constant to its natural width, and simultaneously
2960 sign- or zero-extend to the full width of cppchar_t.
2961 For single-character constants, the value is WIDTH bits wide.
2962 For multi-character constants, the value is INT_PRECISION bits wide. */
2963 if (i > 1)
2964 width = CPP_OPTION (pfile, int_precision);
2965 if (width < BITS_PER_CPPCHAR_T)
2966 {
2967 mask = ((cppchar_t) 1 << width) - 1;
2968 if (unsigned_p || !(result & (1 << (width - 1))))
2969 result &= mask;
2970 else
2971 result |= ~mask;
2972 }
2973 *pchars_seen = i;
2974 *unsignedp = unsigned_p;
2975 return result;
2976}
2977
2978/* Subroutine of cpp_interpret_charconst which performs the conversion
2979 to a number, for wide strings. STR is the string structure returned
2980 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
2981 cpp_interpret_charconst. TOKEN is the token. */
2982static cppchar_t
2983wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
2984 unsigned int *pchars_seen, int *unsignedp,
2985 const cpp_token *token)
2986{
2987 enum cpp_ttype type = token->type;
2988 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
2989 size_t width = converter_for_type (pfile, type).width;
2990 size_t cwidth = CPP_OPTION (pfile, char_precision);
2991 size_t mask = width_to_mask (width);
2992 size_t cmask = width_to_mask (width: cwidth);
2993 size_t nbwc = width / cwidth;
2994 size_t off, i;
2995 cppchar_t result = 0, c;
2996
2997 if (str.len <= nbwc)
2998 {
2999 /* Error recovery, if no errors have been diagnosed previously,
3000 there should be at least two wide characters. Empty literals
3001 are diagnosed earlier and we can get just the zero terminator
3002 only if there were errors diagnosed during conversion. */
3003 *pchars_seen = 0;
3004 *unsignedp = 0;
3005 return 0;
3006 }
3007
3008 /* This is finicky because the string is in the target's byte order,
3009 which may not be our byte order. Only the last character, ignoring
3010 the NUL terminator, is relevant. */
3011 off = str.len - (nbwc * 2);
3012 result = 0;
3013 for (i = 0; i < nbwc; i++)
3014 {
3015 c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
3016 result = (result << cwidth) | (c & cmask);
3017 }
3018
3019 /* Wide character constants have type wchar_t, and a single
3020 character exactly fills a wchar_t, so a multi-character wide
3021 character constant is guaranteed to overflow. */
3022 if (str.len > nbwc * 2)
3023 {
3024 cpp_diagnostic_level level = CPP_DL_WARNING;
3025 unsigned src_chars
3026 = count_source_chars (pfile, str: token->val.str, type: CPP_CHAR);
3027
3028 if (CPP_OPTION (pfile, cplusplus)
3029 && (type == CPP_CHAR16
3030 || type == CPP_CHAR32
3031 /* In C++23 this is error even for L'ab'. */
3032 || (type == CPP_WCHAR
3033 && CPP_OPTION (pfile, size_t_literals))))
3034 level = CPP_DL_ERROR;
3035 if (src_chars > 2)
3036 cpp_error (pfile, level,
3037 msgid: "multi-character literal cannot have an encoding prefix");
3038 else
3039 cpp_error (pfile, level,
3040 msgid: "character not encodable in a single code unit");
3041 }
3042
3043 /* Truncate the constant to its natural width, and simultaneously
3044 sign- or zero-extend to the full width of cppchar_t. */
3045 if (width < BITS_PER_CPPCHAR_T)
3046 {
3047 if (type == CPP_CHAR16 || type == CPP_CHAR32
3048 || CPP_OPTION (pfile, unsigned_wchar)
3049 || !(result & (1 << (width - 1))))
3050 result &= mask;
3051 else
3052 result |= ~mask;
3053 }
3054
3055 if (type == CPP_CHAR16 || type == CPP_CHAR32
3056 || CPP_OPTION (pfile, unsigned_wchar))
3057 *unsignedp = 1;
3058 else
3059 *unsignedp = 0;
3060
3061 *pchars_seen = 1;
3062 return result;
3063}
3064
3065/* Interpret a (possibly wide) character constant in TOKEN.
3066 PCHARS_SEEN points to a variable that is filled in with the number
3067 of characters seen, and UNSIGNEDP to a variable that indicates
3068 whether the result has signed type. */
3069cppchar_t
3070cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
3071 unsigned int *pchars_seen, int *unsignedp)
3072{
3073 cpp_string str = { .len: 0, .text: 0 };
3074 bool wide = (token->type != CPP_CHAR && token->type != CPP_UTF8CHAR);
3075 int u8 = 2 * int(token->type == CPP_UTF8CHAR);
3076 cppchar_t result;
3077
3078 /* An empty constant will appear as L'', u'', U'', u8'', or '' */
3079 if (token->val.str.len == (size_t) (2 + wide + u8))
3080 {
3081 cpp_error (pfile, CPP_DL_ERROR, msgid: "empty character constant");
3082 *pchars_seen = 0;
3083 *unsignedp = 0;
3084 return 0;
3085 }
3086 else if (!cpp_interpret_string (pfile, from: &token->val.str, count: 1, to: &str,
3087 type: token->type))
3088 {
3089 *pchars_seen = 0;
3090 *unsignedp = 0;
3091 return 0;
3092 }
3093
3094 if (wide)
3095 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
3096 token);
3097 else
3098 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp,
3099 token);
3100
3101 if (str.text != token->val.str.text)
3102 free (ptr: (void *)str.text);
3103
3104 return result;
3105}
3106
3107/* Convert an identifier denoted by ID and LEN, which might contain
3108 UCN escapes or UTF-8 multibyte chars, to the source character set,
3109 either UTF-8 or UTF-EBCDIC. Assumes that the identifier is actually
3110 a valid identifier. */
3111cpp_hashnode *
3112_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
3113{
3114 /* It turns out that a UCN escape always turns into fewer characters
3115 than the escape itself, so we can allocate a temporary in advance. */
3116 uchar * buf = (uchar *) alloca (len + 1);
3117 uchar * bufp = buf;
3118 size_t idp;
3119
3120 for (idp = 0; idp < len; idp++)
3121 if (id[idp] != '\\')
3122 *bufp++ = id[idp];
3123 else
3124 {
3125 unsigned length = id[idp + 1] == 'u' ? 4 : 8;
3126 cppchar_t value = 0;
3127 size_t bufleft = len - (bufp - buf);
3128 int rval;
3129 bool delimited = false;
3130
3131 idp += 2;
3132 if (id[idp - 1] == 'N' && id[idp] == '{')
3133 {
3134 idp++;
3135 const uchar *name = &id[idp];
3136 while (idp < len
3137 && (ISIDNUM (id[idp]) || id[idp] == ' ' || id[idp] == '-'))
3138 idp++;
3139 if (id[idp] == '}')
3140 {
3141 value = _cpp_uname2c (name: (const char *) name, len: &id[idp] - name,
3142 n: uname2c_tree, NULL);
3143 if (value == (cppchar_t) -1)
3144 value = 1;
3145 }
3146 else
3147 idp--;
3148 }
3149 else
3150 {
3151 if (length == 4 && id[idp] == '{')
3152 {
3153 delimited = true;
3154 idp++;
3155 }
3156 while (length && idp < len && ISXDIGIT (id[idp]))
3157 {
3158 value = (value << 4) + hex_value (id[idp]);
3159 idp++;
3160 if (!delimited)
3161 length--;
3162 }
3163 if (!delimited || id[idp] != '}')
3164 idp--;
3165 }
3166
3167 /* Special case for EBCDIC: if the identifier contains
3168 a '$' specified using a UCN, translate it to EBCDIC. */
3169 if (value == 0x24)
3170 {
3171 *bufp++ = '$';
3172 continue;
3173 }
3174
3175 rval = one_cppchar_to_utf8 (c: value, outbufp: &bufp, outbytesleftp: &bufleft);
3176 if (rval)
3177 {
3178 errno = rval;
3179 cpp_errno (pfile, CPP_DL_ERROR,
3180 msgid: "converting UCN to source character set");
3181 break;
3182 }
3183 }
3184
3185 return CPP_HASHNODE (ht_lookup (pfile->hash_table,
3186 buf, bufp - buf, HT_ALLOC));
3187}
3188
3189
/* Detect a UTF-8 byte order mark (EF BB BF) at the start of the
   DATA_LENGTH bytes beginning at DATA.  Returns how many leading bytes
   the caller should skip: 3 when the mark is present, otherwise 0.
   The check is only compiled in when the host charset is ASCII.  */
int
cpp_check_utf8_bom (const char *data, size_t data_length)
{
#if HOST_CHARSET == HOST_CHARSET_ASCII
  if (data_length >= 3)
    {
      /* Compare as unsigned bytes so the result does not depend on the
         signedness of plain char.  */
      const unsigned char *bytes = (const unsigned char *) data;
      if (bytes[0] == 0xef && bytes[1] == 0xbb && bytes[2] == 0xbf)
        return 3;
    }
#endif

  return 0;
}
3206
3207
3208/* Convert an input buffer (containing the complete contents of one
3209 source file) from INPUT_CHARSET to the source character set. INPUT
3210 points to the input buffer, SIZE is its allocated size, and LEN is
3211 the length of the meaningful data within the buffer. The
3212 translated buffer is returned, *ST_SIZE is set to the length of
3213 the meaningful data within the translated buffer, and *BUFFER_START
3214 is set to the start of the returned buffer. *BUFFER_START may
3215 differ from the return value in the case of a BOM or other ignored
3216 marker information.
3217
3218 INPUT is expected to have been allocated with xmalloc. This
3219 function will either set *BUFFER_START to INPUT, or free it and set
3220 *BUFFER_START to a pointer to another xmalloc-allocated block of
3221 memory.
3222
3223 PFILE is only used to generate diagnostics; setting it to NULL suppresses
3224 diagnostics, and causes a return of NULL if there was any error instead. */
3225
3226uchar *
3227_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
3228 uchar *input, size_t size, size_t len,
3229 const unsigned char **buffer_start, off_t *st_size)
3230{
3231 struct cset_converter input_cset;
3232 struct _cpp_strbuf to;
3233 unsigned char *buffer;
3234 size_t pad = CPP_BUFFER_PADDING;
3235
3236 input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, from: input_charset);
3237 if (input_cset.func == convert_no_conversion)
3238 {
3239 to.text = input;
3240 to.asize = size;
3241 to.len = len;
3242 }
3243 else
3244 {
3245 to.asize = MAX (65536, len);
3246 to.text = XNEWVEC (uchar, to.asize);
3247 to.len = 0;
3248
3249 const bool ok = APPLY_CONVERSION (input_cset, input, len, &to);
3250 free (ptr: input);
3251
3252 /* Clean up the mess. */
3253 if (input_cset.func == convert_using_iconv)
3254 iconv_close (input_cset.cd);
3255
3256 /* Handle conversion failure. */
3257 if (!ok)
3258 {
3259 if (!pfile)
3260 {
3261 XDELETEVEC (to.text);
3262 *buffer_start = NULL;
3263 *st_size = 0;
3264 return NULL;
3265 }
3266 cpp_error (pfile, CPP_DL_ERROR, msgid: "failure to convert %s to %s",
3267 input_charset, SOURCE_CHARSET);
3268 }
3269 }
3270
3271 /* Resize buffer if we allocated substantially too much, or if we
3272 don't have enough space for the following padding, which allows
3273 search_line_fast to use (possibly misaligned) vector loads. */
3274 if (to.len + 4096 < to.asize || to.len + pad > to.asize)
3275 to.text = XRESIZEVEC (uchar, to.text, to.len + pad);
3276
3277 memset (s: to.text + to.len, c: '\0', n: pad);
3278
3279 /* If the file is using old-school Mac line endings (\r only),
3280 terminate with another \r, not an \n, so that we do not mistake
3281 the \r\n sequence for a single DOS line ending and erroneously
3282 issue the "No newline at end of file" diagnostic. */
3283 if (to.len && to.text[to.len - 1] == '\r')
3284 to.text[to.len] = '\r';
3285 else
3286 to.text[to.len] = '\n';
3287
3288 buffer = to.text;
3289 *st_size = to.len;
3290
3291 /* Ignore a UTF-8 BOM if we see one and the source charset is UTF-8. Note
3292 that glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
3293 BOM -- however, even if it did, we would still need this code due
3294 to the 'convert_no_conversion' case. */
3295 const int bom_len = cpp_check_utf8_bom (data: (const char *) to.text, data_length: to.len);
3296 *st_size -= bom_len;
3297 buffer += bom_len;
3298
3299 *buffer_start = to.text;
3300 return buffer;
3301}
3302
3303/* Decide on the default encoding to assume for input files. */
3304const char *
3305_cpp_default_encoding (void)
3306{
3307 const char *current_encoding = NULL;
3308
3309 /* We disable this because the default codeset is 7-bit ASCII on
3310 most platforms, and this causes conversion failures on every
3311 file in GCC that happens to have one of the upper 128 characters
3312 in it -- most likely, as part of the name of a contributor.
3313 We should definitely recognize in-band markers of file encoding,
3314 like:
3315 - the appropriate Unicode byte-order mark (FE FF) to recognize
3316 UTF16 and UCS4 (in both big-endian and little-endian flavors)
3317 and UTF8
3318 - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
3319 distinguish ASCII and EBCDIC.
3320 - now we can parse something like "#pragma GCC encoding <xyz>
3321 on the first line, or even Emacs/VIM's mode line tags (there's
3322 a problem here in that VIM uses the last line, and Emacs has
3323 its more elaborate "local variables" convention).
3324 - investigate whether Java has another common convention, which
3325 would be friendly to support.
3326 (Zack Weinberg and Paolo Bonzini, May 20th 2004) */
3327#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
3328 setlocale (LC_CTYPE, "");
3329 current_encoding = nl_langinfo (CODESET);
3330#endif
3331 if (current_encoding == NULL || *current_encoding == '\0')
3332 current_encoding = SOURCE_CHARSET;
3333
3334 return current_encoding;
3335}
3336
3337/* Check if the configured input charset requires no conversion, other than
3338 possibly stripping a UTF-8 BOM. */
3339bool cpp_input_conversion_is_trivial (const char *input_charset)
3340{
3341 return !strcasecmp (s1: input_charset, SOURCE_CHARSET);
3342}
3343
3344/* Implementation of class cpp_string_location_reader. */
3345
3346/* Constructor for cpp_string_location_reader. */
3347
3348cpp_string_location_reader::
3349cpp_string_location_reader (location_t src_loc,
3350 line_maps *line_table)
3351{
3352 src_loc = get_range_from_loc (set: line_table, loc: src_loc).m_start;
3353
3354 /* SRC_LOC might be a macro location. It only makes sense to do
3355 column-by-column calculations on ordinary maps, so get the
3356 corresponding location in an ordinary map. */
3357 m_loc
3358 = linemap_resolve_location (line_table, loc: src_loc,
3359 lrk: LRK_SPELLING_LOCATION, NULL);
3360
3361 const line_map_ordinary *map
3362 = linemap_check_ordinary (map: linemap_lookup (line_table, m_loc));
3363 m_offset_per_column = (1 << map->m_range_bits);
3364}
3365
3366/* Get the range of the next source byte. */
3367
3368source_range
3369cpp_string_location_reader::get_next ()
3370{
3371 source_range result;
3372 result.m_start = m_loc;
3373 result.m_finish = m_loc;
3374 if (m_loc <= LINE_MAP_MAX_LOCATION_WITH_COLS)
3375 m_loc += m_offset_per_column;
3376 return result;
3377}
3378
3379cpp_display_width_computation::
3380cpp_display_width_computation (const char *data, int data_length,
3381 const cpp_char_column_policy &policy) :
3382 m_begin (data),
3383 m_next (m_begin),
3384 m_bytes_left (data_length),
3385 m_policy (policy),
3386 m_display_cols (0)
3387{
3388 gcc_assert (policy.m_tabstop > 0);
3389 gcc_assert (policy.m_width_cb);
3390}
3391
3392
3393/* The main implementation function for class cpp_display_width_computation.
3394 m_next points on entry to the start of the UTF-8 encoding of the next
3395 character, and is updated to point just after the last byte of the encoding.
3396 m_bytes_left contains on entry the remaining size of the buffer into which
3397 m_next points, and this is also updated accordingly. If m_next does not
3398 point to a valid UTF-8-encoded sequence, then it will be treated as a single
3399 byte with display width 1. m_cur_display_col is the current display column,
3400 relative to which tab stops should be expanded. Returns the display width of
3401 the codepoint just processed.
3402 If OUT is non-NULL, it is populated. */
3403
3404int
3405cpp_display_width_computation::process_next_codepoint (cpp_decoded_char *out)
3406{
3407 cppchar_t c;
3408 int next_width;
3409
3410 if (out)
3411 out->m_start_byte = m_next;
3412
3413 if (*m_next == '\t')
3414 {
3415 ++m_next;
3416 --m_bytes_left;
3417 next_width = m_policy.m_tabstop - (m_display_cols % m_policy.m_tabstop);
3418 if (out)
3419 {
3420 out->m_ch = '\t';
3421 out->m_valid_ch = true;
3422 }
3423 }
3424 else if (one_utf8_to_cppchar (inbufp: (const uchar **) &m_next, inbytesleftp: &m_bytes_left, cp: &c)
3425 != 0)
3426 {
3427 /* Input is not convertible to UTF-8. This could be fine, e.g. in a
3428 string literal, so don't complain. Just treat it as if it has a width
3429 of one. */
3430 ++m_next;
3431 --m_bytes_left;
3432 next_width = m_policy.m_undecoded_byte_width;
3433 if (out)
3434 out->m_valid_ch = false;
3435 }
3436 else
3437 {
3438 /* one_utf8_to_cppchar() has updated m_next and m_bytes_left for us. */
3439 next_width = m_policy.m_width_cb (c);
3440 if (out)
3441 {
3442 out->m_ch = c;
3443 out->m_valid_ch = true;
3444 }
3445 }
3446
3447 if (out)
3448 out->m_next_byte = m_next;
3449
3450 m_display_cols += next_width;
3451 return next_width;
3452}
3453
3454/* Utility to advance the byte stream by the minimum amount needed to consume
3455 N display columns. Returns the number of display columns that were
3456 actually skipped. This could be less than N, if there was not enough data,
3457 or more than N, if the last character to be skipped had a sufficiently large
3458 display width. */
3459int
3460cpp_display_width_computation::advance_display_cols (int n)
3461{
3462 const int start = m_display_cols;
3463 const int target = start + n;
3464 while (m_display_cols < target && !done ())
3465 process_next_codepoint (NULL);
3466 return m_display_cols - start;
3467}
3468
3469/* For the string of length DATA_LENGTH bytes that begins at DATA, compute
3470 how many display columns are occupied by the first COLUMN bytes. COLUMN
3471 may exceed DATA_LENGTH, in which case the phantom bytes at the end are
3472 treated as if they have display width 1. Tabs are expanded to the next tab
3473 stop, relative to the start of DATA, and non-printable-ASCII characters
3474 will be escaped as per POLICY. */
3475
3476int
3477cpp_byte_column_to_display_column (const char *data, int data_length,
3478 int column,
3479 const cpp_char_column_policy &policy)
3480{
3481 const int offset = MAX (0, column - data_length);
3482 cpp_display_width_computation dw (data, column - offset, policy);
3483 while (!dw.done ())
3484 dw.process_next_codepoint (NULL);
3485 return dw.display_cols_processed () + offset;
3486}
3487
3488/* For the string of length DATA_LENGTH bytes that begins at DATA, compute
3489 the least number of bytes that will result in at least DISPLAY_COL display
3490 columns. The return value may exceed DATA_LENGTH if the entire string does
3491 not occupy enough display columns. Non-printable-ASCII characters
3492 will be escaped as per POLICY. */
3493
3494int
3495cpp_display_column_to_byte_column (const char *data, int data_length,
3496 int display_col,
3497 const cpp_char_column_policy &policy)
3498{
3499 cpp_display_width_computation dw (data, data_length, policy);
3500 const int avail_display = dw.advance_display_cols (n: display_col);
3501 return dw.bytes_processed () + MAX (0, display_col - avail_display);
3502}
3503
3504template <typename PropertyType>
3505PropertyType
3506get_cppchar_property (cppchar_t c,
3507 const cppchar_t *range_ends,
3508 const PropertyType *range_values,
3509 size_t num_ranges,
3510 PropertyType default_value)
3511{
3512 if (__builtin_expect (c <= range_ends[0], true))
3513 return range_values[0];
3514
3515 /* Binary search the tables. */
3516 int begin = 1;
3517 static const int end = num_ranges;
3518 int len = end - begin;
3519 do
3520 {
3521 int half = len/2;
3522 int middle = begin + half;
3523 if (c > range_ends[middle])
3524 {
3525 begin = middle + 1;
3526 len -= half + 1;
3527 }
3528 else
3529 len = half;
3530 } while (len);
3531
3532 if (__builtin_expect (begin != end, true))
3533 return range_values[begin];
3534
3535 return default_value;
3536}
3537
3538/* Our own version of wcwidth(). We don't use the actual wcwidth() in glibc,
3539 because that will inspect the user's locale, and in particular in an ASCII
3540 locale, it will not return anything useful for extended characters. But GCC
3541 in other respects (see e.g. _cpp_default_encoding()) behaves as if
3542 everything is UTF-8. We also make some tweaks that are useful for the way
3543 GCC needs to use this data, e.g. tabs and other control characters should be
3544 treated as having width 1. The lookup tables are generated from
3545 contrib/unicode/gen_wcwidth.py and were made by simply calling glibc
3546 wcwidth() on all codepoints, then applying the small tweaks. These tables
3547 are not highly optimized, but for the present purpose of outputting
3548 diagnostics, they are sufficient. */
3549
3550#include "generated_cpp_wcwidth.h"
3551
3552int
3553cpp_wcwidth (cppchar_t c)
3554{
3555 const size_t num_ranges
3556 = sizeof wcwidth_range_ends / sizeof (*wcwidth_range_ends);
3557 return get_cppchar_property<unsigned char > (c,
3558 range_ends: &wcwidth_range_ends[0],
3559 range_values: &wcwidth_widths[0],
3560 num_ranges,
3561 default_value: 1);
3562}
3563
3564#include "combining-chars.inc"
3565
3566bool
3567cpp_is_combining_char (cppchar_t c)
3568{
3569 const size_t num_ranges
3570 = sizeof combining_range_ends / sizeof (*combining_range_ends);
3571 return get_cppchar_property<bool> (c,
3572 range_ends: &combining_range_ends[0],
3573 range_values: &is_combining[0],
3574 num_ranges,
3575 default_value: false);
3576}
3577
3578#include "printable-chars.inc"
3579
3580bool
3581cpp_is_printable_char (cppchar_t c)
3582{
3583 const size_t num_ranges
3584 = sizeof printable_range_ends / sizeof (*printable_range_ends);
3585 return get_cppchar_property<bool> (c,
3586 range_ends: &printable_range_ends[0],
3587 range_values: &is_printable[0],
3588 num_ranges,
3589 default_value: false);
3590}
3591

source code of libcpp/charset.cc