1/* CPP Library - lexical analysis.
2 Copyright (C) 2000-2025 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7
8This program is free software; you can redistribute it and/or modify it
9under the terms of the GNU General Public License as published by the
10Free Software Foundation; either version 3, or (at your option) any
11later version.
12
13This program is distributed in the hope that it will be useful,
14but WITHOUT ANY WARRANTY; without even the implied warranty of
15MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16GNU General Public License for more details.
17
18You should have received a copy of the GNU General Public License
19along with this program; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#include "system.h"
24#include "cpplib.h"
25#include "internal.h"
26
/* Spelling category stored in each entry of the token spelling table
   (see the CATEGORY field of struct token_spelling).  */
enum spell_type
{
  SPELL_OPERATOR = 0,   /* Category given to every OP table entry.  */
  SPELL_IDENT,          /* NOTE(review): presumably identifier-like
                           tokens -- confirm against TTYPE_TABLE.  */
  SPELL_LITERAL,        /* NOTE(review): presumably tokens spelled by
                           their literal text -- confirm.  */
  SPELL_NONE            /* NOTE(review): presumably tokens with no
                           spelling of their own -- confirm.  */
};
34
/* One entry of the token spelling table: a spelling category paired
   with the string recorded for that token type.  */
struct token_spelling
{
  enum spell_type category;     /* How tokens of this type are spelled.  */
  const unsigned char *name;    /* String supplied by the OP/TK macros.  */
};
40
/* Spellings of the digraphs.  NOTE(review): the element order is
   presumably relied upon by the code that indexes this array, which is
   not visible here -- confirm before reordering.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Build the spelling table by expanding TTYPE_TABLE.  An OP entry is
   always SPELL_OPERATOR and uses the string S as its name; a TK entry
   takes its category from S (pasted onto SPELL_) and uses the
   stringified enumerator E as its name.  The helper macros are
   undefined again once the table has been built.  */
#define OP(e, s) { SPELL_OPERATOR, UC s },
#define TK(e, s) { SPELL_ ## s, UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Look up the spelling category and the name of TOKEN via its type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)

/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.  */
#define UCS_LIMIT 0x10FFFF
55
/* Forward declarations of helpers that are local to this file.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
69
70
71/* Utility routine:
72
73 Compares, the token TOKEN to the NUL-terminated string STRING.
74 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
75int
76cpp_ideq (const cpp_token *token, const char *string)
77{
78 if (token->type != CPP_NAME)
79 return 0;
80
81 return !ustrcmp (NODE_NAME (token->val.node.node), s2: (const uchar *) string);
82}
83
84/* Record a note TYPE at byte POS into the current cleaned logical
85 line. */
86static void
87add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
88{
89 if (buffer->notes_used == buffer->notes_cap)
90 {
91 buffer->notes_cap = buffer->notes_cap * 2 + 200;
92 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
93 buffer->notes_cap);
94 }
95
96 buffer->notes[buffer->notes_used].pos = pos;
97 buffer->notes[buffer->notes_used].type = type;
98 buffer->notes_used++;
99}
100
101
102/* Fast path to find line special characters using optimized character
103 scanning algorithms. Anything complicated falls back to the slow
104 path below. Since this loop is very hot it's worth doing these kinds
105 of optimizations.
106
107 One of the paths through the ifdefs should provide
108
109 const uchar *search_line_fast (const uchar *s, const uchar *end);
110
111 Between S and END, search for \n, \r, \\, ?. Return a pointer to
112 the found character.
113
114 Note that the last character of the buffer is *always* a newline,
115 as forced by _cpp_convert_input. This fact can be used to avoid
116 explicitly looking for the end of the buffer. */
117
/* Configure gives us an ifdef test.  Make sure the macro always has a
   value so that it can be used in ordinary `if' conditions below.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif

/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array bound
   evaluates to 1 for an accepted size and to -1 otherwise.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
137
138/* Return X with the first N bytes forced to values that won't match one
139 of the interesting characters. Note that NUL is not interesting. */
140
141static inline word_type
142acc_char_mask_misalign (word_type val, unsigned int n)
143{
144 word_type mask = -1;
145 if (WORDS_BIGENDIAN)
146 mask >>= n * 8;
147 else
148 mask <<= n * 8;
149 return val & mask;
150}
151
152/* Return X replicated to all byte positions within WORD_TYPE. */
153
154static inline word_type
155acc_char_replicate (uchar x)
156{
157 word_type ret;
158
159 ret = (x << 24) | (x << 16) | (x << 8) | x;
160 if (sizeof(word_type) == 8)
161 ret = (ret << 16 << 16) | ret;
162 return ret;
163}
164
165/* Return non-zero if some byte of VAL is (probably) C. */
166
167static inline word_type
168acc_char_cmp (word_type val, word_type c)
169{
170#if defined(__GNUC__) && defined(__alpha__)
171 /* We can get exact results using a compare-bytes instruction.
172 Get (val == c) via (0 >= (val ^ c)). */
173 return __builtin_alpha_cmpbge (0, val ^ c);
174#else
175 word_type magic = 0x7efefefeU;
176 if (sizeof(word_type) == 8)
177 magic = (magic << 16 << 16) | 0xfefefefeU;
178 magic |= 1;
179
180 val ^= c;
181 return ((val + magic) ^ ~val) & ~magic;
182#endif
183}
184
185/* Given the result of acc_char_cmp is non-zero, return the index of
186 the found character. If this was a false positive, return -1. */
187
188static inline int
189acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
190 word_type val ATTRIBUTE_UNUSED)
191{
192#if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
193 /* The cmpbge instruction sets *bits* of the result corresponding to
194 matches in the bytes with no false positives. */
195 return __builtin_ctzl (cmp);
196#else
197 unsigned int i;
198
199 /* ??? It would be nice to force unrolling here,
200 and have all of these constants folded. */
201 for (i = 0; i < sizeof(word_type); ++i)
202 {
203 uchar c;
204 if (WORDS_BIGENDIAN)
205 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
206 else
207 c = (val >> i * 8) & 0xff;
208
209 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
210 return i;
211 }
212
213 return -1;
214#endif
215}
216
217/* A version of the fast scanner using bit fiddling techniques.
218
219 For 32-bit words, one would normally perform 16 comparisons and
220 16 branches. With this algorithm one performs 24 arithmetic
221 operations and one branch. Whether this is faster with a 32-bit
222 word size is going to be somewhat system dependent.
223
224 For 64-bit words, we eliminate twice the number of comparisons
225 and branches without increasing the number of arithmetic operations.
226 It's almost certainly going to be a win with 64-bit word size. */
227
228static inline const uchar *
229search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
230{
231 const word_type repl_nl = acc_char_replicate (x: '\n');
232 const word_type repl_cr = acc_char_replicate (x: '\r');
233 const word_type repl_bs = acc_char_replicate (x: '\\');
234 const word_type repl_qm = acc_char_replicate (x: '?');
235
236 unsigned int misalign;
237 const word_type *p;
238 word_type val, t;
239
240 /* Align the buffer. Mask out any bytes from before the beginning. */
241 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
242 val = *p;
243 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
244 if (misalign)
245 val = acc_char_mask_misalign (val, n: misalign);
246
247 /* Main loop. */
248 while (1)
249 {
250 t = acc_char_cmp (val, c: repl_nl);
251 t |= acc_char_cmp (val, c: repl_cr);
252 t |= acc_char_cmp (val, c: repl_bs);
253 t |= acc_char_cmp (val, c: repl_qm);
254
255 if (__builtin_expect (t != 0, 0))
256 {
257 int i = acc_char_index (cmp: t, val);
258 if (i >= 0)
259 return (const uchar *)p + i;
260 }
261
262 val = *++p;
263 }
264}
265
/* Disable on Solaris 2/x86 until the following problem can be properly
   autoconfed:

   The Solaris 10+ assembler tags objects with the instruction set
   extensions used, so SSE4.2 executables cannot run on machines that
   don't support that extension.  */

#if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))

/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 holds '\n', row 1 '\r', row 2 '\\' and row 3 '?'.  Each row is
   16 bytes and the array is 16-byte aligned, so a row can be read as a
   single 16-byte vector.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
289
290
/* A version of the fast scanner using SSE2 vectorized byte compare insns.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused: the loop has no bounds check of its own and keeps
   loading 16-byte blocks until one of those characters is seen.  */

static inline const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  /* One vector per searched character, holding 16 copies of it.  */
  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v16qi *p;
  v16qi data, t;

  /* Align the source pointer.  The first load reads the whole aligned
     block containing S, which may include bytes that precede S.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 16 bytes at a time.  The loop is entered in
     the middle so that the first block, already loaded above, is tested
     with the misalignment mask; every later iteration loads the next
     block and uses an all-ones mask.  */
  goto start;
  do
    {
      data = *++p;
      mask = -1;

    start:
      t = data == repl_nl;
      t |= data == repl_cr;
      t |= data == repl_bs;
      t |= data == repl_qm;
      /* Collapse the per-byte comparison results into one bit per byte.  */
      found = __builtin_ia32_pmovmskb128 (t);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
343
#ifdef HAVE_SSSE3
/* A version of the fast scanner using SSSE3 shuffle (PSHUFB) insns.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused.  Unlike the SSE2 variant, S is not aligned first: all
   loads are unaligned and start exactly at S.  */

static inline const uchar *
#ifndef __SSSE3__
__attribute__((__target__("ssse3")))
#endif
search_line_ssse3 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* Same vector type with alignment 1, used for the unaligned loads.  */
  typedef v16qi v16qi_u __attribute__ ((__aligned__ (1)));
  /* Helper vector for pshufb-based matching:
     each character C we're searching for is at position (C % 16).
     NOTE(review): element 0 is 1 rather than 0, presumably so that a
     NUL data byte, which selects element 0, does not compare equal to
     its own lookup result -- confirm against the PSHUFB definition.  */
  v16qi lut = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' };
  static_assert('\n' == 10 && '\r' == 13 && '\\' == 92 && '?' == 63,
                "host character encoding is ASCII");

  v16qi d1, d2, t1, t2;
  /* Unaligned loads, potentially using padding after the final newline.  */
  static_assert (CPP_BUFFER_PADDING >= 64, "");
  d1 = *(const v16qi_u *)s;
  d2 = *(const v16qi_u *)(s + 16);
  unsigned m1, m2, found;
  /* Process two 16-byte chunks per iteration.  The next two chunks are
     loaded before FOUND is tested, so the loop reads 32 bytes beyond
     the chunks in which the match is eventually found.  */
  do
    {
      /* Look every data byte up in LUT; a byte is a match when the
         lookup result equals the byte itself.  */
      t1 = __builtin_ia32_pshufb128 (lut, d1);
      t2 = __builtin_ia32_pshufb128 (lut, d2);
      m1 = __builtin_ia32_pmovmskb128 (t1 == d1);
      m2 = __builtin_ia32_pmovmskb128 (t2 == d2);
      s += 32;
      d1 = *(const v16qi_u *)s;
      d2 = *(const v16qi_u *)(s + 16);
      /* Bits 0-15 describe the first chunk, bits 16-31 the second.  */
      found = m1 + (m2 << 16);
    }
  while (!found);
  /* Prefer to compute 's - 32' here, not spend an extra instruction
     to make a copy of the previous value of 's' in the loop.  */
  __asm__ ("" : "+r"(s));
  return s - 32 + __builtin_ctz (found);
}

#else
/* Work around out-dated assemblers without SSSE3 support.  */
#define search_line_ssse3 search_line_sse2
#endif
390
#ifdef __SSSE3__
/* No need for CPU probing, just use the best available variant.  */
#define search_line_fast search_line_ssse3
#else
/* Check the CPU capabilities.  */

#include "../gcc/config/i386/cpuid.h"

/* SEARCH_LINE_FAST is a function pointer in this configuration.  Its
   static initializer is the best variant known at compile time: the
   SSE2 scanner when __SSE2__ is defined, the generic word-at-a-time
   scanner otherwise.  */
typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
static search_line_fast_type search_line_fast
#if defined(__SSE2__)
  = search_line_sse2;
#else
  = search_line_acc_char;
#endif
406
407#define HAVE_init_vectorized_lexer 1
408static inline void
409init_vectorized_lexer (void)
410{
411 unsigned ax, bx, cx, dx;
412
413 if (!__get_cpuid (1, &ax, &bx, &cx, &dx))
414 return;
415
416 if (cx & bit_SSSE3)
417 search_line_fast = search_line_ssse3;
418 else if (dx & bit_SSE2)
419 search_line_fast = search_line_sse2;
420}
421#endif
422
423#elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
424
/* A version of the fast scanner using AltiVec vectorized byte compares
   and VSX unaligned loads (when VSX is available).  This is otherwise
   the same as the AltiVec version.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused; the loop only stops once such a character is found.  */

ATTRIBUTE_NO_SANITIZE_UNDEFINED
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  /* One vector per searched character, holding 16 copies of it.  */
  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc zero = { 0 };

  vc data, t;

  /* Main loop processing 16 bytes at a time.  The loads start exactly
     at S, so no misalignment mask is needed.  */
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      data = __builtin_vec_vsx_ld (0, s);
      s += 16;

      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  /* Restore s to point to the 16 bytes we just processed.  */
  s -= 16;

  {
    /* Number of unsigned longs that make up one vector.  */
#define N (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that is skipped.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHRU */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  The first byte in memory is
       the most significant one on big-endian and the least significant
       one on little-endian, hence the different bit scans.  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}
524
525#elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
526
/* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused; the loop only stops once such a character is found.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  /* One vector per searched character, holding 16 copies of it.  */
  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The loop is entered in
     the middle so that the first block, loaded and masked above, is
     tested before S is advanced.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
    /* Number of unsigned longs that make up one vector.  */
#define N (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that is skipped.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHROUGH */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
643
644#elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
645#include "arm_neon.h"
646
/* This doesn't have to be the exact page size, but no system may use
   a size smaller than this.  ARMv8 requires a minimum page size of
   4k.  The impact of being conservative here is a small number of
   cases will take the slightly slower entry path into the main
   loop.  */

#define AARCH64_MIN_PAGE_SIZE 4096

/* AArch64 NEON version of the fast scanner.  Returns a pointer to the
   first '\n', '\r', '\\' or '?' at or after S.  END is unused; the main
   loop only stops once such a character is found.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* A distinct bit for each byte of an 8-byte half.  ANDing the
     comparison result with this and summing the lanes (see below)
     yields a 16-bit mask with one bit per byte of the block.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  /* Per-lane shift counts that move the sums of one 8-byte half up by
     8 bits before the final horizontal add.  */
#ifdef __ARM_BIG_ENDIAN
  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
#else
  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
#endif

  unsigned int found;
  const uint8_t *p;
  uint8x16_t data;
  uint8x16_t t;
  uint16x8_t m;
  uint8x16_t u, v, w;

  /* Align the source pointer.  */
  p = (const uint8_t *)((uintptr_t)s & -16);

  /* Assuming random string start positions, with a 4k page size we'll take
     the slow path about 0.37% of the time.  */
  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
                         - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
                        < 16, 0))
    {
      /* Slow path: the string starts near a possible page boundary.
         Load the aligned block containing S instead of 16 bytes from S
         itself, and mask off the bytes that precede S.  */
      uint32_t misalign, mask;

      misalign = (uintptr_t)s & 15;
      mask = (-1u << misalign) & 0xffff;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      t = vandq_u8 (t, xmask);
      m = vpaddlq_u8 (t);
      m = vshlq_u16 (m, shift);
      found = vaddvq_u16 (m);
      found &= mask;
      if (found)
        return (const uchar*)p + __builtin_ctz (found);
    }
  else
    {
      /* Fast path: test the 16 bytes starting exactly at S.  */
      data = vld1q_u8 ((const uint8_t *) s);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      /* Any non-zero byte in T means a match somewhere in the block.  */
      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
        goto done;
    }

  /* Main loop: aligned 16-byte blocks, starting with the block that
     follows the aligned block containing S.  */
  do
    {
      p += 16;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
    } while (!vpaddd_u64 ((uint64x2_t)t));

done:
  /* Now that we've found the terminating substring, work out precisely where
     we need to stop.  */
  t = vandq_u8 (t, xmask);
  m = vpaddlq_u8 (t);
  m = vshlq_u16 (m, shift);
  found = vaddvq_u16 (m);
  /* If P is still below S, the match came from the unaligned load at S
     on the fast path, so the byte index is relative to S rather than
     to P.  */
  return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
          + __builtin_ctz (found));
}
738
739#elif defined (__ARM_NEON)
740#include "arm_neon.h"
741
/* NEON version of the fast scanner for targets that are not in the
   64-bit ARM state.  Returns a pointer to the first '\n', '\r', '\\' or
   '?' at or after S.  END is unused; the loop only stops once such a
   character is found.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* A distinct bit for each byte of an 8-byte half; used to turn the
     per-byte comparison results into a bit mask.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  The first load reads the whole aligned
     block containing S, which may include bytes that precede S.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The loop is entered in
     the middle so that the first block, already loaded above, is tested
     with the misalignment mask; later iterations use a full mask.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      mask = 0xffff;

    start:
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      /* Reduce the 16 single-bit bytes with pairwise additions, then
         fold the two halves together into one 16-bit mask.  */
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}
801
802#else
803
/* We only have one accelerated alternative, the generic word-at-a-time
   scanner.  Use a direct call so that we encourage inlining.  */

#define search_line_fast search_line_acc_char
808
809#endif
810
/* Initialize the lexer if needed.  When a run-time selectable scanner
   was compiled in (HAVE_init_vectorized_lexer is defined) this calls
   init_vectorized_lexer; otherwise it does nothing.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}
820
821/* Look for leading whitespace style issues on lines which don't contain
822 just whitespace.
823 For -Wleading-whitespace=spaces report if such lines contain leading
824 whitespace other than spaces.
825 For -Wleading-whitespace=tabs report if such lines contain leading
826 whitespace other than tabs.
827 For -Wleading-whitespace=blanks report if such lines contain leading
828 whitespace other than spaces+tabs, or contain in it tab after space,
829 or -ftabstop= or more consecutive spaces. */
830
831static void
832find_leading_whitespace_issues (cpp_reader *pfile, const uchar *s)
833{
834 const unsigned char *p = NULL;
835 uchar type = 'L';
836 switch (CPP_OPTION (pfile, cpp_warn_leading_whitespace))
837 {
838 case 1: /* spaces */
839 while (*s == ' ')
840 ++s;
841 break;
842 case 2: /* tabs */
843 while (*s == '\t')
844 ++s;
845 break;
846 case 3: /* blanks */
847 while (*s == '\t')
848 ++s;
849 int n;
850 n = CPP_OPTION (pfile, cpp_tabstop);
851 while (*s == ' ')
852 {
853 if (--n == 0)
854 break;
855 ++s;
856 }
857 if (*s == '\t')
858 type = 'T'; /* Tab after space. */
859 else if (*s == ' ')
860 type = 'S'; /* Too many spaces. */
861 break;
862 default:
863 abort ();
864 }
865 if (!IS_NVSPACE (*s))
866 return;
867 p = s++;
868 while (IS_NVSPACE (*s))
869 ++s;
870 if (*s != '\n' && *s != '\r')
871 add_line_note (buffer: pfile->buffer, pos: p, type);
872}
873
874/* Returns with a logical line that contains no escaped newlines or
875 trigraphs. This is a time-critical inner loop. */
876void
877_cpp_clean_line (cpp_reader *pfile)
878{
879 cpp_buffer *buffer;
880 const uchar *s;
881 uchar c, *d, *p;
882
883 buffer = pfile->buffer;
884 buffer->cur_note = buffer->notes_used = 0;
885 buffer->cur = buffer->line_base = buffer->next_line;
886 buffer->need_line = false;
887 s = buffer->next_line;
888
889 if (!buffer->from_stage3)
890 {
891 const uchar *pbackslash = NULL;
892 bool leading_ws_done = true;
893
894 if (CPP_OPTION (pfile, cpp_warn_leading_whitespace))
895 find_leading_whitespace_issues (pfile, s);
896
897 /* Fast path. This is the common case of an un-escaped line with
898 no trigraphs. The primary win here is by not writing any
899 data back to memory until we have to. */
900 while (1)
901 {
902 /* Perform an optimized search for \n, \r, \\, ?. */
903 s = search_line_fast (s, end: buffer->rlimit);
904
905 c = *s;
906 if (c == '\\')
907 {
908 /* Record the location of the backslash and continue. */
909 pbackslash = s++;
910 }
911 else if (__builtin_expect (c == '?', 0))
912 {
913 if (__builtin_expect (s[1] == '?', false)
914 && _cpp_trigraph_map[s[2]])
915 {
916 /* Have a trigraph. We may or may not have to convert
917 it. Add a line note regardless, for -Wtrigraphs. */
918 add_line_note (buffer, pos: s, type: s[2]);
919 if (CPP_OPTION (pfile, trigraphs))
920 {
921 /* We do, and that means we have to switch to the
922 slow path. */
923 d = (uchar *) s;
924 *d = _cpp_trigraph_map[s[2]];
925 s += 2;
926 goto slow_path;
927 }
928 }
929 /* Not a trigraph. Continue on fast-path. */
930 s++;
931 }
932 else
933 break;
934 }
935
936 /* This must be \r or \n. We're either done, or we'll be forced
937 to write back to the buffer and continue on the slow path. */
938 d = (uchar *) s;
939
940 if (__builtin_expect (s == buffer->rlimit, false))
941 goto done;
942
943 /* DOS line ending? */
944 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
945 {
946 s++;
947 if (s == buffer->rlimit)
948 goto done;
949 }
950
951 if (__builtin_expect (pbackslash == NULL, true))
952 goto done;
953
954 /* Check for escaped newline. */
955 p = d;
956 while (is_nvspace (p[-1]))
957 p--;
958 if (p - 1 != pbackslash)
959 goto done;
960
961 /* Have an escaped newline; process it and proceed to
962 the slow path. */
963 add_line_note (buffer, pos: p - 1, type: p != d ? ' ' : '\\');
964 d = p - 2;
965 buffer->next_line = p - 1;
966 leading_ws_done = false;
967
968 slow_path:
969 while (1)
970 {
971 c = *++s;
972 *++d = c;
973
974 if (c == '\n' || c == '\r')
975 {
976 if (CPP_OPTION (pfile, cpp_warn_leading_whitespace)
977 && !leading_ws_done)
978 find_leading_whitespace_issues (pfile, s: buffer->next_line);
979
980 /* Handle DOS line endings. */
981 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
982 s++;
983 if (s == buffer->rlimit)
984 break;
985
986 /* Escaped? */
987 p = d;
988 while (p != buffer->next_line && is_nvspace (p[-1]))
989 p--;
990 if (p == buffer->next_line || p[-1] != '\\')
991 break;
992
993 add_line_note (buffer, pos: p - 1, type: p != d ? ' ' : '\\');
994 d = p - 2;
995 buffer->next_line = p - 1;
996 leading_ws_done = false;
997 }
998 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
999 {
1000 if (CPP_OPTION (pfile, cpp_warn_leading_whitespace)
1001 && !leading_ws_done)
1002 {
1003 find_leading_whitespace_issues (pfile, s: buffer->next_line);
1004 leading_ws_done = true;
1005 }
1006
1007 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1008 add_line_note (buffer, pos: d, type: s[2]);
1009 if (CPP_OPTION (pfile, trigraphs))
1010 {
1011 *d = _cpp_trigraph_map[s[2]];
1012 s += 2;
1013 }
1014 }
1015 }
1016 done:
1017 if (d > buffer->next_line
1018 && CPP_OPTION (pfile, cpp_warn_trailing_whitespace))
1019 switch (CPP_OPTION (pfile, cpp_warn_trailing_whitespace))
1020 {
1021 case 1:
1022 if (ISBLANK (d[-1]))
1023 add_line_note (buffer, pos: d - 1, type: 'W');
1024 break;
1025 case 2:
1026 if (IS_NVSPACE (d[-1]) && d[-1])
1027 add_line_note (buffer, pos: d - 1, type: 'W');
1028 break;
1029 }
1030 }
1031 else
1032 {
1033 while (*s != '\n' && *s != '\r')
1034 s++;
1035 d = (uchar *) s;
1036
1037 /* Handle DOS line endings. */
1038 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1039 s++;
1040 }
1041
1042 *d = '\n';
1043 /* A sentinel note that should never be processed. */
1044 add_line_note (buffer, pos: d + 1, type: '\n');
1045 buffer->next_line = s + 1;
1046}
1047
/* Forward declaration; the definition is not in this part of the file.
   NOTE(review): the LEXING_RAW_STRING template parameter presumably
   selects behavior needed while a raw string is being lexed -- confirm
   at the definition.  */
template <bool lexing_raw_string>
static bool get_fresh_line_impl (cpp_reader *pfile);
1050
1051/* Return true if the trigraph indicated by NOTE should be warned
1052 about in a comment. */
1053static bool
1054warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1055{
1056 const uchar *p;
1057
1058 /* Within comments we don't warn about trigraphs, unless the
1059 trigraph forms an escaped newline, as that may change
1060 behavior. */
1061 if (note->type != '/')
1062 return false;
1063
1064 /* If -trigraphs, then this was an escaped newline iff the next note
1065 is coincident. */
1066 if (CPP_OPTION (pfile, trigraphs))
1067 return note[1].pos == note->pos;
1068
1069 /* Otherwise, see if this forms an escaped newline. */
1070 p = note->pos + 3;
1071 while (is_nvspace (*p))
1072 p++;
1073
1074 /* There might have been escaped newlines between the trigraph and the
1075 newline we found. Hence the position test. */
1076 return (*p == '\n' && p < note[1].pos);
1077}
1078
1079/* Process the notes created by add_line_note as far as the current
1080 location. */
1081void
1082_cpp_process_line_notes (cpp_reader *pfile, int in_comment)
1083{
1084 cpp_buffer *buffer = pfile->buffer;
1085
1086 for (;;)
1087 {
1088 _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1089 unsigned int col;
1090
1091 if (note->pos > buffer->cur)
1092 break;
1093
1094 buffer->cur_note++;
1095 col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1096
1097 if (note->type == '\\' || note->type == ' ')
1098 {
1099 if (note->type == ' ')
1100 {
1101 if (!in_comment)
1102 cpp_error_with_line (pfile, CPP_DL_WARNING,
1103 pfile->line_table->highest_line, col,
1104 msgid: "backslash and newline separated by "
1105 "space");
1106 else if (CPP_OPTION (pfile, cpp_warn_trailing_whitespace))
1107 cpp_warning_with_line (pfile, CPP_W_TRAILING_WHITESPACE,
1108 pfile->line_table->highest_line, col,
1109 msgid: "trailing whitespace");
1110 }
1111
1112 if (buffer->next_line > buffer->rlimit)
1113 {
1114 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1115 pfile->line_table->highest_line, col,
1116 msgid: "backslash-newline at end of file");
1117 /* Prevent "no newline at end of file" warning. */
1118 buffer->next_line = buffer->rlimit;
1119 }
1120
1121 buffer->line_base = note->pos;
1122 CPP_INCREMENT_LINE (pfile, 0);
1123 }
1124 else if (_cpp_trigraph_map[note->type])
1125 {
1126 if (CPP_OPTION (pfile, warn_trigraphs)
1127 && (!in_comment || warn_in_comment (pfile, note)))
1128 {
1129 if (CPP_OPTION (pfile, trigraphs))
1130 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1131 pfile->line_table->highest_line, col,
1132 msgid: "trigraph %<??%c%> converted to %<%c%>",
1133 note->type,
1134 (int) _cpp_trigraph_map[note->type]);
1135 else
1136 cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1137 pfile->line_table->highest_line, col,
1138 msgid: "trigraph %<??%c%> ignored, use "
1139 "%<-trigraphs%> to enable", note->type);
1140 }
1141 }
1142 else if (note->type == 'W')
1143 cpp_warning_with_line (pfile, CPP_W_TRAILING_WHITESPACE,
1144 pfile->line_table->highest_line, col,
1145 msgid: "trailing whitespace");
1146 else if (note->type == 'S')
1147 cpp_warning_with_line (pfile, CPP_W_LEADING_WHITESPACE,
1148 pfile->line_table->highest_line, col,
1149 msgid: "too many consecutive spaces in leading "
1150 "whitespace");
1151 else if (note->type == 'T')
1152 cpp_warning_with_line (pfile, CPP_W_LEADING_WHITESPACE,
1153 pfile->line_table->highest_line, col,
1154 msgid: "tab after space in leading whitespace");
1155 else if (note->type == 'L')
1156 switch (CPP_OPTION (pfile, cpp_warn_leading_whitespace))
1157 {
1158 case 1:
1159 cpp_warning_with_line (pfile, CPP_W_LEADING_WHITESPACE,
1160 pfile->line_table->highest_line, col,
1161 msgid: "whitespace other than spaces in leading "
1162 "whitespace");
1163 break;
1164 case 2:
1165 cpp_warning_with_line (pfile, CPP_W_LEADING_WHITESPACE,
1166 pfile->line_table->highest_line, col,
1167 msgid: "whitespace other than tabs in leading "
1168 "whitespace");
1169 break;
1170 case 3:
1171 cpp_warning_with_line (pfile, CPP_W_LEADING_WHITESPACE,
1172 pfile->line_table->highest_line, col,
1173 msgid: "whitespace other than spaces and tabs in "
1174 "leading whitespace");
1175 break;
1176 default:
1177 abort ();
1178 }
1179 else if (note->type == 0)
1180 /* Already processed in lex_raw_string. */;
1181 else
1182 abort ();
1183 }
1184}
1185
1186namespace bidi {
1187 enum class kind {
1188 NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
1189 };
1190
1191 /* All the UTF-8 encodings of bidi characters start with E2. */
1192 constexpr uchar utf8_start = 0xe2;
1193
1194 struct context
1195 {
1196 context () {}
1197 context (location_t loc, kind k, bool pdf, bool ucn)
1198 : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
1199 {
1200 }
1201
1202 kind get_pop_kind () const
1203 {
1204 return m_pdf ? kind::PDF : kind::PDI;
1205 }
1206 bool ucn_p () const
1207 {
1208 return m_ucn;
1209 }
1210
1211 location_t m_loc;
1212 kind m_kind;
1213 unsigned m_pdf : 1;
1214 unsigned m_ucn : 1;
1215 };
1216
1217 /* A vector holding currently open bidi contexts. We use a char for
1218 each context, its LSB is 1 if it represents a PDF context, 0 if it
1219 represents a PDI context. The next bit is 1 if this context was open
1220 by a bidi character written as a UCN, and 0 when it was UTF-8. */
1221 semi_embedded_vec <context, 16> vec;
1222
1223 /* Close the whole comment/identifier/string literal/character constant
1224 context. */
1225 void on_close ()
1226 {
1227 vec.truncate (len: 0);
1228 }
1229
1230 /* Pop the last element in the vector. */
1231 void pop ()
1232 {
1233 unsigned int len = vec.count ();
1234 gcc_checking_assert (len > 0);
1235 vec.truncate (len: len - 1);
1236 }
1237
1238 /* Return the pop kind of the context of the Ith element. */
1239 kind pop_kind_at (unsigned int i)
1240 {
1241 return vec[i].get_pop_kind ();
1242 }
1243
1244 /* Return the pop kind of the context that is currently opened. */
1245 kind current_ctx ()
1246 {
1247 unsigned int len = vec.count ();
1248 if (len == 0)
1249 return kind::NONE;
1250 return vec[len - 1].get_pop_kind ();
1251 }
1252
1253 /* Return true if the current context comes from a UCN origin, that is,
1254 the bidi char which started this bidi context was written as a UCN. */
1255 bool current_ctx_ucn_p ()
1256 {
1257 unsigned int len = vec.count ();
1258 gcc_checking_assert (len > 0);
1259 return vec[len - 1].m_ucn;
1260 }
1261
1262 location_t current_ctx_loc ()
1263 {
1264 unsigned int len = vec.count ();
1265 gcc_checking_assert (len > 0);
1266 return vec[len - 1].m_loc;
1267 }
1268
1269 /* We've read a bidi char, update the current vector as necessary.
1270 LOC is only valid when K is not kind::NONE. */
1271 void on_char (kind k, bool ucn_p, location_t loc)
1272 {
1273 switch (k)
1274 {
1275 case kind::LRE:
1276 case kind::RLE:
1277 case kind::LRO:
1278 case kind::RLO:
1279 vec.push (value: context (loc, k, true, ucn_p));
1280 break;
1281 case kind::LRI:
1282 case kind::RLI:
1283 case kind::FSI:
1284 vec.push (value: context (loc, k, false, ucn_p));
1285 break;
1286 /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
1287 whose scope has not yet been terminated. */
1288 case kind::PDF:
1289 if (current_ctx () == kind::PDF)
1290 pop ();
1291 break;
1292 /* PDI terminates the scope of the last LRI, RLI, or FSI whose
1293 scope has not yet been terminated, as well as the scopes of
1294 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
1295 yet been terminated. */
1296 case kind::PDI:
1297 for (int i = vec.count () - 1; i >= 0; --i)
1298 if (pop_kind_at (i) == kind::PDI)
1299 {
1300 vec.truncate (len: i);
1301 break;
1302 }
1303 break;
1304 case kind::LTR:
1305 case kind::RTL:
1306 /* These aren't popped by a PDF/PDI. */
1307 break;
1308 ATTR_LIKELY case kind::NONE:
1309 break;
1310 default:
1311 abort ();
1312 }
1313 }
1314
1315 /* Return a descriptive string for K. */
1316 const char *to_str (kind k)
1317 {
1318 switch (k)
1319 {
1320 case kind::LRE:
1321 return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
1322 case kind::RLE:
1323 return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
1324 case kind::LRO:
1325 return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
1326 case kind::RLO:
1327 return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
1328 case kind::LRI:
1329 return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
1330 case kind::RLI:
1331 return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
1332 case kind::FSI:
1333 return "U+2068 (FIRST STRONG ISOLATE)";
1334 case kind::PDF:
1335 return "U+202C (POP DIRECTIONAL FORMATTING)";
1336 case kind::PDI:
1337 return "U+2069 (POP DIRECTIONAL ISOLATE)";
1338 case kind::LTR:
1339 return "U+200E (LEFT-TO-RIGHT MARK)";
1340 case kind::RTL:
1341 return "U+200F (RIGHT-TO-LEFT MARK)";
1342 default:
1343 abort ();
1344 }
1345 }
1346}
1347
1348/* Get location_t for the range of bytes [START, START + NUM_BYTES)
1349 within the current line in FILE, with the caret at START. */
1350
1351static location_t
1352get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1353 const unsigned char *const start,
1354 size_t num_bytes)
1355{
1356 gcc_checking_assert (num_bytes > 0);
1357
1358 /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1359 to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1360 whereas linemap_position_for_column is 1-based. */
1361
1362 /* Get 0-based offsets within the line. */
1363 size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1364 size_t end_offset = start_offset + num_bytes - 1;
1365
1366 /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1367 location_t start_loc = linemap_position_for_column (pfile->line_table,
1368 start_offset + 1);
1369 location_t end_loc = linemap_position_for_column (pfile->line_table,
1370 end_offset + 1);
1371
1372 if (start_loc == end_loc)
1373 return start_loc;
1374
1375 source_range src_range;
1376 src_range.m_start = start_loc;
1377 src_range.m_finish = end_loc;
1378 location_t combined_loc
1379 = pfile->line_table->get_or_create_combined_loc (locus: start_loc,
1380 src_range,
1381 data: nullptr,
1382 discriminator: 0);
1383 return combined_loc;
1384}
1385
1386/* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1387
1388static bidi::kind
1389get_bidi_utf8_1 (const unsigned char *const p)
1390{
1391 gcc_checking_assert (p[0] == bidi::utf8_start);
1392
1393 if (p[1] == 0x80)
1394 switch (p[2])
1395 {
1396 case 0xaa:
1397 return bidi::kind::LRE;
1398 case 0xab:
1399 return bidi::kind::RLE;
1400 case 0xac:
1401 return bidi::kind::PDF;
1402 case 0xad:
1403 return bidi::kind::LRO;
1404 case 0xae:
1405 return bidi::kind::RLO;
1406 case 0x8e:
1407 return bidi::kind::LTR;
1408 case 0x8f:
1409 return bidi::kind::RTL;
1410 default:
1411 break;
1412 }
1413 else if (p[1] == 0x81)
1414 switch (p[2])
1415 {
1416 case 0xa6:
1417 return bidi::kind::LRI;
1418 case 0xa7:
1419 return bidi::kind::RLI;
1420 case 0xa8:
1421 return bidi::kind::FSI;
1422 case 0xa9:
1423 return bidi::kind::PDI;
1424 default:
1425 break;
1426 }
1427
1428 return bidi::kind::NONE;
1429}
1430
1431/* Parse a sequence of 3 bytes starting with P and return its bidi code.
1432 If the kind is not NONE, write the location to *OUT.*/
1433
1434static bidi::kind
1435get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1436{
1437 bidi::kind result = get_bidi_utf8_1 (p);
1438 if (result != bidi::kind::NONE)
1439 {
1440 /* We have a sequence of 3 bytes starting at P. */
1441 *out = get_location_for_byte_range_in_cur_line (pfile, start: p, num_bytes: 3);
1442 }
1443 return result;
1444}
1445
1446/* Parse a UCN where P points just past \u or \U and return its bidi code. */
1447
1448static bidi::kind
1449get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1450{
1451 /* 6.4.3 Universal Character Names
1452 \u hex-quad
1453 \U hex-quad hex-quad
1454 \u { simple-hexadecimal-digit-sequence }
1455 where \unnnn means \U0000nnnn. */
1456
1457 *end = p + 4;
1458 if (is_U)
1459 {
1460 if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1461 return bidi::kind::NONE;
1462 /* Skip 4B so we can treat \u and \U the same below. */
1463 p += 4;
1464 *end += 4;
1465 }
1466 else if (p[0] == '{')
1467 {
1468 p++;
1469 while (*p == '0')
1470 p++;
1471 if (p[0] != '2'
1472 || p[1] != '0'
1473 || !ISXDIGIT (p[2])
1474 || !ISXDIGIT (p[3])
1475 || p[4] != '}')
1476 return bidi::kind::NONE;
1477 *end = p + 5;
1478 }
1479
1480 /* All code points we are looking for start with 20xx. */
1481 if (p[0] != '2' || p[1] != '0')
1482 return bidi::kind::NONE;
1483 else if (p[2] == '2')
1484 switch (p[3])
1485 {
1486 case 'a':
1487 case 'A':
1488 return bidi::kind::LRE;
1489 case 'b':
1490 case 'B':
1491 return bidi::kind::RLE;
1492 case 'c':
1493 case 'C':
1494 return bidi::kind::PDF;
1495 case 'd':
1496 case 'D':
1497 return bidi::kind::LRO;
1498 case 'e':
1499 case 'E':
1500 return bidi::kind::RLO;
1501 default:
1502 break;
1503 }
1504 else if (p[2] == '6')
1505 switch (p[3])
1506 {
1507 case '6':
1508 return bidi::kind::LRI;
1509 case '7':
1510 return bidi::kind::RLI;
1511 case '8':
1512 return bidi::kind::FSI;
1513 case '9':
1514 return bidi::kind::PDI;
1515 default:
1516 break;
1517 }
1518 else if (p[2] == '0')
1519 switch (p[3])
1520 {
1521 case 'e':
1522 case 'E':
1523 return bidi::kind::LTR;
1524 case 'f':
1525 case 'F':
1526 return bidi::kind::RTL;
1527 default:
1528 break;
1529 }
1530
1531 return bidi::kind::NONE;
1532}
1533
1534/* Parse a UCN where P points just past \u or \U and return its bidi code.
1535 If the kind is not NONE, write the location to *OUT. */
1536
1537static bidi::kind
1538get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1539 location_t *out)
1540{
1541 const unsigned char *end;
1542 bidi::kind result = get_bidi_ucn_1 (p, is_U, end: &end);
1543 if (result != bidi::kind::NONE)
1544 {
1545 const unsigned char *start = p - 2;
1546 size_t num_bytes = end - start;
1547 *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1548 }
1549 return result;
1550}
1551
1552/* Parse a named universal character escape where P points just past \N and
1553 return its bidi code. If the kind is not NONE, write the location to
1554 *OUT. */
1555
1556static bidi::kind
1557get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1558{
1559 bidi::kind result = bidi::kind::NONE;
1560 if (*p != '{')
1561 return bidi::kind::NONE;
1562 if (strncmp (s1: (const char *) (p + 1), s2: "LEFT-TO-RIGHT ", n: 14) == 0)
1563 {
1564 if (strncmp (s1: (const char *) (p + 15), s2: "MARK}", n: 5) == 0)
1565 result = bidi::kind::LTR;
1566 else if (strncmp (s1: (const char *) (p + 15), s2: "EMBEDDING}", n: 10) == 0)
1567 result = bidi::kind::LRE;
1568 else if (strncmp (s1: (const char *) (p + 15), s2: "OVERRIDE}", n: 9) == 0)
1569 result = bidi::kind::LRO;
1570 else if (strncmp (s1: (const char *) (p + 15), s2: "ISOLATE}", n: 8) == 0)
1571 result = bidi::kind::LRI;
1572 }
1573 else if (strncmp (s1: (const char *) (p + 1), s2: "RIGHT-TO-LEFT ", n: 14) == 0)
1574 {
1575 if (strncmp (s1: (const char *) (p + 15), s2: "MARK}", n: 5) == 0)
1576 result = bidi::kind::RTL;
1577 else if (strncmp (s1: (const char *) (p + 15), s2: "EMBEDDING}", n: 10) == 0)
1578 result = bidi::kind::RLE;
1579 else if (strncmp (s1: (const char *) (p + 15), s2: "OVERRIDE}", n: 9) == 0)
1580 result = bidi::kind::RLO;
1581 else if (strncmp (s1: (const char *) (p + 15), s2: "ISOLATE}", n: 8) == 0)
1582 result = bidi::kind::RLI;
1583 }
1584 else if (strncmp (s1: (const char *) (p + 1), s2: "POP DIRECTIONAL ", n: 16) == 0)
1585 {
1586 if (strncmp (s1: (const char *) (p + 16), s2: "FORMATTING}", n: 11) == 0)
1587 result = bidi::kind::PDF;
1588 else if (strncmp (s1: (const char *) (p + 16), s2: "ISOLATE}", n: 8) == 0)
1589 result = bidi::kind::PDI;
1590 }
1591 else if (strncmp (s1: (const char *) (p + 1), s2: "FIRST STRONG ISOLATE}", n: 21) == 0)
1592 result = bidi::kind::FSI;
1593 if (result != bidi::kind::NONE)
1594 *out = get_location_for_byte_range_in_cur_line (pfile, start: p - 2,
1595 num_bytes: (strchr (s: (const char *)
1596 (p + 1), c: '}')
1597 - (const char *) p)
1598 + 3);
1599 return result;
1600}
1601
1602/* Subclass of rich_location for reporting on unpaired UTF-8
1603 bidirectional control character(s).
1604 Escape the source lines on output, and show all unclosed
1605 bidi context, labelling everything. */
1606
1607class unpaired_bidi_rich_location : public rich_location
1608{
1609 public:
1610 class custom_range_label : public range_label
1611 {
1612 public:
1613 label_text get_text (unsigned range_idx) const final override
1614 {
1615 /* range 0 is the primary location; each subsequent range i + 1
1616 is for bidi::vec[i]. */
1617 if (range_idx > 0)
1618 {
1619 const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1620 return label_text::borrow (buffer: bidi::to_str (k: ctxt.m_kind));
1621 }
1622 else
1623 return label_text::borrow (_("end of bidirectional context"));
1624 }
1625 };
1626
1627 unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1628 : rich_location (pfile->line_table, loc, &m_custom_label)
1629 {
1630 set_escape_on_output (true);
1631 for (unsigned i = 0; i < bidi::vec.count (); i++)
1632 add_range (loc: bidi::vec[i].m_loc,
1633 range_display_kind: SHOW_RANGE_WITHOUT_CARET,
1634 label: &m_custom_label);
1635 }
1636
1637 private:
1638 custom_range_label m_custom_label;
1639};
1640
1641/* We're closing a bidi context, that is, we've encountered a newline,
1642 are closing a C-style comment, or are at the end of a string literal,
1643 character constant, or identifier. Warn if this context was not
1644 properly terminated by a PDI or PDF. P points to the last character
1645 in this context. */
1646
1647static void
1648maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1649{
1650 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1651 if (bidi::vec.count () > 0
1652 && (warn_bidi & bidirectional_unpaired
1653 && (!bidi::current_ctx_ucn_p ()
1654 || (warn_bidi & bidirectional_ucn))))
1655 {
1656 const location_t loc
1657 = linemap_position_for_column (pfile->line_table,
1658 CPP_BUF_COLUMN (pfile->buffer, p));
1659 unpaired_bidi_rich_location rich_loc (pfile, loc);
1660 /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1661 forms of a diagnostic, so fake it for now. */
1662 if (bidi::vec.count () > 1)
1663 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, richloc: &rich_loc,
1664 msgid: "unpaired UTF-8 bidirectional control characters "
1665 "detected");
1666 else
1667 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, richloc: &rich_loc,
1668 msgid: "unpaired UTF-8 bidirectional control character "
1669 "detected");
1670 }
1671 /* We're done with this context. */
1672 bidi::on_close ();
1673}
1674
1675/* We're at the beginning or in the middle of an identifier/comment/string
1676 literal/character constant. Warn if we've encountered a bidi character.
1677 KIND says which bidi control character it was; UCN_P is true iff this bidi
1678 control character was written as a UCN. LOC is the location of the
1679 character, but is only valid if KIND != bidi::kind::NONE. */
1680
1681static void
1682maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1683 bool ucn_p, location_t loc)
1684{
1685 if (__builtin_expect (kind == bidi::kind::NONE, 1))
1686 return;
1687
1688 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1689
1690 if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1691 {
1692 rich_location rich_loc (pfile->line_table, loc);
1693 rich_loc.set_escape_on_output (true);
1694
1695 /* It seems excessive to warn about a PDI/PDF that is closing
1696 an opened context because we've already warned about the
1697 opening character. Except warn when we have a UCN x UTF-8
1698 mismatch, if UCN checking is enabled. */
1699 if (kind == bidi::current_ctx ())
1700 {
1701 if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1702 && bidi::current_ctx_ucn_p () != ucn_p)
1703 {
1704 rich_loc.add_range (loc: bidi::current_ctx_loc ());
1705 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, richloc: &rich_loc,
1706 msgid: "UTF-8 vs UCN mismatch when closing "
1707 "a context by %qs", bidi::to_str (k: kind));
1708 }
1709 }
1710 else if (warn_bidi & bidirectional_any
1711 && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1712 {
1713 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1714 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, richloc: &rich_loc,
1715 msgid: "%qs is closing an unopened context",
1716 bidi::to_str (k: kind));
1717 else
1718 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, richloc: &rich_loc,
1719 msgid: "found problematic Unicode character %qs",
1720 bidi::to_str (k: kind));
1721 }
1722 }
1723 /* We're done with this context. */
1724 bidi::on_char (k: kind, ucn_p, loc);
1725}
1726
/* Byte-value thresholds used when scanning UTF-8.  Bytes below
   utf8_continuation are skipped as ordinary single-byte characters;
   a byte is accepted as a continuation byte only when it lies in
   [utf8_continuation, utf8_signifier); bytes at or above
   utf8_signifier are treated as the start of a multi-byte sequence.  */
static const cppchar_t utf8_continuation = 0x80;
static const cppchar_t utf8_signifier = 0xC0;
1729
1730/* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1731 at PFILE->buffer->cur. Return a pointer after the diagnosed
1732 invalid character. */
1733
1734static const uchar *
1735_cpp_warn_invalid_utf8 (cpp_reader *pfile)
1736{
1737 cpp_buffer *buffer = pfile->buffer;
1738 const uchar *cur = buffer->cur;
1739 bool pedantic = (CPP_PEDANTIC (pfile)
1740 && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1741
1742 if (cur[0] < utf8_signifier
1743 || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1744 {
1745 if (pedantic)
1746 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1747 pfile->line_table->highest_line,
1748 CPP_BUF_COL (buffer),
1749 msgid: "invalid UTF-8 character %<<%x>%>",
1750 cur[0]);
1751 else
1752 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1753 pfile->line_table->highest_line,
1754 CPP_BUF_COL (buffer),
1755 msgid: "invalid UTF-8 character %<<%x>%>",
1756 cur[0]);
1757 return cur + 1;
1758 }
1759 else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1760 {
1761 if (pedantic)
1762 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1763 pfile->line_table->highest_line,
1764 CPP_BUF_COL (buffer),
1765 msgid: "invalid UTF-8 character %<<%x><%x>%>",
1766 cur[0], cur[1]);
1767 else
1768 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1769 pfile->line_table->highest_line,
1770 CPP_BUF_COL (buffer),
1771 msgid: "invalid UTF-8 character %<<%x><%x>%>",
1772 cur[0], cur[1]);
1773 return cur + 2;
1774 }
1775 else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1776 {
1777 if (pedantic)
1778 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1779 pfile->line_table->highest_line,
1780 CPP_BUF_COL (buffer),
1781 msgid: "invalid UTF-8 character %<<%x><%x><%x>%>",
1782 cur[0], cur[1], cur[2]);
1783 else
1784 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1785 pfile->line_table->highest_line,
1786 CPP_BUF_COL (buffer),
1787 msgid: "invalid UTF-8 character %<<%x><%x><%x>%>",
1788 cur[0], cur[1], cur[2]);
1789 return cur + 3;
1790 }
1791 else
1792 {
1793 if (pedantic)
1794 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1795 pfile->line_table->highest_line,
1796 CPP_BUF_COL (buffer),
1797 msgid: "invalid UTF-8 character %<<%x><%x><%x><%x>%>",
1798 cur[0], cur[1], cur[2], cur[3]);
1799 else
1800 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1801 pfile->line_table->highest_line,
1802 CPP_BUF_COL (buffer),
1803 msgid: "invalid UTF-8 character %<<%x><%x><%x><%x>%>",
1804 cur[0], cur[1], cur[2], cur[3]);
1805 return cur + 4;
1806 }
1807}
1808
1809/* Helper function of *skip_*_comment and lex*_string. For C,
1810 character at CUR[-1] with MSB set handle -Wbidi-chars* and
1811 -Winvalid-utf8 diagnostics and return pointer to first character
1812 that should be processed next. */
1813
1814static inline const uchar *
1815_cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1816 const uchar *cur, bool warn_bidi_p,
1817 bool warn_invalid_utf8_p)
1818{
1819 /* If this is a beginning of a UTF-8 encoding, it might be
1820 a bidirectional control character. */
1821 if (c == bidi::utf8_start && warn_bidi_p)
1822 {
1823 location_t loc;
1824 bidi::kind kind = get_bidi_utf8 (pfile, p: cur - 1, out: &loc);
1825 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1826 }
1827 if (!warn_invalid_utf8_p)
1828 return cur;
1829 if (c >= utf8_signifier)
1830 {
1831 cppchar_t s;
1832 const uchar *pstr = cur - 1;
1833 if (_cpp_valid_utf8 (pfile, pstr: &pstr, limit: pfile->buffer->rlimit, identifier_pos: 0, NULL, cp: &s)
1834 && s <= UCS_LIMIT)
1835 return pstr;
1836 }
1837 pfile->buffer->cur = cur - 1;
1838 return _cpp_warn_invalid_utf8 (pfile);
1839}
1840
1841/* Skip a C-style block comment. We find the end of the comment by
1842 seeing if an asterisk is before every '/' we encounter. Returns
1843 nonzero if comment terminated by EOF, zero otherwise.
1844
1845 Buffer->cur points to the initial asterisk of the comment. */
1846bool
1847_cpp_skip_block_comment (cpp_reader *pfile)
1848{
1849 cpp_buffer *buffer = pfile->buffer;
1850 const uchar *cur = buffer->cur;
1851 uchar c;
1852 const bool warn_bidi_p = pfile->warn_bidi_p ();
1853 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1854 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1855
1856 cur++;
1857 if (*cur == '/')
1858 cur++;
1859
1860 for (;;)
1861 {
1862 /* People like decorating comments with '*', so check for '/'
1863 instead for efficiency. */
1864 c = *cur++;
1865
1866 if (c == '/')
1867 {
1868 if (cur[-2] == '*')
1869 {
1870 if (warn_bidi_p)
1871 maybe_warn_bidi_on_close (pfile, p: cur);
1872 break;
1873 }
1874
1875 /* Warn about potential nested comments, but not if the '/'
1876 comes immediately before the true comment delimiter.
1877 Don't bother to get it right across escaped newlines. */
1878 if (CPP_OPTION (pfile, warn_comments)
1879 && cur[0] == '*' && cur[1] != '/')
1880 {
1881 buffer->cur = cur;
1882 cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1883 pfile->line_table->highest_line,
1884 CPP_BUF_COL (buffer),
1885 msgid: "%</*%> within comment");
1886 }
1887 }
1888 else if (c == '\n')
1889 {
1890 unsigned int cols;
1891 buffer->cur = cur - 1;
1892 if (warn_bidi_p)
1893 maybe_warn_bidi_on_close (pfile, p: cur);
1894 _cpp_process_line_notes (pfile, in_comment: true);
1895 if (buffer->next_line >= buffer->rlimit)
1896 return true;
1897 _cpp_clean_line (pfile);
1898
1899 cols = buffer->next_line - buffer->line_base;
1900 CPP_INCREMENT_LINE (pfile, cols);
1901
1902 cur = buffer->cur;
1903 }
1904 else if (__builtin_expect (c >= utf8_continuation, 0)
1905 && warn_bidi_or_invalid_utf8_p)
1906 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
1907 warn_invalid_utf8_p);
1908 }
1909
1910 buffer->cur = cur;
1911 _cpp_process_line_notes (pfile, in_comment: true);
1912 return false;
1913}
1914
1915/* Skip a C++ line comment, leaving buffer->cur pointing to the
1916 terminating newline. Handles escaped newlines. Returns nonzero
1917 if a multiline comment. */
1918static int
1919skip_line_comment (cpp_reader *pfile)
1920{
1921 cpp_buffer *buffer = pfile->buffer;
1922 location_t orig_line = pfile->line_table->highest_line;
1923 const bool warn_bidi_p = pfile->warn_bidi_p ();
1924 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
1925 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
1926
1927 if (!warn_bidi_or_invalid_utf8_p)
1928 while (*buffer->cur != '\n')
1929 buffer->cur++;
1930 else if (!warn_invalid_utf8_p)
1931 {
1932 while (*buffer->cur != '\n'
1933 && *buffer->cur != bidi::utf8_start)
1934 buffer->cur++;
1935 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1936 {
1937 while (*buffer->cur != '\n')
1938 {
1939 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
1940 {
1941 location_t loc;
1942 bidi::kind kind = get_bidi_utf8 (pfile, p: buffer->cur, out: &loc);
1943 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1944 }
1945 buffer->cur++;
1946 }
1947 maybe_warn_bidi_on_close (pfile, p: buffer->cur);
1948 }
1949 }
1950 else
1951 {
1952 while (*buffer->cur != '\n')
1953 {
1954 if (*buffer->cur < utf8_continuation)
1955 {
1956 buffer->cur++;
1957 continue;
1958 }
1959 buffer->cur
1960 = _cpp_handle_multibyte_utf8 (pfile, c: *buffer->cur, cur: buffer->cur + 1,
1961 warn_bidi_p, warn_invalid_utf8_p);
1962 }
1963 if (warn_bidi_p)
1964 maybe_warn_bidi_on_close (pfile, p: buffer->cur);
1965 }
1966
1967 _cpp_process_line_notes (pfile, in_comment: true);
1968 return orig_line != pfile->line_table->highest_line;
1969}
1970
1971/* Skips whitespace, saving the next non-whitespace character. */
1972static void
1973skip_whitespace (cpp_reader *pfile, cppchar_t c)
1974{
1975 cpp_buffer *buffer = pfile->buffer;
1976 bool saw_NUL = false;
1977
1978 do
1979 {
1980 /* Horizontal space always OK. */
1981 if (c == ' ' || c == '\t')
1982 ;
1983 /* Just \f \v or \0 left. */
1984 else if (c == '\0')
1985 saw_NUL = true;
1986 else if (pfile->state.in_directive)
1987 cpp_pedwarning_with_line (pfile, CPP_W_PEDANTIC,
1988 pfile->line_table->highest_line,
1989 CPP_BUF_COL (buffer),
1990 msgid: "%s in preprocessing directive",
1991 c == '\f' ? "form feed" : "vertical tab");
1992
1993 c = *buffer->cur++;
1994 }
1995 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1996 while (is_nvspace (c));
1997
1998 if (saw_NUL)
1999 {
2000 encoding_rich_location rich_loc (pfile);
2001 cpp_error_at (pfile, CPP_DL_WARNING, richloc: &rich_loc,
2002 msgid: "null character(s) ignored");
2003 }
2004
2005 buffer->cur--;
2006}
2007
2008/* See if the characters of a number token are valid in a name (no
2009 '.', '+' or '-'). */
2010static int
2011name_p (cpp_reader *pfile, const cpp_string *string)
2012{
2013 unsigned int i;
2014
2015 for (i = 0; i < string->len; i++)
2016 if (!is_idchar (string->text[i]))
2017 return 0;
2018
2019 return 1;
2020}
2021
2022/* After parsing an identifier or other sequence, produce a warning about
2023 sequences not in NFC/NFKC. */
2024static void
2025warn_about_normalization (cpp_reader *pfile,
2026 const cpp_token *token,
2027 const struct normalize_state *s,
2028 bool identifier)
2029{
2030 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2031 && !pfile->state.skipping)
2032 {
2033 location_t loc = token->src_loc;
2034
2035 /* If possible, create a location range for the token. */
2036 if (loc >= RESERVED_LOCATION_COUNT
2037 && token->type != CPP_EOF
2038 /* There must be no line notes to process. */
2039 && (!(pfile->buffer->cur
2040 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2041 && !pfile->overlaid_buffer)))
2042 {
2043 source_range tok_range;
2044 tok_range.m_start = loc;
2045 tok_range.m_finish
2046 = linemap_position_for_column (pfile->line_table,
2047 CPP_BUF_COLUMN (pfile->buffer,
2048 pfile->buffer->cur));
2049 loc = pfile->line_table->get_or_create_combined_loc (locus: loc, src_range: tok_range,
2050 data: nullptr, discriminator: 0);
2051 }
2052
2053 encoding_rich_location rich_loc (pfile, loc);
2054
2055 /* Make sure that the token is printed using UCNs, even
2056 if we'd otherwise happily print UTF-8. */
2057 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2058 size_t sz;
2059
2060 sz = cpp_spell_token (pfile, token, buf, false) - buf;
2061 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2062 cpp_warning_at (pfile, CPP_W_NORMALIZE, richloc: &rich_loc,
2063 msgid: "%<%.*s%> is not in NFKC", (int) sz, buf);
2064 else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2065 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, richloc: &rich_loc,
2066 msgid: "%<%.*s%> is not in NFC", (int) sz, buf);
2067 else
2068 cpp_warning_at (pfile, CPP_W_NORMALIZE, richloc: &rich_loc,
2069 msgid: "%<%.*s%> is not in NFC", (int) sz, buf);
2070 free (ptr: buf);
2071 }
2072}
2073
2074/* Returns TRUE if the byte sequence starting at buffer->cur is a valid
2075 extended character in an identifier. If FIRST is TRUE, then the character
2076 must be valid at the beginning of an identifier as well. If the return
2077 value is TRUE, then pfile->buffer->cur has been moved to point to the next
2078 byte after the extended character. */
2079
2080static bool
2081forms_identifier_p (cpp_reader *pfile, int first,
2082 struct normalize_state *state)
2083{
2084 cpp_buffer *buffer = pfile->buffer;
2085 const bool warn_bidi_p = pfile->warn_bidi_p ();
2086
2087 if (*buffer->cur == '$')
2088 {
2089 if (!CPP_OPTION (pfile, dollars_in_ident))
2090 return false;
2091
2092 buffer->cur++;
2093 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
2094 {
2095 CPP_OPTION (pfile, warn_dollars) = 0;
2096 cpp_error (pfile, CPP_DL_PEDWARN, msgid: "%<$%> in identifier or number");
2097 }
2098
2099 return true;
2100 }
2101
2102 /* Is this a syntactically valid UCN or a valid UTF-8 char? */
2103 if (CPP_OPTION (pfile, extended_identifiers))
2104 {
2105 cppchar_t s;
2106 if (*buffer->cur >= utf8_signifier)
2107 {
2108 if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
2109 && warn_bidi_p)
2110 {
2111 location_t loc;
2112 bidi::kind kind = get_bidi_utf8 (pfile, p: buffer->cur, out: &loc);
2113 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
2114 }
2115 if (_cpp_valid_utf8 (pfile, pstr: &buffer->cur, limit: buffer->rlimit, identifier_pos: 1 + !first,
2116 nst: state, cp: &s))
2117 return true;
2118 }
2119 else if (*buffer->cur == '\\'
2120 && (buffer->cur[1] == 'u'
2121 || buffer->cur[1] == 'U'
2122 || buffer->cur[1] == 'N'))
2123 {
2124 buffer->cur += 2;
2125 if (warn_bidi_p)
2126 {
2127 location_t loc;
2128 bidi::kind kind;
2129 if (buffer->cur[-1] == 'N')
2130 kind = get_bidi_named (pfile, p: buffer->cur, out: &loc);
2131 else
2132 kind = get_bidi_ucn (pfile, p: buffer->cur,
2133 is_U: buffer->cur[-1] == 'U', out: &loc);
2134 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2135 }
2136 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
2137 state, &s, NULL, NULL))
2138 return true;
2139 buffer->cur -= 2;
2140 }
2141 }
2142
2143 return false;
2144}
2145
2146/* Helper function to issue error about improper __VA_OPT__ use. */
2147static void
2148maybe_va_opt_error (cpp_reader *pfile)
2149{
2150 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2151 {
2152 /* __VA_OPT__ should not be accepted at all, but allow it in
2153 system headers. */
2154 if (!_cpp_in_system_header (pfile))
2155 {
2156 if (CPP_OPTION (pfile, cplusplus))
2157 cpp_pedwarning (pfile, CPP_W_CXX20_EXTENSIONS,
2158 msgid: "%<__VA_OPT__%> is not available until C++20");
2159 else
2160 cpp_pedwarning (pfile, CPP_W_PEDANTIC,
2161 msgid: "%<__VA_OPT__%> is not available until C23");
2162 }
2163 }
2164 else if (!pfile->state.va_args_ok)
2165 {
2166 /* __VA_OPT__ should only appear in the replacement list of a
2167 variadic macro. */
2168 cpp_error (pfile, CPP_DL_PEDWARN,
2169 msgid: "%<__VA_OPT__%> can only appear in the expansion"
2170 " of a C++20 variadic macro");
2171 }
2172}
2173
2174/* Helper function to perform diagnostics that are needed (rarely)
2175 when an identifier is lexed. */
2176static void
2177identifier_diagnostics_on_lex (cpp_reader *pfile, cpp_hashnode *node)
2178{
2179 if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
2180 || pfile->state.skipping, 1))
2181 return;
2182
2183 /* It is allowed to poison the same identifier twice. */
2184 if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2185 {
2186 cpp_error (pfile, CPP_DL_ERROR, msgid: "attempt to use poisoned %qs",
2187 NODE_NAME (node));
2188 const auto data = (cpp_hashnode_extra *)
2189 ht_lookup (ht: pfile->extra_hash_table, id: node->ident, opt: HT_NO_INSERT);
2190 if (data && data->poisoned_loc)
2191 cpp_error_at (pfile, CPP_DL_NOTE, src_loc: data->poisoned_loc, msgid: "poisoned here");
2192 }
2193
2194 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2195 replacement list of a variadic macro. */
2196 if (node == pfile->spec_nodes.n__VA_ARGS__
2197 && !pfile->state.va_args_ok)
2198 {
2199 if (CPP_OPTION (pfile, cplusplus))
2200 cpp_error (pfile, CPP_DL_PEDWARN,
2201 msgid: "%<__VA_ARGS__%> can only appear in the expansion"
2202 " of a C++11 variadic macro");
2203 else
2204 cpp_error (pfile, CPP_DL_PEDWARN,
2205 msgid: "%<__VA_ARGS__%> can only appear in the expansion"
2206 " of a C99 variadic macro");
2207 }
2208
2209 /* __VA_OPT__ should only appear in the replacement list of a
2210 variadic macro. */
2211 if (node == pfile->spec_nodes.n__VA_OPT__)
2212 maybe_va_opt_error (pfile);
2213
2214 /* For -Wc++-compat, warn about use of C++ named operators. */
2215 if (node->flags & NODE_WARN_OPERATOR)
2216 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2217 msgid: "identifier %qs is a special operator name in C++",
2218 NODE_NAME (node));
2219}
2220
2221/* Lex an identifier starting at BASE. BUFFER->CUR is expected to point
2222 one past the first character at BASE, which may be a (possibly multi-byte)
2223 character if STARTS_UCN is true. */
2224static cpp_hashnode *
2225lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2226 struct normalize_state *nst, cpp_hashnode **spelling)
2227{
2228 cpp_hashnode *result;
2229 const uchar *cur;
2230 unsigned int len;
2231 unsigned int hash = HT_HASHSTEP (0, *base);
2232 const bool warn_bidi_p = pfile->warn_bidi_p ();
2233
2234 cur = pfile->buffer->cur;
2235 if (! starts_ucn)
2236 {
2237 while (ISIDNUM (*cur))
2238 {
2239 hash = HT_HASHSTEP (hash, *cur);
2240 cur++;
2241 }
2242 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2243 }
2244 pfile->buffer->cur = cur;
2245 if (starts_ucn || forms_identifier_p (pfile, first: false, state: nst))
2246 {
2247 /* Slower version for identifiers containing UCNs
2248 or extended chars (including $). */
2249 do {
2250 while (ISIDNUM (*pfile->buffer->cur))
2251 {
2252 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2253 pfile->buffer->cur++;
2254 }
2255 } while (forms_identifier_p (pfile, first: false, state: nst));
2256 if (warn_bidi_p)
2257 maybe_warn_bidi_on_close (pfile, p: pfile->buffer->cur);
2258 result = _cpp_interpret_identifier (pfile, id: base,
2259 len: pfile->buffer->cur - base);
2260 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2261 }
2262 else
2263 {
2264 len = cur - base;
2265 hash = HT_HASHFINISH (hash, len);
2266
2267 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2268 base, len, hash, HT_ALLOC));
2269 *spelling = result;
2270 }
2271
2272 return result;
2273}
2274
2275/* Struct to hold the return value of the scan_cur_identifier () helper
2276 function below. */
2277
2278struct scan_id_result
2279{
2280 cpp_hashnode *node;
2281 normalize_state nst;
2282
2283 scan_id_result ()
2284 : node (nullptr)
2285 {
2286 nst = INITIAL_NORMALIZE_STATE;
2287 }
2288
2289 explicit operator bool () const { return node; }
2290};
2291
2292/* Helper function to scan an entire identifier beginning at
2293 pfile->buffer->cur, and possibly containing extended characters (UCNs
2294 and/or UTF-8). Returns the cpp_hashnode for the identifier on success, or
2295 else nullptr, as well as a normalize_state so that normalization warnings
2296 may be issued once the token lexing is complete. */
2297
2298static scan_id_result
2299scan_cur_identifier (cpp_reader *pfile)
2300{
2301 const auto buffer = pfile->buffer;
2302 const auto begin = buffer->cur;
2303 scan_id_result result;
2304 if (ISIDST (*buffer->cur))
2305 {
2306 ++buffer->cur;
2307 cpp_hashnode *ignore;
2308 result.node = lex_identifier (pfile, base: begin, starts_ucn: false, nst: &result.nst, spelling: &ignore);
2309 }
2310 else if (forms_identifier_p (pfile, first: true, state: &result.nst))
2311 {
2312 /* buffer->cur has been moved already by the call
2313 to forms_identifier_p. */
2314 cpp_hashnode *ignore;
2315 result.node = lex_identifier (pfile, base: begin, starts_ucn: true, nst: &result.nst, spelling: &ignore);
2316 }
2317 return result;
2318}
2319
2320/* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
2321static void
2322lex_number (cpp_reader *pfile, cpp_string *number,
2323 struct normalize_state *nst)
2324{
2325 const uchar *cur;
2326 const uchar *base;
2327 uchar *dest;
2328
2329 base = pfile->buffer->cur - 1;
2330 do
2331 {
2332 const uchar *adj_digit_sep = NULL;
2333 cur = pfile->buffer->cur;
2334
2335 /* N.B. ISIDNUM does not include $. */
2336 while (ISIDNUM (*cur)
2337 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2338 || DIGIT_SEP (*cur)
2339 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2340 {
2341 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2342 /* Adjacent digit separators do not form part of the pp-number syntax.
2343 However, they can safely be diagnosed here as an error, since '' is
2344 not a valid preprocessing token. */
2345 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2346 adj_digit_sep = cur;
2347 cur++;
2348 }
2349 /* A number can't end with a digit separator. */
2350 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2351 --cur;
2352 if (adj_digit_sep && adj_digit_sep < cur)
2353 cpp_error (pfile, CPP_DL_ERROR, msgid: "adjacent digit separators");
2354
2355 pfile->buffer->cur = cur;
2356 }
2357 while (forms_identifier_p (pfile, first: false, state: nst));
2358
2359 number->len = cur - base;
2360 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2361 memcpy (dest: dest, src: base, n: number->len);
2362 dest[number->len] = '\0';
2363 number->text = dest;
2364}
2365
2366/* Create a token of type TYPE with a literal spelling. */
2367static void
2368create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2369 unsigned int len, enum cpp_ttype type)
2370{
2371 token->type = type;
2372 token->val.str.len = len;
2373 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2374}
2375
2376/* Like create_literal(), but construct it from two separate strings
2377 which are concatenated. LEN2 may be 0 if no second string is
2378 required. */
2379static void
2380create_literal2 (cpp_reader *pfile, cpp_token *token, const uchar *base1,
2381 unsigned int len1, const uchar *base2, unsigned int len2,
2382 enum cpp_ttype type)
2383{
2384 token->type = type;
2385 token->val.str.len = len1 + len2;
2386 uchar *const dest = _cpp_unaligned_alloc (pfile, len1 + len2 + 1);
2387 memcpy (dest: dest, src: base1, n: len1);
2388 if (len2)
2389 memcpy (dest: dest+len1, src: base2, n: len2);
2390 dest[len1 + len2] = 0;
2391 token->val.str.text = dest;
2392}
2393
2394const uchar *
2395cpp_alloc_token_string (cpp_reader *pfile,
2396 const unsigned char *ptr, unsigned len)
2397{
2398 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2399
2400 dest[len] = 0;
2401 memcpy (dest: dest, src: ptr, n: len);
2402 return dest;
2403}
2404
2405/* A pair of raw buffer pointers. The currently open one is [1], the
2406 first one is [0]. Used for string literal lexing. */
2407struct lit_accum {
2408 _cpp_buff *first;
2409 _cpp_buff *last;
2410 const uchar *rpos;
2411 size_t accum;
2412
2413 lit_accum ()
2414 : first (NULL), last (NULL), rpos (0), accum (0)
2415 {
2416 }
2417
2418 void append (cpp_reader *, const uchar *, size_t);
2419
2420 void read_begin (cpp_reader *);
2421 bool reading_p () const
2422 {
2423 return rpos != NULL;
2424 }
2425 char read_char ()
2426 {
2427 char c = *rpos++;
2428 if (rpos == BUFF_FRONT (last))
2429 rpos = NULL;
2430 return c;
2431 }
2432
2433 void create_literal2 (cpp_reader *pfile, cpp_token *token,
2434 const uchar *base1, unsigned int len1,
2435 const uchar *base2, unsigned int len2,
2436 enum cpp_ttype type);
2437};
2438
2439/* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2440 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2441
2442void
2443lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2444{
2445 if (!last)
2446 /* Starting. */
2447 first = last = _cpp_get_buff (pfile, len);
2448 else if (len > BUFF_ROOM (last))
2449 {
2450 /* There is insufficient room in the buffer. Copy what we can,
2451 and then either extend or create a new one. */
2452 size_t room = BUFF_ROOM (last);
2453 memcpy (BUFF_FRONT (last), src: base, n: room);
2454 BUFF_FRONT (last) += room;
2455 base += room;
2456 len -= room;
2457 accum += room;
2458
2459 gcc_checking_assert (!rpos);
2460
2461 last = _cpp_append_extend_buff (pfile, last, len);
2462 }
2463
2464 memcpy (BUFF_FRONT (last), src: base, n: len);
2465 BUFF_FRONT (last) += len;
2466 accum += len;
2467}
2468
2469void
2470lit_accum::read_begin (cpp_reader *pfile)
2471{
2472 /* We never accumulate more than 4 chars to read. */
2473 if (BUFF_ROOM (last) < 4)
2474
2475 last = _cpp_append_extend_buff (pfile, last, 4);
2476 rpos = BUFF_FRONT (last);
2477}
2478
2479/* Helper function to check if a string format macro, say from inttypes.h, is
2480 placed touching a string literal, in which case it could be parsed as a C++11
2481 user-defined string literal thus breaking the program. Return TRUE if the
2482 UDL should be ignored for now and preserved for potential macro
2483 expansion. */
2484
2485static bool
2486maybe_ignore_udl_macro_suffix (cpp_reader *pfile, location_t src_loc,
2487 const uchar *suffix_begin, cpp_hashnode *node)
2488{
2489 /* User-defined literals outside of namespace std must start with a single
2490 underscore, so assume anything of that form really is a UDL suffix.
2491 We don't need to worry about UDLs defined inside namespace std because
2492 their names are reserved, so cannot be used as macro names in valid
2493 programs. */
2494 if ((suffix_begin[0] == '_' && suffix_begin[1] != '_')
2495 || !cpp_macro_p (node))
2496 return false;
2497
2498 /* Maybe raise a warning here; caller should arrange not to consume
2499 the tokens. */
2500 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2501 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, src_loc, 0,
2502 msgid: "invalid suffix on literal; C++11 requires a space "
2503 "between literal and string macro");
2504 return true;
2505}
2506
2507/* Like create_literal2(), but also prepend all the accumulated data from
2508 the lit_accum struct. */
2509void
2510lit_accum::create_literal2 (cpp_reader *pfile, cpp_token *token,
2511 const uchar *base1, unsigned int len1,
2512 const uchar *base2, unsigned int len2,
2513 enum cpp_ttype type)
2514{
2515 const unsigned int tot_len = accum + len1 + len2;
2516 uchar *dest = _cpp_unaligned_alloc (pfile, tot_len + 1);
2517 token->type = type;
2518 token->val.str.len = tot_len;
2519 token->val.str.text = dest;
2520 for (_cpp_buff *buf = first; buf; buf = buf->next)
2521 {
2522 size_t len = BUFF_FRONT (buf) - buf->base;
2523 memcpy (dest: dest, src: buf->base, n: len);
2524 dest += len;
2525 }
2526 memcpy (dest: dest, src: base1, n: len1);
2527 dest += len1;
2528 if (len2)
2529 memcpy (dest: dest, src: base2, n: len2);
2530 dest += len2;
2531 *dest = '\0';
2532}
2533
2534/* Lexes a raw string. The stored string contains the spelling,
2535 including double quotes, delimiter string, '(' and ')', any leading
2536 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
2537 the type of the literal, or CPP_OTHER if it was not properly
2538 terminated.
2539
2540 BASE is the start of the token. Updates pfile->buffer->cur to just
2541 after the lexed string.
2542
2543 The spelling is NUL-terminated, but it is not guaranteed that this
2544 is the first NUL since embedded NULs are preserved. */
2545
2546static void
2547lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2548{
2549 const uchar *pos = base;
2550 const bool warn_bidi_p = pfile->warn_bidi_p ();
2551 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2552 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2553
2554 /* 'tis a pity this information isn't passed down from the lexer's
2555 initial categorization of the token. */
2556 enum cpp_ttype type = CPP_STRING;
2557
2558 if (*pos == 'L')
2559 {
2560 type = CPP_WSTRING;
2561 pos++;
2562 }
2563 else if (*pos == 'U')
2564 {
2565 type = CPP_STRING32;
2566 pos++;
2567 }
2568 else if (*pos == 'u')
2569 {
2570 if (pos[1] == '8')
2571 {
2572 type = CPP_UTF8STRING;
2573 pos++;
2574 }
2575 else
2576 type = CPP_STRING16;
2577 pos++;
2578 }
2579
2580 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2581 pos += 2;
2582
2583 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2584
2585 /* Skip notes before the ". */
2586 while (note->pos < pos)
2587 ++note;
2588
2589 lit_accum accum;
2590
2591 uchar prefix[17];
2592 unsigned prefix_len = 0;
2593 enum Phase
2594 {
2595 PHASE_PREFIX = -2,
2596 PHASE_NONE = -1,
2597 PHASE_SUFFIX = 0
2598 } phase = PHASE_PREFIX;
2599
2600 for (;;)
2601 {
2602 gcc_checking_assert (note->pos >= pos);
2603
2604 /* Undo any escaped newlines and trigraphs. */
2605 if (!accum.reading_p () && note->pos == pos)
2606 switch (note->type)
2607 {
2608 case '\\':
2609 case ' ':
2610 /* Restore backslash followed by newline. */
2611 accum.append (pfile, base, len: pos - base);
2612 base = pos;
2613 accum.read_begin (pfile);
2614 accum.append (pfile, UC"\\", len: 1);
2615
2616 after_backslash:
2617 if (note->type == ' ')
2618 /* GNU backslash whitespace newline extension. FIXME
2619 could be any sequence of non-vertical space. When we
2620 can properly restore any such sequence, we should
2621 mark this note as handled so _cpp_process_line_notes
2622 doesn't warn. */
2623 accum.append (pfile, UC" ", len: 1);
2624
2625 accum.append (pfile, UC"\n", len: 1);
2626 note++;
2627 break;
2628
2629 case '\n':
2630 /* This can happen for ??/<NEWLINE> when trigraphs are not
2631 being interpretted. */
2632 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2633 note->type = 0;
2634 note++;
2635 break;
2636
2637 case 'W':
2638 case 'L':
2639 case 'S':
2640 case 'T':
2641 /* Don't warn about leading or trailing whitespace in raw string
2642 literals. */
2643 note->type = 0;
2644 note++;
2645 break;
2646
2647 default:
2648 gcc_checking_assert (_cpp_trigraph_map[note->type]);
2649
2650 /* Don't warn about this trigraph in
2651 _cpp_process_line_notes, since trigraphs show up as
2652 trigraphs in raw strings. */
2653 uchar type = note->type;
2654 note->type = 0;
2655
2656 if (CPP_OPTION (pfile, trigraphs))
2657 {
2658 accum.append (pfile, base, len: pos - base);
2659 base = pos;
2660 accum.read_begin (pfile);
2661 accum.append (pfile, UC"??", len: 2);
2662 accum.append (pfile, base: &type, len: 1);
2663
2664 /* ??/ followed by newline gets two line notes, one for
2665 the trigraph and one for the backslash/newline. */
2666 if (type == '/' && note[1].pos == pos)
2667 {
2668 note++;
2669 gcc_assert (note->type == '\\' || note->type == ' ');
2670 goto after_backslash;
2671 }
2672 /* Skip the replacement character. */
2673 base = ++pos;
2674 }
2675
2676 note++;
2677 break;
2678 }
2679
2680 /* Now get a char to process. Either from an expanded note, or
2681 from the line buffer. */
2682 bool read_note = accum.reading_p ();
2683 char c = read_note ? accum.read_char () : *pos++;
2684
2685 if (phase == PHASE_PREFIX)
2686 {
2687 if (c == '(')
2688 {
2689 /* Done. */
2690 phase = PHASE_NONE;
2691 prefix[prefix_len++] = '"';
2692 }
2693 else if (prefix_len < 16
2694 /* Prefix chars are any of the basic character set,
2695 [lex.charset] except for '
2696 ()\\\t\v\f\n'. Optimized for a contiguous
2697 alphabet. */
2698 /* Unlike a switch, this collapses down to one or
2699 two shift and bitmask operations on an ASCII
2700 system, with an outlier or two. */
2701 && (('Z' - 'A' == 25
2702 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2703 : ISIDST (c))
2704 || (c >= '0' && c <= '9')
2705 || c == '_' || c == '{' || c == '}'
2706 || c == '[' || c == ']' || c == '#'
2707 || c == '<' || c == '>' || c == '%'
2708 || c == ':' || c == ';' || c == '.' || c == '?'
2709 || c == '*' || c == '+' || c == '-' || c == '/'
2710 || c == '^' || c == '&' || c == '|' || c == '~'
2711 || c == '!' || c == '=' || c == ','
2712 || c == '"' || c == '\''
2713 || ((c == '$' || c == '@' || c == '`')
2714 && (CPP_OPTION (pfile, cplusplus)
2715 ? CPP_OPTION (pfile, lang) > CLK_CXX23
2716 : CPP_OPTION (pfile, low_ucns)))))
2717 prefix[prefix_len++] = c;
2718 else
2719 {
2720 /* Something is wrong. */
2721 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2722 if (prefix_len == 16)
2723 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2724 col, msgid: "raw string delimiter longer "
2725 "than 16 characters");
2726 else if (c == '\n')
2727 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2728 col, msgid: "invalid new-line in raw "
2729 "string delimiter");
2730 else
2731 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2732 col, msgid: "invalid character '%c' in "
2733 "raw string delimiter", c);
2734 type = CPP_OTHER;
2735 phase = PHASE_NONE;
2736 /* Continue until we get a close quote, that's probably
2737 the best failure mode. */
2738 prefix_len = 0;
2739 }
2740 if (c != '\n')
2741 continue;
2742 }
2743
2744 if (phase != PHASE_NONE)
2745 {
2746 if (prefix[phase] != c)
2747 phase = PHASE_NONE;
2748 else if (unsigned (phase + 1) == prefix_len)
2749 break;
2750 else
2751 {
2752 phase = Phase (phase + 1);
2753 continue;
2754 }
2755 }
2756
2757 if (!prefix_len && c == '"')
2758 /* Failure mode lexing. */
2759 goto out;
2760 else if (prefix_len && c == ')')
2761 phase = PHASE_SUFFIX;
2762 else if (!read_note && c == '\n')
2763 {
2764 pos--;
2765 pfile->buffer->cur = pos;
2766 if ((pfile->state.in_directive || pfile->state.parsing_args
2767 || pfile->state.in_deferred_pragma)
2768 && pfile->buffer->next_line >= pfile->buffer->rlimit)
2769 {
2770 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2771 msgid: "unterminated raw string");
2772 type = CPP_OTHER;
2773 goto out;
2774 }
2775
2776 accum.append (pfile, base, len: pos - base + 1);
2777 _cpp_process_line_notes (pfile, in_comment: false);
2778
2779 if (pfile->buffer->next_line < pfile->buffer->rlimit)
2780 CPP_INCREMENT_LINE (pfile, 0);
2781 pfile->buffer->need_line = true;
2782
2783 if (!get_fresh_line_impl<true> (pfile))
2784 {
2785 /* We ran out of file and failed to get a line. */
2786 location_t src_loc = token->src_loc;
2787 token->type = CPP_EOF;
2788 /* Tell the compiler the line number of the EOF token. */
2789 token->src_loc = pfile->line_table->highest_line;
2790 token->flags = BOL;
2791 if (accum.first)
2792 _cpp_release_buff (pfile, accum.first);
2793 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2794 msgid: "unterminated raw string");
2795
2796 /* Now pop the buffer that get_fresh_line_impl() did not. Popping
2797 is not safe if processing a directive, however this cannot
2798 happen as we already checked above that a line would be
2799 available, and get_fresh_line_impl() can't fail in this
2800 case. */
2801 gcc_assert (!pfile->state.in_directive);
2802 _cpp_pop_buffer (pfile);
2803
2804 return;
2805 }
2806
2807 pos = base = pfile->buffer->cur;
2808 note = &pfile->buffer->notes[pfile->buffer->cur_note];
2809 }
2810 else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2811 && warn_bidi_or_invalid_utf8_p)
2812 pos = _cpp_handle_multibyte_utf8 (pfile, c, cur: pos, warn_bidi_p,
2813 warn_invalid_utf8_p);
2814 }
2815
2816 if (warn_bidi_p)
2817 maybe_warn_bidi_on_close (pfile, p: pos);
2818
2819 if (CPP_OPTION (pfile, user_literals))
2820 {
2821 const uchar *const suffix_begin = pos;
2822 pfile->buffer->cur = pos;
2823
2824 if (const auto sr = scan_cur_identifier (pfile))
2825 {
2826 if (maybe_ignore_udl_macro_suffix (pfile, src_loc: token->src_loc,
2827 suffix_begin, node: sr.node))
2828 pfile->buffer->cur = suffix_begin;
2829 else
2830 {
2831 type = cpp_userdef_string_add_type (type);
2832 accum.create_literal2 (pfile, token, base1: base, len1: suffix_begin - base,
2833 NODE_NAME (sr.node), NODE_LEN (sr.node),
2834 type);
2835 if (accum.first)
2836 _cpp_release_buff (pfile, accum.first);
2837 warn_about_normalization (pfile, token, s: &sr.nst, identifier: true);
2838 return;
2839 }
2840 }
2841 }
2842
2843 out:
2844 pfile->buffer->cur = pos;
2845 if (!accum.accum)
2846 create_literal (pfile, token, base, len: pos - base, type);
2847 else
2848 {
2849 accum.create_literal2 (pfile, token, base1: base, len1: pos - base, base2: nullptr, len2: 0, type);
2850 _cpp_release_buff (pfile, accum.first);
2851 }
2852}
2853
2854/* Lexes a string, character constant, or angle-bracketed header file
2855 name. The stored string contains the spelling, including opening
2856 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2857 'R' modifier. It returns the type of the literal, or CPP_OTHER
2858 if it was not properly terminated, or CPP_LESS for an unterminated
2859 header name which must be relexed as normal tokens.
2860
2861 The spelling is NUL-terminated, but it is not guaranteed that this
2862 is the first NUL since embedded NULs are preserved. */
2863static void
2864lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2865{
2866 bool saw_NUL = false;
2867 const uchar *cur;
2868 cppchar_t terminator;
2869 enum cpp_ttype type;
2870
2871 cur = base;
2872 terminator = *cur++;
2873 if (terminator == 'L' || terminator == 'U')
2874 terminator = *cur++;
2875 else if (terminator == 'u')
2876 {
2877 terminator = *cur++;
2878 if (terminator == '8')
2879 terminator = *cur++;
2880 }
2881 if (terminator == 'R')
2882 {
2883 lex_raw_string (pfile, token, base);
2884 return;
2885 }
2886 if (terminator == '"')
2887 type = (*base == 'L' ? CPP_WSTRING :
2888 *base == 'U' ? CPP_STRING32 :
2889 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2890 : CPP_STRING);
2891 else if (terminator == '\'')
2892 type = (*base == 'L' ? CPP_WCHAR :
2893 *base == 'U' ? CPP_CHAR32 :
2894 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2895 : CPP_CHAR);
2896 else
2897 terminator = '>', type = CPP_HEADER_NAME;
2898
2899 const bool warn_bidi_p = pfile->warn_bidi_p ();
2900 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2901 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2902 for (;;)
2903 {
2904 cppchar_t c = *cur++;
2905
2906 /* In #include-style directives, terminators are not escapable. */
2907 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2908 {
2909 if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2910 {
2911 location_t loc;
2912 bidi::kind kind;
2913 if (cur[0] == 'N')
2914 kind = get_bidi_named (pfile, p: cur + 1, out: &loc);
2915 else
2916 kind = get_bidi_ucn (pfile, p: cur + 1, is_U: cur[0] == 'U', out: &loc);
2917 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2918 }
2919 cur++;
2920 }
2921 else if (c == terminator)
2922 {
2923 if (warn_bidi_p)
2924 maybe_warn_bidi_on_close (pfile, p: cur - 1);
2925 break;
2926 }
2927 else if (c == '\n')
2928 {
2929 cur--;
2930 /* Unmatched quotes always yield undefined behavior, but
2931 greedy lexing means that what appears to be an unterminated
2932 header name may actually be a legitimate sequence of tokens. */
2933 if (terminator == '>')
2934 {
2935 token->type = CPP_LESS;
2936 return;
2937 }
2938 type = CPP_OTHER;
2939 break;
2940 }
2941 else if (c == '\0')
2942 saw_NUL = true;
2943 else if (__builtin_expect (c >= utf8_continuation, 0)
2944 && warn_bidi_or_invalid_utf8_p)
2945 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2946 warn_invalid_utf8_p);
2947 }
2948
2949 if (saw_NUL && !pfile->state.skipping)
2950 cpp_error (pfile, CPP_DL_WARNING,
2951 msgid: "null character(s) preserved in literal");
2952
2953 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2954 cpp_error (pfile, CPP_DL_PEDWARN, msgid: "missing terminating %c character",
2955 (int) terminator);
2956
2957 pfile->buffer->cur = cur;
2958 const uchar *const suffix_begin = cur;
2959
2960 if (CPP_OPTION (pfile, user_literals))
2961 {
2962 if (const auto sr = scan_cur_identifier (pfile))
2963 {
2964 if (maybe_ignore_udl_macro_suffix (pfile, src_loc: token->src_loc,
2965 suffix_begin, node: sr.node))
2966 pfile->buffer->cur = suffix_begin;
2967 else
2968 {
2969 /* Grab user defined literal suffix. */
2970 type = cpp_userdef_char_add_type (type);
2971 type = cpp_userdef_string_add_type (type);
2972 create_literal2 (pfile, token, base1: base, len1: suffix_begin - base,
2973 NODE_NAME (sr.node), NODE_LEN (sr.node), type);
2974 warn_about_normalization (pfile, token, s: &sr.nst, identifier: true);
2975 return;
2976 }
2977 }
2978 }
2979 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2980 && !pfile->state.skipping)
2981 {
2982 const auto sr = scan_cur_identifier (pfile);
2983 /* Maybe raise a warning, but do not consume the tokens. */
2984 pfile->buffer->cur = suffix_begin;
2985 if (sr && cpp_macro_p (node: sr.node))
2986 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2987 token->src_loc, 0, msgid: "C++11 requires a space "
2988 "between string literal and macro");
2989 }
2990
2991 create_literal (pfile, token, base, len: cur - base, type);
2992}
2993
2994/* Return the comment table. The client may not make any assumption
2995 about the ordering of the table. */
2996cpp_comment_table *
2997cpp_get_comments (cpp_reader *pfile)
2998{
2999 return &pfile->comments;
3000}
3001
3002/* Append a comment to the end of the comment table. */
3003static void
3004store_comment (cpp_reader *pfile, cpp_token *token)
3005{
3006 int len;
3007
3008 if (pfile->comments.allocated == 0)
3009 {
3010 pfile->comments.allocated = 256;
3011 pfile->comments.entries = (cpp_comment *) xmalloc
3012 (pfile->comments.allocated * sizeof (cpp_comment));
3013 }
3014
3015 if (pfile->comments.count == pfile->comments.allocated)
3016 {
3017 pfile->comments.allocated *= 2;
3018 pfile->comments.entries = (cpp_comment *) xrealloc
3019 (pfile->comments.entries,
3020 pfile->comments.allocated * sizeof (cpp_comment));
3021 }
3022
3023 len = token->val.str.len;
3024
3025 /* Copy comment. Note, token may not be NULL terminated. */
3026 pfile->comments.entries[pfile->comments.count].comment =
3027 (char *) xmalloc (sizeof (char) * (len + 1));
3028 memcpy (dest: pfile->comments.entries[pfile->comments.count].comment,
3029 src: token->val.str.text, n: len);
3030 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
3031
3032 /* Set source location. */
3033 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
3034
3035 /* Increment the count of entries in the comment table. */
3036 pfile->comments.count++;
3037}
3038
3039/* The stored comment includes the comment start and any terminator. */
3040static void
3041save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
3042 cppchar_t type)
3043{
3044 unsigned char *buffer;
3045 unsigned int len, clen, i;
3046
3047 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
3048
3049 /* C++ comments probably (not definitely) have moved past a new
3050 line, which we don't want to save in the comment. */
3051 if (is_vspace (pfile->buffer->cur[-1]))
3052 len--;
3053
3054 /* If we are currently in a directive or in argument parsing, then
3055 we need to store all C++ comments as C comments internally, and
3056 so we need to allocate a little extra space in that case.
3057
3058 Note that the only time we encounter a directive here is
3059 when we are saving comments in a "#define". */
3060 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
3061 && type == '/') ? len + 2 : len;
3062
3063 buffer = _cpp_unaligned_alloc (pfile, clen);
3064
3065 token->type = CPP_COMMENT;
3066 token->val.str.len = clen;
3067 token->val.str.text = buffer;
3068
3069 buffer[0] = '/';
3070 memcpy (dest: buffer + 1, src: from, n: len - 1);
3071
3072 /* Finish conversion to a C comment, if necessary. */
3073 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3074 {
3075 buffer[1] = '*';
3076 buffer[clen - 2] = '*';
3077 buffer[clen - 1] = '/';
3078 /* As there can be in a C++ comments illegal sequences for C comments
3079 we need to filter them out. */
3080 for (i = 2; i < (clen - 2); i++)
3081 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3082 buffer[i] = '|';
3083 }
3084
3085 /* Finally store this comment for use by clients of libcpp. */
3086 store_comment (pfile, token);
3087}
3088
3089/* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3090 comment. */
3091
3092static bool
3093fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3094{
3095 const unsigned char *from = comment_start + 1;
3096
3097 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3098 {
3099 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3100 don't recognize any comments. The latter only checks attributes,
3101 the former doesn't warn. */
3102 case 0:
3103 default:
3104 return false;
3105 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3106 content it has. */
3107 case 1:
3108 return true;
3109 case 2:
3110 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3111 .*falls?[ \t-]*thr(u|ough).* regex. */
3112 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3113 from++)
3114 {
3115 /* Is there anything like strpbrk with upper boundary, or
3116 memchr looking for 2 characters rather than just one? */
3117 if (from[0] != 'f' && from[0] != 'F')
3118 continue;
3119 if (from[1] != 'a' && from[1] != 'A')
3120 continue;
3121 if (from[2] != 'l' && from[2] != 'L')
3122 continue;
3123 if (from[3] != 'l' && from[3] != 'L')
3124 continue;
3125 from += sizeof "fall" - 1;
3126 if (from[0] == 's' || from[0] == 'S')
3127 from++;
3128 while (*from == ' ' || *from == '\t' || *from == '-')
3129 from++;
3130 if (from[0] != 't' && from[0] != 'T')
3131 continue;
3132 if (from[1] != 'h' && from[1] != 'H')
3133 continue;
3134 if (from[2] != 'r' && from[2] != 'R')
3135 continue;
3136 if (from[3] == 'u' || from[3] == 'U')
3137 return true;
3138 if (from[3] != 'o' && from[3] != 'O')
3139 continue;
3140 if (from[4] != 'u' && from[4] != 'U')
3141 continue;
3142 if (from[5] != 'g' && from[5] != 'G')
3143 continue;
3144 if (from[6] != 'h' && from[6] != 'H')
3145 continue;
3146 return true;
3147 }
3148 return false;
3149 case 3:
3150 case 4:
3151 break;
3152 }
3153
3154 /* Whole comment contents:
3155 -fallthrough
3156 @fallthrough@
3157 */
3158 if (*from == '-' || *from == '@')
3159 {
3160 size_t len = sizeof "fallthrough" - 1;
3161 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3162 return false;
3163 if (memcmp (s1: from + 1, s2: "fallthrough", n: len))
3164 return false;
3165 if (*from == '@')
3166 {
3167 if (from[len + 1] != '@')
3168 return false;
3169 len++;
3170 }
3171 from += 1 + len;
3172 }
3173 /* Whole comment contents (regex):
3174 lint -fallthrough[ \t]*
3175 */
3176 else if (*from == 'l')
3177 {
3178 size_t len = sizeof "int -fallthrough" - 1;
3179 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3180 return false;
3181 if (memcmp (s1: from + 1, s2: "int -fallthrough", n: len))
3182 return false;
3183 from += 1 + len;
3184 while (*from == ' ' || *from == '\t')
3185 from++;
3186 }
3187 /* Whole comment contents (regex):
3188 [ \t]*FALLTHR(U|OUGH)[ \t]*
3189 */
3190 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3191 {
3192 while (*from == ' ' || *from == '\t')
3193 from++;
3194 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
3195 return false;
3196 if (memcmp (s1: from, s2: "FALLTHR", n: sizeof "FALLTHR" - 1))
3197 return false;
3198 from += sizeof "FALLTHR" - 1;
3199 if (*from == 'U')
3200 from++;
3201 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
3202 return false;
3203 else if (memcmp (s1: from, s2: "OUGH", n: sizeof "OUGH" - 1))
3204 return false;
3205 else
3206 from += sizeof "OUGH" - 1;
3207 while (*from == ' ' || *from == '\t')
3208 from++;
3209 }
3210 /* Whole comment contents (regex):
3211 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3212 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3213 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3214 */
3215 else
3216 {
3217 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3218 from++;
3219 unsigned char f = *from;
3220 bool all_upper = false;
3221 if (f == 'E' || f == 'e')
3222 {
3223 if ((size_t) (pfile->buffer->cur - from)
3224 < sizeof "else fallthru" - 1)
3225 return false;
3226 if (f == 'E' && memcmp (s1: from + 1, s2: "LSE", n: sizeof "LSE" - 1) == 0)
3227 all_upper = true;
3228 else if (memcmp (s1: from + 1, s2: "lse", n: sizeof "lse" - 1))
3229 return false;
3230 from += sizeof "else" - 1;
3231 if (*from == ',')
3232 from++;
3233 if (*from != ' ')
3234 return false;
3235 from++;
3236 if (all_upper && *from == 'f')
3237 return false;
3238 if (f == 'e' && *from == 'F')
3239 return false;
3240 f = *from;
3241 }
3242 else if (f == 'I' || f == 'i')
3243 {
3244 if ((size_t) (pfile->buffer->cur - from)
3245 < sizeof "intentional fallthru" - 1)
3246 return false;
3247 if (f == 'I' && memcmp (s1: from + 1, s2: "NTENTIONAL",
3248 n: sizeof "NTENTIONAL" - 1) == 0)
3249 all_upper = true;
3250 else if (memcmp (s1: from + 1, s2: "ntentional",
3251 n: sizeof "ntentional" - 1))
3252 return false;
3253 from += sizeof "intentional" - 1;
3254 if (*from == ' ')
3255 {
3256 from++;
3257 if (all_upper && *from == 'f')
3258 return false;
3259 }
3260 else if (all_upper)
3261 {
3262 if (memcmp (s1: from, s2: "LY F", n: sizeof "LY F" - 1))
3263 return false;
3264 from += sizeof "LY " - 1;
3265 }
3266 else
3267 {
3268 if (memcmp (s1: from, s2: "ly ", n: sizeof "ly " - 1))
3269 return false;
3270 from += sizeof "ly " - 1;
3271 }
3272 if (f == 'i' && *from == 'F')
3273 return false;
3274 f = *from;
3275 }
3276 if (f != 'F' && f != 'f')
3277 return false;
3278 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3279 return false;
3280 if (f == 'F' && memcmp (s1: from + 1, s2: "ALL", n: sizeof "ALL" - 1) == 0)
3281 all_upper = true;
3282 else if (all_upper)
3283 return false;
3284 else if (memcmp (s1: from + 1, s2: "all", n: sizeof "all" - 1))
3285 return false;
3286 from += sizeof "fall" - 1;
3287 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3288 from += 2;
3289 else if (*from == ' ' || *from == '-')
3290 from++;
3291 else if (*from != (all_upper ? 'T' : 't'))
3292 return false;
3293 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3294 return false;
3295 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3296 return false;
3297 if (memcmp (s1: from + 1, s2: all_upper ? "HRU" : "hru", n: sizeof "hru" - 1))
3298 {
3299 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3300 return false;
3301 if (memcmp (s1: from + 1, s2: all_upper ? "HROUGH" : "hrough",
3302 n: sizeof "hrough" - 1))
3303 return false;
3304 from += sizeof "through" - 1;
3305 }
3306 else
3307 from += sizeof "thru" - 1;
3308 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3309 from++;
3310 if (*from == '-')
3311 {
3312 from++;
3313 if (*comment_start == '*')
3314 {
3315 do
3316 {
3317 while (*from && *from != '*'
3318 && *from != '\n' && *from != '\r')
3319 from++;
3320 if (*from != '*' || from[1] == '/')
3321 break;
3322 from++;
3323 }
3324 while (1);
3325 }
3326 else
3327 while (*from && *from != '\n' && *from != '\r')
3328 from++;
3329 }
3330 }
3331 /* C block comment. */
3332 if (*comment_start == '*')
3333 {
3334 if (*from != '*' || from[1] != '/')
3335 return false;
3336 }
3337 /* C++ line comment. */
3338 else if (*from != '\n')
3339 return false;
3340
3341 return true;
3342}
3343
3344/* Allocate COUNT tokens for RUN. */
3345void
3346_cpp_init_tokenrun (tokenrun *run, unsigned int count)
3347{
3348 run->base = XNEWVEC (cpp_token, count);
3349 run->limit = run->base + count;
3350 run->next = NULL;
3351}
3352
3353/* Returns the next tokenrun, or creates one if there is none. */
3354static tokenrun *
3355next_tokenrun (tokenrun *run)
3356{
3357 if (run->next == NULL)
3358 {
3359 run->next = XNEW (tokenrun);
3360 run->next->prev = run;
3361 _cpp_init_tokenrun (run: run->next, count: 250);
3362 }
3363
3364 return run->next;
3365}
3366
3367/* Return the number of not yet processed token in a given
3368 context. */
3369int
3370_cpp_remaining_tokens_num_in_context (cpp_context *context)
3371{
3372 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3373 return (LAST (context).token - FIRST (context).token);
3374 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3375 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3376 return (LAST (context).ptoken - FIRST (context).ptoken);
3377 else
3378 abort ();
3379}
3380
3381/* Returns the token present at index INDEX in a given context. If
3382 INDEX is zero, the next token to be processed is returned. */
3383static const cpp_token*
3384_cpp_token_from_context_at (cpp_context *context, int index)
3385{
3386 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3387 return &(FIRST (context).token[index]);
3388 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3389 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3390 return FIRST (context).ptoken[index];
3391 else
3392 abort ();
3393}
3394
3395/* Look ahead in the input stream. */
3396const cpp_token *
3397cpp_peek_token (cpp_reader *pfile, int index)
3398{
3399 cpp_context *context = pfile->context;
3400 const cpp_token *peektok;
3401 int count;
3402
3403 /* First, scan through any pending cpp_context objects. */
3404 while (context->prev)
3405 {
3406 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3407
3408 if (index < (int) sz)
3409 return _cpp_token_from_context_at (context, index);
3410 index -= (int) sz;
3411 context = context->prev;
3412 }
3413
3414 /* We will have to read some new tokens after all (and do so
3415 without invalidating preceding tokens). */
3416 count = index;
3417 pfile->keep_tokens++;
3418
3419 /* For peeked tokens temporarily disable line_change reporting,
3420 until the tokens are parsed for real. */
3421 void (*line_change) (cpp_reader *, const cpp_token *, int)
3422 = pfile->cb.line_change;
3423 pfile->cb.line_change = NULL;
3424
3425 do
3426 {
3427 peektok = _cpp_lex_token (pfile);
3428 if (peektok->type == CPP_EOF)
3429 {
3430 index--;
3431 break;
3432 }
3433 else if (peektok->type == CPP_PRAGMA)
3434 {
3435 /* Don't peek past a pragma. */
3436 if (peektok == &pfile->directive_result)
3437 /* Save the pragma in the buffer. */
3438 *pfile->cur_token++ = *peektok;
3439 index--;
3440 break;
3441 }
3442 }
3443 while (index--);
3444
3445 _cpp_backup_tokens_direct (pfile, count - index);
3446 pfile->keep_tokens--;
3447 pfile->cb.line_change = line_change;
3448
3449 return peektok;
3450}
3451
3452/* Allocate a single token that is invalidated at the same time as the
3453 rest of the tokens on the line. Has its line and col set to the
3454 same as the last lexed token, so that diagnostics appear in the
3455 right place. */
3456cpp_token *
3457_cpp_temp_token (cpp_reader *pfile)
3458{
3459 cpp_token *old, *result;
3460 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3461 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3462
3463 old = pfile->cur_token - 1;
3464 /* Any pre-existing lookaheads must not be clobbered. */
3465 if (la)
3466 {
3467 if (sz <= la)
3468 {
3469 tokenrun *next = next_tokenrun (run: pfile->cur_run);
3470
3471 if (sz < la)
3472 memmove (dest: next->base + 1, src: next->base,
3473 n: (la - sz) * sizeof (cpp_token));
3474
3475 next->base[0] = pfile->cur_run->limit[-1];
3476 }
3477
3478 if (sz > 1)
3479 memmove (dest: pfile->cur_token + 1, src: pfile->cur_token,
3480 MIN (la, sz - 1) * sizeof (cpp_token));
3481 }
3482
3483 if (!sz && pfile->cur_token == pfile->cur_run->limit)
3484 {
3485 pfile->cur_run = next_tokenrun (run: pfile->cur_run);
3486 pfile->cur_token = pfile->cur_run->base;
3487 }
3488
3489 result = pfile->cur_token++;
3490 result->src_loc = old->src_loc;
3491 return result;
3492}
3493
3494/* We're at the beginning of a logical line (so not in
3495 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3496 if we should enter deferred_pragma mode to tokenize the rest of the
3497 line as a module control-line. */
3498
3499static void
3500cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3501{
3502 unsigned backup = 0; /* Tokens we peeked. */
3503 cpp_hashnode *node = result->val.node.node;
3504 cpp_token *peek = result;
3505 cpp_token *keyword = peek;
3506 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3507 int header_count = 0;
3508
3509 /* Make sure the incoming state is as we expect it. This way we
3510 can restore it using constants. */
3511 gcc_checking_assert (!pfile->state.in_deferred_pragma
3512 && !pfile->state.skipping
3513 && !pfile->state.parsing_args
3514 && !pfile->state.angled_headers
3515 && (pfile->state.save_comments
3516 == !CPP_OPTION (pfile, discard_comments)));
3517
3518 /* Enter directives mode sufficiently for peeking. We don't have
3519 to actually set in_directive. */
3520 pfile->state.in_deferred_pragma = true;
3521
3522 /* These two fields are needed to process tokenization in deferred
3523 pragma mode. They are not used outside deferred pragma mode or
3524 directives mode. */
3525 pfile->state.pragma_allow_expansion = true;
3526 pfile->directive_line = result->src_loc;
3527
3528 /* Saving comments is incompatible with directives mode. */
3529 pfile->state.save_comments = 0;
3530
3531 if (node == n_modules[spec_nodes::M_EXPORT][0])
3532 {
3533 peek = _cpp_lex_direct (pfile);
3534 keyword = peek;
3535 backup++;
3536 if (keyword->type != CPP_NAME)
3537 goto not_module;
3538 node = keyword->val.node.node;
3539 if (!(node->flags & NODE_MODULE))
3540 goto not_module;
3541 }
3542
3543 if (node == n_modules[spec_nodes::M__IMPORT][0])
3544 /* __import */
3545 header_count = backup + 2 + 16;
3546 else if (node == n_modules[spec_nodes::M_IMPORT][0])
3547 /* import */
3548 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3549 else if (node == n_modules[spec_nodes::M_MODULE][0])
3550 ; /* module */
3551 else
3552 goto not_module;
3553
3554 /* We've seen [export] {module|import|__import}. Check the next token. */
3555 if (header_count)
3556 /* After '{,__}import' a header name may appear. */
3557 pfile->state.angled_headers = true;
3558 peek = _cpp_lex_direct (pfile);
3559 backup++;
3560
3561 /* ... import followed by identifier, ':', '<' or
3562 header-name preprocessing tokens, or module
3563 followed by cpp-identifier, ':' or ';' preprocessing
3564 tokens. C++ keywords are not yet relevant. */
3565 if (peek->type == CPP_NAME
3566 || peek->type == CPP_COLON
3567 || (header_count
3568 ? (peek->type == CPP_LESS
3569 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3570 || peek->type == CPP_HEADER_NAME)
3571 : peek->type == CPP_SEMICOLON))
3572 {
3573 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3574 if (!pfile->state.pragma_allow_expansion)
3575 pfile->state.prevent_expansion++;
3576
3577 if (!header_count && linemap_included_from
3578 (ord_map: LINEMAPS_LAST_ORDINARY_MAP (set: pfile->line_table)))
3579 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3580 msgid: "module control-line cannot be in included file");
3581
3582 /* The first one or two tokens cannot be macro names. */
3583 for (int ix = backup; ix--;)
3584 {
3585 cpp_token *tok = ix ? keyword : result;
3586 cpp_hashnode *node = tok->val.node.node;
3587
3588 /* Don't attempt to expand the token. */
3589 tok->flags |= NO_EXPAND;
3590 if (_cpp_defined_macro_p (node)
3591 && _cpp_maybe_notify_macro_use (pfile, node, loc: tok->src_loc)
3592 && !cpp_fun_like_macro_p (node))
3593 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3594 msgid: "module control-line %qs cannot be"
3595 " an object-like macro",
3596 NODE_NAME (node));
3597 }
3598
3599 /* Map to underbar variants. */
3600 keyword->val.node.node = n_modules[header_count
3601 ? spec_nodes::M_IMPORT
3602 : spec_nodes::M_MODULE][1];
3603 if (backup != 1)
3604 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3605
3606 /* Maybe tell the tokenizer we expect a header-name down the
3607 road. */
3608 pfile->state.directive_file_token = header_count;
3609
3610 /* According to P3034R1, pp-module-name and pp-module-partition tokens
3611 if any shouldn't be macro expanded and identifiers shouldn't be
3612 defined as object-like macro. */
3613 if (!header_count && peek->type == CPP_NAME)
3614 {
3615 int state = 0;
3616 do
3617 {
3618 cpp_token *tok = peek;
3619 if (tok->type == CPP_NAME)
3620 {
3621 cpp_hashnode *node = tok->val.node.node;
3622 /* Don't attempt to expand the token. */
3623 tok->flags |= NO_EXPAND;
3624 if (_cpp_defined_macro_p (node)
3625 && _cpp_maybe_notify_macro_use (pfile, node,
3626 loc: tok->src_loc)
3627 && !cpp_fun_like_macro_p (node))
3628 {
3629 if (state == 0)
3630 cpp_error_with_line (pfile, CPP_DL_ERROR,
3631 tok->src_loc, 0,
3632 msgid: "module name %qs cannot "
3633 "be an object-like macro",
3634 NODE_NAME (node));
3635 else
3636 cpp_error_with_line (pfile, CPP_DL_ERROR,
3637 tok->src_loc, 0,
3638 msgid: "module partition %qs cannot "
3639 "be an object-like macro",
3640 NODE_NAME (node));
3641 }
3642 }
3643 peek = _cpp_lex_direct (pfile);
3644 backup++;
3645 if (tok->type == CPP_NAME)
3646 {
3647 if (peek->type == CPP_DOT)
3648 continue;
3649 else if (peek->type == CPP_COLON && state == 0)
3650 {
3651 ++state;
3652 continue;
3653 }
3654 else if (peek->type == CPP_OPEN_PAREN)
3655 {
3656 if (state == 0)
3657 cpp_error_with_line (pfile, CPP_DL_ERROR,
3658 peek->src_loc, 0,
3659 msgid: "module name followed by %<(%>");
3660 else
3661 cpp_error_with_line (pfile, CPP_DL_ERROR,
3662 peek->src_loc, 0,
3663 msgid: "module partition followed by "
3664 "%<(%>");
3665 break;
3666 }
3667 else if (peek->type == CPP_NAME
3668 && _cpp_defined_macro_p (node: peek->val.node.node))
3669 {
3670 peek->flags |= NO_DOT_COLON;
3671 break;
3672 }
3673 else
3674 break;
3675 }
3676 else if (peek->type != CPP_NAME)
3677 break;
3678 }
3679 while (true);
3680 }
3681 }
3682 else
3683 {
3684 not_module:
3685 /* Drop out of directive mode. */
3686 /* We aaserted save_comments had this value upon entry. */
3687 pfile->state.save_comments
3688 = !CPP_OPTION (pfile, discard_comments);
3689 pfile->state.in_deferred_pragma = false;
3690 /* Do not let this remain on. */
3691 pfile->state.angled_headers = false;
3692 }
3693
3694 /* In either case we want to backup the peeked tokens. */
3695 if (backup)
3696 {
3697 /* If we saw EOL, we should drop it, because this isn't a module
3698 control-line after all. */
3699 bool eol = peek->type == CPP_PRAGMA_EOL;
3700 if (!eol || backup > 1)
3701 {
3702 /* Put put the peeked tokens back */
3703 _cpp_backup_tokens_direct (pfile, backup);
3704 /* But if the last one was an EOL, forget it. */
3705 if (eol)
3706 pfile->lookaheads--;
3707 }
3708 }
3709}
3710
3711/* Lex a token into RESULT (external interface). Takes care of issues
3712 like directive handling, token lookahead, multiple include
3713 optimization and skipping. */
3714const cpp_token *
3715_cpp_lex_token (cpp_reader *pfile)
3716{
3717 cpp_token *result;
3718
3719 for (;;)
3720 {
3721 if (pfile->cur_token == pfile->cur_run->limit)
3722 {
3723 pfile->cur_run = next_tokenrun (run: pfile->cur_run);
3724 pfile->cur_token = pfile->cur_run->base;
3725 }
3726 /* We assume that the current token is somewhere in the current
3727 run. */
3728 if (pfile->cur_token < pfile->cur_run->base
3729 || pfile->cur_token >= pfile->cur_run->limit)
3730 abort ();
3731
3732 if (pfile->lookaheads)
3733 {
3734 pfile->lookaheads--;
3735 result = pfile->cur_token++;
3736 }
3737 else
3738 result = _cpp_lex_direct (pfile);
3739
3740 if (result->flags & BOL)
3741 {
3742 /* Is this a directive. If _cpp_handle_directive returns
3743 false, it is an assembler #. */
3744 if (result->type == CPP_HASH
3745 /* 6.10.3 p 11: Directives in a list of macro arguments
3746 gives undefined behavior. This implementation
3747 handles the directive as normal. */
3748 && pfile->state.parsing_args != 1)
3749 {
3750 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3751 {
3752 if (pfile->directive_result.type == CPP_PADDING)
3753 continue;
3754 result = &pfile->directive_result;
3755 }
3756 }
3757 else if (pfile->state.in_deferred_pragma)
3758 result = &pfile->directive_result;
3759 else if (result->type == CPP_NAME
3760 && (result->val.node.node->flags & NODE_MODULE)
3761 && !pfile->state.skipping
3762 /* Unlike regular directives, we do not deal with
3763 tokenizing module directives as macro arguments.
3764 That's not permitted. */
3765 && !pfile->state.parsing_args)
3766 {
3767 /* P1857. Before macro expansion, At start of logical
3768 line ... */
3769 /* We don't have to consider lookaheads at this point. */
3770 gcc_checking_assert (!pfile->lookaheads);
3771
3772 cpp_maybe_module_directive (pfile, result);
3773 }
3774
3775 if (pfile->cb.line_change && !pfile->state.skipping)
3776 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3777 }
3778
3779 /* We don't skip tokens in directives. */
3780 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3781 break;
3782
3783 /* Outside a directive, invalidate controlling macros. At file
3784 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3785 get here and MI optimization works. */
3786 pfile->mi_valid = false;
3787
3788 if (!pfile->state.skipping || result->type == CPP_EOF)
3789 break;
3790 }
3791
3792 return result;
3793}
3794
3795/* Returns true if a fresh line has been loaded. */
3796template <bool lexing_raw_string>
3797static bool
3798get_fresh_line_impl (cpp_reader *pfile)
3799{
3800 /* We can't get a new line until we leave the current directive, unless we
3801 are lexing a raw string, in which case it will be OK as long as we don't
3802 pop the current buffer. */
3803 if (!lexing_raw_string && pfile->state.in_directive)
3804 return false;
3805
3806 for (;;)
3807 {
3808 cpp_buffer *buffer = pfile->buffer;
3809
3810 if (!buffer->need_line)
3811 return true;
3812
3813 if (buffer->next_line < buffer->rlimit)
3814 {
3815 _cpp_clean_line (pfile);
3816 return true;
3817 }
3818
3819 /* We can't change buffers until we leave the current directive. */
3820 if (lexing_raw_string && pfile->state.in_directive)
3821 return false;
3822
3823 /* First, get out of parsing arguments state. */
3824 if (pfile->state.parsing_args)
3825 return false;
3826
3827 /* End of buffer. Non-empty files should end in a newline. */
3828 if (buffer->buf != buffer->rlimit
3829 && buffer->next_line > buffer->rlimit
3830 && !buffer->from_stage3)
3831 {
3832 /* Clip to buffer size. */
3833 buffer->next_line = buffer->rlimit;
3834 }
3835
3836 if (buffer->prev && !buffer->return_at_eof)
3837 _cpp_pop_buffer (pfile);
3838 else
3839 {
3840 /* End of translation. Do not pop the buffer yet. Increment
3841 line number so that the EOF token is on a line of its own
3842 (_cpp_lex_direct doesn't increment in that case, because
3843 it's hard for it to distinguish this special case). */
3844 CPP_INCREMENT_LINE (pfile, 0);
3845 return false;
3846 }
3847 }
3848}
3849
3850bool
3851_cpp_get_fresh_line (cpp_reader *pfile)
3852{
3853 return get_fresh_line_impl<false> (pfile);
3854}
3855
3856
/* Helper for the lexer: set result->type to ELSE_TYPE, unless the next
   character in the buffer is CHAR, in which case that character is
   consumed and result->type is set to THEN_TYPE instead.  Expects
   `buffer' and `result' to be in scope at the point of use.  The
   arguments are parenthesized so that any expression can be passed
   safely.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)	\
  do						\
    {						\
      result->type = (ELSE_TYPE);		\
      if (*buffer->cur == (CHAR))		\
	{					\
	  buffer->cur++;			\
	  result->type = (THEN_TYPE);		\
	}					\
    }						\
  while (0)
3865
3866/* Lex a token into pfile->cur_token, which is also incremented, to
3867 get diagnostics pointing to the correct location.
3868
3869 Does not handle issues such as token lookahead, multiple-include
3870 optimization, directives, skipping etc. This function is only
3871 suitable for use by _cpp_lex_token, and in special cases like
3872 lex_expansion_token which doesn't care for any of these issues.
3873
3874 When meeting a newline, returns CPP_EOF if parsing a directive,
3875 otherwise returns to the start of the token buffer if permissible.
3876 Returns the location of the lexed token. */
3877cpp_token *
3878_cpp_lex_direct (cpp_reader *pfile)
3879{
3880 cppchar_t c = 0;
3881 cpp_buffer *buffer;
3882 const unsigned char *comment_start;
3883 bool fallthrough_comment = false;
3884 cpp_token *result = pfile->cur_token++;
3885
3886 fresh_line:
3887 result->flags = 0;
3888 buffer = pfile->buffer;
3889 if (buffer->need_line)
3890 {
3891 if (pfile->state.in_deferred_pragma)
3892 {
3893 /* This can happen in cases like:
3894 #define loop(x) whatever
3895 #pragma omp loop
3896 where when trying to expand loop we need to peek
3897 next token after loop, but aren't still in_deferred_pragma
3898 mode but are in in_directive mode, so buffer->need_line
3899 is set, a CPP_EOF is peeked. */
3900 result->type = CPP_PRAGMA_EOL;
3901 pfile->state.in_deferred_pragma = false;
3902 if (!pfile->state.pragma_allow_expansion)
3903 pfile->state.prevent_expansion--;
3904 result->src_loc = pfile->line_table->highest_line;
3905 return result;
3906 }
3907 if (!_cpp_get_fresh_line (pfile))
3908 {
3909 result->type = CPP_EOF;
3910 /* Not a real EOF in a directive or arg parsing -- we refuse
3911 to advance to the next file now, and will once we're out
3912 of those modes. */
3913 if (!pfile->state.in_directive && !pfile->state.parsing_args)
3914 {
3915 /* Tell the compiler the line number of the EOF token. */
3916 result->src_loc = pfile->line_table->highest_line;
3917 result->flags = BOL;
3918 /* Now pop the buffer that _cpp_get_fresh_line did not. */
3919 _cpp_pop_buffer (pfile);
3920 }
3921 else if (c == 0)
3922 result->src_loc = pfile->line_table->highest_line;
3923 return result;
3924 }
3925 if (buffer != pfile->buffer)
3926 fallthrough_comment = false;
3927 if (!pfile->keep_tokens)
3928 {
3929 pfile->cur_run = &pfile->base_run;
3930 result = pfile->base_run.base;
3931 pfile->cur_token = result + 1;
3932 }
3933 result->flags = BOL;
3934 if (pfile->state.parsing_args == 2)
3935 result->flags |= PREV_WHITE;
3936 }
3937 buffer = pfile->buffer;
3938 update_tokens_line:
3939 result->src_loc = pfile->line_table->highest_line;
3940
3941 skipped_white:
3942 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3943 && !pfile->overlaid_buffer)
3944 {
3945 _cpp_process_line_notes (pfile, in_comment: false);
3946 result->src_loc = pfile->line_table->highest_line;
3947 }
3948 c = *buffer->cur++;
3949
3950 if (pfile->forced_token_location)
3951 result->src_loc = pfile->forced_token_location;
3952 else
3953 result->src_loc = linemap_position_for_column (pfile->line_table,
3954 CPP_BUF_COLUMN (buffer, buffer->cur));
3955
3956 switch (c)
3957 {
3958 case ' ': case '\t': case '\f': case '\v': case '\0':
3959 result->flags |= PREV_WHITE;
3960 skip_whitespace (pfile, c);
3961 goto skipped_white;
3962
3963 case '\n':
3964 /* Increment the line, unless this is the last line ... */
3965 if (buffer->cur < buffer->rlimit
3966 /* ... or this is a #include, (where _cpp_stack_file needs to
3967 unwind by one line) ... */
3968 || (pfile->state.in_directive > 1
3969 /* ... except traditional-cpp increments this elsewhere. */
3970 && !CPP_OPTION (pfile, traditional)))
3971 CPP_INCREMENT_LINE (pfile, 0);
3972 buffer->need_line = true;
3973 if (pfile->state.in_deferred_pragma)
3974 {
3975 /* Produce the PRAGMA_EOL on this line. File reading
3976 ensures there is always a \n at end of the buffer, thus
3977 in a deferred pragma we always see CPP_PRAGMA_EOL before
3978 any CPP_EOF. */
3979 result->type = CPP_PRAGMA_EOL;
3980 result->flags &= ~PREV_WHITE;
3981 pfile->state.in_deferred_pragma = false;
3982 if (!pfile->state.pragma_allow_expansion)
3983 pfile->state.prevent_expansion--;
3984 return result;
3985 }
3986 goto fresh_line;
3987
3988 case '0': case '1': case '2': case '3': case '4':
3989 case '5': case '6': case '7': case '8': case '9':
3990 {
3991 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3992 result->type = CPP_NUMBER;
3993 lex_number (pfile, number: &result->val.str, nst: &nst);
3994 warn_about_normalization (pfile, token: result, s: &nst, identifier: false);
3995 break;
3996 }
3997
3998 case 'L':
3999 case 'u':
4000 case 'U':
4001 case 'R':
4002 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
4003 wide strings or raw strings. */
4004 if (c == 'L' || CPP_OPTION (pfile, rliterals)
4005 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
4006 {
4007 if ((*buffer->cur == '\'' && c != 'R')
4008 || *buffer->cur == '"'
4009 || (*buffer->cur == 'R'
4010 && c != 'R'
4011 && buffer->cur[1] == '"'
4012 && CPP_OPTION (pfile, rliterals))
4013 || (*buffer->cur == '8'
4014 && c == 'u'
4015 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
4016 && CPP_OPTION (pfile, utf8_char_literals)))
4017 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
4018 && CPP_OPTION (pfile, rliterals)))))
4019 {
4020 lex_string (pfile, token: result, base: buffer->cur - 1);
4021 break;
4022 }
4023 }
4024 /* Fall through. */
4025
4026 case '_':
4027 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
4028 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
4029 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
4030 case 's': case 't': case 'v': case 'w': case 'x':
4031 case 'y': case 'z':
4032 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
4033 case 'G': case 'H': case 'I': case 'J': case 'K':
4034 case 'M': case 'N': case 'O': case 'P': case 'Q':
4035 case 'S': case 'T': case 'V': case 'W': case 'X':
4036 case 'Y': case 'Z':
4037 result->type = CPP_NAME;
4038 {
4039 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4040 const auto node = lex_identifier (pfile, base: buffer->cur - 1, starts_ucn: false, nst: &nst,
4041 spelling: &result->val.node.spelling);
4042 result->val.node.node = node;
4043 identifier_diagnostics_on_lex (pfile, node);
4044 warn_about_normalization (pfile, token: result, s: &nst, identifier: true);
4045 }
4046
4047 /* Convert named operators to their proper types. */
4048 if (result->val.node.node->flags & NODE_OPERATOR)
4049 {
4050 result->flags |= NAMED_OP;
4051 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
4052 }
4053
4054 /* Signal FALLTHROUGH comment followed by another token. */
4055 if (fallthrough_comment)
4056 result->flags |= PREV_FALLTHROUGH;
4057 break;
4058
4059 case '\'':
4060 case '"':
4061 lex_string (pfile, token: result, base: buffer->cur - 1);
4062 break;
4063
4064 case '/':
4065 /* A potential block or line comment. */
4066 comment_start = buffer->cur;
4067 c = *buffer->cur;
4068
4069 if (c == '*')
4070 {
4071 if (_cpp_skip_block_comment (pfile))
4072 cpp_error (pfile, CPP_DL_ERROR, msgid: "unterminated comment");
4073 }
4074 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
4075 {
4076 /* Don't warn for system headers. */
4077 if (_cpp_in_system_header (pfile))
4078 ;
4079 /* Warn about comments if pedantically GNUC89, and not
4080 in system headers. */
4081 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
4082 && CPP_PEDANTIC (pfile)
4083 && ! buffer->warned_cplusplus_comments)
4084 {
4085 if (cpp_pedwarning (pfile, CPP_W_PEDANTIC,
4086 msgid: "C++ style comments are not allowed "
4087 "in ISO C90"))
4088 cpp_error (pfile, CPP_DL_NOTE,
4089 msgid: "(this will be reported only once per input file)");
4090 buffer->warned_cplusplus_comments = 1;
4091 }
4092 /* Or if specifically desired via -Wc90-c99-compat. */
4093 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
4094 && ! CPP_OPTION (pfile, cplusplus)
4095 && ! buffer->warned_cplusplus_comments)
4096 {
4097 if (cpp_error (pfile, CPP_DL_WARNING,
4098 msgid: "C++ style comments are incompatible with C90"))
4099 cpp_error (pfile, CPP_DL_NOTE,
4100 msgid: "(this will be reported only once per input file)");
4101 buffer->warned_cplusplus_comments = 1;
4102 }
4103 /* In C89/C94, C++ style comments are forbidden. */
4104 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
4105 || CPP_OPTION (pfile, lang) == CLK_STDC94))
4106 {
4107 /* But don't be confused about valid code such as
4108 - // immediately followed by *,
4109 - // in a preprocessing directive,
4110 - // in an #if 0 block. */
4111 if (buffer->cur[1] == '*'
4112 || pfile->state.in_directive
4113 || pfile->state.skipping)
4114 {
4115 result->type = CPP_DIV;
4116 break;
4117 }
4118 else if (! buffer->warned_cplusplus_comments)
4119 {
4120 if (cpp_error (pfile, CPP_DL_ERROR,
4121 msgid: "C++ style comments are not allowed in "
4122 "ISO C90"))
4123 cpp_error (pfile, CPP_DL_NOTE,
4124 msgid: "(this will be reported only once per input "
4125 "file)");
4126 buffer->warned_cplusplus_comments = 1;
4127 }
4128 }
4129 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
4130 cpp_warning (pfile, CPP_W_COMMENTS, msgid: "multi-line comment");
4131 }
4132 else if (c == '=')
4133 {
4134 buffer->cur++;
4135 result->type = CPP_DIV_EQ;
4136 break;
4137 }
4138 else
4139 {
4140 result->type = CPP_DIV;
4141 break;
4142 }
4143
4144 if (fallthrough_comment_p (pfile, comment_start))
4145 fallthrough_comment = true;
4146
4147 if (pfile->cb.comment)
4148 {
4149 size_t len = pfile->buffer->cur - comment_start;
4150 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
4151 len + 1);
4152 }
4153
4154 if (!pfile->state.save_comments)
4155 {
4156 result->flags |= PREV_WHITE;
4157 goto update_tokens_line;
4158 }
4159
4160 if (fallthrough_comment)
4161 result->flags |= PREV_FALLTHROUGH;
4162
4163 /* Save the comment as a token in its own right. */
4164 save_comment (pfile, token: result, from: comment_start, type: c);
4165 break;
4166
4167 case '<':
4168 if (pfile->state.angled_headers)
4169 {
4170 lex_string (pfile, token: result, base: buffer->cur - 1);
4171 if (result->type != CPP_LESS)
4172 break;
4173 }
4174
4175 result->type = CPP_LESS;
4176 if (*buffer->cur == '=')
4177 {
4178 buffer->cur++, result->type = CPP_LESS_EQ;
4179 if (*buffer->cur == '>'
4180 && CPP_OPTION (pfile, cplusplus)
4181 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4182 buffer->cur++, result->type = CPP_SPACESHIP;
4183 }
4184 else if (*buffer->cur == '<')
4185 {
4186 buffer->cur++;
4187 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4188 }
4189 else if (CPP_OPTION (pfile, digraphs))
4190 {
4191 if (*buffer->cur == ':')
4192 {
4193 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4194 three characters are <:: and the subsequent character
4195 is neither : nor >, the < is treated as a preprocessor
4196 token by itself". */
4197 if (CPP_OPTION (pfile, cplusplus)
4198 && CPP_OPTION (pfile, lang) != CLK_CXX98
4199 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4200 && buffer->cur[1] == ':'
4201 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4202 break;
4203
4204 buffer->cur++;
4205 result->flags |= DIGRAPH;
4206 result->type = CPP_OPEN_SQUARE;
4207 }
4208 else if (*buffer->cur == '%')
4209 {
4210 buffer->cur++;
4211 result->flags |= DIGRAPH;
4212 result->type = CPP_OPEN_BRACE;
4213 }
4214 }
4215 break;
4216
4217 case '>':
4218 result->type = CPP_GREATER;
4219 if (*buffer->cur == '=')
4220 buffer->cur++, result->type = CPP_GREATER_EQ;
4221 else if (*buffer->cur == '>')
4222 {
4223 buffer->cur++;
4224 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4225 }
4226 break;
4227
4228 case '%':
4229 result->type = CPP_MOD;
4230 if (*buffer->cur == '=')
4231 buffer->cur++, result->type = CPP_MOD_EQ;
4232 else if (CPP_OPTION (pfile, digraphs))
4233 {
4234 if (*buffer->cur == ':')
4235 {
4236 buffer->cur++;
4237 result->flags |= DIGRAPH;
4238 result->type = CPP_HASH;
4239 if (*buffer->cur == '%' && buffer->cur[1] == ':')
4240 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4241 }
4242 else if (*buffer->cur == '>')
4243 {
4244 buffer->cur++;
4245 result->flags |= DIGRAPH;
4246 result->type = CPP_CLOSE_BRACE;
4247 }
4248 }
4249 break;
4250
4251 case '.':
4252 result->type = CPP_DOT;
4253 if (ISDIGIT (*buffer->cur))
4254 {
4255 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4256 result->type = CPP_NUMBER;
4257 lex_number (pfile, number: &result->val.str, nst: &nst);
4258 warn_about_normalization (pfile, token: result, s: &nst, identifier: false);
4259 }
4260 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4261 buffer->cur += 2, result->type = CPP_ELLIPSIS;
4262 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4263 buffer->cur++, result->type = CPP_DOT_STAR;
4264 break;
4265
4266 case '+':
4267 result->type = CPP_PLUS;
4268 if (*buffer->cur == '+')
4269 buffer->cur++, result->type = CPP_PLUS_PLUS;
4270 else if (*buffer->cur == '=')
4271 buffer->cur++, result->type = CPP_PLUS_EQ;
4272 break;
4273
4274 case '-':
4275 result->type = CPP_MINUS;
4276 if (*buffer->cur == '>')
4277 {
4278 buffer->cur++;
4279 result->type = CPP_DEREF;
4280 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4281 buffer->cur++, result->type = CPP_DEREF_STAR;
4282 }
4283 else if (*buffer->cur == '-')
4284 buffer->cur++, result->type = CPP_MINUS_MINUS;
4285 else if (*buffer->cur == '=')
4286 buffer->cur++, result->type = CPP_MINUS_EQ;
4287 break;
4288
4289 case '&':
4290 result->type = CPP_AND;
4291 if (*buffer->cur == '&')
4292 buffer->cur++, result->type = CPP_AND_AND;
4293 else if (*buffer->cur == '=')
4294 buffer->cur++, result->type = CPP_AND_EQ;
4295 break;
4296
4297 case '|':
4298 result->type = CPP_OR;
4299 if (*buffer->cur == '|')
4300 buffer->cur++, result->type = CPP_OR_OR;
4301 else if (*buffer->cur == '=')
4302 buffer->cur++, result->type = CPP_OR_EQ;
4303 break;
4304
4305 case ':':
4306 result->type = CPP_COLON;
4307 if (*buffer->cur == ':')
4308 {
4309 if (CPP_OPTION (pfile, scope))
4310 buffer->cur++, result->type = CPP_SCOPE;
4311 else
4312 result->flags |= COLON_SCOPE;
4313 }
4314 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4315 {
4316 buffer->cur++;
4317 result->flags |= DIGRAPH;
4318 result->type = CPP_CLOSE_SQUARE;
4319 }
4320 break;
4321
4322 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4323 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4324 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4325 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
4326 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4327
4328 case '?': result->type = CPP_QUERY; break;
4329 case '~': result->type = CPP_COMPL; break;
4330 case ',': result->type = CPP_COMMA; break;
4331 case '(': result->type = CPP_OPEN_PAREN; break;
4332 case ')': result->type = CPP_CLOSE_PAREN; break;
4333 case '[': result->type = CPP_OPEN_SQUARE; break;
4334 case ']': result->type = CPP_CLOSE_SQUARE; break;
4335 case '{': result->type = CPP_OPEN_BRACE; break;
4336 case '}': result->type = CPP_CLOSE_BRACE; break;
4337 case ';': result->type = CPP_SEMICOLON; break;
4338
4339 /* @ is a punctuator in Objective-C. */
4340 case '@': result->type = CPP_ATSIGN; break;
4341
4342 default:
4343 {
4344 const uchar *base = --buffer->cur;
4345 static int no_warn_cnt;
4346
4347 /* Check for an extended identifier ($ or UCN or UTF-8). */
4348 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4349 if (forms_identifier_p (pfile, first: true, state: &nst))
4350 {
4351 result->type = CPP_NAME;
4352 const auto node = lex_identifier (pfile, base, starts_ucn: true, nst: &nst,
4353 spelling: &result->val.node.spelling);
4354 result->val.node.node = node;
4355 identifier_diagnostics_on_lex (pfile, node);
4356 warn_about_normalization (pfile, token: result, s: &nst, identifier: true);
4357 break;
4358 }
4359
4360 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
4361 single token. */
4362 buffer->cur++;
4363 if (c >= utf8_signifier)
4364 {
4365 const uchar *pstr = base;
4366 cppchar_t s;
4367 if (_cpp_valid_utf8 (pfile, pstr: &pstr, limit: buffer->rlimit, identifier_pos: 0, NULL, cp: &s))
4368 {
4369 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4370 {
4371 buffer->cur = base;
4372 _cpp_warn_invalid_utf8 (pfile);
4373 }
4374 buffer->cur = pstr;
4375 }
4376 else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4377 {
4378 buffer->cur = base;
4379 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4380 buffer->cur = base + 1;
4381 no_warn_cnt = end - buffer->cur;
4382 }
4383 }
4384 else if (c >= utf8_continuation
4385 && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4386 {
4387 if (no_warn_cnt)
4388 --no_warn_cnt;
4389 else
4390 {
4391 buffer->cur = base;
4392 _cpp_warn_invalid_utf8 (pfile);
4393 buffer->cur = base + 1;
4394 }
4395 }
4396 create_literal (pfile, token: result, base, len: buffer->cur - base, type: CPP_OTHER);
4397 break;
4398 }
4399
4400 }
4401
4402 /* Potentially convert the location of the token to a range. */
4403 if (result->src_loc >= RESERVED_LOCATION_COUNT
4404 && result->type != CPP_EOF)
4405 {
4406 /* Ensure that any line notes are processed, so that we have the
4407 correct physical line/column for the end-point of the token even
4408 when a logical line is split via one or more backslashes. */
4409 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4410 && !pfile->overlaid_buffer)
4411 _cpp_process_line_notes (pfile, in_comment: false);
4412
4413 source_range tok_range;
4414 tok_range.m_start = result->src_loc;
4415 tok_range.m_finish
4416 = linemap_position_for_column (pfile->line_table,
4417 CPP_BUF_COLUMN (buffer, buffer->cur));
4418
4419 result->src_loc
4420 = pfile->line_table->get_or_create_combined_loc (locus: result->src_loc,
4421 src_range: tok_range, data: nullptr, discriminator: 0);
4422 }
4423
4424 return result;
4425}
4426
4427/* An upper bound on the number of bytes needed to spell TOKEN.
4428 Does not include preceding whitespace. */
4429unsigned int
4430cpp_token_len (const cpp_token *token)
4431{
4432 unsigned int len;
4433
4434 switch (TOKEN_SPELL (token))
4435 {
4436 default: len = 6; break;
4437 case SPELL_LITERAL: len = token->val.str.len; break;
4438 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
4439 }
4440
4441 return len;
4442}
4443
/* Decode the UTF-8 sequence starting at NAME and store it in BUFFER as
   a \U escape followed by eight lowercase hexadecimal digits.  Exactly
   10 bytes are written to BUFFER; no terminating NUL is added.

   Returns the number of bytes of NAME that made up the sequence, as
   announced by the leading byte.  Calls abort if any continuation byte
   does not have the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  int k;
  unsigned lead;
  unsigned long code_point;

  /* The number of leading one bits in the first byte gives the length
     of the whole sequence.  */
  for (lead = *name; (lead & 0x80) != 0; lead <<= 1)
    seq_len++;

  /* Payload bits of the leading byte, then six bits from each
     continuation byte.  */
  code_point = *name & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      const unsigned char cont = *++name;

      /* Ill-formed UTF-8.  */
      if ((cont & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (cont & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  /* Most significant nibble first.  */
  for (k = 0; k < 8; k++)
    buffer[2 + k] = hex_digits[(code_point >> (4 * (7 - k))) & 0xF];

  return seq_len;
}
4477
4478/* Given a token TYPE corresponding to a digraph, return a pointer to
4479 the spelling of the digraph. */
4480static const unsigned char *
4481cpp_digraph2name (enum cpp_ttype type)
4482{
4483 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4484}
4485
4486/* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4487 The buffer must already contain enough space to hold the
4488 token's spelling. Returns a pointer to the character after the
4489 last character written. */
4490unsigned char *
4491_cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4492{
4493 size_t i;
4494 const unsigned char *name = NODE_NAME (ident);
4495
4496 for (i = 0; i < NODE_LEN (ident); i++)
4497 if (name[i] & ~0x7F)
4498 {
4499 i += utf8_to_ucn (buffer, name: name + i) - 1;
4500 buffer += 10;
4501 }
4502 else
4503 *buffer++ = name[i];
4504
4505 return buffer;
4506}
4507
4508/* Write the spelling of a token TOKEN to BUFFER. The buffer must
4509 already contain enough space to hold the token's spelling.
4510 Returns a pointer to the character after the last character written.
4511 FORSTRING is true if this is to be the spelling after translation
4512 phase 1 (with the original spelling of extended identifiers), false
4513 if extended identifiers should always be written using UCNs (there is
4514 no option for always writing them in the internal UTF-8 form).
4515 FIXME: Would be nice if we didn't need the PFILE argument. */
4516unsigned char *
4517cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4518 unsigned char *buffer, bool forstring)
4519{
4520 switch (TOKEN_SPELL (token))
4521 {
4522 case SPELL_OPERATOR:
4523 {
4524 const unsigned char *spelling;
4525 unsigned char c;
4526
4527 if (token->flags & DIGRAPH)
4528 spelling = cpp_digraph2name (type: token->type);
4529 else if (token->flags & NAMED_OP)
4530 goto spell_ident;
4531 else
4532 spelling = TOKEN_NAME (token);
4533
4534 while ((c = *spelling++) != '\0')
4535 *buffer++ = c;
4536 }
4537 break;
4538
4539 spell_ident:
4540 case SPELL_IDENT:
4541 if (forstring)
4542 {
4543 memcpy (dest: buffer, NODE_NAME (token->val.node.spelling),
4544 NODE_LEN (token->val.node.spelling));
4545 buffer += NODE_LEN (token->val.node.spelling);
4546 }
4547 else
4548 buffer = _cpp_spell_ident_ucns (buffer, ident: token->val.node.node);
4549 break;
4550
4551 case SPELL_LITERAL:
4552 memcpy (dest: buffer, src: token->val.str.text, n: token->val.str.len);
4553 buffer += token->val.str.len;
4554 break;
4555
4556 case SPELL_NONE:
4557 cpp_error (pfile, CPP_DL_ICE,
4558 msgid: "unspellable token %s", TOKEN_NAME (token));
4559 break;
4560 }
4561
4562 return buffer;
4563}
4564
4565/* Returns TOKEN spelt as a null-terminated string. The string is
4566 freed when the reader is destroyed. Useful for diagnostics. */
4567unsigned char *
4568cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4569{
4570 unsigned int len = cpp_token_len (token) + 1;
4571 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4572
4573 end = cpp_spell_token (pfile, token, buffer: start, forstring: false);
4574 end[0] = '\0';
4575
4576 return start;
4577}
4578
4579/* Returns a pointer to a string which spells the token defined by
4580 TYPE and FLAGS. Used by C front ends, which really should move to
4581 using cpp_token_as_text. */
4582const char *
4583cpp_type2name (enum cpp_ttype type, unsigned char flags)
4584{
4585 if (flags & DIGRAPH)
4586 return (const char *) cpp_digraph2name (type);
4587 else if (flags & NAMED_OP)
4588 return cpp_named_operator2name (type);
4589
4590 return (const char *) token_spellings[type].name;
4591}
4592
4593/* Writes the spelling of token to FP, without any preceding space.
4594 Separated from cpp_spell_token for efficiency - to avoid stdio
4595 double-buffering. */
4596void
4597cpp_output_token (const cpp_token *token, FILE *fp)
4598{
4599 switch (TOKEN_SPELL (token))
4600 {
4601 case SPELL_OPERATOR:
4602 {
4603 const unsigned char *spelling;
4604 int c;
4605
4606 if (token->flags & DIGRAPH)
4607 spelling = cpp_digraph2name (type: token->type);
4608 else if (token->flags & NAMED_OP)
4609 goto spell_ident;
4610 else
4611 spelling = TOKEN_NAME (token);
4612
4613 c = *spelling;
4614 do
4615 putc (c, fp);
4616 while ((c = *++spelling) != '\0');
4617 }
4618 break;
4619
4620 spell_ident:
4621 case SPELL_IDENT:
4622 {
4623 size_t i;
4624 const unsigned char * name = NODE_NAME (token->val.node.node);
4625 unsigned len = NODE_LEN (token->val.node.node);
4626
4627 for (i = 0; i < len; i++)
4628 if (name[i] & ~0x7F)
4629 {
4630 unsigned char buffer[10];
4631 i += utf8_to_ucn (buffer, name: name + i) - 1;
4632 fwrite (buffer, 1, 10, fp);
4633 }
4634 else if (name[i] == ' ' && i == len - 1)
4635 /* Omit terminal space in "export ". */;
4636 else
4637 fputc (NODE_NAME (token->val.node.node)[i], fp);
4638 }
4639 break;
4640
4641 case SPELL_LITERAL:
4642 if (token->type == CPP_HEADER_NAME)
4643 fputc ('"', fp);
4644 fwrite (token->val.str.text, 1, token->val.str.len, fp);
4645 if (token->type == CPP_HEADER_NAME)
4646 fputc ('"', fp);
4647 break;
4648
4649 case SPELL_NONE:
4650 /* An error, most probably. */
4651 break;
4652 }
4653}
4654
4655/* Compare two tokens. */
4656int
4657_cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4658{
4659 if (a->type == b->type && a->flags == b->flags)
4660 switch (TOKEN_SPELL (a))
4661 {
4662 default: /* Keep compiler happy. */
4663 case SPELL_OPERATOR:
4664 /* token_no is used to track where multiple consecutive ##
4665 tokens were originally located. */
4666 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4667 case SPELL_NONE:
4668 return (a->type != CPP_MACRO_ARG
4669 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4670 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4671 case SPELL_IDENT:
4672 return (a->val.node.node == b->val.node.node
4673 && a->val.node.spelling == b->val.node.spelling);
4674 case SPELL_LITERAL:
4675 return (a->val.str.len == b->val.str.len
4676 && !memcmp (s1: a->val.str.text, s2: b->val.str.text,
4677 n: a->val.str.len));
4678 }
4679
4680 return 0;
4681}
4682
4683/* Returns nonzero if a space should be inserted to avoid an
4684 accidental token paste for output. For simplicity, it is
4685 conservative, and occasionally advises a space where one is not
4686 needed, e.g. "." and ".2". */
4687int
4688cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4689 const cpp_token *token2)
4690{
4691 enum cpp_ttype a = token1->type, b = token2->type;
4692 cppchar_t c;
4693
4694 if (token1->flags & NAMED_OP)
4695 a = CPP_NAME;
4696 if (token2->flags & NAMED_OP)
4697 b = CPP_NAME;
4698
4699 c = EOF;
4700 if (token2->flags & DIGRAPH)
4701 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4702 else if (token_spellings[b].category == SPELL_OPERATOR)
4703 c = token_spellings[b].name[0];
4704
4705 /* Quickly get everything that can paste with an '='. */
4706 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4707 return 1;
4708
4709 switch (a)
4710 {
4711 case CPP_GREATER: return c == '>';
4712 case CPP_LESS: return c == '<' || c == '%' || c == ':';
4713 case CPP_PLUS: return c == '+';
4714 case CPP_MINUS: return c == '-' || c == '>';
4715 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4716 case CPP_MOD: return c == ':' || c == '>';
4717 case CPP_AND: return c == '&';
4718 case CPP_OR: return c == '|';
4719 case CPP_COLON: return c == ':' || c == '>';
4720 case CPP_DEREF: return c == '*';
4721 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
4722 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
4723 case CPP_PRAGMA:
4724 case CPP_NAME: return ((b == CPP_NUMBER
4725 && name_p (pfile, string: &token2->val.str))
4726 || b == CPP_NAME
4727 || b == CPP_CHAR || b == CPP_STRING); /* L */
4728 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
4729 || b == CPP_CHAR
4730 || c == '.' || c == '+' || c == '-');
4731 /* UCNs */
4732 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4733 && b == CPP_NAME)
4734 || (CPP_OPTION (pfile, objc)
4735 && token1->val.str.text[0] == '@'
4736 && (b == CPP_NAME || b == CPP_STRING)));
4737 case CPP_LESS_EQ: return c == '>';
4738 case CPP_STRING:
4739 case CPP_WSTRING:
4740 case CPP_UTF8STRING:
4741 case CPP_STRING16:
4742 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4743 && (b == CPP_NAME
4744 || (TOKEN_SPELL (token2) == SPELL_LITERAL
4745 && ISIDST (token2->val.str.text[0]))));
4746
4747 default: break;
4748 }
4749
4750 return 0;
4751}
4752
4753/* Output all the remaining tokens on the current line, and a newline
4754 character, to FP. Leading whitespace is removed. If there are
4755 macros, special token padding is not performed. */
4756void
4757cpp_output_line (cpp_reader *pfile, FILE *fp)
4758{
4759 const cpp_token *token;
4760
4761 token = cpp_get_token (pfile);
4762 while (token->type != CPP_EOF)
4763 {
4764 cpp_output_token (token, fp);
4765 token = cpp_get_token (pfile);
4766 if (token->flags & PREV_WHITE)
4767 putc (' ', fp);
4768 }
4769
4770 putc ('\n', fp);
4771}
4772
4773/* Return a string representation of all the remaining tokens on the
4774 current line. The result is allocated using xmalloc and must be
4775 freed by the caller. */
4776unsigned char *
4777cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4778{
4779 const cpp_token *token;
4780 unsigned int out = dir_name ? ustrlen (s1: dir_name) : 0;
4781 unsigned int alloced = 120 + out;
4782 unsigned char *result = (unsigned char *) xmalloc (alloced);
4783
4784 /* If DIR_NAME is empty, there are no initial contents. */
4785 if (dir_name)
4786 {
4787 sprintf (s: (char *) result, format: "#%s ", dir_name);
4788 out += 2;
4789 }
4790
4791 token = cpp_get_token (pfile);
4792 while (token->type != CPP_EOF)
4793 {
4794 unsigned char *last;
4795 /* Include room for a possible space and the terminating nul. */
4796 unsigned int len = cpp_token_len (token) + 2;
4797
4798 if (out + len > alloced)
4799 {
4800 alloced *= 2;
4801 if (out + len > alloced)
4802 alloced = out + len;
4803 result = (unsigned char *) xrealloc (result, alloced);
4804 }
4805
4806 last = cpp_spell_token (pfile, token, buffer: &result[out], forstring: 0);
4807 out = last - result;
4808
4809 token = cpp_get_token (pfile);
4810 if (token->flags & PREV_WHITE)
4811 result[out++] = ' ';
4812 }
4813
4814 result[out] = '\0';
4815 return result;
4816}
4817
4818/* Memory buffers. Changing these three constants can have a dramatic
4819 effect on performance. The values here are reasonable defaults,
4820 but might be tuned. If you adjust them, be sure to test across a
4821 range of uses of cpplib, including heavy nested function-like macro
4822 expansion. Also check the change in peak memory usage (NJAMD is a
4823 good tool for this). */
4824#define MIN_BUFF_SIZE 8000
4825#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
4826#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
4827 (MIN_EXTRA + ((BUFF)->limit - (BUFF)->cur) * 2)
4828
4829#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
4830 #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
4831#endif
4832
4833/* Create a new allocation buffer. Place the control block at the end
4834 of the buffer, so that buffer overflows will cause immediate chaos. */
4835static _cpp_buff *
4836new_buff (size_t len)
4837{
4838 _cpp_buff *result;
4839 unsigned char *base;
4840
4841 if (len < MIN_BUFF_SIZE)
4842 len = MIN_BUFF_SIZE;
4843 len = CPP_ALIGN (len);
4844
4845#ifdef ENABLE_VALGRIND_WORKAROUNDS
4846 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4847 struct first. */
4848 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4849 base = XNEWVEC (unsigned char, len + slen);
4850 result = (_cpp_buff *) base;
4851 base += slen;
4852#else
4853 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4854 result = (_cpp_buff *) (base + len);
4855#endif
4856 result->base = base;
4857 result->cur = base;
4858 result->limit = base + len;
4859 result->next = NULL;
4860 return result;
4861}
4862
4863/* Place a chain of unwanted allocation buffers on the free list. */
4864void
4865_cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4866{
4867 _cpp_buff *end = buff;
4868
4869 while (end->next)
4870 end = end->next;
4871 end->next = pfile->free_buffs;
4872 pfile->free_buffs = buff;
4873}
4874
4875/* Return a free buffer of size at least MIN_SIZE. */
4876_cpp_buff *
4877_cpp_get_buff (cpp_reader *pfile, size_t min_size)
4878{
4879 _cpp_buff *result, **p;
4880
4881 for (p = &pfile->free_buffs;; p = &(*p)->next)
4882 {
4883 size_t size;
4884
4885 if (*p == NULL)
4886 return new_buff (len: min_size);
4887 result = *p;
4888 size = result->limit - result->base;
4889 /* Return a buffer that's big enough, but don't waste one that's
4890 way too big. */
4891 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4892 break;
4893 }
4894
4895 *p = result->next;
4896 result->next = NULL;
4897 result->cur = result->base;
4898 return result;
4899}
4900
4901/* Creates a new buffer with enough space to hold the uncommitted
4902 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4903 the excess bytes to the new buffer. Chains the new buffer after
4904 BUFF, and returns the new buffer. */
4905_cpp_buff *
4906_cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4907{
4908 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4909 _cpp_buff *new_buff = _cpp_get_buff (pfile, min_size: size);
4910
4911 buff->next = new_buff;
4912 memcpy (dest: new_buff->base, src: buff->cur, BUFF_ROOM (buff));
4913 return new_buff;
4914}
4915
4916/* Creates a new buffer with enough space to hold the uncommitted
4917 remaining bytes of the buffer pointed to by BUFF, and at least
4918 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4919 Chains the new buffer before the buffer pointed to by BUFF, and
4920 updates the pointer to point to the new buffer. */
4921void
4922_cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4923{
4924 _cpp_buff *new_buff, *old_buff = *pbuff;
4925 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4926
4927 new_buff = _cpp_get_buff (pfile, min_size: size);
4928 memcpy (dest: new_buff->base, src: old_buff->cur, BUFF_ROOM (old_buff));
4929 new_buff->next = old_buff;
4930 *pbuff = new_buff;
4931}
4932
4933/* Free a chain of buffers starting at BUFF. */
4934void
4935_cpp_free_buff (_cpp_buff *buff)
4936{
4937 _cpp_buff *next;
4938
4939 for (; buff; buff = next)
4940 {
4941 next = buff->next;
4942#ifdef ENABLE_VALGRIND_WORKAROUNDS
4943 free (buff);
4944#else
4945 free (ptr: buff->base);
4946#endif
4947 }
4948}
4949
4950/* Allocate permanent, unaligned storage of length LEN. */
4951unsigned char *
4952_cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4953{
4954 _cpp_buff *buff = pfile->u_buff;
4955 unsigned char *result = buff->cur;
4956
4957 if (len > (size_t) (buff->limit - result))
4958 {
4959 buff = _cpp_get_buff (pfile, min_size: len);
4960 buff->next = pfile->u_buff;
4961 pfile->u_buff = buff;
4962 result = buff->cur;
4963 }
4964
4965 buff->cur = result + len;
4966 return result;
4967}
4968
4969/* Allocate permanent, unaligned storage of length LEN from a_buff.
4970 That buffer is used for growing allocations when saving macro
4971 replacement lists in a #define, and when parsing an answer to an
4972 assertion in #assert, #unassert or #if (and therefore possibly
4973 whilst expanding macros). It therefore must not be used by any
4974 code that they might call: specifically the lexer and the guts of
4975 the macro expander.
4976
4977 All existing other uses clearly fit this restriction: storing
4978 registered pragmas during initialization. */
4979unsigned char *
4980_cpp_aligned_alloc (cpp_reader *pfile, size_t len)
4981{
4982 _cpp_buff *buff = pfile->a_buff;
4983 unsigned char *result = buff->cur;
4984
4985 if (len > (size_t) (buff->limit - result))
4986 {
4987 buff = _cpp_get_buff (pfile, min_size: len);
4988 buff->next = pfile->a_buff;
4989 pfile->a_buff = buff;
4990 result = buff->cur;
4991 }
4992
4993 buff->cur = result + len;
4994 return result;
4995}
4996
4997/* Commit or allocate storage from a buffer. */
4998
4999void *
5000_cpp_commit_buff (cpp_reader *pfile, size_t size)
5001{
5002 const auto buff = pfile->a_buff;
5003 void *ptr = BUFF_FRONT (buff);
5004
5005 if (pfile->hash_table->alloc_subobject)
5006 {
5007 void *copy = pfile->hash_table->alloc_subobject (size);
5008 memcpy (dest: copy, src: ptr, n: size);
5009 ptr = copy;
5010 }
5011 else
5012 {
5013 BUFF_FRONT (buff) += size;
5014 /* Make sure the remaining space is maximally aligned for whatever this
5015 buffer holds next. */
5016 BUFF_FRONT (buff) += BUFF_ROOM (buff) % DEFAULT_ALIGNMENT;
5017 }
5018
5019 return ptr;
5020}
5021
5022/* Say which field of TOK is in use. */
5023
5024enum cpp_token_fld_kind
5025cpp_token_val_index (const cpp_token *tok)
5026{
5027 switch (TOKEN_SPELL (tok))
5028 {
5029 case SPELL_IDENT:
5030 return CPP_TOKEN_FLD_NODE;
5031 case SPELL_LITERAL:
5032 return CPP_TOKEN_FLD_STR;
5033 case SPELL_OPERATOR:
5034 /* Operands which were originally spelled as ident keep around
5035 the node for the exact spelling. */
5036 if (tok->flags & NAMED_OP)
5037 return CPP_TOKEN_FLD_NODE;
5038 else if (tok->type == CPP_PASTE)
5039 return CPP_TOKEN_FLD_TOKEN_NO;
5040 else
5041 return CPP_TOKEN_FLD_NONE;
5042 case SPELL_NONE:
5043 if (tok->type == CPP_MACRO_ARG)
5044 return CPP_TOKEN_FLD_ARG_NO;
5045 else if (tok->type == CPP_PADDING)
5046 return CPP_TOKEN_FLD_SOURCE;
5047 else if (tok->type == CPP_PRAGMA)
5048 return CPP_TOKEN_FLD_PRAGMA;
5049 /* fall through */
5050 default:
5051 return CPP_TOKEN_FLD_NONE;
5052 }
5053}
5054
5055/* All tokens lexed in R after calling this function will be forced to
5056 have their location_t to be P, until
5057 cpp_stop_forcing_token_locations is called for R. */
5058
5059void
5060cpp_force_token_locations (cpp_reader *r, location_t loc)
5061{
5062 r->forced_token_location = loc;
5063}
5064
5065/* Go back to assigning locations naturally for lexed tokens. */
5066
5067void
5068cpp_stop_forcing_token_locations (cpp_reader *r)
5069{
5070 r->forced_token_location = 0;
5071}
5072
/* PEEK points at a backslash.  If it escapes an end of line ("\n",
   "\r\n" or a lone "\r"), look past it, and keep going while the
   character found there is another such backslash.  Returns the
   position reached.  The position never advances to LIMIT or beyond:
   if skipping would do so, the current position is returned.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* Treat "\r\n" as one line ending.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        /* Not an escaped end of line.  */
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;
      /* The user might be perverse: check for another backslash.  */
      if (*peek != '\\')
        return peek;
    }
}
5102
5103static const unsigned char *
5104do_peek_next (const unsigned char *peek, const unsigned char *limit)
5105{
5106 if (__builtin_expect (*peek == '\\', false))
5107 peek = do_peek_backslash (peek, limit);
5108 return peek;
5109}
5110
/* Step back one character from PEEK, never going before BOUND, and
   return a pointer to the character found.  If that character is a
   line ending ("\n", "\r\n" or "\r") immediately preceded by a
   backslash, the whole escaped line ending is skipped and the step is
   repeated from the backslash.  Returns NULL when there is nothing
   left to step back to.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  while (peek != bound)
    {
      const unsigned char c = *--peek;
      int ix = -1;

      if (!(__builtin_expect (c == '\n', false)
            || __builtin_expect (c == '\r', false)))
        return peek;

      /* Nothing before the line ending to inspect.  */
      if (peek == bound)
        return peek;

      /* For "\r\n", the backslash to look for is one further back.  */
      if (c == '\n' && peek[ix] == '\r')
        {
          if (peek + ix == bound)
            return peek;
          ix--;
        }

      if (peek[ix] != '\\')
        return peek;

      /* Escaped line ending: continue from the backslash.  */
      peek += ix;
    }

  return NULL;
}
5139
5140/* If PEEK[-1] is identifier MATCH, scan past it and trailing white
5141 space. Otherwise return NULL. */
5142
5143static const unsigned char *
5144do_peek_ident (const char *match, const unsigned char *peek,
5145 const unsigned char *limit)
5146{
5147 for (; *++match; peek++)
5148 if (*peek != *match)
5149 {
5150 peek = do_peek_next (peek, limit);
5151 if (*peek != *match)
5152 return NULL;
5153 }
5154
5155 /* Must now not be looking at an identifier char. */
5156 peek = do_peek_next (peek, limit);
5157 if (ISIDNUM (*peek))
5158 return NULL;
5159
5160 /* Skip control-line whitespace. */
5161 ws:
5162 while (*peek == ' ' || *peek == '\t')
5163 peek++;
5164 if (__builtin_expect (*peek == '\\', false))
5165 {
5166 peek = do_peek_backslash (peek, limit);
5167 if (*peek != '\\')
5168 goto ws;
5169 }
5170
5171 return peek;
5172}
5173
5174/* Are we looking at a module control line starting as PEEK - 1? */
5175
5176static bool
5177do_peek_module (cpp_reader *pfile, unsigned char c,
5178 const unsigned char *peek, const unsigned char *limit)
5179{
5180 bool import = false;
5181
5182 if (__builtin_expect (c == 'e', false))
5183 {
5184 if (!((peek[0] == 'x' || peek[0] == '\\')
5185 && (peek = do_peek_ident (match: "export", peek, limit))))
5186 return false;
5187
5188 /* export, peek for import or module. No need to peek __import
5189 here. */
5190 if (peek[0] == 'i')
5191 {
5192 if (!((peek[1] == 'm' || peek[1] == '\\')
5193 && (peek = do_peek_ident (match: "import", peek: peek + 1, limit))))
5194 return false;
5195 import = true;
5196 }
5197 else if (peek[0] == 'm')
5198 {
5199 if (!((peek[1] == 'o' || peek[1] == '\\')
5200 && (peek = do_peek_ident (match: "module", peek: peek + 1, limit))))
5201 return false;
5202 }
5203 else
5204 return false;
5205 }
5206 else if (__builtin_expect (c == 'i', false))
5207 {
5208 if (!((peek[0] == 'm' || peek[0] == '\\')
5209 && (peek = do_peek_ident (match: "import", peek, limit))))
5210 return false;
5211 import = true;
5212 }
5213 else if (__builtin_expect (c == '_', false))
5214 {
5215 /* Needed for translated includes. */
5216 if (!((peek[0] == '_' || peek[0] == '\\')
5217 && (peek = do_peek_ident (match: "__import", peek, limit))))
5218 return false;
5219 import = true;
5220 }
5221 else if (__builtin_expect (c == 'm', false))
5222 {
5223 if (!((peek[0] == 'o' || peek[0] == '\\')
5224 && (peek = do_peek_ident (match: "module", peek, limit))))
5225 return false;
5226 }
5227 else
5228 return false;
5229
5230 /* Peek the next character to see if it's good enough. We'll be at
5231 the first non-whitespace char, including skipping an escaped
5232 newline. */
5233 /* ... import followed by identifier, ':', '<' or header-name
5234 preprocessing tokens, or module followed by identifier, ':' or
5235 ';' preprocessing tokens. */
5236 unsigned char p = *peek++;
5237
5238 /* A character literal is ... single quotes, ... optionally preceded
5239 by u8, u, U, or L */
5240 /* A string-literal is a ... double quotes, optionally prefixed by
5241 R, u8, u8R, u, uR, U, UR, L, or LR */
5242 if (p == 'u')
5243 {
5244 peek = do_peek_next (peek, limit);
5245 if (*peek == '8')
5246 {
5247 peek++;
5248 goto peek_u8;
5249 }
5250 goto peek_u;
5251 }
5252 else if (p == 'U' || p == 'L')
5253 {
5254 peek_u8:
5255 peek = do_peek_next (peek, limit);
5256 peek_u:
5257 if (*peek == '\"' || *peek == '\'')
5258 return false;
5259
5260 if (*peek == 'R')
5261 goto peek_R;
5262 /* Identifier. Ok. */
5263 }
5264 else if (p == 'R')
5265 {
5266 peek_R:
5267 if (CPP_OPTION (pfile, rliterals))
5268 {
5269 peek = do_peek_next (peek, limit);
5270 if (*peek == '\"')
5271 return false;
5272 }
5273 /* Identifier. Ok. */
5274 }
5275 else if ('Z' - 'A' == 25
5276 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5277 : ISIDST (p))
5278 {
5279 /* Identifier. Ok. */
5280 }
5281 else if (p == '<')
5282 {
5283 /* Maybe angle header, ok for import. Reject
5284 '<=', '<<' digraph:'<:'. */
5285 if (!import)
5286 return false;
5287 peek = do_peek_next (peek, limit);
5288 if (*peek == '=' || *peek == '<'
5289 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5290 return false;
5291 }
5292 else if (p == ';')
5293 {
5294 /* SEMICOLON, ok for module. */
5295 if (import)
5296 return false;
5297 }
5298 else if (p == '"')
5299 {
5300 /* STRING, ok for import. */
5301 if (!import)
5302 return false;
5303 }
5304 else if (p == ':')
5305 {
5306 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
5307 peek = do_peek_next (peek, limit);
5308 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5309 return false;
5310 }
5311 else
5312 /* FIXME: Detect a unicode character, excluding those not
5313 permitted as the initial character. [lex.name]/1. I presume
5314 we need to check the \[uU] spellings, and directly using
5315 Unicode in say UTF8 form? Or perhaps we do the phase-1
5316 conversion of UTF8 to universal-character-names? */
5317 return false;
5318
5319 return true;
5320}
5321
5322/* Directives-only scanning. Somewhat more relaxed than correct
5323 parsing -- some ill-formed programs will not be rejected. */
5324
5325void
5326cpp_directive_only_process (cpp_reader *pfile,
5327 void *data,
5328 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5329{
5330 bool module_p = CPP_OPTION (pfile, module_directives);
5331
5332 do
5333 {
5334 restart:
5335 /* Buffer initialization, but no line cleaning. */
5336 cpp_buffer *buffer = pfile->buffer;
5337 buffer->cur_note = buffer->notes_used = 0;
5338 buffer->cur = buffer->line_base = buffer->next_line;
5339 buffer->need_line = false;
5340 /* Files always end in a newline or carriage return. We rely on this for
5341 character peeking safety. */
5342 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5343
5344 const unsigned char *base = buffer->cur;
5345 unsigned line_count = 0;
5346 const unsigned char *line_start = base;
5347
5348 bool bol = true;
5349 bool raw = false;
5350
5351 const unsigned char *lwm = base;
5352 for (const unsigned char *pos = base, *limit = buffer->rlimit;
5353 pos < limit;)
5354 {
5355 unsigned char c = *pos++;
5356 /* This matches the switch in _cpp_lex_direct. */
5357 switch (c)
5358 {
5359 case ' ': case '\t': case '\f': case '\v':
5360 /* Whitespace, do nothing. */
5361 break;
5362
5363 case '\r': /* MAC line ending, or Windows \r\n */
5364 if (*pos == '\n')
5365 pos++;
5366 /* FALLTHROUGH */
5367
5368 case '\n':
5369 bol = true;
5370
5371 next_line:
5372 CPP_INCREMENT_LINE (pfile, 0);
5373 line_count++;
5374 line_start = pos;
5375 break;
5376
5377 case '\\':
5378 /* <backslash><newline> is removed, and doesn't undo any
5379 preceeding escape or whatnot. */
5380 if (*pos == '\n')
5381 {
5382 pos++;
5383 goto next_line;
5384 }
5385 else if (*pos == '\r')
5386 {
5387 if (pos[1] == '\n')
5388 pos++;
5389 pos++;
5390 goto next_line;
5391 }
5392 goto dflt;
5393
5394 case '#':
5395 if (bol)
5396 {
5397 /* Line directive. */
5398 if (pos - 1 > base && !pfile->state.skipping)
5399 cb (pfile, CPP_DO_print, data,
5400 line_count, base, pos - 1 - base);
5401
5402 /* Prep things for directive handling. */
5403 buffer->next_line = pos;
5404 buffer->need_line = true;
5405 bool ok = _cpp_get_fresh_line (pfile);
5406 gcc_checking_assert (ok);
5407
5408 /* Ensure proper column numbering for generated
5409 error messages. */
5410 buffer->line_base -= pos - line_start;
5411
5412 if (_cpp_handle_directive (pfile, line_start + 1 != pos) == 2)
5413 {
5414 if (pfile->directive_result.type != CPP_PADDING)
5415 cb (pfile, CPP_DO_token, data,
5416 &pfile->directive_result, pfile->directive_result.src_loc);
5417 if (pfile->context->prev)
5418 {
5419 gcc_assert (pfile->context->tokens_kind == TOKENS_KIND_DIRECT);
5420 for (const cpp_token *tok = FIRST (pfile->context).token;
5421 tok != LAST (pfile->context).token; ++tok)
5422 cb (pfile, CPP_DO_token, data, tok, tok->src_loc);
5423 _cpp_pop_context (pfile);
5424 }
5425 }
5426
5427 /* Sanitize the line settings. Duplicate #include's can
5428 mess things up. */
5429 // FIXME: Necessary?
5430 pfile->line_table->highest_location
5431 = pfile->line_table->highest_line;
5432
5433 if (!pfile->state.skipping
5434 && pfile->buffer->next_line < pfile->buffer->rlimit)
5435 cb (pfile, CPP_DO_location, data,
5436 pfile->line_table->highest_line);
5437
5438 goto restart;
5439 }
5440 goto dflt;
5441
5442 case '/':
5443 {
5444 const unsigned char *peek = do_peek_next (peek: pos, limit);
5445 if (!(*peek == '/' || *peek == '*'))
5446 goto dflt;
5447
5448 /* Line or block comment */
5449 bool is_block = *peek == '*';
5450 bool star = false;
5451 bool esc = false;
5452 location_t sloc
5453 = linemap_position_for_column (pfile->line_table,
5454 pos - line_start);
5455
5456 while (pos < limit)
5457 {
5458 char c = *pos++;
5459 switch (c)
5460 {
5461 case '\\':
5462 esc = true;
5463 break;
5464
5465 case '\r':
5466 if (*pos == '\n')
5467 pos++;
5468 /* FALLTHROUGH */
5469
5470 case '\n':
5471 {
5472 CPP_INCREMENT_LINE (pfile, 0);
5473 line_count++;
5474 line_start = pos;
5475 if (!esc && !is_block)
5476 {
5477 bol = true;
5478 goto done_comment;
5479 }
5480 }
5481 if (!esc)
5482 star = false;
5483 esc = false;
5484 break;
5485
5486 case '*':
5487 if (pos > peek)
5488 star = is_block;
5489 esc = false;
5490 break;
5491
5492 case '/':
5493 if (star)
5494 goto done_comment;
5495 /* FALLTHROUGH */
5496
5497 default:
5498 star = false;
5499 esc = false;
5500 break;
5501 }
5502 }
5503 if (pos < limit || is_block)
5504 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5505 msgid: "unterminated comment");
5506 done_comment:
5507 lwm = pos;
5508 break;
5509 }
5510
5511 case '\'':
5512 if (!CPP_OPTION (pfile, digit_separators))
5513 goto delimited_string;
5514
5515 /* Possibly a number punctuator. */
5516 if (!ISIDNUM (*do_peek_next (pos, limit)))
5517 goto delimited_string;
5518
5519 goto quote_peek;
5520
5521 case '\"':
5522 if (!CPP_OPTION (pfile, rliterals))
5523 goto delimited_string;
5524
5525 quote_peek:
5526 {
5527 /* For ' see if it's a number punctuator
5528 \.?<digit>(<digit>|<identifier-nondigit>
5529 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5530 /* For " see if it's a raw string
5531 {U,L,u,u8}R. This includes CPP_NUMBER detection,
5532 because that could be 0e+R. */
5533 const unsigned char *peek = pos - 1;
5534 bool quote_first = c == '"';
5535 bool quote_eight = false;
5536 bool maybe_number_start = false;
5537 bool want_number = false;
5538
5539 while ((peek = do_peek_prev (peek, bound: lwm)))
5540 {
5541 unsigned char p = *peek;
5542 if (quote_first)
5543 {
5544 if (!raw)
5545 {
5546 if (p != 'R')
5547 break;
5548 raw = true;
5549 continue;
5550 }
5551
5552 quote_first = false;
5553 if (p == 'L' || p == 'U' || p == 'u')
5554 ;
5555 else if (p == '8')
5556 quote_eight = true;
5557 else
5558 goto second_raw;
5559 }
5560 else if (quote_eight)
5561 {
5562 if (p != 'u')
5563 {
5564 raw = false;
5565 break;
5566 }
5567 quote_eight = false;
5568 }
5569 else if (c == '"')
5570 {
5571 second_raw:;
5572 if (!want_number && ISIDNUM (p))
5573 {
5574 raw = false;
5575 break;
5576 }
5577 }
5578
5579 if (ISDIGIT (p))
5580 maybe_number_start = true;
5581 else if (p == '.')
5582 want_number = true;
5583 else if (ISIDNUM (p))
5584 maybe_number_start = false;
5585 else if (p == '+' || p == '-')
5586 {
5587 if (const unsigned char *peek_prev
5588 = do_peek_prev (peek, bound: lwm))
5589 {
5590 p = *peek_prev;
5591 if (p == 'e' || p == 'E'
5592 || p == 'p' || p == 'P')
5593 {
5594 want_number = true;
5595 maybe_number_start = false;
5596 }
5597 else
5598 break;
5599 }
5600 else
5601 break;
5602 }
5603 else if (p == '\'' || p == '\"')
5604 {
5605 /* If this is lwm, this must be the end of a
5606 previous string. So this is a trailing
5607 literal type, (a) if those are allowed,
5608 and (b) maybe_start is false. Otherwise
5609 this must be a CPP_NUMBER because we've
5610 met another ', and we'd have checked that
5611 in its own right. */
5612 if (peek == lwm && CPP_OPTION (pfile, uliterals))
5613 {
5614 if (!maybe_number_start && !want_number)
5615 /* Must be a literal type. */
5616 raw = false;
5617 }
5618 else if (p == '\''
5619 && CPP_OPTION (pfile, digit_separators))
5620 maybe_number_start = true;
5621 break;
5622 }
5623 else if (c == '\'')
5624 break;
5625 else if (!quote_first && !quote_eight)
5626 break;
5627 }
5628
5629 if (maybe_number_start)
5630 {
5631 if (c == '\'')
5632 /* A CPP NUMBER. */
5633 goto dflt;
5634 raw = false;
5635 }
5636
5637 goto delimited_string;
5638 }
5639
5640 delimited_string:
5641 {
5642 /* (Possibly raw) string or char literal. */
5643 unsigned char end = c;
5644 int delim_len = -1;
5645 const unsigned char *delim = NULL;
5646 location_t sloc = linemap_position_for_column (pfile->line_table,
5647 pos - line_start);
5648 int esc = 0;
5649
5650 if (raw)
5651 {
5652 /* There can be no line breaks in the delimiter. */
5653 delim = pos;
5654 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5655 {
5656 if (delim_len == 16)
5657 {
5658 cpp_error_with_line (pfile, CPP_DL_ERROR,
5659 sloc, 0,
5660 msgid: "raw string delimiter"
5661 " longer than %d"
5662 " characters",
5663 delim_len);
5664 raw = false;
5665 pos = delim;
5666 break;
5667 }
5668 if (strchr (s: ") \\\t\v\f\n", c: c))
5669 {
5670 cpp_error_with_line (pfile, CPP_DL_ERROR,
5671 sloc, 0,
5672 msgid: "invalid character '%c'"
5673 " in raw string"
5674 " delimiter", c);
5675 raw = false;
5676 pos = delim;
5677 break;
5678 }
5679 if (pos >= limit)
5680 goto bad_string;
5681 }
5682 }
5683
5684 while (pos < limit)
5685 {
5686 char c = *pos++;
5687 switch (c)
5688 {
5689 case '\\':
5690 if (!raw)
5691 esc++;
5692 break;
5693
5694 case '\r':
5695 if (*pos == '\n')
5696 pos++;
5697 /* FALLTHROUGH */
5698
5699 case '\n':
5700 {
5701 CPP_INCREMENT_LINE (pfile, 0);
5702 line_count++;
5703 line_start = pos;
5704 }
5705 if (esc)
5706 esc--;
5707 break;
5708
5709 case ')':
5710 if (raw
5711 && pos + delim_len + 1 < limit
5712 && pos[delim_len] == end
5713 && !memcmp (s1: delim, s2: pos, n: delim_len))
5714 {
5715 pos += delim_len + 1;
5716 raw = false;
5717 goto done_string;
5718 }
5719 break;
5720
5721 default:
5722 if (!raw && !(esc & 1) && c == end)
5723 goto done_string;
5724 esc = 0;
5725 break;
5726 }
5727 }
5728 bad_string:
5729 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5730 msgid: "unterminated literal");
5731
5732 done_string:
5733 raw = false;
5734 lwm = pos - 1;
5735 }
5736 goto dflt;
5737
5738 case '_':
5739 case 'e':
5740 case 'i':
5741 case 'm':
5742 if (bol && module_p && !pfile->state.skipping
5743 && do_peek_module (pfile, c, peek: pos, limit))
5744 {
5745 /* We've seen the start of a module control line.
5746 Start up the tokenizer. */
5747 pos--; /* Backup over the first character. */
5748
5749 /* Backup over whitespace to start of line. */
5750 while (pos > line_start
5751 && (pos[-1] == ' ' || pos[-1] == '\t'))
5752 pos--;
5753
5754 if (pos > base)
5755 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5756
5757 /* Prep things for directive handling. */
5758 buffer->next_line = pos;
5759 buffer->need_line = true;
5760
5761 /* Now get tokens until the PRAGMA_EOL. */
5762 do
5763 {
5764 location_t spelling;
5765 const cpp_token *tok
5766 = cpp_get_token_with_location (pfile, &spelling);
5767
5768 gcc_assert (pfile->state.in_deferred_pragma
5769 || tok->type == CPP_PRAGMA_EOL);
5770 cb (pfile, CPP_DO_token, data, tok, spelling);
5771 }
5772 while (pfile->state.in_deferred_pragma);
5773
5774 if (pfile->buffer->next_line < pfile->buffer->rlimit)
5775 cb (pfile, CPP_DO_location, data,
5776 pfile->line_table->highest_line);
5777
5778 pfile->mi_valid = false;
5779 goto restart;
5780 }
5781 goto dflt;
5782
5783 default:
5784 dflt:
5785 bol = false;
5786 pfile->mi_valid = false;
5787 break;
5788 }
5789 }
5790
5791 if (buffer->rlimit > base && !pfile->state.skipping)
5792 {
5793 const unsigned char *limit = buffer->rlimit;
5794 /* If the file was not newline terminated, add rlimit, which is
5795 guaranteed to point to a newline, to the end of our range. */
5796 if (limit[-1] != '\n')
5797 {
5798 limit++;
5799 CPP_INCREMENT_LINE (pfile, 0);
5800 line_count++;
5801 }
5802 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5803 }
5804
5805 _cpp_pop_buffer (pfile);
5806 }
5807 while (pfile->buffer);
5808}
5809

source code of libcpp/lex.cc