1/* CPP Library - lexical analysis.
2 Copyright (C) 2000-2026 Free Software Foundation, Inc.
3 Contributed by Per Bothner, 1994-95.
4 Based on CCCP program by Paul Rubin, June 1986
5 Adapted to ANSI C, Richard Stallman, Jan 1987
6 Broken out to separate file, Zack Weinberg, Mar 2000
7
8This program is free software; you can redistribute it and/or modify it
9under the terms of the GNU General Public License as published by the
10Free Software Foundation; either version 3, or (at your option) any
11later version.
12
13This program is distributed in the hope that it will be useful,
14but WITHOUT ANY WARRANTY; without even the implied warranty of
15MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16GNU General Public License for more details.
17
18You should have received a copy of the GNU General Public License
19along with this program; see the file COPYING3. If not see
20<http://www.gnu.org/licenses/>. */
21
22#include "config.h"
23#include "system.h"
24#include "cpplib.h"
25#include "internal.h"
26
/* How the spelling of a token type is recorded; stored in the `category'
   field of each token_spellings entry.  OP entries of TTYPE_TABLE are
   always given SPELL_OPERATOR, while TK entries name their category
   explicitly (it is pasted onto the SPELL_ prefix).  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};
34
/* One row of the token_spellings table, which is indexed by token type.  */
struct token_spelling
{
  /* Spelling category of this token type; see enum spell_type.  */
  enum spell_type category;
  /* For OP rows the operator text, for TK rows the stringified name of
     the token-type enumerator.  */
  const unsigned char *name;
};
40
/* Spellings of the digraph tokens.
   NOTE(review): the order of the entries presumably matters to the code
   that indexes this table; its users are elsewhere in the file, so
   confirm before reordering.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
43
/* Build token_spellings by expanding TTYPE_TABLE with OP and TK
   temporarily defined to emit one token_spelling initializer per entry:
   OP (e, s) records category SPELL_OPERATOR and the operator text S;
   TK (e, s) records category SPELL_<s> and the stringified enumerator
   name E.  Both helper macros are undefined again right after use.  */
#define OP(e, s) { SPELL_OPERATOR, UC s },
#define TK(e, s) { SPELL_ ## s, UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Accessors for the spelling category and the spelling name of TOKEN,
   looked up in token_spellings by TOKEN's type.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)
52
/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive,
   so UCS_LIMIT is the largest value in that codespace.  */
#define UCS_LIMIT 0x10FFFF
55
/* Forward declarations of helpers that are local to this file.  */
static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
static void store_comment (cpp_reader *, cpp_token *);
static void create_literal (cpp_reader *, cpp_token *, const uchar *,
                            unsigned int, enum cpp_ttype);
static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
static int name_p (cpp_reader *, const cpp_string *);
static tokenrun *next_tokenrun (tokenrun *);

static _cpp_buff *new_buff (size_t);
69
70
71/* Utility routine:
72
73 Compares, the token TOKEN to the NUL-terminated string STRING.
74 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */
75int
76cpp_ideq (const cpp_token *token, const char *string)
77{
78 if (token->type != CPP_NAME)
79 return 0;
80
81 return !ustrcmp (NODE_NAME (token->val.node.node), s2: (const uchar *) string);
82}
83
84/* Record a note TYPE at byte POS into the current cleaned logical
85 line. */
86static void
87add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
88{
89 if (buffer->notes_used == buffer->notes_cap)
90 {
91 buffer->notes_cap = buffer->notes_cap * 2 + 200;
92 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
93 buffer->notes_cap);
94 }
95
96 buffer->notes[buffer->notes_used].pos = pos;
97 buffer->notes[buffer->notes_used].type = type;
98 buffer->notes_used++;
99}
100
101
102/* Fast path to find line special characters using optimized character
103 scanning algorithms. Anything complicated falls back to the slow
104 path below. Since this loop is very hot it's worth doing these kinds
105 of optimizations.
106
107 One of the paths through the ifdefs should provide
108
109 const uchar *search_line_fast (const uchar *s, const uchar *end);
110
111 Between S and END, search for \n, \r, \\, ?. Return a pointer to
112 the found character.
113
114 Note that the last character of the buffer is *always* a newline,
115 as forced by _cpp_convert_input. This fact can be used to avoid
116 explicitly looking for the end of the buffer. */
117
/* Configure gives us an ifdef test.  Give the macro the value 0 when it
   is not defined, so that the code below can test it in ordinary `if'
   conditions and in `#if' expressions.  */
#ifndef WORDS_BIGENDIAN
#define WORDS_BIGENDIAN 0
#endif
122
/* We'd like the largest integer that fits into a register.  There's nothing
   in <stdint.h> that gives us that.  For most hosts this is unsigned long,
   but MS decided on an LLP64 model.  Thankfully when building with GCC we
   can get the "real" word size.  */
#ifdef __GNUC__
typedef unsigned int word_type __attribute__((__mode__(__word__)));
#else
typedef unsigned long word_type;
#endif

/* The code below is only expecting sizes 4 or 8.
   Die at compile-time if this expectation is violated: the array bound
   evaluates to 1 when the size is acceptable and to -1 (an invalid
   array size) otherwise.  */
typedef char check_word_type_size
  [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
137
138/* Return X with the first N bytes forced to values that won't match one
139 of the interesting characters. Note that NUL is not interesting. */
140
141static inline word_type
142acc_char_mask_misalign (word_type val, unsigned int n)
143{
144 word_type mask = -1;
145 if (WORDS_BIGENDIAN)
146 mask >>= n * 8;
147 else
148 mask <<= n * 8;
149 return val & mask;
150}
151
152/* Return X replicated to all byte positions within WORD_TYPE. */
153
154static inline word_type
155acc_char_replicate (uchar x)
156{
157 word_type ret;
158
159 ret = (x << 24) | (x << 16) | (x << 8) | x;
160 if (sizeof(word_type) == 8)
161 ret = (ret << 16 << 16) | ret;
162 return ret;
163}
164
165/* Return non-zero if some byte of VAL is (probably) C. */
166
167static inline word_type
168acc_char_cmp (word_type val, word_type c)
169{
170#if defined(__GNUC__) && defined(__alpha__)
171 /* We can get exact results using a compare-bytes instruction.
172 Get (val == c) via (0 >= (val ^ c)). */
173 return __builtin_alpha_cmpbge (0, val ^ c);
174#else
175 word_type magic = 0x7efefefeU;
176 if (sizeof(word_type) == 8)
177 magic = (magic << 16 << 16) | 0xfefefefeU;
178 magic |= 1;
179
180 val ^= c;
181 return ((val + magic) ^ ~val) & ~magic;
182#endif
183}
184
185/* Given the result of acc_char_cmp is non-zero, return the index of
186 the found character. If this was a false positive, return -1. */
187
188static inline int
189acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
190 word_type val ATTRIBUTE_UNUSED)
191{
192#if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
193 /* The cmpbge instruction sets *bits* of the result corresponding to
194 matches in the bytes with no false positives. */
195 return __builtin_ctzl (cmp);
196#else
197 unsigned int i;
198
199 /* ??? It would be nice to force unrolling here,
200 and have all of these constants folded. */
201 for (i = 0; i < sizeof(word_type); ++i)
202 {
203 uchar c;
204 if (WORDS_BIGENDIAN)
205 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
206 else
207 c = (val >> i * 8) & 0xff;
208
209 if (c == '\n' || c == '\r' || c == '\\' || c == '?')
210 return i;
211 }
212
213 return -1;
214#endif
215}
216
217/* A version of the fast scanner using bit fiddling techniques.
218
219 For 32-bit words, one would normally perform 16 comparisons and
220 16 branches. With this algorithm one performs 24 arithmetic
221 operations and one branch. Whether this is faster with a 32-bit
222 word size is going to be somewhat system dependent.
223
224 For 64-bit words, we eliminate twice the number of comparisons
225 and branches without increasing the number of arithmetic operations.
226 It's almost certainly going to be a win with 64-bit word size. */
227
228static inline const uchar *
229search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
230{
231 const word_type repl_nl = acc_char_replicate (x: '\n');
232 const word_type repl_cr = acc_char_replicate (x: '\r');
233 const word_type repl_bs = acc_char_replicate (x: '\\');
234 const word_type repl_qm = acc_char_replicate (x: '?');
235
236 unsigned int misalign;
237 const word_type *p;
238 word_type val, t;
239
240 /* Align the buffer. Mask out any bytes from before the beginning. */
241 p = (word_type *)((uintptr_t)s & -sizeof(word_type));
242 val = *p;
243 misalign = (uintptr_t)s & (sizeof(word_type) - 1);
244 if (misalign)
245 val = acc_char_mask_misalign (val, n: misalign);
246
247 /* Main loop. */
248 while (1)
249 {
250 t = acc_char_cmp (val, c: repl_nl);
251 t |= acc_char_cmp (val, c: repl_cr);
252 t |= acc_char_cmp (val, c: repl_bs);
253 t |= acc_char_cmp (val, c: repl_qm);
254
255 if (__builtin_expect (t != 0, 0))
256 {
257 int i = acc_char_index (cmp: t, val);
258 if (i >= 0)
259 return (const uchar *)p + i;
260 }
261
262 val = *++p;
263 }
264}
265
266/* Disable on Solaris 2/x86 until the following problem can be properly
267 autoconfed:
268
269 The Solaris 10+ assembler tags objects with the instruction set
270 extensions used, so SSE4.2 executables cannot run on machines that
271 don't support that extension. */
272
273#if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
274
/* Replicated character data to be shared between implementations.
   Recall that outside of a context with vector support we can't
   define compatible vector types, therefore these are all defined
   in terms of raw characters.

   Row 0 is '\n', row 1 is '\r', row 2 is '\\' and row 3 is '?', each
   repeated 16 times.  The array is 16-byte aligned so that a row can be
   loaded as one 16-byte vector.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};
289
290
/* A version of the fast scanner using SSE2 vectorized byte compare insns.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused; the loop relies on the buffer ending in a newline.
   When the translation unit is not compiled with SSE2 enabled, the
   function alone is compiled for that target via the target attribute.  */

static inline const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  /* One vector per searched character, every lane holding that
     character.  */
  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v16qi *p;
  v16qi data, t;

  /* Align the source pointer.  The first block is loaded from the
     16-byte boundary at or below S, so it may include bytes that come
     before S.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 16 bytes at a time.  The first block is
     already loaded, so enter the loop past the load; later iterations
     load the next block and accept matches in every lane.  */
  goto start;
  do
    {
      data = *++p;
      mask = -1;

    start:
      /* Each compare yields all-ones lanes where the bytes are equal.  */
      t = data == repl_nl;
      t |= data == repl_cr;
      t |= data == repl_bs;
      t |= data == repl_qm;
      /* Collapse the lanes into one bit per byte, then drop the bits
         for bytes that lie before S.  */
      found = __builtin_ia32_pmovmskb128 (t);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}
343
344#ifdef HAVE_SSSE3
/* A version of the fast scanner using SSSE3 shuffle (PSHUFB) insns.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused; the loop relies on the buffer ending in a newline and
   on the padding that follows it.  */

static inline const uchar *
#ifndef __SSSE3__
__attribute__((__target__("ssse3")))
#endif
search_line_ssse3 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* Same vector type, but with alignment 1 so it can be loaded from an
     arbitrary address.  */
  typedef v16qi v16qi_u __attribute__ ((__aligned__ (1)));
  /* Helper vector for pshufb-based matching:
     each character C we're searching for is at position (C % 16).
     A data byte is a match exactly when looking it up in LUT gives the
     byte itself back.  Slot 0 holds 1 rather than 0, presumably so that
     a NUL data byte does not compare equal to its own lookup result.  */
  v16qi lut = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '\n', 0, '\\', '\r', 0, '?' };
  static_assert('\n' == 10 && '\r' == 13 && '\\' == 92 && '?' == 63,
                "host character encoding is ASCII");

  v16qi d1, d2, t1, t2;
  /* Unaligned loads, potentially using padding after the final newline.
     The loop below has already loaded the next 32 bytes by the time it
     examines the current ones, hence the required amount of padding.  */
  static_assert (CPP_BUFFER_PADDING >= 64, "");
  d1 = *(const v16qi_u *)s;
  d2 = *(const v16qi_u *)(s + 16);
  unsigned m1, m2, found;
  /* Process two 16-byte chunks per iteration.  */
  do
    {
      t1 = __builtin_ia32_pshufb128 (lut, d1);
      t2 = __builtin_ia32_pshufb128 (lut, d2);
      /* One bit per byte that equals its lookup result.  */
      m1 = __builtin_ia32_pmovmskb128 (t1 == d1);
      m2 = __builtin_ia32_pmovmskb128 (t2 == d2);
      s += 32;
      d1 = *(const v16qi_u *)s;
      d2 = *(const v16qi_u *)(s + 16);
      /* Combine into a single 32-bit mask, first chunk in the low half.  */
      found = m1 + (m2 << 16);
    }
  while (!found);
  /* Prefer to compute 's - 32' here, not spend an extra instruction
     to make a copy of the previous value of 's' in the loop.  */
  __asm__ ("" : "+r"(s));
  return s - 32 + __builtin_ctz (found);
}
385
386#else
/* Work around out-dated assemblers without SSSE3 support: when
   HAVE_SSSE3 is not defined, the SSSE3 scanner is not compiled at all
   and its name simply refers to the SSE2 scanner.  */
#define search_line_ssse3 search_line_sse2
389#endif
390
391#ifdef __SSSE3__
/* No need for CPU probing, just use the best available variant: with
   __SSSE3__ defined, search_line_fast is bound to search_line_ssse3 at
   compile time.  */
#define search_line_fast search_line_ssse3
394#else
395/* Check the CPU capabilities. */
396
397#include "../gcc/config/i386/cpuid.h"
398
/* Signature shared by all line scanners.  */
typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
/* The scanner currently in use.  It starts out as the best variant that
   is known to be usable at compile time and may be replaced by
   init_vectorized_lexer once the CPU has been probed.  */
static search_line_fast_type search_line_fast
#if defined(__SSE2__)
  = search_line_sse2;
#else
  = search_line_acc_char;
#endif
406
407#define HAVE_init_vectorized_lexer 1
408static inline void
409init_vectorized_lexer (void)
410{
411 unsigned ax, bx, cx, dx;
412
413 if (!__get_cpuid (1, &ax, &bx, &cx, &dx))
414 return;
415
416 if (cx & bit_SSSE3)
417 search_line_fast = search_line_ssse3;
418 else if (dx & bit_SSE2)
419 search_line_fast = search_line_sse2;
420}
421#endif
422
423#elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__)
424
/* A version of the fast scanner using AltiVec vectorized byte compares
   and VSX unaligned loads (when VSX is available).  This is otherwise
   the same as the AltiVec version.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused; the loop relies on the buffer ending in a newline.  */

ATTRIBUTE_NO_SANITIZE_UNDEFINED
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  /* One vector per searched character, every lane holding that
     character.  */
  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc zero = { 0 };

  vc data, t;

  /* Main loop processing 16 bytes at a time.  The load is taken
     directly from S, so no masking of leading bytes is needed.  */
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      data = __builtin_vec_vsx_ld (0, s);
      s += 16;

      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  /* Restore s to point to the 16 bytes we just processed.  */
  s -= 16;

  {
    /* Number of `long' words in one vector.  */
#define N (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero, advancing S past each
       all-zero word.  The last word is not tested because the loop
       above only exits when T has a non-zero byte.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHRU */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  The first byte in memory is
       the most significant one on big-endian and the least significant
       one on little-endian, hence the two variants.  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}
524
525#elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
526
/* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused; the loop relies on the buffer ending in a newline.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  /* One vector per searched character, every lane holding that
     character.  */
  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block is
     already loaded and masked, so enter the loop past the load.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
    /* Number of `long' words in one vector.  */
#define N (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero, advancing S past each
       all-zero word.  The last word is not tested because the loop
       above only exits when T has a non-zero byte.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHROUGH */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  This variant is big-endian
       only, so the first byte in memory is the most significant one
       and leading zeros are counted.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}
643
644#elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
645#include "arm_neon.h"
646
/* This doesn't have to be the exact page size, but no system may use
   a size smaller than this.  ARMv8 requires a minimum page size of
   4k.  The impact of being conservative here is a small number of
   cases will take the slightly slower entry path into the main
   loop.  */

#define AARCH64_MIN_PAGE_SIZE 4096

/* A version of the fast scanner using AArch64 Advanced SIMD (NEON)
   byte compares.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused; the loop relies on the buffer ending in a newline.  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives every byte within each 8-byte half its own bit (0x01 ... 0x80),
     so that the matches of a half can be summed into one byte.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  /* Per-lane shift that moves the partial sums of the second 8 bytes
     into bits 8-15 of the final mask.  */
#ifdef __ARM_BIG_ENDIAN
  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
#else
  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
#endif

  unsigned int found;
  const uint8_t *p;
  uint8x16_t data;
  uint8x16_t t;
  uint16x8_t m;
  uint8x16_t u, v, w;

  /* Align the source pointer.  */
  p = (const uint8_t *)((uintptr_t)s & -16);

  /* Assuming random string start positions, with a 4k page size we'll take
     the slow path about 0.37% of the time.  */
  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
                         - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
                        < 16, 0))
    {
      /* Slow path: the string starts near a possible page boundary.
         Load the aligned block containing S instead of 16 bytes from S,
         and ignore matches in the bytes that precede S.  */
      uint32_t misalign, mask;

      misalign = (uintptr_t)s & 15;
      mask = (-1u << misalign) & 0xffff;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      /* Reduce the match vector to a 16-bit mask, one bit per byte.  */
      t = vandq_u8 (t, xmask);
      m = vpaddlq_u8 (t);
      m = vshlq_u16 (m, shift);
      found = vaddvq_u16 (m);
      found &= mask;
      if (found)
        return (const uchar*)p + __builtin_ctz (found);
    }
  else
    {
      /* Fast path: an unaligned 16-byte load starting exactly at S.  */
      data = vld1q_u8 ((const uint8_t *) s);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      /* Non-zero if any lane matched.  */
      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
        goto done;
    }

  /* Main loop over aligned 16-byte blocks, starting with the block
     after the one that contains S.  */
  do
    {
      p += 16;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
    } while (!vpaddd_u64 ((uint64x2_t)t));

done:
  /* Now that we've found the terminating substring, work out precisely where
     we need to stop.  */
  t = vandq_u8 (t, xmask);
  m = vpaddlq_u8 (t);
  m = vshlq_u16 (m, shift);
  found = vaddvq_u16 (m);
  /* When we arrive from the fast path, P is still the aligned-down
     address and does not exceed S, while the matched block was loaded
     from S itself; in that case the offset is relative to S.  After
     the main loop P is beyond S and is the base of the matched block.  */
  return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
          + __builtin_ctz (found));
}
738
739#elif defined (__ARM_NEON)
740#include "arm_neon.h"
741
/* A version of the fast scanner using 32-bit ARM NEON byte compares.

   Returns a pointer to the first '\n', '\r', '\\' or '?' at or after S.
   END is unused; the loop relies on the buffer ending in a newline.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives every byte within each 8-byte half its own bit (0x01 ... 0x80),
     so that the matches of a half can be summed into one byte.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  The first block is loaded from the
     16-byte boundary at or below S, so it may include bytes that come
     before S.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first block is
     already loaded, so enter the loop past the load.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      mask = 0xffff;

    start:
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      /* Three rounds of pairwise addition sum the per-byte bits of each
         8-byte half, leaving one 8-bit match mask in each 32-bit lane
         of N.  */
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      /* Viewed as one 64-bit value, shifting right by 24 places the
         mask of the second half at bits 8-15; OR it with the mask of
         the first half and read the low 32 bits.  */
      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}
801
802#else
803
/* We only have one accelerated alternative, the word-at-a-time scanner
   search_line_acc_char.  Use a direct call (a macro alias rather than a
   function pointer) so that we encourage inlining.  */

#define search_line_fast search_line_acc_char
808
809#endif
810
/* One-time lexer set-up.  On configurations that define
   HAVE_init_vectorized_lexer this selects the line scanner at run time;
   on all others there is nothing to do.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
820
821/* Look for leading whitespace style issues on lines which don't contain
822 just whitespace.
823 For -Wleading-whitespace=spaces report if such lines contain leading
824 whitespace other than spaces.
825 For -Wleading-whitespace=tabs report if such lines contain leading
826 whitespace other than tabs.
827 For -Wleading-whitespace=blanks report if such lines contain leading
828 whitespace other than spaces+tabs, or contain in it tab after space,
829 or -ftabstop= or more consecutive spaces. */
830
831static void
832find_leading_whitespace_issues (cpp_reader *pfile, const uchar *s)
833{
834 const unsigned char *p = NULL;
835 uchar type = 'L';
836 switch (CPP_OPTION (pfile, cpp_warn_leading_whitespace))
837 {
838 case 1: /* spaces */
839 while (*s == ' ')
840 ++s;
841 break;
842 case 2: /* tabs */
843 while (*s == '\t')
844 ++s;
845 break;
846 case 3: /* blanks */
847 while (*s == '\t')
848 ++s;
849 int n;
850 n = CPP_OPTION (pfile, cpp_tabstop);
851 while (*s == ' ')
852 {
853 if (--n == 0)
854 break;
855 ++s;
856 }
857 if (*s == '\t')
858 type = 'T'; /* Tab after space. */
859 else if (*s == ' ')
860 type = 'S'; /* Too many spaces. */
861 break;
862 default:
863 abort ();
864 }
865 if (!IS_NVSPACE (*s))
866 return;
867 p = s++;
868 while (IS_NVSPACE (*s))
869 ++s;
870 if (*s != '\n' && *s != '\r')
871 add_line_note (buffer: pfile->buffer, pos: p, type);
872}
873
874/* Returns with a logical line that contains no escaped newlines or
875 trigraphs. This is a time-critical inner loop. */
876void
877_cpp_clean_line (cpp_reader *pfile)
878{
879 cpp_buffer *buffer;
880 const uchar *s;
881 uchar c, *d, *p;
882
883 buffer = pfile->buffer;
884 buffer->cur_note = buffer->notes_used = 0;
885 buffer->cur = buffer->line_base = buffer->next_line;
886 buffer->need_line = false;
887 s = buffer->next_line;
888
889 if (!buffer->from_stage3)
890 {
891 const uchar *pbackslash = NULL;
892 bool leading_ws_done = true;
893
894 if (CPP_OPTION (pfile, cpp_warn_leading_whitespace))
895 find_leading_whitespace_issues (pfile, s);
896
897 /* Fast path. This is the common case of an un-escaped line with
898 no trigraphs. The primary win here is by not writing any
899 data back to memory until we have to. */
900 while (1)
901 {
902 /* Perform an optimized search for \n, \r, \\, ?. */
903 s = search_line_fast (s, end: buffer->rlimit);
904
905 c = *s;
906 if (c == '\\')
907 {
908 /* Record the location of the backslash and continue. */
909 pbackslash = s++;
910 }
911 else if (__builtin_expect (c == '?', 0))
912 {
913 if (__builtin_expect (s[1] == '?', false)
914 && _cpp_trigraph_map[s[2]])
915 {
916 /* Have a trigraph. We may or may not have to convert
917 it. Add a line note regardless, for -Wtrigraphs. */
918 add_line_note (buffer, pos: s, type: s[2]);
919 if (CPP_OPTION (pfile, trigraphs))
920 {
921 /* We do, and that means we have to switch to the
922 slow path. */
923 d = (uchar *) s;
924 *d = _cpp_trigraph_map[s[2]];
925 s += 2;
926 goto slow_path;
927 }
928 }
929 /* Not a trigraph. Continue on fast-path. */
930 s++;
931 }
932 else
933 break;
934 }
935
936 /* This must be \r or \n. We're either done, or we'll be forced
937 to write back to the buffer and continue on the slow path. */
938 d = (uchar *) s;
939
940 if (__builtin_expect (s == buffer->rlimit, false))
941 goto done;
942
943 /* DOS line ending? */
944 if (__builtin_expect (c == '\r', false) && s[1] == '\n')
945 {
946 s++;
947 if (s == buffer->rlimit)
948 goto done;
949 }
950
951 if (__builtin_expect (pbackslash == NULL, true))
952 goto done;
953
954 /* Check for escaped newline. */
955 p = d;
956 while (is_nvspace (p[-1]))
957 p--;
958 if (p - 1 != pbackslash)
959 goto done;
960
961 /* Have an escaped newline; process it and proceed to
962 the slow path. */
963 add_line_note (buffer, pos: p - 1, type: p != d ? ' ' : '\\');
964 d = p - 2;
965 buffer->next_line = p - 1;
966 leading_ws_done = false;
967
968 slow_path:
969 while (1)
970 {
971 c = *++s;
972 *++d = c;
973
974 if (c == '\n' || c == '\r')
975 {
976 if (CPP_OPTION (pfile, cpp_warn_leading_whitespace)
977 && !leading_ws_done)
978 find_leading_whitespace_issues (pfile, s: buffer->next_line);
979
980 /* Handle DOS line endings. */
981 if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
982 s++;
983 if (s == buffer->rlimit)
984 break;
985
986 /* Escaped? */
987 p = d;
988 while (p != buffer->next_line && is_nvspace (p[-1]))
989 p--;
990 if (p == buffer->next_line || p[-1] != '\\')
991 break;
992
993 add_line_note (buffer, pos: p - 1, type: p != d ? ' ' : '\\');
994 d = p - 2;
995 buffer->next_line = p - 1;
996 leading_ws_done = false;
997 }
998 else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
999 {
1000 if (CPP_OPTION (pfile, cpp_warn_leading_whitespace)
1001 && !leading_ws_done)
1002 {
1003 find_leading_whitespace_issues (pfile, s: buffer->next_line);
1004 leading_ws_done = true;
1005 }
1006
1007 /* Add a note regardless, for the benefit of -Wtrigraphs. */
1008 add_line_note (buffer, pos: d, type: s[2]);
1009 if (CPP_OPTION (pfile, trigraphs))
1010 {
1011 *d = _cpp_trigraph_map[s[2]];
1012 s += 2;
1013 }
1014 }
1015 }
1016 done:
1017 if (d > buffer->next_line
1018 && CPP_OPTION (pfile, cpp_warn_trailing_whitespace))
1019 switch (CPP_OPTION (pfile, cpp_warn_trailing_whitespace))
1020 {
1021 case 1:
1022 if (ISBLANK (d[-1]))
1023 add_line_note (buffer, pos: d - 1, type: 'W');
1024 break;
1025 case 2:
1026 if (IS_NVSPACE (d[-1]) && d[-1])
1027 add_line_note (buffer, pos: d - 1, type: 'W');
1028 break;
1029 }
1030 }
1031 else
1032 {
1033 while (*s != '\n' && *s != '\r')
1034 s++;
1035 d = (uchar *) s;
1036
1037 /* Handle DOS line endings. */
1038 if (*s == '\r' && s + 1 != buffer->rlimit && s[1] == '\n')
1039 s++;
1040 }
1041
1042 *d = '\n';
1043 /* A sentinel note that should never be processed. */
1044 add_line_note (buffer, pos: d + 1, type: '\n');
1045 buffer->next_line = s + 1;
1046}
1047
/* Forward declaration.  LEXING_RAW_STRING is a compile-time flag.
   NOTE(review): presumably it selects the behaviour needed while a raw
   string literal is being lexed; confirm at the definition.  */
template <bool lexing_raw_string>
static bool get_fresh_line_impl (cpp_reader *pfile);
1050
1051/* Return true if the trigraph indicated by NOTE should be warned
1052 about in a comment. */
1053static bool
1054warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
1055{
1056 const uchar *p;
1057
1058 /* Within comments we don't warn about trigraphs, unless the
1059 trigraph forms an escaped newline, as that may change
1060 behavior. */
1061 if (note->type != '/')
1062 return false;
1063
1064 /* If -trigraphs, then this was an escaped newline iff the next note
1065 is coincident. */
1066 if (CPP_OPTION (pfile, trigraphs))
1067 return note[1].pos == note->pos;
1068
1069 /* Otherwise, see if this forms an escaped newline. */
1070 p = note->pos + 3;
1071 while (is_nvspace (*p))
1072 p++;
1073
1074 /* There might have been escaped newlines between the trigraph and the
1075 newline we found. Hence the position test. */
1076 return (*p == '\n' && p < note[1].pos);
1077}
1078
/* Process the notes created by add_line_note as far as the current
   location.

   Notes are consumed in order from buffer->cur_note; processing stops
   at the first note whose position lies beyond buffer->cur.
   IN_COMMENT is nonzero when the caller is inside a comment, which
   changes which diagnostics are issued for escaped newlines and
   trigraphs.

   The note type selects the action:
     '\\', ' '      escaped newline (' ' when the backslash and newline
		    were separated by whitespace); advances the line.
     trigraph char  -Wtrigraphs diagnostics.
     'W'            trailing whitespace.
     'S', 'T', 'L'  leading whitespace problems.
     0              note already handled in lex_raw_string.
   Any other type aborts.  */
void
_cpp_process_line_notes (cpp_reader *pfile, int in_comment)
{
  cpp_buffer *buffer = pfile->buffer;

  for (;;)
    {
      _cpp_line_note *note = &buffer->notes[buffer->cur_note];
      unsigned int col;

      /* Notes past the current lexing position are left for later.  */
      if (note->pos > buffer->cur)
	break;

      /* Consume the note and compute the column used in diagnostics.  */
      buffer->cur_note++;
      col = CPP_BUF_COLUMN (buffer, note->pos + 1);

      if (note->type == '\\' || note->type == ' ')
	{
	  if (note->type == ' ')
	    {
	      /* Outside comments this is always diagnosed; inside
		 comments it is only reported as trailing whitespace,
		 and only when that warning is enabled.  */
	      if (!in_comment)
		cpp_error_with_line (pfile, CPP_DL_WARNING,
				     pfile->line_table->highest_line, col,
				     msgid: "backslash and newline separated by "
				     "space");
	      else if (CPP_OPTION (pfile, cpp_warn_trailing_whitespace))
		cpp_warning_with_line (pfile, CPP_W_TRAILING_WHITESPACE,
				       pfile->line_table->highest_line, col,
				       msgid: "trailing whitespace");
	    }

	  if (buffer->next_line > buffer->rlimit)
	    {
	      cpp_error_with_line (pfile, CPP_DL_PEDWARN,
				   pfile->line_table->highest_line, col,
				   msgid: "backslash-newline at end of file");
	      /* Prevent "no newline at end of file" warning.  */
	      buffer->next_line = buffer->rlimit;
	    }

	  /* The escaped newline starts a new physical line: rebase the
	     column origin at the note and bump the line number.  */
	  buffer->line_base = note->pos;
	  CPP_INCREMENT_LINE (pfile, 0);
	}
      else if (_cpp_trigraph_map[note->type])
	{
	  /* Inside a comment, warn_in_comment decides whether the
	     trigraph is still worth reporting.  */
	  if (CPP_OPTION (pfile, warn_trigraphs)
	      && (!in_comment || warn_in_comment (pfile, note)))
	    {
	      if (CPP_OPTION (pfile, trigraphs))
		cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
				       pfile->line_table->highest_line, col,
				       msgid: "trigraph %<??%c%> converted to %<%c%>",
				       note->type,
				       (int) _cpp_trigraph_map[note->type]);
	      else
		cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
				       pfile->line_table->highest_line, col,
				       msgid: "trigraph %<??%c%> ignored, use "
				       "%<-trigraphs%> to enable", note->type);
	    }
	}
      else if (note->type == 'W')
	cpp_warning_with_line (pfile, CPP_W_TRAILING_WHITESPACE,
			       pfile->line_table->highest_line, col,
			       msgid: "trailing whitespace");
      else if (note->type == 'S')
	cpp_warning_with_line (pfile, CPP_W_LEADING_WHITESPACE,
			       pfile->line_table->highest_line, col,
			       msgid: "too many consecutive spaces in leading "
			       "whitespace");
      else if (note->type == 'T')
	cpp_warning_with_line (pfile, CPP_W_LEADING_WHITESPACE,
			       pfile->line_table->highest_line, col,
			       msgid: "tab after space in leading whitespace");
      else if (note->type == 'L')
	/* The wording depends on the value of the leading-whitespace
	   option; values other than 1, 2 or 3 are not expected here.  */
	switch (CPP_OPTION (pfile, cpp_warn_leading_whitespace))
	  {
	  case 1:
	    cpp_warning_with_line (pfile, CPP_W_LEADING_WHITESPACE,
				   pfile->line_table->highest_line, col,
				   msgid: "whitespace other than spaces in leading "
				   "whitespace");
	    break;
	  case 2:
	    cpp_warning_with_line (pfile, CPP_W_LEADING_WHITESPACE,
				   pfile->line_table->highest_line, col,
				   msgid: "whitespace other than tabs in leading "
				   "whitespace");
	    break;
	  case 3:
	    cpp_warning_with_line (pfile, CPP_W_LEADING_WHITESPACE,
				   pfile->line_table->highest_line, col,
				   msgid: "whitespace other than spaces and tabs in "
				   "leading whitespace");
	    break;
	  default:
	    abort ();
	  }
      else if (note->type == 0)
	/* Already processed in lex_raw_string.  */;
      else
	abort ();
    }
}
1185
/* State and helpers for tracking bidirectional control characters
   while lexing a comment, identifier, string literal or character
   constant.  */
namespace bidi {
  /* The bidirectional control characters that are tracked; NONE means
     "not a bidi control character".  to_str below gives the code point
     and Unicode name of each one.  */
  enum class kind {
    NONE, LRE, RLE, LRO, RLO, LRI, RLI, FSI, PDF, PDI, LTR, RTL
  };

  /* All the UTF-8 encodings of bidi characters start with E2.  */
  constexpr uchar utf8_start = 0xe2;

  /* One open bidi context: where it was opened, by which character,
     which pop character closes it, and how the opener was spelled.  */
  struct context
  {
    /* The default constructor leaves all members uninitialized.  */
    context () {}
    context (location_t loc, kind k, bool pdf, bool ucn)
    : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
    {
    }

    /* Return the kind of character that closes this context:
       PDF if m_pdf is set, PDI otherwise.  */
    kind get_pop_kind () const
    {
      return m_pdf ? kind::PDF : kind::PDI;
    }
    /* Return true if the opening character was written as a UCN.  */
    bool ucn_p () const
    {
      return m_ucn;
    }

    /* Location of the character that opened the context.  */
    location_t m_loc;
    /* The character that opened the context.  */
    kind m_kind;
    /* 1 if the context is closed by PDF, 0 if it is closed by PDI.  */
    unsigned m_pdf : 1;
    /* 1 if the opener was written as a UCN, 0 if it was UTF-8.  */
    unsigned m_ucn : 1;
  };

  /* A vector holding currently open bidi contexts, innermost last.
     Each element is a context recording the location and kind of the
     opening character, whether the context is closed by PDF or by PDI,
     and whether the opening character was written as a UCN or as
     UTF-8.  */
  semi_embedded_vec <context, 16> vec;

  /* Close the whole comment/identifier/string literal/character constant
     context.  */
  void on_close ()
  {
    vec.truncate (len: 0);
  }

  /* Pop the last element in the vector.  The vector must not be
     empty.  */
  void pop ()
  {
    unsigned int len = vec.count ();
    gcc_checking_assert (len > 0);
    vec.truncate (len: len - 1);
  }

  /* Return the pop kind of the context of the Ith element.  */
  kind pop_kind_at (unsigned int i)
  {
    return vec[i].get_pop_kind ();
  }

  /* Return the pop kind of the context that is currently opened, or
     kind::NONE if no context is open.  */
  kind current_ctx ()
  {
    unsigned int len = vec.count ();
    if (len == 0)
      return kind::NONE;
    return vec[len - 1].get_pop_kind ();
  }

  /* Return true if the current context comes from a UCN origin, that is,
     the bidi char which started this bidi context was written as a UCN.
     There must be an open context.  */
  bool current_ctx_ucn_p ()
  {
    unsigned int len = vec.count ();
    gcc_checking_assert (len > 0);
    return vec[len - 1].m_ucn;
  }

  /* Return the location of the character that opened the current
     context.  There must be an open context.  */
  location_t current_ctx_loc ()
  {
    unsigned int len = vec.count ();
    gcc_checking_assert (len > 0);
    return vec[len - 1].m_loc;
  }

  /* We've read a bidi char, update the current vector as necessary.
     K is the character, UCN_P says whether it was written as a UCN.
     LOC is only valid when K is not kind::NONE.  */
  void on_char (kind k, bool ucn_p, location_t loc)
  {
    switch (k)
      {
      /* Embeddings and overrides open a context closed by PDF.  */
      case kind::LRE:
      case kind::RLE:
      case kind::LRO:
      case kind::RLO:
	vec.push (value: context (loc, k, true, ucn_p));
	break;
      /* Isolates open a context closed by PDI.  */
      case kind::LRI:
      case kind::RLI:
      case kind::FSI:
	vec.push (value: context (loc, k, false, ucn_p));
	break;
      /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
	 whose scope has not yet been terminated.  It is ignored unless
	 the innermost open context is one of those.  */
      case kind::PDF:
	if (current_ctx () == kind::PDF)
	  pop ();
	break;
      /* PDI terminates the scope of the last LRI, RLI, or FSI whose
	 scope has not yet been terminated, as well as the scopes of
	 any subsequent LREs, RLEs, LROs, or RLOs whose scopes have not
	 yet been terminated.  If no isolate is open, nothing is
	 popped.  */
      case kind::PDI:
	for (int i = vec.count () - 1; i >= 0; --i)
	  if (pop_kind_at (i) == kind::PDI)
	    {
	      /* Drop element I and everything opened after it.  */
	      vec.truncate (len: i);
	      break;
	    }
	break;
      case kind::LTR:
      case kind::RTL:
	/* These aren't popped by a PDF/PDI.  */
	break;
      ATTR_LIKELY case kind::NONE:
	break;
      default:
	abort ();
      }
  }

  /* Return a descriptive string for K.  K must not be kind::NONE;
     any value without a description aborts.  */
  const char *to_str (kind k)
  {
    switch (k)
      {
      case kind::LRE:
	return "U+202A (LEFT-TO-RIGHT EMBEDDING)";
      case kind::RLE:
	return "U+202B (RIGHT-TO-LEFT EMBEDDING)";
      case kind::LRO:
	return "U+202D (LEFT-TO-RIGHT OVERRIDE)";
      case kind::RLO:
	return "U+202E (RIGHT-TO-LEFT OVERRIDE)";
      case kind::LRI:
	return "U+2066 (LEFT-TO-RIGHT ISOLATE)";
      case kind::RLI:
	return "U+2067 (RIGHT-TO-LEFT ISOLATE)";
      case kind::FSI:
	return "U+2068 (FIRST STRONG ISOLATE)";
      case kind::PDF:
	return "U+202C (POP DIRECTIONAL FORMATTING)";
      case kind::PDI:
	return "U+2069 (POP DIRECTIONAL ISOLATE)";
      case kind::LTR:
	return "U+200E (LEFT-TO-RIGHT MARK)";
      case kind::RTL:
	return "U+200F (RIGHT-TO-LEFT MARK)";
      default:
	abort ();
      }
  }
}
1347
1348/* Get location_t for the range of bytes [START, START + NUM_BYTES)
1349 within the current line in FILE, with the caret at START. */
1350
1351static location_t
1352get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
1353 const unsigned char *const start,
1354 size_t num_bytes)
1355{
1356 if (pfile->forced_token_location)
1357 return pfile->forced_token_location;
1358 gcc_checking_assert (num_bytes > 0);
1359
1360 /* CPP_BUF_COLUMN and linemap_position_for_column both refer
1361 to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
1362 whereas linemap_position_for_column is 1-based. */
1363
1364 /* Get 0-based offsets within the line. */
1365 size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
1366 size_t end_offset = start_offset + num_bytes - 1;
1367
1368 /* Now convert to location_t, where "columns" are 1-based byte offsets. */
1369 location_t start_loc = linemap_position_for_column (pfile->line_table,
1370 start_offset + 1);
1371 location_t end_loc = linemap_position_for_column (pfile->line_table,
1372 end_offset + 1);
1373
1374 if (start_loc == end_loc)
1375 return start_loc;
1376
1377 source_range src_range;
1378 src_range.m_start = start_loc;
1379 src_range.m_finish = end_loc;
1380 location_t combined_loc
1381 = pfile->line_table->get_or_create_combined_loc (locus: start_loc,
1382 src_range,
1383 data: nullptr,
1384 discriminator: 0);
1385 return combined_loc;
1386}
1387
1388/* Parse a sequence of 3 bytes starting with P and return its bidi code. */
1389
1390static bidi::kind
1391get_bidi_utf8_1 (const unsigned char *const p)
1392{
1393 gcc_checking_assert (p[0] == bidi::utf8_start);
1394
1395 if (p[1] == 0x80)
1396 switch (p[2])
1397 {
1398 case 0xaa:
1399 return bidi::kind::LRE;
1400 case 0xab:
1401 return bidi::kind::RLE;
1402 case 0xac:
1403 return bidi::kind::PDF;
1404 case 0xad:
1405 return bidi::kind::LRO;
1406 case 0xae:
1407 return bidi::kind::RLO;
1408 case 0x8e:
1409 return bidi::kind::LTR;
1410 case 0x8f:
1411 return bidi::kind::RTL;
1412 default:
1413 break;
1414 }
1415 else if (p[1] == 0x81)
1416 switch (p[2])
1417 {
1418 case 0xa6:
1419 return bidi::kind::LRI;
1420 case 0xa7:
1421 return bidi::kind::RLI;
1422 case 0xa8:
1423 return bidi::kind::FSI;
1424 case 0xa9:
1425 return bidi::kind::PDI;
1426 default:
1427 break;
1428 }
1429
1430 return bidi::kind::NONE;
1431}
1432
1433/* Parse a sequence of 3 bytes starting with P and return its bidi code.
1434 If the kind is not NONE, write the location to *OUT.*/
1435
1436static bidi::kind
1437get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
1438{
1439 bidi::kind result = get_bidi_utf8_1 (p);
1440 if (result != bidi::kind::NONE)
1441 {
1442 /* We have a sequence of 3 bytes starting at P. */
1443 *out = get_location_for_byte_range_in_cur_line (pfile, start: p, num_bytes: 3);
1444 }
1445 return result;
1446}
1447
1448/* Parse a UCN where P points just past \u or \U and return its bidi code. */
1449
1450static bidi::kind
1451get_bidi_ucn_1 (const unsigned char *p, bool is_U, const unsigned char **end)
1452{
1453 /* 6.4.3 Universal Character Names
1454 \u hex-quad
1455 \U hex-quad hex-quad
1456 \u { simple-hexadecimal-digit-sequence }
1457 where \unnnn means \U0000nnnn. */
1458
1459 *end = p + 4;
1460 if (is_U)
1461 {
1462 if (p[0] != '0' || p[1] != '0' || p[2] != '0' || p[3] != '0')
1463 return bidi::kind::NONE;
1464 /* Skip 4B so we can treat \u and \U the same below. */
1465 p += 4;
1466 *end += 4;
1467 }
1468 else if (p[0] == '{')
1469 {
1470 p++;
1471 while (*p == '0')
1472 p++;
1473 if (p[0] != '2'
1474 || p[1] != '0'
1475 || !ISXDIGIT (p[2])
1476 || !ISXDIGIT (p[3])
1477 || p[4] != '}')
1478 return bidi::kind::NONE;
1479 *end = p + 5;
1480 }
1481
1482 /* All code points we are looking for start with 20xx. */
1483 if (p[0] != '2' || p[1] != '0')
1484 return bidi::kind::NONE;
1485 else if (p[2] == '2')
1486 switch (p[3])
1487 {
1488 case 'a':
1489 case 'A':
1490 return bidi::kind::LRE;
1491 case 'b':
1492 case 'B':
1493 return bidi::kind::RLE;
1494 case 'c':
1495 case 'C':
1496 return bidi::kind::PDF;
1497 case 'd':
1498 case 'D':
1499 return bidi::kind::LRO;
1500 case 'e':
1501 case 'E':
1502 return bidi::kind::RLO;
1503 default:
1504 break;
1505 }
1506 else if (p[2] == '6')
1507 switch (p[3])
1508 {
1509 case '6':
1510 return bidi::kind::LRI;
1511 case '7':
1512 return bidi::kind::RLI;
1513 case '8':
1514 return bidi::kind::FSI;
1515 case '9':
1516 return bidi::kind::PDI;
1517 default:
1518 break;
1519 }
1520 else if (p[2] == '0')
1521 switch (p[3])
1522 {
1523 case 'e':
1524 case 'E':
1525 return bidi::kind::LTR;
1526 case 'f':
1527 case 'F':
1528 return bidi::kind::RTL;
1529 default:
1530 break;
1531 }
1532
1533 return bidi::kind::NONE;
1534}
1535
1536/* Parse a UCN where P points just past \u or \U and return its bidi code.
1537 If the kind is not NONE, write the location to *OUT. */
1538
1539static bidi::kind
1540get_bidi_ucn (cpp_reader *pfile, const unsigned char *p, bool is_U,
1541 location_t *out)
1542{
1543 const unsigned char *end;
1544 bidi::kind result = get_bidi_ucn_1 (p, is_U, end: &end);
1545 if (result != bidi::kind::NONE)
1546 {
1547 const unsigned char *start = p - 2;
1548 size_t num_bytes = end - start;
1549 *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
1550 }
1551 return result;
1552}
1553
1554/* Parse a named universal character escape where P points just past \N and
1555 return its bidi code. If the kind is not NONE, write the location to
1556 *OUT. */
1557
1558static bidi::kind
1559get_bidi_named (cpp_reader *pfile, const unsigned char *p, location_t *out)
1560{
1561 bidi::kind result = bidi::kind::NONE;
1562 if (*p != '{')
1563 return bidi::kind::NONE;
1564 if (strncmp (s1: (const char *) (p + 1), s2: "LEFT-TO-RIGHT ", n: 14) == 0)
1565 {
1566 if (strncmp (s1: (const char *) (p + 15), s2: "MARK}", n: 5) == 0)
1567 result = bidi::kind::LTR;
1568 else if (strncmp (s1: (const char *) (p + 15), s2: "EMBEDDING}", n: 10) == 0)
1569 result = bidi::kind::LRE;
1570 else if (strncmp (s1: (const char *) (p + 15), s2: "OVERRIDE}", n: 9) == 0)
1571 result = bidi::kind::LRO;
1572 else if (strncmp (s1: (const char *) (p + 15), s2: "ISOLATE}", n: 8) == 0)
1573 result = bidi::kind::LRI;
1574 }
1575 else if (strncmp (s1: (const char *) (p + 1), s2: "RIGHT-TO-LEFT ", n: 14) == 0)
1576 {
1577 if (strncmp (s1: (const char *) (p + 15), s2: "MARK}", n: 5) == 0)
1578 result = bidi::kind::RTL;
1579 else if (strncmp (s1: (const char *) (p + 15), s2: "EMBEDDING}", n: 10) == 0)
1580 result = bidi::kind::RLE;
1581 else if (strncmp (s1: (const char *) (p + 15), s2: "OVERRIDE}", n: 9) == 0)
1582 result = bidi::kind::RLO;
1583 else if (strncmp (s1: (const char *) (p + 15), s2: "ISOLATE}", n: 8) == 0)
1584 result = bidi::kind::RLI;
1585 }
1586 else if (strncmp (s1: (const char *) (p + 1), s2: "POP DIRECTIONAL ", n: 16) == 0)
1587 {
1588 if (strncmp (s1: (const char *) (p + 16), s2: "FORMATTING}", n: 11) == 0)
1589 result = bidi::kind::PDF;
1590 else if (strncmp (s1: (const char *) (p + 16), s2: "ISOLATE}", n: 8) == 0)
1591 result = bidi::kind::PDI;
1592 }
1593 else if (strncmp (s1: (const char *) (p + 1), s2: "FIRST STRONG ISOLATE}", n: 21) == 0)
1594 result = bidi::kind::FSI;
1595 if (result != bidi::kind::NONE)
1596 *out = get_location_for_byte_range_in_cur_line (pfile, start: p - 2,
1597 num_bytes: (strchr (s: (const char *)
1598 (p + 1), c: '}')
1599 - (const char *) p)
1600 + 3);
1601 return result;
1602}
1603
1604/* Subclass of rich_location for reporting on unpaired UTF-8
1605 bidirectional control character(s).
1606 Escape the source lines on output, and show all unclosed
1607 bidi context, labelling everything. */
1608
1609class unpaired_bidi_rich_location : public rich_location
1610{
1611 public:
1612 class custom_range_label : public range_label
1613 {
1614 public:
1615 label_text get_text (unsigned range_idx) const final override
1616 {
1617 /* range 0 is the primary location; each subsequent range i + 1
1618 is for bidi::vec[i]. */
1619 if (range_idx > 0)
1620 {
1621 const bidi::context &ctxt (bidi::vec[range_idx - 1]);
1622 return label_text::borrow (buffer: bidi::to_str (k: ctxt.m_kind));
1623 }
1624 else
1625 return label_text::borrow (_("end of bidirectional context"));
1626 }
1627 };
1628
1629 unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
1630 : rich_location (pfile->line_table, loc, &m_custom_label)
1631 {
1632 set_escape_on_output (true);
1633 for (unsigned i = 0; i < bidi::vec.count (); i++)
1634 add_range (loc: bidi::vec[i].m_loc,
1635 range_display_kind: SHOW_RANGE_WITHOUT_CARET,
1636 label: &m_custom_label);
1637 }
1638
1639 private:
1640 custom_range_label m_custom_label;
1641};
1642
1643/* We're closing a bidi context, that is, we've encountered a newline,
1644 are closing a C-style comment, or are at the end of a string literal,
1645 character constant, or identifier. Warn if this context was not
1646 properly terminated by a PDI or PDF. P points to the last character
1647 in this context. */
1648
1649static void
1650maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
1651{
1652 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1653 if (bidi::vec.count () > 0
1654 && (warn_bidi & bidirectional_unpaired
1655 && (!bidi::current_ctx_ucn_p ()
1656 || (warn_bidi & bidirectional_ucn))))
1657 {
1658 const location_t loc
1659 = linemap_position_for_column (pfile->line_table,
1660 CPP_BUF_COLUMN (pfile->buffer, p));
1661 unpaired_bidi_rich_location rich_loc (pfile, loc);
1662 /* cpp_callbacks doesn't yet have a way to handle singular vs plural
1663 forms of a diagnostic, so fake it for now. */
1664 if (bidi::vec.count () > 1)
1665 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, richloc: &rich_loc,
1666 msgid: "unpaired UTF-8 bidirectional control characters "
1667 "detected");
1668 else
1669 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, richloc: &rich_loc,
1670 msgid: "unpaired UTF-8 bidirectional control character "
1671 "detected");
1672 }
1673 /* We're done with this context. */
1674 bidi::on_close ();
1675}
1676
1677/* We're at the beginning or in the middle of an identifier/comment/string
1678 literal/character constant. Warn if we've encountered a bidi character.
1679 KIND says which bidi control character it was; UCN_P is true iff this bidi
1680 control character was written as a UCN. LOC is the location of the
1681 character, but is only valid if KIND != bidi::kind::NONE. */
1682
1683static void
1684maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
1685 bool ucn_p, location_t loc)
1686{
1687 if (__builtin_expect (kind == bidi::kind::NONE, 1))
1688 return;
1689
1690 const auto warn_bidi = CPP_OPTION (pfile, cpp_warn_bidirectional);
1691
1692 if (warn_bidi & (bidirectional_unpaired|bidirectional_any))
1693 {
1694 rich_location rich_loc (pfile->line_table, loc);
1695 rich_loc.set_escape_on_output (true);
1696
1697 /* It seems excessive to warn about a PDI/PDF that is closing
1698 an opened context because we've already warned about the
1699 opening character. Except warn when we have a UCN x UTF-8
1700 mismatch, if UCN checking is enabled. */
1701 if (kind == bidi::current_ctx ())
1702 {
1703 if (warn_bidi == (bidirectional_unpaired|bidirectional_ucn)
1704 && bidi::current_ctx_ucn_p () != ucn_p)
1705 {
1706 rich_loc.add_range (loc: bidi::current_ctx_loc ());
1707 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, richloc: &rich_loc,
1708 msgid: "UTF-8 vs UCN mismatch when closing "
1709 "a context by %qs", bidi::to_str (k: kind));
1710 }
1711 }
1712 else if (warn_bidi & bidirectional_any
1713 && (!ucn_p || (warn_bidi & bidirectional_ucn)))
1714 {
1715 if (kind == bidi::kind::PDF || kind == bidi::kind::PDI)
1716 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, richloc: &rich_loc,
1717 msgid: "%qs is closing an unopened context",
1718 bidi::to_str (k: kind));
1719 else
1720 cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, richloc: &rich_loc,
1721 msgid: "found problematic Unicode character %qs",
1722 bidi::to_str (k: kind));
1723 }
1724 }
1725 /* We're done with this context. */
1726 bidi::on_char (k: kind, ucn_p, loc);
1727}
1728
/* Byte-value thresholds used by the UTF-8 checks in this file.  A byte
   B with utf8_continuation <= B < utf8_signifier is accepted as a
   continuation byte; a byte >= utf8_signifier is treated as the
   possible start of a multibyte sequence.  */
static const cppchar_t utf8_continuation = 0x80;
static const cppchar_t utf8_signifier = 0xC0;
1731
1732/* Emit -Winvalid-utf8 warning on invalid UTF-8 character starting
1733 at PFILE->buffer->cur. Return a pointer after the diagnosed
1734 invalid character. */
1735
1736static const uchar *
1737_cpp_warn_invalid_utf8 (cpp_reader *pfile)
1738{
1739 cpp_buffer *buffer = pfile->buffer;
1740 const uchar *cur = buffer->cur;
1741 bool pedantic = (CPP_PEDANTIC (pfile)
1742 && CPP_OPTION (pfile, cpp_warn_invalid_utf8) == 2);
1743
1744 if (cur[0] < utf8_signifier
1745 || cur[1] < utf8_continuation || cur[1] >= utf8_signifier)
1746 {
1747 if (pedantic)
1748 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1749 pfile->line_table->highest_line,
1750 CPP_BUF_COL (buffer),
1751 msgid: "invalid UTF-8 character %<<%x>%>",
1752 cur[0]);
1753 else
1754 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1755 pfile->line_table->highest_line,
1756 CPP_BUF_COL (buffer),
1757 msgid: "invalid UTF-8 character %<<%x>%>",
1758 cur[0]);
1759 return cur + 1;
1760 }
1761 else if (cur[2] < utf8_continuation || cur[2] >= utf8_signifier)
1762 {
1763 if (pedantic)
1764 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1765 pfile->line_table->highest_line,
1766 CPP_BUF_COL (buffer),
1767 msgid: "invalid UTF-8 character %<<%x><%x>%>",
1768 cur[0], cur[1]);
1769 else
1770 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1771 pfile->line_table->highest_line,
1772 CPP_BUF_COL (buffer),
1773 msgid: "invalid UTF-8 character %<<%x><%x>%>",
1774 cur[0], cur[1]);
1775 return cur + 2;
1776 }
1777 else if (cur[3] < utf8_continuation || cur[3] >= utf8_signifier)
1778 {
1779 if (pedantic)
1780 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1781 pfile->line_table->highest_line,
1782 CPP_BUF_COL (buffer),
1783 msgid: "invalid UTF-8 character %<<%x><%x><%x>%>",
1784 cur[0], cur[1], cur[2]);
1785 else
1786 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1787 pfile->line_table->highest_line,
1788 CPP_BUF_COL (buffer),
1789 msgid: "invalid UTF-8 character %<<%x><%x><%x>%>",
1790 cur[0], cur[1], cur[2]);
1791 return cur + 3;
1792 }
1793 else
1794 {
1795 if (pedantic)
1796 cpp_error_with_line (pfile, CPP_DL_PEDWARN,
1797 pfile->line_table->highest_line,
1798 CPP_BUF_COL (buffer),
1799 msgid: "invalid UTF-8 character %<<%x><%x><%x><%x>%>",
1800 cur[0], cur[1], cur[2], cur[3]);
1801 else
1802 cpp_warning_with_line (pfile, CPP_W_INVALID_UTF8,
1803 pfile->line_table->highest_line,
1804 CPP_BUF_COL (buffer),
1805 msgid: "invalid UTF-8 character %<<%x><%x><%x><%x>%>",
1806 cur[0], cur[1], cur[2], cur[3]);
1807 return cur + 4;
1808 }
1809}
1810
1811/* Helper function of *skip_*_comment and lex*_string. For C,
1812 character at CUR[-1] with MSB set handle -Wbidi-chars* and
1813 -Winvalid-utf8 diagnostics and return pointer to first character
1814 that should be processed next. */
1815
1816static inline const uchar *
1817_cpp_handle_multibyte_utf8 (cpp_reader *pfile, uchar c,
1818 const uchar *cur, bool warn_bidi_p,
1819 bool warn_invalid_utf8_p)
1820{
1821 /* If this is a beginning of a UTF-8 encoding, it might be
1822 a bidirectional control character. */
1823 if (c == bidi::utf8_start && warn_bidi_p)
1824 {
1825 location_t loc;
1826 bidi::kind kind = get_bidi_utf8 (pfile, p: cur - 1, out: &loc);
1827 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
1828 }
1829 if (!warn_invalid_utf8_p)
1830 return cur;
1831 if (c >= utf8_signifier)
1832 {
1833 cppchar_t s;
1834 const uchar *pstr = cur - 1;
1835 if (_cpp_valid_utf8 (pfile, pstr: &pstr, limit: pfile->buffer->rlimit, identifier_pos: 0, NULL, cp: &s)
1836 && s <= UCS_LIMIT)
1837 return pstr;
1838 }
1839 pfile->buffer->cur = cur - 1;
1840 return _cpp_warn_invalid_utf8 (pfile);
1841}
1842
/* Skip a C-style block comment.  We find the end of the comment by
   seeing if an asterisk is before every '/' we encounter.  Returns
   nonzero if comment terminated by EOF, zero otherwise.

   Buffer->cur points to the initial asterisk of the comment.  On a
   normal return buffer->cur points just past the closing '/', and any
   pending line notes up to that point have been processed.  While
   scanning, -Wcomment, -Wbidi-chars and -Winvalid-utf8 diagnostics are
   issued as enabled.  */
bool
_cpp_skip_block_comment (cpp_reader *pfile)
{
  cpp_buffer *buffer = pfile->buffer;
  const uchar *cur = buffer->cur;
  uchar c;
  const bool warn_bidi_p = pfile->warn_bidi_p ();
  const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
  const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;

  /* Step past the opening asterisk.  If a '/' follows immediately,
     step past it too, so that the opening asterisk cannot pair with it
     in the cur[-2] test below.  */
  cur++;
  if (*cur == '/')
    cur++;

  for (;;)
    {
      /* People like decorating comments with '*', so check for '/'
	 instead for efficiency.  */
      c = *cur++;

      if (c == '/')
	{
	  /* CUR is already past the '/', so cur[-2] is the character
	     before it.  */
	  if (cur[-2] == '*')
	    {
	      if (warn_bidi_p)
		maybe_warn_bidi_on_close (pfile, p: cur);
	      break;
	    }

	  /* Warn about potential nested comments, but not if the '/'
	     comes immediately before the true comment delimiter.
	     Don't bother to get it right across escaped newlines.  */
	  if (CPP_OPTION (pfile, warn_comments)
	      && cur[0] == '*' && cur[1] != '/')
	    {
	      /* CPP_BUF_COL below reads buffer->cur.  */
	      buffer->cur = cur;
	      cpp_warning_with_line (pfile, CPP_W_COMMENTS,
				     pfile->line_table->highest_line,
				     CPP_BUF_COL (buffer),
				     msgid: "%</*%> within comment");
	    }
	}
      else if (c == '\n')
	{
	  unsigned int cols;
	  /* Leave buffer->cur on the newline while the notes for the
	     line just finished are processed.  */
	  buffer->cur = cur - 1;
	  if (warn_bidi_p)
	    maybe_warn_bidi_on_close (pfile, p: cur);
	  _cpp_process_line_notes (pfile, in_comment: true);
	  /* No further line available: the comment is unterminated.  */
	  if (buffer->next_line >= buffer->rlimit)
	    return true;
	  _cpp_clean_line (pfile);

	  cols = buffer->next_line - buffer->line_base;
	  CPP_INCREMENT_LINE (pfile, cols);

	  /* Continue scanning from the position left in buffer->cur
	     by _cpp_clean_line.  */
	  cur = buffer->cur;
	}
      else if (__builtin_expect (c >= utf8_continuation, 0)
	       && warn_bidi_or_invalid_utf8_p)
	/* Byte with the MSB set: hand it to the bidi / invalid UTF-8
	   checks, which return where to continue.  */
	cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
					  warn_invalid_utf8_p);
    }

  buffer->cur = cur;
  _cpp_process_line_notes (pfile, in_comment: true);
  return false;
}
1916
/* Skip a C++ line comment, leaving buffer->cur pointing to the
   terminating newline.  Handles escaped newlines.  Returns nonzero
   if a multiline comment, i.e. if the line table's highest line
   changed while the comment was being skipped.

   One of three scanning loops is used, depending on which of the
   -Wbidi-chars and -Winvalid-utf8 checks are enabled.  */
static int
skip_line_comment (cpp_reader *pfile)
{
  cpp_buffer *buffer = pfile->buffer;
  location_t orig_line = pfile->line_table->highest_line;
  const bool warn_bidi_p = pfile->warn_bidi_p ();
  const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
  const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;

  /* Neither check enabled: just find the newline.  */
  if (!warn_bidi_or_invalid_utf8_p)
    while (*buffer->cur != '\n')
      buffer->cur++;
  /* Only the bidi check enabled.  */
  else if (!warn_invalid_utf8_p)
    {
      /* Scan quickly until the newline or the first byte that could
	 start a bidi control character.  */
      while (*buffer->cur != '\n'
	     && *buffer->cur != bidi::utf8_start)
	buffer->cur++;
      if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
	{
	  /* Such a byte was seen: examine every candidate on the rest
	     of the line.  */
	  while (*buffer->cur != '\n')
	    {
	      if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
		{
		  location_t loc;
		  bidi::kind kind = get_bidi_utf8 (pfile, p: buffer->cur, out: &loc);
		  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
		}
	      buffer->cur++;
	    }
	  /* The comment ends here; report contexts left open.  */
	  maybe_warn_bidi_on_close (pfile, p: buffer->cur);
	}
    }
  /* The invalid UTF-8 check is enabled, possibly with the bidi check.  */
  else
    {
      while (*buffer->cur != '\n')
	{
	  /* Bytes without the MSB set need no checking.  */
	  if (*buffer->cur < utf8_continuation)
	    {
	      buffer->cur++;
	      continue;
	    }
	  /* The helper expects CUR to point just past the byte it is
	     given, and returns where to continue.  */
	  buffer->cur
	    = _cpp_handle_multibyte_utf8 (pfile, c: *buffer->cur, cur: buffer->cur + 1,
					  warn_bidi_p, warn_invalid_utf8_p);
	}
      if (warn_bidi_p)
	maybe_warn_bidi_on_close (pfile, p: buffer->cur);
    }

  _cpp_process_line_notes (pfile, in_comment: true);
  return orig_line != pfile->line_table->highest_line;
}
1972
1973/* Skips whitespace, saving the next non-whitespace character. */
1974static void
1975skip_whitespace (cpp_reader *pfile, cppchar_t c)
1976{
1977 cpp_buffer *buffer = pfile->buffer;
1978 bool saw_NUL = false;
1979
1980 do
1981 {
1982 /* Horizontal space always OK. */
1983 if (c == ' ' || c == '\t')
1984 ;
1985 /* Just \f \v or \0 left. */
1986 else if (c == '\0')
1987 saw_NUL = true;
1988 else if (pfile->state.in_directive)
1989 cpp_pedwarning_with_line (pfile, CPP_W_PEDANTIC,
1990 pfile->line_table->highest_line,
1991 CPP_BUF_COL (buffer),
1992 msgid: "%s in preprocessing directive",
1993 c == '\f' ? "form feed" : "vertical tab");
1994
1995 c = *buffer->cur++;
1996 }
1997 /* We only want non-vertical space, i.e. ' ' \t \f \v \0. */
1998 while (is_nvspace (c));
1999
2000 if (saw_NUL)
2001 {
2002 encoding_rich_location rich_loc (pfile);
2003 cpp_error_at (pfile, CPP_DL_WARNING, richloc: &rich_loc,
2004 msgid: "null character(s) ignored");
2005 }
2006
2007 buffer->cur--;
2008}
2009
2010/* See if the characters of a number token are valid in a name (no
2011 '.', '+' or '-'). */
2012static int
2013name_p (cpp_reader *pfile, const cpp_string *string)
2014{
2015 unsigned int i;
2016
2017 for (i = 0; i < string->len; i++)
2018 if (!is_idchar (string->text[i]))
2019 return 0;
2020
2021 return 1;
2022}
2023
2024/* After parsing an identifier or other sequence, produce a warning about
2025 sequences not in NFC/NFKC. */
2026static void
2027warn_about_normalization (cpp_reader *pfile,
2028 const cpp_token *token,
2029 const struct normalize_state *s,
2030 bool identifier)
2031{
2032 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
2033 && !pfile->state.skipping)
2034 {
2035 location_t loc = token->src_loc;
2036
2037 /* If possible, create a location range for the token. */
2038 if (loc >= RESERVED_LOCATION_COUNT
2039 && token->type != CPP_EOF
2040 && !pfile->forced_token_location
2041 /* There must be no line notes to process. */
2042 && (!(pfile->buffer->cur
2043 >= pfile->buffer->notes[pfile->buffer->cur_note].pos
2044 && !pfile->overlaid_buffer)))
2045 {
2046 source_range tok_range;
2047 tok_range.m_start = loc;
2048 tok_range.m_finish
2049 = linemap_position_for_column (pfile->line_table,
2050 CPP_BUF_COLUMN (pfile->buffer,
2051 pfile->buffer->cur));
2052 loc = pfile->line_table->get_or_create_combined_loc (locus: loc, src_range: tok_range,
2053 data: nullptr, discriminator: 0);
2054 }
2055
2056 encoding_rich_location rich_loc (pfile, loc);
2057
2058 /* Make sure that the token is printed using UCNs, even
2059 if we'd otherwise happily print UTF-8. */
2060 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
2061 size_t sz;
2062
2063 sz = cpp_spell_token (pfile, token, buf, false) - buf;
2064 if (NORMALIZE_STATE_RESULT (s) == normalized_C)
2065 cpp_warning_at (pfile, CPP_W_NORMALIZE, richloc: &rich_loc,
2066 msgid: "%<%.*s%> is not in NFKC", (int) sz, buf);
2067 else if (identifier && CPP_OPTION (pfile, xid_identifiers))
2068 cpp_pedwarning_at (pfile, CPP_W_NORMALIZE, richloc: &rich_loc,
2069 msgid: "%<%.*s%> is not in NFC", (int) sz, buf);
2070 else
2071 cpp_warning_at (pfile, CPP_W_NORMALIZE, richloc: &rich_loc,
2072 msgid: "%<%.*s%> is not in NFC", (int) sz, buf);
2073 free (ptr: buf);
2074 }
2075}
2076
/* Returns TRUE if the byte sequence starting at buffer->cur is a valid
   extended character in an identifier.  If FIRST is TRUE, then the character
   must be valid at the beginning of an identifier as well.  If the return
   value is TRUE, then pfile->buffer->cur has been moved to point to the next
   byte after the extended character.

   Three forms are handled: '$' (when dollars are allowed in
   identifiers), a UTF-8 character, and a \u, \U or \N escape; the
   latter two only when extended identifiers are enabled.  STATE is
   passed on to the UTF-8/UCN validators.  Bidi control characters are
   reported on the way when that warning is enabled.  */

static bool
forms_identifier_p (cpp_reader *pfile, int first,
		    struct normalize_state *state)
{
  cpp_buffer *buffer = pfile->buffer;
  const bool warn_bidi_p = pfile->warn_bidi_p ();

  if (*buffer->cur == '$')
    {
      if (!CPP_OPTION (pfile, dollars_in_ident))
	return false;

      buffer->cur++;
      if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
	{
	  /* Clear the option first so that this is diagnosed only
	     once.  */
	  CPP_OPTION (pfile, warn_dollars) = 0;
	  cpp_error (pfile, CPP_DL_PEDWARN, msgid: "%<$%> in identifier or number");
	}

      return true;
    }

  /* Is this a syntactically valid UCN or a valid UTF-8 char?  */
  if (CPP_OPTION (pfile, extended_identifiers))
    {
      cppchar_t s;
      if (*buffer->cur >= utf8_signifier)
	{
	  /* Possible start of a UTF-8 encoded bidi control character.  */
	  if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
	      && warn_bidi_p)
	    {
	      location_t loc;
	      bidi::kind kind = get_bidi_utf8 (pfile, p: buffer->cur, out: &loc);
	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
	    }
	  /* The identifier position argument is 1 for a first
	     character and 2 otherwise.  */
	  if (_cpp_valid_utf8 (pfile, pstr: &buffer->cur, limit: buffer->rlimit, identifier_pos: 1 + !first,
			       nst: state, cp: &s))
	    return true;
	}
      else if (*buffer->cur == '\\'
	       && (buffer->cur[1] == 'u'
		   || buffer->cur[1] == 'U'
		   || buffer->cur[1] == 'N'))
	{
	  /* Step past the backslash and the escape letter; the letter
	     is then available as buffer->cur[-1].  */
	  buffer->cur += 2;
	  if (warn_bidi_p)
	    {
	      location_t loc;
	      bidi::kind kind;
	      if (buffer->cur[-1] == 'N')
		kind = get_bidi_named (pfile, p: buffer->cur, out: &loc);
	      else
		kind = get_bidi_ucn (pfile, p: buffer->cur,
				     is_U: buffer->cur[-1] == 'U', out: &loc);
	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
	    }
	  if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
			      state, &s, NULL, NULL))
	    return true;
	  /* Not a valid UCN here: undo the two-byte advance above.  */
	  buffer->cur -= 2;
	}
    }

  return false;
}
2148
2149/* Helper function to issue error about improper __VA_OPT__ use. */
2150static void
2151maybe_va_opt_error (cpp_reader *pfile)
2152{
2153 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt))
2154 {
2155 /* __VA_OPT__ should not be accepted at all, but allow it in
2156 system headers. */
2157 if (!_cpp_in_system_header (pfile))
2158 {
2159 if (CPP_OPTION (pfile, cplusplus))
2160 cpp_pedwarning (pfile, CPP_W_CXX20_EXTENSIONS,
2161 msgid: "%<__VA_OPT__%> is not available until C++20");
2162 else
2163 cpp_pedwarning (pfile, CPP_W_PEDANTIC,
2164 msgid: "%<__VA_OPT__%> is not available until C23");
2165 }
2166 }
2167 else if (!pfile->state.va_args_ok)
2168 {
2169 /* __VA_OPT__ should only appear in the replacement list of a
2170 variadic macro. */
2171 cpp_error (pfile, CPP_DL_PEDWARN,
2172 msgid: "%<__VA_OPT__%> can only appear in the expansion"
2173 " of a C++20 variadic macro");
2174 }
2175}
2176
2177/* Helper function to perform diagnostics that are needed (rarely)
2178 when an identifier is lexed. */
2179static void
2180identifier_diagnostics_on_lex (cpp_reader *pfile, cpp_hashnode *node)
2181{
2182 if (__builtin_expect (!(node->flags & NODE_DIAGNOSTIC)
2183 || pfile->state.skipping, 1))
2184 return;
2185
2186 /* It is allowed to poison the same identifier twice. */
2187 if ((node->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
2188 {
2189 cpp_error (pfile, CPP_DL_ERROR, msgid: "attempt to use poisoned %qs",
2190 NODE_NAME (node));
2191 const auto data = (cpp_hashnode_extra *)
2192 ht_lookup (ht: pfile->extra_hash_table, id: node->ident, opt: HT_NO_INSERT);
2193 if (data && data->poisoned_loc)
2194 cpp_error_at (pfile, CPP_DL_NOTE, src_loc: data->poisoned_loc, msgid: "poisoned here");
2195 }
2196
2197 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
2198 replacement list of a variadic macro. */
2199 if (node == pfile->spec_nodes.n__VA_ARGS__
2200 && !pfile->state.va_args_ok)
2201 {
2202 if (CPP_OPTION (pfile, cplusplus))
2203 cpp_error (pfile, CPP_DL_PEDWARN,
2204 msgid: "%<__VA_ARGS__%> can only appear in the expansion"
2205 " of a C++11 variadic macro");
2206 else
2207 cpp_error (pfile, CPP_DL_PEDWARN,
2208 msgid: "%<__VA_ARGS__%> can only appear in the expansion"
2209 " of a C99 variadic macro");
2210 }
2211
2212 /* __VA_OPT__ should only appear in the replacement list of a
2213 variadic macro. */
2214 if (node == pfile->spec_nodes.n__VA_OPT__)
2215 maybe_va_opt_error (pfile);
2216
2217 /* For -Wc++-compat, warn about use of C++ named operators. */
2218 if (node->flags & NODE_WARN_OPERATOR)
2219 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
2220 msgid: "identifier %qs is a special operator name in C++",
2221 NODE_NAME (node));
2222}
2223
2224/* Lex an identifier starting at BASE. BUFFER->CUR is expected to point
2225 one past the first character at BASE, which may be a (possibly multi-byte)
2226 character if STARTS_UCN is true. */
2227static cpp_hashnode *
2228lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
2229 struct normalize_state *nst, cpp_hashnode **spelling)
2230{
2231 cpp_hashnode *result;
2232 const uchar *cur;
2233 unsigned int len;
2234 unsigned int hash = HT_HASHSTEP (0, *base);
2235 const bool warn_bidi_p = pfile->warn_bidi_p ();
2236
2237 cur = pfile->buffer->cur;
2238 if (! starts_ucn)
2239 {
2240 while (ISIDNUM (*cur))
2241 {
2242 hash = HT_HASHSTEP (hash, *cur);
2243 cur++;
2244 }
2245 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1));
2246 }
2247 pfile->buffer->cur = cur;
2248 if (starts_ucn || forms_identifier_p (pfile, first: false, state: nst))
2249 {
2250 /* Slower version for identifiers containing UCNs
2251 or extended chars (including $). */
2252 do {
2253 while (ISIDNUM (*pfile->buffer->cur))
2254 {
2255 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur);
2256 pfile->buffer->cur++;
2257 }
2258 } while (forms_identifier_p (pfile, first: false, state: nst));
2259 if (warn_bidi_p)
2260 maybe_warn_bidi_on_close (pfile, p: pfile->buffer->cur);
2261 result = _cpp_interpret_identifier (pfile, id: base,
2262 len: pfile->buffer->cur - base);
2263 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base);
2264 }
2265 else
2266 {
2267 len = cur - base;
2268 hash = HT_HASHFINISH (hash, len);
2269
2270 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
2271 base, len, hash, HT_ALLOC));
2272 *spelling = result;
2273 }
2274
2275 return result;
2276}
2277
2278/* Struct to hold the return value of the scan_cur_identifier () helper
2279 function below. */
2280
2281struct scan_id_result
2282{
2283 cpp_hashnode *node;
2284 normalize_state nst;
2285
2286 scan_id_result ()
2287 : node (nullptr)
2288 {
2289 nst = INITIAL_NORMALIZE_STATE;
2290 }
2291
2292 explicit operator bool () const { return node; }
2293};
2294
2295/* Helper function to scan an entire identifier beginning at
2296 pfile->buffer->cur, and possibly containing extended characters (UCNs
2297 and/or UTF-8). Returns the cpp_hashnode for the identifier on success, or
2298 else nullptr, as well as a normalize_state so that normalization warnings
2299 may be issued once the token lexing is complete. */
2300
2301static scan_id_result
2302scan_cur_identifier (cpp_reader *pfile)
2303{
2304 const auto buffer = pfile->buffer;
2305 const auto begin = buffer->cur;
2306 scan_id_result result;
2307 if (ISIDST (*buffer->cur))
2308 {
2309 ++buffer->cur;
2310 cpp_hashnode *ignore;
2311 result.node = lex_identifier (pfile, base: begin, starts_ucn: false, nst: &result.nst, spelling: &ignore);
2312 }
2313 else if (forms_identifier_p (pfile, first: true, state: &result.nst))
2314 {
2315 /* buffer->cur has been moved already by the call
2316 to forms_identifier_p. */
2317 cpp_hashnode *ignore;
2318 result.node = lex_identifier (pfile, base: begin, starts_ucn: true, nst: &result.nst, spelling: &ignore);
2319 }
2320 return result;
2321}
2322
2323/* Lex a number to NUMBER starting at BUFFER->CUR - 1. */
2324static void
2325lex_number (cpp_reader *pfile, cpp_string *number,
2326 struct normalize_state *nst)
2327{
2328 const uchar *cur;
2329 const uchar *base;
2330 uchar *dest;
2331
2332 base = pfile->buffer->cur - 1;
2333 do
2334 {
2335 const uchar *adj_digit_sep = NULL;
2336 cur = pfile->buffer->cur;
2337
2338 /* N.B. ISIDNUM does not include $. */
2339 while (ISIDNUM (*cur)
2340 || (*cur == '.' && !DIGIT_SEP (cur[-1]))
2341 || DIGIT_SEP (*cur)
2342 || (VALID_SIGN (*cur, cur[-1]) && !DIGIT_SEP (cur[-2])))
2343 {
2344 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur);
2345 /* Adjacent digit separators do not form part of the pp-number syntax.
2346 However, they can safely be diagnosed here as an error, since '' is
2347 not a valid preprocessing token. */
2348 if (DIGIT_SEP (*cur) && DIGIT_SEP (cur[-1]) && !adj_digit_sep)
2349 adj_digit_sep = cur;
2350 cur++;
2351 }
2352 /* A number can't end with a digit separator. */
2353 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1]))
2354 --cur;
2355 if (adj_digit_sep && adj_digit_sep < cur)
2356 cpp_error (pfile, CPP_DL_ERROR, msgid: "adjacent digit separators");
2357
2358 pfile->buffer->cur = cur;
2359 }
2360 while (forms_identifier_p (pfile, first: false, state: nst));
2361
2362 number->len = cur - base;
2363 dest = _cpp_unaligned_alloc (pfile, number->len + 1);
2364 memcpy (dest: dest, src: base, n: number->len);
2365 dest[number->len] = '\0';
2366 number->text = dest;
2367}
2368
2369/* Create a token of type TYPE with a literal spelling. */
2370static void
2371create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
2372 unsigned int len, enum cpp_ttype type)
2373{
2374 token->type = type;
2375 token->val.str.len = len;
2376 token->val.str.text = cpp_alloc_token_string (pfile, base, len);
2377}
2378
2379/* Like create_literal(), but construct it from two separate strings
2380 which are concatenated. LEN2 may be 0 if no second string is
2381 required. */
2382static void
2383create_literal2 (cpp_reader *pfile, cpp_token *token, const uchar *base1,
2384 unsigned int len1, const uchar *base2, unsigned int len2,
2385 enum cpp_ttype type)
2386{
2387 token->type = type;
2388 token->val.str.len = len1 + len2;
2389 uchar *const dest = _cpp_unaligned_alloc (pfile, len1 + len2 + 1);
2390 memcpy (dest: dest, src: base1, n: len1);
2391 if (len2)
2392 memcpy (dest: dest+len1, src: base2, n: len2);
2393 dest[len1 + len2] = 0;
2394 token->val.str.text = dest;
2395}
2396
2397const uchar *
2398cpp_alloc_token_string (cpp_reader *pfile,
2399 const unsigned char *ptr, unsigned len)
2400{
2401 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
2402
2403 dest[len] = 0;
2404 memcpy (dest: dest, src: ptr, n: len);
2405 return dest;
2406}
2407
2408/* A pair of raw buffer pointers. The currently open one is [1], the
2409 first one is [0]. Used for string literal lexing. */
2410struct lit_accum {
2411 _cpp_buff *first;
2412 _cpp_buff *last;
2413 const uchar *rpos;
2414 size_t accum;
2415
2416 lit_accum ()
2417 : first (NULL), last (NULL), rpos (0), accum (0)
2418 {
2419 }
2420
2421 void append (cpp_reader *, const uchar *, size_t);
2422
2423 void read_begin (cpp_reader *);
2424 bool reading_p () const
2425 {
2426 return rpos != NULL;
2427 }
2428 char read_char ()
2429 {
2430 char c = *rpos++;
2431 if (rpos == BUFF_FRONT (last))
2432 rpos = NULL;
2433 return c;
2434 }
2435
2436 void create_literal2 (cpp_reader *pfile, cpp_token *token,
2437 const uchar *base1, unsigned int len1,
2438 const uchar *base2, unsigned int len2,
2439 enum cpp_ttype type);
2440};
2441
2442/* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
2443 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */
2444
2445void
2446lit_accum::append (cpp_reader *pfile, const uchar *base, size_t len)
2447{
2448 if (!last)
2449 /* Starting. */
2450 first = last = _cpp_get_buff (pfile, len);
2451 else if (len > BUFF_ROOM (last))
2452 {
2453 /* There is insufficient room in the buffer. Copy what we can,
2454 and then either extend or create a new one. */
2455 size_t room = BUFF_ROOM (last);
2456 memcpy (BUFF_FRONT (last), src: base, n: room);
2457 BUFF_FRONT (last) += room;
2458 base += room;
2459 len -= room;
2460 accum += room;
2461
2462 gcc_checking_assert (!rpos);
2463
2464 last = _cpp_append_extend_buff (pfile, last, len);
2465 }
2466
2467 memcpy (BUFF_FRONT (last), src: base, n: len);
2468 BUFF_FRONT (last) += len;
2469 accum += len;
2470}
2471
2472void
2473lit_accum::read_begin (cpp_reader *pfile)
2474{
2475 /* We never accumulate more than 4 chars to read. */
2476 if (BUFF_ROOM (last) < 4)
2477
2478 last = _cpp_append_extend_buff (pfile, last, 4);
2479 rpos = BUFF_FRONT (last);
2480}
2481
2482/* Helper function to check if a string format macro, say from inttypes.h, is
2483 placed touching a string literal, in which case it could be parsed as a C++11
2484 user-defined string literal thus breaking the program. Return TRUE if the
2485 UDL should be ignored for now and preserved for potential macro
2486 expansion. */
2487
2488static bool
2489maybe_ignore_udl_macro_suffix (cpp_reader *pfile, location_t src_loc,
2490 const uchar *suffix_begin, cpp_hashnode *node)
2491{
2492 /* User-defined literals outside of namespace std must start with a single
2493 underscore, so assume anything of that form really is a UDL suffix.
2494 We don't need to worry about UDLs defined inside namespace std because
2495 their names are reserved, so cannot be used as macro names in valid
2496 programs. */
2497 if ((suffix_begin[0] == '_' && suffix_begin[1] != '_')
2498 || !cpp_macro_p (node))
2499 return false;
2500
2501 /* Maybe raise a warning here; caller should arrange not to consume
2502 the tokens. */
2503 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
2504 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, src_loc, 0,
2505 msgid: "invalid suffix on literal; C++11 requires a space "
2506 "between literal and string macro");
2507 return true;
2508}
2509
2510/* Like create_literal2(), but also prepend all the accumulated data from
2511 the lit_accum struct. */
2512void
2513lit_accum::create_literal2 (cpp_reader *pfile, cpp_token *token,
2514 const uchar *base1, unsigned int len1,
2515 const uchar *base2, unsigned int len2,
2516 enum cpp_ttype type)
2517{
2518 const unsigned int tot_len = accum + len1 + len2;
2519 uchar *dest = _cpp_unaligned_alloc (pfile, tot_len + 1);
2520 token->type = type;
2521 token->val.str.len = tot_len;
2522 token->val.str.text = dest;
2523 for (_cpp_buff *buf = first; buf; buf = buf->next)
2524 {
2525 size_t len = BUFF_FRONT (buf) - buf->base;
2526 memcpy (dest: dest, src: buf->base, n: len);
2527 dest += len;
2528 }
2529 memcpy (dest: dest, src: base1, n: len1);
2530 dest += len1;
2531 if (len2)
2532 memcpy (dest: dest, src: base2, n: len2);
2533 dest += len2;
2534 *dest = '\0';
2535}
2536
2537/* Lexes a raw string. The stored string contains the spelling,
2538 including double quotes, delimiter string, '(' and ')', any leading
2539 'L', 'u', 'U' or 'u8' and 'R' modifier. The created token contains
2540 the type of the literal, or CPP_OTHER if it was not properly
2541 terminated.
2542
2543 BASE is the start of the token. Updates pfile->buffer->cur to just
2544 after the lexed string.
2545
2546 The spelling is NUL-terminated, but it is not guaranteed that this
2547 is the first NUL since embedded NULs are preserved. */
2548
2549static void
2550lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2551{
2552 const uchar *pos = base;
2553 const bool warn_bidi_p = pfile->warn_bidi_p ();
2554 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2555 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2556
2557 /* 'tis a pity this information isn't passed down from the lexer's
2558 initial categorization of the token. */
2559 enum cpp_ttype type = CPP_STRING;
2560
2561 if (*pos == 'L')
2562 {
2563 type = CPP_WSTRING;
2564 pos++;
2565 }
2566 else if (*pos == 'U')
2567 {
2568 type = CPP_STRING32;
2569 pos++;
2570 }
2571 else if (*pos == 'u')
2572 {
2573 if (pos[1] == '8')
2574 {
2575 type = CPP_UTF8STRING;
2576 pos++;
2577 }
2578 else
2579 type = CPP_STRING16;
2580 pos++;
2581 }
2582
2583 gcc_checking_assert (pos[0] == 'R' && pos[1] == '"');
2584 pos += 2;
2585
2586 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
2587
2588 /* Skip notes before the ". */
2589 while (note->pos < pos)
2590 ++note;
2591
2592 lit_accum accum;
2593
2594 uchar prefix[17];
2595 unsigned prefix_len = 0;
2596 enum Phase
2597 {
2598 PHASE_PREFIX = -2,
2599 PHASE_NONE = -1,
2600 PHASE_SUFFIX = 0
2601 } phase = PHASE_PREFIX;
2602
2603 for (;;)
2604 {
2605 gcc_checking_assert (note->pos >= pos);
2606
2607 /* Undo any escaped newlines and trigraphs. */
2608 if (!accum.reading_p () && note->pos == pos)
2609 switch (note->type)
2610 {
2611 case '\\':
2612 case ' ':
2613 /* Restore backslash followed by newline. */
2614 accum.append (pfile, base, len: pos - base);
2615 base = pos;
2616 accum.read_begin (pfile);
2617 accum.append (pfile, UC"\\", len: 1);
2618
2619 after_backslash:
2620 if (note->type == ' ')
2621 /* GNU backslash whitespace newline extension. FIXME
2622 could be any sequence of non-vertical space. When we
2623 can properly restore any such sequence, we should
2624 mark this note as handled so _cpp_process_line_notes
2625 doesn't warn. */
2626 accum.append (pfile, UC" ", len: 1);
2627
2628 accum.append (pfile, UC"\n", len: 1);
2629 note++;
2630 break;
2631
2632 case '\n':
2633 /* This can happen for ??/<NEWLINE> when trigraphs are not
2634 being interpretted. */
2635 gcc_checking_assert (!CPP_OPTION (pfile, trigraphs));
2636 note->type = 0;
2637 note++;
2638 break;
2639
2640 case 'W':
2641 case 'L':
2642 case 'S':
2643 case 'T':
2644 /* Don't warn about leading or trailing whitespace in raw string
2645 literals. */
2646 note->type = 0;
2647 note++;
2648 break;
2649
2650 default:
2651 gcc_checking_assert (_cpp_trigraph_map[note->type]);
2652
2653 /* Don't warn about this trigraph in
2654 _cpp_process_line_notes, since trigraphs show up as
2655 trigraphs in raw strings. */
2656 uchar type = note->type;
2657 note->type = 0;
2658
2659 if (CPP_OPTION (pfile, trigraphs))
2660 {
2661 accum.append (pfile, base, len: pos - base);
2662 base = pos;
2663 accum.read_begin (pfile);
2664 accum.append (pfile, UC"??", len: 2);
2665 accum.append (pfile, base: &type, len: 1);
2666
2667 /* ??/ followed by newline gets two line notes, one for
2668 the trigraph and one for the backslash/newline. */
2669 if (type == '/' && note[1].pos == pos)
2670 {
2671 note++;
2672 gcc_assert (note->type == '\\' || note->type == ' ');
2673 goto after_backslash;
2674 }
2675 /* Skip the replacement character. */
2676 base = ++pos;
2677 }
2678
2679 note++;
2680 break;
2681 }
2682
2683 /* Now get a char to process. Either from an expanded note, or
2684 from the line buffer. */
2685 bool read_note = accum.reading_p ();
2686 char c = read_note ? accum.read_char () : *pos++;
2687
2688 if (phase == PHASE_PREFIX)
2689 {
2690 if (c == '(')
2691 {
2692 /* Done. */
2693 phase = PHASE_NONE;
2694 prefix[prefix_len++] = '"';
2695 }
2696 else if (prefix_len < 16
2697 /* Prefix chars are any of the basic character set,
2698 [lex.charset] except for '
2699 ()\\\t\v\f\n'. Optimized for a contiguous
2700 alphabet. */
2701 /* Unlike a switch, this collapses down to one or
2702 two shift and bitmask operations on an ASCII
2703 system, with an outlier or two. */
2704 && (('Z' - 'A' == 25
2705 ? ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
2706 : ISIDST (c))
2707 || (c >= '0' && c <= '9')
2708 || c == '_' || c == '{' || c == '}'
2709 || c == '[' || c == ']' || c == '#'
2710 || c == '<' || c == '>' || c == '%'
2711 || c == ':' || c == ';' || c == '.' || c == '?'
2712 || c == '*' || c == '+' || c == '-' || c == '/'
2713 || c == '^' || c == '&' || c == '|' || c == '~'
2714 || c == '!' || c == '=' || c == ','
2715 || c == '"' || c == '\''
2716 || ((c == '$' || c == '@' || c == '`')
2717 && (CPP_OPTION (pfile, cplusplus)
2718 ? CPP_OPTION (pfile, lang) > CLK_CXX23
2719 : CPP_OPTION (pfile, low_ucns)))))
2720 prefix[prefix_len++] = c;
2721 else
2722 {
2723 /* Something is wrong. */
2724 int col = CPP_BUF_COLUMN (pfile->buffer, pos) + read_note;
2725 if (prefix_len == 16)
2726 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2727 col, msgid: "raw string delimiter longer "
2728 "than 16 characters");
2729 else if (c == '\n')
2730 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2731 col, msgid: "invalid new-line in raw "
2732 "string delimiter");
2733 else
2734 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc,
2735 col, msgid: "invalid character '%c' in "
2736 "raw string delimiter", c);
2737 type = CPP_OTHER;
2738 phase = PHASE_NONE;
2739 /* Continue until we get a close quote, that's probably
2740 the best failure mode. */
2741 prefix_len = 0;
2742 }
2743 if (c != '\n')
2744 continue;
2745 }
2746
2747 if (phase != PHASE_NONE)
2748 {
2749 if (prefix[phase] != c)
2750 phase = PHASE_NONE;
2751 else if (unsigned (phase + 1) == prefix_len)
2752 break;
2753 else
2754 {
2755 phase = Phase (phase + 1);
2756 continue;
2757 }
2758 }
2759
2760 if (!prefix_len && c == '"')
2761 /* Failure mode lexing. */
2762 goto out;
2763 else if (prefix_len && c == ')')
2764 phase = PHASE_SUFFIX;
2765 else if (!read_note && c == '\n')
2766 {
2767 pos--;
2768 pfile->buffer->cur = pos;
2769 if ((pfile->state.in_directive || pfile->state.parsing_args
2770 || pfile->state.in_deferred_pragma)
2771 && pfile->buffer->next_line >= pfile->buffer->rlimit)
2772 {
2773 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
2774 msgid: "unterminated raw string");
2775 type = CPP_OTHER;
2776 goto out;
2777 }
2778
2779 accum.append (pfile, base, len: pos - base + 1);
2780 _cpp_process_line_notes (pfile, in_comment: false);
2781
2782 if (pfile->buffer->next_line < pfile->buffer->rlimit)
2783 CPP_INCREMENT_LINE (pfile, 0);
2784 pfile->buffer->need_line = true;
2785
2786 if (!get_fresh_line_impl<true> (pfile))
2787 {
2788 /* We ran out of file and failed to get a line. */
2789 location_t src_loc = token->src_loc;
2790 token->type = CPP_EOF;
2791 /* Tell the compiler the line number of the EOF token. */
2792 token->src_loc = pfile->line_table->highest_line;
2793 token->flags = BOL;
2794 if (accum.first)
2795 _cpp_release_buff (pfile, accum.first);
2796 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
2797 msgid: "unterminated raw string");
2798
2799 /* Now pop the buffer that get_fresh_line_impl() did not. Popping
2800 is not safe if processing a directive, however this cannot
2801 happen as we already checked above that a line would be
2802 available, and get_fresh_line_impl() can't fail in this
2803 case. */
2804 gcc_assert (!pfile->state.in_directive);
2805 _cpp_pop_buffer (pfile);
2806
2807 return;
2808 }
2809
2810 pos = base = pfile->buffer->cur;
2811 note = &pfile->buffer->notes[pfile->buffer->cur_note];
2812 }
2813 else if (__builtin_expect ((unsigned char) c >= utf8_continuation, 0)
2814 && warn_bidi_or_invalid_utf8_p)
2815 pos = _cpp_handle_multibyte_utf8 (pfile, c, cur: pos, warn_bidi_p,
2816 warn_invalid_utf8_p);
2817 }
2818
2819 if (warn_bidi_p)
2820 maybe_warn_bidi_on_close (pfile, p: pos);
2821
2822 if (CPP_OPTION (pfile, user_literals))
2823 {
2824 const uchar *const suffix_begin = pos;
2825 pfile->buffer->cur = pos;
2826
2827 if (const auto sr = scan_cur_identifier (pfile))
2828 {
2829 if (maybe_ignore_udl_macro_suffix (pfile, src_loc: token->src_loc,
2830 suffix_begin, node: sr.node))
2831 pfile->buffer->cur = suffix_begin;
2832 else
2833 {
2834 type = cpp_userdef_string_add_type (type);
2835 accum.create_literal2 (pfile, token, base1: base, len1: suffix_begin - base,
2836 NODE_NAME (sr.node), NODE_LEN (sr.node),
2837 type);
2838 if (accum.first)
2839 _cpp_release_buff (pfile, accum.first);
2840 warn_about_normalization (pfile, token, s: &sr.nst, identifier: true);
2841 return;
2842 }
2843 }
2844 }
2845
2846 out:
2847 pfile->buffer->cur = pos;
2848 if (!accum.accum)
2849 create_literal (pfile, token, base, len: pos - base, type);
2850 else
2851 {
2852 accum.create_literal2 (pfile, token, base1: base, len1: pos - base, base2: nullptr, len2: 0, type);
2853 _cpp_release_buff (pfile, accum.first);
2854 }
2855}
2856
2857/* Lexes a string, character constant, or angle-bracketed header file
2858 name. The stored string contains the spelling, including opening
2859 quote and any leading 'L', 'u', 'U' or 'u8' and optional
2860 'R' modifier. It returns the type of the literal, or CPP_OTHER
2861 if it was not properly terminated, or CPP_LESS for an unterminated
2862 header name which must be relexed as normal tokens.
2863
2864 The spelling is NUL-terminated, but it is not guaranteed that this
2865 is the first NUL since embedded NULs are preserved. */
2866static void
2867lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
2868{
2869 bool saw_NUL = false;
2870 const uchar *cur;
2871 cppchar_t terminator;
2872 enum cpp_ttype type;
2873
2874 cur = base;
2875 terminator = *cur++;
2876 if (terminator == 'L' || terminator == 'U')
2877 terminator = *cur++;
2878 else if (terminator == 'u')
2879 {
2880 terminator = *cur++;
2881 if (terminator == '8')
2882 terminator = *cur++;
2883 }
2884 if (terminator == 'R')
2885 {
2886 lex_raw_string (pfile, token, base);
2887 return;
2888 }
2889 if (terminator == '"')
2890 type = (*base == 'L' ? CPP_WSTRING :
2891 *base == 'U' ? CPP_STRING32 :
2892 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
2893 : CPP_STRING);
2894 else if (terminator == '\'')
2895 type = (*base == 'L' ? CPP_WCHAR :
2896 *base == 'U' ? CPP_CHAR32 :
2897 *base == 'u' ? (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16)
2898 : CPP_CHAR);
2899 else
2900 terminator = '>', type = CPP_HEADER_NAME;
2901
2902 const bool warn_bidi_p = pfile->warn_bidi_p ();
2903 const bool warn_invalid_utf8_p = CPP_OPTION (pfile, cpp_warn_invalid_utf8);
2904 const bool warn_bidi_or_invalid_utf8_p = warn_bidi_p | warn_invalid_utf8_p;
2905 for (;;)
2906 {
2907 cppchar_t c = *cur++;
2908
2909 /* In #include-style directives, terminators are not escapable. */
2910 if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
2911 {
2912 if ((cur[0] == 'u' || cur[0] == 'U' || cur[0] == 'N') && warn_bidi_p)
2913 {
2914 location_t loc;
2915 bidi::kind kind;
2916 if (cur[0] == 'N')
2917 kind = get_bidi_named (pfile, p: cur + 1, out: &loc);
2918 else
2919 kind = get_bidi_ucn (pfile, p: cur + 1, is_U: cur[0] == 'U', out: &loc);
2920 maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
2921 }
2922 cur++;
2923 }
2924 else if (c == terminator)
2925 {
2926 if (warn_bidi_p)
2927 maybe_warn_bidi_on_close (pfile, p: cur - 1);
2928 break;
2929 }
2930 else if (c == '\n')
2931 {
2932 cur--;
2933 /* Unmatched quotes always yield undefined behavior, but
2934 greedy lexing means that what appears to be an unterminated
2935 header name may actually be a legitimate sequence of tokens. */
2936 if (terminator == '>')
2937 {
2938 token->type = CPP_LESS;
2939 return;
2940 }
2941 type = CPP_OTHER;
2942 break;
2943 }
2944 else if (c == '\0')
2945 saw_NUL = true;
2946 else if (__builtin_expect (c >= utf8_continuation, 0)
2947 && warn_bidi_or_invalid_utf8_p)
2948 cur = _cpp_handle_multibyte_utf8 (pfile, c, cur, warn_bidi_p,
2949 warn_invalid_utf8_p);
2950 }
2951
2952 if (saw_NUL && !pfile->state.skipping)
2953 cpp_error (pfile, CPP_DL_WARNING,
2954 msgid: "null character(s) preserved in literal");
2955
2956 if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
2957 cpp_error (pfile, CPP_DL_PEDWARN, msgid: "missing terminating %c character",
2958 (int) terminator);
2959
2960 pfile->buffer->cur = cur;
2961 const uchar *const suffix_begin = cur;
2962
2963 if (CPP_OPTION (pfile, user_literals))
2964 {
2965 if (const auto sr = scan_cur_identifier (pfile))
2966 {
2967 if (maybe_ignore_udl_macro_suffix (pfile, src_loc: token->src_loc,
2968 suffix_begin, node: sr.node))
2969 pfile->buffer->cur = suffix_begin;
2970 else
2971 {
2972 /* Grab user defined literal suffix. */
2973 type = cpp_userdef_char_add_type (type);
2974 type = cpp_userdef_string_add_type (type);
2975 create_literal2 (pfile, token, base1: base, len1: suffix_begin - base,
2976 NODE_NAME (sr.node), NODE_LEN (sr.node), type);
2977 warn_about_normalization (pfile, token, s: &sr.nst, identifier: true);
2978 return;
2979 }
2980 }
2981 }
2982 else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
2983 && !pfile->state.skipping)
2984 {
2985 const auto sr = scan_cur_identifier (pfile);
2986 /* Maybe raise a warning, but do not consume the tokens. */
2987 pfile->buffer->cur = suffix_begin;
2988 if (sr && cpp_macro_p (node: sr.node))
2989 cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
2990 token->src_loc, 0, msgid: "C++11 requires a space "
2991 "between string literal and macro");
2992 }
2993
2994 create_literal (pfile, token, base, len: cur - base, type);
2995}
2996
2997/* Return the comment table. The client may not make any assumption
2998 about the ordering of the table. */
2999cpp_comment_table *
3000cpp_get_comments (cpp_reader *pfile)
3001{
3002 return &pfile->comments;
3003}
3004
3005/* Append a comment to the end of the comment table. */
3006static void
3007store_comment (cpp_reader *pfile, cpp_token *token)
3008{
3009 int len;
3010
3011 if (pfile->comments.allocated == 0)
3012 {
3013 pfile->comments.allocated = 256;
3014 pfile->comments.entries = (cpp_comment *) xmalloc
3015 (pfile->comments.allocated * sizeof (cpp_comment));
3016 }
3017
3018 if (pfile->comments.count == pfile->comments.allocated)
3019 {
3020 pfile->comments.allocated *= 2;
3021 pfile->comments.entries = (cpp_comment *) xrealloc
3022 (pfile->comments.entries,
3023 pfile->comments.allocated * sizeof (cpp_comment));
3024 }
3025
3026 len = token->val.str.len;
3027
3028 /* Copy comment. Note, token may not be NULL terminated. */
3029 pfile->comments.entries[pfile->comments.count].comment =
3030 (char *) xmalloc (sizeof (char) * (len + 1));
3031 memcpy (dest: pfile->comments.entries[pfile->comments.count].comment,
3032 src: token->val.str.text, n: len);
3033 pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
3034
3035 /* Set source location. */
3036 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
3037
3038 /* Increment the count of entries in the comment table. */
3039 pfile->comments.count++;
3040}
3041
3042/* The stored comment includes the comment start and any terminator. */
3043static void
3044save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
3045 cppchar_t type)
3046{
3047 unsigned char *buffer;
3048 unsigned int len, clen, i;
3049
3050 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */
3051
3052 /* C++ comments probably (not definitely) have moved past a new
3053 line, which we don't want to save in the comment. */
3054 if (is_vspace (pfile->buffer->cur[-1]))
3055 len--;
3056
3057 /* If we are currently in a directive or in argument parsing, then
3058 we need to store all C++ comments as C comments internally, and
3059 so we need to allocate a little extra space in that case.
3060
3061 Note that the only time we encounter a directive here is
3062 when we are saving comments in a "#define". */
3063 clen = ((pfile->state.in_directive || pfile->state.parsing_args)
3064 && type == '/') ? len + 2 : len;
3065
3066 buffer = _cpp_unaligned_alloc (pfile, clen);
3067
3068 token->type = CPP_COMMENT;
3069 token->val.str.len = clen;
3070 token->val.str.text = buffer;
3071
3072 buffer[0] = '/';
3073 memcpy (dest: buffer + 1, src: from, n: len - 1);
3074
3075 /* Finish conversion to a C comment, if necessary. */
3076 if ((pfile->state.in_directive || pfile->state.parsing_args) && type == '/')
3077 {
3078 buffer[1] = '*';
3079 buffer[clen - 2] = '*';
3080 buffer[clen - 1] = '/';
3081 /* As there can be in a C++ comments illegal sequences for C comments
3082 we need to filter them out. */
3083 for (i = 2; i < (clen - 2); i++)
3084 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
3085 buffer[i] = '|';
3086 }
3087
3088 /* Finally store this comment for use by clients of libcpp. */
3089 store_comment (pfile, token);
3090}
3091
3092/* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH
3093 comment. */
3094
3095static bool
3096fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start)
3097{
3098 const unsigned char *from = comment_start + 1;
3099
3100 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough))
3101 {
3102 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we
3103 don't recognize any comments. The latter only checks attributes,
3104 the former doesn't warn. */
3105 case 0:
3106 default:
3107 return false;
3108 /* -Wimplicit-fallthrough=1 considers any comment, no matter what
3109 content it has. */
3110 case 1:
3111 return true;
3112 case 2:
3113 /* -Wimplicit-fallthrough=2 looks for (case insensitive)
3114 .*falls?[ \t-]*thr(u|ough).* regex. */
3115 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1;
3116 from++)
3117 {
3118 /* Is there anything like strpbrk with upper boundary, or
3119 memchr looking for 2 characters rather than just one? */
3120 if (from[0] != 'f' && from[0] != 'F')
3121 continue;
3122 if (from[1] != 'a' && from[1] != 'A')
3123 continue;
3124 if (from[2] != 'l' && from[2] != 'L')
3125 continue;
3126 if (from[3] != 'l' && from[3] != 'L')
3127 continue;
3128 from += sizeof "fall" - 1;
3129 if (from[0] == 's' || from[0] == 'S')
3130 from++;
3131 while (*from == ' ' || *from == '\t' || *from == '-')
3132 from++;
3133 if (from[0] != 't' && from[0] != 'T')
3134 continue;
3135 if (from[1] != 'h' && from[1] != 'H')
3136 continue;
3137 if (from[2] != 'r' && from[2] != 'R')
3138 continue;
3139 if (from[3] == 'u' || from[3] == 'U')
3140 return true;
3141 if (from[3] != 'o' && from[3] != 'O')
3142 continue;
3143 if (from[4] != 'u' && from[4] != 'U')
3144 continue;
3145 if (from[5] != 'g' && from[5] != 'G')
3146 continue;
3147 if (from[6] != 'h' && from[6] != 'H')
3148 continue;
3149 return true;
3150 }
3151 return false;
3152 case 3:
3153 case 4:
3154 break;
3155 }
3156
3157 /* Whole comment contents:
3158 -fallthrough
3159 @fallthrough@
3160 */
3161 if (*from == '-' || *from == '@')
3162 {
3163 size_t len = sizeof "fallthrough" - 1;
3164 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3165 return false;
3166 if (memcmp (s1: from + 1, s2: "fallthrough", n: len))
3167 return false;
3168 if (*from == '@')
3169 {
3170 if (from[len + 1] != '@')
3171 return false;
3172 len++;
3173 }
3174 from += 1 + len;
3175 }
3176 /* Whole comment contents (regex):
3177 lint -fallthrough[ \t]*
3178 */
3179 else if (*from == 'l')
3180 {
3181 size_t len = sizeof "int -fallthrough" - 1;
3182 if ((size_t) (pfile->buffer->cur - from - 1) < len)
3183 return false;
3184 if (memcmp (s1: from + 1, s2: "int -fallthrough", n: len))
3185 return false;
3186 from += 1 + len;
3187 while (*from == ' ' || *from == '\t')
3188 from++;
3189 }
3190 /* Whole comment contents (regex):
3191 [ \t]*FALLTHR(U|OUGH)[ \t]*
3192 */
3193 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4)
3194 {
3195 while (*from == ' ' || *from == '\t')
3196 from++;
3197 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1)
3198 return false;
3199 if (memcmp (s1: from, s2: "FALLTHR", n: sizeof "FALLTHR" - 1))
3200 return false;
3201 from += sizeof "FALLTHR" - 1;
3202 if (*from == 'U')
3203 from++;
3204 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1)
3205 return false;
3206 else if (memcmp (s1: from, s2: "OUGH", n: sizeof "OUGH" - 1))
3207 return false;
3208 else
3209 from += sizeof "OUGH" - 1;
3210 while (*from == ' ' || *from == '\t')
3211 from++;
3212 }
3213 /* Whole comment contents (regex):
3214 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)?
3215 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)?
3216 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)?
3217 */
3218 else
3219 {
3220 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3221 from++;
3222 unsigned char f = *from;
3223 bool all_upper = false;
3224 if (f == 'E' || f == 'e')
3225 {
3226 if ((size_t) (pfile->buffer->cur - from)
3227 < sizeof "else fallthru" - 1)
3228 return false;
3229 if (f == 'E' && memcmp (s1: from + 1, s2: "LSE", n: sizeof "LSE" - 1) == 0)
3230 all_upper = true;
3231 else if (memcmp (s1: from + 1, s2: "lse", n: sizeof "lse" - 1))
3232 return false;
3233 from += sizeof "else" - 1;
3234 if (*from == ',')
3235 from++;
3236 if (*from != ' ')
3237 return false;
3238 from++;
3239 if (all_upper && *from == 'f')
3240 return false;
3241 if (f == 'e' && *from == 'F')
3242 return false;
3243 f = *from;
3244 }
3245 else if (f == 'I' || f == 'i')
3246 {
3247 if ((size_t) (pfile->buffer->cur - from)
3248 < sizeof "intentional fallthru" - 1)
3249 return false;
3250 if (f == 'I' && memcmp (s1: from + 1, s2: "NTENTIONAL",
3251 n: sizeof "NTENTIONAL" - 1) == 0)
3252 all_upper = true;
3253 else if (memcmp (s1: from + 1, s2: "ntentional",
3254 n: sizeof "ntentional" - 1))
3255 return false;
3256 from += sizeof "intentional" - 1;
3257 if (*from == ' ')
3258 {
3259 from++;
3260 if (all_upper && *from == 'f')
3261 return false;
3262 }
3263 else if (all_upper)
3264 {
3265 if (memcmp (s1: from, s2: "LY F", n: sizeof "LY F" - 1))
3266 return false;
3267 from += sizeof "LY " - 1;
3268 }
3269 else
3270 {
3271 if (memcmp (s1: from, s2: "ly ", n: sizeof "ly " - 1))
3272 return false;
3273 from += sizeof "ly " - 1;
3274 }
3275 if (f == 'i' && *from == 'F')
3276 return false;
3277 f = *from;
3278 }
3279 if (f != 'F' && f != 'f')
3280 return false;
3281 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1)
3282 return false;
3283 if (f == 'F' && memcmp (s1: from + 1, s2: "ALL", n: sizeof "ALL" - 1) == 0)
3284 all_upper = true;
3285 else if (all_upper)
3286 return false;
3287 else if (memcmp (s1: from + 1, s2: "all", n: sizeof "all" - 1))
3288 return false;
3289 from += sizeof "fall" - 1;
3290 if (*from == (all_upper ? 'S' : 's') && from[1] == ' ')
3291 from += 2;
3292 else if (*from == ' ' || *from == '-')
3293 from++;
3294 else if (*from != (all_upper ? 'T' : 't'))
3295 return false;
3296 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't'))
3297 return false;
3298 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1)
3299 return false;
3300 if (memcmp (s1: from + 1, s2: all_upper ? "HRU" : "hru", n: sizeof "hru" - 1))
3301 {
3302 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1)
3303 return false;
3304 if (memcmp (s1: from + 1, s2: all_upper ? "HROUGH" : "hrough",
3305 n: sizeof "hrough" - 1))
3306 return false;
3307 from += sizeof "through" - 1;
3308 }
3309 else
3310 from += sizeof "thru" - 1;
3311 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!')
3312 from++;
3313 if (*from == '-')
3314 {
3315 from++;
3316 if (*comment_start == '*')
3317 {
3318 do
3319 {
3320 while (*from && *from != '*'
3321 && *from != '\n' && *from != '\r')
3322 from++;
3323 if (*from != '*' || from[1] == '/')
3324 break;
3325 from++;
3326 }
3327 while (1);
3328 }
3329 else
3330 while (*from && *from != '\n' && *from != '\r')
3331 from++;
3332 }
3333 }
3334 /* C block comment. */
3335 if (*comment_start == '*')
3336 {
3337 if (*from != '*' || from[1] != '/')
3338 return false;
3339 }
3340 /* C++ line comment. */
3341 else if (*from != '\n')
3342 return false;
3343
3344 return true;
3345}
3346
3347/* Allocate COUNT tokens for RUN. */
3348void
3349_cpp_init_tokenrun (tokenrun *run, unsigned int count)
3350{
3351 run->base = XNEWVEC (cpp_token, count);
3352 run->limit = run->base + count;
3353 run->next = NULL;
3354}
3355
3356/* Returns the next tokenrun, or creates one if there is none. */
3357static tokenrun *
3358next_tokenrun (tokenrun *run)
3359{
3360 if (run->next == NULL)
3361 {
3362 run->next = XNEW (tokenrun);
3363 run->next->prev = run;
3364 _cpp_init_tokenrun (run: run->next, count: 250);
3365 }
3366
3367 return run->next;
3368}
3369
3370/* Return the number of not yet processed token in a given
3371 context. */
3372int
3373_cpp_remaining_tokens_num_in_context (cpp_context *context)
3374{
3375 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3376 return (LAST (context).token - FIRST (context).token);
3377 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3378 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3379 return (LAST (context).ptoken - FIRST (context).ptoken);
3380 else
3381 abort ();
3382}
3383
3384/* Returns the token present at index INDEX in a given context. If
3385 INDEX is zero, the next token to be processed is returned. */
3386static const cpp_token*
3387_cpp_token_from_context_at (cpp_context *context, int index)
3388{
3389 if (context->tokens_kind == TOKENS_KIND_DIRECT)
3390 return &(FIRST (context).token[index]);
3391 else if (context->tokens_kind == TOKENS_KIND_INDIRECT
3392 || context->tokens_kind == TOKENS_KIND_EXTENDED)
3393 return FIRST (context).ptoken[index];
3394 else
3395 abort ();
3396}
3397
3398/* Look ahead in the input stream. */
3399const cpp_token *
3400cpp_peek_token (cpp_reader *pfile, int index)
3401{
3402 cpp_context *context = pfile->context;
3403 const cpp_token *peektok;
3404 int count;
3405
3406 /* First, scan through any pending cpp_context objects. */
3407 while (context->prev)
3408 {
3409 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
3410
3411 if (index < (int) sz)
3412 return _cpp_token_from_context_at (context, index);
3413 index -= (int) sz;
3414 context = context->prev;
3415 }
3416
3417 /* We will have to read some new tokens after all (and do so
3418 without invalidating preceding tokens). */
3419 count = index;
3420 pfile->keep_tokens++;
3421
3422 /* For peeked tokens temporarily disable line_change reporting,
3423 until the tokens are parsed for real. */
3424 void (*line_change) (cpp_reader *, const cpp_token *, int)
3425 = pfile->cb.line_change;
3426 pfile->cb.line_change = NULL;
3427
3428 do
3429 {
3430 peektok = _cpp_lex_token (pfile);
3431 if (peektok->type == CPP_EOF)
3432 {
3433 index--;
3434 break;
3435 }
3436 else if (peektok->type == CPP_PRAGMA)
3437 {
3438 /* Don't peek past a pragma. */
3439 if (peektok == &pfile->directive_result)
3440 /* Save the pragma in the buffer. */
3441 *pfile->cur_token++ = *peektok;
3442 index--;
3443 break;
3444 }
3445 }
3446 while (index--);
3447
3448 _cpp_backup_tokens_direct (pfile, count - index);
3449 pfile->keep_tokens--;
3450 pfile->cb.line_change = line_change;
3451
3452 return peektok;
3453}
3454
3455/* Allocate a single token that is invalidated at the same time as the
3456 rest of the tokens on the line. Has its line and col set to the
3457 same as the last lexed token, so that diagnostics appear in the
3458 right place. */
3459cpp_token *
3460_cpp_temp_token (cpp_reader *pfile)
3461{
3462 cpp_token *old, *result;
3463 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
3464 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
3465
3466 old = pfile->cur_token - 1;
3467 /* Any pre-existing lookaheads must not be clobbered. */
3468 if (la)
3469 {
3470 if (sz <= la)
3471 {
3472 tokenrun *next = next_tokenrun (run: pfile->cur_run);
3473
3474 if (sz < la)
3475 memmove (dest: next->base + 1, src: next->base,
3476 n: (la - sz) * sizeof (cpp_token));
3477
3478 next->base[0] = pfile->cur_run->limit[-1];
3479 }
3480
3481 if (sz > 1)
3482 memmove (dest: pfile->cur_token + 1, src: pfile->cur_token,
3483 MIN (la, sz - 1) * sizeof (cpp_token));
3484 }
3485
3486 if (!sz && pfile->cur_token == pfile->cur_run->limit)
3487 {
3488 pfile->cur_run = next_tokenrun (run: pfile->cur_run);
3489 pfile->cur_token = pfile->cur_run->base;
3490 }
3491
3492 result = pfile->cur_token++;
3493 result->src_loc = old->src_loc;
3494 return result;
3495}
3496
3497/* We're at the beginning of a logical line (so not in
3498 directives-mode) and RESULT is a CPP_NAME with NODE_MODULE set. See
3499 if we should enter deferred_pragma mode to tokenize the rest of the
3500 line as a module control-line. */
3501
3502static void
3503cpp_maybe_module_directive (cpp_reader *pfile, cpp_token *result)
3504{
3505 unsigned backup = 0; /* Tokens we peeked. */
3506 cpp_hashnode *node = result->val.node.node;
3507 cpp_token *peek = result;
3508 cpp_token *keyword = peek;
3509 cpp_hashnode *(&n_modules)[spec_nodes::M_HWM][2] = pfile->spec_nodes.n_modules;
3510 int header_count = 0;
3511 bool eol = false;
3512
3513 /* Make sure the incoming state is as we expect it. This way we
3514 can restore it using constants. */
3515 gcc_checking_assert (!pfile->state.in_deferred_pragma
3516 && !pfile->state.skipping
3517 && !pfile->state.parsing_args
3518 && !pfile->state.angled_headers
3519 && (pfile->state.save_comments
3520 == !CPP_OPTION (pfile, discard_comments)));
3521
3522 /* Enter directives mode sufficiently for peeking. We don't have
3523 to actually set in_directive. */
3524 pfile->state.in_deferred_pragma = true;
3525
3526 /* These two fields are needed to process tokenization in deferred
3527 pragma mode. They are not used outside deferred pragma mode or
3528 directives mode. */
3529 pfile->state.pragma_allow_expansion = true;
3530 pfile->directive_line = result->src_loc;
3531
3532 /* Saving comments is incompatible with directives mode. */
3533 pfile->state.save_comments = 0;
3534
3535 if (node == n_modules[spec_nodes::M_EXPORT][0])
3536 {
3537 peek = _cpp_lex_direct (pfile);
3538 keyword = peek;
3539 backup++;
3540 if (keyword->type != CPP_NAME)
3541 goto not_module;
3542 node = keyword->val.node.node;
3543 if (!(node->flags & NODE_MODULE))
3544 goto not_module;
3545 }
3546
3547 if (node == n_modules[spec_nodes::M__IMPORT][0])
3548 /* __import */
3549 header_count = backup + 2 + 16;
3550 else if (node == n_modules[spec_nodes::M_IMPORT][0])
3551 /* import */
3552 header_count = backup + 2 + (CPP_OPTION (pfile, preprocessed) ? 16 : 0);
3553 else if (node == n_modules[spec_nodes::M_MODULE][0])
3554 ; /* module */
3555 else
3556 goto not_module;
3557
3558 /* We've seen [export] {module|import|__import}. Check the next token. */
3559 if (header_count)
3560 /* After '{,__}import' a header name may appear. */
3561 pfile->state.angled_headers = true;
3562 peek = _cpp_lex_direct (pfile);
3563 backup++;
3564
3565 /* ... import followed by identifier, ':', '<' or
3566 header-name preprocessing tokens, or module
3567 followed by cpp-identifier, ':' or ';' preprocessing
3568 tokens. C++ keywords are not yet relevant. */
3569 if (peek->type == CPP_NAME
3570 || peek->type == CPP_COLON
3571 || (header_count
3572 ? (peek->type == CPP_LESS
3573 || (peek->type == CPP_STRING && peek->val.str.text[0] != 'R')
3574 || peek->type == CPP_HEADER_NAME)
3575 : peek->type == CPP_SEMICOLON))
3576 {
3577 pfile->state.pragma_allow_expansion = !CPP_OPTION (pfile, preprocessed);
3578 if (!pfile->state.pragma_allow_expansion)
3579 pfile->state.prevent_expansion++;
3580
3581 if (!header_count && linemap_included_from
3582 (ord_map: LINEMAPS_LAST_ORDINARY_MAP (set: pfile->line_table)))
3583 cpp_error_with_line (pfile, CPP_DL_ERROR, keyword->src_loc, 0,
3584 msgid: "module control-line cannot be in included file");
3585
3586 /* The first one or two tokens cannot be macro names. */
3587 for (int ix = backup; ix--;)
3588 {
3589 cpp_token *tok = ix ? keyword : result;
3590 cpp_hashnode *node = tok->val.node.node;
3591
3592 /* Don't attempt to expand the token. */
3593 tok->flags |= NO_EXPAND;
3594 if (_cpp_defined_macro_p (node)
3595 && _cpp_maybe_notify_macro_use (pfile, node, loc: tok->src_loc)
3596 && !cpp_fun_like_macro_p (node))
3597 cpp_error_with_line (pfile, CPP_DL_ERROR, tok->src_loc, 0,
3598 msgid: "module control-line %qs cannot be"
3599 " an object-like macro",
3600 NODE_NAME (node));
3601 }
3602
3603 /* Map to underbar variants. */
3604 keyword->val.node.node = n_modules[header_count
3605 ? spec_nodes::M_IMPORT
3606 : spec_nodes::M_MODULE][1];
3607 if (backup != 1)
3608 result->val.node.node = n_modules[spec_nodes::M_EXPORT][1];
3609
3610 /* Maybe tell the tokenizer we expect a header-name down the
3611 road. */
3612 pfile->state.directive_file_token = header_count;
3613
3614 /* According to P3034R1, pp-module-name and pp-module-partition tokens
3615 if any shouldn't be macro expanded and identifiers shouldn't be
3616 defined as object-like macro. */
3617 if (!header_count && peek->type == CPP_NAME)
3618 {
3619 int state = 0;
3620 do
3621 {
3622 cpp_token *tok = peek;
3623 if (tok->type == CPP_NAME)
3624 {
3625 cpp_hashnode *node = tok->val.node.node;
3626 /* Don't attempt to expand the token. */
3627 tok->flags |= NO_EXPAND;
3628 if (_cpp_defined_macro_p (node)
3629 && _cpp_maybe_notify_macro_use (pfile, node,
3630 loc: tok->src_loc)
3631 && !cpp_fun_like_macro_p (node))
3632 {
3633 if (state == 0)
3634 cpp_error_with_line (pfile, CPP_DL_ERROR,
3635 tok->src_loc, 0,
3636 msgid: "module name %qs cannot "
3637 "be an object-like macro",
3638 NODE_NAME (node));
3639 else
3640 cpp_error_with_line (pfile, CPP_DL_ERROR,
3641 tok->src_loc, 0,
3642 msgid: "module partition %qs cannot "
3643 "be an object-like macro",
3644 NODE_NAME (node));
3645 }
3646 }
3647 peek = _cpp_lex_direct (pfile);
3648 backup++;
3649 if (tok->type == CPP_NAME)
3650 {
3651 if (peek->type == CPP_DOT)
3652 continue;
3653 else if (peek->type == CPP_COLON && state == 0)
3654 {
3655 ++state;
3656 continue;
3657 }
3658 else if (peek->type == CPP_OPEN_PAREN)
3659 {
3660 if (state == 0)
3661 cpp_error_with_line (pfile, CPP_DL_ERROR,
3662 peek->src_loc, 0,
3663 msgid: "module name followed by %<(%>");
3664 else
3665 cpp_error_with_line (pfile, CPP_DL_ERROR,
3666 peek->src_loc, 0,
3667 msgid: "module partition followed by "
3668 "%<(%>");
3669 break;
3670 }
3671 else if (peek->type == CPP_NAME
3672 && _cpp_defined_macro_p (node: peek->val.node.node))
3673 {
3674 peek->flags |= NO_DOT_COLON;
3675 break;
3676 }
3677 else
3678 break;
3679 }
3680 else if (peek->type != CPP_NAME)
3681 break;
3682 }
3683 while (true);
3684 }
3685 }
3686 else
3687 {
3688 not_module:
3689 /* Drop out of directive mode. */
3690 /* We aaserted save_comments had this value upon entry. */
3691 pfile->state.save_comments
3692 = !CPP_OPTION (pfile, discard_comments);
3693 pfile->state.in_deferred_pragma = false;
3694 /* Do not let this remain on. */
3695 pfile->state.angled_headers = false;
3696 /* If we saw EOL, we should drop it, because this isn't a module
3697 control-line after all. */
3698 eol = peek->type == CPP_PRAGMA_EOL;
3699 }
3700
3701 /* In either case we want to backup the peeked tokens. */
3702 if (backup && (!eol || backup > 1))
3703 {
3704 /* Put the peeked tokens back. */
3705 _cpp_backup_tokens_direct (pfile, backup);
3706 /* But if the last one was an EOL in the not_module case, forget it. */
3707 if (eol)
3708 pfile->lookaheads--;
3709 }
3710}
3711
3712/* Lex a token into RESULT (external interface). Takes care of issues
3713 like directive handling, token lookahead, multiple include
3714 optimization and skipping. */
3715const cpp_token *
3716_cpp_lex_token (cpp_reader *pfile)
3717{
3718 cpp_token *result;
3719
3720 for (;;)
3721 {
3722 if (pfile->cur_token == pfile->cur_run->limit)
3723 {
3724 pfile->cur_run = next_tokenrun (run: pfile->cur_run);
3725 pfile->cur_token = pfile->cur_run->base;
3726 }
3727 /* We assume that the current token is somewhere in the current
3728 run. */
3729 if (pfile->cur_token < pfile->cur_run->base
3730 || pfile->cur_token >= pfile->cur_run->limit)
3731 abort ();
3732
3733 if (pfile->lookaheads)
3734 {
3735 pfile->lookaheads--;
3736 result = pfile->cur_token++;
3737 }
3738 else
3739 result = _cpp_lex_direct (pfile);
3740
3741 if (result->flags & BOL)
3742 {
3743 /* Is this a directive. If _cpp_handle_directive returns
3744 false, it is an assembler #. */
3745 if (result->type == CPP_HASH
3746 /* 6.10.3 p 11: Directives in a list of macro arguments
3747 gives undefined behavior. This implementation
3748 handles the directive as normal. */
3749 && pfile->state.parsing_args != 1)
3750 {
3751 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
3752 {
3753 if (pfile->directive_result.type == CPP_PADDING)
3754 continue;
3755 result = &pfile->directive_result;
3756 }
3757 }
3758 else if (pfile->state.in_deferred_pragma)
3759 result = &pfile->directive_result;
3760 else if (result->type == CPP_NAME
3761 && (result->val.node.node->flags & NODE_MODULE)
3762 && !pfile->state.skipping
3763 /* Unlike regular directives, we do not deal with
3764 tokenizing module directives as macro arguments.
3765 That's not permitted. */
3766 && !pfile->state.parsing_args)
3767 {
3768 /* P1857. Before macro expansion, At start of logical
3769 line ... */
3770 /* We don't have to consider lookaheads at this point. */
3771 gcc_checking_assert (!pfile->lookaheads);
3772
3773 cpp_maybe_module_directive (pfile, result);
3774 }
3775
3776 if (pfile->cb.line_change && !pfile->state.skipping)
3777 pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
3778 }
3779
3780 /* We don't skip tokens in directives. */
3781 if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
3782 break;
3783
3784 /* Outside a directive, invalidate controlling macros. At file
3785 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
3786 get here and MI optimization works. */
3787 pfile->mi_valid = false;
3788
3789 if (!pfile->state.skipping || result->type == CPP_EOF)
3790 break;
3791 }
3792
3793 return result;
3794}
3795
3796/* Returns true if a fresh line has been loaded. */
3797template <bool lexing_raw_string>
3798static bool
3799get_fresh_line_impl (cpp_reader *pfile)
3800{
3801 /* We can't get a new line until we leave the current directive, unless we
3802 are lexing a raw string, in which case it will be OK as long as we don't
3803 pop the current buffer. */
3804 if (!lexing_raw_string && pfile->state.in_directive)
3805 return false;
3806
3807 for (;;)
3808 {
3809 cpp_buffer *buffer = pfile->buffer;
3810
3811 if (!buffer->need_line)
3812 return true;
3813
3814 if (buffer->next_line < buffer->rlimit)
3815 {
3816 _cpp_clean_line (pfile);
3817 return true;
3818 }
3819
3820 /* We can't change buffers until we leave the current directive. */
3821 if (lexing_raw_string && pfile->state.in_directive)
3822 return false;
3823
3824 /* First, get out of parsing arguments state. */
3825 if (pfile->state.parsing_args)
3826 return false;
3827
3828 /* End of buffer. Non-empty files should end in a newline. */
3829 if (buffer->buf != buffer->rlimit
3830 && buffer->next_line > buffer->rlimit
3831 && !buffer->from_stage3)
3832 {
3833 /* Clip to buffer size. */
3834 buffer->next_line = buffer->rlimit;
3835 }
3836
3837 if (buffer->prev && !buffer->return_at_eof)
3838 _cpp_pop_buffer (pfile);
3839 else
3840 {
3841 /* End of translation. Do not pop the buffer yet. Increment
3842 line number so that the EOF token is on a line of its own
3843 (_cpp_lex_direct doesn't increment in that case, because
3844 it's hard for it to distinguish this special case). */
3845 CPP_INCREMENT_LINE (pfile, 0);
3846 return false;
3847 }
3848 }
3849}
3850
3851bool
3852_cpp_get_fresh_line (cpp_reader *pfile)
3853{
3854 return get_fresh_line_impl<false> (pfile);
3855}
3856
3857
/* Set result->type to THEN_TYPE and consume one character if the next
   character in the buffer is CHAR; otherwise set it to ELSE_TYPE and
   consume nothing.  Relies on variables named `result' and `buffer'
   being in scope at the point of use.  The arguments are parenthesized
   so that expression arguments are evaluated as a unit.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)  \
  do                                            \
    {                                           \
      result->type = (ELSE_TYPE);               \
      if (*buffer->cur == (CHAR))               \
        {                                       \
          buffer->cur++;                        \
          result->type = (THEN_TYPE);           \
        }                                       \
    }                                           \
  while (0)
3866
3867/* Lex a token into pfile->cur_token, which is also incremented, to
3868 get diagnostics pointing to the correct location.
3869
3870 Does not handle issues such as token lookahead, multiple-include
3871 optimization, directives, skipping etc. This function is only
3872 suitable for use by _cpp_lex_token, and in special cases like
3873 lex_expansion_token which doesn't care for any of these issues.
3874
3875 When meeting a newline, returns CPP_EOF if parsing a directive,
3876 otherwise returns to the start of the token buffer if permissible.
3877 Returns the location of the lexed token. */
3878cpp_token *
3879_cpp_lex_direct (cpp_reader *pfile)
3880{
3881 cppchar_t c = 0;
3882 cpp_buffer *buffer;
3883 const unsigned char *comment_start;
3884 bool fallthrough_comment = false;
3885 cpp_token *result = pfile->cur_token++;
3886
3887 fresh_line:
3888 result->flags = 0;
3889 buffer = pfile->buffer;
3890 if (buffer->need_line)
3891 {
3892 if (pfile->state.in_deferred_pragma)
3893 {
3894 /* This can happen in cases like:
3895 #define loop(x) whatever
3896 #pragma omp loop
3897 where when trying to expand loop we need to peek
3898 next token after loop, but aren't still in_deferred_pragma
3899 mode but are in in_directive mode, so buffer->need_line
3900 is set, a CPP_EOF is peeked. */
3901 result->type = CPP_PRAGMA_EOL;
3902 pfile->state.in_deferred_pragma = false;
3903 if (!pfile->state.pragma_allow_expansion)
3904 pfile->state.prevent_expansion--;
3905 result->src_loc = pfile->line_table->highest_line;
3906 return result;
3907 }
3908 if (!_cpp_get_fresh_line (pfile))
3909 {
3910 result->type = CPP_EOF;
3911 /* Not a real EOF in a directive or arg parsing -- we refuse
3912 to advance to the next file now, and will once we're out
3913 of those modes. */
3914 if (!pfile->state.in_directive && !pfile->state.parsing_args)
3915 {
3916 /* Tell the compiler the line number of the EOF token. */
3917 result->src_loc = pfile->line_table->highest_line;
3918 result->flags = BOL;
3919 /* Now pop the buffer that _cpp_get_fresh_line did not. */
3920 _cpp_pop_buffer (pfile);
3921 }
3922 else if (c == 0)
3923 result->src_loc = pfile->line_table->highest_line;
3924 return result;
3925 }
3926 if (buffer != pfile->buffer)
3927 fallthrough_comment = false;
3928 if (!pfile->keep_tokens)
3929 {
3930 pfile->cur_run = &pfile->base_run;
3931 result = pfile->base_run.base;
3932 pfile->cur_token = result + 1;
3933 }
3934 result->flags = BOL;
3935 if (pfile->state.parsing_args == 2)
3936 result->flags |= PREV_WHITE;
3937 }
3938 buffer = pfile->buffer;
3939 update_tokens_line:
3940 result->src_loc = pfile->line_table->highest_line;
3941
3942 skipped_white:
3943 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
3944 && !pfile->overlaid_buffer)
3945 {
3946 _cpp_process_line_notes (pfile, in_comment: false);
3947 result->src_loc = pfile->line_table->highest_line;
3948 }
3949 c = *buffer->cur++;
3950
3951 if (pfile->forced_token_location)
3952 result->src_loc = pfile->forced_token_location;
3953 else
3954 result->src_loc = linemap_position_for_column (pfile->line_table,
3955 CPP_BUF_COLUMN (buffer, buffer->cur));
3956
3957 switch (c)
3958 {
3959 case ' ': case '\t': case '\f': case '\v': case '\0':
3960 result->flags |= PREV_WHITE;
3961 skip_whitespace (pfile, c);
3962 goto skipped_white;
3963
3964 case '\n':
3965 /* Increment the line, unless this is the last line ... */
3966 if (buffer->cur < buffer->rlimit
3967 /* ... or this is a #include, (where _cpp_stack_file needs to
3968 unwind by one line) ... */
3969 || (pfile->state.in_directive > 1
3970 /* ... except traditional-cpp increments this elsewhere. */
3971 && !CPP_OPTION (pfile, traditional)))
3972 CPP_INCREMENT_LINE (pfile, 0);
3973 buffer->need_line = true;
3974 if (pfile->state.in_deferred_pragma)
3975 {
3976 /* Produce the PRAGMA_EOL on this line. File reading
3977 ensures there is always a \n at end of the buffer, thus
3978 in a deferred pragma we always see CPP_PRAGMA_EOL before
3979 any CPP_EOF. */
3980 result->type = CPP_PRAGMA_EOL;
3981 result->flags &= ~PREV_WHITE;
3982 pfile->state.in_deferred_pragma = false;
3983 if (!pfile->state.pragma_allow_expansion)
3984 pfile->state.prevent_expansion--;
3985 return result;
3986 }
3987 goto fresh_line;
3988
3989 case '0': case '1': case '2': case '3': case '4':
3990 case '5': case '6': case '7': case '8': case '9':
3991 {
3992 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
3993 result->type = CPP_NUMBER;
3994 lex_number (pfile, number: &result->val.str, nst: &nst);
3995 warn_about_normalization (pfile, token: result, s: &nst, identifier: false);
3996 break;
3997 }
3998
3999 case 'L':
4000 case 'u':
4001 case 'U':
4002 case 'R':
4003 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
4004 wide strings or raw strings. */
4005 if (c == 'L' || CPP_OPTION (pfile, rliterals)
4006 || (c != 'R' && CPP_OPTION (pfile, uliterals)))
4007 {
4008 if ((*buffer->cur == '\'' && c != 'R')
4009 || *buffer->cur == '"'
4010 || (*buffer->cur == 'R'
4011 && c != 'R'
4012 && buffer->cur[1] == '"'
4013 && CPP_OPTION (pfile, rliterals))
4014 || (*buffer->cur == '8'
4015 && c == 'u'
4016 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\''
4017 && CPP_OPTION (pfile, utf8_char_literals)))
4018 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
4019 && CPP_OPTION (pfile, rliterals)))))
4020 {
4021 lex_string (pfile, token: result, base: buffer->cur - 1);
4022 break;
4023 }
4024 }
4025 /* Fall through. */
4026
4027 case '_':
4028 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
4029 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
4030 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
4031 case 's': case 't': case 'v': case 'w': case 'x':
4032 case 'y': case 'z':
4033 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
4034 case 'G': case 'H': case 'I': case 'J': case 'K':
4035 case 'M': case 'N': case 'O': case 'P': case 'Q':
4036 case 'S': case 'T': case 'V': case 'W': case 'X':
4037 case 'Y': case 'Z':
4038 result->type = CPP_NAME;
4039 {
4040 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4041 const auto node = lex_identifier (pfile, base: buffer->cur - 1, starts_ucn: false, nst: &nst,
4042 spelling: &result->val.node.spelling);
4043 result->val.node.node = node;
4044 identifier_diagnostics_on_lex (pfile, node);
4045 warn_about_normalization (pfile, token: result, s: &nst, identifier: true);
4046 }
4047
4048 /* Convert named operators to their proper types. */
4049 if (result->val.node.node->flags & NODE_OPERATOR)
4050 {
4051 result->flags |= NAMED_OP;
4052 result->type = (enum cpp_ttype) result->val.node.node->directive_index;
4053 }
4054
4055 /* Signal FALLTHROUGH comment followed by another token. */
4056 if (fallthrough_comment)
4057 result->flags |= PREV_FALLTHROUGH;
4058 break;
4059
4060 case '\'':
4061 case '"':
4062 lex_string (pfile, token: result, base: buffer->cur - 1);
4063 break;
4064
4065 case '/':
4066 /* A potential block or line comment. */
4067 comment_start = buffer->cur;
4068 c = *buffer->cur;
4069
4070 if (c == '*')
4071 {
4072 if (_cpp_skip_block_comment (pfile))
4073 cpp_error (pfile, CPP_DL_ERROR, msgid: "unterminated comment");
4074 }
4075 else if (c == '/' && ! CPP_OPTION (pfile, traditional))
4076 {
4077 /* Don't warn for system headers. */
4078 if (_cpp_in_system_header (pfile))
4079 ;
4080 /* Warn about comments if pedantically GNUC89, and not
4081 in system headers. */
4082 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89
4083 && CPP_PEDANTIC (pfile)
4084 && ! buffer->warned_cplusplus_comments)
4085 {
4086 if (cpp_pedwarning (pfile, CPP_W_PEDANTIC,
4087 msgid: "C++ style comments are not allowed "
4088 "in ISO C90"))
4089 cpp_error (pfile, CPP_DL_NOTE,
4090 msgid: "(this will be reported only once per input file)");
4091 buffer->warned_cplusplus_comments = 1;
4092 }
4093 /* Or if specifically desired via -Wc90-c99-compat. */
4094 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
4095 && ! CPP_OPTION (pfile, cplusplus)
4096 && ! buffer->warned_cplusplus_comments)
4097 {
4098 if (cpp_error (pfile, CPP_DL_WARNING,
4099 msgid: "C++ style comments are incompatible with C90"))
4100 cpp_error (pfile, CPP_DL_NOTE,
4101 msgid: "(this will be reported only once per input file)");
4102 buffer->warned_cplusplus_comments = 1;
4103 }
4104 /* In C89/C94, C++ style comments are forbidden. */
4105 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89
4106 || CPP_OPTION (pfile, lang) == CLK_STDC94))
4107 {
4108 /* But don't be confused about valid code such as
4109 - // immediately followed by *,
4110 - // in a preprocessing directive,
4111 - // in an #if 0 block. */
4112 if (buffer->cur[1] == '*'
4113 || pfile->state.in_directive
4114 || pfile->state.skipping)
4115 {
4116 result->type = CPP_DIV;
4117 break;
4118 }
4119 else if (! buffer->warned_cplusplus_comments)
4120 {
4121 if (cpp_error (pfile, CPP_DL_ERROR,
4122 msgid: "C++ style comments are not allowed in "
4123 "ISO C90"))
4124 cpp_error (pfile, CPP_DL_NOTE,
4125 msgid: "(this will be reported only once per input "
4126 "file)");
4127 buffer->warned_cplusplus_comments = 1;
4128 }
4129 }
4130 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
4131 cpp_warning (pfile, CPP_W_COMMENTS, msgid: "multi-line comment");
4132 }
4133 else if (c == '=')
4134 {
4135 buffer->cur++;
4136 result->type = CPP_DIV_EQ;
4137 break;
4138 }
4139 else
4140 {
4141 result->type = CPP_DIV;
4142 break;
4143 }
4144
4145 if (fallthrough_comment_p (pfile, comment_start))
4146 fallthrough_comment = true;
4147
4148 if (pfile->cb.comment)
4149 {
4150 size_t len = pfile->buffer->cur - comment_start;
4151 pfile->cb.comment (pfile, result->src_loc, comment_start - 1,
4152 len + 1);
4153 }
4154
4155 if (!pfile->state.save_comments)
4156 {
4157 result->flags |= PREV_WHITE;
4158 goto update_tokens_line;
4159 }
4160
4161 if (fallthrough_comment)
4162 result->flags |= PREV_FALLTHROUGH;
4163
4164 /* Save the comment as a token in its own right. */
4165 save_comment (pfile, token: result, from: comment_start, type: c);
4166 break;
4167
4168 case '<':
4169 if (pfile->state.angled_headers)
4170 {
4171 lex_string (pfile, token: result, base: buffer->cur - 1);
4172 if (result->type != CPP_LESS)
4173 break;
4174 }
4175
4176 result->type = CPP_LESS;
4177 if (*buffer->cur == '=')
4178 {
4179 buffer->cur++, result->type = CPP_LESS_EQ;
4180 if (*buffer->cur == '>'
4181 && CPP_OPTION (pfile, cplusplus)
4182 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX20)
4183 buffer->cur++, result->type = CPP_SPACESHIP;
4184 }
4185 else if (*buffer->cur == '<')
4186 {
4187 buffer->cur++;
4188 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
4189 }
4190 else if (CPP_OPTION (pfile, digraphs))
4191 {
4192 if (*buffer->cur == ':')
4193 {
4194 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
4195 three characters are <:: and the subsequent character
4196 is neither : nor >, the < is treated as a preprocessor
4197 token by itself". */
4198 if (CPP_OPTION (pfile, cplusplus)
4199 && CPP_OPTION (pfile, lang) != CLK_CXX98
4200 && CPP_OPTION (pfile, lang) != CLK_GNUCXX
4201 && buffer->cur[1] == ':'
4202 && buffer->cur[2] != ':' && buffer->cur[2] != '>')
4203 break;
4204
4205 buffer->cur++;
4206 result->flags |= DIGRAPH;
4207 result->type = CPP_OPEN_SQUARE;
4208 }
4209 else if (*buffer->cur == '%')
4210 {
4211 buffer->cur++;
4212 result->flags |= DIGRAPH;
4213 result->type = CPP_OPEN_BRACE;
4214 }
4215 }
4216 break;
4217
4218 case '>':
4219 result->type = CPP_GREATER;
4220 if (*buffer->cur == '=')
4221 buffer->cur++, result->type = CPP_GREATER_EQ;
4222 else if (*buffer->cur == '>')
4223 {
4224 buffer->cur++;
4225 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
4226 }
4227 break;
4228
4229 case '%':
4230 result->type = CPP_MOD;
4231 if (*buffer->cur == '=')
4232 buffer->cur++, result->type = CPP_MOD_EQ;
4233 else if (CPP_OPTION (pfile, digraphs))
4234 {
4235 if (*buffer->cur == ':')
4236 {
4237 buffer->cur++;
4238 result->flags |= DIGRAPH;
4239 result->type = CPP_HASH;
4240 if (*buffer->cur == '%' && buffer->cur[1] == ':')
4241 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
4242 }
4243 else if (*buffer->cur == '>')
4244 {
4245 buffer->cur++;
4246 result->flags |= DIGRAPH;
4247 result->type = CPP_CLOSE_BRACE;
4248 }
4249 }
4250 break;
4251
4252 case '.':
4253 result->type = CPP_DOT;
4254 if (ISDIGIT (*buffer->cur))
4255 {
4256 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4257 result->type = CPP_NUMBER;
4258 lex_number (pfile, number: &result->val.str, nst: &nst);
4259 warn_about_normalization (pfile, token: result, s: &nst, identifier: false);
4260 }
4261 else if (*buffer->cur == '.' && buffer->cur[1] == '.')
4262 buffer->cur += 2, result->type = CPP_ELLIPSIS;
4263 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4264 buffer->cur++, result->type = CPP_DOT_STAR;
4265 break;
4266
4267 case '+':
4268 result->type = CPP_PLUS;
4269 if (*buffer->cur == '+')
4270 buffer->cur++, result->type = CPP_PLUS_PLUS;
4271 else if (*buffer->cur == '=')
4272 buffer->cur++, result->type = CPP_PLUS_EQ;
4273 break;
4274
4275 case '-':
4276 result->type = CPP_MINUS;
4277 if (*buffer->cur == '>')
4278 {
4279 buffer->cur++;
4280 result->type = CPP_DEREF;
4281 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
4282 buffer->cur++, result->type = CPP_DEREF_STAR;
4283 }
4284 else if (*buffer->cur == '-')
4285 buffer->cur++, result->type = CPP_MINUS_MINUS;
4286 else if (*buffer->cur == '=')
4287 buffer->cur++, result->type = CPP_MINUS_EQ;
4288 break;
4289
4290 case '&':
4291 result->type = CPP_AND;
4292 if (*buffer->cur == '&')
4293 buffer->cur++, result->type = CPP_AND_AND;
4294 else if (*buffer->cur == '=')
4295 buffer->cur++, result->type = CPP_AND_EQ;
4296 break;
4297
4298 case '|':
4299 result->type = CPP_OR;
4300 if (*buffer->cur == '|')
4301 buffer->cur++, result->type = CPP_OR_OR;
4302 else if (*buffer->cur == '=')
4303 buffer->cur++, result->type = CPP_OR_EQ;
4304 break;
4305
4306 case ':':
4307 result->type = CPP_COLON;
4308 if (*buffer->cur == ':')
4309 {
4310 if (CPP_OPTION (pfile, scope))
4311 buffer->cur++, result->type = CPP_SCOPE;
4312 else
4313 result->flags |= COLON_SCOPE;
4314 }
4315 else if (*buffer->cur == ']'
4316 && CPP_OPTION (pfile, cplusplus)
4317 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX26)
4318 buffer->cur++, result->type = CPP_CLOSE_SPLICE;
4319 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
4320 {
4321 buffer->cur++;
4322 result->flags |= DIGRAPH;
4323 result->type = CPP_CLOSE_SQUARE;
4324 }
4325 break;
4326
4327 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
4328 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
4329 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
4330 case '^':
4331 result->type = CPP_XOR;
4332 if (*buffer->cur == '=')
4333 buffer->cur++, result->type = CPP_XOR_EQ;
4334 else if (*buffer->cur == '^'
4335 && CPP_OPTION (pfile, cplusplus)
4336 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX26)
4337 buffer->cur++, result->type = CPP_REFLECT_OP;
4338 break;
4339 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;
4340
4341 case '?': result->type = CPP_QUERY; break;
4342 case '~': result->type = CPP_COMPL; break;
4343 case ',': result->type = CPP_COMMA; break;
4344 case '(': result->type = CPP_OPEN_PAREN; break;
4345 case ')': result->type = CPP_CLOSE_PAREN; break;
4346 case '[':
4347 result->type = CPP_OPEN_SQUARE;
4348 /* C++ [lex.pptoken]/4.3: "Otherwise, if the next three characters are
4349 [:: and the subsequent character is not :, or if the next three
4350 characters are [:>, the [ is treated as a preprocessing token by
4351 itself and not as the first character of the preprocessing token [:."
4352 Also, the tokens [: and :] cannot be composed from digraphs. */
4353 if (*buffer->cur == ':'
4354 && CPP_OPTION (pfile, cplusplus)
4355 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX26)
4356 {
4357 if ((buffer->cur[1] == ':' && buffer->cur[2] != ':')
4358 || buffer->cur[1] == '>')
4359 break;
4360 else
4361 buffer->cur++, result->type = CPP_OPEN_SPLICE;
4362 }
4363 break;
4364 case ']': result->type = CPP_CLOSE_SQUARE; break;
4365 case '{': result->type = CPP_OPEN_BRACE; break;
4366 case '}': result->type = CPP_CLOSE_BRACE; break;
4367 case ';': result->type = CPP_SEMICOLON; break;
4368
4369 /* @ is a punctuator in Objective-C. */
4370 case '@': result->type = CPP_ATSIGN; break;
4371
4372 default:
4373 {
4374 const uchar *base = --buffer->cur;
4375 static int no_warn_cnt;
4376
4377 /* Check for an extended identifier ($ or UCN or UTF-8). */
4378 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
4379 if (forms_identifier_p (pfile, first: true, state: &nst))
4380 {
4381 result->type = CPP_NAME;
4382 const auto node = lex_identifier (pfile, base, starts_ucn: true, nst: &nst,
4383 spelling: &result->val.node.spelling);
4384 result->val.node.node = node;
4385 identifier_diagnostics_on_lex (pfile, node);
4386 warn_about_normalization (pfile, token: result, s: &nst, identifier: true);
4387 break;
4388 }
4389
4390 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a
4391 single token. */
4392 buffer->cur++;
4393 if (c >= utf8_signifier)
4394 {
4395 const uchar *pstr = base;
4396 cppchar_t s;
4397 if (_cpp_valid_utf8 (pfile, pstr: &pstr, limit: buffer->rlimit, identifier_pos: 0, NULL, cp: &s))
4398 {
4399 if (s > UCS_LIMIT && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4400 {
4401 buffer->cur = base;
4402 _cpp_warn_invalid_utf8 (pfile);
4403 }
4404 buffer->cur = pstr;
4405 }
4406 else if (CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4407 {
4408 buffer->cur = base;
4409 const uchar *end = _cpp_warn_invalid_utf8 (pfile);
4410 buffer->cur = base + 1;
4411 no_warn_cnt = end - buffer->cur;
4412 }
4413 }
4414 else if (c >= utf8_continuation
4415 && CPP_OPTION (pfile, cpp_warn_invalid_utf8))
4416 {
4417 if (no_warn_cnt)
4418 --no_warn_cnt;
4419 else
4420 {
4421 buffer->cur = base;
4422 _cpp_warn_invalid_utf8 (pfile);
4423 buffer->cur = base + 1;
4424 }
4425 }
4426 create_literal (pfile, token: result, base, len: buffer->cur - base, type: CPP_OTHER);
4427 break;
4428 }
4429
4430 }
4431
4432 /* Potentially convert the location of the token to a range. */
4433 if (result->src_loc >= RESERVED_LOCATION_COUNT
4434 && result->type != CPP_EOF
4435 && !pfile->forced_token_location)
4436 {
4437 /* Ensure that any line notes are processed, so that we have the
4438 correct physical line/column for the end-point of the token even
4439 when a logical line is split via one or more backslashes. */
4440 if (buffer->cur >= buffer->notes[buffer->cur_note].pos
4441 && !pfile->overlaid_buffer)
4442 _cpp_process_line_notes (pfile, in_comment: false);
4443
4444 source_range tok_range;
4445 tok_range.m_start = result->src_loc;
4446 tok_range.m_finish
4447 = linemap_position_for_column (pfile->line_table,
4448 CPP_BUF_COLUMN (buffer, buffer->cur));
4449
4450 result->src_loc
4451 = pfile->line_table->get_or_create_combined_loc (locus: result->src_loc,
4452 src_range: tok_range, data: nullptr, discriminator: 0);
4453 }
4454
4455 return result;
4456}
4457
4458/* An upper bound on the number of bytes needed to spell TOKEN.
4459 Does not include preceding whitespace. */
4460unsigned int
4461cpp_token_len (const cpp_token *token)
4462{
4463 unsigned int len;
4464
4465 switch (TOKEN_SPELL (token))
4466 {
4467 default: len = 6; break;
4468 case SPELL_LITERAL: len = token->val.str.len; break;
4469 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break;
4470 }
4471
4472 return len;
4473}
4474
/* Decode the UTF-8 sequence starting at NAME and store the equivalent
   \U escape (backslash, 'U', eight lowercase hex digits) in BUFFER.
   Exactly 10 bytes are written to BUFFER; no terminator is added.
   Returns the number of bytes consumed from NAME.  Aborts if a
   trailing byte is not of the form 10xxxxxx.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  int seq_len = 0;
  unsigned lead = *name;

  /* The count of leading one bits in the first byte gives the length
     of the whole sequence.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Payload bits of the lead byte, then six bits per trailing byte.  */
  unsigned long code_point = *name & (0x7F >> seq_len);
  for (int k = 1; k < seq_len; k++)
    {
      const unsigned char trail = name[k];

      /* Ill-formed UTF-8.  */
      if ((trail & ~0x3F) != 0x80)
        abort ();

      code_point = (code_point << 6) | (trail & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  for (int digit = 0; digit < 8; digit++)
    buffer[2 + digit] = hex_digits[(code_point >> (4 * (7 - digit))) & 0xF];

  return seq_len;
}
4508
4509/* Given a token TYPE corresponding to a digraph, return a pointer to
4510 the spelling of the digraph. */
4511static const unsigned char *
4512cpp_digraph2name (enum cpp_ttype type)
4513{
4514 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
4515}
4516
4517/* Write the spelling of an identifier IDENT, using UCNs, to BUFFER.
4518 The buffer must already contain enough space to hold the
4519 token's spelling. Returns a pointer to the character after the
4520 last character written. */
4521unsigned char *
4522_cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident)
4523{
4524 size_t i;
4525 const unsigned char *name = NODE_NAME (ident);
4526
4527 for (i = 0; i < NODE_LEN (ident); i++)
4528 if (name[i] & ~0x7F)
4529 {
4530 i += utf8_to_ucn (buffer, name: name + i) - 1;
4531 buffer += 10;
4532 }
4533 else
4534 *buffer++ = name[i];
4535
4536 return buffer;
4537}
4538
4539/* Write the spelling of a token TOKEN to BUFFER. The buffer must
4540 already contain enough space to hold the token's spelling.
4541 Returns a pointer to the character after the last character written.
4542 FORSTRING is true if this is to be the spelling after translation
4543 phase 1 (with the original spelling of extended identifiers), false
4544 if extended identifiers should always be written using UCNs (there is
4545 no option for always writing them in the internal UTF-8 form).
4546 FIXME: Would be nice if we didn't need the PFILE argument. */
4547unsigned char *
4548cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
4549 unsigned char *buffer, bool forstring)
4550{
4551 switch (TOKEN_SPELL (token))
4552 {
4553 case SPELL_OPERATOR:
4554 {
4555 const unsigned char *spelling;
4556 unsigned char c;
4557
4558 if (token->flags & DIGRAPH)
4559 spelling = cpp_digraph2name (type: token->type);
4560 else if (token->flags & NAMED_OP)
4561 goto spell_ident;
4562 else
4563 spelling = TOKEN_NAME (token);
4564
4565 while ((c = *spelling++) != '\0')
4566 *buffer++ = c;
4567 }
4568 break;
4569
4570 spell_ident:
4571 case SPELL_IDENT:
4572 if (forstring)
4573 {
4574 memcpy (dest: buffer, NODE_NAME (token->val.node.spelling),
4575 NODE_LEN (token->val.node.spelling));
4576 buffer += NODE_LEN (token->val.node.spelling);
4577 }
4578 else
4579 buffer = _cpp_spell_ident_ucns (buffer, ident: token->val.node.node);
4580 break;
4581
4582 case SPELL_LITERAL:
4583 memcpy (dest: buffer, src: token->val.str.text, n: token->val.str.len);
4584 buffer += token->val.str.len;
4585 break;
4586
4587 case SPELL_NONE:
4588 cpp_error (pfile, CPP_DL_ICE,
4589 msgid: "unspellable token %s", TOKEN_NAME (token));
4590 break;
4591 }
4592
4593 return buffer;
4594}
4595
4596/* Returns TOKEN spelt as a null-terminated string. The string is
4597 freed when the reader is destroyed. Useful for diagnostics. */
4598unsigned char *
4599cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
4600{
4601 unsigned int len = cpp_token_len (token) + 1;
4602 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
4603
4604 end = cpp_spell_token (pfile, token, buffer: start, forstring: false);
4605 end[0] = '\0';
4606
4607 return start;
4608}
4609
4610/* Returns a pointer to a string which spells the token defined by
4611 TYPE and FLAGS. Used by C front ends, which really should move to
4612 using cpp_token_as_text. */
4613const char *
4614cpp_type2name (enum cpp_ttype type, unsigned char flags)
4615{
4616 if (flags & DIGRAPH)
4617 return (const char *) cpp_digraph2name (type);
4618 else if (flags & NAMED_OP)
4619 return cpp_named_operator2name (type);
4620
4621 return (const char *) token_spellings[type].name;
4622}
4623
4624/* Writes the spelling of token to FP, without any preceding space.
4625 Separated from cpp_spell_token for efficiency - to avoid stdio
4626 double-buffering. */
4627void
4628cpp_output_token (const cpp_token *token, FILE *fp)
4629{
4630 switch (TOKEN_SPELL (token))
4631 {
4632 case SPELL_OPERATOR:
4633 {
4634 const unsigned char *spelling;
4635 int c;
4636
4637 if (token->flags & DIGRAPH)
4638 spelling = cpp_digraph2name (type: token->type);
4639 else if (token->flags & NAMED_OP)
4640 goto spell_ident;
4641 else
4642 spelling = TOKEN_NAME (token);
4643
4644 c = *spelling;
4645 do
4646 putc (c, fp);
4647 while ((c = *++spelling) != '\0');
4648 }
4649 break;
4650
4651 spell_ident:
4652 case SPELL_IDENT:
4653 {
4654 size_t i;
4655 const unsigned char * name = NODE_NAME (token->val.node.node);
4656 unsigned len = NODE_LEN (token->val.node.node);
4657
4658 for (i = 0; i < len; i++)
4659 if (name[i] & ~0x7F)
4660 {
4661 unsigned char buffer[10];
4662 i += utf8_to_ucn (buffer, name: name + i) - 1;
4663 fwrite (buffer, 1, 10, fp);
4664 }
4665 else if (name[i] == ' ' && i == len - 1)
4666 /* Omit terminal space in "export ". */;
4667 else
4668 fputc (NODE_NAME (token->val.node.node)[i], fp);
4669 }
4670 break;
4671
4672 case SPELL_LITERAL:
4673 if (token->type == CPP_HEADER_NAME)
4674 fputc ('"', fp);
4675 fwrite (token->val.str.text, 1, token->val.str.len, fp);
4676 if (token->type == CPP_HEADER_NAME)
4677 fputc ('"', fp);
4678 break;
4679
4680 case SPELL_NONE:
4681 /* An error, most probably. */
4682 break;
4683 }
4684}
4685
4686/* Compare two tokens. */
4687int
4688_cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
4689{
4690 if (a->type == b->type && a->flags == b->flags)
4691 switch (TOKEN_SPELL (a))
4692 {
4693 default: /* Keep compiler happy. */
4694 case SPELL_OPERATOR:
4695 /* token_no is used to track where multiple consecutive ##
4696 tokens were originally located. */
4697 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
4698 case SPELL_NONE:
4699 return (a->type != CPP_MACRO_ARG
4700 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no
4701 && a->val.macro_arg.spelling == b->val.macro_arg.spelling));
4702 case SPELL_IDENT:
4703 return (a->val.node.node == b->val.node.node
4704 && a->val.node.spelling == b->val.node.spelling);
4705 case SPELL_LITERAL:
4706 return (a->val.str.len == b->val.str.len
4707 && !memcmp (s1: a->val.str.text, s2: b->val.str.text,
4708 n: a->val.str.len));
4709 }
4710
4711 return 0;
4712}
4713
4714/* Returns nonzero if a space should be inserted to avoid an
4715 accidental token paste for output. For simplicity, it is
4716 conservative, and occasionally advises a space where one is not
4717 needed, e.g. "." and ".2". */
4718int
4719cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
4720 const cpp_token *token2)
4721{
4722 enum cpp_ttype a = token1->type, b = token2->type;
4723 cppchar_t c;
4724
4725 if (token1->flags & NAMED_OP)
4726 a = CPP_NAME;
4727 if (token2->flags & NAMED_OP)
4728 b = CPP_NAME;
4729
4730 c = EOF;
4731 if (token2->flags & DIGRAPH)
4732 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
4733 else if (token_spellings[b].category == SPELL_OPERATOR)
4734 c = token_spellings[b].name[0];
4735
4736 /* Quickly get everything that can paste with an '='. */
4737 if ((int) a <= (int) CPP_LAST_EQ && c == '=')
4738 return 1;
4739
4740 switch (a)
4741 {
4742 case CPP_GREATER: return c == '>';
4743 case CPP_LESS: return c == '<' || c == '%' || c == ':';
4744 case CPP_PLUS: return c == '+';
4745 case CPP_MINUS: return c == '-' || c == '>';
4746 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */
4747 case CPP_MOD: return c == ':' || c == '>';
4748 case CPP_AND: return c == '&';
4749 case CPP_OR: return c == '|';
4750 case CPP_COLON: return c == ':' || c == '>';
4751 case CPP_DEREF: return c == '*';
4752 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER;
4753 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */
4754 case CPP_PRAGMA:
4755 case CPP_NAME: return ((b == CPP_NUMBER
4756 && name_p (pfile, string: &token2->val.str))
4757 || b == CPP_NAME
4758 || b == CPP_CHAR || b == CPP_STRING); /* L */
4759 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME
4760 || b == CPP_CHAR
4761 || c == '.' || c == '+' || c == '-');
4762 /* UCNs */
4763 case CPP_OTHER: return ((token1->val.str.text[0] == '\\'
4764 && b == CPP_NAME)
4765 || (CPP_OPTION (pfile, objc)
4766 && token1->val.str.text[0] == '@'
4767 && (b == CPP_NAME || b == CPP_STRING)));
4768 case CPP_LESS_EQ: return c == '>';
4769 case CPP_STRING:
4770 case CPP_WSTRING:
4771 case CPP_UTF8STRING:
4772 case CPP_STRING16:
4773 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals)
4774 && (b == CPP_NAME
4775 || (TOKEN_SPELL (token2) == SPELL_LITERAL
4776 && ISIDST (token2->val.str.text[0]))));
4777
4778 default: break;
4779 }
4780
4781 return 0;
4782}
4783
4784/* Output all the remaining tokens on the current line, and a newline
4785 character, to FP. Leading whitespace is removed. If there are
4786 macros, special token padding is not performed. */
4787void
4788cpp_output_line (cpp_reader *pfile, FILE *fp)
4789{
4790 const cpp_token *token;
4791
4792 token = cpp_get_token (pfile);
4793 while (token->type != CPP_EOF)
4794 {
4795 cpp_output_token (token, fp);
4796 token = cpp_get_token (pfile);
4797 if (token->flags & PREV_WHITE)
4798 putc (' ', fp);
4799 }
4800
4801 putc ('\n', fp);
4802}
4803
4804/* Return a string representation of all the remaining tokens on the
4805 current line. The result is allocated using xmalloc and must be
4806 freed by the caller. */
4807unsigned char *
4808cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
4809{
4810 const cpp_token *token;
4811 unsigned int out = dir_name ? ustrlen (s1: dir_name) : 0;
4812 unsigned int alloced = 120 + out;
4813 unsigned char *result = (unsigned char *) xmalloc (alloced);
4814
4815 /* If DIR_NAME is empty, there are no initial contents. */
4816 if (dir_name)
4817 {
4818 sprintf (s: (char *) result, format: "#%s ", dir_name);
4819 out += 2;
4820 }
4821
4822 token = cpp_get_token (pfile);
4823 while (token->type != CPP_EOF)
4824 {
4825 unsigned char *last;
4826 /* Include room for a possible space and the terminating nul. */
4827 unsigned int len = cpp_token_len (token) + 2;
4828
4829 if (out + len > alloced)
4830 {
4831 alloced *= 2;
4832 if (out + len > alloced)
4833 alloced = out + len;
4834 result = (unsigned char *) xrealloc (result, alloced);
4835 }
4836
4837 last = cpp_spell_token (pfile, token, buffer: &result[out], forstring: 0);
4838 out = last - result;
4839
4840 token = cpp_get_token (pfile);
4841 if (token->flags & PREV_WHITE)
4842 result[out++] = ' ';
4843 }
4844
4845 result[out] = '\0';
4846 return result;
4847}
4848
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer
   _cpp_get_buff will hand out for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that arbitrary expressions can be
   passed.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
  ((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
4863
4864/* Create a new allocation buffer. Place the control block at the end
4865 of the buffer, so that buffer overflows will cause immediate chaos. */
4866static _cpp_buff *
4867new_buff (size_t len)
4868{
4869 _cpp_buff *result;
4870 unsigned char *base;
4871
4872 if (len < MIN_BUFF_SIZE)
4873 len = MIN_BUFF_SIZE;
4874 len = CPP_ALIGN (len);
4875
4876#ifdef ENABLE_VALGRIND_WORKAROUNDS
4877 /* Valgrind warns about uses of interior pointers, so put _cpp_buff
4878 struct first. */
4879 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
4880 base = XNEWVEC (unsigned char, len + slen);
4881 result = (_cpp_buff *) base;
4882 base += slen;
4883#else
4884 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
4885 result = (_cpp_buff *) (base + len);
4886#endif
4887 result->base = base;
4888 result->cur = base;
4889 result->limit = base + len;
4890 result->next = NULL;
4891 return result;
4892}
4893
4894/* Place a chain of unwanted allocation buffers on the free list. */
4895void
4896_cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
4897{
4898 _cpp_buff *end = buff;
4899
4900 while (end->next)
4901 end = end->next;
4902 end->next = pfile->free_buffs;
4903 pfile->free_buffs = buff;
4904}
4905
4906/* Return a free buffer of size at least MIN_SIZE. */
4907_cpp_buff *
4908_cpp_get_buff (cpp_reader *pfile, size_t min_size)
4909{
4910 _cpp_buff *result, **p;
4911
4912 for (p = &pfile->free_buffs;; p = &(*p)->next)
4913 {
4914 size_t size;
4915
4916 if (*p == NULL)
4917 return new_buff (len: min_size);
4918 result = *p;
4919 size = result->limit - result->base;
4920 /* Return a buffer that's big enough, but don't waste one that's
4921 way too big. */
4922 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
4923 break;
4924 }
4925
4926 *p = result->next;
4927 result->next = NULL;
4928 result->cur = result->base;
4929 return result;
4930}
4931
4932/* Creates a new buffer with enough space to hold the uncommitted
4933 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies
4934 the excess bytes to the new buffer. Chains the new buffer after
4935 BUFF, and returns the new buffer. */
4936_cpp_buff *
4937_cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
4938{
4939 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
4940 _cpp_buff *new_buff = _cpp_get_buff (pfile, min_size: size);
4941
4942 buff->next = new_buff;
4943 memcpy (dest: new_buff->base, src: buff->cur, BUFF_ROOM (buff));
4944 return new_buff;
4945}
4946
4947/* Creates a new buffer with enough space to hold the uncommitted
4948 remaining bytes of the buffer pointed to by BUFF, and at least
4949 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer.
4950 Chains the new buffer before the buffer pointed to by BUFF, and
4951 updates the pointer to point to the new buffer. */
4952void
4953_cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
4954{
4955 _cpp_buff *new_buff, *old_buff = *pbuff;
4956 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
4957
4958 new_buff = _cpp_get_buff (pfile, min_size: size);
4959 memcpy (dest: new_buff->base, src: old_buff->cur, BUFF_ROOM (old_buff));
4960 new_buff->next = old_buff;
4961 *pbuff = new_buff;
4962}
4963
4964/* Free a chain of buffers starting at BUFF. */
4965void
4966_cpp_free_buff (_cpp_buff *buff)
4967{
4968 _cpp_buff *next;
4969
4970 for (; buff; buff = next)
4971 {
4972 next = buff->next;
4973#ifdef ENABLE_VALGRIND_WORKAROUNDS
4974 free (buff);
4975#else
4976 free (ptr: buff->base);
4977#endif
4978 }
4979}
4980
4981/* Allocate permanent, unaligned storage of length LEN. */
4982unsigned char *
4983_cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
4984{
4985 _cpp_buff *buff = pfile->u_buff;
4986 unsigned char *result = buff->cur;
4987
4988 if (len > (size_t) (buff->limit - result))
4989 {
4990 buff = _cpp_get_buff (pfile, min_size: len);
4991 buff->next = pfile->u_buff;
4992 pfile->u_buff = buff;
4993 result = buff->cur;
4994 }
4995
4996 buff->cur = result + len;
4997 return result;
4998}
4999
5000/* Allocate permanent, unaligned storage of length LEN from a_buff.
5001 That buffer is used for growing allocations when saving macro
5002 replacement lists in a #define, and when parsing an answer to an
5003 assertion in #assert, #unassert or #if (and therefore possibly
5004 whilst expanding macros). It therefore must not be used by any
5005 code that they might call: specifically the lexer and the guts of
5006 the macro expander.
5007
5008 All existing other uses clearly fit this restriction: storing
5009 registered pragmas during initialization. */
5010unsigned char *
5011_cpp_aligned_alloc (cpp_reader *pfile, size_t len)
5012{
5013 _cpp_buff *buff = pfile->a_buff;
5014 unsigned char *result = buff->cur;
5015
5016 if (len > (size_t) (buff->limit - result))
5017 {
5018 buff = _cpp_get_buff (pfile, min_size: len);
5019 buff->next = pfile->a_buff;
5020 pfile->a_buff = buff;
5021 result = buff->cur;
5022 }
5023
5024 buff->cur = result + len;
5025 return result;
5026}
5027
5028/* Commit or allocate storage from a buffer. */
5029
5030void *
5031_cpp_commit_buff (cpp_reader *pfile, size_t size)
5032{
5033 const auto buff = pfile->a_buff;
5034 void *ptr = BUFF_FRONT (buff);
5035
5036 if (pfile->hash_table->alloc_subobject)
5037 {
5038 void *copy = pfile->hash_table->alloc_subobject (size);
5039 memcpy (dest: copy, src: ptr, n: size);
5040 ptr = copy;
5041 }
5042 else
5043 {
5044 BUFF_FRONT (buff) += size;
5045 /* Make sure the remaining space is maximally aligned for whatever this
5046 buffer holds next. */
5047 BUFF_FRONT (buff) += BUFF_ROOM (buff) % DEFAULT_ALIGNMENT;
5048 }
5049
5050 return ptr;
5051}
5052
5053/* Say which field of TOK is in use. */
5054
5055enum cpp_token_fld_kind
5056cpp_token_val_index (const cpp_token *tok)
5057{
5058 switch (TOKEN_SPELL (tok))
5059 {
5060 case SPELL_IDENT:
5061 return CPP_TOKEN_FLD_NODE;
5062 case SPELL_LITERAL:
5063 return CPP_TOKEN_FLD_STR;
5064 case SPELL_OPERATOR:
5065 /* Operands which were originally spelled as ident keep around
5066 the node for the exact spelling. */
5067 if (tok->flags & NAMED_OP)
5068 return CPP_TOKEN_FLD_NODE;
5069 else if (tok->type == CPP_PASTE)
5070 return CPP_TOKEN_FLD_TOKEN_NO;
5071 else
5072 return CPP_TOKEN_FLD_NONE;
5073 case SPELL_NONE:
5074 if (tok->type == CPP_MACRO_ARG)
5075 return CPP_TOKEN_FLD_ARG_NO;
5076 else if (tok->type == CPP_PADDING)
5077 return CPP_TOKEN_FLD_SOURCE;
5078 else if (tok->type == CPP_PRAGMA)
5079 return CPP_TOKEN_FLD_PRAGMA;
5080 /* fall through */
5081 default:
5082 return CPP_TOKEN_FLD_NONE;
5083 }
5084}
5085
/* All tokens lexed in R after calling this function will be forced to
   have their location_t set to LOC, until
   cpp_stop_forcing_token_locations is called for R.  */

void
cpp_force_token_locations (cpp_reader *r, location_t loc)
{
 /* While this field is nonzero the lexer also skips turning a token's
    location into a range.  */
 r->forced_token_location = loc;
}
5095
/* Go back to assigning locations naturally for tokens lexed in R, by
   clearing the forced location.  */

void
cpp_stop_forcing_token_locations (cpp_reader *r)
{
 /* Zero means that no location is being forced.  */
 r->forced_token_location = 0;
}
5103
/* PEEK points at a backslash.  If it escapes an end of line ("\n",
   "\r" or "\r\n"), look past it, and keep going while the character
   found there is again a backslash.  A position at or beyond LIMIT is
   never returned by advancing; in that case, and when the backslash
   escapes no end of line, the current position is returned.  */

static const unsigned char *
do_peek_backslash (const unsigned char *peek, const unsigned char *limit)
{
  for (;;)
    {
      const unsigned char *after;

      if (__builtin_expect (peek[1] == '\n', true))
        after = peek + 2;
      else if (__builtin_expect (peek[1] == '\r', false))
        /* A newline after the carriage return is part of the same
           line ending.  */
        after = peek + (peek[2] == '\n' ? 3 : 2);
      else
        return peek;

      if (!__builtin_expect (after < limit, true))
        return peek;

      peek = after;

      /* The user might be perverse.  */
      if (*peek != '\\')
        return peek;
    }
}
5133
5134static const unsigned char *
5135do_peek_next (const unsigned char *peek, const unsigned char *limit)
5136{
5137 if (__builtin_expect (*peek == '\\', false))
5138 peek = do_peek_backslash (peek, limit);
5139 return peek;
5140}
5141
/* Step back one character from PEEK without going before BOUND, and
   return the new position, or NULL if there is nothing before PEEK.
   When the character stepped onto is a line ending ("\n", "\r" or
   "\r\n") that is preceded by a backslash, the step is repeated from
   that backslash.  */
static const unsigned char *
do_peek_prev (const unsigned char *peek, const unsigned char *bound)
{
  for (;;)
    {
      if (peek == bound)
        return NULL;

      --peek;
      const unsigned char c = *peek;
      const bool line_end = (__builtin_expect (c == '\n', false)
                             || __builtin_expect (c == '\r', false));

      if (!line_end || peek == bound)
        return peek;

      const unsigned char *before = peek - 1;
      if (c == '\n' && *before == '\r')
        {
          /* The carriage return is the first character available.  */
          if (before == bound)
            return peek;
          --before;
        }

      if (*before != '\\')
        return peek;

      /* Escaped line ending: carry on from the backslash.  */
      peek = before;
    }
}
5170
5171/* If PEEK[-1] is identifier MATCH, scan past it and trailing white
5172 space. Otherwise return NULL. */
5173
5174static const unsigned char *
5175do_peek_ident (const char *match, const unsigned char *peek,
5176 const unsigned char *limit)
5177{
5178 for (; *++match; peek++)
5179 if (*peek != *match)
5180 {
5181 peek = do_peek_next (peek, limit);
5182 if (*peek != *match)
5183 return NULL;
5184 }
5185
5186 /* Must now not be looking at an identifier char. */
5187 peek = do_peek_next (peek, limit);
5188 if (ISIDNUM (*peek))
5189 return NULL;
5190
5191 /* Skip control-line whitespace. */
5192 ws:
5193 while (*peek == ' ' || *peek == '\t')
5194 peek++;
5195 if (__builtin_expect (*peek == '\\', false))
5196 {
5197 peek = do_peek_backslash (peek, limit);
5198 if (*peek != '\\')
5199 goto ws;
5200 }
5201
5202 return peek;
5203}
5204
5205/* Are we looking at a module control line starting as PEEK - 1? */
5206
5207static bool
5208do_peek_module (cpp_reader *pfile, unsigned char c,
5209 const unsigned char *peek, const unsigned char *limit)
5210{
5211 bool import = false;
5212
5213 if (__builtin_expect (c == 'e', false))
5214 {
5215 if (!((peek[0] == 'x' || peek[0] == '\\')
5216 && (peek = do_peek_ident (match: "export", peek, limit))))
5217 return false;
5218
5219 /* export, peek for import or module. No need to peek __import
5220 here. */
5221 if (peek[0] == 'i')
5222 {
5223 if (!((peek[1] == 'm' || peek[1] == '\\')
5224 && (peek = do_peek_ident (match: "import", peek: peek + 1, limit))))
5225 return false;
5226 import = true;
5227 }
5228 else if (peek[0] == 'm')
5229 {
5230 if (!((peek[1] == 'o' || peek[1] == '\\')
5231 && (peek = do_peek_ident (match: "module", peek: peek + 1, limit))))
5232 return false;
5233 }
5234 else
5235 return false;
5236 }
5237 else if (__builtin_expect (c == 'i', false))
5238 {
5239 if (!((peek[0] == 'm' || peek[0] == '\\')
5240 && (peek = do_peek_ident (match: "import", peek, limit))))
5241 return false;
5242 import = true;
5243 }
5244 else if (__builtin_expect (c == '_', false))
5245 {
5246 /* Needed for translated includes. */
5247 if (!((peek[0] == '_' || peek[0] == '\\')
5248 && (peek = do_peek_ident (match: "__import", peek, limit))))
5249 return false;
5250 import = true;
5251 }
5252 else if (__builtin_expect (c == 'm', false))
5253 {
5254 if (!((peek[0] == 'o' || peek[0] == '\\')
5255 && (peek = do_peek_ident (match: "module", peek, limit))))
5256 return false;
5257 }
5258 else
5259 return false;
5260
5261 /* Peek the next character to see if it's good enough. We'll be at
5262 the first non-whitespace char, including skipping an escaped
5263 newline. */
5264 /* ... import followed by identifier, ':', '<' or header-name
5265 preprocessing tokens, or module followed by identifier, ':' or
5266 ';' preprocessing tokens. */
5267 unsigned char p = *peek++;
5268
5269 /* A character literal is ... single quotes, ... optionally preceded
5270 by u8, u, U, or L */
5271 /* A string-literal is a ... double quotes, optionally prefixed by
5272 R, u8, u8R, u, uR, U, UR, L, or LR */
5273 if (p == 'u')
5274 {
5275 peek = do_peek_next (peek, limit);
5276 if (*peek == '8')
5277 {
5278 peek++;
5279 goto peek_u8;
5280 }
5281 goto peek_u;
5282 }
5283 else if (p == 'U' || p == 'L')
5284 {
5285 peek_u8:
5286 peek = do_peek_next (peek, limit);
5287 peek_u:
5288 if (*peek == '\"' || *peek == '\'')
5289 return false;
5290
5291 if (*peek == 'R')
5292 goto peek_R;
5293 /* Identifier. Ok. */
5294 }
5295 else if (p == 'R')
5296 {
5297 peek_R:
5298 if (CPP_OPTION (pfile, rliterals))
5299 {
5300 peek = do_peek_next (peek, limit);
5301 if (*peek == '\"')
5302 return false;
5303 }
5304 /* Identifier. Ok. */
5305 }
5306 else if ('Z' - 'A' == 25
5307 ? ((p >= 'A' && p <= 'Z') || (p >= 'a' && p <= 'z') || p == '_')
5308 : ISIDST (p))
5309 {
5310 /* Identifier. Ok. */
5311 }
5312 else if (p == '<')
5313 {
5314 /* Maybe angle header, ok for import. Reject
5315 '<=', '<<' digraph:'<:'. */
5316 if (!import)
5317 return false;
5318 peek = do_peek_next (peek, limit);
5319 if (*peek == '=' || *peek == '<'
5320 || (*peek == ':' && CPP_OPTION (pfile, digraphs)))
5321 return false;
5322 }
5323 else if (p == ';')
5324 {
5325 /* SEMICOLON, ok for module. */
5326 if (import)
5327 return false;
5328 }
5329 else if (p == '"')
5330 {
5331 /* STRING, ok for import. */
5332 if (!import)
5333 return false;
5334 }
5335 else if (p == ':')
5336 {
5337 /* Maybe COLON, ok. Reject '::', digraph:':>'. */
5338 peek = do_peek_next (peek, limit);
5339 if (*peek == ':' || (*peek == '>' && CPP_OPTION (pfile, digraphs)))
5340 return false;
5341 }
5342 else
5343 /* FIXME: Detect a unicode character, excluding those not
5344 permitted as the initial character. [lex.name]/1. I presume
5345 we need to check the \[uU] spellings, and directly using
5346 Unicode in say UTF8 form? Or perhaps we do the phase-1
5347 conversion of UTF8 to universal-character-names? */
5348 return false;
5349
5350 return true;
5351}
5352
5353/* Directives-only scanning. Somewhat more relaxed than correct
5354 parsing -- some ill-formed programs will not be rejected. */
5355
5356void
5357cpp_directive_only_process (cpp_reader *pfile,
5358 void *data,
5359 void (*cb) (cpp_reader *, CPP_DO_task, void *, ...))
5360{
5361 bool module_p = CPP_OPTION (pfile, module_directives);
5362
5363 do
5364 {
5365 restart:
5366 /* Buffer initialization, but no line cleaning. */
5367 cpp_buffer *buffer = pfile->buffer;
5368 buffer->cur_note = buffer->notes_used = 0;
5369 buffer->cur = buffer->line_base = buffer->next_line;
5370 buffer->need_line = false;
5371 /* Files always end in a newline or carriage return. We rely on this for
5372 character peeking safety. */
5373 gcc_assert (buffer->rlimit[0] == '\n' || buffer->rlimit[0] == '\r');
5374
5375 const unsigned char *base = buffer->cur;
5376 unsigned line_count = 0;
5377 const unsigned char *line_start = base;
5378
5379 bool bol = true;
5380 bool raw = false;
5381
5382 const unsigned char *lwm = base;
5383 for (const unsigned char *pos = base, *limit = buffer->rlimit;
5384 pos < limit;)
5385 {
5386 unsigned char c = *pos++;
5387 /* This matches the switch in _cpp_lex_direct. */
5388 switch (c)
5389 {
5390 case ' ': case '\t': case '\f': case '\v':
5391 /* Whitespace, do nothing. */
5392 break;
5393
5394 case '\r': /* MAC line ending, or Windows \r\n */
5395 if (*pos == '\n')
5396 pos++;
5397 /* FALLTHROUGH */
5398
5399 case '\n':
5400 bol = true;
5401
5402 next_line:
5403 CPP_INCREMENT_LINE (pfile, 0);
5404 line_count++;
5405 line_start = pos;
5406 break;
5407
5408 case '\\':
5409 /* <backslash><newline> is removed, and doesn't undo any
5410 preceeding escape or whatnot. */
5411 if (*pos == '\n')
5412 {
5413 pos++;
5414 goto next_line;
5415 }
5416 else if (*pos == '\r')
5417 {
5418 if (pos[1] == '\n')
5419 pos++;
5420 pos++;
5421 goto next_line;
5422 }
5423 goto dflt;
5424
5425 case '#':
5426 if (bol)
5427 {
5428 /* Line directive. */
5429 if (pos - 1 > base && !pfile->state.skipping)
5430 cb (pfile, CPP_DO_print, data,
5431 line_count, base, pos - 1 - base);
5432
5433 /* Prep things for directive handling. */
5434 buffer->next_line = pos;
5435 buffer->need_line = true;
5436 bool ok = _cpp_get_fresh_line (pfile);
5437 gcc_checking_assert (ok);
5438
5439 /* Ensure proper column numbering for generated
5440 error messages. */
5441 buffer->line_base -= pos - line_start;
5442
5443 if (_cpp_handle_directive (pfile, line_start + 1 != pos) == 2)
5444 {
5445 if (pfile->directive_result.type != CPP_PADDING)
5446 cb (pfile, CPP_DO_token, data,
5447 &pfile->directive_result, pfile->directive_result.src_loc);
5448 if (pfile->context->prev)
5449 {
5450 gcc_assert (pfile->context->tokens_kind == TOKENS_KIND_DIRECT);
5451 for (const cpp_token *tok = FIRST (pfile->context).token;
5452 tok != LAST (pfile->context).token; ++tok)
5453 cb (pfile, CPP_DO_token, data, tok, tok->src_loc);
5454 _cpp_pop_context (pfile);
5455 }
5456 }
5457
5458 /* Sanitize the line settings. Duplicate #include's can
5459 mess things up. */
5460 // FIXME: Necessary?
5461 pfile->line_table->highest_location
5462 = pfile->line_table->highest_line;
5463
5464 if (!pfile->state.skipping
5465 && pfile->buffer->next_line < pfile->buffer->rlimit)
5466 cb (pfile, CPP_DO_location, data,
5467 pfile->line_table->highest_line);
5468
5469 goto restart;
5470 }
5471 goto dflt;
5472
5473 case '/':
5474 {
5475 const unsigned char *peek = do_peek_next (peek: pos, limit);
5476 if (!(*peek == '/' || *peek == '*'))
5477 goto dflt;
5478
5479 /* Line or block comment */
5480 bool is_block = *peek == '*';
5481 bool star = false;
5482 bool esc = false;
5483 location_t sloc
5484 = linemap_position_for_column (pfile->line_table,
5485 pos - line_start);
5486
5487 while (pos < limit)
5488 {
5489 char c = *pos++;
5490 switch (c)
5491 {
5492 case '\\':
5493 if (esc)
5494 {
5495 star = false;
5496 esc = false;
5497 }
5498 else
5499 esc = true;
5500 break;
5501
5502 case '\r':
5503 if (*pos == '\n')
5504 pos++;
5505 /* FALLTHROUGH */
5506
5507 case '\n':
5508 {
5509 CPP_INCREMENT_LINE (pfile, 0);
5510 line_count++;
5511 line_start = pos;
5512 if (!esc && !is_block)
5513 {
5514 bol = true;
5515 goto done_comment;
5516 }
5517 }
5518 if (!esc)
5519 star = false;
5520 esc = false;
5521 break;
5522
5523 case '*':
5524 if (pos > peek)
5525 star = is_block;
5526 esc = false;
5527 break;
5528
5529 case '/':
5530 if (star && !esc)
5531 goto done_comment;
5532 /* FALLTHROUGH */
5533
5534 default:
5535 star = false;
5536 esc = false;
5537 break;
5538 }
5539 }
5540 if (pos < limit || is_block)
5541 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5542 msgid: "unterminated comment");
5543 done_comment:
5544 lwm = pos;
5545 break;
5546 }
5547
5548 case '\'':
5549 if (!CPP_OPTION (pfile, digit_separators))
5550 goto delimited_string;
5551
5552 /* Possibly a number punctuator. */
5553 if (!ISIDNUM (*do_peek_next (pos, limit)))
5554 goto delimited_string;
5555
5556 goto quote_peek;
5557
5558 case '\"':
5559 if (!CPP_OPTION (pfile, rliterals))
5560 goto delimited_string;
5561
5562 quote_peek:
5563 {
5564 /* For ' see if it's a number punctuator
5565 \.?<digit>(<digit>|<identifier-nondigit>
5566 |'<digit>|'<nondigit>|[eEpP]<sign>|\.)* */
5567 /* For " see if it's a raw string
5568 {U,L,u,u8}R. This includes CPP_NUMBER detection,
5569 because that could be 0e+R. */
5570 const unsigned char *peek = pos - 1;
5571 bool quote_first = c == '"';
5572 bool quote_eight = false;
5573 bool maybe_number_start = false;
5574 bool want_number = false;
5575
5576 while ((peek = do_peek_prev (peek, bound: lwm)))
5577 {
5578 unsigned char p = *peek;
5579 if (quote_first)
5580 {
5581 if (!raw)
5582 {
5583 if (p != 'R')
5584 break;
5585 raw = true;
5586 continue;
5587 }
5588
5589 quote_first = false;
5590 if (p == 'L' || p == 'U' || p == 'u')
5591 ;
5592 else if (p == '8')
5593 quote_eight = true;
5594 else
5595 goto second_raw;
5596 }
5597 else if (quote_eight)
5598 {
5599 if (p != 'u')
5600 {
5601 raw = false;
5602 break;
5603 }
5604 quote_eight = false;
5605 }
5606 else if (c == '"')
5607 {
5608 second_raw:;
5609 if (!want_number && ISIDNUM (p))
5610 {
5611 raw = false;
5612 break;
5613 }
5614 }
5615
5616 if (ISDIGIT (p))
5617 maybe_number_start = true;
5618 else if (p == '.')
5619 want_number = true;
5620 else if (ISIDNUM (p))
5621 maybe_number_start = false;
5622 else if (p == '+' || p == '-')
5623 {
5624 if (const unsigned char *peek_prev
5625 = do_peek_prev (peek, bound: lwm))
5626 {
5627 p = *peek_prev;
5628 if (p == 'e' || p == 'E'
5629 || p == 'p' || p == 'P')
5630 {
5631 want_number = true;
5632 maybe_number_start = false;
5633 }
5634 else
5635 break;
5636 }
5637 else
5638 break;
5639 }
5640 else if (p == '\'' || p == '\"')
5641 {
5642 /* If this is lwm, this must be the end of a
5643 previous string. So this is a trailing
5644 literal type, (a) if those are allowed,
5645 and (b) maybe_start is false. Otherwise
5646 this must be a CPP_NUMBER because we've
5647 met another ', and we'd have checked that
5648 in its own right. */
5649 if (peek == lwm && CPP_OPTION (pfile, uliterals))
5650 {
5651 if (!maybe_number_start && !want_number)
5652 /* Must be a literal type. */
5653 raw = false;
5654 }
5655 else if (p == '\''
5656 && CPP_OPTION (pfile, digit_separators))
5657 maybe_number_start = true;
5658 break;
5659 }
5660 else if (c == '\'')
5661 break;
5662 else if (!quote_first && !quote_eight)
5663 break;
5664 }
5665
5666 if (maybe_number_start)
5667 {
5668 if (c == '\'')
5669 /* A CPP NUMBER. */
5670 goto dflt;
5671 raw = false;
5672 }
5673
5674 goto delimited_string;
5675 }
5676
5677 delimited_string:
5678 {
5679 /* (Possibly raw) string or char literal. */
5680 unsigned char end = c;
5681 int delim_len = -1;
5682 const unsigned char *delim = NULL;
5683 location_t sloc = linemap_position_for_column (pfile->line_table,
5684 pos - line_start);
5685 int esc = 0;
5686
5687 if (raw)
5688 {
5689 /* There can be no line breaks in the delimiter. */
5690 delim = pos;
5691 for (delim_len = 0; (c = *pos++) != '('; delim_len++)
5692 {
5693 if (delim_len == 16)
5694 {
5695 cpp_error_with_line (pfile, CPP_DL_ERROR,
5696 sloc, 0,
5697 msgid: "raw string delimiter"
5698 " longer than %d"
5699 " characters",
5700 delim_len);
5701 raw = false;
5702 pos = delim;
5703 break;
5704 }
5705 if (strchr (s: ") \\\t\v\f\n", c: c))
5706 {
5707 cpp_error_with_line (pfile, CPP_DL_ERROR,
5708 sloc, 0,
5709 msgid: "invalid character '%c'"
5710 " in raw string"
5711 " delimiter", c);
5712 raw = false;
5713 pos = delim;
5714 break;
5715 }
5716 if (pos >= limit)
5717 goto bad_string;
5718 }
5719 }
5720
5721 while (pos < limit)
5722 {
5723 char c = *pos++;
5724 switch (c)
5725 {
5726 case '\\':
5727 if (!raw)
5728 esc++;
5729 break;
5730
5731 case '\r':
5732 if (*pos == '\n')
5733 pos++;
5734 /* FALLTHROUGH */
5735
5736 case '\n':
5737 {
5738 CPP_INCREMENT_LINE (pfile, 0);
5739 line_count++;
5740 line_start = pos;
5741 }
5742 if (esc)
5743 esc--;
5744 break;
5745
5746 case ')':
5747 if (raw
5748 && pos + delim_len + 1 < limit
5749 && pos[delim_len] == end
5750 && !memcmp (s1: delim, s2: pos, n: delim_len))
5751 {
5752 pos += delim_len + 1;
5753 raw = false;
5754 goto done_string;
5755 }
5756 break;
5757
5758 default:
5759 if (!raw && !(esc & 1) && c == end)
5760 goto done_string;
5761 esc = 0;
5762 break;
5763 }
5764 }
5765 bad_string:
5766 cpp_error_with_line (pfile, CPP_DL_ERROR, sloc, 0,
5767 msgid: "unterminated literal");
5768
5769 done_string:
5770 raw = false;
5771 lwm = pos - 1;
5772 }
5773 goto dflt;
5774
5775 case '_':
5776 case 'e':
5777 case 'i':
5778 case 'm':
5779 if (bol && module_p && !pfile->state.skipping
5780 && do_peek_module (pfile, c, peek: pos, limit))
5781 {
5782 /* We've seen the start of a module control line.
5783 Start up the tokenizer. */
5784 pos--; /* Backup over the first character. */
5785
5786 /* Backup over whitespace to start of line. */
5787 while (pos > line_start
5788 && (pos[-1] == ' ' || pos[-1] == '\t'))
5789 pos--;
5790
5791 if (pos > base)
5792 cb (pfile, CPP_DO_print, data, line_count, base, pos - base);
5793
5794 /* Prep things for directive handling. */
5795 buffer->next_line = pos;
5796 buffer->need_line = true;
5797
5798 /* Now get tokens until the PRAGMA_EOL. */
5799 do
5800 {
5801 location_t spelling;
5802 const cpp_token *tok
5803 = cpp_get_token_with_location (pfile, &spelling);
5804
5805 gcc_assert (pfile->state.in_deferred_pragma
5806 || tok->type == CPP_PRAGMA_EOL);
5807 cb (pfile, CPP_DO_token, data, tok, spelling);
5808 }
5809 while (pfile->state.in_deferred_pragma);
5810
5811 if (pfile->buffer->next_line < pfile->buffer->rlimit)
5812 cb (pfile, CPP_DO_location, data,
5813 pfile->line_table->highest_line);
5814
5815 pfile->mi_valid = false;
5816 goto restart;
5817 }
5818 goto dflt;
5819
5820 default:
5821 dflt:
5822 bol = false;
5823 pfile->mi_valid = false;
5824 break;
5825 }
5826 }
5827
5828 if (buffer->rlimit > base && !pfile->state.skipping)
5829 {
5830 const unsigned char *limit = buffer->rlimit;
5831 /* If the file was not newline terminated, add rlimit, which is
5832 guaranteed to point to a newline, to the end of our range. */
5833 if (limit[-1] != '\n')
5834 {
5835 limit++;
5836 CPP_INCREMENT_LINE (pfile, 0);
5837 line_count++;
5838 }
5839 cb (pfile, CPP_DO_print, data, line_count, base, limit - base);
5840 }
5841
5842 _cpp_pop_buffer (pfile);
5843 }
5844 while (pfile->buffer);
5845}
5846

/* Source code of libcpp/lex.cc.  */