1/* Data and functions related to line maps and input files.
2 Copyright (C) 2004-2023 Free Software Foundation, Inc.
3
4This file is part of GCC.
5
6GCC is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free
8Software Foundation; either version 3, or (at your option) any later
9version.
10
11GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12WARRANTY; without even the implied warranty of MERCHANTABILITY or
13FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14for more details.
15
16You should have received a copy of the GNU General Public License
17along with GCC; see the file COPYING3. If not see
18<http://www.gnu.org/licenses/>. */
19
20#include "config.h"
21#include "system.h"
22#include "coretypes.h"
23#include "intl.h"
24#include "diagnostic.h"
25#include "selftest.h"
26#include "cpplib.h"
27
28#ifndef HAVE_ICONV
29#define HAVE_ICONV 0
30#endif
31
32const char *
33special_fname_builtin ()
34{
35 return _("<built-in>");
36}
37
/* Input charset configuration.

   Fallback charset callback, installed by initialize_input_context
   when the caller supplies none.  It ignores its argument and always
   returns nullptr, which file_cache_slot::create treats as "no
   charset conversion".  */
static const char *
default_charset_callback (const char * /* unused */)
{
  const char *const no_charset = nullptr;
  return no_charset;
}
43
44void
45file_cache::initialize_input_context (diagnostic_input_charset_callback ccb,
46 bool should_skip_bom)
47{
48 in_context.ccb = (ccb ? ccb : default_charset_callback);
49 in_context.should_skip_bom = should_skip_bom;
50}
51
/* One slot of the file cache.  It stores the content read so far
   from a single file, plus the bookkeeping that get_next_line and
   read_line_num use to find line boundaries within that content.  */
class file_cache_slot
{
public:
  file_cache_slot ();
  ~file_cache_slot ();

  /* Fetch line LINE_NUM (counted from 1); on success *LINE and
     *LINE_LEN describe the line inside the cached data.  */
  bool read_line_num (size_t line_num,
                      char ** line, ssize_t *line_len);

  /* Accessors.  */
  const char *get_file_path () const { return m_file_path; }
  unsigned get_use_count () const { return m_use_count; }
  bool missing_trailing_newline_p () const
  {
    return m_missing_trailing_newline;
  }
  char_span get_full_file_content ();

  void inc_use_count () { m_use_count++; }

  /* (Re)populate this slot for FILE_PATH/FP, and empty it again.  */
  bool create (const file_cache::input_context &in_context,
               const char *file_path, FILE *fp, unsigned highest_use_count);
  void evict ();

 private:
  /* The recorded boundaries of one line.  */
  class line_info
  {
  public:
    /* The line number.  It starts from 1.  */
    size_t line_num;

    /* The position (byte count) of the beginning of the line,
       relative to the file data pointer.  This starts at zero.  */
    size_t start_pos;

    /* The position (byte count) of the last byte of the line.  This
       normally points to the '\n' character, or to one byte after the
       last byte of the file, if the file doesn't contain a '\n'
       character.  */
    size_t end_pos;

    line_info (size_t l, size_t s, size_t e)
      : line_num (l), start_pos (s), end_pos (e)
    {}

    line_info ()
      :line_num (0), start_pos (0), end_pos (0)
    {}
  };

  bool needs_read_p () const;
  bool needs_grow_p () const;
  void maybe_grow ();
  bool read_data ();
  bool maybe_read_data ();
  bool get_next_line (char **line, ssize_t *line_len);
  bool read_next_line (char ** line, ssize_t *line_len);
  bool goto_next_line ();

  /* Initial size of the data buffer, and the maximum number of
     entries kept in m_line_record.  */
  static const size_t buffer_size = 4 * 1024;
  static const size_t line_record_size = 100;

  /* The number of times this file has been accessed.  This is used
     to designate which file cache to evict from the cache
     array.  */
  unsigned m_use_count;

  /* The file_path is the key for identifying a particular file in
     the cache.
     For libcpp-using code, the underlying buffer for this field is
     owned by the corresponding _cpp_file within the cpp_reader.  */
  const char *m_file_path;

  /* The stream the file is read from; NULL when the slot is empty or
     once create has taken the charset-conversion path.  */
  FILE *m_fp;

  /* This points to the content of the file that we've read so
     far.  */
  char *m_data;

  /* The allocated buffer to be freed may start a little earlier than
     m_data, e.g. if a UTF8 BOM was skipped at the beginning.  This is
     the distance from the start of the allocation to m_data.  */
  int m_alloc_offset;

  /* The size of the m_data array above.  */
  size_t m_size;

  /* The number of bytes read from the underlying file so far.  This
     must be less than or equal to m_size above.  */
  size_t m_nb_read;

  /* The index of the beginning of the current line.  */
  size_t m_line_start_idx;

  /* The number of the previous line read.  This starts at 1.  Zero
     means we've read no line so far.  */
  size_t m_line_num;

  /* This is the total number of lines of the current file.  At the
     moment, we try to get this information from the line map
     subsystem.  Note that this is just a hint.  When using the C++
     front-end, this hint is correct because the input file is then
     completely tokenized before parsing starts; so the line map knows
     the number of lines before compilation really starts.  For e.g,
     the C front-end, it can happen that we start emitting diagnostics
     before the line map has seen the end of the file.  */
  size_t m_total_lines;

  /* Could this file be missing a trailing newline on its final line?
     Initially true (to cope with empty files), then updated to
     true/false by get_next_line as each line is read.  */
  bool m_missing_trailing_newline;

  /* This is a record of the beginning and end of the lines we've seen
     while reading the file.  This is useful to avoid walking the data
     from the beginning when we are asked to read a line that is
     before m_line_start_idx above.  Note that the maximum size of this
     record is line_record_size, so that the memory consumption
     doesn't explode.  We thus scale m_total_lines down to
     line_record_size.  */
  vec<line_info, va_heap> m_line_record;

  /* Move m_data forward by OFFSET bytes (backward if OFFSET is
     negative), keeping m_alloc_offset and m_size in step.  Asserts
     that the move stays within the allocation and that there is a
     buffer to move.  */
  void offset_buffer (int offset)
  {
    gcc_assert (offset < 0 ? m_alloc_offset + offset >= 0
                : (size_t) offset <= m_size);
    gcc_assert (m_data);
    m_alloc_offset += offset;
    m_data += offset;
    m_size -= offset;
  }

};
187
/* Current position in real source file.  */

location_t input_location = UNKNOWN_LOCATION;

/* The line table queried by the functions in this file (for example
   by total_lines_num).  */

class line_maps *line_table;

/* A stashed copy of "line_table" for use by selftest::line_table_test.
   This needs to be a global so that it can be a GC root, and thus
   prevent the stashed copy from being garbage-collected if the GC runs
   during a line_table_test.  */

class line_maps *saved_line_table;
200
201/* Expand the source location LOC into a human readable location. If
202 LOC resolves to a builtin location, the file name of the readable
203 location is set to the string "<built-in>". If EXPANSION_POINT_P is
204 TRUE and LOC is virtual, then it is resolved to the expansion
205 point of the involved macro. Otherwise, it is resolved to the
206 spelling location of the token.
207
208 When resolving to the spelling location of the token, if the
209 resulting location is for a built-in location (that is, it has no
210 associated line/column) in the context of a macro expansion, the
211 returned location is the first one (while unwinding the macro
212 location towards its expansion point) that is in real source
213 code.
214
215 ASPECT controls which part of the location to use. */
216
217static expanded_location
218expand_location_1 (const line_maps *set,
219 location_t loc,
220 bool expansion_point_p,
221 enum location_aspect aspect)
222{
223 expanded_location xloc;
224 const line_map_ordinary *map;
225 enum location_resolution_kind lrk = LRK_MACRO_EXPANSION_POINT;
226 tree block = NULL;
227
228 if (IS_ADHOC_LOC (loc))
229 {
230 block = LOCATION_BLOCK (loc);
231 loc = LOCATION_LOCUS (loc);
232 }
233
234 memset (s: &xloc, c: 0, n: sizeof (xloc));
235
236 if (loc >= RESERVED_LOCATION_COUNT)
237 {
238 if (!expansion_point_p)
239 {
240 /* We want to resolve LOC to its spelling location.
241
242 But if that spelling location is a reserved location that
243 appears in the context of a macro expansion (like for a
244 location for a built-in token), let's consider the first
245 location (toward the expansion point) that is not reserved;
246 that is, the first location that is in real source code. */
247 loc = linemap_unwind_to_first_non_reserved_loc (set,
248 loc, NULL);
249 lrk = LRK_SPELLING_LOCATION;
250 }
251 loc = linemap_resolve_location (set, loc, lrk, loc_map: &map);
252
253 /* loc is now either in an ordinary map, or is a reserved location.
254 If it is a compound location, the caret is in a spelling location,
255 but the start/finish might still be a virtual location.
256 Depending of what the caller asked for, we may need to recurse
257 one level in order to resolve any virtual locations in the
258 end-points. */
259 switch (aspect)
260 {
261 default:
262 gcc_unreachable ();
263 /* Fall through. */
264 case LOCATION_ASPECT_CARET:
265 break;
266 case LOCATION_ASPECT_START:
267 {
268 location_t start = get_start (loc);
269 if (start != loc)
270 return expand_location_1 (set, loc: start, expansion_point_p, aspect);
271 }
272 break;
273 case LOCATION_ASPECT_FINISH:
274 {
275 location_t finish = get_finish (loc);
276 if (finish != loc)
277 return expand_location_1 (set, loc: finish, expansion_point_p, aspect);
278 }
279 break;
280 }
281 xloc = linemap_expand_location (set, map, loc);
282 }
283
284 xloc.data = block;
285 if (loc <= BUILTINS_LOCATION)
286 xloc.file = loc == UNKNOWN_LOCATION ? NULL : special_fname_builtin ();
287
288 return xloc;
289}
290
291/* Initialize the set of cache used for files accessed by caret
292 diagnostic. */
293
294static void
295diagnostic_file_cache_init (void)
296{
297 gcc_assert (global_dc);
298 global_dc->file_cache_init ();
299}
300
301void
302diagnostic_context::file_cache_init ()
303{
304 if (m_file_cache == nullptr)
305 m_file_cache = new file_cache ();
306}
307
308/* Return the total lines number that have been read so far by the
309 line map (in the preprocessor) so far. For languages like C++ that
310 entirely preprocess the input file before starting to parse, this
311 equals the actual number of lines of the file. */
312
313static size_t
314total_lines_num (const char *file_path)
315{
316 size_t r = 0;
317 location_t l = 0;
318 if (linemap_get_file_highest_location (set: line_table, file_name: file_path, loc: &l))
319 {
320 gcc_assert (l >= RESERVED_LOCATION_COUNT);
321 expanded_location xloc = expand_location (l);
322 r = xloc.line;
323 }
324 return r;
325}
326
327/* Lookup the cache used for the content of a given file accessed by
328 caret diagnostic. Return the found cached file, or NULL if no
329 cached file was found. */
330
331file_cache_slot *
332file_cache::lookup_file (const char *file_path)
333{
334 gcc_assert (file_path);
335
336 /* This will contain the found cached file. */
337 file_cache_slot *r = NULL;
338 for (unsigned i = 0; i < num_file_slots; ++i)
339 {
340 file_cache_slot *c = &m_file_slots[i];
341 if (c->get_file_path () && !strcmp (s1: c->get_file_path (), s2: file_path))
342 {
343 c->inc_use_count ();
344 r = c;
345 }
346 }
347
348 if (r)
349 r->inc_use_count ();
350
351 return r;
352}
353
354/* Purge any mention of FILENAME from the cache of files used for
355 printing source code. For use in selftests when working
356 with tempfiles. */
357
358void
359diagnostics_file_cache_forcibly_evict_file (const char *file_path)
360{
361 gcc_assert (file_path);
362
363 auto file_cache = global_dc->get_file_cache ();
364 if (!file_cache)
365 return;
366 file_cache->forcibly_evict_file (file_path);
367}
368
369void
370file_cache::forcibly_evict_file (const char *file_path)
371{
372 gcc_assert (file_path);
373
374 file_cache_slot *r = lookup_file (file_path);
375 if (!r)
376 /* Not found. */
377 return;
378
379 r->evict ();
380}
381
382void
383file_cache_slot::evict ()
384{
385 m_file_path = NULL;
386 if (m_fp)
387 fclose (stream: m_fp);
388 m_fp = NULL;
389 m_nb_read = 0;
390 m_line_start_idx = 0;
391 m_line_num = 0;
392 m_line_record.truncate (size: 0);
393 m_use_count = 0;
394 m_total_lines = 0;
395 m_missing_trailing_newline = true;
396}
397
398/* Return the file cache that has been less used, recently, or the
399 first empty one. If HIGHEST_USE_COUNT is non-null,
400 *HIGHEST_USE_COUNT is set to the highest use count of the entries
401 in the cache table. */
402
403file_cache_slot*
404file_cache::evicted_cache_tab_entry (unsigned *highest_use_count)
405{
406 diagnostic_file_cache_init ();
407
408 file_cache_slot *to_evict = &m_file_slots[0];
409 unsigned huc = to_evict->get_use_count ();
410 for (unsigned i = 1; i < num_file_slots; ++i)
411 {
412 file_cache_slot *c = &m_file_slots[i];
413 bool c_is_empty = (c->get_file_path () == NULL);
414
415 if (c->get_use_count () < to_evict->get_use_count ()
416 || (to_evict->get_file_path () && c_is_empty))
417 /* We evict C because it's either an entry with a lower use
418 count or one that is empty. */
419 to_evict = c;
420
421 if (huc < c->get_use_count ())
422 huc = c->get_use_count ();
423
424 if (c_is_empty)
425 /* We've reached the end of the cache; subsequent elements are
426 all empty. */
427 break;
428 }
429
430 if (highest_use_count)
431 *highest_use_count = huc;
432
433 return to_evict;
434}
435
436/* Create the cache used for the content of a given file to be
437 accessed by caret diagnostic. This cache is added to an array of
438 cache and can be retrieved by lookup_file_in_cache_tab. This
439 function returns the created cache. Note that only the last
440 num_file_slots files are cached.
441
442 This can return nullptr if the FILE_PATH can't be opened for
443 reading, or if the content can't be converted to the input_charset. */
444
445file_cache_slot*
446file_cache::add_file (const char *file_path)
447{
448
449 FILE *fp = fopen (filename: file_path, modes: "r");
450 if (fp == NULL)
451 return NULL;
452
453 unsigned highest_use_count = 0;
454 file_cache_slot *r = evicted_cache_tab_entry (highest_use_count: &highest_use_count);
455 if (!r->create (in_context, file_path, fp, highest_use_count))
456 return NULL;
457 return r;
458}
459
460/* Get a borrowed char_span to the full content of this file
461 as decoded according to the input charset, encoded as UTF-8. */
462
463char_span
464file_cache_slot::get_full_file_content ()
465{
466 char *line;
467 ssize_t line_len;
468 while (get_next_line (line: &line, line_len: &line_len))
469 {
470 }
471 return char_span (m_data, m_nb_read);
472}
473
474/* Populate this slot for use on FILE_PATH and FP, dropping any
475 existing cached content within it. */
476
477bool
478file_cache_slot::create (const file_cache::input_context &in_context,
479 const char *file_path, FILE *fp,
480 unsigned highest_use_count)
481{
482 m_file_path = file_path;
483 if (m_fp)
484 fclose (stream: m_fp);
485 m_fp = fp;
486 if (m_alloc_offset)
487 offset_buffer (offset: -m_alloc_offset);
488 m_nb_read = 0;
489 m_line_start_idx = 0;
490 m_line_num = 0;
491 m_line_record.truncate (size: 0);
492 /* Ensure that this cache entry doesn't get evicted next time
493 add_file_to_cache_tab is called. */
494 m_use_count = ++highest_use_count;
495 m_total_lines = total_lines_num (file_path);
496 m_missing_trailing_newline = true;
497
498
499 /* Check the input configuration to determine if we need to do any
500 transformations, such as charset conversion or BOM skipping. */
501 if (const char *input_charset = in_context.ccb (file_path))
502 {
503 /* Need a full-blown conversion of the input charset. */
504 fclose (stream: m_fp);
505 m_fp = NULL;
506 const cpp_converted_source cs
507 = cpp_get_converted_source (fname: file_path, input_charset);
508 if (!cs.data)
509 return false;
510 if (m_data)
511 XDELETEVEC (m_data);
512 m_data = cs.data;
513 m_nb_read = m_size = cs.len;
514 m_alloc_offset = cs.data - cs.to_free;
515 }
516 else if (in_context.should_skip_bom)
517 {
518 if (read_data ())
519 {
520 const int offset = cpp_check_utf8_bom (data: m_data, data_length: m_nb_read);
521 offset_buffer (offset);
522 m_nb_read -= offset;
523 }
524 }
525
526 return true;
527}
528
529/* file_cache's ctor. */
530
531file_cache::file_cache ()
532: m_file_slots (new file_cache_slot[num_file_slots])
533{
534 initialize_input_context (ccb: nullptr, should_skip_bom: false);
535}
536
537/* file_cache's dtor. */
538
539file_cache::~file_cache ()
540{
541 delete[] m_file_slots;
542}
543
544/* Lookup the cache used for the content of a given file accessed by
545 caret diagnostic. If no cached file was found, create a new cache
546 for this file, add it to the array of cached file and return
547 it.
548
549 This can return nullptr on a cache miss if FILE_PATH can't be opened for
550 reading, or if the content can't be converted to the input_charset. */
551
552file_cache_slot*
553file_cache::lookup_or_add_file (const char *file_path)
554{
555 file_cache_slot *r = lookup_file (file_path);
556 if (r == NULL)
557 r = add_file (file_path);
558 return r;
559}
560
561/* Default constructor for a cache of file used by caret
562 diagnostic. */
563
564file_cache_slot::file_cache_slot ()
565: m_use_count (0), m_file_path (NULL), m_fp (NULL), m_data (0),
566 m_alloc_offset (0), m_size (0), m_nb_read (0), m_line_start_idx (0),
567 m_line_num (0), m_total_lines (0), m_missing_trailing_newline (true)
568{
569 m_line_record.create (nelems: 0);
570}
571
572/* Destructor for a cache of file used by caret diagnostic. */
573
574file_cache_slot::~file_cache_slot ()
575{
576 if (m_fp)
577 {
578 fclose (stream: m_fp);
579 m_fp = NULL;
580 }
581 if (m_data)
582 {
583 offset_buffer (offset: -m_alloc_offset);
584 XDELETEVEC (m_data);
585 m_data = 0;
586 }
587 m_line_record.release ();
588}
589
590/* Returns TRUE iff the cache would need to be filled with data coming
591 from the file. That is, either the cache is empty or full or the
592 current line is empty. Note that if the cache is full, it would
593 need to be extended and filled again. */
594
595bool
596file_cache_slot::needs_read_p () const
597{
598 return m_fp && (m_nb_read == 0
599 || m_nb_read == m_size
600 || (m_line_start_idx >= m_nb_read - 1));
601}
602
603/* Return TRUE iff the cache is full and thus needs to be
604 extended. */
605
606bool
607file_cache_slot::needs_grow_p () const
608{
609 return m_nb_read == m_size;
610}
611
612/* Grow the cache if it needs to be extended. */
613
614void
615file_cache_slot::maybe_grow ()
616{
617 if (!needs_grow_p ())
618 return;
619
620 if (!m_data)
621 {
622 gcc_assert (m_size == 0 && m_alloc_offset == 0);
623 m_size = buffer_size;
624 m_data = XNEWVEC (char, m_size);
625 }
626 else
627 {
628 const int offset = m_alloc_offset;
629 offset_buffer (offset: -offset);
630 m_size *= 2;
631 m_data = XRESIZEVEC (char, m_data, m_size);
632 offset_buffer (offset);
633 }
634}
635
636/* Read more data into the cache. Extends the cache if need be.
637 Returns TRUE iff new data could be read. */
638
639bool
640file_cache_slot::read_data ()
641{
642 if (feof (stream: m_fp) || ferror (stream: m_fp))
643 return false;
644
645 maybe_grow ();
646
647 char * from = m_data + m_nb_read;
648 size_t to_read = m_size - m_nb_read;
649 size_t nb_read = fread (ptr: from, size: 1, n: to_read, stream: m_fp);
650
651 if (ferror (stream: m_fp))
652 return false;
653
654 m_nb_read += nb_read;
655 return !!nb_read;
656}
657
658/* Read new data iff the cache needs to be filled with more data
659 coming from the file FP. Return TRUE iff the cache was filled with
660 mode data. */
661
662bool
663file_cache_slot::maybe_read_data ()
664{
665 if (!needs_read_p ())
666 return false;
667 return read_data ();
668}
669
/* Helper function for file_cache_slot::get_next_line (): locate the
   end of the line starting at S within the LEN bytes available.
   Follows the memchr convention, i.e. returns a pointer to the last
   byte of the line terminator, or nullptr if no terminator was
   found.  Line endings are determined in the same manner as libcpp
   does: any of \n, \r\n, or \r ends a line.  */

static char *
find_end_of_line (char *s, size_t len)
{
  char *const limit = s + len;

  for (char *p = s; p != limit; p++)
    switch (*p)
      {
      case '\n':
        return p;

      case '\r':
        /* A \r in the very last position is not reported: we cannot
           tell whether it ends the file or merely ends what has been
           read so far, and must not split what may turn out to be a
           \r\n pair.  The caller deals with a file ending in \r.  */
        if (p + 1 == limit)
          return nullptr;
        return p[1] == '\n' ? p + 1 : p;

      default:
        break;
      }

  return nullptr;
}
700
701/* Read a new line from file FP, using C as a cache for the data
702 coming from the file. Upon successful completion, *LINE is set to
703 the beginning of the line found. *LINE points directly in the
704 line cache and is only valid until the next call of get_next_line.
705 *LINE_LEN is set to the length of the line. Note that the line
706 does not contain any terminal delimiter. This function returns
707 true if some data was read or process from the cache, false
708 otherwise. Note that subsequent calls to get_next_line might
709 make the content of *LINE invalid. */
710
711bool
712file_cache_slot::get_next_line (char **line, ssize_t *line_len)
713{
714 /* Fill the cache with data to process. */
715 maybe_read_data ();
716
717 size_t remaining_size = m_nb_read - m_line_start_idx;
718 if (remaining_size == 0)
719 /* There is no more data to process. */
720 return false;
721
722 char *line_start = m_data + m_line_start_idx;
723
724 char *next_line_start = NULL;
725 size_t len = 0;
726 char *line_end = find_end_of_line (s: line_start, len: remaining_size);
727 if (line_end == NULL)
728 {
729 /* We haven't found an end-of-line delimiter in the cache.
730 Fill the cache with more data from the file and look again. */
731 while (maybe_read_data ())
732 {
733 line_start = m_data + m_line_start_idx;
734 remaining_size = m_nb_read - m_line_start_idx;
735 line_end = find_end_of_line (s: line_start, len: remaining_size);
736 if (line_end != NULL)
737 {
738 next_line_start = line_end + 1;
739 break;
740 }
741 }
742 if (line_end == NULL)
743 {
744 /* We've loaded all the file into the cache and still no
745 terminator. Let's say the line ends up at one byte past the
746 end of the file. This is to stay consistent with the case
747 of when the line ends up with a terminator and line_end points to
748 that. That consistency is useful below in the len calculation.
749
750 If the file ends in a \r, we didn't identify it as a line
751 terminator above, so do that now instead. */
752 line_end = m_data + m_nb_read;
753 if (m_nb_read && line_end[-1] == '\r')
754 {
755 --line_end;
756 m_missing_trailing_newline = false;
757 }
758 else
759 m_missing_trailing_newline = true;
760 }
761 else
762 m_missing_trailing_newline = false;
763 }
764 else
765 {
766 next_line_start = line_end + 1;
767 m_missing_trailing_newline = false;
768 }
769
770 if (m_fp && ferror (stream: m_fp))
771 return false;
772
773 /* At this point, we've found the end of the of line. It either points to
774 the line terminator or to one byte after the last byte of the file. */
775 gcc_assert (line_end != NULL);
776
777 len = line_end - line_start;
778
779 if (m_line_start_idx < m_nb_read)
780 *line = line_start;
781
782 ++m_line_num;
783
784 /* Before we update our line record, make sure the hint about the
785 total number of lines of the file is correct. If it's not, then
786 we give up recording line boundaries from now on. */
787 bool update_line_record = true;
788 if (m_line_num > m_total_lines)
789 update_line_record = false;
790
791 /* Now update our line record so that re-reading lines from the
792 before m_line_start_idx is faster. */
793 if (update_line_record
794 && m_line_record.length () < line_record_size)
795 {
796 /* If the file lines fits in the line record, we just record all
797 its lines ...*/
798 if (m_total_lines <= line_record_size
799 && m_line_num > m_line_record.length ())
800 m_line_record.safe_push
801 (obj: file_cache_slot::line_info (m_line_num,
802 m_line_start_idx,
803 line_end - m_data));
804 else if (m_total_lines > line_record_size)
805 {
806 /* ... otherwise, we just scale total_lines down to
807 (line_record_size lines. */
808 size_t n = (m_line_num * line_record_size) / m_total_lines;
809 if (m_line_record.length () == 0
810 || n >= m_line_record.length ())
811 m_line_record.safe_push
812 (obj: file_cache_slot::line_info (m_line_num,
813 m_line_start_idx,
814 line_end - m_data));
815 }
816 }
817
818 /* Update m_line_start_idx so that it points to the next line to be
819 read. */
820 if (next_line_start)
821 m_line_start_idx = next_line_start - m_data;
822 else
823 /* We didn't find any terminal '\n'. Let's consider that the end
824 of line is the end of the data in the cache. The next
825 invocation of get_next_line will either read more data from the
826 underlying file or return false early because we've reached the
827 end of the file. */
828 m_line_start_idx = m_nb_read;
829
830 *line_len = len;
831
832 return true;
833}
834
835/* Consume the next bytes coming from the cache (or from its
836 underlying file if there are remaining unread bytes in the file)
837 until we reach the next end-of-line (or end-of-file). There is no
838 copying from the cache involved. Return TRUE upon successful
839 completion. */
840
841bool
842file_cache_slot::goto_next_line ()
843{
844 char *l;
845 ssize_t len;
846
847 return get_next_line (line: &l, line_len: &len);
848}
849
850/* Read an arbitrary line number LINE_NUM from the file cached in C.
851 If the line was read successfully, *LINE points to the beginning
852 of the line in the file cache and *LINE_LEN is the length of the
853 line. *LINE is not nul-terminated, but may contain zero bytes.
854 *LINE is only valid until the next call of read_line_num.
855 This function returns bool if a line was read. */
856
857bool
858file_cache_slot::read_line_num (size_t line_num,
859 char ** line, ssize_t *line_len)
860{
861 gcc_assert (line_num > 0);
862
863 if (line_num <= m_line_num)
864 {
865 /* We've been asked to read lines that are before m_line_num.
866 So lets use our line record (if it's not empty) to try to
867 avoid re-reading the file from the beginning again. */
868
869 if (m_line_record.is_empty ())
870 {
871 m_line_start_idx = 0;
872 m_line_num = 0;
873 }
874 else
875 {
876 file_cache_slot::line_info *i = NULL;
877 if (m_total_lines <= line_record_size)
878 {
879 /* In languages where the input file is not totally
880 preprocessed up front, the m_total_lines hint
881 can be smaller than the number of lines of the
882 file. In that case, only the first
883 m_total_lines have been recorded.
884
885 Otherwise, the first m_total_lines we've read have
886 their start/end recorded here. */
887 i = (line_num <= m_total_lines)
888 ? &m_line_record[line_num - 1]
889 : &m_line_record[m_total_lines - 1];
890 gcc_assert (i->line_num <= line_num);
891 }
892 else
893 {
894 /* So the file had more lines than our line record
895 size. Thus the number of lines we've recorded has
896 been scaled down to line_record_size. Let's
897 pick the start/end of the recorded line that is
898 closest to line_num. */
899 size_t n = (line_num <= m_total_lines)
900 ? line_num * line_record_size / m_total_lines
901 : m_line_record.length () - 1;
902 if (n < m_line_record.length ())
903 {
904 i = &m_line_record[n];
905 gcc_assert (i->line_num <= line_num);
906 }
907 }
908
909 if (i && i->line_num == line_num)
910 {
911 /* We have the start/end of the line. */
912 *line = m_data + i->start_pos;
913 *line_len = i->end_pos - i->start_pos;
914 return true;
915 }
916
917 if (i)
918 {
919 m_line_start_idx = i->start_pos;
920 m_line_num = i->line_num - 1;
921 }
922 else
923 {
924 m_line_start_idx = 0;
925 m_line_num = 0;
926 }
927 }
928 }
929
930 /* Let's walk from line m_line_num up to line_num - 1, without
931 copying any line. */
932 while (m_line_num < line_num - 1)
933 if (!goto_next_line ())
934 return false;
935
936 /* The line we want is the next one. Let's read and copy it back to
937 the caller. */
938 return get_next_line (line, line_len);
939}
940
941/* Return the physical source line that corresponds to FILE_PATH/LINE.
942 The line is not nul-terminated. The returned pointer is only
943 valid until the next call of location_get_source_line.
944 Note that the line can contain several null characters,
945 so the returned value's length has the actual length of the line.
946 If the function fails, a NULL char_span is returned. */
947
948char_span
949file_cache::get_source_line (const char *file_path, int line)
950{
951 char *buffer = NULL;
952 ssize_t len;
953
954 if (line == 0)
955 return char_span (NULL, 0);
956
957 if (file_path == NULL)
958 return char_span (NULL, 0);
959
960 file_cache_slot *c = lookup_or_add_file (file_path);
961 if (c == NULL)
962 return char_span (NULL, 0);
963
964 bool read = c->read_line_num (line_num: line, line: &buffer, line_len: &len);
965 if (!read)
966 return char_span (NULL, 0);
967
968 return char_span (buffer, len);
969}
970
971char_span
972location_get_source_line (const char *file_path, int line)
973{
974 diagnostic_file_cache_init ();
975 return global_dc->get_file_cache ()->get_source_line (file_path, line);
976}
977
978/* Return a NUL-terminated copy of the source text between two locations, or
979 NULL if the arguments are invalid. The caller is responsible for freeing
980 the return value. */
981
982char *
983get_source_text_between (location_t start, location_t end)
984{
985 expanded_location expstart =
986 expand_location_to_spelling_point (start, aspect: LOCATION_ASPECT_START);
987 expanded_location expend =
988 expand_location_to_spelling_point (end, aspect: LOCATION_ASPECT_FINISH);
989
990 /* If the locations are in different files or the end comes before the
991 start, give up and return nothing. */
992 if (!expstart.file || !expend.file)
993 return NULL;
994 if (strcmp (s1: expstart.file, s2: expend.file) != 0)
995 return NULL;
996 if (expstart.line > expend.line)
997 return NULL;
998 if (expstart.line == expend.line
999 && expstart.column > expend.column)
1000 return NULL;
1001 /* These aren't real column numbers, give up. */
1002 if (expstart.column == 0 || expend.column == 0)
1003 return NULL;
1004
1005 /* For a single line we need to trim both edges. */
1006 if (expstart.line == expend.line)
1007 {
1008 char_span line = location_get_source_line (file_path: expstart.file, line: expstart.line);
1009 if (line.length () < 1)
1010 return NULL;
1011 int s = expstart.column - 1;
1012 int len = expend.column - s;
1013 if (line.length () < (size_t)expend.column)
1014 return NULL;
1015 return line.subspan (offset: s, n_elts: len).xstrdup ();
1016 }
1017
1018 struct obstack buf_obstack;
1019 obstack_init (&buf_obstack);
1020
1021 /* Loop through all lines in the range and append each to buf; may trim
1022 parts of the start and end lines off depending on column values. */
1023 for (int lnum = expstart.line; lnum <= expend.line; ++lnum)
1024 {
1025 char_span line = location_get_source_line (file_path: expstart.file, line: lnum);
1026 if (line.length () < 1 && (lnum != expstart.line && lnum != expend.line))
1027 continue;
1028
1029 /* For the first line in the range, only start at expstart.column */
1030 if (lnum == expstart.line)
1031 {
1032 unsigned off = expstart.column - 1;
1033 if (line.length () < off)
1034 return NULL;
1035 line = line.subspan (offset: off, n_elts: line.length() - off);
1036 }
1037 /* For the last line, don't go past expend.column */
1038 else if (lnum == expend.line)
1039 {
1040 if (line.length () < (size_t)expend.column)
1041 return NULL;
1042 line = line.subspan (offset: 0, n_elts: expend.column);
1043 }
1044
1045 /* Combine spaces at the beginning of later lines. */
1046 if (lnum > expstart.line)
1047 {
1048 unsigned off;
1049 for (off = 0; off < line.length(); ++off)
1050 if (line[off] != ' ' && line[off] != '\t')
1051 break;
1052 if (off > 0)
1053 {
1054 obstack_1grow (&buf_obstack, ' ');
1055 line = line.subspan (offset: off, n_elts: line.length() - off);
1056 }
1057 }
1058
1059 /* This does not include any trailing newlines. */
1060 obstack_grow (&buf_obstack, line.get_buffer (), line.length ());
1061 }
1062
1063 /* NUL-terminate and finish the buf obstack. */
1064 obstack_1grow (&buf_obstack, 0);
1065 const char *buf = (const char *) obstack_finish (&buf_obstack);
1066
1067 return xstrdup (buf);
1068}
1069
1070
1071char_span
1072file_cache::get_source_file_content (const char *file_path)
1073{
1074 file_cache_slot *c = lookup_or_add_file (file_path);
1075 if (c == nullptr)
1076 return char_span (nullptr, 0);
1077 return c->get_full_file_content ();
1078}
1079
1080
1081/* Get a borrowed char_span to the full content of FILE_PATH
1082 as decoded according to the input charset, encoded as UTF-8. */
1083
1084char_span
1085get_source_file_content (const char *file_path)
1086{
1087 diagnostic_file_cache_init ();
1088 return global_dc->get_file_cache ()->get_source_file_content (file_path);
1089}
1090
1091/* Determine if FILE_PATH missing a trailing newline on its final line.
1092 Only valid to call once all of the file has been loaded, by
1093 requesting a line number beyond the end of the file. */
1094
1095bool
1096location_missing_trailing_newline (const char *file_path)
1097{
1098 diagnostic_file_cache_init ();
1099
1100 file_cache_slot *c = global_dc->get_file_cache ()->lookup_or_add_file (file_path);
1101 if (c == NULL)
1102 return false;
1103
1104 return c->missing_trailing_newline_p ();
1105}
1106
1107/* Test if the location originates from the spelling location of a
1108 builtin-tokens. That is, return TRUE if LOC is a (possibly
1109 virtual) location of a built-in token that appears in the expansion
1110 list of a macro. Please note that this function also works on
1111 tokens that result from built-in tokens. For instance, the
1112 function would return true if passed a token "4" that is the result
1113 of the expansion of the built-in __LINE__ macro. */
1114bool
1115is_location_from_builtin_token (location_t loc)
1116{
1117 const line_map_ordinary *map = NULL;
1118 loc = linemap_resolve_location (line_table, loc,
1119 lrk: LRK_SPELLING_LOCATION, loc_map: &map);
1120 return loc == BUILTINS_LOCATION;
1121}
1122
1123/* Expand the source location LOC into a human readable location. If
1124 LOC is virtual, it resolves to the expansion point of the involved
1125 macro. If LOC resolves to a builtin location, the file name of the
1126 readable location is set to the string "<built-in>". */
1127
1128expanded_location
1129expand_location (location_t loc)
1130{
1131 return expand_location_1 (set: line_table, loc, /*expansion_point_p=*/true,
1132 aspect: LOCATION_ASPECT_CARET);
1133}
1134
1135/* Expand the source location LOC into a human readable location. If
1136 LOC is virtual, it resolves to the expansion location of the
1137 relevant macro. If LOC resolves to a builtin location, the file
1138 name of the readable location is set to the string
1139 "<built-in>". */
1140
1141expanded_location
1142expand_location_to_spelling_point (location_t loc,
1143 enum location_aspect aspect)
1144{
1145 return expand_location_1 (set: line_table, loc, /*expansion_point_p=*/false,
1146 aspect);
1147}
1148
1149/* The rich_location class within libcpp requires a way to expand
1150 location_t instances, and relies on the client code
1151 providing a symbol named
1152 linemap_client_expand_location_to_spelling_point
1153 to do this.
1154
1155 This is the implementation for libcommon.a (all host binaries),
1156 which simply calls into expand_location_1. */
1157
1158expanded_location
1159linemap_client_expand_location_to_spelling_point (const line_maps *set,
1160 location_t loc,
1161 enum location_aspect aspect)
1162{
1163 return expand_location_1 (set, loc, /*expansion_point_p=*/false, aspect);
1164}
1165
1166
1167/* If LOCATION is in a system header and if it is a virtual location
1168 for a token coming from the expansion of a macro, unwind it to
1169 the location of the expansion point of the macro. If the expansion
1170 point is also in a system header return the original LOCATION.
1171 Otherwise, return the location of the expansion point.
1172
1173 This is used for instance when we want to emit diagnostics about a
1174 token that may be located in a macro that is itself defined in a
1175 system header, for example, for the NULL macro. In such a case, if
1176 LOCATION were passed directly to diagnostic functions such as
1177 warning_at, the diagnostic would be suppressed (unless
1178 -Wsystem-headers). */
1179
1180location_t
1181expansion_point_location_if_in_system_header (location_t location)
1182{
1183 if (!in_system_header_at (loc: location))
1184 return location;
1185
1186 location_t xloc = linemap_resolve_location (line_table, loc: location,
1187 lrk: LRK_MACRO_EXPANSION_POINT,
1188 NULL);
1189 return in_system_header_at (loc: xloc) ? location : xloc;
1190}
1191
1192/* If LOCATION is a virtual location for a token coming from the expansion
1193 of a macro, unwind to the location of the expansion point of the macro. */
1194
1195location_t
1196expansion_point_location (location_t location)
1197{
1198 return linemap_resolve_location (line_table, loc: location,
1199 lrk: LRK_MACRO_EXPANSION_POINT, NULL);
1200}
1201
1202/* Construct a location with caret at CARET, ranging from START to
1203 FINISH.
1204
1205 For example, consider:
1206
1207 11111111112
1208 12345678901234567890
1209 522
1210 523 return foo + bar;
1211 ~~~~^~~~~
1212 524
1213
1214 The location's caret is at the "+", line 523 column 15, but starts
1215 earlier, at the "f" of "foo" at column 11. The finish is at the "r"
1216 of "bar" at column 19. */
1217
1218location_t
1219make_location (location_t caret, location_t start, location_t finish)
1220{
1221 return line_table->make_location (caret, start, finish);
1222}
1223
1224/* Same as above, but taking a source range rather than two locations. */
1225
1226location_t
1227make_location (location_t caret, source_range src_range)
1228{
1229 location_t pure_loc = get_pure_location (loc: caret);
1230 return line_table->get_or_create_combined_loc (locus: pure_loc, src_range,
1231 data: nullptr, discriminator: 0);
1232}
1233
1234/* An expanded_location stores the column in byte units. This function
1235 converts that column to display units. That requires reading the associated
1236 source line in order to calculate the display width. If that cannot be done
1237 for any reason, then returns the byte column as a fallback. */
1238int
1239location_compute_display_column (expanded_location exploc,
1240 const cpp_char_column_policy &policy)
1241{
1242 if (!(exploc.file && *exploc.file && exploc.line && exploc.column))
1243 return exploc.column;
1244 char_span line = location_get_source_line (file_path: exploc.file, line: exploc.line);
1245 /* If line is NULL, this function returns exploc.column which is the
1246 desired fallback. */
1247 return cpp_byte_column_to_display_column (data: line.get_buffer (), data_length: line.length (),
1248 column: exploc.column, policy);
1249}
1250
1251/* Dump statistics to stderr about the memory usage of the line_table
1252 set of line maps. This also displays some statistics about macro
1253 expansion. */
1254
1255void
1256dump_line_table_statistics (void)
1257{
1258 struct linemap_stats s;
1259 long total_used_map_size,
1260 macro_maps_size,
1261 total_allocated_map_size;
1262
1263 memset (s: &s, c: 0, n: sizeof (s));
1264
1265 linemap_get_statistics (line_table, &s);
1266
1267 macro_maps_size = s.macro_maps_used_size
1268 + s.macro_maps_locations_size;
1269
1270 total_allocated_map_size = s.ordinary_maps_allocated_size
1271 + s.macro_maps_allocated_size
1272 + s.macro_maps_locations_size;
1273
1274 total_used_map_size = s.ordinary_maps_used_size
1275 + s.macro_maps_used_size
1276 + s.macro_maps_locations_size;
1277
1278 fprintf (stderr, format: "Number of expanded macros: %5ld\n",
1279 s.num_expanded_macros);
1280 if (s.num_expanded_macros != 0)
1281 fprintf (stderr, format: "Average number of tokens per macro expansion: %5ld\n",
1282 s.num_macro_tokens / s.num_expanded_macros);
1283 fprintf (stderr,
1284 format: "\nLine Table allocations during the "
1285 "compilation process\n");
1286 fprintf (stderr, format: "Number of ordinary maps used: " PRsa (5) "\n",
1287 SIZE_AMOUNT (s.num_ordinary_maps_used));
1288 fprintf (stderr, format: "Ordinary map used size: " PRsa (5) "\n",
1289 SIZE_AMOUNT (s.ordinary_maps_used_size));
1290 fprintf (stderr, format: "Number of ordinary maps allocated: " PRsa (5) "\n",
1291 SIZE_AMOUNT (s.num_ordinary_maps_allocated));
1292 fprintf (stderr, format: "Ordinary maps allocated size: " PRsa (5) "\n",
1293 SIZE_AMOUNT (s.ordinary_maps_allocated_size));
1294 fprintf (stderr, format: "Number of macro maps used: " PRsa (5) "\n",
1295 SIZE_AMOUNT (s.num_macro_maps_used));
1296 fprintf (stderr, format: "Macro maps used size: " PRsa (5) "\n",
1297 SIZE_AMOUNT (s.macro_maps_used_size));
1298 fprintf (stderr, format: "Macro maps locations size: " PRsa (5) "\n",
1299 SIZE_AMOUNT (s.macro_maps_locations_size));
1300 fprintf (stderr, format: "Macro maps size: " PRsa (5) "\n",
1301 SIZE_AMOUNT (macro_maps_size));
1302 fprintf (stderr, format: "Duplicated maps locations size: " PRsa (5) "\n",
1303 SIZE_AMOUNT (s.duplicated_macro_maps_locations_size));
1304 fprintf (stderr, format: "Total allocated maps size: " PRsa (5) "\n",
1305 SIZE_AMOUNT (total_allocated_map_size));
1306 fprintf (stderr, format: "Total used maps size: " PRsa (5) "\n",
1307 SIZE_AMOUNT (total_used_map_size));
1308 fprintf (stderr, format: "Ad-hoc table size: " PRsa (5) "\n",
1309 SIZE_AMOUNT (s.adhoc_table_size));
1310 fprintf (stderr, format: "Ad-hoc table entries used: " PRsa (5) "\n",
1311 SIZE_AMOUNT (s.adhoc_table_entries_used));
1312 fprintf (stderr, format: "optimized_ranges: " PRsa (5) "\n",
1313 SIZE_AMOUNT (line_table->m_num_optimized_ranges));
1314 fprintf (stderr, format: "unoptimized_ranges: " PRsa (5) "\n",
1315 SIZE_AMOUNT (line_table->m_num_unoptimized_ranges));
1316
1317 fprintf (stderr, format: "\n");
1318}
1319
1320/* Get location one beyond the final location in ordinary map IDX. */
1321
1322static location_t
1323get_end_location (class line_maps *set, unsigned int idx)
1324{
1325 if (idx == LINEMAPS_ORDINARY_USED (set) - 1)
1326 return set->highest_location;
1327
1328 struct line_map *next_map = LINEMAPS_ORDINARY_MAP_AT (set, index: idx + 1);
1329 return MAP_START_LOCATION (map: next_map);
1330}
1331
/* Helper function for write_digit_row.
   Write the units digit of DIGIT to STREAM as a single character in
   the range '0'..'9'.

   The caller passes a quotient of unsigned location_t values through
   this int parameter, so DIGIT is reinterpreted as unsigned before the
   remainder is taken.  With a signed remainder, a value that became
   negative on conversion to int would make "digit % 10" negative and
   emit a non-digit character.  */

static void
write_digit (FILE *stream, int digit)
{
  const unsigned int units = (unsigned int) digit % 10;
  fputc ((int) ('0' + units), stream);
}
1339
1340/* Helper function for dump_location_info.
1341 Write a row of numbers to STREAM, numbering a source line,
1342 giving the units, tens, hundreds etc of the column number. */
1343
1344static void
1345write_digit_row (FILE *stream, int indent,
1346 const line_map_ordinary *map,
1347 location_t loc, int max_col, int divisor)
1348{
1349 fprintf (stream: stream, format: "%*c", indent, ' ');
1350 fprintf (stream: stream, format: "|");
1351 for (int column = 1; column < max_col; column++)
1352 {
1353 location_t column_loc = loc + (column << map->m_range_bits);
1354 write_digit (stream, digit: column_loc / divisor);
1355 }
1356 fprintf (stream: stream, format: "\n");
1357}
1358
1359/* Write a half-closed (START) / half-open (END) interval of
1360 location_t to STREAM. */
1361
1362static void
1363dump_location_range (FILE *stream,
1364 location_t start, location_t end)
1365{
1366 fprintf (stream: stream,
1367 format: " location_t interval: %u <= loc < %u\n",
1368 start, end);
1369}
1370
1371/* Write a labelled description of a half-closed (START) / half-open (END)
1372 interval of location_t to STREAM. */
1373
1374static void
1375dump_labelled_location_range (FILE *stream,
1376 const char *name,
1377 location_t start, location_t end)
1378{
1379 fprintf (stream: stream, format: "%s\n", name);
1380 dump_location_range (stream, start, end);
1381 fprintf (stream: stream, format: "\n");
1382}
1383
1384/* Write a visualization of the locations in the line_table to STREAM. */
1385
1386void
1387dump_location_info (FILE *stream)
1388{
1389 /* Visualize the reserved locations. */
1390 dump_labelled_location_range (stream, name: "RESERVED LOCATIONS",
1391 start: 0, end: RESERVED_LOCATION_COUNT);
1392
1393 /* Visualize the ordinary line_map instances, rendering the sources. */
1394 for (unsigned int idx = 0; idx < LINEMAPS_ORDINARY_USED (set: line_table); idx++)
1395 {
1396 location_t end_location = get_end_location (set: line_table, idx);
1397 /* half-closed: doesn't include this one. */
1398
1399 const line_map_ordinary *map
1400 = LINEMAPS_ORDINARY_MAP_AT (set: line_table, index: idx);
1401 fprintf (stream: stream, format: "ORDINARY MAP: %i\n", idx);
1402 dump_location_range (stream,
1403 start: MAP_START_LOCATION (map), end: end_location);
1404 fprintf (stream: stream, format: " file: %s\n", ORDINARY_MAP_FILE_NAME (ord_map: map));
1405 fprintf (stream: stream, format: " starting at line: %i\n",
1406 ORDINARY_MAP_STARTING_LINE_NUMBER (ord_map: map));
1407 fprintf (stream: stream, format: " column and range bits: %i\n",
1408 map->m_column_and_range_bits);
1409 fprintf (stream: stream, format: " column bits: %i\n",
1410 map->m_column_and_range_bits - map->m_range_bits);
1411 fprintf (stream: stream, format: " range bits: %i\n",
1412 map->m_range_bits);
1413 const char * reason;
1414 switch (map->reason) {
1415 case LC_ENTER:
1416 reason = "LC_ENTER";
1417 break;
1418 case LC_LEAVE:
1419 reason = "LC_LEAVE";
1420 break;
1421 case LC_RENAME:
1422 reason = "LC_RENAME";
1423 break;
1424 case LC_RENAME_VERBATIM:
1425 reason = "LC_RENAME_VERBATIM";
1426 break;
1427 case LC_ENTER_MACRO:
1428 reason = "LC_RENAME_MACRO";
1429 break;
1430 default:
1431 reason = "Unknown";
1432 }
1433 fprintf (stream: stream, format: " reason: %d (%s)\n", map->reason, reason);
1434
1435 const line_map_ordinary *includer_map
1436 = linemap_included_from_linemap (set: line_table, map);
1437 fprintf (stream: stream, format: " included from location: %d",
1438 linemap_included_from (ord_map: map));
1439 if (includer_map) {
1440 fprintf (stream: stream, format: " (in ordinary map %d)",
1441 int (includer_map - line_table->info_ordinary.maps));
1442 }
1443 fprintf (stream: stream, format: "\n");
1444
1445 /* Render the span of source lines that this "map" covers. */
1446 for (location_t loc = MAP_START_LOCATION (map);
1447 loc < end_location;
1448 loc += (1 << map->m_range_bits) )
1449 {
1450 gcc_assert (pure_location_p (line_table, loc) );
1451
1452 expanded_location exploc
1453 = linemap_expand_location (line_table, map, loc);
1454
1455 if (exploc.column == 0)
1456 {
1457 /* Beginning of a new source line: draw the line. */
1458
1459 char_span line_text = location_get_source_line (file_path: exploc.file,
1460 line: exploc.line);
1461 if (!line_text)
1462 break;
1463 fprintf (stream: stream,
1464 format: "%s:%3i|loc:%5i|%.*s\n",
1465 exploc.file, exploc.line,
1466 loc,
1467 (int)line_text.length (), line_text.get_buffer ());
1468
1469 /* "loc" is at column 0, which means "the whole line".
1470 Render the locations *within* the line, by underlining
1471 it, showing the location_t numeric values
1472 at each column. */
1473 size_t max_col = (1 << map->m_column_and_range_bits) - 1;
1474 if (max_col > line_text.length ())
1475 max_col = line_text.length () + 1;
1476
1477 int len_lnum = num_digits (exploc.line);
1478 if (len_lnum < 3)
1479 len_lnum = 3;
1480 int len_loc = num_digits (loc);
1481 if (len_loc < 5)
1482 len_loc = 5;
1483
1484 int indent = 6 + strlen (s: exploc.file) + len_lnum + len_loc;
1485
1486 /* Thousands. */
1487 if (end_location > 999)
1488 write_digit_row (stream, indent, map, loc, max_col, divisor: 1000);
1489
1490 /* Hundreds. */
1491 if (end_location > 99)
1492 write_digit_row (stream, indent, map, loc, max_col, divisor: 100);
1493
1494 /* Tens. */
1495 write_digit_row (stream, indent, map, loc, max_col, divisor: 10);
1496
1497 /* Units. */
1498 write_digit_row (stream, indent, map, loc, max_col, divisor: 1);
1499 }
1500 }
1501 fprintf (stream: stream, format: "\n");
1502 }
1503
1504 /* Visualize unallocated values. */
1505 dump_labelled_location_range (stream, name: "UNALLOCATED LOCATIONS",
1506 start: line_table->highest_location,
1507 end: LINEMAPS_MACRO_LOWEST_LOCATION (set: line_table));
1508
1509 /* Visualize the macro line_map instances, rendering the sources. */
1510 for (unsigned int i = 0; i < LINEMAPS_MACRO_USED (set: line_table); i++)
1511 {
1512 /* Each macro map that is allocated owns location_t values
1513 that are *lower* that the one before them.
1514 Hence it's meaningful to view them either in order of ascending
1515 source locations, or in order of ascending macro map index. */
1516 const bool ascending_location_ts = true;
1517 unsigned int idx = (ascending_location_ts
1518 ? (LINEMAPS_MACRO_USED (set: line_table) - (i + 1))
1519 : i);
1520 const line_map_macro *map = LINEMAPS_MACRO_MAP_AT (set: line_table, index: idx);
1521 fprintf (stream: stream, format: "MACRO %i: %s (%u tokens)\n",
1522 idx,
1523 linemap_map_get_macro_name (map),
1524 MACRO_MAP_NUM_MACRO_TOKENS (macro_map: map));
1525 dump_location_range (stream,
1526 start: map->start_location,
1527 end: (map->start_location
1528 + MACRO_MAP_NUM_MACRO_TOKENS (macro_map: map)));
1529 inform (map->get_expansion_point_location (),
1530 "expansion point is location %i",
1531 map->get_expansion_point_location ());
1532 fprintf (stream: stream, format: " map->start_location: %u\n",
1533 map->start_location);
1534
1535 fprintf (stream: stream, format: " macro_locations:\n");
1536 for (unsigned int i = 0; i < MACRO_MAP_NUM_MACRO_TOKENS (macro_map: map); i++)
1537 {
1538 location_t x = MACRO_MAP_LOCATIONS (macro_map: map)[2 * i];
1539 location_t y = MACRO_MAP_LOCATIONS (macro_map: map)[(2 * i) + 1];
1540
1541 /* linemap_add_macro_token encodes token numbers in an expansion
1542 by putting them after MAP_START_LOCATION. */
1543
1544 /* I'm typically seeing 4 uninitialized entries at the end of
1545 0xafafafaf.
1546 This appears to be due to macro.cc:replace_args
1547 adding 2 extra args for padding tokens; presumably there may
1548 be a leading and/or trailing padding token injected,
1549 each for 2 more location slots.
1550 This would explain there being up to 4 location_ts slots
1551 that may be uninitialized. */
1552
1553 fprintf (stream: stream, format: " %u: %u, %u\n",
1554 i,
1555 x,
1556 y);
1557 if (x == y)
1558 {
1559 if (x < MAP_START_LOCATION (map))
1560 inform (x, "token %u has %<x-location == y-location == %u%>",
1561 i, x);
1562 else
1563 fprintf (stream: stream,
1564 format: "x-location == y-location == %u encodes token # %u\n",
1565 x, x - MAP_START_LOCATION (map));
1566 }
1567 else
1568 {
1569 inform (x, "token %u has %<x-location == %u%>", i, x);
1570 inform (x, "token %u has %<y-location == %u%>", i, y);
1571 }
1572 }
1573 fprintf (stream: stream, format: "\n");
1574 }
1575
1576 /* It appears that MAX_LOCATION_T itself is never assigned to a
1577 macro map, presumably due to an off-by-one error somewhere
1578 between the logic in linemap_enter_macro and
1579 LINEMAPS_MACRO_LOWEST_LOCATION. */
1580 dump_labelled_location_range (stream, name: "MAX_LOCATION_T",
1581 start: MAX_LOCATION_T,
1582 end: MAX_LOCATION_T + 1);
1583
1584 /* Visualize ad-hoc values. */
1585 dump_labelled_location_range (stream, name: "AD-HOC LOCATIONS",
1586 start: MAX_LOCATION_T + 1, UINT_MAX);
1587}
1588
1589/* string_concat's constructor. */
1590
1591string_concat::string_concat (int num, location_t *locs)
1592 : m_num (num)
1593{
1594 m_locs = ggc_vec_alloc <location_t> (c: num);
1595 for (int i = 0; i < num; i++)
1596 m_locs[i] = locs[i];
1597}
1598
1599/* string_concat_db's constructor. */
1600
1601string_concat_db::string_concat_db ()
1602{
1603 m_table = hash_map <location_hash, string_concat *>::create_ggc (size: 64);
1604}
1605
1606/* Record that a string concatenation occurred, covering NUM
1607 string literal tokens. LOCS is an array of size NUM, containing the
1608 locations of the tokens. A copy of LOCS is taken. */
1609
1610void
1611string_concat_db::record_string_concatenation (int num, location_t *locs)
1612{
1613 gcc_assert (num > 1);
1614 gcc_assert (locs);
1615
1616 location_t key_loc = get_key_loc (loc: locs[0]);
1617 /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values:
1618 any data now recorded under key 'key_loc' would be overwritten by a
1619 subsequent call with the same key 'key_loc'. */
1620 if (RESERVED_LOCATION_P (key_loc))
1621 return;
1622
1623 string_concat *concat
1624 = new (ggc_alloc <string_concat> ()) string_concat (num, locs);
1625 m_table->put (k: key_loc, v: concat);
1626}
1627
1628/* Determine if LOC was the location of the initial token of a
1629 concatenation of string literal tokens.
1630 If so, *OUT_NUM is written to with the number of tokens, and
1631 *OUT_LOCS with the location of an array of locations of the
1632 tokens, and return true. *OUT_LOCS is a borrowed pointer to
1633 storage owned by the string_concat_db.
1634 Otherwise, return false. */
1635
1636bool
1637string_concat_db::get_string_concatenation (location_t loc,
1638 int *out_num,
1639 location_t **out_locs)
1640{
1641 gcc_assert (out_num);
1642 gcc_assert (out_locs);
1643
1644 location_t key_loc = get_key_loc (loc);
1645 /* We don't record data for 'RESERVED_LOCATION_P (key_loc)' key values; see
1646 discussion in 'string_concat_db::record_string_concatenation'. */
1647 if (RESERVED_LOCATION_P (key_loc))
1648 return false;
1649
1650 string_concat **concat = m_table->get (k: key_loc);
1651 if (!concat)
1652 return false;
1653
1654 *out_num = (*concat)->m_num;
1655 *out_locs =(*concat)->m_locs;
1656 return true;
1657}
1658
1659/* Internal function. Canonicalize LOC into a form suitable for
1660 use as a key within the database, stripping away macro expansion,
1661 ad-hoc information, and range information, using the location of
1662 the start of LOC within an ordinary linemap. */
1663
1664location_t
1665string_concat_db::get_key_loc (location_t loc)
1666{
1667 loc = linemap_resolve_location (line_table, loc, lrk: LRK_SPELLING_LOCATION,
1668 NULL);
1669
1670 loc = get_range_from_loc (set: line_table, loc).m_start;
1671
1672 return loc;
1673}
1674
1675/* Helper class for use within get_substring_ranges_for_loc.
1676 An vec of cpp_string with responsibility for releasing all of the
1677 str->text for each str in the vector. */
1678
1679class auto_cpp_string_vec : public auto_vec <cpp_string>
1680{
1681 public:
1682 auto_cpp_string_vec (int alloc)
1683 : auto_vec <cpp_string> (alloc) {}
1684
1685 ~auto_cpp_string_vec ()
1686 {
1687 /* Clean up the copies within this vec. */
1688 int i;
1689 cpp_string *str;
1690 FOR_EACH_VEC_ELT (*this, i, str)
1691 free (ptr: const_cast <unsigned char *> (str->text));
1692 }
1693};
1694
1695/* Attempt to populate RANGES with source location information on the
1696 individual characters within the string literal found at STRLOC.
1697 If CONCATS is non-NULL, then any string literals that the token at
1698 STRLOC was concatenated with are also added to RANGES.
1699
1700 Return NULL if successful, or an error message if any errors occurred (in
1701 which case RANGES may be only partially populated and should not
1702 be used).
1703
1704 This is implemented by re-parsing the relevant source line(s). */
1705
1706static const char *
1707get_substring_ranges_for_loc (cpp_reader *pfile,
1708 string_concat_db *concats,
1709 location_t strloc,
1710 enum cpp_ttype type,
1711 cpp_substring_ranges &ranges)
1712{
1713 gcc_assert (pfile);
1714
1715 if (strloc == UNKNOWN_LOCATION)
1716 return "unknown location";
1717
1718 /* Reparsing the strings requires accurate location information.
1719 If -ftrack-macro-expansion has been overridden from its default
1720 of 2, then we might have a location of a macro expansion point,
1721 rather than the location of the literal itself.
1722 Avoid this by requiring that we have full macro expansion tracking
1723 for substring locations to be available. */
1724 if (cpp_get_options (pfile)->track_macro_expansion != 2)
1725 return "track_macro_expansion != 2";
1726
1727 /* If #line or # 44 "file"-style directives are present, then there's
1728 no guarantee that the line numbers we have can be used to locate
1729 the strings. For example, we might have a .i file with # directives
1730 pointing back to lines within a .c file, but the .c file might
1731 have been edited since the .i file was created.
1732 In such a case, the safest course is to disable on-demand substring
1733 locations. */
1734 if (line_table->seen_line_directive)
1735 return "seen line directive";
1736
1737 /* If string concatenation has occurred at STRLOC, get the locations
1738 of all of the literal tokens making up the compound string.
1739 Otherwise, just use STRLOC. */
1740 int num_locs = 1;
1741 location_t *strlocs = &strloc;
1742 if (concats)
1743 concats->get_string_concatenation (loc: strloc, out_num: &num_locs, out_locs: &strlocs);
1744
1745 auto_cpp_string_vec strs (num_locs);
1746 auto_vec <cpp_string_location_reader> loc_readers (num_locs);
1747 for (int i = 0; i < num_locs; i++)
1748 {
1749 /* Get range of strloc. We will use it to locate the start and finish
1750 of the literal token within the line. */
1751 source_range src_range = get_range_from_loc (set: line_table, loc: strlocs[i]);
1752
1753 if (src_range.m_start >= LINEMAPS_MACRO_LOWEST_LOCATION (set: line_table))
1754 {
1755 /* If the string token was within a macro expansion, then we can
1756 cope with it for the simple case where we have a single token.
1757 Otherwise, bail out. */
1758 if (src_range.m_start != src_range.m_finish)
1759 return "macro expansion";
1760 }
1761 else
1762 {
1763 if (src_range.m_start >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1764 /* If so, we can't reliably determine where the token started within
1765 its line. */
1766 return "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS";
1767
1768 if (src_range.m_finish >= LINE_MAP_MAX_LOCATION_WITH_COLS)
1769 /* If so, we can't reliably determine where the token finished
1770 within its line. */
1771 return "range ends after LINE_MAP_MAX_LOCATION_WITH_COLS";
1772 }
1773
1774 expanded_location start
1775 = expand_location_to_spelling_point (loc: src_range.m_start,
1776 aspect: LOCATION_ASPECT_START);
1777 expanded_location finish
1778 = expand_location_to_spelling_point (loc: src_range.m_finish,
1779 aspect: LOCATION_ASPECT_FINISH);
1780 if (start.file != finish.file)
1781 return "range endpoints are in different files";
1782 if (start.line != finish.line)
1783 return "range endpoints are on different lines";
1784 if (start.column > finish.column)
1785 return "range endpoints are reversed";
1786
1787 char_span line = location_get_source_line (file_path: start.file, line: start.line);
1788 if (!line)
1789 return "unable to read source line";
1790
1791 /* Determine the location of the literal (including quotes
1792 and leading prefix chars, such as the 'u' in a u""
1793 token). */
1794 size_t literal_length = finish.column - start.column + 1;
1795
1796 /* Ensure that we don't crash if we got the wrong location. */
1797 if (start.column < 1)
1798 return "zero start column";
1799 if (line.length () < (start.column - 1 + literal_length))
1800 return "line is not wide enough";
1801
1802 char_span literal = line.subspan (offset: start.column - 1, n_elts: literal_length);
1803
1804 cpp_string from;
1805 from.len = literal_length;
1806 /* Make a copy of the literal, to avoid having to rely on
1807 the lifetime of the copy of the line within the cache.
1808 This will be released by the auto_cpp_string_vec dtor. */
1809 from.text = (unsigned char *)literal.xstrdup ();
1810 strs.safe_push (obj: from);
1811
1812 /* For very long lines, a new linemap could have started
1813 halfway through the token.
1814 Ensure that the loc_reader uses the linemap of the
1815 *end* of the token for its start location. */
1816 const line_map_ordinary *start_ord_map;
1817 linemap_resolve_location (line_table, loc: src_range.m_start,
1818 lrk: LRK_SPELLING_LOCATION, loc_map: &start_ord_map);
1819 const line_map_ordinary *final_ord_map;
1820 linemap_resolve_location (line_table, loc: src_range.m_finish,
1821 lrk: LRK_SPELLING_LOCATION, loc_map: &final_ord_map);
1822 if (start_ord_map == NULL || final_ord_map == NULL)
1823 return "failed to get ordinary maps";
1824 /* Bulletproofing. We ought to only have different ordinary maps
1825 for start vs finish due to line-length jumps. */
1826 if (start_ord_map != final_ord_map
1827 && start_ord_map->to_file != final_ord_map->to_file)
1828 return "start and finish are spelled in different ordinary maps";
1829 /* The file from linemap_resolve_location ought to match that from
1830 expand_location_to_spelling_point. */
1831 if (start_ord_map->to_file != start.file)
1832 return "mismatching file after resolving linemap";
1833
1834 location_t start_loc
1835 = linemap_position_for_line_and_column (set: line_table, final_ord_map,
1836 start.line, start.column);
1837
1838 cpp_string_location_reader loc_reader (start_loc, line_table);
1839 loc_readers.safe_push (obj: loc_reader);
1840 }
1841
1842 /* Rerun cpp_interpret_string, or rather, a modified version of it. */
1843 const char *err = cpp_interpret_string_ranges (pfile, from: strs.address (),
1844 loc_readers.address (),
1845 count: num_locs, out: &ranges, type);
1846 if (err)
1847 return err;
1848
1849 /* Success: "ranges" should now contain information on the string. */
1850 return NULL;
1851}
1852
1853/* Attempt to populate *OUT_LOC with source location information on the
1854 given characters within the string literal found at STRLOC.
1855 CARET_IDX, START_IDX, and END_IDX refer to offsets within the execution
1856 character set.
1857
1858 For example, given CARET_IDX = 4, START_IDX = 3, END_IDX = 7
1859 and string literal "012345\n789"
1860 *OUT_LOC is written to with:
1861 "012345\n789"
1862 ~^~~~~
1863
1864 If CONCATS is non-NULL, then any string literals that the token at
1865 STRLOC was concatenated with are also considered.
1866
1867 This is implemented by re-parsing the relevant source line(s).
1868
1869 Return NULL if successful, or an error message if any errors occurred.
1870 Error messages are intended for GCC developers (to help debugging) rather
1871 than for end-users. */
1872
1873const char *
1874get_location_within_string (cpp_reader *pfile,
1875 string_concat_db *concats,
1876 location_t strloc,
1877 enum cpp_ttype type,
1878 int caret_idx, int start_idx, int end_idx,
1879 location_t *out_loc)
1880{
1881 gcc_checking_assert (caret_idx >= 0);
1882 gcc_checking_assert (start_idx >= 0);
1883 gcc_checking_assert (end_idx >= 0);
1884 gcc_assert (out_loc);
1885
1886 cpp_substring_ranges ranges;
1887 const char *err
1888 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1889 if (err)
1890 return err;
1891
1892 if (caret_idx >= ranges.get_num_ranges ())
1893 return "caret_idx out of range";
1894 if (start_idx >= ranges.get_num_ranges ())
1895 return "start_idx out of range";
1896 if (end_idx >= ranges.get_num_ranges ())
1897 return "end_idx out of range";
1898
1899 *out_loc = make_location (caret: ranges.get_range (idx: caret_idx).m_start,
1900 start: ranges.get_range (idx: start_idx).m_start,
1901 finish: ranges.get_range (idx: end_idx).m_finish);
1902 return NULL;
1903}
1904
1905/* Associate the DISCRIMINATOR with LOCUS, and return a new locus. */
1906
1907location_t
1908location_with_discriminator (location_t locus, int discriminator)
1909{
1910 tree block = LOCATION_BLOCK (locus);
1911 source_range src_range = get_range_from_loc (set: line_table, loc: locus);
1912 locus = get_pure_location (loc: locus);
1913
1914 if (locus == UNKNOWN_LOCATION)
1915 return locus;
1916
1917 return line_table->get_or_create_combined_loc (locus, src_range, data: block,
1918 discriminator);
1919}
1920
1921/* Return TRUE if LOCUS represents a location with a discriminator. */
1922
1923bool
1924has_discriminator (location_t locus)
1925{
1926 return get_discriminator_from_loc (locus) != 0;
1927}
1928
1929/* Return the discriminator for LOCUS. */
1930
1931int
1932get_discriminator_from_loc (location_t locus)
1933{
1934 return get_discriminator_from_loc (set: line_table, loc: locus);
1935}
1936
1937#if CHECKING_P
1938
1939namespace selftest {
1940
1941/* Selftests of location handling. */
1942
1943/* Attempt to populate *OUT_RANGE with source location information on the
1944 given character within the string literal found at STRLOC.
1945 CHAR_IDX refers to an offset within the execution character set.
1946 If CONCATS is non-NULL, then any string literals that the token at
1947 STRLOC was concatenated with are also considered.
1948
1949 This is implemented by re-parsing the relevant source line(s).
1950
1951 Return NULL if successful, or an error message if any errors occurred.
1952 Error messages are intended for GCC developers (to help debugging) rather
1953 than for end-users. */
1954
1955static const char *
1956get_source_range_for_char (cpp_reader *pfile,
1957 string_concat_db *concats,
1958 location_t strloc,
1959 enum cpp_ttype type,
1960 int char_idx,
1961 source_range *out_range)
1962{
1963 gcc_checking_assert (char_idx >= 0);
1964 gcc_assert (out_range);
1965
1966 cpp_substring_ranges ranges;
1967 const char *err
1968 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1969 if (err)
1970 return err;
1971
1972 if (char_idx >= ranges.get_num_ranges ())
1973 return "char_idx out of range";
1974
1975 *out_range = ranges.get_range (idx: char_idx);
1976 return NULL;
1977}
1978
1979/* As get_source_range_for_char, but write to *OUT the number
1980 of ranges that are available. */
1981
1982static const char *
1983get_num_source_ranges_for_substring (cpp_reader *pfile,
1984 string_concat_db *concats,
1985 location_t strloc,
1986 enum cpp_ttype type,
1987 int *out)
1988{
1989 gcc_assert (out);
1990
1991 cpp_substring_ranges ranges;
1992 const char *err
1993 = get_substring_ranges_for_loc (pfile, concats, strloc, type, ranges);
1994
1995 if (err)
1996 return err;
1997
1998 *out = ranges.get_num_ranges ();
1999 return NULL;
2000}
2001
2002/* Selftests of location handling. */
2003
2004/* Verify that compare() on linenum_type handles comparisons over the full
2005 range of the type. */
2006
2007static void
2008test_linenum_comparisons ()
2009{
2010 linenum_type min_line (0);
2011 linenum_type max_line (0xffffffff);
2012 ASSERT_EQ (0, compare (min_line, min_line));
2013 ASSERT_EQ (0, compare (max_line, max_line));
2014
2015 ASSERT_GT (compare (max_line, min_line), 0);
2016 ASSERT_LT (compare (min_line, max_line), 0);
2017}
2018
2019/* Helper function for verifying location data: when location_t
2020 values are > LINE_MAP_MAX_LOCATION_WITH_COLS, they are treated
2021 as having column 0. */
2022
2023static bool
2024should_have_column_data_p (location_t loc)
2025{
2026 if (IS_ADHOC_LOC (loc))
2027 loc = get_location_from_adhoc_loc (line_table, loc);
2028 if (loc > LINE_MAP_MAX_LOCATION_WITH_COLS)
2029 return false;
2030 return true;
2031}
2032
2033/* Selftest for should_have_column_data_p. */
2034
2035static void
2036test_should_have_column_data_p ()
2037{
2038 ASSERT_TRUE (should_have_column_data_p (RESERVED_LOCATION_COUNT));
2039 ASSERT_TRUE
2040 (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS));
2041 ASSERT_FALSE
2042 (should_have_column_data_p (LINE_MAP_MAX_LOCATION_WITH_COLS + 1));
2043}
2044
2045/* Verify the result of LOCATION_FILE/LOCATION_LINE/LOCATION_COLUMN
2046 on LOC. */
2047
2048static void
2049assert_loceq (const char *exp_filename, int exp_linenum, int exp_colnum,
2050 location_t loc)
2051{
2052 ASSERT_STREQ (exp_filename, LOCATION_FILE (loc));
2053 ASSERT_EQ (exp_linenum, LOCATION_LINE (loc));
2054 /* If location_t values are sufficiently high, then column numbers
2055 will be unavailable and LOCATION_COLUMN (loc) will be 0.
2056 When close to the threshold, column numbers *may* be present: if
2057 the final linemap before the threshold contains a line that straddles
2058 the threshold, locations in that line have column information. */
2059 if (should_have_column_data_p (loc))
2060 ASSERT_EQ (exp_colnum, LOCATION_COLUMN (loc));
2061}
2062
/* Various selftests involve constructing a line table and one or more
   line maps within it.

   For maximum test coverage we want to run these tests with a variety
   of situations:
   - line_table->default_range_bits: some frontends use a non-zero value
     and others use zero
   - the fallback modes within line-map.cc: there are various threshold
     values for location_t beyond which line-map.cc changes
     behavior (disabling of the range-packing optimization, disabling
     of column-tracking).  We can exercise these by starting the line_table
     at interesting values at or near these thresholds.

   The following class describes a particular case within our test
   matrix: the default_range_bits to use, and the location_t value at
   which to start the line table (0 meaning "leave the default").  */

class line_table_case
{
public:
  int m_default_range_bits;
  int m_base_location;

  line_table_case (int default_range_bits, int base_location)
    : m_default_range_bits (default_range_bits),
      m_base_location (base_location)
  {
  }
};
2090
2091/* Constructor. Store the old value of line_table, and create a new
2092 one, using sane defaults. */
2093
2094line_table_test::line_table_test ()
2095{
2096 gcc_assert (saved_line_table == NULL);
2097 saved_line_table = line_table;
2098 line_table = ggc_alloc<line_maps> ();
2099 linemap_init (set: line_table, BUILTINS_LOCATION);
2100 gcc_assert (saved_line_table->m_reallocator);
2101 line_table->m_reallocator = saved_line_table->m_reallocator;
2102 gcc_assert (saved_line_table->m_round_alloc_size);
2103 line_table->m_round_alloc_size = saved_line_table->m_round_alloc_size;
2104 line_table->default_range_bits = 0;
2105}
2106
2107/* Constructor. Store the old value of line_table, and create a new
2108 one, using the sitation described in CASE_. */
2109
2110line_table_test::line_table_test (const line_table_case &case_)
2111{
2112 gcc_assert (saved_line_table == NULL);
2113 saved_line_table = line_table;
2114 line_table = ggc_alloc<line_maps> ();
2115 linemap_init (set: line_table, BUILTINS_LOCATION);
2116 gcc_assert (saved_line_table->m_reallocator);
2117 line_table->m_reallocator = saved_line_table->m_reallocator;
2118 gcc_assert (saved_line_table->m_round_alloc_size);
2119 line_table->m_round_alloc_size = saved_line_table->m_round_alloc_size;
2120 line_table->default_range_bits = case_.m_default_range_bits;
2121 if (case_.m_base_location)
2122 {
2123 line_table->highest_location = case_.m_base_location;
2124 line_table->highest_line = case_.m_base_location;
2125 }
2126}
2127
2128/* Destructor. Restore the old value of line_table. */
2129
2130line_table_test::~line_table_test ()
2131{
2132 gcc_assert (saved_line_table != NULL);
2133 line_table = saved_line_table;
2134 saved_line_table = NULL;
2135}
2136
2137/* Verify basic operation of ordinary linemaps. */
2138
2139static void
2140test_accessing_ordinary_linemaps (const line_table_case &case_)
2141{
2142 line_table_test ltt (case_);
2143
2144 /* Build a simple linemap describing some locations. */
2145 linemap_add (line_table, LC_ENTER, sysp: false, to_file: "foo.c", to_line: 0);
2146
2147 linemap_line_start (set: line_table, to_line: 1, max_column_hint: 100);
2148 location_t loc_a = linemap_position_for_column (line_table, 1);
2149 location_t loc_b = linemap_position_for_column (line_table, 23);
2150
2151 linemap_line_start (set: line_table, to_line: 2, max_column_hint: 100);
2152 location_t loc_c = linemap_position_for_column (line_table, 1);
2153 location_t loc_d = linemap_position_for_column (line_table, 17);
2154
2155 /* Example of a very long line. */
2156 linemap_line_start (set: line_table, to_line: 3, max_column_hint: 2000);
2157 location_t loc_e = linemap_position_for_column (line_table, 700);
2158
2159 /* Transitioning back to a short line. */
2160 linemap_line_start (set: line_table, to_line: 4, max_column_hint: 0);
2161 location_t loc_back_to_short = linemap_position_for_column (line_table, 100);
2162
2163 if (should_have_column_data_p (loc: loc_back_to_short))
2164 {
2165 /* Verify that we switched to short lines in the linemap. */
2166 line_map_ordinary *map = LINEMAPS_LAST_ORDINARY_MAP (set: line_table);
2167 ASSERT_EQ (7, map->m_column_and_range_bits - map->m_range_bits);
2168 }
2169
2170 /* Example of a line that will eventually be seen to be longer
2171 than LINE_MAP_MAX_COLUMN_NUMBER; the initially seen width is
2172 below that. */
2173 linemap_line_start (set: line_table, to_line: 5, max_column_hint: 2000);
2174
2175 location_t loc_start_of_very_long_line
2176 = linemap_position_for_column (line_table, 2000);
2177 location_t loc_too_wide
2178 = linemap_position_for_column (line_table, 4097);
2179 location_t loc_too_wide_2
2180 = linemap_position_for_column (line_table, 4098);
2181
2182 /* ...and back to a sane line length. */
2183 linemap_line_start (set: line_table, to_line: 6, max_column_hint: 100);
2184 location_t loc_sane_again = linemap_position_for_column (line_table, 10);
2185
2186 linemap_add (line_table, LC_LEAVE, sysp: false, NULL, to_line: 0);
2187
2188 /* Multiple files. */
2189 linemap_add (line_table, LC_ENTER, sysp: false, to_file: "bar.c", to_line: 0);
2190 linemap_line_start (set: line_table, to_line: 1, max_column_hint: 200);
2191 location_t loc_f = linemap_position_for_column (line_table, 150);
2192 linemap_add (line_table, LC_LEAVE, sysp: false, NULL, to_line: 0);
2193
2194 /* Verify that we can recover the location info. */
2195 assert_loceq (exp_filename: "foo.c", exp_linenum: 1, exp_colnum: 1, loc: loc_a);
2196 assert_loceq (exp_filename: "foo.c", exp_linenum: 1, exp_colnum: 23, loc: loc_b);
2197 assert_loceq (exp_filename: "foo.c", exp_linenum: 2, exp_colnum: 1, loc: loc_c);
2198 assert_loceq (exp_filename: "foo.c", exp_linenum: 2, exp_colnum: 17, loc: loc_d);
2199 assert_loceq (exp_filename: "foo.c", exp_linenum: 3, exp_colnum: 700, loc: loc_e);
2200 assert_loceq (exp_filename: "foo.c", exp_linenum: 4, exp_colnum: 100, loc: loc_back_to_short);
2201
2202 /* In the very wide line, the initial location should be fully tracked. */
2203 assert_loceq (exp_filename: "foo.c", exp_linenum: 5, exp_colnum: 2000, loc: loc_start_of_very_long_line);
2204 /* ...but once we exceed LINE_MAP_MAX_COLUMN_NUMBER column-tracking should
2205 be disabled. */
2206 assert_loceq (exp_filename: "foo.c", exp_linenum: 5, exp_colnum: 0, loc: loc_too_wide);
2207 assert_loceq (exp_filename: "foo.c", exp_linenum: 5, exp_colnum: 0, loc: loc_too_wide_2);
2208 /*...and column-tracking should be re-enabled for subsequent lines. */
2209 assert_loceq (exp_filename: "foo.c", exp_linenum: 6, exp_colnum: 10, loc: loc_sane_again);
2210
2211 assert_loceq (exp_filename: "bar.c", exp_linenum: 1, exp_colnum: 150, loc: loc_f);
2212
2213 ASSERT_FALSE (is_location_from_builtin_token (loc_a));
2214 ASSERT_TRUE (pure_location_p (line_table, loc_a));
2215
2216 /* Verify using make_location to build a range, and extracting data
2217 back from it. */
2218 location_t range_c_b_d = make_location (caret: loc_c, start: loc_b, finish: loc_d);
2219 ASSERT_FALSE (pure_location_p (line_table, range_c_b_d));
2220 ASSERT_EQ (loc_c, get_location_from_adhoc_loc (line_table, range_c_b_d));
2221 source_range src_range = get_range_from_loc (set: line_table, loc: range_c_b_d);
2222 ASSERT_EQ (loc_b, src_range.m_start);
2223 ASSERT_EQ (loc_d, src_range.m_finish);
2224}
2225
2226/* Verify various properties of UNKNOWN_LOCATION. */
2227
2228static void
2229test_unknown_location ()
2230{
2231 ASSERT_EQ (NULL, LOCATION_FILE (UNKNOWN_LOCATION));
2232 ASSERT_EQ (0, LOCATION_LINE (UNKNOWN_LOCATION));
2233 ASSERT_EQ (0, LOCATION_COLUMN (UNKNOWN_LOCATION));
2234}
2235
2236/* Verify various properties of BUILTINS_LOCATION. */
2237
2238static void
2239test_builtins ()
2240{
2241 assert_loceq (exp_filename: special_fname_builtin (), exp_linenum: 0, exp_colnum: 0, BUILTINS_LOCATION);
2242 ASSERT_PRED1 (is_location_from_builtin_token, BUILTINS_LOCATION);
2243}
2244
2245/* Regression test for make_location.
2246 Ensure that we use pure locations for the start/finish of the range,
2247 rather than storing a packed or ad-hoc range as the start/finish. */
2248
2249static void
2250test_make_location_nonpure_range_endpoints (const line_table_case &case_)
2251{
2252 /* Issue seen with testsuite/c-c++-common/Wlogical-not-parentheses-2.c
2253 with C++ frontend.
2254 ....................0000000001111111111222.
2255 ....................1234567890123456789012. */
2256 const char *content = " r += !aaa == bbb;\n";
2257 temp_source_file tmp (SELFTEST_LOCATION, ".C", content);
2258 line_table_test ltt (case_);
2259 linemap_add (line_table, LC_ENTER, sysp: false, to_file: tmp.get_filename (), to_line: 1);
2260
2261 const location_t c11 = linemap_position_for_column (line_table, 11);
2262 const location_t c12 = linemap_position_for_column (line_table, 12);
2263 const location_t c13 = linemap_position_for_column (line_table, 13);
2264 const location_t c14 = linemap_position_for_column (line_table, 14);
2265 const location_t c21 = linemap_position_for_column (line_table, 21);
2266
2267 if (c21 > LINE_MAP_MAX_LOCATION_WITH_COLS)
2268 return;
2269
2270 /* Use column 13 for the caret location, arbitrarily, to verify that we
2271 handle start != caret. */
2272 const location_t aaa = make_location (caret: c13, start: c12, finish: c14);
2273 ASSERT_EQ (c13, get_pure_location (aaa));
2274 ASSERT_EQ (c12, get_start (aaa));
2275 ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa)));
2276 ASSERT_EQ (c14, get_finish (aaa));
2277 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa)));
2278
2279 /* Make a location using a location with a range as the start-point. */
2280 const location_t not_aaa = make_location (caret: c11, start: aaa, finish: c14);
2281 ASSERT_EQ (c11, get_pure_location (not_aaa));
2282 /* It should use the start location of the range, not store the range
2283 itself. */
2284 ASSERT_EQ (c12, get_start (not_aaa));
2285 ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa)));
2286 ASSERT_EQ (c14, get_finish (not_aaa));
2287 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa)));
2288
2289 /* Similarly, make a location with a range as the end-point. */
2290 const location_t aaa_eq_bbb = make_location (caret: c12, start: c12, finish: c21);
2291 ASSERT_EQ (c12, get_pure_location (aaa_eq_bbb));
2292 ASSERT_EQ (c12, get_start (aaa_eq_bbb));
2293 ASSERT_FALSE (IS_ADHOC_LOC (get_start (aaa_eq_bbb)));
2294 ASSERT_EQ (c21, get_finish (aaa_eq_bbb));
2295 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (aaa_eq_bbb)));
2296 const location_t not_aaa_eq_bbb = make_location (caret: c11, start: c12, finish: aaa_eq_bbb);
2297 /* It should use the finish location of the range, not store the range
2298 itself. */
2299 ASSERT_EQ (c11, get_pure_location (not_aaa_eq_bbb));
2300 ASSERT_EQ (c12, get_start (not_aaa_eq_bbb));
2301 ASSERT_FALSE (IS_ADHOC_LOC (get_start (not_aaa_eq_bbb)));
2302 ASSERT_EQ (c21, get_finish (not_aaa_eq_bbb));
2303 ASSERT_FALSE (IS_ADHOC_LOC (get_finish (not_aaa_eq_bbb)));
2304}
2305
2306/* Verify reading of input files (e.g. for caret-based diagnostics). */
2307
2308static void
2309test_reading_source_line ()
2310{
2311 /* Create a tempfile and write some text to it. */
2312 temp_source_file tmp (SELFTEST_LOCATION, ".txt",
2313 "01234567890123456789\n"
2314 "This is the test text\n"
2315 "This is the 3rd line");
2316
2317 /* Read back a specific line from the tempfile. */
2318 char_span source_line = location_get_source_line (file_path: tmp.get_filename (), line: 3);
2319 ASSERT_TRUE (source_line);
2320 ASSERT_TRUE (source_line.get_buffer () != NULL);
2321 ASSERT_EQ (20, source_line.length ());
2322 ASSERT_TRUE (!strncmp ("This is the 3rd line",
2323 source_line.get_buffer (), source_line.length ()));
2324
2325 source_line = location_get_source_line (file_path: tmp.get_filename (), line: 2);
2326 ASSERT_TRUE (source_line);
2327 ASSERT_TRUE (source_line.get_buffer () != NULL);
2328 ASSERT_EQ (21, source_line.length ());
2329 ASSERT_TRUE (!strncmp ("This is the test text",
2330 source_line.get_buffer (), source_line.length ()));
2331
2332 source_line = location_get_source_line (file_path: tmp.get_filename (), line: 4);
2333 ASSERT_FALSE (source_line);
2334 ASSERT_TRUE (source_line.get_buffer () == NULL);
2335}
2336
2337/* Tests of lexing. */
2338
/* Verify that token TOK from PARSER has cpp_token_as_text
   equal to EXPECTED_TEXT.

   The text returned by cpp_token_as_text is cast to const char * and
   compared against EXPECTED_TEXT using ASSERT_STREQ.  The expansion is
   wrapped in SELFTEST_BEGIN_STMT/SELFTEST_END_STMT so that it can be
   used as a single statement.  */

#define ASSERT_TOKEN_AS_TEXT_EQ(PARSER, TOK, EXPECTED_TEXT)		\
  SELFTEST_BEGIN_STMT							\
    unsigned char *actual_txt = cpp_token_as_text ((PARSER), (TOK));	\
    ASSERT_STREQ ((EXPECTED_TEXT), (const char *)actual_txt);		\
  SELFTEST_END_STMT
2347
2348/* Verify that TOK's src_loc is within EXP_FILENAME at EXP_LINENUM,
2349 and ranges from EXP_START_COL to EXP_FINISH_COL.
2350 Use LOC as the effective location of the selftest. */
2351
2352static void
2353assert_token_loc_eq (const location &loc,
2354 const cpp_token *tok,
2355 const char *exp_filename, int exp_linenum,
2356 int exp_start_col, int exp_finish_col)
2357{
2358 location_t tok_loc = tok->src_loc;
2359 ASSERT_STREQ_AT (loc, exp_filename, LOCATION_FILE (tok_loc));
2360 ASSERT_EQ_AT (loc, exp_linenum, LOCATION_LINE (tok_loc));
2361
2362 /* If location_t values are sufficiently high, then column numbers
2363 will be unavailable. */
2364 if (!should_have_column_data_p (loc: tok_loc))
2365 return;
2366
2367 ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_loc));
2368 source_range tok_range = get_range_from_loc (set: line_table, loc: tok_loc);
2369 ASSERT_EQ_AT (loc, exp_start_col, LOCATION_COLUMN (tok_range.m_start));
2370 ASSERT_EQ_AT (loc, exp_finish_col, LOCATION_COLUMN (tok_range.m_finish));
2371}
2372
/* Use assert_token_loc_eq to verify the TOK->src_loc, using
   SELFTEST_LOCATION as the effective location of the selftest, so
   that failures are reported against the site of the macro use.
   The remaining arguments are forwarded in order: expected filename,
   expected line number, expected start column, expected finish column.  */

#define ASSERT_TOKEN_LOC_EQ(TOK, EXP_FILENAME, EXP_LINENUM, \
			    EXP_START_COL, EXP_FINISH_COL) \
  assert_token_loc_eq (SELFTEST_LOCATION, (TOK), (EXP_FILENAME), \
		       (EXP_LINENUM), (EXP_START_COL), (EXP_FINISH_COL))
2380
2381/* Test of lexing a file using libcpp, verifying tokens and their
2382 location information. */
2383
2384static void
2385test_lexer (const line_table_case &case_)
2386{
2387 /* Create a tempfile and write some text to it. */
2388 const char *content =
2389 /*00000000011111111112222222222333333.3333444444444.455555555556
2390 12345678901234567890123456789012345.6789012345678.901234567890. */
2391 ("test_name /* c-style comment */\n"
2392 " \"test literal\"\n"
2393 " // test c++-style comment\n"
2394 " 42\n");
2395 temp_source_file tmp (SELFTEST_LOCATION, ".txt", content);
2396
2397 line_table_test ltt (case_);
2398
2399 cpp_reader *parser = cpp_create_reader (CLK_GNUC89, NULL, line_table);
2400
2401 const char *fname = cpp_read_main_file (parser, tmp.get_filename ());
2402 ASSERT_NE (fname, NULL);
2403
2404 /* Verify that we get the expected tokens back, with the correct
2405 location information. */
2406
2407 location_t loc;
2408 const cpp_token *tok;
2409 tok = cpp_get_token_with_location (parser, &loc);
2410 ASSERT_NE (tok, NULL);
2411 ASSERT_EQ (tok->type, CPP_NAME);
2412 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "test_name");
2413 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 1, 1, 9);
2414
2415 tok = cpp_get_token_with_location (parser, &loc);
2416 ASSERT_NE (tok, NULL);
2417 ASSERT_EQ (tok->type, CPP_STRING);
2418 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "\"test literal\"");
2419 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 2, 35, 48);
2420
2421 tok = cpp_get_token_with_location (parser, &loc);
2422 ASSERT_NE (tok, NULL);
2423 ASSERT_EQ (tok->type, CPP_NUMBER);
2424 ASSERT_TOKEN_AS_TEXT_EQ (parser, tok, "42");
2425 ASSERT_TOKEN_LOC_EQ (tok, tmp.get_filename (), 4, 4, 5);
2426
2427 tok = cpp_get_token_with_location (parser, &loc);
2428 ASSERT_NE (tok, NULL);
2429 ASSERT_EQ (tok->type, CPP_EOF);
2430
2431 cpp_finish (parser, NULL);
2432 cpp_destroy (parser);
2433}
2434
/* Forward decls.  */

class lexer_test;
class lexer_test_options;

/* A class for specifying options of a lexer_test.
   The "apply" vfunc is called during the lexer_test constructor.  */

class lexer_test_options
{
 public:
  /* Virtual, so that destroying a subclass through a pointer to this
     base class is well-defined.  */
  virtual ~lexer_test_options () {}

  /* Hook for subclasses to configure the given lexer_test.  */
  virtual void apply (lexer_test &) = 0;
};
2448
2449/* Wrapper around an cpp_reader *, which calls cpp_finish and cpp_destroy
2450 in its dtor.
2451
2452 This is needed by struct lexer_test to ensure that the cleanup of the
2453 cpp_reader happens *after* the cleanup of the temp_source_file. */
2454
2455class cpp_reader_ptr
2456{
2457 public:
2458 cpp_reader_ptr (cpp_reader *ptr) : m_ptr (ptr) {}
2459
2460 ~cpp_reader_ptr ()
2461 {
2462 cpp_finish (m_ptr, NULL);
2463 cpp_destroy (m_ptr);
2464 }
2465
2466 operator cpp_reader * () const { return m_ptr; }
2467
2468 private:
2469 cpp_reader *m_ptr;
2470};
2471
/* A class for writing lexer tests.

   The constructor takes the line_table_case to run under, the text to
   be lexed, and an optional lexer_test_options to apply.  */

class lexer_test
{
public:
  lexer_test (const line_table_case &case_, const char *content,
	      lexer_test_options *options);
  ~lexer_test ();

  /* Fetch the next token from m_parser.  */
  const cpp_token *get_token ();

  /* The ordering of these fields matters.
     The line_table_test must be first, since the cpp_reader_ptr
     uses it.
     The cpp_reader must be cleaned up *after* the temp_source_file
     since the filenames in input.cc's input cache are owned by the
     cpp_reader; in particular, when ~temp_source_file evicts the
     filename the filenames must still be alive.  */
  line_table_test m_ltt;
  cpp_reader_ptr m_parser;
  temp_source_file m_tempfile;
  string_concat_db m_concats;
  /* If true, the destructor asserts that the next token is CPP_EOF.  */
  bool m_implicitly_expect_EOF;
};
2496
2497/* Use an EBCDIC encoding for the execution charset, specifically
2498 IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2499
2500 This exercises iconv integration within libcpp.
2501 Not every build of iconv supports the given charset,
2502 so we need to flag this error and handle it gracefully. */
2503
2504class ebcdic_execution_charset : public lexer_test_options
2505{
2506 public:
2507 ebcdic_execution_charset () : m_num_iconv_errors (0)
2508 {
2509 gcc_assert (s_singleton == NULL);
2510 s_singleton = this;
2511 }
2512 ~ebcdic_execution_charset ()
2513 {
2514 gcc_assert (s_singleton == this);
2515 s_singleton = NULL;
2516 }
2517
2518 void apply (lexer_test &test) final override
2519 {
2520 cpp_options *cpp_opts = cpp_get_options (test.m_parser);
2521 cpp_opts->narrow_charset = "IBM1047";
2522
2523 cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2524 callbacks->diagnostic = on_diagnostic;
2525 }
2526
2527 static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2528 enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2529 enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2530 rich_location *richloc ATTRIBUTE_UNUSED,
2531 const char *msgid, va_list *ap ATTRIBUTE_UNUSED)
2532 ATTRIBUTE_FPTR_PRINTF(5,0)
2533 {
2534 gcc_assert (s_singleton);
2535 /* Avoid exgettext from picking this up, it is translated in libcpp. */
2536 const char *msg = "conversion from %s to %s not supported by iconv";
2537#ifdef ENABLE_NLS
2538 msg = dgettext (domainname: "cpplib", msgid: msg);
2539#endif
2540 /* Detect and record errors emitted by libcpp/charset.cc:init_iconv_desc
2541 when the local iconv build doesn't support the conversion. */
2542 if (strcmp (s1: msgid, s2: msg) == 0)
2543 {
2544 s_singleton->m_num_iconv_errors++;
2545 return true;
2546 }
2547
2548 /* Otherwise, we have an unexpected error. */
2549 abort ();
2550 }
2551
2552 bool iconv_errors_occurred_p () const { return m_num_iconv_errors > 0; }
2553
2554 private:
2555 static ebcdic_execution_charset *s_singleton;
2556 int m_num_iconv_errors;
2557};
2558
2559ebcdic_execution_charset *ebcdic_execution_charset::s_singleton;
2560
2561/* A lexer_test_options subclass that records a list of diagnostic
2562 messages emitted by the lexer. */
2563
2564class lexer_diagnostic_sink : public lexer_test_options
2565{
2566 public:
2567 lexer_diagnostic_sink ()
2568 {
2569 gcc_assert (s_singleton == NULL);
2570 s_singleton = this;
2571 }
2572 ~lexer_diagnostic_sink ()
2573 {
2574 gcc_assert (s_singleton == this);
2575 s_singleton = NULL;
2576
2577 int i;
2578 char *str;
2579 FOR_EACH_VEC_ELT (m_diagnostics, i, str)
2580 free (ptr: str);
2581 }
2582
2583 void apply (lexer_test &test) final override
2584 {
2585 cpp_callbacks *callbacks = cpp_get_callbacks (test.m_parser);
2586 callbacks->diagnostic = on_diagnostic;
2587 }
2588
2589 static bool on_diagnostic (cpp_reader *pfile ATTRIBUTE_UNUSED,
2590 enum cpp_diagnostic_level level ATTRIBUTE_UNUSED,
2591 enum cpp_warning_reason reason ATTRIBUTE_UNUSED,
2592 rich_location *richloc ATTRIBUTE_UNUSED,
2593 const char *msgid, va_list *ap)
2594 ATTRIBUTE_FPTR_PRINTF(5,0)
2595 {
2596 char *msg = xvasprintf (msgid, *ap);
2597 s_singleton->m_diagnostics.safe_push (obj: msg);
2598 return true;
2599 }
2600
2601 auto_vec<char *> m_diagnostics;
2602
2603 private:
2604 static lexer_diagnostic_sink *s_singleton;
2605};
2606
2607lexer_diagnostic_sink *lexer_diagnostic_sink::s_singleton;
2608
2609/* Constructor. Override line_table with a new instance based on CASE_,
2610 and write CONTENT to a tempfile. Create a cpp_reader, and use it to
2611 start parsing the tempfile. */
2612
2613lexer_test::lexer_test (const line_table_case &case_, const char *content,
2614 lexer_test_options *options)
2615: m_ltt (case_),
2616 m_parser (cpp_create_reader (CLK_GNUC99, NULL, line_table)),
2617 /* Create a tempfile and write the text to it. */
2618 m_tempfile (SELFTEST_LOCATION, ".c", content),
2619 m_concats (),
2620 m_implicitly_expect_EOF (true)
2621{
2622 if (options)
2623 options->apply (*this);
2624
2625 cpp_init_iconv (m_parser);
2626
2627 /* Parse the file. */
2628 const char *fname = cpp_read_main_file (m_parser,
2629 m_tempfile.get_filename ());
2630 ASSERT_NE (fname, NULL);
2631}
2632
2633/* Destructor. By default, verify that the next token in m_parser is EOF. */
2634
2635lexer_test::~lexer_test ()
2636{
2637 location_t loc;
2638 const cpp_token *tok;
2639
2640 if (m_implicitly_expect_EOF)
2641 {
2642 tok = cpp_get_token_with_location (m_parser, &loc);
2643 ASSERT_NE (tok, NULL);
2644 ASSERT_EQ (tok->type, CPP_EOF);
2645 }
2646}
2647
2648/* Get the next token from m_parser. */
2649
2650const cpp_token *
2651lexer_test::get_token ()
2652{
2653 location_t loc;
2654 const cpp_token *tok;
2655
2656 tok = cpp_get_token_with_location (m_parser, &loc);
2657 ASSERT_NE (tok, NULL);
2658 return tok;
2659}
2660
2661/* Verify that locations within string literals are correctly handled. */
2662
2663/* Verify get_source_range_for_substring for token(s) at STRLOC,
2664 using the string concatenation database for TEST.
2665
2666 Assert that the character at index IDX is on EXPECTED_LINE,
2667 and that it begins at column EXPECTED_START_COL and ends at
2668 EXPECTED_FINISH_COL (unless the locations are beyond
2669 LINE_MAP_MAX_LOCATION_WITH_COLS, in which case don't check their
2670 columns). */
2671
2672static void
2673assert_char_at_range (const location &loc,
2674 lexer_test& test,
2675 location_t strloc, enum cpp_ttype type, int idx,
2676 int expected_line, int expected_start_col,
2677 int expected_finish_col)
2678{
2679 cpp_reader *pfile = test.m_parser;
2680 string_concat_db *concats = &test.m_concats;
2681
2682 source_range actual_range = source_range();
2683 const char *err
2684 = get_source_range_for_char (pfile, concats, strloc, type, char_idx: idx,
2685 out_range: &actual_range);
2686 if (should_have_column_data_p (loc: strloc))
2687 ASSERT_EQ_AT (loc, NULL, err);
2688 else
2689 {
2690 ASSERT_STREQ_AT (loc,
2691 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2692 err);
2693 return;
2694 }
2695
2696 int actual_start_line = LOCATION_LINE (actual_range.m_start);
2697 ASSERT_EQ_AT (loc, expected_line, actual_start_line);
2698 int actual_finish_line = LOCATION_LINE (actual_range.m_finish);
2699 ASSERT_EQ_AT (loc, expected_line, actual_finish_line);
2700
2701 if (should_have_column_data_p (loc: actual_range.m_start))
2702 {
2703 int actual_start_col = LOCATION_COLUMN (actual_range.m_start);
2704 ASSERT_EQ_AT (loc, expected_start_col, actual_start_col);
2705 }
2706 if (should_have_column_data_p (loc: actual_range.m_finish))
2707 {
2708 int actual_finish_col = LOCATION_COLUMN (actual_range.m_finish);
2709 ASSERT_EQ_AT (loc, expected_finish_col, actual_finish_col);
2710 }
2711}
2712
/* Macro for calling assert_char_at_range, supplying SELFTEST_LOCATION for
   the effective location of any errors, so that failures are reported
   against the site of the macro use.  The remaining arguments are
   forwarded in order.  */

#define ASSERT_CHAR_AT_RANGE(LEXER_TEST, STRLOC, TYPE, IDX, EXPECTED_LINE, \
			     EXPECTED_START_COL, EXPECTED_FINISH_COL) \
  assert_char_at_range (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), (TYPE), \
			(IDX), (EXPECTED_LINE), (EXPECTED_START_COL), \
			(EXPECTED_FINISH_COL))
2721
2722/* Verify get_num_source_ranges_for_substring for token(s) at STRLOC,
2723 using the string concatenation database for TEST.
2724
2725 Assert that the token(s) at STRLOC contain EXPECTED_NUM_RANGES. */
2726
2727static void
2728assert_num_substring_ranges (const location &loc,
2729 lexer_test& test,
2730 location_t strloc,
2731 enum cpp_ttype type,
2732 int expected_num_ranges)
2733{
2734 cpp_reader *pfile = test.m_parser;
2735 string_concat_db *concats = &test.m_concats;
2736
2737 int actual_num_ranges = -1;
2738 const char *err
2739 = get_num_source_ranges_for_substring (pfile, concats, strloc, type,
2740 out: &actual_num_ranges);
2741 if (should_have_column_data_p (loc: strloc))
2742 ASSERT_EQ_AT (loc, NULL, err);
2743 else
2744 {
2745 ASSERT_STREQ_AT (loc,
2746 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2747 err);
2748 return;
2749 }
2750 ASSERT_EQ_AT (loc, expected_num_ranges, actual_num_ranges);
2751}
2752
/* Macro for calling assert_num_substring_ranges, supplying
   SELFTEST_LOCATION for the effective location of any errors, so that
   failures are reported against the site of the macro use.  The
   remaining arguments are forwarded in order.  */

#define ASSERT_NUM_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, \
				    EXPECTED_NUM_RANGES) \
  assert_num_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), (STRLOC), \
			       (TYPE), (EXPECTED_NUM_RANGES))
2760
2761
2762/* Verify that get_num_source_ranges_for_substring for token(s) at STRLOC
2763 returns an error (using the string concatenation database for TEST). */
2764
2765static void
2766assert_has_no_substring_ranges (const location &loc,
2767 lexer_test& test,
2768 location_t strloc,
2769 enum cpp_ttype type,
2770 const char *expected_err)
2771{
2772 cpp_reader *pfile = test.m_parser;
2773 string_concat_db *concats = &test.m_concats;
2774 cpp_substring_ranges ranges;
2775 const char *actual_err
2776 = get_substring_ranges_for_loc (pfile, concats, strloc,
2777 type, ranges);
2778 if (should_have_column_data_p (loc: strloc))
2779 ASSERT_STREQ_AT (loc, expected_err, actual_err);
2780 else
2781 ASSERT_STREQ_AT (loc,
2782 "range starts after LINE_MAP_MAX_LOCATION_WITH_COLS",
2783 actual_err);
2784}
2785
/* Macro for calling assert_has_no_substring_ranges, supplying
   SELFTEST_LOCATION for the effective location of any errors.
   ERR is the error message that the lookup is expected to produce.  */

#define ASSERT_HAS_NO_SUBSTRING_RANGES(LEXER_TEST, STRLOC, TYPE, ERR)    \
    assert_has_no_substring_ranges (SELFTEST_LOCATION, (LEXER_TEST), \
				    (STRLOC), (TYPE), (ERR))
2789
2790/* Lex a simple string literal. Verify the substring location data, before
2791 and after running cpp_interpret_string on it. */
2792
2793static void
2794test_lexer_string_locations_simple (const line_table_case &case_)
2795{
2796 /* Digits 0-9 (with 0 at column 10), the simple way.
2797 ....................000000000.11111111112.2222222223333333333
2798 ....................123456789.01234567890.1234567890123456789
2799 We add a trailing comment to ensure that we correctly locate
2800 the end of the string literal token. */
2801 const char *content = " \"0123456789\" /* not a string */\n";
2802 lexer_test test (case_, content, NULL);
2803
2804 /* Verify that we get the expected token back, with the correct
2805 location information. */
2806 const cpp_token *tok = test.get_token ();
2807 ASSERT_EQ (tok->type, CPP_STRING);
2808 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2809 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2810
2811 /* At this point in lexing, the quote characters are treated as part of
2812 the string (they are stripped off by cpp_interpret_string). */
2813
2814 ASSERT_EQ (tok->val.str.len, 12);
2815
2816 /* Verify that cpp_interpret_string works. */
2817 cpp_string dst_string;
2818 const enum cpp_ttype type = CPP_STRING;
2819 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2820 &dst_string, type);
2821 ASSERT_TRUE (result);
2822 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
2823 free (ptr: const_cast <unsigned char *> (dst_string.text));
2824
2825 /* Verify ranges of individual characters. This no longer includes the
2826 opening quote, but does include the closing quote. */
2827 for (int i = 0; i <= 10; i++)
2828 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1,
2829 10 + i, 10 + i);
2830
2831 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2832}
2833
2834/* As test_lexer_string_locations_simple, but use an EBCDIC execution
2835 encoding. */
2836
2837static void
2838test_lexer_string_locations_ebcdic (const line_table_case &case_)
2839{
2840 /* EBCDIC support requires iconv. */
2841 if (!HAVE_ICONV)
2842 return;
2843
2844 /* Digits 0-9 (with 0 at column 10), the simple way.
2845 ....................000000000.11111111112.2222222223333333333
2846 ....................123456789.01234567890.1234567890123456789
2847 We add a trailing comment to ensure that we correctly locate
2848 the end of the string literal token. */
2849 const char *content = " \"0123456789\" /* not a string */\n";
2850 ebcdic_execution_charset use_ebcdic;
2851 lexer_test test (case_, content, &use_ebcdic);
2852
2853 /* Verify that we get the expected token back, with the correct
2854 location information. */
2855 const cpp_token *tok = test.get_token ();
2856 ASSERT_EQ (tok->type, CPP_STRING);
2857 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
2858 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 20);
2859
2860 /* At this point in lexing, the quote characters are treated as part of
2861 the string (they are stripped off by cpp_interpret_string). */
2862
2863 ASSERT_EQ (tok->val.str.len, 12);
2864
2865 /* The remainder of the test requires an iconv implementation that
2866 can convert from UTF-8 to the EBCDIC encoding requested above. */
2867 if (use_ebcdic.iconv_errors_occurred_p ())
2868 return;
2869
2870 /* Verify that cpp_interpret_string works. */
2871 cpp_string dst_string;
2872 const enum cpp_ttype type = CPP_STRING;
2873 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2874 &dst_string, type);
2875 ASSERT_TRUE (result);
2876 /* We should now have EBCDIC-encoded text, specifically
2877 IBM1047-encoded (aka "EBCDIC 1047", or "Code page 1047").
2878 The digits 0-9 are encoded as 240-249 i.e. 0xf0-0xf9. */
2879 ASSERT_STREQ ("\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9",
2880 (const char *)dst_string.text);
2881 free (ptr: const_cast <unsigned char *> (dst_string.text));
2882
2883 /* Verify that we don't attempt to record substring location information
2884 for such cases. */
2885 ASSERT_HAS_NO_SUBSTRING_RANGES
2886 (test, tok->src_loc, type,
2887 "execution character set != source character set");
2888}
2889
2890/* Lex a string literal containing a hex-escaped character.
2891 Verify the substring location data, before and after running
2892 cpp_interpret_string on it. */
2893
2894static void
2895test_lexer_string_locations_hex (const line_table_case &case_)
2896{
2897 /* Digits 0-9, expressing digit 5 in ASCII as "\x35"
2898 and with a space in place of digit 6, to terminate the escaped
2899 hex code.
2900 ....................000000000.111111.11112222.
2901 ....................123456789.012345.67890123. */
2902 const char *content = " \"01234\\x35 789\"\n";
2903 lexer_test test (case_, content, NULL);
2904
2905 /* Verify that we get the expected token back, with the correct
2906 location information. */
2907 const cpp_token *tok = test.get_token ();
2908 ASSERT_EQ (tok->type, CPP_STRING);
2909 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\x35 789\"");
2910 ASSERT_TOKEN_LOC_EQ (tok, test.m_tempfile.get_filename (), 1, 9, 23);
2911
2912 /* At this point in lexing, the quote characters are treated as part of
2913 the string (they are stripped off by cpp_interpret_string). */
2914 ASSERT_EQ (tok->val.str.len, 15);
2915
2916 /* Verify that cpp_interpret_string works. */
2917 cpp_string dst_string;
2918 const enum cpp_ttype type = CPP_STRING;
2919 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2920 &dst_string, type);
2921 ASSERT_TRUE (result);
2922 ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2923 free (ptr: const_cast <unsigned char *> (dst_string.text));
2924
2925 /* Verify ranges of individual characters. This no longer includes the
2926 opening quote, but does include the closing quote. */
2927 for (int i = 0; i <= 4; i++)
2928 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2929 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2930 for (int i = 6; i <= 10; i++)
2931 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2932
2933 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2934}
2935
2936/* Lex a string literal containing an octal-escaped character.
2937 Verify the substring location data after running cpp_interpret_string
2938 on it. */
2939
2940static void
2941test_lexer_string_locations_oct (const line_table_case &case_)
2942{
2943 /* Digits 0-9, expressing digit 5 in ASCII as "\065"
2944 and with a space in place of digit 6, to terminate the escaped
2945 octal code.
2946 ....................000000000.111111.11112222.2222223333333333444
2947 ....................123456789.012345.67890123.4567890123456789012 */
2948 const char *content = " \"01234\\065 789\" /* not a string */\n";
2949 lexer_test test (case_, content, NULL);
2950
2951 /* Verify that we get the expected token back, with the correct
2952 location information. */
2953 const cpp_token *tok = test.get_token ();
2954 ASSERT_EQ (tok->type, CPP_STRING);
2955 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\065 789\"");
2956
2957 /* Verify that cpp_interpret_string works. */
2958 cpp_string dst_string;
2959 const enum cpp_ttype type = CPP_STRING;
2960 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
2961 &dst_string, type);
2962 ASSERT_TRUE (result);
2963 ASSERT_STREQ ("012345 789", (const char *)dst_string.text);
2964 free (ptr: const_cast <unsigned char *> (dst_string.text));
2965
2966 /* Verify ranges of individual characters. This no longer includes the
2967 opening quote, but does include the closing quote. */
2968 for (int i = 0; i < 5; i++)
2969 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
2970 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 5, 1, 15, 18);
2971 for (int i = 6; i <= 10; i++)
2972 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 13 + i, 13 + i);
2973
2974 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 11);
2975}
2976
2977/* Test of string literal containing letter escapes. */
2978
2979static void
2980test_lexer_string_locations_letter_escape_1 (const line_table_case &case_)
2981{
2982 /* The string "\tfoo\\\nbar" i.e. tab, "foo", backslash, newline, bar.
2983 .....................000000000.1.11111.1.1.11222.22222223333333
2984 .....................123456789.0.12345.6.7.89012.34567890123456. */
2985 const char *content = (" \"\\tfoo\\\\\\nbar\" /* non-str */\n");
2986 lexer_test test (case_, content, NULL);
2987
2988 /* Verify that we get the expected tokens back. */
2989 const cpp_token *tok = test.get_token ();
2990 ASSERT_EQ (tok->type, CPP_STRING);
2991 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"\\tfoo\\\\\\nbar\"");
2992
2993 /* Verify ranges of individual characters. */
2994 /* "\t". */
2995 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
2996 0, 1, 10, 11);
2997 /* "foo". */
2998 for (int i = 1; i <= 3; i++)
2999 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3000 i, 1, 11 + i, 11 + i);
3001 /* "\\" and "\n". */
3002 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3003 4, 1, 15, 16);
3004 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3005 5, 1, 17, 18);
3006
3007 /* "bar" and closing quote for nul-terminator. */
3008 for (int i = 6; i <= 9; i++)
3009 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3010 i, 1, 13 + i, 13 + i);
3011
3012 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 10);
3013}
3014
3015/* Another test of a string literal containing a letter escape.
3016 Based on string seen in
3017 printf ("%-%\n");
3018 in gcc.dg/format/c90-printf-1.c. */
3019
3020static void
3021test_lexer_string_locations_letter_escape_2 (const line_table_case &case_)
3022{
3023 /* .....................000000000.1111.11.1111.22222222223.
3024 .....................123456789.0123.45.6789.01234567890. */
3025 const char *content = (" \"%-%\\n\" /* non-str */\n");
3026 lexer_test test (case_, content, NULL);
3027
3028 /* Verify that we get the expected tokens back. */
3029 const cpp_token *tok = test.get_token ();
3030 ASSERT_EQ (tok->type, CPP_STRING);
3031 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"%-%\\n\"");
3032
3033 /* Verify ranges of individual characters. */
3034 /* "%-%". */
3035 for (int i = 0; i < 3; i++)
3036 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3037 i, 1, 10 + i, 10 + i);
3038 /* "\n". */
3039 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3040 3, 1, 13, 14);
3041
3042 /* Closing quote for nul-terminator. */
3043 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3044 4, 1, 15, 15);
3045
3046 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 5);
3047}
3048
3049/* Lex a string literal containing UCN 4 characters.
3050 Verify the substring location data after running cpp_interpret_string
3051 on it. */
3052
3053static void
3054test_lexer_string_locations_ucn4 (const line_table_case &case_)
3055{
3056 /* Digits 0-9, expressing digits 5 and 6 as Roman numerals expressed
3057 as UCN 4.
3058 ....................000000000.111111.111122.222222223.33333333344444
3059 ....................123456789.012345.678901.234567890.12345678901234 */
3060 const char *content = " \"01234\\u2174\\u2175789\" /* non-str */\n";
3061 lexer_test test (case_, content, NULL);
3062
3063 /* Verify that we get the expected token back, with the correct
3064 location information. */
3065 const cpp_token *tok = test.get_token ();
3066 ASSERT_EQ (tok->type, CPP_STRING);
3067 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"01234\\u2174\\u2175789\"");
3068
3069 /* Verify that cpp_interpret_string works.
3070 The string should be encoded in the execution character
3071 set. Assuming that is UTF-8, we should have the following:
3072 ----------- ---- ----- ------- ----------------
3073 Byte offset Byte Octal Unicode Source Column(s)
3074 ----------- ---- ----- ------- ----------------
3075 0 0x30 '0' 10
3076 1 0x31 '1' 11
3077 2 0x32 '2' 12
3078 3 0x33 '3' 13
3079 4 0x34 '4' 14
3080 5 0xE2 \342 U+2174 15-20
3081 6 0x85 \205 (cont) 15-20
3082 7 0xB4 \264 (cont) 15-20
3083 8 0xE2 \342 U+2175 21-26
3084 9 0x85 \205 (cont) 21-26
3085 10 0xB5 \265 (cont) 21-26
3086 11 0x37 '7' 27
3087 12 0x38 '8' 28
3088 13 0x39 '9' 29
3089 14 0x00 30 (closing quote)
3090 ----------- ---- ----- ------- ---------------. */
3091
3092 cpp_string dst_string;
3093 const enum cpp_ttype type = CPP_STRING;
3094 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3095 &dst_string, type);
3096 ASSERT_TRUE (result);
3097 ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3098 (const char *)dst_string.text);
3099 free (ptr: const_cast <unsigned char *> (dst_string.text));
3100
3101 /* Verify ranges of individual characters. This no longer includes the
3102 opening quote, but does include the closing quote.
3103 '01234'. */
3104 for (int i = 0; i <= 4; i++)
3105 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3106 /* U+2174. */
3107 for (int i = 5; i <= 7; i++)
3108 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 20);
3109 /* U+2175. */
3110 for (int i = 8; i <= 10; i++)
3111 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 21, 26);
3112 /* '789' and nul terminator */
3113 for (int i = 11; i <= 14; i++)
3114 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 16 + i, 16 + i);
3115
3116 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
3117}
3118
3119/* Lex a string literal containing UCN 8 characters.
3120 Verify the substring location data after running cpp_interpret_string
3121 on it. */
3122
3123static void
3124test_lexer_string_locations_ucn8 (const line_table_case &case_)
3125{
3126 /* Digits 0-9, expressing digits 5 and 6 as Roman numerals as UCN 8.
3127 ....................000000000.111111.1111222222.2222333333333.344444
3128 ....................123456789.012345.6789012345.6789012345678.901234 */
3129 const char *content = " \"01234\\U00002174\\U00002175789\" /* */\n";
3130 lexer_test test (case_, content, NULL);
3131
3132 /* Verify that we get the expected token back, with the correct
3133 location information. */
3134 const cpp_token *tok = test.get_token ();
3135 ASSERT_EQ (tok->type, CPP_STRING);
3136 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok,
3137 "\"01234\\U00002174\\U00002175789\"");
3138
3139 /* Verify that cpp_interpret_string works.
3140 The UTF-8 encoding of the string is identical to that from
3141 the ucn4 testcase above; the only difference is the column
3142 locations. */
3143 cpp_string dst_string;
3144 const enum cpp_ttype type = CPP_STRING;
3145 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3146 &dst_string, type);
3147 ASSERT_TRUE (result);
3148 ASSERT_STREQ ("01234\342\205\264\342\205\265789",
3149 (const char *)dst_string.text);
3150 free (ptr: const_cast <unsigned char *> (dst_string.text));
3151
3152 /* Verify ranges of individual characters. This no longer includes the
3153 opening quote, but does include the closing quote.
3154 '01234'. */
3155 for (int i = 0; i <= 4; i++)
3156 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3157 /* U+2174. */
3158 for (int i = 5; i <= 7; i++)
3159 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 15, 24);
3160 /* U+2175. */
3161 for (int i = 8; i <= 10; i++)
3162 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 25, 34);
3163 /* '789' at columns 35-37 */
3164 for (int i = 11; i <= 13; i++)
3165 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 24 + i, 24 + i);
3166 /* Closing quote/nul-terminator at column 38. */
3167 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 14, 1, 38, 38);
3168
3169 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 15);
3170}
3171
/* Read the four bytes at PTR_BE_VALUE as a big-endian 32-bit quantity
   (most significant byte first) and return it as a host value.  The
   bytes are read individually, so the result does not depend on the
   host's own byte order.  */

static uint32_t
uint32_from_big_endian (const uint32_t *ptr_be_value)
{
  const unsigned char *bytes
    = reinterpret_cast <const unsigned char *> (ptr_be_value);
  uint32_t host_value = 0;
  /* Shift in one byte at a time, most significant byte first.  */
  for (int idx = 0; idx < 4; idx++)
    host_value = (host_value << 8) | bytes[idx];
  return host_value;
}
3183
3184/* Lex a wide string literal and verify that attempts to read substring
3185 location data from it fail gracefully. */
3186
3187static void
3188test_lexer_string_locations_wide_string (const line_table_case &case_)
3189{
3190 /* Digits 0-9.
3191 ....................000000000.11111111112.22222222233333
3192 ....................123456789.01234567890.12345678901234 */
3193 const char *content = " L\"0123456789\" /* non-str */\n";
3194 lexer_test test (case_, content, NULL);
3195
3196 /* Verify that we get the expected token back, with the correct
3197 location information. */
3198 const cpp_token *tok = test.get_token ();
3199 ASSERT_EQ (tok->type, CPP_WSTRING);
3200 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L\"0123456789\"");
3201
3202 /* Verify that cpp_interpret_string works, using CPP_WSTRING. */
3203 cpp_string dst_string;
3204 const enum cpp_ttype type = CPP_WSTRING;
3205 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3206 &dst_string, type);
3207 ASSERT_TRUE (result);
3208 /* The cpp_reader defaults to big-endian with
3209 CHAR_BIT * sizeof (int) for the wchar_precision, so dst_string should
3210 now be encoded as UTF-32BE. */
3211 const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3212 ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3213 ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3214 ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3215 ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3216 free (ptr: const_cast <unsigned char *> (dst_string.text));
3217
3218 /* We don't yet support generating substring location information
3219 for L"" strings. */
3220 ASSERT_HAS_NO_SUBSTRING_RANGES
3221 (test, tok->src_loc, type,
3222 "execution character set != source character set");
3223}
3224
/* Read the two bytes at PTR_BE_VALUE as a big-endian 16-bit quantity
   (most significant byte first) and return it as a host value.  The
   bytes are read individually, so the result does not depend on the
   host's own byte order.  */

static uint16_t
uint16_from_big_endian (const uint16_t *ptr_be_value)
{
  const unsigned char *bytes
    = reinterpret_cast <const unsigned char *> (ptr_be_value);
  const unsigned int high_byte = bytes[0];
  const unsigned int low_byte = bytes[1];
  return static_cast <uint16_t> ((high_byte << 8) | low_byte);
}
3233
3234/* Lex a u"" string literal and verify that attempts to read substring
3235 location data from it fail gracefully. */
3236
3237static void
3238test_lexer_string_locations_string16 (const line_table_case &case_)
3239{
3240 /* Digits 0-9.
3241 ....................000000000.11111111112.22222222233333
3242 ....................123456789.01234567890.12345678901234 */
3243 const char *content = " u\"0123456789\" /* non-str */\n";
3244 lexer_test test (case_, content, NULL);
3245
3246 /* Verify that we get the expected token back, with the correct
3247 location information. */
3248 const cpp_token *tok = test.get_token ();
3249 ASSERT_EQ (tok->type, CPP_STRING16);
3250 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u\"0123456789\"");
3251
3252 /* Verify that cpp_interpret_string works, using CPP_STRING16. */
3253 cpp_string dst_string;
3254 const enum cpp_ttype type = CPP_STRING16;
3255 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3256 &dst_string, type);
3257 ASSERT_TRUE (result);
3258
3259 /* The cpp_reader defaults to big-endian, so dst_string should
3260 now be encoded as UTF-16BE. */
3261 const uint16_t *be16_chars = (const uint16_t *)dst_string.text;
3262 ASSERT_EQ ('0', uint16_from_big_endian (&be16_chars[0]));
3263 ASSERT_EQ ('5', uint16_from_big_endian (&be16_chars[5]));
3264 ASSERT_EQ ('9', uint16_from_big_endian (&be16_chars[9]));
3265 ASSERT_EQ (0, uint16_from_big_endian (&be16_chars[10]));
3266 free (ptr: const_cast <unsigned char *> (dst_string.text));
3267
3268 /* We don't yet support generating substring location information
3269 for L"" strings. */
3270 ASSERT_HAS_NO_SUBSTRING_RANGES
3271 (test, tok->src_loc, type,
3272 "execution character set != source character set");
3273}
3274
3275/* Lex a U"" string literal and verify that attempts to read substring
3276 location data from it fail gracefully. */
3277
3278static void
3279test_lexer_string_locations_string32 (const line_table_case &case_)
3280{
3281 /* Digits 0-9.
3282 ....................000000000.11111111112.22222222233333
3283 ....................123456789.01234567890.12345678901234 */
3284 const char *content = " U\"0123456789\" /* non-str */\n";
3285 lexer_test test (case_, content, NULL);
3286
3287 /* Verify that we get the expected token back, with the correct
3288 location information. */
3289 const cpp_token *tok = test.get_token ();
3290 ASSERT_EQ (tok->type, CPP_STRING32);
3291 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U\"0123456789\"");
3292
3293 /* Verify that cpp_interpret_string works, using CPP_STRING32. */
3294 cpp_string dst_string;
3295 const enum cpp_ttype type = CPP_STRING32;
3296 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3297 &dst_string, type);
3298 ASSERT_TRUE (result);
3299
3300 /* The cpp_reader defaults to big-endian, so dst_string should
3301 now be encoded as UTF-32BE. */
3302 const uint32_t *be32_chars = (const uint32_t *)dst_string.text;
3303 ASSERT_EQ ('0', uint32_from_big_endian (&be32_chars[0]));
3304 ASSERT_EQ ('5', uint32_from_big_endian (&be32_chars[5]));
3305 ASSERT_EQ ('9', uint32_from_big_endian (&be32_chars[9]));
3306 ASSERT_EQ (0, uint32_from_big_endian (&be32_chars[10]));
3307 free (ptr: const_cast <unsigned char *> (dst_string.text));
3308
3309 /* We don't yet support generating substring location information
3310 for L"" strings. */
3311 ASSERT_HAS_NO_SUBSTRING_RANGES
3312 (test, tok->src_loc, type,
3313 "execution character set != source character set");
3314}
3315
3316/* Lex a u8-string literal.
3317 Verify the substring location data after running cpp_interpret_string
3318 on it. */
3319
3320static void
3321test_lexer_string_locations_u8 (const line_table_case &case_)
3322{
3323 /* Digits 0-9.
3324 ....................000000000.11111111112.22222222233333
3325 ....................123456789.01234567890.12345678901234 */
3326 const char *content = " u8\"0123456789\" /* non-str */\n";
3327 lexer_test test (case_, content, NULL);
3328
3329 /* Verify that we get the expected token back, with the correct
3330 location information. */
3331 const cpp_token *tok = test.get_token ();
3332 ASSERT_EQ (tok->type, CPP_UTF8STRING);
3333 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u8\"0123456789\"");
3334
3335 /* Verify that cpp_interpret_string works. */
3336 cpp_string dst_string;
3337 const enum cpp_ttype type = CPP_STRING;
3338 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3339 &dst_string, type);
3340 ASSERT_TRUE (result);
3341 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3342 free (ptr: const_cast <unsigned char *> (dst_string.text));
3343
3344 /* Verify ranges of individual characters. This no longer includes the
3345 opening quote, but does include the closing quote. */
3346 for (int i = 0; i <= 10; i++)
3347 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3348}
3349
3350/* Lex a string literal containing UTF-8 source characters.
3351 Verify the substring location data after running cpp_interpret_string
3352 on it. */
3353
3354static void
3355test_lexer_string_locations_utf8_source (const line_table_case &case_)
3356{
3357 /* This string literal is written out to the source file as UTF-8,
3358 and is of the form "before mojibake after", where "mojibake"
3359 is written as the following four unicode code points:
3360 U+6587 CJK UNIFIED IDEOGRAPH-6587
3361 U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3362 U+5316 CJK UNIFIED IDEOGRAPH-5316
3363 U+3051 HIRAGANA LETTER KE.
3364 Each of these is 3 bytes wide when encoded in UTF-8, whereas the
3365 "before" and "after" are 1 byte per unicode character.
3366
3367 The numbering shown are "columns", which are *byte* numbers within
3368 the line, rather than unicode character numbers.
3369
3370 .................... 000000000.1111111.
3371 .................... 123456789.0123456. */
3372 const char *content = (" \"before "
3373 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
3374 UTF-8: 0xE6 0x96 0x87
3375 C octal escaped UTF-8: \346\226\207
3376 "column" numbers: 17-19. */
3377 "\346\226\207"
3378
3379 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
3380 UTF-8: 0xE5 0xAD 0x97
3381 C octal escaped UTF-8: \345\255\227
3382 "column" numbers: 20-22. */
3383 "\345\255\227"
3384
3385 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
3386 UTF-8: 0xE5 0x8C 0x96
3387 C octal escaped UTF-8: \345\214\226
3388 "column" numbers: 23-25. */
3389 "\345\214\226"
3390
3391 /* U+3051 HIRAGANA LETTER KE
3392 UTF-8: 0xE3 0x81 0x91
3393 C octal escaped UTF-8: \343\201\221
3394 "column" numbers: 26-28. */
3395 "\343\201\221"
3396
3397 /* column numbers 29 onwards
3398 2333333.33334444444444
3399 9012345.67890123456789. */
3400 " after\" /* non-str */\n");
3401 lexer_test test (case_, content, NULL);
3402
3403 /* Verify that we get the expected token back, with the correct
3404 location information. */
3405 const cpp_token *tok = test.get_token ();
3406 ASSERT_EQ (tok->type, CPP_STRING);
3407 ASSERT_TOKEN_AS_TEXT_EQ
3408 (test.m_parser, tok,
3409 "\"before \346\226\207\345\255\227\345\214\226\343\201\221 after\"");
3410
3411 /* Verify that cpp_interpret_string works. */
3412 cpp_string dst_string;
3413 const enum cpp_ttype type = CPP_STRING;
3414 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3415 &dst_string, type);
3416 ASSERT_TRUE (result);
3417 ASSERT_STREQ
3418 ("before \346\226\207\345\255\227\345\214\226\343\201\221 after",
3419 (const char *)dst_string.text);
3420 free (ptr: const_cast <unsigned char *> (dst_string.text));
3421
3422 /* Verify ranges of individual characters. This no longer includes the
3423 opening quote, but does include the closing quote.
3424 Assuming that both source and execution encodings are UTF-8, we have
3425 a run of 25 octets in each, plus the NUL terminator. */
3426 for (int i = 0; i < 25; i++)
3427 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, i, 1, 10 + i, 10 + i);
3428 /* NUL-terminator should use the closing quote at column 35. */
3429 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, type, 25, 1, 35, 35);
3430
3431 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, type, 26);
3432}
3433
3434/* Test of string literal concatenation. */
3435
3436static void
3437test_lexer_string_locations_concatenation_1 (const line_table_case &case_)
3438{
3439 /* Digits 0-9.
3440 .....................000000000.111111.11112222222222
3441 .....................123456789.012345.67890123456789. */
3442 const char *content = (" \"01234\" /* non-str */\n"
3443 " \"56789\" /* non-str */\n");
3444 lexer_test test (case_, content, NULL);
3445
3446 location_t input_locs[2];
3447
3448 /* Verify that we get the expected tokens back. */
3449 auto_vec <cpp_string> input_strings;
3450 const cpp_token *tok_a = test.get_token ();
3451 ASSERT_EQ (tok_a->type, CPP_STRING);
3452 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_a, "\"01234\"");
3453 input_strings.safe_push (obj: tok_a->val.str);
3454 input_locs[0] = tok_a->src_loc;
3455
3456 const cpp_token *tok_b = test.get_token ();
3457 ASSERT_EQ (tok_b->type, CPP_STRING);
3458 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok_b, "\"56789\"");
3459 input_strings.safe_push (obj: tok_b->val.str);
3460 input_locs[1] = tok_b->src_loc;
3461
3462 /* Verify that cpp_interpret_string works. */
3463 cpp_string dst_string;
3464 const enum cpp_ttype type = CPP_STRING;
3465 bool result = cpp_interpret_string (test.m_parser,
3466 input_strings.address (), 2,
3467 &dst_string, type);
3468 ASSERT_TRUE (result);
3469 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3470 free (ptr: const_cast <unsigned char *> (dst_string.text));
3471
3472 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3473 test.m_concats.record_string_concatenation (num: 2, locs: input_locs);
3474
3475 location_t initial_loc = input_locs[0];
3476
3477 /* "01234" on line 1. */
3478 for (int i = 0; i <= 4; i++)
3479 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3480 /* "56789" in line 2, plus its closing quote for the nul terminator. */
3481 for (int i = 5; i <= 10; i++)
3482 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 2, 5 + i, 5 + i);
3483
3484 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3485}
3486
3487/* Another test of string literal concatenation. */
3488
3489static void
3490test_lexer_string_locations_concatenation_2 (const line_table_case &case_)
3491{
3492 /* Digits 0-9.
3493 .....................000000000.111.11111112222222
3494 .....................123456789.012.34567890123456. */
3495 const char *content = (" \"01\" /* non-str */\n"
3496 " \"23\" /* non-str */\n"
3497 " \"45\" /* non-str */\n"
3498 " \"67\" /* non-str */\n"
3499 " \"89\" /* non-str */\n");
3500 lexer_test test (case_, content, NULL);
3501
3502 auto_vec <cpp_string> input_strings;
3503 location_t input_locs[5];
3504
3505 /* Verify that we get the expected tokens back. */
3506 for (int i = 0; i < 5; i++)
3507 {
3508 const cpp_token *tok = test.get_token ();
3509 ASSERT_EQ (tok->type, CPP_STRING);
3510 input_strings.safe_push (obj: tok->val.str);
3511 input_locs[i] = tok->src_loc;
3512 }
3513
3514 /* Verify that cpp_interpret_string works. */
3515 cpp_string dst_string;
3516 const enum cpp_ttype type = CPP_STRING;
3517 bool result = cpp_interpret_string (test.m_parser,
3518 input_strings.address (), 5,
3519 &dst_string, type);
3520 ASSERT_TRUE (result);
3521 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3522 free (ptr: const_cast <unsigned char *> (dst_string.text));
3523
3524 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3525 test.m_concats.record_string_concatenation (num: 5, locs: input_locs);
3526
3527 location_t initial_loc = input_locs[0];
3528
3529 /* Within ASSERT_CHAR_AT_RANGE (actually assert_char_at_range), we can
3530 detect if the initial loc is after LINE_MAP_MAX_LOCATION_WITH_COLS
3531 and expect get_source_range_for_substring to fail.
3532 However, for a string concatenation test, we can have a case
3533 where the initial string is fully before LINE_MAP_MAX_LOCATION_WITH_COLS,
3534 but subsequent strings can be after it.
3535 Attempting to detect this within assert_char_at_range
3536 would overcomplicate the logic for the common test cases, so
3537 we detect it here. */
3538 if (should_have_column_data_p (loc: input_locs[0])
3539 && !should_have_column_data_p (loc: input_locs[4]))
3540 {
3541 /* Verify that get_source_range_for_substring gracefully rejects
3542 this case. */
3543 source_range actual_range;
3544 const char *err
3545 = get_source_range_for_char (pfile: test.m_parser, concats: &test.m_concats,
3546 strloc: initial_loc, type, char_idx: 0, out_range: &actual_range);
3547 ASSERT_STREQ ("range starts after LINE_MAP_MAX_LOCATION_WITH_COLS", err);
3548 return;
3549 }
3550
3551 for (int i = 0; i < 5; i++)
3552 for (int j = 0; j < 2; j++)
3553 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, (i * 2) + j,
3554 i + 1, 10 + j, 10 + j);
3555
3556 /* NUL-terminator should use the final closing quote at line 5 column 12. */
3557 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 5, 12, 12);
3558
3559 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3560}
3561
3562/* Another test of string literal concatenation, this time combined with
3563 various kinds of escaped characters. */
3564
3565static void
3566test_lexer_string_locations_concatenation_3 (const line_table_case &case_)
3567{
3568 /* Digits 0-9, expressing digit 5 in ASCII as hex "\x35"
3569 digit 6 in ASCII as octal "\066", concatenating multiple strings. */
3570 const char *content
3571 /* .000000000.111111.111.1.2222.222.2.2233.333.3333.34444444444555
3572 .123456789.012345.678.9.0123.456.7.8901.234.5678.90123456789012. */
3573 = (" \"01234\" \"\\x35\" \"\\066\" \"789\" /* non-str */\n");
3574 lexer_test test (case_, content, NULL);
3575
3576 auto_vec <cpp_string> input_strings;
3577 location_t input_locs[4];
3578
3579 /* Verify that we get the expected tokens back. */
3580 for (int i = 0; i < 4; i++)
3581 {
3582 const cpp_token *tok = test.get_token ();
3583 ASSERT_EQ (tok->type, CPP_STRING);
3584 input_strings.safe_push (obj: tok->val.str);
3585 input_locs[i] = tok->src_loc;
3586 }
3587
3588 /* Verify that cpp_interpret_string works. */
3589 cpp_string dst_string;
3590 const enum cpp_ttype type = CPP_STRING;
3591 bool result = cpp_interpret_string (test.m_parser,
3592 input_strings.address (), 4,
3593 &dst_string, type);
3594 ASSERT_TRUE (result);
3595 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3596 free (ptr: const_cast <unsigned char *> (dst_string.text));
3597
3598 /* Simulate c-lex.cc's lex_string in order to record concatenation. */
3599 test.m_concats.record_string_concatenation (num: 4, locs: input_locs);
3600
3601 location_t initial_loc = input_locs[0];
3602
3603 for (int i = 0; i <= 4; i++)
3604 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 10 + i, 10 + i);
3605 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 5, 1, 19, 22);
3606 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 6, 1, 27, 30);
3607 for (int i = 7; i <= 9; i++)
3608 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, i, 1, 28 + i, 28 + i);
3609
3610 /* NUL-terminator should use the location of the final closing quote. */
3611 ASSERT_CHAR_AT_RANGE (test, initial_loc, type, 10, 1, 38, 38);
3612
3613 ASSERT_NUM_SUBSTRING_RANGES (test, initial_loc, type, 11);
3614}
3615
3616/* Test of string literal in a macro. */
3617
3618static void
3619test_lexer_string_locations_macro (const line_table_case &case_)
3620{
3621 /* Digits 0-9.
3622 .....................0000000001111111111.22222222223.
3623 .....................1234567890123456789.01234567890. */
3624 const char *content = ("#define MACRO \"0123456789\" /* non-str */\n"
3625 " MACRO");
3626 lexer_test test (case_, content, NULL);
3627
3628 /* Verify that we get the expected tokens back. */
3629 const cpp_token *tok = test.get_token ();
3630 ASSERT_EQ (tok->type, CPP_PADDING);
3631
3632 tok = test.get_token ();
3633 ASSERT_EQ (tok->type, CPP_STRING);
3634 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"0123456789\"");
3635
3636 /* Verify ranges of individual characters. We ought to
3637 see columns within the macro definition. */
3638 for (int i = 0; i <= 10; i++)
3639 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3640 i, 1, 20 + i, 20 + i);
3641
3642 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3643
3644 tok = test.get_token ();
3645 ASSERT_EQ (tok->type, CPP_PADDING);
3646}
3647
3648/* Test of stringification of a macro argument. */
3649
3650static void
3651test_lexer_string_locations_stringified_macro_argument
3652 (const line_table_case &case_)
3653{
3654 /* .....................000000000111111111122222222223.
3655 .....................123456789012345678901234567890. */
3656 const char *content = ("#define MACRO(X) #X /* non-str */\n"
3657 "MACRO(foo)\n");
3658 lexer_test test (case_, content, NULL);
3659
3660 /* Verify that we get the expected token back. */
3661 const cpp_token *tok = test.get_token ();
3662 ASSERT_EQ (tok->type, CPP_PADDING);
3663
3664 tok = test.get_token ();
3665 ASSERT_EQ (tok->type, CPP_STRING);
3666 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "\"foo\"");
3667
3668 /* We don't support getting the location of a stringified macro
3669 argument. Verify that it fails gracefully. */
3670 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3671 "cpp_interpret_string_1 failed");
3672
3673 tok = test.get_token ();
3674 ASSERT_EQ (tok->type, CPP_PADDING);
3675
3676 tok = test.get_token ();
3677 ASSERT_EQ (tok->type, CPP_PADDING);
3678}
3679
3680/* Ensure that we are fail gracefully if something attempts to pass
3681 in a location that isn't a string literal token. Seen on this code:
3682
3683 const char a[] = " %d ";
3684 __builtin_printf (a, 0.5);
3685 ^
3686
3687 when c-format.cc erroneously used the indicated one-character
3688 location as the format string location, leading to a read past the
3689 end of a string buffer in cpp_interpret_string_1. */
3690
3691static void
3692test_lexer_string_locations_non_string (const line_table_case &case_)
3693{
3694 /* .....................000000000111111111122222222223.
3695 .....................123456789012345678901234567890. */
3696 const char *content = (" a\n");
3697 lexer_test test (case_, content, NULL);
3698
3699 /* Verify that we get the expected token back. */
3700 const cpp_token *tok = test.get_token ();
3701 ASSERT_EQ (tok->type, CPP_NAME);
3702 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "a");
3703
3704 /* At this point, libcpp is attempting to interpret the name as a
3705 string literal, despite it not starting with a quote. We don't detect
3706 that, but we should at least fail gracefully. */
3707 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING,
3708 "cpp_interpret_string_1 failed");
3709}
3710
3711/* Ensure that we can read substring information for a token which
3712 starts in one linemap and ends in another . Adapted from
3713 gcc.dg/cpp/pr69985.c. */
3714
3715static void
3716test_lexer_string_locations_long_line (const line_table_case &case_)
3717{
3718 /* .....................000000.000111111111
3719 .....................123456.789012346789. */
3720 const char *content = ("/* A very long line, so that we start a new line map. */\n"
3721 " \"0123456789012345678901234567890123456789"
3722 "0123456789012345678901234567890123456789"
3723 "0123456789012345678901234567890123456789"
3724 "0123456789\"\n");
3725
3726 lexer_test test (case_, content, NULL);
3727
3728 /* Verify that we get the expected token back. */
3729 const cpp_token *tok = test.get_token ();
3730 ASSERT_EQ (tok->type, CPP_STRING);
3731
3732 if (!should_have_column_data_p (loc: line_table->highest_location))
3733 return;
3734
3735 /* Verify ranges of individual characters. */
3736 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 131);
3737 for (int i = 0; i < 131; i++)
3738 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3739 i, 2, 7 + i, 7 + i);
3740}
3741
3742/* Test of locations within a raw string that doesn't contain a newline. */
3743
3744static void
3745test_lexer_string_locations_raw_string_one_line (const line_table_case &case_)
3746{
3747 /* .....................00.0000000111111111122.
3748 .....................12.3456789012345678901. */
3749 const char *content = ("R\"foo(0123456789)foo\"\n");
3750 lexer_test test (case_, content, NULL);
3751
3752 /* Verify that we get the expected token back. */
3753 const cpp_token *tok = test.get_token ();
3754 ASSERT_EQ (tok->type, CPP_STRING);
3755
3756 /* Verify that cpp_interpret_string works. */
3757 cpp_string dst_string;
3758 const enum cpp_ttype type = CPP_STRING;
3759 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3760 &dst_string, type);
3761 ASSERT_TRUE (result);
3762 ASSERT_STREQ ("0123456789", (const char *)dst_string.text);
3763 free (ptr: const_cast <unsigned char *> (dst_string.text));
3764
3765 if (!should_have_column_data_p (loc: line_table->highest_location))
3766 return;
3767
3768 /* 0-9, plus the nil terminator. */
3769 ASSERT_NUM_SUBSTRING_RANGES (test, tok->src_loc, CPP_STRING, 11);
3770 for (int i = 0; i < 11; i++)
3771 ASSERT_CHAR_AT_RANGE (test, tok->src_loc, CPP_STRING,
3772 i, 1, 7 + i, 7 + i);
3773}
3774
3775/* Test of locations within a raw string that contains a newline. */
3776
3777static void
3778test_lexer_string_locations_raw_string_multiline (const line_table_case &case_)
3779{
3780 /* .....................00.0000.
3781 .....................12.3456. */
3782 const char *content = ("R\"foo(\n"
3783 /* .....................00000.
3784 .....................12345. */
3785 "hello\n"
3786 "world\n"
3787 /* .....................00000.
3788 .....................12345. */
3789 ")foo\"\n");
3790 lexer_test test (case_, content, NULL);
3791
3792 /* Verify that we get the expected token back. */
3793 const cpp_token *tok = test.get_token ();
3794 ASSERT_EQ (tok->type, CPP_STRING);
3795
3796 /* Verify that cpp_interpret_string works. */
3797 cpp_string dst_string;
3798 const enum cpp_ttype type = CPP_STRING;
3799 bool result = cpp_interpret_string (test.m_parser, &tok->val.str, 1,
3800 &dst_string, type);
3801 ASSERT_TRUE (result);
3802 ASSERT_STREQ ("\nhello\nworld\n", (const char *)dst_string.text);
3803 free (ptr: const_cast <unsigned char *> (dst_string.text));
3804
3805 if (!should_have_column_data_p (loc: line_table->highest_location))
3806 return;
3807
3808 /* Currently we don't support locations within raw strings that
3809 contain newlines. */
3810 ASSERT_HAS_NO_SUBSTRING_RANGES (test, tok->src_loc, tok->type,
3811 "range endpoints are on different lines");
3812}
3813
3814/* Test of parsing an unterminated raw string. */
3815
3816static void
3817test_lexer_string_locations_raw_string_unterminated (const line_table_case &case_)
3818{
3819 const char *content = "R\"ouch()ouCh\" /* etc */";
3820
3821 lexer_diagnostic_sink diagnostics;
3822 lexer_test test (case_, content, &diagnostics);
3823 test.m_implicitly_expect_EOF = false;
3824
3825 /* Attempt to parse the raw string. */
3826 const cpp_token *tok = test.get_token ();
3827 ASSERT_EQ (tok->type, CPP_EOF);
3828
3829 ASSERT_EQ (1, diagnostics.m_diagnostics.length ());
3830 /* We expect the message "unterminated raw string"
3831 in the "cpplib" translation domain.
3832 It's not clear that dgettext is available on all supported hosts,
3833 so this assertion is commented-out for now.
3834 ASSERT_STREQ (dgettext ("cpplib", "unterminated raw string"),
3835 diagnostics.m_diagnostics[0]);
3836 */
3837}
3838
3839/* Test of lexing char constants. */
3840
3841static void
3842test_lexer_char_constants (const line_table_case &case_)
3843{
3844 /* Various char constants.
3845 .....................0000000001111111111.22222222223.
3846 .....................1234567890123456789.01234567890. */
3847 const char *content = (" 'a'\n"
3848 " u'a'\n"
3849 " U'a'\n"
3850 " L'a'\n"
3851 " 'abc'\n");
3852 lexer_test test (case_, content, NULL);
3853
3854 /* Verify that we get the expected tokens back. */
3855 /* 'a'. */
3856 const cpp_token *tok = test.get_token ();
3857 ASSERT_EQ (tok->type, CPP_CHAR);
3858 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'a'");
3859
3860 unsigned int chars_seen;
3861 int unsignedp;
3862 cppchar_t cc = cpp_interpret_charconst (test.m_parser, tok,
3863 &chars_seen, &unsignedp);
3864 ASSERT_EQ (cc, 'a');
3865 ASSERT_EQ (chars_seen, 1);
3866
3867 /* u'a'. */
3868 tok = test.get_token ();
3869 ASSERT_EQ (tok->type, CPP_CHAR16);
3870 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "u'a'");
3871
3872 /* U'a'. */
3873 tok = test.get_token ();
3874 ASSERT_EQ (tok->type, CPP_CHAR32);
3875 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "U'a'");
3876
3877 /* L'a'. */
3878 tok = test.get_token ();
3879 ASSERT_EQ (tok->type, CPP_WCHAR);
3880 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "L'a'");
3881
3882 /* 'abc' (c-char-sequence). */
3883 tok = test.get_token ();
3884 ASSERT_EQ (tok->type, CPP_CHAR);
3885 ASSERT_TOKEN_AS_TEXT_EQ (test.m_parser, tok, "'abc'");
3886}
/* A table of interesting location_t values, giving one axis of our test
   matrix.  for_each_line_table_case pairs every entry with each
   default_range_bits setting and asserts the total number of cases, so
   that count must be kept in step with the number of entries here
   (currently 12).  */

static const location_t boundary_locations[] = {
  /* Zero means "don't override the default values for a new line_table".  */
  0,

  /* An arbitrary non-zero value that isn't close to one of
     the boundary values below.  */
  0x10000,

  /* Values near LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES: just below,
     exactly at, and just above the limit.  */
  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 0x100,
  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES - 1,
  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES,
  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 1,
  LINE_MAP_MAX_LOCATION_WITH_PACKED_RANGES + 0x100,

  /* Values near LINE_MAP_MAX_LOCATION_WITH_COLS: just below, exactly
     at, and just above the limit.  */
  LINE_MAP_MAX_LOCATION_WITH_COLS - 0x100,
  LINE_MAP_MAX_LOCATION_WITH_COLS - 1,
  LINE_MAP_MAX_LOCATION_WITH_COLS,
  LINE_MAP_MAX_LOCATION_WITH_COLS + 1,
  LINE_MAP_MAX_LOCATION_WITH_COLS + 0x100,
};
3912
3913/* Run TESTCASE multiple times, once for each case in our test matrix. */
3914
3915void
3916for_each_line_table_case (void (*testcase) (const line_table_case &))
3917{
3918 /* As noted above in the description of struct line_table_case,
3919 we want to explore a test matrix of interesting line_table
3920 situations, running various selftests for each case within the
3921 matrix. */
3922
3923 /* Run all tests with:
3924 (a) line_table->default_range_bits == 0, and
3925 (b) line_table->default_range_bits == 5. */
3926 int num_cases_tested = 0;
3927 for (int default_range_bits = 0; default_range_bits <= 5;
3928 default_range_bits += 5)
3929 {
3930 /* ...and use each of the "interesting" location values as
3931 the starting location within line_table. */
3932 const int num_boundary_locations = ARRAY_SIZE (boundary_locations);
3933 for (int loc_idx = 0; loc_idx < num_boundary_locations; loc_idx++)
3934 {
3935 line_table_case c (default_range_bits, boundary_locations[loc_idx]);
3936
3937 testcase (c);
3938
3939 num_cases_tested++;
3940 }
3941 }
3942
3943 /* Verify that we fully covered the test matrix. */
3944 ASSERT_EQ (num_cases_tested, 2 * 12);
3945}
3946
3947/* Verify that when presented with a consecutive pair of locations with
3948 a very large line offset, we don't attempt to consolidate them into
3949 a single ordinary linemap where the line offsets within the line map
3950 would lead to overflow (PR lto/88147). */
3951
3952static void
3953test_line_offset_overflow ()
3954{
3955 line_table_test ltt (line_table_case (5, 0));
3956
3957 linemap_add (line_table, LC_ENTER, sysp: false, to_file: "foo.c", to_line: 0);
3958 linemap_line_start (set: line_table, to_line: 1, max_column_hint: 100);
3959 location_t loc_a = linemap_line_start (set: line_table, to_line: 2578, max_column_hint: 255);
3960 assert_loceq (exp_filename: "foo.c", exp_linenum: 2578, exp_colnum: 0, loc: loc_a);
3961
3962 const line_map_ordinary *ordmap_a = LINEMAPS_LAST_ORDINARY_MAP (set: line_table);
3963 ASSERT_EQ (ordmap_a->m_column_and_range_bits, 13);
3964 ASSERT_EQ (ordmap_a->m_range_bits, 5);
3965
3966 location_t loc_b = linemap_line_start (set: line_table, to_line: 404198, max_column_hint: 512);
3967 assert_loceq (exp_filename: "foo.c", exp_linenum: 404198, exp_colnum: 0, loc: loc_b);
3968
3969 /* We should have started a new linemap, rather than attempting to store
3970 a very large line offset. */
3971 const line_map_ordinary *ordmap_b = LINEMAPS_LAST_ORDINARY_MAP (set: line_table);
3972 ASSERT_NE (ordmap_a, ordmap_b);
3973}
3974
3975void test_cpp_utf8 ()
3976{
3977 const int def_tabstop = 8;
3978 cpp_char_column_policy policy (def_tabstop, cpp_wcwidth);
3979
3980 /* Verify that wcwidth of invalid UTF-8 or control bytes is 1. */
3981 {
3982 int w_bad = cpp_display_width (data: "\xf0!\x9f!\x98!\x82!", data_length: 8, policy);
3983 ASSERT_EQ (8, w_bad);
3984 int w_ctrl = cpp_display_width (data: "\r\n\v\0\1", data_length: 5, policy);
3985 ASSERT_EQ (5, w_ctrl);
3986 }
3987
3988 /* Verify that wcwidth of valid UTF-8 is as expected. */
3989 {
3990 const int w_pi = cpp_display_width (data: "\xcf\x80", data_length: 2, policy);
3991 ASSERT_EQ (1, w_pi);
3992 const int w_emoji = cpp_display_width (data: "\xf0\x9f\x98\x82", data_length: 4, policy);
3993 ASSERT_EQ (2, w_emoji);
3994 const int w_umlaut_precomposed = cpp_display_width (data: "\xc3\xbf", data_length: 2,
3995 policy);
3996 ASSERT_EQ (1, w_umlaut_precomposed);
3997 const int w_umlaut_combining = cpp_display_width (data: "y\xcc\x88", data_length: 3,
3998 policy);
3999 ASSERT_EQ (1, w_umlaut_combining);
4000 const int w_han = cpp_display_width (data: "\xe4\xb8\xba", data_length: 3, policy);
4001 ASSERT_EQ (2, w_han);
4002 const int w_ascii = cpp_display_width (data: "GCC", data_length: 3, policy);
4003 ASSERT_EQ (3, w_ascii);
4004 const int w_mixed = cpp_display_width (data: "\xcf\x80 = 3.14 \xf0\x9f\x98\x82"
4005 "\x9f! \xe4\xb8\xba y\xcc\x88",
4006 data_length: 24, policy);
4007 ASSERT_EQ (18, w_mixed);
4008 }
4009
4010 /* Verify that display width properly expands tabs. */
4011 {
4012 const char *tstr = "\tabc\td";
4013 ASSERT_EQ (6, cpp_display_width (tstr, 6,
4014 cpp_char_column_policy (1, cpp_wcwidth)));
4015 ASSERT_EQ (10, cpp_display_width (tstr, 6,
4016 cpp_char_column_policy (3, cpp_wcwidth)));
4017 ASSERT_EQ (17, cpp_display_width (tstr, 6,
4018 cpp_char_column_policy (8, cpp_wcwidth)));
4019 ASSERT_EQ (1,
4020 cpp_display_column_to_byte_column
4021 (tstr, 6, 7, cpp_char_column_policy (8, cpp_wcwidth)));
4022 }
4023
4024 /* Verify that cpp_byte_column_to_display_column can go past the end,
4025 and similar edge cases. */
4026 {
4027 const char *str
4028 /* Display columns.
4029 111111112345 */
4030 = "\xcf\x80 abc";
4031 /* 111122223456
4032 Byte columns. */
4033
4034 ASSERT_EQ (5, cpp_display_width (str, 6, policy));
4035 ASSERT_EQ (105,
4036 cpp_byte_column_to_display_column (str, 6, 106, policy));
4037 ASSERT_EQ (10000,
4038 cpp_byte_column_to_display_column (NULL, 0, 10000, policy));
4039 ASSERT_EQ (0,
4040 cpp_byte_column_to_display_column (NULL, 10000, 0, policy));
4041 }
4042
4043 /* Verify that cpp_display_column_to_byte_column can go past the end,
4044 and similar edge cases, and check invertibility. */
4045 {
4046 const char *str
4047 /* Display columns.
4048 000000000000000000000000000000000000011
4049 111111112222222234444444455555555678901 */
4050 = "\xf0\x9f\x98\x82 \xf0\x9f\x98\x82 hello";
4051 /* 000000000000000000000000000000000111111
4052 111122223333444456666777788889999012345
4053 Byte columns. */
4054 ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 2, policy));
4055 ASSERT_EQ (15,
4056 cpp_display_column_to_byte_column (str, 15, 11, policy));
4057 ASSERT_EQ (115,
4058 cpp_display_column_to_byte_column (str, 15, 111, policy));
4059 ASSERT_EQ (10000,
4060 cpp_display_column_to_byte_column (NULL, 0, 10000, policy));
4061 ASSERT_EQ (0,
4062 cpp_display_column_to_byte_column (NULL, 10000, 0, policy));
4063
4064 /* Verify that we do not interrupt a UTF-8 sequence. */
4065 ASSERT_EQ (4, cpp_display_column_to_byte_column (str, 15, 1, policy));
4066
4067 for (int byte_col = 1; byte_col <= 15; ++byte_col)
4068 {
4069 const int disp_col
4070 = cpp_byte_column_to_display_column (data: str, data_length: 15, column: byte_col, policy);
4071 const int byte_col2
4072 = cpp_display_column_to_byte_column (data: str, data_length: 15, display_col: disp_col, policy);
4073
4074 /* If we ask for the display column in the middle of a UTF-8
4075 sequence, it will return the length of the partial sequence,
4076 matching the behavior of GCC before display column support.
4077 Otherwise check the round trip was successful. */
4078 if (byte_col < 4)
4079 ASSERT_EQ (byte_col, disp_col);
4080 else if (byte_col >= 6 && byte_col < 9)
4081 ASSERT_EQ (3 + (byte_col - 5), disp_col);
4082 else
4083 ASSERT_EQ (byte_col2, byte_col);
4084 }
4085 }
4086}
4087
4088static bool
4089check_cpp_valid_utf8_p (const char *str)
4090{
4091 return cpp_valid_utf8_p (data: str, num_bytes: strlen (s: str));
4092}
4093
4094/* Check that cpp_valid_utf8_p works as expected. */
4095
4096static void
4097test_cpp_valid_utf8_p ()
4098{
4099 ASSERT_TRUE (check_cpp_valid_utf8_p ("hello world"));
4100
4101 /* 2-byte char (pi). */
4102 ASSERT_TRUE (check_cpp_valid_utf8_p("\xcf\x80"));
4103
4104 /* 3-byte chars (the Japanese word "mojibake"). */
4105 ASSERT_TRUE (check_cpp_valid_utf8_p
4106 (
4107 /* U+6587 CJK UNIFIED IDEOGRAPH-6587
4108 UTF-8: 0xE6 0x96 0x87
4109 C octal escaped UTF-8: \346\226\207. */
4110 "\346\226\207"
4111 /* U+5B57 CJK UNIFIED IDEOGRAPH-5B57
4112 UTF-8: 0xE5 0xAD 0x97
4113 C octal escaped UTF-8: \345\255\227. */
4114 "\345\255\227"
4115 /* U+5316 CJK UNIFIED IDEOGRAPH-5316
4116 UTF-8: 0xE5 0x8C 0x96
4117 C octal escaped UTF-8: \345\214\226. */
4118 "\345\214\226"
4119 /* U+3051 HIRAGANA LETTER KE
4120 UTF-8: 0xE3 0x81 0x91
4121 C octal escaped UTF-8: \343\201\221. */
4122 "\343\201\221"));
4123
4124 /* 4-byte char: an emoji. */
4125 ASSERT_TRUE (check_cpp_valid_utf8_p ("\xf0\x9f\x98\x82"));
4126
4127 /* Control codes, including the NUL byte. */
4128 ASSERT_TRUE (cpp_valid_utf8_p ("\r\n\v\0\1", 5));
4129
4130 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xf0!\x9f!\x98!\x82!"));
4131
4132 /* Unexpected continuation bytes. */
4133 for (unsigned char continuation_byte = 0x80;
4134 continuation_byte <= 0xbf;
4135 continuation_byte++)
4136 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)&continuation_byte, 1));
4137
4138 /* "Lonely start characters" for 2-byte sequences. */
4139 {
4140 unsigned char buf[2];
4141 buf[1] = ' ';
4142 for (buf[0] = 0xc0;
4143 buf[0] <= 0xdf;
4144 buf[0]++)
4145 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4146 }
4147
4148 /* "Lonely start characters" for 3-byte sequences. */
4149 {
4150 unsigned char buf[2];
4151 buf[1] = ' ';
4152 for (buf[0] = 0xe0;
4153 buf[0] <= 0xef;
4154 buf[0]++)
4155 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4156 }
4157
4158 /* "Lonely start characters" for 4-byte sequences. */
4159 {
4160 unsigned char buf[2];
4161 buf[1] = ' ';
4162 for (buf[0] = 0xf0;
4163 buf[0] <= 0xf4;
4164 buf[0]++)
4165 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4166 }
4167
4168 /* Invalid start characters (formerly valid for 5-byte and 6-byte
4169 sequences). */
4170 {
4171 unsigned char buf[2];
4172 buf[1] = ' ';
4173 for (buf[0] = 0xf5;
4174 buf[0] <= 0xfd;
4175 buf[0]++)
4176 ASSERT_FALSE (cpp_valid_utf8_p ((const char *)buf, 2));
4177 }
4178
4179 /* Impossible bytes. */
4180 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc0"));
4181 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xc1"));
4182 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xfe"));
4183 ASSERT_FALSE (check_cpp_valid_utf8_p ("\xff"));
4184}
4185
4186/* Run all of the selftests within this file. */
4187
4188void
4189input_cc_tests ()
4190{
4191 test_linenum_comparisons ();
4192 test_should_have_column_data_p ();
4193 test_unknown_location ();
4194 test_builtins ();
4195 for_each_line_table_case (testcase: test_make_location_nonpure_range_endpoints);
4196
4197 for_each_line_table_case (testcase: test_accessing_ordinary_linemaps);
4198 for_each_line_table_case (testcase: test_lexer);
4199 for_each_line_table_case (testcase: test_lexer_string_locations_simple);
4200 for_each_line_table_case (testcase: test_lexer_string_locations_ebcdic);
4201 for_each_line_table_case (testcase: test_lexer_string_locations_hex);
4202 for_each_line_table_case (testcase: test_lexer_string_locations_oct);
4203 for_each_line_table_case (testcase: test_lexer_string_locations_letter_escape_1);
4204 for_each_line_table_case (testcase: test_lexer_string_locations_letter_escape_2);
4205 for_each_line_table_case (testcase: test_lexer_string_locations_ucn4);
4206 for_each_line_table_case (testcase: test_lexer_string_locations_ucn8);
4207 for_each_line_table_case (testcase: test_lexer_string_locations_wide_string);
4208 for_each_line_table_case (testcase: test_lexer_string_locations_string16);
4209 for_each_line_table_case (testcase: test_lexer_string_locations_string32);
4210 for_each_line_table_case (testcase: test_lexer_string_locations_u8);
4211 for_each_line_table_case (testcase: test_lexer_string_locations_utf8_source);
4212 for_each_line_table_case (testcase: test_lexer_string_locations_concatenation_1);
4213 for_each_line_table_case (testcase: test_lexer_string_locations_concatenation_2);
4214 for_each_line_table_case (testcase: test_lexer_string_locations_concatenation_3);
4215 for_each_line_table_case (testcase: test_lexer_string_locations_macro);
4216 for_each_line_table_case (testcase: test_lexer_string_locations_stringified_macro_argument);
4217 for_each_line_table_case (testcase: test_lexer_string_locations_non_string);
4218 for_each_line_table_case (testcase: test_lexer_string_locations_long_line);
4219 for_each_line_table_case (testcase: test_lexer_string_locations_raw_string_one_line);
4220 for_each_line_table_case (testcase: test_lexer_string_locations_raw_string_multiline);
4221 for_each_line_table_case (testcase: test_lexer_string_locations_raw_string_unterminated);
4222 for_each_line_table_case (testcase: test_lexer_char_constants);
4223
4224 test_reading_source_line ();
4225
4226 test_line_offset_overflow ();
4227
4228 test_cpp_utf8 ();
4229 test_cpp_valid_utf8_p ();
4230}
4231
4232} // namespace selftest
4233
4234#endif /* CHECKING_P */
4235

/* Source code of gcc/input.cc.  */