1/* Find near-matches for strings.
2 Copyright (C) 2015-2023 Free Software Foundation, Inc.
3
4This file is part of GCC.
5
6GCC is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free
8Software Foundation; either version 3, or (at your option) any later
9version.
10
11GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12WARRANTY; without even the implied warranty of MERCHANTABILITY or
13FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14for more details.
15
16You should have received a copy of the GNU General Public License
17along with GCC; see the file COPYING3. If not see
18<http://www.gnu.org/licenses/>. */
19
20#include "config.h"
21#include "system.h"
22#include "coretypes.h"
23#include "tm.h"
24#include "tree.h"
25#include "spellcheck.h"
26#include "selftest.h"
27
28/* Cost of a case transformation. */
29#define CASE_COST 1
30
31/* Cost of another kind of edit. */
32#define BASE_COST 2
33
34/* Get the edit distance between the two strings: the minimal
35 number of edits that are needed to change one string into another,
36 where edits can be one-character insertions, removals, or substitutions,
37 or transpositions of two adjacent characters (counting as one "edit").
38
39 This implementation uses a modified variant of the Wagner-Fischer
40 algorithm for the Damerau-Levenshtein distance; specifically, the
41 "optimal string alignment distance" or "restricted edit distance"
42 variant. This implementation has been further modified to take
43 case into account. */
44
45edit_distance_t
46get_edit_distance (const char *s, int len_s,
47 const char *t, int len_t)
48{
49 const bool debug = false;
50
51 if (debug)
52 {
53 printf (format: "s: \"%s\" (len_s=%i)\n", s, len_s);
54 printf (format: "t: \"%s\" (len_t=%i)\n", t, len_t);
55 }
56
57 if (len_s == 0)
58 return BASE_COST * len_t;
59 if (len_t == 0)
60 return BASE_COST * len_s;
61
62 /* We effectively build a matrix where each (i, j) contains the
63 distance between the prefix strings s[0:j] and t[0:i].
64 Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
65 we simply keep track of the last two rows, v_one_ago and v_two_ago,
66 and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1)
67 allocation and memory accesses in favor of three (len_s + 1)
68 allocations. These could potentially be
69 statically-allocated if we impose a maximum length on the
70 strings of interest. */
71 edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1];
72 edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1];
73 edit_distance_t *v_next = new edit_distance_t[len_s + 1];
74
75 /* The first row is for the case of an empty target string, which
76 we can reach by deleting every character in the source string. */
77 for (int i = 0; i < len_s + 1; i++)
78 v_one_ago[i] = i * BASE_COST;
79
80 /* Build successive rows. */
81 for (int i = 0; i < len_t; i++)
82 {
83 if (debug)
84 {
85 printf (format: "i:%i v_one_ago = ", i);
86 for (int j = 0; j < len_s + 1; j++)
87 printf (format: "%i ", v_one_ago[j]);
88 printf (format: "\n");
89 }
90
91 /* The initial column is for the case of an empty source string; we
92 can reach prefixes of the target string of length i
93 by inserting i characters. */
94 v_next[0] = (i + 1) * BASE_COST;
95
96 /* Build the rest of the row by considering neighbors to
97 the north, west and northwest. */
98 for (int j = 0; j < len_s; j++)
99 {
100 edit_distance_t cost;
101
102 if (s[j] == t[i])
103 cost = 0;
104 else if (TOLOWER (s[j]) == TOLOWER (t[i]))
105 cost = CASE_COST;
106 else
107 cost = BASE_COST;
108 edit_distance_t deletion = v_next[j] + BASE_COST;
109 edit_distance_t insertion = v_one_ago[j + 1] + BASE_COST;
110 edit_distance_t substitution = v_one_ago[j] + cost;
111 edit_distance_t cheapest = MIN (deletion, insertion);
112 cheapest = MIN (cheapest, substitution);
113 if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i])
114 {
115 edit_distance_t transposition = v_two_ago[j - 1] + BASE_COST;
116 cheapest = MIN (cheapest, transposition);
117 }
118 v_next[j + 1] = cheapest;
119 }
120
121 /* Prepare to move on to next row. */
122 for (int j = 0; j < len_s + 1; j++)
123 {
124 v_two_ago[j] = v_one_ago[j];
125 v_one_ago[j] = v_next[j];
126 }
127 }
128
129 if (debug)
130 {
131 printf (format: "final v_next = ");
132 for (int j = 0; j < len_s + 1; j++)
133 printf (format: "%i ", v_next[j]);
134 printf (format: "\n");
135 }
136
137 edit_distance_t result = v_next[len_s];
138 delete[] v_two_ago;
139 delete[] v_one_ago;
140 delete[] v_next;
141 return result;
142}
143
144/* Get the edit distance between two nil-terminated strings. */
145
146edit_distance_t
147get_edit_distance (const char *s, const char *t)
148{
149 return get_edit_distance (s, len_s: strlen (s: s), t, len_t: strlen (s: t));
150}
151
152/* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
153 an autovec of non-NULL strings, determine which element within
154 CANDIDATES has the lowest edit distance to TARGET. If there are
155 multiple elements with the same minimal distance, the first in the
156 vector wins.
157
158 If more than half of the letters were misspelled, the suggestion is
159 likely to be meaningless, so return NULL for this case. */
160
161const char *
162find_closest_string (const char *target,
163 const auto_vec<const char *> *candidates)
164{
165 gcc_assert (target);
166 gcc_assert (candidates);
167
168 int i;
169 const char *candidate;
170 best_match<const char *, const char *> bm (target);
171 FOR_EACH_VEC_ELT (*candidates, i, candidate)
172 {
173 gcc_assert (candidate);
174 bm.consider (candidate);
175 }
176
177 return bm.get_best_meaningful_candidate ();
178}
179
180/* Generate the maximum edit distance for which we consider a suggestion
181 to be meaningful, given a goal of length GOAL_LEN and a candidate of
182 length CANDIDATE_LEN.
183
184 This is a third of the length of the candidate or of the goal,
185 whichever is bigger. */
186
187edit_distance_t
188get_edit_distance_cutoff (size_t goal_len, size_t candidate_len)
189{
190 size_t max_length = MAX (goal_len, candidate_len);
191 size_t min_length = MIN (goal_len, candidate_len);
192
193 gcc_assert (max_length >= min_length);
194
195 /* Special case: don't offer suggestions for a pair of
196 length == 1 strings (or empty strings). */
197 if (max_length <= 1)
198 return 0;
199
200 /* If the lengths are close, then round down. */
201 if (max_length - min_length <= 1)
202 /* ...but allow an edit distance of at least 1. */
203 return BASE_COST * MAX (max_length / 3, 1);
204
205 /* Otherwise, round up (thus giving a little extra leeway to some cases
206 involving insertions/deletions). */
207 return BASE_COST * (max_length + 2) / 3;
208}
209
210#if CHECKING_P
211
212namespace selftest {
213
214/* Selftests. */
215
216/* Verify that get_edit_distance (A, B) equals the expected value. */
217
218static void
219test_get_edit_distance_one_way (const char *a, const char *b,
220 edit_distance_t expected)
221{
222 edit_distance_t actual = get_edit_distance (s: a, t: b);
223 ASSERT_EQ (actual, expected);
224}
225
226/* Verify that both
227 get_edit_distance (A, B)
228 and
229 get_edit_distance (B, A)
230 equal the expected value, to ensure that the function is symmetric. */
231
232static void
233test_get_edit_distance_both_ways (const char *a, const char *b,
234 edit_distance_t expected)
235{
236 test_get_edit_distance_one_way (a, b, expected);
237 test_get_edit_distance_one_way (a: b, b: a, expected);
238}
239
240/* Verify get_edit_distance for a variety of pairs of pre-canned
241 inputs, comparing against known-good values. */
242
243static void
244test_edit_distances ()
245{
246 test_get_edit_distance_both_ways (a: "", b: "nonempty",
247 BASE_COST * strlen (s: "nonempty"));
248 test_get_edit_distance_both_ways (a: "saturday", b: "sunday",
249 BASE_COST * 3);
250 test_get_edit_distance_both_ways (a: "foo", b: "m_foo", BASE_COST * 2);
251 test_get_edit_distance_both_ways (a: "hello_world", b: "HelloWorld", expected: 4);
252 test_get_edit_distance_both_ways
253 (a: "the quick brown fox jumps over the lazy dog", b: "dog", BASE_COST * 40);
254 test_get_edit_distance_both_ways
255 (a: "the quick brown fox jumps over the lazy dog",
256 b: "the quick brown dog jumps over the lazy fox",
257 BASE_COST * 4);
258 test_get_edit_distance_both_ways
259 (a: "Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
260 b: "All your base are belong to us",
261 BASE_COST * 44);
262 test_get_edit_distance_both_ways (a: "foo", b: "FOO", expected: 3);
263 test_get_edit_distance_both_ways (a: "fee", b: "deed", BASE_COST * 2);
264 test_get_edit_distance_both_ways (a: "coorzd1", b: "coordx1", BASE_COST * 2);
265 test_get_edit_distance_both_ways (a: "assert", b: "sqrt", BASE_COST * 3);
266 test_get_edit_distance_both_ways (a: "PATH_MAX", b: "INT8_MAX", BASE_COST * 3);
267 test_get_edit_distance_both_ways (a: "time", b: "nice", BASE_COST * 2);
268 test_get_edit_distance_both_ways (a: "bar", b: "carg", BASE_COST * 2);
269 test_get_edit_distance_both_ways (a: "gtk_widget_show_all",
270 b: "GtkWidgetShowAll", expected: 10);
271 test_get_edit_distance_both_ways (a: "m_bar", b: "bar", BASE_COST * 2);
272 test_get_edit_distance_both_ways (a: "MACRO", b: "MACRAME", BASE_COST * 3);
273 test_get_edit_distance_both_ways (a: "ab", b: "ac", BASE_COST * 1);
274 test_get_edit_distance_both_ways (a: "ab", b: "a", BASE_COST * 1);
275 test_get_edit_distance_both_ways (a: "a", b: "b", BASE_COST * 1);
276 test_get_edit_distance_both_ways (a: "nanl", b: "name", BASE_COST * 2);
277 test_get_edit_distance_both_ways (a: "char", b: "bar", BASE_COST * 2);
278 test_get_edit_distance_both_ways (a: "-optimize", b: "fsanitize", BASE_COST * 5);
279 test_get_edit_distance_both_ways (a: "__DATE__", b: "__i386__", BASE_COST * 4);
280
281 /* Examples where transposition helps. */
282 test_get_edit_distance_both_ways (a: "ab", b: "ba", BASE_COST * 1);
283 test_get_edit_distance_both_ways (a: "ba", b: "abc", BASE_COST * 2);
284 test_get_edit_distance_both_ways (a: "coorzd1", b: "coordz1", BASE_COST * 1);
285 test_get_edit_distance_both_ways (a: "abcdefghijklmnopqrstuvwxyz",
286 b: "bacdefghijklmnopqrstuvwxzy",
287 BASE_COST * 2);
288 test_get_edit_distance_both_ways (a: "saturday", b: "sundya", BASE_COST * 4);
289 test_get_edit_distance_both_ways (a: "signed", b: "singed", BASE_COST * 1);
290}
291
292/* Subroutine of test_get_edit_distance_cutoff, for emulating the
293 spellchecking cutoff in up to GCC 8. */
294
295static edit_distance_t
296get_old_cutoff (size_t goal_len, size_t candidate_len)
297{
298 return BASE_COST * MAX (goal_len, candidate_len) / 2;
299}
300
301/* Verify that the cutoff for "meaningfulness" of suggestions is at least as
302 conservative as in older GCC releases.
303
304 This should ensure that we don't offer additional meaningless
305 suggestions (apart from those for which transposition has helped). */
306
307static void
308test_get_edit_distance_cutoff ()
309{
310 for (size_t goal_len = 0; goal_len < 30; goal_len++)
311 for (size_t candidate_len = 0; candidate_len < 30; candidate_len++)
312 ASSERT_TRUE (get_edit_distance_cutoff (goal_len, candidate_len)
313 <= get_old_cutoff (goal_len, candidate_len));
314}
315
316/* Assert that CANDIDATE is offered as a suggestion for TARGET. */
317
318static void
319assert_suggested_for (const location &loc, const char *candidate,
320 const char *target)
321{
322 auto_vec<const char *> candidates;
323 candidates.safe_push (obj: candidate);
324 ASSERT_EQ_AT (loc, candidate, find_closest_string (target, &candidates));
325}
326
327/* Assert that CANDIDATE is offered as a suggestion for TARGET. */
328
329#define ASSERT_SUGGESTED_FOR(CANDIDATE, TARGET) \
330 SELFTEST_BEGIN_STMT \
331 assert_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \
332 SELFTEST_END_STMT
333
334/* Assert that CANDIDATE is not offered as a suggestion for TARGET. */
335
336static void
337assert_not_suggested_for (const location &loc, const char *candidate,
338 const char *target)
339{
340 auto_vec<const char *> candidates;
341 candidates.safe_push (obj: candidate);
342 ASSERT_EQ_AT (loc, NULL, find_closest_string (target, &candidates));
343}
344
345/* Assert that CANDIDATE is not offered as a suggestion for TARGET. */
346
347#define ASSERT_NOT_SUGGESTED_FOR(CANDIDATE, TARGET) \
348 SELFTEST_BEGIN_STMT \
349 assert_not_suggested_for (SELFTEST_LOCATION, CANDIDATE, TARGET); \
350 SELFTEST_END_STMT
351
352/* Verify that we offer varous suggestions that are meaningful,
353 and that we don't offer various other ones that aren't (PR c/82967). */
354
355static void
356test_suggestions ()
357{
358 /* Good suggestions. */
359
360 ASSERT_SUGGESTED_FOR ("m_bar", "bar");
361 // dist == 2, max_length == 5, min_length == 3
362
363 ASSERT_SUGGESTED_FOR ("MACRO", "MACRAME");
364 // dist == 3, max_length == 7, min_length == 5
365
366 ASSERT_SUGGESTED_FOR ("gtk_widget_show_all", "GtkWidgetShowAll");
367 // dist == 7, max_length == 16, min_length = 19
368
369 ASSERT_SUGGESTED_FOR ("ab", "ac");
370 // dist == 1, max_length == min_length = 2
371
372 ASSERT_SUGGESTED_FOR ("ab", "a");
373 // dist == 1, max_length == 2, min_length = 1
374
375 /* Bad suggestions. */
376
377 ASSERT_NOT_SUGGESTED_FOR ("a", "b");
378 // dist == 1, max_length == min_length = 1
379
380 ASSERT_NOT_SUGGESTED_FOR ("sqrt", "assert");
381 // dist == 3, max_length 6, min_length == 4
382
383 ASSERT_NOT_SUGGESTED_FOR ("INT8_MAX", "PATH_MAX");
384 // dist == 3, max_length == min_length == 8
385
386 ASSERT_NOT_SUGGESTED_FOR ("nice", "time");
387 ASSERT_NOT_SUGGESTED_FOR ("nanl", "name");
388 // dist == 2, max_length == min_length == 4
389
390 ASSERT_NOT_SUGGESTED_FOR ("carg", "bar");
391 ASSERT_NOT_SUGGESTED_FOR ("char", "bar");
392 // dist == 2, max_length == 4, min_length == 3
393
394 ASSERT_NOT_SUGGESTED_FOR ("-optimize", "fsanitize");
395 // dist == 5, max_length == min_length == 9
396
397 ASSERT_NOT_SUGGESTED_FOR ("__DATE__", "__i386__");
398 // dist == 4, max_length == min_length == 8
399
400 ASSERT_NOT_SUGGESTED_FOR ("start_input_device", "InputDevice");
401 // dist == 9, max_length == 18, min_length == 11
402}
403
404/* Verify that find_closest_string is sane. */
405
406static void
407test_find_closest_string ()
408{
409 auto_vec<const char *> candidates;
410
411 /* Verify that it can handle an empty vec. */
412 ASSERT_EQ (NULL, find_closest_string ("", &candidates));
413
414 /* Verify that it works sanely for non-empty vecs. */
415 candidates.safe_push (obj: "apple");
416 candidates.safe_push (obj: "banana");
417 candidates.safe_push (obj: "cherry");
418
419 ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
420 ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
421 ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
422 ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
423
424 /* The order of the vec can matter, but it should not matter for these
425 inputs. */
426 candidates.truncate (size: 0);
427 candidates.safe_push (obj: "cherry");
428 candidates.safe_push (obj: "banana");
429 candidates.safe_push (obj: "apple");
430 ASSERT_STREQ ("apple", find_closest_string ("app", &candidates));
431 ASSERT_STREQ ("banana", find_closest_string ("banyan", &candidates));
432 ASSERT_STREQ ("cherry", find_closest_string ("berry", &candidates));
433 ASSERT_EQ (NULL, find_closest_string ("not like the others", &candidates));
434
435 /* If the goal string somehow makes it into the candidate list, offering
436 it as a suggestion will be nonsensical. Verify that we don't offer such
437 suggestions. */
438 ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
439
440 /* Example from PR 69968 where transposition helps. */
441 candidates.truncate (size: 0);
442 candidates.safe_push(obj: "coordx");
443 candidates.safe_push(obj: "coordy");
444 candidates.safe_push(obj: "coordz");
445 candidates.safe_push(obj: "coordx1");
446 candidates.safe_push(obj: "coordy1");
447 candidates.safe_push(obj: "coordz1");
448 ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates));
449
450 candidates.truncate (size: 0);
451 candidates.safe_push (obj: "DWARF_GNAT_ENCODINGS_GDB");
452 candidates.safe_push (obj: "DWARF_GNAT_ENCODINGS_ALL");
453 candidates.safe_push (obj: "DWARF_GNAT_ENCODINGS_MINIMAL");
454 ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
455 find_closest_string ("DWARF_GNAT_ENCODINGS_all",
456 &candidates));
457
458 /* The same as the previous test, but with a different order of
459 candidates. */
460 candidates.truncate (size: 0);
461 candidates.safe_push (obj: "DWARF_GNAT_ENCODINGS_ALL");
462 candidates.safe_push (obj: "DWARF_GNAT_ENCODINGS_GDB");
463 candidates.safe_push (obj: "DWARF_GNAT_ENCODINGS_MINIMAL");
464 ASSERT_STREQ ("DWARF_GNAT_ENCODINGS_ALL",
465 find_closest_string ("DWARF_GNAT_ENCODINGS_all",
466 &candidates));
467
468 /* Example from PR 105564 where option name with missing equal
469 sign should win. */
470 candidates.truncate (size: 0);
471 candidates.safe_push (obj: "-Wtrivial-auto-var-init");
472 candidates.safe_push (obj: "-ftrivial-auto-var-init=");
473 ASSERT_STREQ ("-ftrivial-auto-var-init=",
474 find_closest_string ("-ftrivial-auto-var-init",
475 &candidates));
476}
477
478/* Test data for test_metric_conditions. */
479
480static const char * const test_data[] = {
481 "",
482 "foo",
483 "food",
484 "boo",
485 "1234567890123456789012345678901234567890123456789012345678901234567890",
486 "abc",
487 "ac",
488 "ca",
489};
490
491/* Verify that get_edit_distance appears to be a sane distance function,
492 even though it doesn't satisfy the conditions for being a metric. (This
493 is because the triangle inequality fails to hold: the distance between
494 "ca" and "ac" is 1, and so is the distance between "abc" and "ac", but
495 the distance between "abc" and "ca" is 3. Algorithms that calculate the
496 true Levenshtein-Damerau metric are much more expensive.) */
497
498static void
499test_metric_conditions ()
500{
501 const int num_test_cases = ARRAY_SIZE (test_data);
502
503 for (int i = 0; i < num_test_cases; i++)
504 {
505 for (int j = 0; j < num_test_cases; j++)
506 {
507 edit_distance_t dist_ij
508 = get_edit_distance (s: test_data[i], t: test_data[j]);
509
510 /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */
511 if (i == j)
512 ASSERT_EQ (dist_ij, 0);
513 else
514 ASSERT_TRUE (dist_ij > 0);
515
516 /* Symmetry: d(i, j) == d(j, i). */
517 edit_distance_t dist_ji
518 = get_edit_distance (s: test_data[j], t: test_data[i]);
519 ASSERT_EQ (dist_ij, dist_ji);
520 }
521 }
522}
523
524/* Run all of the selftests within this file. */
525
526void
527spellcheck_cc_tests ()
528{
529 test_edit_distances ();
530 test_get_edit_distance_cutoff ();
531 test_suggestions ();
532 test_find_closest_string ();
533 test_metric_conditions ();
534}
535
536} // namespace selftest
537
538#endif /* #if CHECKING_P */
539

source code of gcc/spellcheck.cc