#include "spellcheck.h"
#include "selftest.h"
-/* The Levenshtein distance is an "edit-distance": the minimal
- number of one-character insertions, removals or substitutions
- that are needed to change one string into another.
+/* Get the edit distance between the two strings: the minimal
+ number of edits that are needed to change one string into another,
+ where edits can be one-character insertions, removals, or substitutions,
+ or transpositions of two adjacent characters (counting as one "edit").
+
+ S and T are only read up to LEN_S and LEN_T characters respectively,
+ so they need not be nil-terminated. The result is the distance
+ between those two character ranges.
+
- This implementation uses the Wagner-Fischer algorithm. */
+ This implementation uses the Wagner-Fischer algorithm for the
+ Damerau-Levenshtein distance; specifically, the "optimal string alignment
+ distance" or "restricted edit distance" variant. */
edit_distance_t
-levenshtein_distance (const char *s, int len_s,
- const char *t, int len_t)
+get_edit_distance (const char *s, int len_s,
+ const char *t, int len_t)
{
const bool debug = false;
/* Trivial cases: if either string is empty, the distance is the length
of the other one (that many insertions or removals). The len_t == 0
check is also required for correctness: the row-building loop below
would not run at all, so the final read of the last-built row would
see uninitialized memory. */
if (len_s == 0)
return len_t;
if (len_t == 0)
return len_s;
/* We effectively build a matrix where each (i, j) contains the
- Levenshtein distance between the prefix strings s[0:j]
- and t[0:i].
+ distance between the prefix strings s[0:j] and t[0:i].
Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
- we simply keep track of the last row, v0 and a new row, v1,
- which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
- in favor of two (len_s + 1) allocations. These could potentially be
+ we simply keep track of the last two rows, v_one_ago and v_two_ago,
+ and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1)
+ allocation and memory accesses in favor of three (len_s + 1)
+ allocations. These could potentially be
statically-allocated if we impose a maximum length on the
strings of interest. */
- edit_distance_t *v0 = new edit_distance_t[len_s + 1];
- edit_distance_t *v1 = new edit_distance_t[len_s + 1];
+ edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1];
+ edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1];
+ edit_distance_t *v_next = new edit_distance_t[len_s + 1];
/* The first row is for the case of an empty target string, which
we can reach by deleting every character in the source string. */
for (int i = 0; i < len_s + 1; i++)
- v0[i] = i;
+ v_one_ago[i] = i;
/* Build successive rows. */
for (int i = 0; i < len_t; i++)
{
if (debug)
{
- printf ("i:%i v0 = ", i);
+ printf ("i:%i v_one_ago = ", i);
for (int j = 0; j < len_s + 1; j++)
- printf ("%i ", v0[j]);
+ printf ("%i ", v_one_ago[j]);
printf ("\n");
}
/* The initial column is for the case of an empty source string; we
can reach prefixes of the target string of length i
by inserting i characters. */
- v1[0] = i + 1;
+ v_next[0] = i + 1;
/* Build the rest of the row by considering neighbors to
the north, west and northwest. */
for (int j = 0; j < len_s; j++)
{
edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
- edit_distance_t deletion = v1[j] + 1;
- edit_distance_t insertion = v0[j + 1] + 1;
- edit_distance_t substitution = v0[j] + cost;
+ edit_distance_t deletion = v_next[j] + 1;
+ edit_distance_t insertion = v_one_ago[j + 1] + 1;
+ edit_distance_t substitution = v_one_ago[j] + cost;
edit_distance_t cheapest = MIN (deletion, insertion);
cheapest = MIN (cheapest, substitution);
- v1[j + 1] = cheapest;
+ /* Transposition: when the last two characters of the two prefixes
+ are swapped versions of each other, this cell can also be reached
+ from the cell two rows up and two columns to the left, at the
+ cost of a single edit. */
+ if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i])
+ {
+ edit_distance_t transposition = v_two_ago[j - 1] + 1;
+ cheapest = MIN (cheapest, transposition);
+ }
+ v_next[j + 1] = cheapest;
}
/* Prepare to move on to next row. */
for (int j = 0; j < len_s + 1; j++)
- v0[j] = v1[j];
+ {
+ v_two_ago[j] = v_one_ago[j];
+ v_one_ago[j] = v_next[j];
+ }
}
if (debug)
{
- printf ("final v1 = ");
+ printf ("final v_next = ");
for (int j = 0; j < len_s + 1; j++)
- printf ("%i ", v1[j]);
+ printf ("%i ", v_next[j]);
printf ("\n");
}
- edit_distance_t result = v1[len_s];
- delete[] v0;
- delete[] v1;
+ edit_distance_t result = v_next[len_s];
+ delete[] v_two_ago;
+ delete[] v_one_ago;
+ delete[] v_next;
return result;
}
-/* Calculate Levenshtein distance between two nil-terminated strings. */
+/* Get the edit distance between two nil-terminated strings, S and T.
+ This is a convenience wrapper: it measures both strings with strlen
+ and passes them, together with those lengths, to the four-argument
+ get_edit_distance. */
edit_distance_t
-levenshtein_distance (const char *s, const char *t)
+get_edit_distance (const char *s, const char *t)
{
- return levenshtein_distance (s, strlen (s), t, strlen (t));
+ return get_edit_distance (s, strlen (s), t, strlen (t));
}
/* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
/* Selftests. */
-/* Verify that the levenshtein_distance (A, B) equals the expected
- value. */
+/* Verify that get_edit_distance (A, B) equals the expected value.
+ Only the (A, B) argument order is exercised here; the comparison
+ against EXPECTED is made with ASSERT_EQ. */
static void
-levenshtein_distance_unit_test_oneway (const char *a, const char *b,
- edit_distance_t expected)
+test_edit_distance_unit_test_oneway (const char *a, const char *b,
+ edit_distance_t expected)
{
- edit_distance_t actual = levenshtein_distance (a, b);
+ edit_distance_t actual = get_edit_distance (a, b);
ASSERT_EQ (actual, expected);
}
/* Verify that both
- levenshtein_distance (A, B)
+ get_edit_distance (A, B)
and
- levenshtein_distance (B, A)
+ get_edit_distance (B, A)
equal the expected value, to ensure that the function is symmetric.
Each argument order is checked by a separate call to the one-way
helper. */
static void
-levenshtein_distance_unit_test (const char *a, const char *b,
- edit_distance_t expected)
+test_get_edit_distance_unit (const char *a, const char *b,
+ edit_distance_t expected)
{
- levenshtein_distance_unit_test_oneway (a, b, expected);
- levenshtein_distance_unit_test_oneway (b, a, expected);
+ test_edit_distance_unit_test_oneway (a, b, expected);
+ test_edit_distance_unit_test_oneway (b, a, expected);
}
/* Verify that find_closest_string is sane. */
it as a suggestion will be nonsensical. Verify that we don't offer such
suggestions. */
ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
+
+ /* Example from PR 69968 where transposition helps. */
+ candidates.truncate (0);
+ candidates.safe_push("coordx");
+ candidates.safe_push("coordy");
+ candidates.safe_push("coordz");
+ candidates.safe_push("coordx1");
+ candidates.safe_push("coordy1");
+ candidates.safe_push("coordz1");
+ ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates));
}
/* Test data for test_metric_conditions. */
"1234567890123456789012345678901234567890123456789012345678901234567890"
};
-/* Verify that levenshtein_distance appears to be a sane distance function,
+/* Verify that get_edit_distance appears to be a sane distance function,
i.e. the conditions for being a metric. This is done directly for a
small set of examples, using test_data above. This is O(N^3) in the size
of the array, due to the test for the triangle inequality, so we keep the
for (int j = 0; j < num_test_cases; j++)
{
edit_distance_t dist_ij
- = levenshtein_distance (test_data[i], test_data[j]);
+ = get_edit_distance (test_data[i], test_data[j]);
/* Identity of indiscernibles: d(i, j) == 0 iff i == j. */
if (i == j)
/* Symmetry: d(i, j) == d(j, i). */
edit_distance_t dist_ji
- = levenshtein_distance (test_data[j], test_data[i]);
+ = get_edit_distance (test_data[j], test_data[i]);
ASSERT_EQ (dist_ij, dist_ji);
/* Triangle inequality. */
for (int k = 0; k < num_test_cases; k++)
{
edit_distance_t dist_ik
- = levenshtein_distance (test_data[i], test_data[k]);
+ = get_edit_distance (test_data[i], test_data[k]);
edit_distance_t dist_jk
- = levenshtein_distance (test_data[j], test_data[k]);
+ = get_edit_distance (test_data[j], test_data[k]);
ASSERT_TRUE (dist_ik <= dist_ij + dist_jk);
}
}
}
}
-/* Verify levenshtein_distance for a variety of pairs of pre-canned
+/* Verify get_edit_distance for a variety of pairs of pre-canned
inputs, comparing against known-good values. */
void
spellcheck_c_tests ()
{
- levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty"));
- levenshtein_distance_unit_test ("saturday", "sunday", 3);
- levenshtein_distance_unit_test ("foo", "m_foo", 2);
- levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3);
- levenshtein_distance_unit_test
+ test_get_edit_distance_unit ("", "nonempty", strlen ("nonempty"));
+ test_get_edit_distance_unit ("saturday", "sunday", 3);
+ test_get_edit_distance_unit ("foo", "m_foo", 2);
+ test_get_edit_distance_unit ("hello_world", "HelloWorld", 3);
+ test_get_edit_distance_unit
("the quick brown fox jumps over the lazy dog", "dog", 40);
- levenshtein_distance_unit_test
+ test_get_edit_distance_unit
("the quick brown fox jumps over the lazy dog",
"the quick brown dog jumps over the lazy fox",
4);
- levenshtein_distance_unit_test
+ test_get_edit_distance_unit
("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
"All your base are belong to us",
44);
- levenshtein_distance_unit_test ("foo", "FOO", 3);
+ test_get_edit_distance_unit ("foo", "FOO", 3);
+ test_get_edit_distance_unit ("fee", "deed", 2);
+ test_get_edit_distance_unit ("coorzd1", "coordx1", 2);
+
+ /* Examples where transposition helps. */
+ test_get_edit_distance_unit ("ab", "ba", 1);
+ test_get_edit_distance_unit ("ba", "abc", 2);
+ test_get_edit_distance_unit ("coorzd1", "coordz1", 1);
+ test_get_edit_distance_unit ("abcdefghijklmnopqrstuvwxyz",
+ "bacdefghijklmnopqrstuvwxzy", 2);
+ test_get_edit_distance_unit ("saturday", "sundya", 4);
+ test_get_edit_distance_unit ("signed", "singed", 1);
test_find_closest_string ();
test_metric_conditions ();