From: David Malcolm Date: Tue, 12 Jun 2018 18:28:37 +0000 (+0000) Subject: spellcheck: support transpositions aka Damerau-Levenshtein (PR other/69968) X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b80a188beea98da44228d6030932af0769946f83;p=gcc.git spellcheck: support transpositions aka Damerau-Levenshtein (PR other/69968) gcc/fortran/ChangeLog: PR other/69968 * misc.c (gfc_closest_fuzzy_match): Update for renaming of levenshtein_distance to get_edit_distance. gcc/ChangeLog: PR other/69968 * spellcheck-tree.c (levenshtein_distance): Rename to... (get_edit_distance): ...this, and update for underlying renaming. * spellcheck-tree.h (levenshtein_distance): Rename to... (get_edit_distance): ...this. * spellcheck.c (levenshtein_distance): Rename to... (get_edit_distance): ...this. Convert from Levenshtein distance to Damerau-Levenshtein distance by supporting transpositions of adjacent characters. Rename "v1" to "v_next" and "v0" to "v_one_ago". (selftest::levenshtein_distance_unit_test_oneway): Rename to... (selftest::test_edit_distance_unit_test_oneway): ...this, and update for underlying renaming. (selftest::levenshtein_distance_unit_test): Rename to... (selftest::test_get_edit_distance_unit): ...this, and update for underlying renaming. (selftest::test_find_closest_string): Add example from PR 69968 where transposition helps (selftest::test_metric_conditions): Update for renaming. (selftest::test_metric_conditions): Likewise. (selftest::spellcheck_c_tests): Likewise. * spellcheck.h (levenshtein_distance): Rename both overloads to... (get_edit_distance): ...this. (best_match::consider): Update for renaming. gcc/testsuite/ChangeLog: PR other/69968 * gcc.dg/spellcheck-transposition.c: New test. From-SVN: r261521 --- diff --git a/gcc/ChangeLog b/gcc/ChangeLog index aa0247df98f..1bba11ccd67 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,30 @@ +2018-06-12 David Malcolm + + PR other/69968 + * spellcheck-tree.c (levenshtein_distance): Rename to... + (get_edit_distance): ...this, and update for underlying renaming. + * spellcheck-tree.h (levenshtein_distance): Rename to... + (get_edit_distance): ...this. + * spellcheck.c (levenshtein_distance): Rename to... + (get_edit_distance): ...this. Convert from Levenshtein distance + to Damerau-Levenshtein distance by supporting transpositions of + adjacent characters. Rename "v1" to "v_next" and "v0" to + "v_one_ago". + (selftest::levenshtein_distance_unit_test_oneway): Rename to... + (selftest::test_edit_distance_unit_test_oneway): ...this, and + update for underlying renaming. + (selftest::levenshtein_distance_unit_test): Rename to... + (selftest::test_get_edit_distance_unit): ...this, and update for + underlying renaming. + (selftest::test_find_closest_string): Add example from PR 69968 + where transposition helps + (selftest::test_metric_conditions): Update for renaming. + (selftest::test_metric_conditions): Likewise. + (selftest::spellcheck_c_tests): Likewise. + * spellcheck.h (levenshtein_distance): Rename both overloads to... + (get_edit_distance): ...this. + (best_match::consider): Update for renaming. + 2018-06-12 Martin Sebor PR tree-optimization/85259 diff --git a/gcc/fortran/ChangeLog b/gcc/fortran/ChangeLog index 46498a30c7f..9768959374d 100644 --- a/gcc/fortran/ChangeLog +++ b/gcc/fortran/ChangeLog @@ -1,3 +1,9 @@ +2018-06-12 David Malcolm + + PR other/69968 + * misc.c (gfc_closest_fuzzy_match): Update for renaming of + levenshtein_distance to get_edit_distance. + 2018-06-12 Steven G. Kargl PR fortran/44491 diff --git a/gcc/fortran/misc.c b/gcc/fortran/misc.c index ec1f548123a..fb18c5ceb6f 100644 --- a/gcc/fortran/misc.c +++ b/gcc/fortran/misc.c @@ -286,7 +286,7 @@ get_c_kind(const char *c_kind_name, CInteropKind_t kinds_table[]) /* For a given name TYPO, determine the best candidate from CANDIDATES - perusing Levenshtein distance. Frees CANDIDATES before returning. */ + using get_edit_distance. Frees CANDIDATES before returning. */ const char * gfc_closest_fuzzy_match (const char *typo, char **candidates) @@ -299,7 +299,7 @@ gfc_closest_fuzzy_match (const char *typo, char **candidates) while (cand && *cand) { - edit_distance_t dist = levenshtein_distance (typo, tl, *cand, + edit_distance_t dist = get_edit_distance (typo, tl, *cand, strlen (*cand)); if (dist < best_distance) { diff --git a/gcc/spellcheck-tree.c b/gcc/spellcheck-tree.c index 2a66649ec6a..596293e8d47 100644 --- a/gcc/spellcheck-tree.c +++ b/gcc/spellcheck-tree.c @@ -27,18 +27,18 @@ along with GCC; see the file COPYING3. If not see #include "selftest.h" #include "stringpool.h" -/* Calculate Levenshtein distance between two identifiers. */ +/* Calculate edit distance between two identifiers. */ edit_distance_t -levenshtein_distance (tree ident_s, tree ident_t) +get_edit_distance (tree ident_s, tree ident_t) { gcc_assert (TREE_CODE (ident_s) == IDENTIFIER_NODE); gcc_assert (TREE_CODE (ident_t) == IDENTIFIER_NODE); - return levenshtein_distance (IDENTIFIER_POINTER (ident_s), - IDENTIFIER_LENGTH (ident_s), - IDENTIFIER_POINTER (ident_t), - IDENTIFIER_LENGTH (ident_t)); + return get_edit_distance (IDENTIFIER_POINTER (ident_s), + IDENTIFIER_LENGTH (ident_s), + IDENTIFIER_POINTER (ident_t), + IDENTIFIER_LENGTH (ident_t)); } /* Given TARGET, an identifier, and CANDIDATES, a vec of identifiers, diff --git a/gcc/spellcheck-tree.h b/gcc/spellcheck-tree.h index 1debef52315..4324bd2f939 100644 --- a/gcc/spellcheck-tree.h +++ b/gcc/spellcheck-tree.h @@ -25,7 +25,7 @@ along with GCC; see the file COPYING3. If not see /* spellcheck-tree.c */ extern edit_distance_t -levenshtein_distance (tree ident_s, tree ident_t); +get_edit_distance (tree ident_s, tree ident_t); extern tree find_closest_identifier (tree target, const auto_vec *candidates); diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c index 7da006afc13..e2a8b92fdec 100644 --- a/gcc/spellcheck.c +++ b/gcc/spellcheck.c @@ -25,15 +25,18 @@ along with GCC; see the file COPYING3. If not see #include "spellcheck.h" #include "selftest.h" -/* The Levenshtein distance is an "edit-distance": the minimal - number of one-character insertions, removals or substitutions - that are needed to change one string into another. +/* Get the edit distance between the two strings: the minimal + number of edits that are needed to change one string into another, + where edits can be one-character insertions, removals, or substitutions, + or transpositions of two adjacent characters (counting as one "edit"). - This implementation uses the Wagner-Fischer algorithm. */ + This implementation uses the Wagner-Fischer algorithm for the + Damerau-Levenshtein distance; specifically, the "optimal string alignment + distance" or "restricted edit distance" variant. */ edit_distance_t -levenshtein_distance (const char *s, int len_s, - const char *t, int len_t) +get_edit_distance (const char *s, int len_s, + const char *t, int len_t) { const bool debug = false; @@ -49,76 +52,86 @@ levenshtein_distance (const char *s, int len_s, return len_s; /* We effectively build a matrix where each (i, j) contains the - Levenshtein distance between the prefix strings s[0:j] - and t[0:i]. + distance between the prefix strings s[0:j] and t[0:i]. Rather than actually build an (len_t + 1) * (len_s + 1) matrix, - we simply keep track of the last row, v0 and a new row, v1, - which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses - in favor of two (len_s + 1) allocations. These could potentially be + we simply keep track of the last two rows, v_one_ago and v_two_ago, + and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1) + allocation and memory accesses in favor of three (len_s + 1) + allocations. These could potentially be statically-allocated if we impose a maximum length on the strings of interest. */ - edit_distance_t *v0 = new edit_distance_t[len_s + 1]; - edit_distance_t *v1 = new edit_distance_t[len_s + 1]; + edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1]; + edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1]; + edit_distance_t *v_next = new edit_distance_t[len_s + 1]; /* The first row is for the case of an empty target string, which we can reach by deleting every character in the source string. */ for (int i = 0; i < len_s + 1; i++) - v0[i] = i; + v_one_ago[i] = i; /* Build successive rows. */ for (int i = 0; i < len_t; i++) { if (debug) { - printf ("i:%i v0 = ", i); + printf ("i:%i v_one_ago = ", i); for (int j = 0; j < len_s + 1; j++) - printf ("%i ", v0[j]); + printf ("%i ", v_one_ago[j]); printf ("\n"); } /* The initial column is for the case of an empty source string; we can reach prefixes of the target string of length i by inserting i characters. */ - v1[0] = i + 1; + v_next[0] = i + 1; /* Build the rest of the row by considering neighbors to the north, west and northwest. */ for (int j = 0; j < len_s; j++) { edit_distance_t cost = (s[j] == t[i] ? 0 : 1); - edit_distance_t deletion = v1[j] + 1; - edit_distance_t insertion = v0[j + 1] + 1; - edit_distance_t substitution = v0[j] + cost; + edit_distance_t deletion = v_next[j] + 1; + edit_distance_t insertion = v_one_ago[j + 1] + 1; + edit_distance_t substitution = v_one_ago[j] + cost; edit_distance_t cheapest = MIN (deletion, insertion); cheapest = MIN (cheapest, substitution); - v1[j + 1] = cheapest; + if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i]) + { + edit_distance_t transposition = v_two_ago[j - 1] + 1; + cheapest = MIN (cheapest, transposition); + } + v_next[j + 1] = cheapest; } /* Prepare to move on to next row. */ for (int j = 0; j < len_s + 1; j++) - v0[j] = v1[j]; + { + v_two_ago[j] = v_one_ago[j]; + v_one_ago[j] = v_next[j]; + } } if (debug) { - printf ("final v1 = "); + printf ("final v_next = "); for (int j = 0; j < len_s + 1; j++) - printf ("%i ", v1[j]); + printf ("%i ", v_next[j]); printf ("\n"); } - edit_distance_t result = v1[len_s]; - delete[] v0; - delete[] v1; + edit_distance_t result = v_next[len_s]; + delete[] v_two_ago; + delete[] v_one_ago; + delete[] v_next; return result; } -/* Calculate Levenshtein distance between two nil-terminated strings. */ +/* Get the edit distance between two nil-terminated strings. */ edit_distance_t -levenshtein_distance (const char *s, const char *t) +get_edit_distance (const char *s, const char *t) { - return levenshtein_distance (s, strlen (s), t, strlen (t)); + return get_edit_distance (s, strlen (s), t, strlen (t)); } /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to @@ -155,29 +168,28 @@ namespace selftest { /* Selftests. */ -/* Verify that the levenshtein_distance (A, B) equals the expected - value. */ +/* Verify that get_edit_distance (A, B) equals the expected value. */ static void -levenshtein_distance_unit_test_oneway (const char *a, const char *b, - edit_distance_t expected) +test_edit_distance_unit_test_oneway (const char *a, const char *b, + edit_distance_t expected) { - edit_distance_t actual = levenshtein_distance (a, b); + edit_distance_t actual = get_edit_distance (a, b); ASSERT_EQ (actual, expected); } /* Verify that both - levenshtein_distance (A, B) + get_edit_distance (A, B) and - levenshtein_distance (B, A) + get_edit_distance (B, A) equal the expected value, to ensure that the function is symmetric. */ static void -levenshtein_distance_unit_test (const char *a, const char *b, - edit_distance_t expected) +test_get_edit_distance_unit (const char *a, const char *b, + edit_distance_t expected) { - levenshtein_distance_unit_test_oneway (a, b, expected); - levenshtein_distance_unit_test_oneway (b, a, expected); + test_edit_distance_unit_test_oneway (a, b, expected); + test_edit_distance_unit_test_oneway (b, a, expected); } /* Verify that find_closest_string is sane. */ @@ -215,6 +227,16 @@ test_find_closest_string () it as a suggestion will be nonsensical. Verify that we don't offer such suggestions. */ ASSERT_EQ (NULL, find_closest_string ("banana", &candidates)); + + /* Example from PR 69968 where transposition helps. */ + candidates.truncate (0); + candidates.safe_push("coordx"); + candidates.safe_push("coordy"); + candidates.safe_push("coordz"); + candidates.safe_push("coordx1"); + candidates.safe_push("coordy1"); + candidates.safe_push("coordz1"); + ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates)); } /* Test data for test_metric_conditions. */ @@ -227,7 +249,7 @@ static const char * const test_data[] = { "1234567890123456789012345678901234567890123456789012345678901234567890" }; -/* Verify that levenshtein_distance appears to be a sane distance function, +/* Verify that get_edit_distance appears to be a sane distance function, i.e. the conditions for being a metric. This is done directly for a small set of examples, using test_data above. This is O(N^3) in the size of the array, due to the test for the triangle inequality, so we keep the @@ -243,7 +265,7 @@ test_metric_conditions () for (int j = 0; j < num_test_cases; j++) { edit_distance_t dist_ij - = levenshtein_distance (test_data[i], test_data[j]); + = get_edit_distance (test_data[i], test_data[j]); /* Identity of indiscernibles: d(i, j) > 0 iff i == j. */ if (i == j) @@ -253,43 +275,54 @@ test_metric_conditions () /* Symmetry: d(i, j) == d(j, i). */ edit_distance_t dist_ji - = levenshtein_distance (test_data[j], test_data[i]); + = get_edit_distance (test_data[j], test_data[i]); ASSERT_EQ (dist_ij, dist_ji); /* Triangle inequality. */ for (int k = 0; k < num_test_cases; k++) { edit_distance_t dist_ik - = levenshtein_distance (test_data[i], test_data[k]); + = get_edit_distance (test_data[i], test_data[k]); edit_distance_t dist_jk - = levenshtein_distance (test_data[j], test_data[k]); + = get_edit_distance (test_data[j], test_data[k]); ASSERT_TRUE (dist_ik <= dist_ij + dist_jk); } } } } -/* Verify levenshtein_distance for a variety of pairs of pre-canned +/* Verify get_edit_distance for a variety of pairs of pre-canned inputs, comparing against known-good values. */ void spellcheck_c_tests () { - levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty")); - levenshtein_distance_unit_test ("saturday", "sunday", 3); - levenshtein_distance_unit_test ("foo", "m_foo", 2); - levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3); - levenshtein_distance_unit_test + test_get_edit_distance_unit ("", "nonempty", strlen ("nonempty")); + test_get_edit_distance_unit ("saturday", "sunday", 3); + test_get_edit_distance_unit ("foo", "m_foo", 2); + test_get_edit_distance_unit ("hello_world", "HelloWorld", 3); + test_get_edit_distance_unit ("the quick brown fox jumps over the lazy dog", "dog", 40); - levenshtein_distance_unit_test + test_get_edit_distance_unit ("the quick brown fox jumps over the lazy dog", "the quick brown dog jumps over the lazy fox", 4); - levenshtein_distance_unit_test + test_get_edit_distance_unit ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,", "All your base are belong to us", 44); - levenshtein_distance_unit_test ("foo", "FOO", 3); + test_get_edit_distance_unit ("foo", "FOO", 3); + test_get_edit_distance_unit ("fee", "deed", 2); + test_get_edit_distance_unit ("coorzd1", "coordx1", 2); + + /* Examples where transposition helps. */ + test_get_edit_distance_unit ("ab", "ba", 1); + test_get_edit_distance_unit ("ba", "abc", 2); + test_get_edit_distance_unit ("coorzd1", "coordz1", 1); + test_get_edit_distance_unit ("abcdefghijklmnopqrstuvwxyz", + "bacdefghijklmnopqrstuvwxzy", 2); + test_get_edit_distance_unit ("saturday", "sundya", 4); + test_get_edit_distance_unit ("signed", "singed", 1); test_find_closest_string (); test_metric_conditions (); diff --git a/gcc/spellcheck.h b/gcc/spellcheck.h index f0efff8993b..41ac16f0252 100644 --- a/gcc/spellcheck.h +++ b/gcc/spellcheck.h @@ -25,11 +25,11 @@ const edit_distance_t MAX_EDIT_DISTANCE = UINT_MAX; /* spellcheck.c */ extern edit_distance_t -levenshtein_distance (const char *s, int len_s, - const char *t, int len_t); +get_edit_distance (const char *s, int len_s, + const char *t, int len_t); extern edit_distance_t -levenshtein_distance (const char *s, const char *t); +get_edit_distance (const char *s, const char *t); extern const char * find_closest_string (const char *target, @@ -73,7 +73,7 @@ struct edit_distance_traits This type accumulates the best possible match against GOAL_TYPE for a sequence of elements of CANDIDATE_TYPE, whilst minimizing the - number of calls to levenshtein_distance and to + number of calls to get_edit_distance and to edit_distance_traits::get_length. */ template @@ -126,9 +126,9 @@ class best_match /* Otherwise, compute the distance and see if the candidate has beaten the previous best value. */ edit_distance_t dist - = levenshtein_distance (m_goal, m_goal_len, - candidate_traits::get_string (candidate), - candidate_len); + = get_edit_distance (m_goal, m_goal_len, + candidate_traits::get_string (candidate), + candidate_len); if (dist < m_best_distance) { m_best_distance = dist; diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 9ab1822be6e..6acc3589141 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,8 @@ +2018-06-12 David Malcolm + + PR other/69968 + * gcc.dg/spellcheck-transposition.c: New test. + 2018-06-12 Steven G. Kargl PR fortran/44491 diff --git a/gcc/testsuite/gcc.dg/spellcheck-transposition.c b/gcc/testsuite/gcc.dg/spellcheck-transposition.c new file mode 100644 index 00000000000..b787c83a4e2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/spellcheck-transposition.c @@ -0,0 +1,20 @@ +/* PR other/69968. */ + +struct { + int coordx, coordy, coordz; + int coordx1, coordy1, coordz1; +} c; + +/* Consider the misspelling "coorzd1". + + With Levenshtein distance, the misspelling has an edit distance of 2 + to all 6 of the fields (e.g. via a deletion and a substitution for the + first three, and via deletion and insertion for the second three). + + With Damerau-Levenshtein, the misspelling has an edit distance of 1 + via transposition to "coordz1", and 2 to the other fields. */ + +void foo (void) +{ + c.coorzd1 = c.coordy; /* { dg-error "has no member named 'coorzd1'; did you mean 'coordz1'" } */ +}