From 277fe616911ac1ce91e9f1178d648303b4a26940 Mon Sep 17 00:00:00 2001 From: David Malcolm Date: Fri, 13 Nov 2015 01:54:41 +0000 Subject: [PATCH] Implement Levenshtein distance; use in C FE for misspelled field names This is the combination of: [PATCH 1/2] Implement Levenshtein distance [PATCH 2/2] C FE: suggest corrections for misspelled field names plus some nit fixes in spellcheck.c. gcc/ChangeLog: * Makefile.in (OBJS): Add spellcheck.o. * spellcheck.c: New file. * spellcheck.h: New file. gcc/c/ChangeLog: * c-typeck.c: Include spellcheck.h. (lookup_field_fuzzy_find_candidates): New function. (lookup_field_fuzzy): New function. (build_component_ref): If the field was not found, try using lookup_field_fuzzy and potentially offer a suggestion. gcc/testsuite/ChangeLog: * gcc.dg/plugin/levenshtein-test-1.c: New file. * gcc.dg/plugin/levenshtein_plugin.c: New file. * gcc.dg/plugin/plugin.exp (plugin_test_list): Add levenshtein_plugin.c. * gcc.dg/spellcheck-fields.c: New file. From-SVN: r230284 --- gcc/ChangeLog | 6 + gcc/Makefile.in | 1 + gcc/c/ChangeLog | 8 ++ gcc/c/c-typeck.c | 74 +++++++++- gcc/spellcheck.c | 136 ++++++++++++++++++ gcc/spellcheck.h | 32 +++++ gcc/testsuite/ChangeLog | 8 ++ .../gcc.dg/plugin/levenshtein-test-1.c | 9 ++ .../gcc.dg/plugin/levenshtein_plugin.c | 64 +++++++++ gcc/testsuite/gcc.dg/plugin/plugin.exp | 1 + gcc/testsuite/gcc.dg/spellcheck-fields.c | 63 ++++++++ 11 files changed, 401 insertions(+), 1 deletion(-) create mode 100644 gcc/spellcheck.c create mode 100644 gcc/spellcheck.h create mode 100644 gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c create mode 100644 gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c create mode 100644 gcc/testsuite/gcc.dg/spellcheck-fields.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 4fb5504312e..1febdcf6feb 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,9 @@ +2015-11-13 David Malcolm + + * Makefile.in (OBJS): Add spellcheck.o. + * spellcheck.c: New file. + * spellcheck.h: New file. + 2015-11-13 James Bowman * config/ft32/ft32.md (*sne): New insn pattern. diff --git a/gcc/Makefile.in b/gcc/Makefile.in index 34d23565f89..f17234d7870 100644 --- a/gcc/Makefile.in +++ b/gcc/Makefile.in @@ -1403,6 +1403,7 @@ OBJS = \ shrink-wrap.o \ simplify-rtx.o \ sparseset.o \ + spellcheck.o \ sreal.o \ stack-ptr-mod.o \ statistics.o \ diff --git a/gcc/c/ChangeLog b/gcc/c/ChangeLog index 02ae07e8b6a..9b1fdc9343d 100644 --- a/gcc/c/ChangeLog +++ b/gcc/c/ChangeLog @@ -1,3 +1,11 @@ +2015-11-13 David Malcolm + + * c-typeck.c: Include spellcheck.h. + (lookup_field_fuzzy_find_candidates): New function. + (lookup_field_fuzzy): New function. + (build_component_ref): If the field was not found, try using + lookup_field_fuzzy and potentially offer a suggestion. + 2015-11-12 James Norris Joseph Myers diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c index 4335a87872c..eb4e1fccc44 100644 --- a/gcc/c/c-typeck.c +++ b/gcc/c/c-typeck.c @@ -47,6 +47,7 @@ along with GCC; see the file COPYING3. If not see #include "c-family/c-ubsan.h" #include "cilk.h" #include "gomp-constants.h" +#include "spellcheck.h" /* Possible cases of implicit bad conversions. Used to select diagnostic messages in convert_for_assignment. */ @@ -2242,6 +2243,72 @@ lookup_field (tree type, tree component) return tree_cons (NULL_TREE, field, NULL_TREE); } +/* Recursively append candidate IDENTIFIER_NODEs to CANDIDATES. */ + +static void +lookup_field_fuzzy_find_candidates (tree type, tree component, + vec *candidates) +{ + tree field; + for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field)) + { + if (DECL_NAME (field) == NULL_TREE + && (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE + || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE)) + { + lookup_field_fuzzy_find_candidates (TREE_TYPE (field), + component, + candidates); + } + + if (DECL_NAME (field)) + candidates->safe_push (DECL_NAME (field)); + } +} + +/* Like "lookup_field", but find the closest matching IDENTIFIER_NODE, + rather than returning a TREE_LIST for an exact match. */ + +static tree +lookup_field_fuzzy (tree type, tree component) +{ + gcc_assert (TREE_CODE (component) == IDENTIFIER_NODE); + + /* First, gather a list of candidates. */ + auto_vec candidates; + + lookup_field_fuzzy_find_candidates (type, component, + &candidates); + + /* Now determine which is closest. */ + int i; + tree identifier; + tree best_identifier = NULL; + edit_distance_t best_distance = MAX_EDIT_DISTANCE; + FOR_EACH_VEC_ELT (candidates, i, identifier) + { + gcc_assert (TREE_CODE (identifier) == IDENTIFIER_NODE); + edit_distance_t dist = levenshtein_distance (component, identifier); + if (dist < best_distance) + { + best_distance = dist; + best_identifier = identifier; + } + } + + /* If more than half of the letters were misspelled, the suggestion is + likely to be meaningless. */ + if (best_identifier) + { + unsigned int cutoff = MAX (IDENTIFIER_LENGTH (component), + IDENTIFIER_LENGTH (best_identifier)) / 2; + if (best_distance > cutoff) + return NULL; + } + + return best_identifier; +} + /* Make an expression to refer to the COMPONENT field of structure or union value DATUM. COMPONENT is an IDENTIFIER_NODE. LOC is the location of the COMPONENT_REF. */ @@ -2277,7 +2344,12 @@ build_component_ref (location_t loc, tree datum, tree component) if (!field) { - error_at (loc, "%qT has no member named %qE", type, component); + tree guessed_id = lookup_field_fuzzy (type, component); + if (guessed_id) + error_at (loc, "%qT has no member named %qE; did you mean %qE?", + type, component, guessed_id); + else + error_at (loc, "%qT has no member named %qE", type, component); return error_mark_node; } diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c new file mode 100644 index 00000000000..31ce3224546 --- /dev/null +++ b/gcc/spellcheck.c @@ -0,0 +1,136 @@ +/* Find near-matches for strings and identifiers. + Copyright (C) 2015 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#include "config.h" +#include "system.h" +#include "coretypes.h" +#include "tm.h" +#include "tree.h" +#include "spellcheck.h" + +/* The Levenshtein distance is an "edit-distance": the minimal + number of one-character insertions, removals or substitutions + that are needed to change one string into another. + + This implementation uses the Wagner-Fischer algorithm. */ + +static edit_distance_t +levenshtein_distance (const char *s, int len_s, + const char *t, int len_t) +{ + const bool debug = false; + + if (debug) + { + printf ("s: \"%s\" (len_s=%i)\n", s, len_s); + printf ("t: \"%s\" (len_t=%i)\n", t, len_t); + } + + if (len_s == 0) + return len_t; + if (len_t == 0) + return len_s; + + /* We effectively build a matrix where each (i, j) contains the + Levenshtein distance between the prefix strings s[0:j] + and t[0:i]. + Rather than actually build an (len_t + 1) * (len_s + 1) matrix, + we simply keep track of the last row, v0 and a new row, v1, + which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses + in favor of two (len_s + 1) allocations. These could potentially be + statically-allocated if we impose a maximum length on the + strings of interest. */ + edit_distance_t *v0 = new edit_distance_t[len_s + 1]; + edit_distance_t *v1 = new edit_distance_t[len_s + 1]; + + /* The first row is for the case of an empty target string, which + we can reach by deleting every character in the source string. */ + for (int i = 0; i < len_s + 1; i++) + v0[i] = i; + + /* Build successive rows. */ + for (int i = 0; i < len_t; i++) + { + if (debug) + { + printf ("i:%i v0 = ", i); + for (int j = 0; j < len_s + 1; j++) + printf ("%i ", v0[j]); + printf ("\n"); + } + + /* The initial column is for the case of an empty source string; we + can reach prefixes of the target string of length i + by inserting i characters. */ + v1[0] = i + 1; + + /* Build the rest of the row by considering neighbours to + the north, west and northwest. */ + for (int j = 0; j < len_s; j++) + { + edit_distance_t cost = (s[j] == t[i] ? 0 : 1); + edit_distance_t deletion = v1[j] + 1; + edit_distance_t insertion = v0[j + 1] + 1; + edit_distance_t substitution = v0[j] + cost; + edit_distance_t cheapest = MIN (deletion, insertion); + cheapest = MIN (cheapest, substitution); + v1[j + 1] = cheapest; + } + + /* Prepare to move on to next row. */ + for (int j = 0; j < len_s + 1; j++) + v0[j] = v1[j]; + } + + if (debug) + { + printf ("final v1 = "); + for (int j = 0; j < len_s + 1; j++) + printf ("%i ", v1[j]); + printf ("\n"); + } + + edit_distance_t result = v1[len_s]; + delete[] v0; + delete[] v1; + return result; +} + +/* Calculate Levenshtein distance between two nil-terminated strings. + This exists purely for the unit tests. */ + +edit_distance_t +levenshtein_distance (const char *s, const char *t) +{ + return levenshtein_distance (s, strlen (s), t, strlen (t)); +} + +/* Calculate Levenshtein distance between two identifiers. */ + +edit_distance_t +levenshtein_distance (tree ident_s, tree ident_t) +{ + gcc_assert (TREE_CODE (ident_s) == IDENTIFIER_NODE); + gcc_assert (TREE_CODE (ident_t) == IDENTIFIER_NODE); + + return levenshtein_distance (IDENTIFIER_POINTER (ident_s), + IDENTIFIER_LENGTH (ident_s), + IDENTIFIER_POINTER (ident_t), + IDENTIFIER_LENGTH (ident_t)); +} diff --git a/gcc/spellcheck.h b/gcc/spellcheck.h new file mode 100644 index 00000000000..58355d60e92 --- /dev/null +++ b/gcc/spellcheck.h @@ -0,0 +1,32 @@ +/* Find near-matches for strings and identifiers. + Copyright (C) 2015 Free Software Foundation, Inc. + +This file is part of GCC. + +GCC is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free +Software Foundation; either version 3, or (at your option) any later +version. + +GCC is distributed in the hope that it will be useful, but WITHOUT ANY +WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received a copy of the GNU General Public License +along with GCC; see the file COPYING3. If not see +. */ + +#ifndef GCC_SPELLCHECK_H +#define GCC_SPELLCHECK_H + +typedef unsigned int edit_distance_t; +const edit_distance_t MAX_EDIT_DISTANCE = UINT_MAX; + +extern edit_distance_t +levenshtein_distance (const char *s, const char *t); + +extern edit_distance_t +levenshtein_distance (tree ident_s, tree ident_t); + +#endif /* GCC_SPELLCHECK_H */ diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 6531a10068e..67299e015cd 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,11 @@ +2015-11-13 David Malcolm + + * gcc.dg/plugin/levenshtein-test-1.c: New file. + * gcc.dg/plugin/levenshtein_plugin.c: New file. + * gcc.dg/plugin/plugin.exp (plugin_test_list): Add + levenshtein_plugin.c. + * gcc.dg/spellcheck-fields.c: New file. + 2015-11-12 Steven G. Kargl PR fortran/68318 diff --git a/gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c b/gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c new file mode 100644 index 00000000000..ac49992780d --- /dev/null +++ b/gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c @@ -0,0 +1,9 @@ +/* Placeholder C source file for unit-testing gcc/spellcheck.c. */ +/* { dg-do compile } */ +/* { dg-options "-O" } */ + +int +main (int argc, char **argv) +{ + return 0; +} diff --git a/gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c b/gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c new file mode 100644 index 00000000000..3e7dc788930 --- /dev/null +++ b/gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c @@ -0,0 +1,64 @@ +/* Plugin for unittesting gcc/spellcheck.h. */ + +#include "config.h" +#include "gcc-plugin.h" +#include "system.h" +#include "coretypes.h" +#include "spellcheck.h" +#include "diagnostic.h" + +int plugin_is_GPL_compatible; + +static void +levenshtein_distance_unit_test_oneway (const char *a, const char *b, + edit_distance_t expected) +{ + edit_distance_t actual = levenshtein_distance (a, b); + if (actual != expected) + error ("levenshtein_distance (\"%s\", \"%s\") : expected: %i got %i", + a, b, expected, actual); +} + + +static void +levenshtein_distance_unit_test (const char *a, const char *b, + edit_distance_t expected) +{ + /* Run every test both ways to ensure it's symmetric. */ + levenshtein_distance_unit_test_oneway (a, b, expected); + levenshtein_distance_unit_test_oneway (b, a, expected); +} + +/* Callback handler for the PLUGIN_FINISH event; run + levenshtein_distance unit tests here. */ + +static void +on_finish (void */*gcc_data*/, void */*user_data*/) +{ + levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty")); + levenshtein_distance_unit_test ("saturday", "sunday", 3); + levenshtein_distance_unit_test ("foo", "m_foo", 2); + levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3); + levenshtein_distance_unit_test + ("the quick brown fox jumps over the lazy dog", "dog", 40); + levenshtein_distance_unit_test + ("the quick brown fox jumps over the lazy dog", + "the quick brown dog jumps over the lazy fox", + 4); + levenshtein_distance_unit_test + ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,", + "All your base are belong to us", + 44); +} + +int +plugin_init (struct plugin_name_args *plugin_info, + struct plugin_gcc_version */*version*/) +{ + register_callback (plugin_info->base_name, + PLUGIN_FINISH, + on_finish, + NULL); /* void *user_data */ + + return 0; +} diff --git a/gcc/testsuite/gcc.dg/plugin/plugin.exp b/gcc/testsuite/gcc.dg/plugin/plugin.exp index 941bccc4387..ce0a18d0a98 100644 --- a/gcc/testsuite/gcc.dg/plugin/plugin.exp +++ b/gcc/testsuite/gcc.dg/plugin/plugin.exp @@ -66,6 +66,7 @@ set plugin_test_list [list \ { diagnostic_plugin_test_show_locus.c \ diagnostic-test-show-locus-bw.c \ diagnostic-test-show-locus-color.c } \ + { levenshtein_plugin.c levenshtein-test-1.c } \ ] foreach plugin_test $plugin_test_list { diff --git a/gcc/testsuite/gcc.dg/spellcheck-fields.c b/gcc/testsuite/gcc.dg/spellcheck-fields.c new file mode 100644 index 00000000000..01be5508dc5 --- /dev/null +++ b/gcc/testsuite/gcc.dg/spellcheck-fields.c @@ -0,0 +1,63 @@ +/* { dg-do compile } */ + +struct foo +{ + int foo; + int bar; + int baz; +}; + +int test (struct foo *ptr) +{ + return ptr->m_bar; /* { dg-error "'struct foo' has no member named 'm_bar'; did you mean 'bar'?" } */ +} + +int test2 (void) +{ + struct foo instance = {0, 0, 0}; + return instance.m_bar; /* { dg-error "'struct foo' has no member named 'm_bar'; did you mean 'bar'?" } */ +} + +struct s { + struct j { int aa; } kk; + int ab; +}; + +void test3 (struct s x) +{ + x.ac; /* { dg-error "'struct s' has no member named 'ac'; did you mean 'ab'?" } */ +} + +int test4 (struct foo *ptr) +{ + return sizeof (ptr->foa); /* { dg-error "'struct foo' has no member named 'foa'; did you mean 'foo'?" } */ +} + +/* Verify that we don't offer nonsensical suggestions. */ + +int test5 (struct foo *ptr) +{ + return ptr->this_is_unlike_any_of_the_fields; /* { dg-bogus "did you mean" } */ + /* { dg-error "has no member named" "" { target *-*-* } 40 } */ +} + +union u +{ + int color; + int shape; +}; + +int test6 (union u *ptr) +{ + return ptr->colour; /* { dg-error "'union u' has no member named 'colour'; did you mean 'color'?" } */ +} + +struct has_anon +{ + struct { int color; } s; +}; + +int test7 (struct has_anon *ptr) +{ + return ptr->s.colour; /* { dg-error "'struct ' has no member named 'colour'; did you mean 'color'?" } */ +} -- 2.30.2