diff mbox series

spellcheck: support transpositions aka Damerau-Levenshtein (PR other/69968)

Message ID 1525135053-44850-1-git-send-email-dmalcolm@redhat.com
State New
Headers show
Series spellcheck: support transpositions aka Damerau-Levenshtein (PR other/69968) | expand

Commit Message

David Malcolm May 1, 2018, 12:37 a.m. UTC
This patch updates the edit-distance algorithm in spellcheck.c to
support transpositions as well as additions/deletions/substitutions,
so that a transposition error counts as a distance of 1 rather than 2.

This leads to saner suggestions for such cases.

Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.

OK for trunk?

gcc/fortran/ChangeLog:
	PR other/69968
	* misc.c (gfc_closest_fuzzy_match): Update for renaming of
	levenshtein_distance to get_edit_distance.

gcc/ChangeLog:
	PR other/69968
	* spellcheck-tree.c (levenshtein_distance): Rename to...
	(get_edit_distance): ...this, and update for underlying renaming.
	* spellcheck-tree.h (levenshtein_distance): Rename to...
	(get_edit_distance): ...this.
	* spellcheck.c (levenshtein_distance): Rename to...
	(get_edit_distance): ...this.  Convert from Levenshtein distance
	to Damerau-Levenshtein distance by supporting transpositions of
	adjacent characters.  Rename "v1" to "v_next" and "v0" to
	"v_one_ago".
	(selftest::levenshtein_distance_unit_test_oneway): Rename to...
	(selftest::test_edit_distance_unit_test_oneway): ...this, and
	update for underlying renaming.
	(selftest::levenshtein_distance_unit_test): Rename to...
	(selftest::test_get_edit_distance_unit): ...this, and update for
	underlying renaming.
	(selftest::test_find_closest_string): Add example from PR 69968
	where transposition helps
	(selftest::test_metric_conditions): Update for renaming.
	(selftest::test_metric_conditions): Likewise.
	(selftest::spellcheck_c_tests): Likewise.
	* spellcheck.h (levenshtein_distance): Rename both overloads to...
	(get_edit_distance): ...this.
	(best_match::consider): Update for renaming.

gcc/testsuite/ChangeLog:
	PR other/69968
	* gcc.dg/spellcheck-transposition.c: New test.
---
 gcc/fortran/misc.c                              |   4 +-
 gcc/spellcheck-tree.c                           |  12 +-
 gcc/spellcheck-tree.h                           |   2 +-
 gcc/spellcheck.c                                | 143 +++++++++++++++---------
 gcc/spellcheck.h                                |  14 +--
 gcc/testsuite/gcc.dg/spellcheck-transposition.c |  20 ++++
 6 files changed, 124 insertions(+), 71 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/spellcheck-transposition.c

Comments

Jeff Law June 11, 2018, 9:15 p.m. UTC | #1
On 04/30/2018 06:37 PM, David Malcolm wrote:
> This patch updates the edit-distance algorithm in spellcheck.c to
> support transpositions as well as additions/deletions/substitutions,
> so that a transposition error counts as a distance of 1 rather than 2.
> 
> This leads to saner suggestions for such cases.
> 
> Successfully bootstrapped & regrtested on x86_64-pc-linux-gnu.
> 
> OK for trunk?
> 
> gcc/fortran/ChangeLog:
> 	PR other/69968
> 	* misc.c (gfc_closest_fuzzy_match): Update for renaming of
> 	levenshtein_distance to get_edit_distance.
> 
> gcc/ChangeLog:
> 	PR other/69968
> 	* spellcheck-tree.c (levenshtein_distance): Rename to...
> 	(get_edit_distance): ...this, and update for underlying renaming.
> 	* spellcheck-tree.h (levenshtein_distance): Rename to...
> 	(get_edit_distance): ...this.
> 	* spellcheck.c (levenshtein_distance): Rename to...
> 	(get_edit_distance): ...this.  Convert from Levenshtein distance
> 	to Damerau-Levenshtein distance by supporting transpositions of
> 	adjacent characters.  Rename "v1" to "v_next" and "v0" to
> 	"v_one_ago".
> 	(selftest::levenshtein_distance_unit_test_oneway): Rename to...
> 	(selftest::test_edit_distance_unit_test_oneway): ...this, and
> 	update for underlying renaming.
> 	(selftest::levenshtein_distance_unit_test): Rename to...
> 	(selftest::test_get_edit_distance_unit): ...this, and update for
> 	underlying renaming.
> 	(selftest::test_find_closest_string): Add example from PR 69968
> 	where transposition helps
> 	(selftest::test_metric_conditions): Update for renaming.
> 	(selftest::test_metric_conditions): Likewise.
> 	(selftest::spellcheck_c_tests): Likewise.
> 	* spellcheck.h (levenshtein_distance): Rename both overloads to...
> 	(get_edit_distance): ...this.
> 	(best_match::consider): Update for renaming.
> 
> gcc/testsuite/ChangeLog:
> 	PR other/69968
> 	* gcc.dg/spellcheck-transposition.c: New test.
Going to trust you've got the algorithm right :-)

OK

jeff
diff mbox series

Patch

diff --git a/gcc/fortran/misc.c b/gcc/fortran/misc.c
index ec1f548..fb18c5c 100644
--- a/gcc/fortran/misc.c
+++ b/gcc/fortran/misc.c
@@ -286,7 +286,7 @@  get_c_kind(const char *c_kind_name, CInteropKind_t kinds_table[])
 
 
 /* For a given name TYPO, determine the best candidate from CANDIDATES
-   perusing Levenshtein distance.  Frees CANDIDATES before returning.  */
+   using get_edit_distance.  Frees CANDIDATES before returning.  */
 
 const char *
 gfc_closest_fuzzy_match (const char *typo, char **candidates)
@@ -299,7 +299,7 @@  gfc_closest_fuzzy_match (const char *typo, char **candidates)
 
   while (cand && *cand)
     {
-      edit_distance_t dist = levenshtein_distance (typo, tl, *cand,
+      edit_distance_t dist = get_edit_distance (typo, tl, *cand,
 	  strlen (*cand));
       if (dist < best_distance)
 	{
diff --git a/gcc/spellcheck-tree.c b/gcc/spellcheck-tree.c
index 2a66649..596293e 100644
--- a/gcc/spellcheck-tree.c
+++ b/gcc/spellcheck-tree.c
@@ -27,18 +27,18 @@  along with GCC; see the file COPYING3.  If not see
 #include "selftest.h"
 #include "stringpool.h"
 
-/* Calculate Levenshtein distance between two identifiers.  */
+/* Calculate edit distance between two identifiers.  */
 
 edit_distance_t
-levenshtein_distance (tree ident_s, tree ident_t)
+get_edit_distance (tree ident_s, tree ident_t)
 {
   gcc_assert (TREE_CODE (ident_s) == IDENTIFIER_NODE);
   gcc_assert (TREE_CODE (ident_t) == IDENTIFIER_NODE);
 
-  return levenshtein_distance (IDENTIFIER_POINTER (ident_s),
-			       IDENTIFIER_LENGTH (ident_s),
-			       IDENTIFIER_POINTER (ident_t),
-			       IDENTIFIER_LENGTH (ident_t));
+  return get_edit_distance (IDENTIFIER_POINTER (ident_s),
+			    IDENTIFIER_LENGTH (ident_s),
+			    IDENTIFIER_POINTER (ident_t),
+			    IDENTIFIER_LENGTH (ident_t));
 }
 
 /* Given TARGET, an identifier, and CANDIDATES, a vec of identifiers,
diff --git a/gcc/spellcheck-tree.h b/gcc/spellcheck-tree.h
index 1debef5..4324bd2 100644
--- a/gcc/spellcheck-tree.h
+++ b/gcc/spellcheck-tree.h
@@ -25,7 +25,7 @@  along with GCC; see the file COPYING3.  If not see
 /* spellcheck-tree.c  */
 
 extern edit_distance_t
-levenshtein_distance (tree ident_s, tree ident_t);
+get_edit_distance (tree ident_s, tree ident_t);
 
 extern tree
 find_closest_identifier (tree target, const auto_vec<tree> *candidates);
diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c
index 7da006a..e2a8b92 100644
--- a/gcc/spellcheck.c
+++ b/gcc/spellcheck.c
@@ -25,15 +25,18 @@  along with GCC; see the file COPYING3.  If not see
 #include "spellcheck.h"
 #include "selftest.h"
 
-/* The Levenshtein distance is an "edit-distance": the minimal
-   number of one-character insertions, removals or substitutions
-   that are needed to change one string into another.
+/* Get the edit distance between the two strings: the minimal
+   number of edits that are needed to change one string into another,
+   where edits can be one-character insertions, removals, or substitutions,
+   or transpositions of two adjacent characters (counting as one "edit").
 
-   This implementation uses the Wagner-Fischer algorithm.  */
+   This implementation uses the Wagner-Fischer algorithm for the
+   Damerau-Levenshtein distance; specifically, the "optimal string alignment
+   distance" or "restricted edit distance" variant.  */
 
 edit_distance_t
-levenshtein_distance (const char *s, int len_s,
-		      const char *t, int len_t)
+get_edit_distance (const char *s, int len_s,
+		   const char *t, int len_t)
 {
   const bool debug = false;
 
@@ -49,76 +52,86 @@  levenshtein_distance (const char *s, int len_s,
     return len_s;
 
   /* We effectively build a matrix where each (i, j) contains the
-     Levenshtein distance between the prefix strings s[0:j]
-     and t[0:i].
+     distance between the prefix strings s[0:j] and t[0:i].
      Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
-     we simply keep track of the last row, v0 and a new row, v1,
-     which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
-     in favor of two (len_s + 1) allocations.  These could potentially be
+     we simply keep track of the last two rows, v_one_ago and v_two_ago,
+     and a new row, v_next, which avoids an (len_t + 1) * (len_s + 1)
+     allocation and memory accesses in favor of three (len_s + 1)
+     allocations.  These could potentially be
      statically-allocated if we impose a maximum length on the
      strings of interest.  */
-  edit_distance_t *v0 = new edit_distance_t[len_s + 1];
-  edit_distance_t *v1 = new edit_distance_t[len_s + 1];
+  edit_distance_t *v_two_ago = new edit_distance_t[len_s + 1];
+  edit_distance_t *v_one_ago = new edit_distance_t[len_s + 1];
+  edit_distance_t *v_next = new edit_distance_t[len_s + 1];
 
   /* The first row is for the case of an empty target string, which
      we can reach by deleting every character in the source string.  */
   for (int i = 0; i < len_s + 1; i++)
-    v0[i] = i;
+    v_one_ago[i] = i;
 
   /* Build successive rows.  */
   for (int i = 0; i < len_t; i++)
     {
       if (debug)
 	{
-	  printf ("i:%i v0 = ", i);
+	  printf ("i:%i v_one_ago = ", i);
 	  for (int j = 0; j < len_s + 1; j++)
-	    printf ("%i ", v0[j]);
+	    printf ("%i ", v_one_ago[j]);
 	  printf ("\n");
 	}
 
       /* The initial column is for the case of an empty source string; we
 	 can reach prefixes of the target string of length i
 	 by inserting i characters.  */
-      v1[0] = i + 1;
+      v_next[0] = i + 1;
 
       /* Build the rest of the row by considering neighbors to
 	 the north, west and northwest.  */
       for (int j = 0; j < len_s; j++)
 	{
 	  edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
-	  edit_distance_t deletion     = v1[j] + 1;
-	  edit_distance_t insertion    = v0[j + 1] + 1;
-	  edit_distance_t substitution = v0[j] + cost;
+	  edit_distance_t deletion     = v_next[j] + 1;
+	  edit_distance_t insertion    = v_one_ago[j + 1] + 1;
+	  edit_distance_t substitution = v_one_ago[j] + cost;
 	  edit_distance_t cheapest = MIN (deletion, insertion);
 	  cheapest = MIN (cheapest, substitution);
-	  v1[j + 1] = cheapest;
+	  if (i > 0 && j > 0 && s[j] == t[i - 1] && s[j - 1] == t[i])
+	    {
+	      edit_distance_t transposition = v_two_ago[j - 1] + 1;
+	      cheapest = MIN (cheapest, transposition);
+	    }
+	  v_next[j + 1] = cheapest;
 	}
 
       /* Prepare to move on to next row.  */
       for (int j = 0; j < len_s + 1; j++)
-	v0[j] = v1[j];
+	{
+	  v_two_ago[j] = v_one_ago[j];
+	  v_one_ago[j] = v_next[j];
+	}
     }
 
   if (debug)
     {
-      printf ("final v1 = ");
+      printf ("final v_next = ");
       for (int j = 0; j < len_s + 1; j++)
-	printf ("%i ", v1[j]);
+	printf ("%i ", v_next[j]);
       printf ("\n");
     }
 
-  edit_distance_t result = v1[len_s];
-  delete[] v0;
-  delete[] v1;
+  edit_distance_t result = v_next[len_s];
+  delete[] v_two_ago;
+  delete[] v_one_ago;
+  delete[] v_next;
   return result;
 }
 
-/* Calculate Levenshtein distance between two nil-terminated strings.  */
+/* Get the edit distance between two nil-terminated strings.  */
 
 edit_distance_t
-levenshtein_distance (const char *s, const char *t)
+get_edit_distance (const char *s, const char *t)
 {
-  return levenshtein_distance (s, strlen (s), t, strlen (t));
+  return get_edit_distance (s, strlen (s), t, strlen (t));
 }
 
 /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
@@ -155,29 +168,28 @@  namespace selftest {
 
 /* Selftests.  */
 
-/* Verify that the levenshtein_distance (A, B) equals the expected
-   value.  */
+/* Verify that get_edit_distance (A, B) equals the expected value.  */
 
 static void
-levenshtein_distance_unit_test_oneway (const char *a, const char *b,
-				       edit_distance_t expected)
+test_edit_distance_unit_test_oneway (const char *a, const char *b,
+				    edit_distance_t expected)
 {
-  edit_distance_t actual = levenshtein_distance (a, b);
+  edit_distance_t actual = get_edit_distance (a, b);
   ASSERT_EQ (actual, expected);
 }
 
 /* Verify that both
-     levenshtein_distance (A, B)
+     get_edit_distance (A, B)
    and
-     levenshtein_distance (B, A)
+     get_edit_distance (B, A)
    equal the expected value, to ensure that the function is symmetric.  */
 
 static void
-levenshtein_distance_unit_test (const char *a, const char *b,
-				edit_distance_t expected)
+test_get_edit_distance_unit (const char *a, const char *b,
+			     edit_distance_t expected)
 {
-  levenshtein_distance_unit_test_oneway (a, b, expected);
-  levenshtein_distance_unit_test_oneway (b, a, expected);
+  test_edit_distance_unit_test_oneway (a, b, expected);
+  test_edit_distance_unit_test_oneway (b, a, expected);
 }
 
 /* Verify that find_closest_string is sane.  */
@@ -215,6 +227,16 @@  test_find_closest_string ()
      it as a suggestion will be nonsensical.  Verify that we don't offer such
      suggestions.  */
   ASSERT_EQ (NULL, find_closest_string ("banana", &candidates));
+
+  /* Example from PR 69968 where transposition helps.  */
+  candidates.truncate (0);
+  candidates.safe_push("coordx");
+  candidates.safe_push("coordy");
+  candidates.safe_push("coordz");
+  candidates.safe_push("coordx1");
+  candidates.safe_push("coordy1");
+  candidates.safe_push("coordz1");
+  ASSERT_STREQ ("coordz1", find_closest_string ("coorzd1", &candidates));
 }
 
 /* Test data for test_metric_conditions.  */
@@ -227,7 +249,7 @@  static const char * const test_data[] = {
   "1234567890123456789012345678901234567890123456789012345678901234567890"
 };
 
-/* Verify that levenshtein_distance appears to be a sane distance function,
+/* Verify that get_edit_distance appears to be a sane distance function,
    i.e. the conditions for being a metric.  This is done directly for a
    small set of examples, using test_data above.  This is O(N^3) in the size
    of the array, due to the test for the triangle inequality, so we keep the
@@ -243,7 +265,7 @@  test_metric_conditions ()
       for (int j = 0; j < num_test_cases; j++)
 	{
 	  edit_distance_t dist_ij
-	    = levenshtein_distance (test_data[i], test_data[j]);
+	    = get_edit_distance (test_data[i], test_data[j]);
 
 	  /* Identity of indiscernibles: d(i, j) > 0 iff i == j.  */
 	  if (i == j)
@@ -253,43 +275,54 @@  test_metric_conditions ()
 
 	  /* Symmetry: d(i, j) == d(j, i).  */
 	  edit_distance_t dist_ji
-	    = levenshtein_distance (test_data[j], test_data[i]);
+	    = get_edit_distance (test_data[j], test_data[i]);
 	  ASSERT_EQ (dist_ij, dist_ji);
 
 	  /* Triangle inequality.  */
 	  for (int k = 0; k < num_test_cases; k++)
 	    {
 	      edit_distance_t dist_ik
-		= levenshtein_distance (test_data[i], test_data[k]);
+		= get_edit_distance (test_data[i], test_data[k]);
 	      edit_distance_t dist_jk
-		= levenshtein_distance (test_data[j], test_data[k]);
+		= get_edit_distance (test_data[j], test_data[k]);
 	      ASSERT_TRUE (dist_ik <= dist_ij + dist_jk);
 	    }
 	}
     }
 }
 
-/* Verify levenshtein_distance for a variety of pairs of pre-canned
+/* Verify get_edit_distance for a variety of pairs of pre-canned
    inputs, comparing against known-good values.  */
 
 void
 spellcheck_c_tests ()
 {
-  levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty"));
-  levenshtein_distance_unit_test ("saturday", "sunday", 3);
-  levenshtein_distance_unit_test ("foo", "m_foo", 2);
-  levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3);
-  levenshtein_distance_unit_test
+  test_get_edit_distance_unit ("", "nonempty", strlen ("nonempty"));
+  test_get_edit_distance_unit ("saturday", "sunday", 3);
+  test_get_edit_distance_unit ("foo", "m_foo", 2);
+  test_get_edit_distance_unit ("hello_world", "HelloWorld", 3);
+  test_get_edit_distance_unit
     ("the quick brown fox jumps over the lazy dog", "dog", 40);
-  levenshtein_distance_unit_test
+  test_get_edit_distance_unit
     ("the quick brown fox jumps over the lazy dog",
      "the quick brown dog jumps over the lazy fox",
      4);
-  levenshtein_distance_unit_test
+  test_get_edit_distance_unit
     ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
      "All your base are belong to us",
      44);
-  levenshtein_distance_unit_test ("foo", "FOO", 3);
+  test_get_edit_distance_unit ("foo", "FOO", 3);
+  test_get_edit_distance_unit ("fee", "deed", 2);
+  test_get_edit_distance_unit ("coorzd1", "coordx1", 2);
+
+  /* Examples where transposition helps.  */
+  test_get_edit_distance_unit ("ab", "ba", 1);
+  test_get_edit_distance_unit ("ba", "abc", 2);
+  test_get_edit_distance_unit ("coorzd1", "coordz1", 1);
+  test_get_edit_distance_unit ("abcdefghijklmnopqrstuvwxyz",
+			       "bacdefghijklmnopqrstuvwxzy", 2);
+  test_get_edit_distance_unit ("saturday", "sundya", 4);
+  test_get_edit_distance_unit ("signed", "singed", 1);
 
   test_find_closest_string ();
   test_metric_conditions ();
diff --git a/gcc/spellcheck.h b/gcc/spellcheck.h
index f0efff8..41ac16f 100644
--- a/gcc/spellcheck.h
+++ b/gcc/spellcheck.h
@@ -25,11 +25,11 @@  const edit_distance_t MAX_EDIT_DISTANCE = UINT_MAX;
 
 /* spellcheck.c  */
 extern edit_distance_t
-levenshtein_distance (const char *s, int len_s,
-		      const char *t, int len_t);
+get_edit_distance (const char *s, int len_s,
+		   const char *t, int len_t);
 
 extern edit_distance_t
-levenshtein_distance (const char *s, const char *t);
+get_edit_distance (const char *s, const char *t);
 
 extern const char *
 find_closest_string (const char *target,
@@ -73,7 +73,7 @@  struct edit_distance_traits<const char *>
 
    This type accumulates the best possible match against GOAL_TYPE for
    a sequence of elements of CANDIDATE_TYPE, whilst minimizing the
-   number of calls to levenshtein_distance and to
+   number of calls to get_edit_distance and to
    edit_distance_traits<T>::get_length.  */
 
 template <typename GOAL_TYPE, typename CANDIDATE_TYPE>
@@ -126,9 +126,9 @@  class best_match
     /* Otherwise, compute the distance and see if the candidate
        has beaten the previous best value.  */
     edit_distance_t dist
-      = levenshtein_distance (m_goal, m_goal_len,
-			      candidate_traits::get_string (candidate),
-			      candidate_len);
+      = get_edit_distance (m_goal, m_goal_len,
+			   candidate_traits::get_string (candidate),
+			   candidate_len);
     if (dist < m_best_distance)
       {
 	m_best_distance = dist;
diff --git a/gcc/testsuite/gcc.dg/spellcheck-transposition.c b/gcc/testsuite/gcc.dg/spellcheck-transposition.c
new file mode 100644
index 0000000..b787c83
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/spellcheck-transposition.c
@@ -0,0 +1,20 @@ 
+/* PR other/69968.  */
+
+struct {
+  int coordx, coordy, coordz;
+  int coordx1, coordy1, coordz1;
+} c;
+
+/* Consider the misspelling "coorzd1".
+
+   With Levenshtein distance, the misspelling has an edit distance of 2
+   to all 6 of the fields (e.g. via a deletion and a substitution for the
+   first three, and via deletion and insertion for the second three).
+   
+   With Damerau-Levenshtein, the misspelling has an edit distance of 1
+   via transposition to "coordz1", and 2 to the other fields.  */
+
+void foo (void)
+{
+  c.coorzd1 = c.coordy; /* { dg-error "has no member named 'coorzd1'; did you mean 'coordz1'" } */
+}