diff mbox

[1/2] Implement Levenshtein distance

Message ID 1446209267-49800-2-git-send-email-dmalcolm@redhat.com
State New
Headers show

Commit Message

David Malcolm Oct. 30, 2015, 12:47 p.m. UTC
This patch adds an implementation of Levenshtein distance to gcc,
along with unit testing of the algorithm.

The unit testing is implemented via a plugin within gcc.dg/plugin.
(I'd prefer to do this via the unit testing patches I've been
proposing in a separate patch kit, but to avoid depending on that
this kit does it within a custom plugin.)

The plugin actually fails until followup patches are applied, with:

 cc1: error: cannot load plugin ./levenshtein_plugin.so
 ./levenshtein_plugin.so: undefined symbol: _Z20levenshtein_distancePKcS0_

due to nothing in the tree initially using the API, but I've broken
it out in the hope that it makes review easier.

gcc/ChangeLog:
	* Makefile.in (OBJS): Add spellcheck.o.
	* spellcheck.c: New file.
	* spellcheck.h: New file.

gcc/testsuite/ChangeLog:
	* gcc.dg/plugin/levenshtein-test-1.c: New file.
	* gcc.dg/plugin/levenshtein_plugin.c: New file.
	* gcc.dg/plugin/plugin.exp (plugin_test_list): Add
	levenshtein_plugin.c.
---
 gcc/Makefile.in                                  |   1 +
 gcc/spellcheck.c                                 | 136 +++++++++++++++++++++++
 gcc/spellcheck.h                                 |  32 ++++++
 gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c |   9 ++
 gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c |  64 +++++++++++
 gcc/testsuite/gcc.dg/plugin/plugin.exp           |   1 +
 6 files changed, 243 insertions(+)
 create mode 100644 gcc/spellcheck.c
 create mode 100644 gcc/spellcheck.h
 create mode 100644 gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c
 create mode 100644 gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c

Comments

Mikael Morin Nov. 2, 2015, 10:56 a.m. UTC | #1
Hello,

Le 30/10/2015 13:47, David Malcolm a écrit :
> This patch adds an implementation of Levenshtein distance to gcc,
> along with unit testing of the algorithm.
>
I noticed two nits while looking at it.


> diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c
> new file mode 100644
> index 0000000..532df58
> --- /dev/null
> +++ b/gcc/spellcheck.c
> @@ -0,0 +1,136 @@
> +/* Find near-matches for strings and identifiers.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify it under
> +the terms of the GNU General Public License as published by the Free
> +Software Foundation; either version 3, or (at your option) any later
> +version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> +for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "tm.h"
> +#include "tree.h"
> +#include "spellcheck.h"
> +
> +/* The Levenshtein distance is an "edit-distance": the minimal
> +   number of one-character insertions, removals or substitutions
> +   that are needed to change one string into another.
> +
> +   This implementation uses the Wagner-Fischer algorithm.  */
> +
You forgot to explain that m and n are the lengths of s and t 
respectively.  You may want to just use a more descriptive name for them.

> +static edit_distance_t
> +levenshtein_distance (const char *s, int m,
> +		      const char *t, int n)
> +{
> +  const bool debug = false;
> +
> +  if (debug)
> +    {
> +      printf ("s: \"%s\" (m=%i)\n", s, m);
> +      printf ("t: \"%s\" (n=%i)\n", t, n);
> +    }
> +
> +  if (m == 0)
> +    return n;
> +  if (n == 0)
> +    return m;
> +
> +  /* We effectively build a matrix where each (i, j) contains the
> +     Levenshtein distance between the prefix strings s[0:i]
> +     and t[0:j].
The code seems to use s[0:j] and t[0:i] instead, doesn't it?

Thanks
Mikael
diff mbox

Patch

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 2685b38..9fb643e 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1394,6 +1394,7 @@  OBJS = \
 	shrink-wrap.o \
 	simplify-rtx.o \
 	sparseset.o \
+	spellcheck.o \
 	sreal.o \
 	stack-ptr-mod.o \
 	statistics.o \
diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c
new file mode 100644
index 0000000..532df58
--- /dev/null
+++ b/gcc/spellcheck.c
@@ -0,0 +1,136 @@ 
+/* Find near-matches for strings and identifiers.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "tree.h"
+#include "spellcheck.h"
+
+/* The Levenshtein distance is an "edit-distance": the minimal
+   number of one-character insertions, removals or substitutions
+   that are needed to change one string into another.
+
+   This implementation uses the Wagner-Fischer algorithm.  */
+
+static edit_distance_t
+levenshtein_distance (const char *s, int m,
+		      const char *t, int n)
+{
+  const bool debug = false;
+
+  if (debug)
+    {
+      printf ("s: \"%s\" (m=%i)\n", s, m);
+      printf ("t: \"%s\" (n=%i)\n", t, n);
+    }
+
+  if (m == 0)
+    return n;
+  if (n == 0)
+    return m;
+
+  /* We effectively build a matrix where each (i, j) contains the
+     Levenshtein distance between the prefix strings s[0:i]
+     and t[0:j].
+     Rather than actually build an (m + 1) * (n + 1) matrix,
+     we simply keep track of the last row, v0 and a new row, v1,
+     which avoids an (m + 1) * (n + 1) allocation and memory accesses
+     in favor of two (m + 1) allocations.  These could potentially be
+     statically-allocated if we impose a maximum length on the
+     strings of interest.  */
+  edit_distance_t *v0 = new edit_distance_t[m + 1];
+  edit_distance_t *v1 = new edit_distance_t[m + 1];
+
+  /* The first row is for the case of an empty target string, which
+     we can reach by deleting every character in the source string.  */
+  for (int i = 0; i < m + 1; i++)
+    v0[i] = i;
+
+  /* Build successive rows.  */
+  for (int i = 0; i < n; i++)
+    {
+      if (debug)
+	{
+	  printf ("i:%i v0 = ", i);
+	  for (int j = 0; j < m + 1; j++)
+	    printf ("%i ", v0[j]);
+	  printf ("\n");
+	}
+
+      /* The initial column is for the case of an empty source string; we
+	 can reach prefixes of the target string of length i
+	 by inserting i characters.  */
+      v1[0] = i + 1;
+
+      /* Build the rest of the row by considering neighbours to
+	 the north, west and northwest.  */
+      for (int j = 0; j < m; j++)
+	{
+	  edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
+	  edit_distance_t deletion     = v1[j] + 1;
+	  edit_distance_t insertion    = v0[j + 1] + 1;
+	  edit_distance_t substitution = v0[j] + cost;
+	  edit_distance_t cheapest = MIN (deletion, insertion);
+	  cheapest = MIN (cheapest, substitution);
+	  v1[j + 1] = cheapest;
+	}
+
+      /* Prepare to move on to next row.  */
+      for (int j = 0; j < m + 1; j++)
+	v0[j] = v1[j];
+    }
+
+  if (debug)
+    {
+      printf ("final v1 = ");
+      for (int j = 0; j < m + 1; j++)
+	printf ("%i ", v1[j]);
+      printf ("\n");
+    }
+
+  edit_distance_t result = v1[m];
+  delete[] v0;
+  delete[] v1;
+  return result;
+}
+
+/* Calculate Levenshtein distance between two nil-terminated strings.
+   This exists purely for the unit tests.  */
+
+edit_distance_t
+levenshtein_distance (const char *s, const char *t)
+{
+  return levenshtein_distance (s, strlen (s), t, strlen (t));
+}
+
+/* Calculate Levenshtein distance between two identifiers.  */
+
+edit_distance_t
+levenshtein_distance (tree ident_s, tree ident_t)
+{
+  gcc_assert (TREE_CODE (ident_s) == IDENTIFIER_NODE);
+  gcc_assert (TREE_CODE (ident_t) == IDENTIFIER_NODE);
+
+  return levenshtein_distance (IDENTIFIER_POINTER (ident_s),
+			       IDENTIFIER_LENGTH (ident_s),
+			       IDENTIFIER_POINTER (ident_t),
+			       IDENTIFIER_LENGTH (ident_t));
+}
diff --git a/gcc/spellcheck.h b/gcc/spellcheck.h
new file mode 100644
index 0000000..58355d6
--- /dev/null
+++ b/gcc/spellcheck.h
@@ -0,0 +1,32 @@ 
+/* Find near-matches for strings and identifiers.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_SPELLCHECK_H
+#define GCC_SPELLCHECK_H
+
+typedef unsigned int edit_distance_t;
+const edit_distance_t MAX_EDIT_DISTANCE = UINT_MAX;
+
+extern edit_distance_t
+levenshtein_distance (const char *s, const char *t);
+
+extern edit_distance_t
+levenshtein_distance (tree ident_s, tree ident_t);
+
+#endif  /* GCC_SPELLCHECK_H  */
diff --git a/gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c b/gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c
new file mode 100644
index 0000000..ac49992
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c
@@ -0,0 +1,9 @@ 
+/* Placeholder C source file for unit-testing gcc/spellcheck.c.  */
+/* { dg-do compile } */
+/* { dg-options "-O" } */
+
+int
+main (int argc, char **argv)
+{
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c b/gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c
new file mode 100644
index 0000000..3e7dc78
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c
@@ -0,0 +1,64 @@ 
+/* Plugin for unittesting gcc/spellcheck.h.  */
+
+#include "config.h"
+#include "gcc-plugin.h"
+#include "system.h"
+#include "coretypes.h"
+#include "spellcheck.h"
+#include "diagnostic.h"
+
+int plugin_is_GPL_compatible;
+
+static void
+levenshtein_distance_unit_test_oneway (const char *a, const char *b,
+				       edit_distance_t expected)
+{
+  edit_distance_t actual = levenshtein_distance (a, b);
+  if (actual != expected)
+    error ("levenshtein_distance (\"%s\", \"%s\") : expected: %i got %i",
+	   a, b, expected, actual);
+}
+
+
+static void
+levenshtein_distance_unit_test (const char *a, const char *b,
+				edit_distance_t expected)
+{
+  /* Run every test both ways to ensure it's symmetric.  */
+  levenshtein_distance_unit_test_oneway (a, b, expected);
+  levenshtein_distance_unit_test_oneway (b, a, expected);
+}
+
+/* Callback handler for the PLUGIN_FINISH event; run
+   levenshtein_distance unit tests here.  */
+
+static void
+on_finish (void */*gcc_data*/, void */*user_data*/)
+{
+  levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty"));
+  levenshtein_distance_unit_test ("saturday", "sunday", 3);
+  levenshtein_distance_unit_test ("foo", "m_foo", 2);
+  levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3);
+  levenshtein_distance_unit_test
+    ("the quick brown fox jumps over the lazy dog", "dog", 40);
+  levenshtein_distance_unit_test
+    ("the quick brown fox jumps over the lazy dog",
+     "the quick brown dog jumps over the lazy fox",
+     4);
+  levenshtein_distance_unit_test
+    ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
+     "All your base are belong to us",
+     44);
+}
+
+int
+plugin_init (struct plugin_name_args *plugin_info,
+	     struct plugin_gcc_version */*version*/)
+{
+  register_callback (plugin_info->base_name,
+		     PLUGIN_FINISH,
+		     on_finish,
+		     NULL); /* void *user_data */
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.dg/plugin/plugin.exp b/gcc/testsuite/gcc.dg/plugin/plugin.exp
index 39fab6e..80fc539 100644
--- a/gcc/testsuite/gcc.dg/plugin/plugin.exp
+++ b/gcc/testsuite/gcc.dg/plugin/plugin.exp
@@ -63,6 +63,7 @@  set plugin_test_list [list \
     { start_unit_plugin.c start_unit-test-1.c } \
     { finish_unit_plugin.c finish_unit-test-1.c } \
     { wide-int_plugin.c wide-int-test-1.c } \
+    { levenshtein_plugin.c levenshtein-test-1.c } \
 ]
 
 foreach plugin_test $plugin_test_list {