diff mbox

[0/2] Levenshtein-based suggestions (v3)

Message ID 1447416968.7830.43.camel@surprise
State New
Headers show

Commit Message

David Malcolm Nov. 13, 2015, 12:16 p.m. UTC
On Fri, 2015-11-13 at 07:57 +0100, Marek Polacek wrote:
> Probably coming too late, sorry.

> On Thu, Nov 12, 2015 at 09:08:36PM -0500, David Malcolm wrote:
> > index 4335a87..eb4e1fc 100644
> > --- a/gcc/c/c-typeck.c
> > +++ b/gcc/c/c-typeck.c
> > @@ -47,6 +47,7 @@ along with GCC; see the file COPYING3.  If not see
> >  #include "c-family/c-ubsan.h"
> >  #include "cilk.h"
> >  #include "gomp-constants.h"
> > +#include "spellcheck.h"
> >  
> >  /* Possible cases of implicit bad conversions.  Used to select
> >     diagnostic messages in convert_for_assignment.  */
> > @@ -2242,6 +2243,72 @@ lookup_field (tree type, tree component)
> >    return tree_cons (NULL_TREE, field, NULL_TREE);
> >  }
> >  
> > +/* Recursively append candidate IDENTIFIER_NODEs to CANDIDATES.  */
> > +
> > +static void
> > +lookup_field_fuzzy_find_candidates (tree type, tree component,
> > +				    vec<tree> *candidates)
> > +{
> > +  tree field;
> > +  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
> 
> I'd prefer declaring field in the for loop, so
>   for (tree field = TYPE_FIELDS...
> 
> > +	  && (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
> > +	      || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE))
> 
> This is RECORD_OR_UNION_TYPE_P (TREE_TYPE (field)).

I based this code on the code in lookup_field right above it;
I copied-and-pasted that conditional, so presumably it should also be
changed in lookup_field (which has the condition twice)?

FWIW I notice RECORD_OR_UNION_TYPE_P also covers QUAL_UNION_TYPE.

/* Nonzero if TYPE is a record or union type.  */
#define RECORD_OR_UNION_TYPE_P(TYPE)		\
  (TREE_CODE (TYPE) == RECORD_TYPE		\
   || TREE_CODE (TYPE) == UNION_TYPE		\
   || TREE_CODE (TYPE) == QUAL_UNION_TYPE)

FWIW I've made the change in the attached patch (both to the new
function, and to lookup_field).

> > +	{
> > +	  lookup_field_fuzzy_find_candidates (TREE_TYPE (field),
> > +					      component,
> > +					      candidates);
> > +	}
> 
> Lose the brackets around a single statement.

Done.

> > +      if (DECL_NAME (field))
> > +	candidates->safe_push (DECL_NAME (field));
> > +    }
> > +}
> > +
> > +/* Like "lookup_field", but find the closest matching IDENTIFIER_NODE,
> > +   rather than returning a TREE_LIST for an exact match.  */
> > +
> > +static tree
> > +lookup_field_fuzzy (tree type, tree component)
> > +{
> > +  gcc_assert (TREE_CODE (component) == IDENTIFIER_NODE);
> > +
> > +  /* First, gather a list of candidates.  */
> > +  auto_vec <tree> candidates;
> > +
> > +  lookup_field_fuzzy_find_candidates (type, component,
> > +				      &candidates);
> > +
> > +  /* Now determine which is closest.  */
> > +  int i;
> > +  tree identifier;
> > +  tree best_identifier = NULL;
> 
> NULL_TREE

Fixed.

> > +  edit_distance_t best_distance = MAX_EDIT_DISTANCE;
> > +  FOR_EACH_VEC_ELT (candidates, i, identifier)
> > +    {
> > +      gcc_assert (TREE_CODE (identifier) == IDENTIFIER_NODE);
> > +      edit_distance_t dist = levenshtein_distance (component, identifier);
> > +      if (dist < best_distance)
> > +	{
> > +	  best_distance = dist;
> > +	  best_identifier = identifier;
> > +	}
> > +    }
> > +
> > +  /* If more than half of the letters were misspelled, the suggestion is
> > +     likely to be meaningless.  */
> > +  if (best_identifier)
> > +    {
> > +      unsigned int cutoff = MAX (IDENTIFIER_LENGTH (component),
> > +				 IDENTIFIER_LENGTH (best_identifier)) / 2;
> > +      if (best_distance > cutoff)
> > +	return NULL;
> 
> NULL_TREE

Fixed.

> > +/* The Levenshtein distance is an "edit-distance": the minimal
> > +   number of one-character insertions, removals or substitutions
> > +   that are needed to change one string into another.
> > +
> > +   This implementation uses the Wagner-Fischer algorithm.  */
> > +
> > +static edit_distance_t
> > +levenshtein_distance (const char *s, int len_s,
> > +		      const char *t, int len_t)
> > +{
> > +  const bool debug = false;
> > +
> > +  if (debug)
> > +    {
> > +      printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
> > +      printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
> > +    }
> 
> Did you leave this debug stuff here intentionally?

I find it useful, but I believe it's against our policy, so I've deleted
it in the attached patch.

> > +      /* Build the rest of the row by considering neighbours to
> > +	 the north, west and northwest.  */
> > +      for (int j = 0; j < len_s; j++)
> > +	{
> > +	  edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
> > +	  edit_distance_t deletion     = v1[j] + 1;
> > +	  edit_distance_t insertion    = v0[j + 1] + 1;
> 
> The formatting doesn't look right here.

It's correct; it's "diff" inserting two spaces before a tab combined
with our mixed spaces+tab convention: the "for" is at column 6 (6
spaces), whereas the other lines are at column 8 (1 tab), which looks
weird in a diff.

Patch attached; only tested lightly so far (compiles, and passes
spellcheck subset of tests).

OK for trunk if it passes bootstrap&regrtest?

Comments

Marek Polacek Nov. 13, 2015, 3:11 p.m. UTC | #1
On Fri, Nov 13, 2015 at 07:16:08AM -0500, David Malcolm wrote:
> > > +	  && (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
> > > +	      || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE))
> > 
> > This is RECORD_OR_UNION_TYPE_P (TREE_TYPE (field)).
> 
> I based this code on the code in lookup_field right above it;
> I copied-and-pasted that conditional, so presumably it should also be
> changed in lookup_field (which has the condition twice)?
> 
> FWIW I notice RECORD_OR_UNION_TYPE_P also covers QUAL_UNION_TYPE.
> 
> /* Nonzero if TYPE is a record or union type.  */
> #define RECORD_OR_UNION_TYPE_P(TYPE)		\
>   (TREE_CODE (TYPE) == RECORD_TYPE		\
>    || TREE_CODE (TYPE) == UNION_TYPE		\
>    || TREE_CODE (TYPE) == QUAL_UNION_TYPE)
> 
> FWIW I've made the change in the attached patch (both to the new
> function, and to lookup_field).

Sorry, I changed my mind.  Since QUAL_UNION_TYPE is Ada-only thing and
we check (RECORD_TYPE || UNION_TYPE) in a lot of places in the C FE,
introducing RECORD_OR_UNION_TYPE_P everywhere would unnecessarily slow
things down.  I think we should have a C FE-only macro, maybe called
RECORD_OR_UNION_TYPE_P that only checks for those two types, but this is
something that I can deal with later on.

So I think please just drop these changes for now.  Sorry again.

> > > +  const bool debug = false;
> > > +
> > > +  if (debug)
> > > +    {
> > > +      printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
> > > +      printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
> > > +    }
> > 
> > Did you leave this debug stuff here intentionally?
> 
> I find it useful, but I believe it's against our policy, so I've deleted
> it in the attached patch.

Probably.  But you could surely have a separate DEBUG_FUNCTION that can be
called from gdb.
 
> > > +      /* Build the rest of the row by considering neighbours to
> > > +	 the north, west and northwest.  */
> > > +      for (int j = 0; j < len_s; j++)
> > > +	{
> > > +	  edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
> > > +	  edit_distance_t deletion     = v1[j] + 1;
> > > +	  edit_distance_t insertion    = v0[j + 1] + 1;
> > 
> > The formatting doesn't look right here.
> 
> It's correct; it's "diff" inserting two spaces before a tab combined
> with our mixed spaces+tab convention: the "for" is at column 6 (6
> spaces), whereas the other lines are at column 8 (1 tab), which looks
> weird in a diff.

Sorry, what I had in mind were the spaces after "deletion" and "insertion"
before "=".  Not a big deal, of course.
 
> Patch attached; only tested lightly so far (compiles, and passes
> spellcheck subset of tests).
> 
> OK for trunk if it passes bootstrap&regrtest?

Ok modulo the RECORD_OR_UNION_TYPE_P changes, thanks.

	Marek
Bernd Schmidt Nov. 13, 2015, 3:44 p.m. UTC | #2
On 11/13/2015 04:11 PM, Marek Polacek wrote:
> Sorry, I changed my mind.  Since QUAL_UNION_TYPE is Ada-only thing and
> we check (RECORD_TYPE || UNION_TYPE) in a lot of places in the C FE,
> introducing RECORD_OR_UNION_TYPE_P everywhere would unnecessarily slow
> things down.

I don't think so, the three codes are adjacent so we should be 
generating "(unsigned)(code - RECORD_TYPE) < 3".


Bernd
Marek Polacek Nov. 13, 2015, 3:53 p.m. UTC | #3
On Fri, Nov 13, 2015 at 04:44:21PM +0100, Bernd Schmidt wrote:
> On 11/13/2015 04:11 PM, Marek Polacek wrote:
> >Sorry, I changed my mind.  Since QUAL_UNION_TYPE is Ada-only thing and
> >we check (RECORD_TYPE || UNION_TYPE) in a lot of places in the C FE,
> >introducing RECORD_OR_UNION_TYPE_P everywhere would unnecessarily slow
> >things down.
> 
> I don't think so, the three codes are adjacent so we should be generating
> "(unsigned)(code - RECORD_TYPE) < 3".

Interesting.  Yeah, if we change the RECORD_OR_UNION_TYPE_P macro to this
form, then we don't need a separate version for the C FE.

I'll look at this cleanup in the next week.

	Marek
Jakub Jelinek Nov. 13, 2015, 3:56 p.m. UTC | #4
On Fri, Nov 13, 2015 at 04:53:05PM +0100, Marek Polacek wrote:
> On Fri, Nov 13, 2015 at 04:44:21PM +0100, Bernd Schmidt wrote:
> > On 11/13/2015 04:11 PM, Marek Polacek wrote:
> > >Sorry, I changed my mind.  Since QUAL_UNION_TYPE is Ada-only thing and
> > >we check (RECORD_TYPE || UNION_TYPE) in a lot of places in the C FE,
> > >introducing RECORD_OR_UNION_TYPE_P everywhere would unnecessarily slow
> > >things down.
> > 
> > I don't think so, the three codes are adjacent so we should be generating
> > "(unsigned)(code - RECORD_TYPE) < 3".
> 
> Interesting.  Yeah, if we change the RECORD_OR_UNION_TYPE_P macro to this
> form, then we don't need a separate version for the C FE.

Why?  The compiler should do that already, or do you care about
-O0 builds or host compilers other than gcc that aren't able to do this?
The disadvantage of writing it manually that way is that you need to assert
somewhere that the 3 values indeed are consecutive, while
when the (host?) compiler performs this optimization, it does that only if
they are consecutive, if they are not, the code will be just less efficient.

	Jakub
Marek Polacek Nov. 13, 2015, 4:02 p.m. UTC | #5
On Fri, Nov 13, 2015 at 04:56:30PM +0100, Jakub Jelinek wrote:
> On Fri, Nov 13, 2015 at 04:53:05PM +0100, Marek Polacek wrote:
> > On Fri, Nov 13, 2015 at 04:44:21PM +0100, Bernd Schmidt wrote:
> > > I don't think so, the three codes are adjacent so we should be generating
> > > "(unsigned)(code - RECORD_TYPE) < 3".
> > 
> > Interesting.  Yeah, if we change the RECORD_OR_UNION_TYPE_P macro to this
> > form, then we don't need a separate version for the C FE.
> 
> Why?  The compiler should do that already, or do you care about
> -O0 builds or host compilers other than gcc that aren't able to do this?

I don't.

> The disadvantage of writing it manually that way is that you need to assert
> somewhere that the 3 values indeed are consecutive, while
> when the (host?) compiler performs this optimization, it does that only if
> they are consecutive, if they are not, the code will be just less efficient.

Ok, I understand now what Bernd meant.  I didn't realize the compiler already
does such optimization with those _TYPEs...

	Marek
diff mbox

Patch

From b8ed3cbe9cc000416941e0108036f24f4483cdb0 Mon Sep 17 00:00:00 2001
From: David Malcolm <dmalcolm@redhat.com>
Date: Fri, 13 Nov 2015 07:22:16 -0500
Subject: [PATCH] Cleanups of spellchecking code

gcc/c/ChangeLog:
	* c-typeck.c (lookup_field): Use RECORD_OR_UNION_TYPE_P
	in two places.
	(lookup_field_fuzzy_find_candidates): Use RECORD_OR_UNION_TYPE_P;
	formatting cleanups.
	(lookup_field_fuzzy): Use NULL_TREE rather than NULL.

gcc/ChangeLog:
	* spellcheck.c (levenshtein_distance): Remove debug code.
---
 gcc/c/c-typeck.c | 24 +++++++++---------------
 gcc/spellcheck.c | 24 ------------------------
 2 files changed, 9 insertions(+), 39 deletions(-)

diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c
index eb4e1fc..b084ca5 100644
--- a/gcc/c/c-typeck.c
+++ b/gcc/c/c-typeck.c
@@ -2166,8 +2166,7 @@  lookup_field (tree type, tree component)
 	      while (DECL_NAME (field_array[bot]) == NULL_TREE)
 		{
 		  field = field_array[bot++];
-		  if (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
-		      || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE)
+		  if (RECORD_OR_UNION_TYPE_P (TREE_TYPE (field)))
 		    {
 		      tree anon = lookup_field (TREE_TYPE (field), component);
 
@@ -2213,8 +2212,7 @@  lookup_field (tree type, tree component)
       for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
 	{
 	  if (DECL_NAME (field) == NULL_TREE
-	      && (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
-		  || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE))
+	      && RECORD_OR_UNION_TYPE_P (TREE_TYPE (field)))
 	    {
 	      tree anon = lookup_field (TREE_TYPE (field), component);
 
@@ -2249,17 +2247,13 @@  static void
 lookup_field_fuzzy_find_candidates (tree type, tree component,
 				    vec<tree> *candidates)
 {
-  tree field;
-  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
+  for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
     {
       if (DECL_NAME (field) == NULL_TREE
-	  && (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
-	      || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE))
-	{
-	  lookup_field_fuzzy_find_candidates (TREE_TYPE (field),
-					      component,
-					      candidates);
-	}
+	  && RECORD_OR_UNION_TYPE_P (TREE_TYPE (field)))
+	lookup_field_fuzzy_find_candidates (TREE_TYPE (field),
+					    component,
+					    candidates);
 
       if (DECL_NAME (field))
 	candidates->safe_push (DECL_NAME (field));
@@ -2283,7 +2277,7 @@  lookup_field_fuzzy (tree type, tree component)
   /* Now determine which is closest.  */
   int i;
   tree identifier;
-  tree best_identifier = NULL;
+  tree best_identifier = NULL_TREE;
   edit_distance_t best_distance = MAX_EDIT_DISTANCE;
   FOR_EACH_VEC_ELT (candidates, i, identifier)
     {
@@ -2303,7 +2297,7 @@  lookup_field_fuzzy (tree type, tree component)
       unsigned int cutoff = MAX (IDENTIFIER_LENGTH (component),
 				 IDENTIFIER_LENGTH (best_identifier)) / 2;
       if (best_distance > cutoff)
-	return NULL;
+	return NULL_TREE;
     }
 
   return best_identifier;
diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c
index 32854cf..6432e385 100644
--- a/gcc/spellcheck.c
+++ b/gcc/spellcheck.c
@@ -34,14 +34,6 @@  edit_distance_t
 levenshtein_distance (const char *s, int len_s,
 		      const char *t, int len_t)
 {
-  const bool debug = false;
-
-  if (debug)
-    {
-      printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
-      printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
-    }
-
   if (len_s == 0)
     return len_t;
   if (len_t == 0)
@@ -67,14 +59,6 @@  levenshtein_distance (const char *s, int len_s,
   /* Build successive rows.  */
   for (int i = 0; i < len_t; i++)
     {
-      if (debug)
-	{
-	  printf ("i:%i v0 = ", i);
-	  for (int j = 0; j < len_s + 1; j++)
-	    printf ("%i ", v0[j]);
-	  printf ("\n");
-	}
-
       /* The initial column is for the case of an empty source string; we
 	 can reach prefixes of the target string of length i
 	 by inserting i characters.  */
@@ -98,14 +82,6 @@  levenshtein_distance (const char *s, int len_s,
 	v0[j] = v1[j];
     }
 
-  if (debug)
-    {
-      printf ("final v1 = ");
-      for (int j = 0; j < len_s + 1; j++)
-	printf ("%i ", v1[j]);
-      printf ("\n");
-    }
-
   edit_distance_t result = v1[len_s];
   delete[] v0;
   delete[] v1;
-- 
1.8.5.3