diff mbox

[WIP] Use Levenshtein distance for various misspellings in C frontend v2

Message ID 1442331491-11471-1-git-send-email-dmalcolm@redhat.com
State New
Headers show

Commit Message

David Malcolm Sept. 15, 2015, 3:38 p.m. UTC
Updated patch attached, which is now independent of the rest of the
patch kit; see below.  Various other comments inline.

On Fri, 2015-09-11 at 17:30 +0200, Manuel López-Ibáñez wrote:
On 10/09/15 22:28, David Malcolm wrote:
> > There are a couple of FIXMEs here:
> > * where to call levenshtein_distance_unit_tests
>
> Should this be part of make check? Perhaps a small program that is compiled and
> linked with spellcheck.c? This would be possible if spellcheck.c did not depend
> on tree.h or tm.h, which I doubt it needs to.

Ideally I'd like to put them into a unittest plugin I've been working on:
 https://gcc.gnu.org/ml/gcc-patches/2015-06/msg00765.html
In the meantime, they only get run in an ENABLE_CHECKING build.

> > * should we attempt error-recovery in c-typeck.c:build_component_ref
>
> I would say yes, but why not leave this discussion to a later patch? The
> current one seems useful enough.

(nods)

> > +
> > +/* Look for the closest match for NAME within the currently valid
> > +   scopes.
> > +
> > +   This finds the identifier with the lowest Levenshtein distance to
> > +   NAME.  If there are multiple candidates with equal minimal distance,
> > +   the first one found is returned.  Scopes are searched from innermost
> > +   outwards, and within a scope in reverse order of declaration, thus
> > +   benefiting candidates "near" to the current scope.  */
> > +
> > +tree
> > +lookup_name_fuzzy (tree name)
> > +{
> > +  gcc_assert (TREE_CODE (name) == IDENTIFIER_NODE);
> > +
> > +  c_binding *best_binding = NULL;
> > +  int best_distance = INT_MAX;
> > +
> > +  for (c_scope *scope = current_scope; scope; scope = scope->outer)
> > +    for (c_binding *binding = scope->bindings; binding; binding = binding->prev)
> > +      {
> > +	if (!binding->id)
> > +	  continue;
> > +	int dist = levenshtein_distance (name, binding->id);
> > +	if (dist < best_distance)
>
> I guess 'dist' cannot be negative. Can it be zero? If not, wouldn't be
> appropriate to exit as soon as it becomes 1?

It can't be negative, so I've converted it to unsigned int, and introduced an
"edit_distance_t" typedef for it.

It would be appropriate to exit as soon as we reach 1 if we agree
that lookup_name_fuzzy isn't intended to find exact matches (since
otherwise we might fail to return an exact match if we see a
distance 1 match first).

I haven't implemented that early bailout in this iteration of the
patch; should I?

> Is this code discriminating between types and names? That is, what happens for:
>
> typedef int ins;
>
> int foo(void)
> {
>     int inr;
>     inp x;
> }

Thanks.  I've fixed that.

> > +/* Recursively append candidate IDENTIFIER_NODEs to CANDIDATES.  */
> > +
> > +static void
> > +lookup_field_fuzzy_find_candidates (tree type, tree component,
> > +				    vec<tree> *candidates)
> > +{
> > +  tree field;
> > +  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
> > +    {
> > +      if (DECL_NAME (field) == NULL_TREE
> > +	  && (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
> > +	      || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE))
> > +	{
> > +	  lookup_field_fuzzy_find_candidates (TREE_TYPE (field),
> > +					      component,
> > +					      candidates);
> > +	}
> > +
> > +      if (DECL_NAME (field))
> > +	candidates->safe_push (field);
> > +    }
> > +}
>
> This is appending inner-most, isn't it? Thus, given:

Yes.

> struct s{
>      struct j { int aa; } kk;
>      int aa;
> };
>
> void foo(struct s x)
> {
>      x.ab;
> }
>
> it will find s::j::aa before s::aa, no?

AIUI, it doesn't look inside the "kk", only for anonymous structs.

I added a test for this.

> >   tree
> > -build_component_ref (location_t loc, tree datum, tree component)
> > +build_component_ref (location_t loc, tree datum, tree component,
> > +		     source_range *ident_range)
> >   {
> >     tree type = TREE_TYPE (datum);
> >     enum tree_code code = TREE_CODE (type);
> > @@ -2294,7 +2356,31 @@ build_component_ref (location_t loc, tree datum, tree component)
> >
> >         if (!field)
> >   	{
> > -	  error_at (loc, "%qT has no member named %qE", type, component);
> > +	  if (!ident_range)
> > +	    {
> > +	      error_at (loc, "%qT has no member named %qE",
> > +			type, component);
> > +	      return error_mark_node;
> > +	    }
> > +	  gcc_rich_location richloc (*ident_range);
> > +	  if (TREE_CODE (datum) == INDIRECT_REF)
> > +	    richloc.add_expr (TREE_OPERAND (datum, 0));
> > +	  else
> > +	    richloc.add_expr (datum);
> > +	  field = lookup_field_fuzzy (type, component);
> > +	  if (field)
> > +	    {
> > +	      error_at_rich_loc
> > +		(&richloc,
> > +		 "%qT has no member named %qE; did you mean %qE?",
> > +		 type, component, field);
> > +	      /* FIXME: error recovery: should we try to keep going,
> > +		 with "field"? (having issued an error, and hence no
> > +		 output).  */
> > +	    }
> > +	  else
> > +	    error_at_rich_loc (&richloc, "%qT has no member named %qE",
> > +			       type, component);
> >   	  return error_mark_node;
> >   	}
>
> I don't understand why looking for a candidate or not depends on ident_range.

This is because the old patch was integrated with the source_range
ideas from the rest of the patch kit.  I've taken that out in the new
version.

> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.dg/spellcheck.c
> > @@ -0,0 +1,36 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-fdiagnostics-show-caret" } */
> > +
> > +struct foo
> > +{
> > +  int foo;
> > +  int bar;
> > +  int baz;
> > +};
> > +
> > +int test (struct foo *ptr)
> > +{
> > +  return ptr->m_bar; /* { dg-error "'struct foo' has no member named 'm_bar'; did you mean 'bar'?" } */
> > +
> > +/* { dg-begin-multiline-output "" }
> > +   return ptr->m_bar;
> > +          ~~~  ^~~~~
> > +   { dg-end-multiline-output "" } */
> > +}
> > +
> > +int test2 (void)
> > +{
> > +  struct foo instance = {};
> > +  return instance.m_bar; /* { dg-error "'struct foo' has no member named 'm_bar'; did you mean 'bar'?" } */
> > +
> > +/* { dg-begin-multiline-output "" }
> > +   return instance.m_bar;
> > +          ~~~~~~~~ ^~~~~
> > +   { dg-end-multiline-output "" } */
> > +}
> > +
> > +int64 foo; /* { dg-error "unknown type name 'int64'; did you mean 'int'?" } */
> > +/* { dg-begin-multiline-output "" }
> > + int64 foo;
> > + ^~~~~
> > +   { dg-end-multiline-output "" } */
> >
>
>
> These tests could also test different scopes, clashes between types and fields
> and variables, and the correct behavior for nested struct/unions.

Thanks; added to TODO list below.

> I wonder whether it would be worth it to extend existing tests if now they emit
> the "do you mean" part to be sure they are doing the right thing.

Thanks; added to TODO list below.  These are passing now due to the
dg-error regexp not caring about the exact message.

Many of the field names in these tests are very short; it's not clear
to me that there's a good single suggestion that can be made if there
are several 1-char field names to choose from.

I noticed that the old patch could sometimes offer unhelpful
suggestions; I added a test for this:

  nonsensical_suggestion_t var;

where it would suggest something unrelated.  I suppressed that in
lookup_name_fuzzy by only offering a suggestion if the distance is less
than half of the length of what the user typed and that seemed to work
well, albeit in the few cases I tried.  I suspect that we may
want a similar suppression for lookup_field_fuzzy.

> Cheers,
>
> Manuel.

Thanks.

Update version of the patch follows.

This version of the patch is independent of the rest of the kit,
and applies directly on top of trunk (r227562, specifically).

Changes since previous version:
- it's now independent of the rest of the patch kit.
- removal of tracking of fieldname range "ident_range" from calls
  to build_component_ref, just using the location_t.
- removal of show-caret/multiline tests from testcase
- introduced a typedef "edit_distance_t", using it to convert
  the underlying type from "int" to "unsigned int".
- "lookup_name_fuzzy" now only considers bindings of a TYPE_DECL,
  thus matching "ins" rather than "inr" for the example given by Manu
  here:
    https://gcc.gnu.org/ml/gcc-patches/2015-09/msg00813.html
- lookup_name_fuzzy: don't offer a suggestion if the distance is too
  high, since such a suggestion is likely to be bogus
- added test coverage to try to cover the above
- reimplemented levenshtein_distance to avoid allocating and building
  an (m + 1) * (n + 1) matrix in favor of just tracking two rows
  at once
- made levenshtein_distance_unit_tests automatically run each test
  both ways; added some more tests

I attempted the error-recovery in build_component_ref, but I found it
could make things worse.  For example, in
gcc/testsuite/gcc.dg/anon-struct-11.c:
  f3 (&e.D);		/* { dg-error "no member" } */
becomes:
  error: 'struct E' has no member named 'D'; did you mean 'b'?
but if we try to use "b", this then leads to thes additional bogus
messages:
  warning: passing argument 1 of 'f3' from incompatible
    pointer type [-Wincompatible-pointer-types]
  note: expected 'D * {aka struct <anonymous> *}' but
    argument is of type 'char *'

Similarly, in gcc/testsuite/gcc.dg/c11-anon-struct-2.c:
  x.i = 0; /* { dg-error "has no member" } */
this becomes:
  error: 'struct s5' has no member named 'i'; did you mean 'a'?
which then leads to:
  error: incompatible types when assigning to type
   'struct <anonymous>' from type 'int'

So this version of the patch doesn't attempted to use the suggested
field.

Successfully bootstrapped&regrtested on x86_64-pc-linux-gnu; adds
9 PASSes to gcc.sum.

I'm posting it here as a work-in-progress.

Remaining work:
  * the FIXME about where to call levenshtein_distance_unit_tests;
there's an argument that this could be moved to libiberty (is C++
allowed in libiberty?); I'd prefer to get the unittest idea from
 https://gcc.gnu.org/ml/gcc-patches/2015-06/msg00765.html
into trunk, and then move it into there.  Right now it's all
gcc_assert, so optimizes away in a production build.
  * more testcases as noted by Manu above
  * try existing testcases as noted by Manu above
  * possible early return when distance == 1
  * perhaps some kind of limit on the number of iterations inside
levenshtein_distance (e.g. governed by a param).
  * perhaps some ability to pass in a limit on the
distance we care about, so we can immediately reject distances
that will be above this

It also strikes me that sometimes a "misspelling" is a missing
header file, and that the most helpful thing to do might be to
suggest including that header file.  For instance given:
  $ cat /tmp/foo.c
  int64_t i;

  $ ./xgcc -B. /tmp/foo.c
  /tmp/foo.c:1:1: error: unknown type name ‘int64_t’
  int64_t i;
  ^
(where the suggestion of "int" is suppressed due to the distance
being too long) it might be helpful to print:
  /tmp/foo.c:1:1: error: unknown type name 'int64_t'; did you mean to include '<inttypes.h>'?
  int64_t i;
  ^
That does seem like a separate enhancement, though.

gcc/ChangeLog:
	* Makefile.in (OBJS): Add spellcheck.o.
	* spellcheck.c: New file.
	* spellcheck.h: New file.

gcc/c-family/ChangeLog:
	* c-common.h (lookup_name_fuzzy): New decl.

gcc/c/ChangeLog:
	* c-decl.c: Include spellcheck.h.
	(lookup_name_fuzzy): New.
	* c-parser.c: Include spellcheck.h.
	(c_parser_declaration_or_fndef): If "unknown type name",
	attempt to suggest a close match using lookup_name_fuzzy.
	* c-typeck.c: Include spellcheck.h.
	(lookup_field_fuzzy_find_candidates): New function.
	(lookup_field_fuzzy): New function.
	(build_component_ref): Use lookup_field_fuzzy to suggest close
	matches when printing field-not-found error.

gcc/testsuite/ChangeLog:
	* gcc.dg/spellcheck.c: New file.
---
 gcc/Makefile.in                   |   1 +
 gcc/c-family/c-common.h           |   1 +
 gcc/c/c-decl.c                    |  45 +++++++++++
 gcc/c/c-parser.c                  |  11 ++-
 gcc/c/c-typeck.c                  |  66 ++++++++++++++-
 gcc/spellcheck.c                  | 166 ++++++++++++++++++++++++++++++++++++++
 gcc/spellcheck.h                  |  35 ++++++++
 gcc/testsuite/gcc.dg/spellcheck.c |  49 +++++++++++
 8 files changed, 371 insertions(+), 3 deletions(-)
 create mode 100644 gcc/spellcheck.c
 create mode 100644 gcc/spellcheck.h
 create mode 100644 gcc/testsuite/gcc.dg/spellcheck.c

Comments

Manuel López-Ibáñez Sept. 15, 2015, 4:20 p.m. UTC | #1
On 15 September 2015 at 17:38, David Malcolm <dmalcolm@redhat.com> wrote:
> It would be appropriate to exit as soon as we reach 1 if we agree
> that lookup_name_fuzzy isn't intended to find exact matches (since
> otherwise we might fail to return an exact match if we see a
> distance 1 match first).
>
> I haven't implemented that early bailout in this iteration of the
> patch; should I?

Everything I say are mere suggestions since I cannot approve patches,
so I would say: "whatever the approvers say!" :-)

Nevertheless, how an exact match would play out?

unknown type name 'inp'; did you mean 'inp'?


> Remaining work:
>   * the FIXME about where to call levenshtein_distance_unit_tests;
> there's an argument that this could be moved to libiberty (is C++
> allowed in libiberty?); I'd prefer to get the unittest idea from
>  https://gcc.gnu.org/ml/gcc-patches/2015-06/msg00765.html
> into trunk, and then move it into there.  Right now it's all
> gcc_assert, so optimizes away in a production build.

That would be gcc_checking_assert, no? gcc_assert() should still work
in release mode, AFAIU.

>   * try existing testcases as noted by Manu above

I think the most useful part of checking those is that we have really
wacky testcases and it may show cases where things go horribly wrong.
Plus, if the suggestion is perfect, then you have another testcase for
free. This is what I was doing with the Wformat precise locations.

> It also strikes me that sometimes a "misspelling" is a missing
> header file, and that the most helpful thing to do might be to
> suggest including that header file.  For instance given:
>   $ cat /tmp/foo.c
>   int64_t i;
>
>   $ ./xgcc -B. /tmp/foo.c
>   /tmp/foo.c:1:1: error: unknown type name ‘int64_t’
>   int64_t i;
>   ^
> (where the suggestion of "int" is suppressed due to the distance
> being too long) it might be helpful to print:
>   /tmp/foo.c:1:1: error: unknown type name 'int64_t'; did you mean to include '<inttypes.h>'?
>   int64_t i;
>   ^
> That does seem like a separate enhancement, though.

We already suggest header files for built-in functions:
https://gcc.gnu.org/PR59717
Doing the same for "standard" types would not be a stretch, but yes,
it is a separate thing.


> diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c
> new file mode 100644
> index 0000000..c407aa0
> --- /dev/null
> +++ b/gcc/spellcheck.c
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "tm.h"
> +#include "tree.h"
> +#include "spellcheck.h"

Why tm.h?

Great work!

Manuel.
Richard Biener Sept. 16, 2015, 8:34 a.m. UTC | #2
On Tue, Sep 15, 2015 at 5:38 PM, David Malcolm <dmalcolm@redhat.com> wrote:
> Updated patch attached, which is now independent of the rest of the
> patch kit; see below.  Various other comments inline.
>
> On Fri, 2015-09-11 at 17:30 +0200, Manuel López-Ibáñez wrote:
> On 10/09/15 22:28, David Malcolm wrote:
>> > There are a couple of FIXMEs here:
>> > * where to call levenshtein_distance_unit_tests
>>
>> Should this be part of make check? Perhaps a small program that is compiled and
>> linked with spellcheck.c? This would be possible if spellcheck.c did not depend
>> on tree.h or tm.h, which I doubt it needs to.
>
> Ideally I'd like to put them into a unittest plugin I've been working on:
>  https://gcc.gnu.org/ml/gcc-patches/2015-06/msg00765.html
> In the meantime, they only get run in an ENABLE_CHECKING build.
>
>> > * should we attempt error-recovery in c-typeck.c:build_component_ref
>>
>> I would say yes, but why not leave this discussion to a later patch? The
>> current one seems useful enough.
>
> (nods)
>
>> > +
>> > +/* Look for the closest match for NAME within the currently valid
>> > +   scopes.
>> > +
>> > +   This finds the identifier with the lowest Levenshtein distance to
>> > +   NAME.  If there are multiple candidates with equal minimal distance,
>> > +   the first one found is returned.  Scopes are searched from innermost
>> > +   outwards, and within a scope in reverse order of declaration, thus
>> > +   benefiting candidates "near" to the current scope.  */
>> > +
>> > +tree
>> > +lookup_name_fuzzy (tree name)
>> > +{
>> > +  gcc_assert (TREE_CODE (name) == IDENTIFIER_NODE);
>> > +
>> > +  c_binding *best_binding = NULL;
>> > +  int best_distance = INT_MAX;
>> > +
>> > +  for (c_scope *scope = current_scope; scope; scope = scope->outer)
>> > +    for (c_binding *binding = scope->bindings; binding; binding = binding->prev)
>> > +      {
>> > +   if (!binding->id)
>> > +     continue;
>> > +   int dist = levenshtein_distance (name, binding->id);
>> > +   if (dist < best_distance)

Btw, this looks quite expensive - I'm sure we want to limit the effort
here a bit.
Also not allowing arbitrary "best" distances and not do this for very simple
identifiers such as 'i'.  Say,

foo()
{
  int i;
  for (i =0; i<10; ++i)
   for (j = 0; j < 12; ++j)
    ;
}

I don't want us to suggest using 'i' instead of j (a good hint is that
I used 'j'
multiple times).

So while the idea might be an improvement to selected cases it can cause
confusion as well.  And if using the suggestion for further parsing it can
cause worse followup errors (unless we can limit such "fixup" use to the
cases where we can parse the result without errors).  Consider

foo()
{
  foz = 1;
}

if we suggest 'foo' instead of foz then we'll get a more confusing followup
error if we actually use it.

But maybe you already handle all these cases (didn't look at the patch,
just saw the above expensive loop plus dropped some obvious concerns).

Richard.

>> I guess 'dist' cannot be negative. Can it be zero? If not, wouldn't be
>> appropriate to exit as soon as it becomes 1?
>
> It can't be negative, so I've converted it to unsigned int, and introduced an
> "edit_distance_t" typedef for it.
>
> It would be appropriate to exit as soon as we reach 1 if we agree
> that lookup_name_fuzzy isn't intended to find exact matches (since
> otherwise we might fail to return an exact match if we see a
> distance 1 match first).
>
> I haven't implemented that early bailout in this iteration of the
> patch; should I?
>
>> Is this code discriminating between types and names? That is, what happens for:
>>
>> typedef int ins;
>>
>> int foo(void)
>> {
>>     int inr;
>>     inp x;
>> }
>
> Thanks.  I've fixed that.
>
>> > +/* Recursively append candidate IDENTIFIER_NODEs to CANDIDATES.  */
>> > +
>> > +static void
>> > +lookup_field_fuzzy_find_candidates (tree type, tree component,
>> > +                               vec<tree> *candidates)
>> > +{
>> > +  tree field;
>> > +  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
>> > +    {
>> > +      if (DECL_NAME (field) == NULL_TREE
>> > +     && (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
>> > +         || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE))
>> > +   {
>> > +     lookup_field_fuzzy_find_candidates (TREE_TYPE (field),
>> > +                                         component,
>> > +                                         candidates);
>> > +   }
>> > +
>> > +      if (DECL_NAME (field))
>> > +   candidates->safe_push (field);
>> > +    }
>> > +}
>>
>> This is appending inner-most, isn't it? Thus, given:
>
> Yes.
>
>> struct s{
>>      struct j { int aa; } kk;
>>      int aa;
>> };
>>
>> void foo(struct s x)
>> {
>>      x.ab;
>> }
>>
>> it will find s::j::aa before s::aa, no?
>
> AIUI, it doesn't look inside the "kk", only for anonymous structs.
>
> I added a test for this.
>
>> >   tree
>> > -build_component_ref (location_t loc, tree datum, tree component)
>> > +build_component_ref (location_t loc, tree datum, tree component,
>> > +                source_range *ident_range)
>> >   {
>> >     tree type = TREE_TYPE (datum);
>> >     enum tree_code code = TREE_CODE (type);
>> > @@ -2294,7 +2356,31 @@ build_component_ref (location_t loc, tree datum, tree component)
>> >
>> >         if (!field)
>> >     {
>> > -     error_at (loc, "%qT has no member named %qE", type, component);
>> > +     if (!ident_range)
>> > +       {
>> > +         error_at (loc, "%qT has no member named %qE",
>> > +                   type, component);
>> > +         return error_mark_node;
>> > +       }
>> > +     gcc_rich_location richloc (*ident_range);
>> > +     if (TREE_CODE (datum) == INDIRECT_REF)
>> > +       richloc.add_expr (TREE_OPERAND (datum, 0));
>> > +     else
>> > +       richloc.add_expr (datum);
>> > +     field = lookup_field_fuzzy (type, component);
>> > +     if (field)
>> > +       {
>> > +         error_at_rich_loc
>> > +           (&richloc,
>> > +            "%qT has no member named %qE; did you mean %qE?",
>> > +            type, component, field);
>> > +         /* FIXME: error recovery: should we try to keep going,
>> > +            with "field"? (having issued an error, and hence no
>> > +            output).  */
>> > +       }
>> > +     else
>> > +       error_at_rich_loc (&richloc, "%qT has no member named %qE",
>> > +                          type, component);
>> >       return error_mark_node;
>> >     }
>>
>> I don't understand why looking for a candidate or not depends on ident_range.
>
> This is because the old patch was integrated with the source_range
> ideas from the rest of the patch kit.  I've taken that out in the new
> version.
>
>> > --- /dev/null
>> > +++ b/gcc/testsuite/gcc.dg/spellcheck.c
>> > @@ -0,0 +1,36 @@
>> > +/* { dg-do compile } */
>> > +/* { dg-options "-fdiagnostics-show-caret" } */
>> > +
>> > +struct foo
>> > +{
>> > +  int foo;
>> > +  int bar;
>> > +  int baz;
>> > +};
>> > +
>> > +int test (struct foo *ptr)
>> > +{
>> > +  return ptr->m_bar; /* { dg-error "'struct foo' has no member named 'm_bar'; did you mean 'bar'?" } */
>> > +
>> > +/* { dg-begin-multiline-output "" }
>> > +   return ptr->m_bar;
>> > +          ~~~  ^~~~~
>> > +   { dg-end-multiline-output "" } */
>> > +}
>> > +
>> > +int test2 (void)
>> > +{
>> > +  struct foo instance = {};
>> > +  return instance.m_bar; /* { dg-error "'struct foo' has no member named 'm_bar'; did you mean 'bar'?" } */
>> > +
>> > +/* { dg-begin-multiline-output "" }
>> > +   return instance.m_bar;
>> > +          ~~~~~~~~ ^~~~~
>> > +   { dg-end-multiline-output "" } */
>> > +}
>> > +
>> > +int64 foo; /* { dg-error "unknown type name 'int64'; did you mean 'int'?" } */
>> > +/* { dg-begin-multiline-output "" }
>> > + int64 foo;
>> > + ^~~~~
>> > +   { dg-end-multiline-output "" } */
>> >
>>
>>
>> These tests could also test different scopes, clashes between types and fields
>> and variables, and the correct behavior for nested struct/unions.
>
> Thanks; added to TODO list below.
>
>> I wonder whether it would be worth it to extend existing tests if now they emit
>> the "do you mean" part to be sure they are doing the right thing.
>
> Thanks; added to TODO list below.  These are passing now due to the
> dg-error regexp not caring about the exact message.
>
> Many of the field names in these tests are very short; it's not clear
> to me that there's a good single suggestion that can be made if there
> are several 1-char field names to choose from.
>
> I noticed that the old patch could sometimes offer unhelpful
> suggestions; I added a test for this:
>
>   nonsensical_suggestion_t var;
>
> where it would suggest something unrelated.  I suppressed that in
> lookup_name_fuzzy by only offering a suggestion if the distance is less
> than half of the length of what the user typed and that seemed to work
> well, albeit in the few cases I tried.  I suspect that we may
> want a similar suppression for lookup_field_fuzzy.
>
>> Cheers,
>>
>> Manuel.
>
> Thanks.
>
> Update version of the patch follows.
>
> This version of the patch is independent of the rest of the kit,
> and applies directly on top of trunk (r227562, specifically).
>
> Changes since previous version:
> - it's now independent of the rest of the patch kit.
> - removal of tracking of fieldname range "ident_range" from calls
>   to build_component_ref, just using the location_t.
> - removal of show-caret/multiline tests from testcase
> - introduced a typedef "edit_distance_t", using it to convert
>   the underlying type from "int" to "unsigned int".
> - "lookup_name_fuzzy" now only considers bindings of a TYPE_DECL,
>   thus matching "ins" rather than "inr" for the example given by Manu
>   here:
>     https://gcc.gnu.org/ml/gcc-patches/2015-09/msg00813.html
> - lookup_name_fuzzy: don't offer a suggestion if the distance is too
>   high, since such a suggestion is likely to be bogus
> - added test coverage to try to cover the above
> - reimplemented levenshtein_distance to avoid allocating and building
>   an (m + 1) * (n + 1) matrix in favor of just tracking two rows
>   at once
> - made levenshtein_distance_unit_tests automatically run each test
>   both ways; added some more tests
>
> I attempted the error-recovery in build_component_ref, but I found it
> could make things worse.  For example, in
> gcc/testsuite/gcc.dg/anon-struct-11.c:
>   f3 (&e.D);            /* { dg-error "no member" } */
> becomes:
>   error: 'struct E' has no member named 'D'; did you mean 'b'?
> but if we try to use "b", this then leads to thes additional bogus
> messages:
>   warning: passing argument 1 of 'f3' from incompatible
>     pointer type [-Wincompatible-pointer-types]
>   note: expected 'D * {aka struct <anonymous> *}' but
>     argument is of type 'char *'
>
> Similarly, in gcc/testsuite/gcc.dg/c11-anon-struct-2.c:
>   x.i = 0; /* { dg-error "has no member" } */
> this becomes:
>   error: 'struct s5' has no member named 'i'; did you mean 'a'?
> which then leads to:
>   error: incompatible types when assigning to type
>    'struct <anonymous>' from type 'int'
>
> So this version of the patch doesn't attempted to use the suggested
> field.
>
> Successfully bootstrapped&regrtested on x86_64-pc-linux-gnu; adds
> 9 PASSes to gcc.sum.
>
> I'm posting it here as a work-in-progress.
>
> Remaining work:
>   * the FIXME about where to call levenshtein_distance_unit_tests;
> there's an argument that this could be moved to libiberty (is C++
> allowed in libiberty?); I'd prefer to get the unittest idea from
>  https://gcc.gnu.org/ml/gcc-patches/2015-06/msg00765.html
> into trunk, and then move it into there.  Right now it's all
> gcc_assert, so optimizes away in a production build.
>   * more testcases as noted by Manu above
>   * try existing testcases as noted by Manu above
>   * possible early return when distance == 1
>   * perhaps some kind of limit on the number of iterations inside
> levenshtein_distance (e.g. governed by a param).
>   * perhaps some ability to pass in a limit on the
> distance we care about, so we can immediately reject distances
> that will be above this
>
> It also strikes me that sometimes a "misspelling" is a missing
> header file, and that the most helpful thing to do might be to
> suggest including that header file.  For instance given:
>   $ cat /tmp/foo.c
>   int64_t i;
>
>   $ ./xgcc -B. /tmp/foo.c
>   /tmp/foo.c:1:1: error: unknown type name ‘int64_t’
>   int64_t i;
>   ^
> (where the suggestion of "int" is suppressed due to the distance
> being too long) it might be helpful to print:
>   /tmp/foo.c:1:1: error: unknown type name 'int64_t'; did you mean to include '<inttypes.h>'?
>   int64_t i;
>   ^
> That does seem like a separate enhancement, though.
>
> gcc/ChangeLog:
>         * Makefile.in (OBJS): Add spellcheck.o.
>         * spellcheck.c: New file.
>         * spellcheck.h: New file.
>
> gcc/c-family/ChangeLog:
>         * c-common.h (lookup_name_fuzzy): New decl.
>
> gcc/c/ChangeLog:
>         * c-decl.c: Include spellcheck.h.
>         (lookup_name_fuzzy): New.
>         * c-parser.c: Include spellcheck.h.
>         (c_parser_declaration_or_fndef): If "unknown type name",
>         attempt to suggest a close match using lookup_name_fuzzy.
>         * c-typeck.c: Include spellcheck.h.
>         (lookup_field_fuzzy_find_candidates): New function.
>         (lookup_field_fuzzy): New function.
>         (build_component_ref): Use lookup_field_fuzzy to suggest close
>         matches when printing field-not-found error.
>
> gcc/testsuite/ChangeLog:
>         * gcc.dg/spellcheck.c: New file.
> ---
>  gcc/Makefile.in                   |   1 +
>  gcc/c-family/c-common.h           |   1 +
>  gcc/c/c-decl.c                    |  45 +++++++++++
>  gcc/c/c-parser.c                  |  11 ++-
>  gcc/c/c-typeck.c                  |  66 ++++++++++++++-
>  gcc/spellcheck.c                  | 166 ++++++++++++++++++++++++++++++++++++++
>  gcc/spellcheck.h                  |  35 ++++++++
>  gcc/testsuite/gcc.dg/spellcheck.c |  49 +++++++++++
>  8 files changed, 371 insertions(+), 3 deletions(-)
>  create mode 100644 gcc/spellcheck.c
>  create mode 100644 gcc/spellcheck.h
>  create mode 100644 gcc/testsuite/gcc.dg/spellcheck.c
>
> diff --git a/gcc/Makefile.in b/gcc/Makefile.in
> index 3d1c1e5..73a29b4 100644
> --- a/gcc/Makefile.in
> +++ b/gcc/Makefile.in
> @@ -1391,6 +1391,7 @@ OBJS = \
>         shrink-wrap.o \
>         simplify-rtx.o \
>         sparseset.o \
> +       spellcheck.o \
>         sreal.o \
>         stack-ptr-mod.o \
>         statistics.o \
> diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h
> index 74d1bc1..e5f867c 100644
> --- a/gcc/c-family/c-common.h
> +++ b/gcc/c-family/c-common.h
> @@ -971,6 +971,7 @@ extern tree finish_label_address_expr (tree, location_t);
>     different implementations.  Used in c-common.c.  */
>  extern tree lookup_label (tree);
>  extern tree lookup_name (tree);
> +extern tree lookup_name_fuzzy (tree);
>  extern bool lvalue_p (const_tree);
>
>  extern bool vector_targets_convertible_p (const_tree t1, const_tree t2);
> diff --git a/gcc/c/c-decl.c b/gcc/c/c-decl.c
> index b83c584..d919019 100644
> --- a/gcc/c/c-decl.c
> +++ b/gcc/c/c-decl.c
> @@ -64,6 +64,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "c-family/c-ada-spec.h"
>  #include "cilk.h"
>  #include "builtins.h"
> +#include "spellcheck.h"
>
>  /* In grokdeclarator, distinguish syntactic contexts of declarators.  */
>  enum decl_context
> @@ -3900,6 +3901,50 @@ lookup_name_in_scope (tree name, struct c_scope *scope)
>        return b->decl;
>    return 0;
>  }
> +
> +/* Look for the closest match for NAME within the currently valid
> +   scopes.
> +
> +   This finds the identifier with the lowest Levenshtein distance to
> +   NAME.  If there are multiple candidates with equal minimal distance,
> +   the first one found is returned.  Scopes are searched from innermost
> +   outwards, and within a scope in reverse order of declaration, thus
> +   benefiting candidates "near" to the current scope.  */
> +
> +tree
> +lookup_name_fuzzy (tree name)
> +{
> +  gcc_assert (TREE_CODE (name) == IDENTIFIER_NODE);
> +
> +  c_binding *best_binding = NULL;
> +  edit_distance_t best_distance = MAX_EDIT_DISTANCE;
> +
> +  for (c_scope *scope = current_scope; scope; scope = scope->outer)
> +    for (c_binding *binding = scope->bindings; binding; binding = binding->prev)
> +      {
> +       if (!binding->id)
> +         continue;
> +       if (TREE_CODE (binding->decl) != TYPE_DECL)
> +         continue;
> +       edit_distance_t dist = levenshtein_distance (name, binding->id);
> +       if (dist < best_distance)
> +         {
> +           best_distance = dist;
> +           best_binding = binding;
> +         }
> +      }
> +
> +  if (!best_binding)
> +    return NULL;
> +
> +  /* If more than half of the letters were misspelled, the suggestion is
> +     likely to be meaningless.  */
> +  if (best_distance > IDENTIFIER_LENGTH (name) / 2 )
> +    return NULL;
> +
> +  return best_binding->id;
> +}
> +
>
>  /* Create the predefined scalar types of C,
>     and some nodes representing standard constants (0, 1, (void *) 0).
> diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
> index 11a2b0f..f04c88b 100644
> --- a/gcc/c/c-parser.c
> +++ b/gcc/c/c-parser.c
> @@ -66,6 +66,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "builtins.h"
>  #include "gomp-constants.h"
>  #include "c-family/c-indentation.h"
> +#include "spellcheck.h"
>
>
>  /* Initialization routine for this file.  */
> @@ -1539,8 +1540,14 @@ c_parser_declaration_or_fndef (c_parser *parser, bool fndef_ok,
>            || c_parser_peek_2nd_token (parser)->type == CPP_MULT)
>        && (!nested || !lookup_name (c_parser_peek_token (parser)->value)))
>      {
> -      error_at (here, "unknown type name %qE",
> -                c_parser_peek_token (parser)->value);
> +      tree hint = lookup_name_fuzzy (c_parser_peek_token (parser)->value);
> +      if (hint)
> +       error_at (here, "unknown type name %qE; did you mean %qE?",
> +                 c_parser_peek_token (parser)->value,
> +                 hint);
> +      else
> +       error_at (here, "unknown type name %qE",
> +                 c_parser_peek_token (parser)->value);
>
>        /* Parse declspecs normally to get a correct pointer type, but avoid
>           a further "fails to be a type name" error.  Refuse nested functions
> diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c
> index dc22396..3dded26 100644
> --- a/gcc/c/c-typeck.c
> +++ b/gcc/c/c-typeck.c
> @@ -54,6 +54,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "c-family/c-ubsan.h"
>  #include "cilk.h"
>  #include "gomp-constants.h"
> +#include "spellcheck.h"
>
>  /* Possible cases of implicit bad conversions.  Used to select
>     diagnostic messages in convert_for_assignment.  */
> @@ -2249,6 +2250,64 @@ lookup_field (tree type, tree component)
>    return tree_cons (NULL_TREE, field, NULL_TREE);
>  }
>
> +/* Recursively append candidate IDENTIFIER_NODEs to CANDIDATES.  */
> +
> +static void
> +lookup_field_fuzzy_find_candidates (tree type, tree component,
> +                                   vec<tree> *candidates)
> +{
> +  tree field;
> +  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
> +    {
> +      if (DECL_NAME (field) == NULL_TREE
> +         && (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
> +             || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE))
> +       {
> +         lookup_field_fuzzy_find_candidates (TREE_TYPE (field),
> +                                             component,
> +                                             candidates);
> +       }
> +
> +      if (DECL_NAME (field))
> +       candidates->safe_push (DECL_NAME (field));
> +    }
> +}
> +
> +/* Like "lookup_field", but find the closest matching IDENTIFIER_NODE,
> +   rather than returning a TREE_LIST for an exact match.  */
> +
> +static tree
> +lookup_field_fuzzy (tree type, tree component)
> +{
> +  gcc_assert (TREE_CODE (component) == IDENTIFIER_NODE);
> +
> +  /* FIXME: move this to a unittest suite. */
> +  levenshtein_distance_unit_tests ();
> +
> +  /* First, gather a list of candidates.  */
> +  auto_vec <tree> candidates;
> +
> +  lookup_field_fuzzy_find_candidates (type, component,
> +                                     &candidates);
> +
> +  /* Now determine which is closest.  */
> +  int i;
> +  tree identifier;
> +  tree best_identifier = NULL;
> +  edit_distance_t best_distance = MAX_EDIT_DISTANCE;
> +  FOR_EACH_VEC_ELT (candidates, i, identifier)
> +    {
> +      edit_distance_t dist = levenshtein_distance (component, identifier);
> +      if (dist < best_distance)
> +       {
> +         best_distance = dist;
> +         best_identifier = identifier;
> +       }
> +    }
> +
> +  return best_identifier;
> +}
> +
>  /* Make an expression to refer to the COMPONENT field of structure or
>     union value DATUM.  COMPONENT is an IDENTIFIER_NODE.  LOC is the
>     location of the COMPONENT_REF.  */
> @@ -2284,7 +2343,12 @@ build_component_ref (location_t loc, tree datum, tree component)
>
>        if (!field)
>         {
> -         error_at (loc, "%qT has no member named %qE", type, component);
> +         tree guessed_id = lookup_field_fuzzy (type, component);
> +         if (guessed_id)
> +           error_at (loc, "%qT has no member named %qE; did you mean %qE?",
> +                     type, component, guessed_id);
> +         else
> +           error_at (loc, "%qT has no member named %qE", type, component);
>           return error_mark_node;
>         }
>
> diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c
> new file mode 100644
> index 0000000..c407aa0
> --- /dev/null
> +++ b/gcc/spellcheck.c
> @@ -0,0 +1,166 @@
> +/* Find near-matches for strings and identifiers.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify it under
> +the terms of the GNU General Public License as published by the Free
> +Software Foundation; either version 3, or (at your option) any later
> +version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> +for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#include "config.h"
> +#include "system.h"
> +#include "coretypes.h"
> +#include "tm.h"
> +#include "tree.h"
> +#include "spellcheck.h"
> +
> +/* The Levenshtein distance is an "edit-distance": the minimal
> +   number of one-character insertions, removals or substitutions
> +   that are needed to change one string into another.
> +
> +   This implementation uses the Wagner-Fischer algorithm.  */
> +
> +static edit_distance_t
> +levenshtein_distance (const char *s, int m,
> +                     const char *t, int n)
> +{
> +  const bool debug = false;
> +
> +  if (debug)
> +    {
> +      printf ("s: \"%s\" (m=%i)\n", s, m);
> +      printf ("t: \"%s\" (n=%i)\n", t, n);
> +    }
> +
> +  if (m == 0)
> +    return n;
> +  if (n == 0)
> +    return m;
> +
> +  /* We effectively build a matrix where each (i, j) contains the
> +     Levenshtein distance between the prefix strings s[0:i]
> +     and t[0:j].
> +     Rather than actually build an (m + 1) * (n + 1) matrix,
> +     we simply keep track of the last row, v0 and a new row, v1,
> +     which avoids an (m + 1) * (n + 1) allocation and memory accesses
> +     in favor of two (m + 1) allocations.  These could potentially be
> +     statically-allocated if we impose a maximum length on the
> +     strings of interest.  */
> +  edit_distance_t *v0 = new edit_distance_t[m + 1];
> +  edit_distance_t *v1 = new edit_distance_t[m + 1];
> +
> +  /* The first row is for the case of an empty target string, which
> +     we can reach by deleting every character in the source string.  */
> +  for (int i = 0; i < m + 1; i++)
> +    v0[i] = i;
> +
> +  /* Build successive rows.  */
> +  for (int i = 0; i < n; i++)
> +    {
> +      if (debug)
> +       {
> +         printf ("i:%i v0 = ", i);
> +         for (int j = 0; j < m + 1; j++)
> +           printf ("%i ", v0[j]);
> +         printf ("\n");
> +       }
> +
> +      /* The initial column is for the case of an empty source string; we
> +        can reach prefixes of the target string of length i
> +        by inserting i characters.  */
> +      v1[0] = i + 1;
> +
> +      /* Build the rest of the row by considering neighbours to
> +        the north, west and northwest.  */
> +      for (int j = 0; j < m; j++)
> +       {
> +         edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
> +         edit_distance_t deletion     = v1[j] + 1;
> +         edit_distance_t insertion    = v0[j + 1] + 1;
> +         edit_distance_t substitution = v0[j] + cost;
> +         edit_distance_t cheapest = MIN (deletion, insertion);
> +         cheapest = MIN (cheapest, substitution);
> +         v1[j + 1] = cheapest;
> +       }
> +
> +      /* Prepare to move on to next row.  */
> +      for (int j = 0; j < m + 1; j++)
> +       v0[j] = v1[j];
> +    }
> +
> +  if (debug)
> +    {
> +      printf ("final v1 = ");
> +      for (int j = 0; j < m + 1; j++)
> +       printf ("%i ", v1[j]);
> +      printf ("\n");
> +    }
> +
> +  edit_distance_t result = v1[m];
> +  delete[] v0;
> +  delete[] v1;
> +  return result;
> +}
> +
> +/* Calculate Levenshtein distance between two nil-terminated strings.
> +   This exists purely for the unit tests.  */
> +
> +edit_distance_t
> +levenshtein_distance (const char *s, const char *t)
> +{
> +  return levenshtein_distance (s, strlen (s), t, strlen (t));
> +}
> +
> +/* Unit tests for levenshtein_distance.  */
> +
> +static void
> +levenshtein_distance_unit_test (const char *a, const char *b,
> +                               edit_distance_t expected)
> +{
> +  /* Run every test both ways to ensure it's symmetric.  */
> +  gcc_assert (levenshtein_distance (a, b) == expected);
> +  gcc_assert (levenshtein_distance (b, a) == expected);
> +}
> +
> +void
> +levenshtein_distance_unit_tests (void)
> +{
> +  levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty"));
> +  levenshtein_distance_unit_test ("saturday", "sunday", 3);
> +  levenshtein_distance_unit_test ("foo", "m_foo", 2);
> +  levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3);
> +  levenshtein_distance_unit_test
> +    ("the quick brown fox jumps over the lazy dog", "dog", 40);
> +  levenshtein_distance_unit_test
> +    ("the quick brown fox jumps over the lazy dog",
> +     "the quick brown dog jumps over the lazy fox",
> +     4);
> +  levenshtein_distance_unit_test
> +    ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
> +     "All your base are belong to us",
> +     44);
> +}
> +
> +/* Calculate Levenshtein distance between two identifiers.  */
> +
> +edit_distance_t
> +levenshtein_distance (tree ident_s, tree ident_t)
> +{
> +  gcc_assert (TREE_CODE (ident_s) == IDENTIFIER_NODE);
> +  gcc_assert (TREE_CODE (ident_t) == IDENTIFIER_NODE);
> +
> +  return levenshtein_distance (IDENTIFIER_POINTER (ident_s),
> +                              IDENTIFIER_LENGTH (ident_s),
> +                              IDENTIFIER_POINTER (ident_t),
> +                              IDENTIFIER_LENGTH (ident_t));
> +}
> diff --git a/gcc/spellcheck.h b/gcc/spellcheck.h
> new file mode 100644
> index 0000000..6f50b71
> --- /dev/null
> +++ b/gcc/spellcheck.h
> @@ -0,0 +1,35 @@
> +/* Find near-matches for strings and identifiers.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +
> +This file is part of GCC.
> +
> +GCC is free software; you can redistribute it and/or modify it under
> +the terms of the GNU General Public License as published by the Free
> +Software Foundation; either version 3, or (at your option) any later
> +version.
> +
> +GCC is distributed in the hope that it will be useful, but WITHOUT ANY
> +WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
> +for more details.
> +
> +You should have received a copy of the GNU General Public License
> +along with GCC; see the file COPYING3.  If not see
> +<http://www.gnu.org/licenses/>.  */
> +
> +#ifndef GCC_SPELLCHECK_H
> +#define GCC_SPELLCHECK_H
> +
> +typedef unsigned int edit_distance_t;
> +const edit_distance_t MAX_EDIT_DISTANCE = UINT_MAX;
> +
> +extern void
> +levenshtein_distance_unit_tests (void);
> +
> +extern edit_distance_t
> +levenshtein_distance (const char *s, const char *t);
> +
> +extern edit_distance_t
> +levenshtein_distance (tree ident_s, tree ident_t);
> +
> +#endif  /* GCC_SPELLCHECK_H  */
> diff --git a/gcc/testsuite/gcc.dg/spellcheck.c b/gcc/testsuite/gcc.dg/spellcheck.c
> new file mode 100644
> index 0000000..53ebb86
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/spellcheck.c
> @@ -0,0 +1,49 @@
> +/* { dg-do compile } */
> +
> +struct foo
> +{
> +  int foo;
> +  int bar;
> +  int baz;
> +};
> +
> +int test (struct foo *ptr)
> +{
> +  return ptr->m_bar; /* { dg-error "'struct foo' has no member named 'm_bar'; did you mean 'bar'?" } */
> +}
> +
> +int test2 (void)
> +{
> +  struct foo instance = {0, 0, 0};
> +  return instance.m_bar; /* { dg-error "'struct foo' has no member named 'm_bar'; did you mean 'bar'?" } */
> +}
> +
> +#include <inttypes.h>
> +int64 i; /* { dg-error "unknown type name 'int64'; did you mean 'int64_t'?" } */
> +
> +typedef int ins;
> +int test3 (void)
> +{
> +    int inr;
> +    inp x; /* { dg-error "unknown type name 'inp'; did you mean 'ins'?" } */
> +}
> +
> +struct s {
> +    struct j { int aa; } kk;
> +    int ab;
> +};
> +
> +void test4 (struct s x)
> +{
> +  x.ac;  /* { dg-error "'struct s' has no member named 'ac'; did you mean 'ab'?" } */
> +}
> +
> +int test5 (struct foo *ptr)
> +{
> +  return sizeof (ptr->foa); /* { dg-error "'struct foo' has no member named 'foa'; did you mean 'foo'?" } */
> +}
> +
> +/* Verify that gcc doesn't offer nonsensical suggestions.  */
> +
> +nonsensical_suggestion_t var; /* { dg-bogus "did you mean" } */
> +/* { dg-error "unknown type name" "" { target { *-*-* } } 48 } */
> --
> 1.8.5.3
>
Michael Matz Sept. 16, 2015, 1:22 p.m. UTC | #3
Hi,

On Wed, 16 Sep 2015, Richard Biener wrote:

> Btw, this looks quite expensive - I'm sure we want to limit the effort
> here a bit.

I'm not so sure.  It's only used for printing an error, so walking all 
available decls is expensive but IMHO not too much so.

> I don't want us to suggest using 'i' instead of j (a good hint is that I 
> used 'j' multiple times).

Well, there will always be cases where the suggestion is actually wrong.  
How do you propose to deal with this?  The above case could be solved by 
not giving hints when the levenshtein distance is as long as the string 
length (which makes sense, because then there's no relation at all between 
the string and the suggestion).

> So while the idea might be an improvement to selected cases it can cause 
> confusion as well.  And if using the suggestion for further parsing it 
> can cause worse followup errors (unless we can limit such "fixup" use to 
> the cases where we can parse the result without errors).  Consider
> 
> foo()
> {
>   foz = 1;
> }
> 
> if we suggest 'foo' instead of foz then we'll get a more confusing followup
> error if we actually use it.

This particular case could be solved by ruling out candidaten of the wrong 
kind (here, something that can be assigned to, vs. a function).  But it 
might actually be too early in parsing to say that there will be an 
assignment.  I don't think _this_ problem should block the patch.


Ciao,
Michael.
Richard Biener Sept. 16, 2015, 1:33 p.m. UTC | #4
On Wed, Sep 16, 2015 at 3:22 PM, Michael Matz <matz@suse.de> wrote:
> Hi,
>
> On Wed, 16 Sep 2015, Richard Biener wrote:
>
>> Btw, this looks quite expensive - I'm sure we want to limit the effort
>> here a bit.
>
> I'm not so sure.  It's only used for printing an error, so walking all
> available decls is expensive but IMHO not too much so.

Well, as we're not stopping at the very first error creating an
artificial testcase
that hits this quite badly should be possible.  Maybe only try this for
the first error and not for followups?

>> I don't want us to suggest using 'i' instead of j (a good hint is that I
>> used 'j' multiple times).
>
> Well, there will always be cases where the suggestion is actually wrong.
> How do you propose to deal with this?  The above case could be solved by
> not giving hints when the levenshtein distance is as long as the string
> length (which makes sense, because then there's no relation at all between
> the string and the suggestion).
>
>> So while the idea might be an improvement to selected cases it can cause
>> confusion as well.  And if using the suggestion for further parsing it
>> can cause worse followup errors (unless we can limit such "fixup" use to
>> the cases where we can parse the result without errors).  Consider
>>
>> foo()
>> {
>>   foz = 1;
>> }
>>
>> if we suggest 'foo' instead of foz then we'll get a more confusing followup
>> error if we actually use it.
>
> This particular case could be solved by ruling out candidaten of the wrong
> kind (here, something that can be assigned to, vs. a function).  But it
> might actually be too early in parsing to say that there will be an
> assignment.  I don't think _this_ problem should block the patch.

I wonder if we can tentatively parse with the choice at hand, only allowing
(and even suggesting?) it if that works out.

Richard.

>
> Ciao,
> Michael.
Manuel López-Ibáñez Sept. 16, 2015, 3:45 p.m. UTC | #5
On 16 September 2015 at 15:33, Richard Biener
<richard.guenther@gmail.com> wrote:
> On Wed, Sep 16, 2015 at 3:22 PM, Michael Matz <matz@suse.de> wrote:
>>> if we suggest 'foo' instead of foz then we'll get a more confusing followup
>>> error if we actually use it.
>>
>> This particular case could be solved by ruling out candidaten of the wrong
>> kind (here, something that can be assigned to, vs. a function).  But it
>> might actually be too early in parsing to say that there will be an
>> assignment.  I don't think _this_ problem should block the patch.

Indeed. The patch by David does not try to fix-up the code, it merely
suggests a possible candidate. The follow-up errors should be the same
before and after. Such suggestions will never be 100% right, even if
the suggestion makes the code compile and run, it may still be the
wrong one. A wrong suggestion is far less serious than a wrong
uninitialized or Warray-bounds warning and we can live with those. Why
this needs to be perfect from the very beginning?

BTW, there is a PR for this: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52277

> I wonder if we can tentatively parse with the choice at hand, only allowing
> (and even suggesting?) it if that works out.

This would require to queue the error, fix-up the wrong name and
continue parsing. If there is another error, ignore that one and emit
the original error without suggestion. The problem here is that we do
not know if the additional error is actually caused by the fix-up we
did or it is an already existing error. It would be equally terrible
to emit errors caused by the fix-up or emit just a single error for
the typo. We would need to roll-back the tentative parse and do a
definitive parse anyway. This does not seem possible at the moment
because the parsers maintain a lot of global state that is not easy to
roll-back. We cannot simply create a copy of the parser state and
throw it away later to continue as if the tentative parse has not
happened.

I'm not even sure if, in general, one can stop at the statement level
or we would need to parse the whole function (or translation unit) to
be able to tell if the suggestion is a valid candidate.

Cheers,

Manuel.
Richard Biener Sept. 17, 2015, 8:43 a.m. UTC | #6
On Wed, Sep 16, 2015 at 5:45 PM, Manuel López-Ibáñez
<lopezibanez@gmail.com> wrote:
> On 16 September 2015 at 15:33, Richard Biener
> <richard.guenther@gmail.com> wrote:
>> On Wed, Sep 16, 2015 at 3:22 PM, Michael Matz <matz@suse.de> wrote:
>>>> if we suggest 'foo' instead of foz then we'll get a more confusing followup
>>>> error if we actually use it.
>>>
>>> This particular case could be solved by ruling out candidaten of the wrong
>>> kind (here, something that can be assigned to, vs. a function).  But it
>>> might actually be too early in parsing to say that there will be an
>>> assignment.  I don't think _this_ problem should block the patch.
>
> Indeed. The patch by David does not try to fix-up the code, it merely
> suggests a possible candidate. The follow-up errors should be the same
> before and after. Such suggestions will never be 100% right, even if
> the suggestion makes the code compile and run, it may still be the
> wrong one. A wrong suggestion is far less serious than a wrong
> uninitialized or Warray-bounds warning and we can live with those. Why
> this needs to be perfect from the very beginning?
>
> BTW, there is a PR for this: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=52277
>
>> I wonder if we can tentatively parse with the choice at hand, only allowing
>> (and even suggesting?) it if that works out.
>
> This would require to queue the error, fix-up the wrong name and
> continue parsing. If there is another error, ignore that one and emit
> the original error without suggestion. The problem here is that we do
> not know if the additional error is actually caused by the fix-up we
> did or it is an already existing error. It would be equally terrible
> to emit errors caused by the fix-up or emit just a single error for
> the typo. We would need to roll-back the tentative parse and do a
> definitive parse anyway. This does not seem possible at the moment
> because the parsers maintain a lot of global state that is not easy to
> roll-back. We cannot simply create a copy of the parser state and
> throw it away later to continue as if the tentative parse has not
> happened.
>
> I'm not even sure if, in general, one can stop at the statement level
> or we would need to parse the whole function (or translation unit) to
> be able to tell if the suggestion is a valid candidate.

I was suggesting to only tentatively finish parsing the "current construct".
No idea how to best figure that out to the extend to make the tentative
parse useful.  Say, if we have "a + s.foz" and the field foz is not there
but foo is, so if we continue parsing with 'foo' instead but 'foo' will have
a type that makes "a + s.foo" invalid then we probably shouldn't suggest
it.  It _might_ be reasonably "easy" to implement that, but I'm not sure.
There might be a field named fz (with same or bigger levenstein distance)
with the correct type.  Of course it might have been I misspelled
's' and meant 'r' instead which has a field foz of corect type... (and 's'
is available as well).

I agree that we don't have to solve all this in the first iteration.

Richard.

> Cheers,
>
> Manuel.
Jeff Law Sept. 17, 2015, 7:31 p.m. UTC | #7
On 09/16/2015 02:34 AM, Richard Biener wrote:
>
> Btw, this looks quite expensive - I'm sure we want to limit the effort
> here a bit.
A limiter is reasonable, though as it's been pointed out this only fires 
during error processing, so we probably have more leeway to take time 
and see if we can do better error recovery.

FWIW, I've used this algorithm in totally unrelated projects and while 
it seems expensive, it's worked out quite nicely.

>
> So while the idea might be an improvement to selected cases it can cause
> confusion as well.  And if using the suggestion for further parsing it can
> cause worse followup errors (unless we can limit such "fixup" use to the
> cases where we can parse the result without errors).  Consider
>
> foo()
> {
>    foz = 1;
> }
>
> if we suggest 'foo' instead of foz then we'll get a more confusing followup
> error if we actually use it.
True.  This kind of problem is probably inherent in this kind of "I'm 
going assume you meant..." error recovery mechanisms.

And just to be clear, even in a successful recovery scenario, we still 
issue an error.  The error recovery is just meant to try and give the 
user a hint what might have gone wrong and gracefully handle the case 
where they just made a minor goof.  Obviously the idea here is to cut 
down on the number of iterations of edit-compile cycle one has to do :-)


Jeff
David Malcolm Sept. 17, 2015, 7:57 p.m. UTC | #8
On Thu, 2015-09-17 at 13:31 -0600, Jeff Law wrote:
> On 09/16/2015 02:34 AM, Richard Biener wrote:
> >
> > Btw, this looks quite expensive - I'm sure we want to limit the effort
> > here a bit.
> A limiter is reasonable, though as it's been pointed out this only fires 
> during error processing, so we probably have more leeway to take time 
> and see if we can do better error recovery.
> 
> FWIW, I've used this algorithm in totally unrelated projects and while 
> it seems expensive, it's worked out quite nicely.
> 
> >
> > So while the idea might be an improvement to selected cases it can cause
> > confusion as well.  And if using the suggestion for further parsing it can
> > cause worse followup errors (unless we can limit such "fixup" use to the
> > cases where we can parse the result without errors).  Consider
> >
> > foo()
> > {
> >    foz = 1;
> > }
> >
> > if we suggest 'foo' instead of foz then we'll get a more confusing followup
> > error if we actually use it.
> True.  This kind of problem is probably inherent in this kind of "I'm 
> going assume you meant..." error recovery mechanisms.
> 
> And just to be clear, even in a successful recovery scenario, we still 
> issue an error.  The error recovery is just meant to try and give the 
> user a hint what might have gone wrong and gracefully handle the case 
> where they just made a minor goof.  

(nods)

> Obviously the idea here is to cut 
> down on the number of iterations of edit-compile cycle one has to do :-)

In my mind it's more about saving the user from having to locate the
field they really meant within the corresponding structure declaration
(either by grep, or by some cross-referencing tool).

A lot of the time I find myself wishing that the compiler had issued a
note saying "here's the declaration of the struct in question", which
would make it easy for me to go straight there in Emacs.

I wonder what proportion of our users use a cross-referencing tool or
have an IDE that can find this stuff for them, vs those that rely on
grep, and if that should mean something for our diagnostics (I tend to
just rely on grep).

This is rather tangential to this RFE, of course.

Dave
Manuel López-Ibáñez Sept. 17, 2015, 8:42 p.m. UTC | #9
On 17 September 2015 at 21:57, David Malcolm <dmalcolm@redhat.com> wrote:
> In my mind it's more about saving the user from having to locate the
> field they really meant within the corresponding structure declaration
> (either by grep, or by some cross-referencing tool).

I think it is more than that. After a long coding session, one can
start to wonder why the compiler cannot find type_of_unknwon_predicate
or firstColourInColumn (ah! it was type_of_unknown_predicate and
firstColorInColumn!).

Or when we extend this to options (PR67613), why I get

error: unrecognized command line option '-Weffic++'

when I just read it in the manual!

Cheers,

Manuel.
David Malcolm Oct. 30, 2015, 12:47 p.m. UTC | #10
On Thu, 2015-09-17 at 13:31 -0600, Jeff Law wrote:
> On 09/16/2015 02:34 AM, Richard Biener wrote:
> >
> > Btw, this looks quite expensive - I'm sure we want to limit the effort
> > here a bit.
> A limiter is reasonable, though as it's been pointed out this only fires 
> during error processing, so we probably have more leeway to take time 
> and see if we can do better error recovery.
> 
> FWIW, I've used this algorithm in totally unrelated projects and while 
> it seems expensive, it's worked out quite nicely.
> 
> >
> > So while the idea might be an improvement to selected cases it can cause
> > confusion as well.  And if using the suggestion for further parsing it can
> > cause worse followup errors (unless we can limit such "fixup" use to the
> > cases where we can parse the result without errors).  Consider
> >
> > foo()
> > {
> >    foz = 1;
> > }
> >
> > if we suggest 'foo' instead of foz then we'll get a more confusing followup
> > error if we actually use it.
> True.  This kind of problem is probably inherent in this kind of "I'm 
> going assume you meant..." error recovery mechanisms.
> 
> And just to be clear, even in a successful recovery scenario, we still 
> issue an error.  The error recovery is just meant to try and give the 
> user a hint what might have gone wrong and gracefully handle the case 
> where they just made a minor goof.  Obviously the idea here is to cut 
> down on the number of iterations of edit-compile cycle one has to do :-)
> 
> 
> Jeff

The typename suggestion seems to be at least somewhat controversial,
whereas (I hope) the misspelled field names suggestion is more
acceptable.

Hence I'm focusing on the field name lookup for now; other uses of the
algorithm (e.g. the typename lookup) could be done in followup patches,
but I'm deferring them for now in the hope of getting the simplest case
into trunk as a first step.  Similarly, for simplicity, I didn't
implement any attempt at error-recovery using the hint.

The following patch kit is in two parts (for ease of review; they would
be applied together):

  patch 1: Implement Levenshtein distance
  patch 2: C FE: suggest corrections for misspelled field names

I didn't implement a limiter, on the grounds that this only fires
once per "has no member named" error, and so is unlikely to slow
things down noticeably.

Successfully bootstrapped&regrtested the combination of these two
on x86_64-pc-linux-gnu (adds 11 new PASS results to gcc.sum)

OK for trunk?

 gcc/Makefile.in                                  |   1 +
 gcc/c/c-typeck.c                                 |  70 +++++++++++-
 gcc/spellcheck.c                                 | 136 +++++++++++++++++++++++
 gcc/spellcheck.h                                 |  32 ++++++
 gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c |   9 ++
 gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c |  64 +++++++++++
 gcc/testsuite/gcc.dg/plugin/plugin.exp           |   1 +
 gcc/testsuite/gcc.dg/spellcheck-fields.c         |  63 +++++++++++
 8 files changed, 375 insertions(+), 1 deletion(-)
 create mode 100644 gcc/spellcheck.c
 create mode 100644 gcc/spellcheck.h
 create mode 100644 gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c
 create mode 100644 gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c
 create mode 100644 gcc/testsuite/gcc.dg/spellcheck-fields.c
Jeff Law Nov. 2, 2015, 6:44 a.m. UTC | #11
On 10/30/2015 06:47 AM, David Malcolm wrote:

> The typename suggestion seems to be at least somewhat controversial,
> whereas (I hope) the misspelled field names suggestion is more
> acceptable.
>
> Hence I'm focusing on the field name lookup for now; other uses of the
> algorithm (e.g. the typename lookup) could be done in followup patches,
> but I'm deferring them for now in the hope of getting the simplest case
> into trunk as a first step.  Similarly, for simplicity, I didn't
> implement any attempt at error-recovery using the hint.
>
> The following patch kit is in two parts (for ease of review; they would
> be applied together):
>
>    patch 1: Implement Levenshtein distance
>    patch 2: C FE: suggest corrections for misspelled field names
>
> I didn't implement a limiter, on the grounds that this only fires
> once per "has no member named" error, and so is unlikely to slow
> things down noticeably.
>
> Successfully bootstrapped&regrtested the combination of these two
> on x86_64-pc-linux-gnu (adds 11 new PASS results to gcc.sum)
>
> OK for trunk?
>
>   gcc/Makefile.in                                  |   1 +
>   gcc/c/c-typeck.c                                 |  70 +++++++++++-
>   gcc/spellcheck.c                                 | 136 +++++++++++++++++++++++
>   gcc/spellcheck.h                                 |  32 ++++++
>   gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c |   9 ++
>   gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c |  64 +++++++++++
>   gcc/testsuite/gcc.dg/plugin/plugin.exp           |   1 +
>   gcc/testsuite/gcc.dg/spellcheck-fields.c         |  63 +++++++++++
>   8 files changed, 375 insertions(+), 1 deletion(-)
>   create mode 100644 gcc/spellcheck.c
>   create mode 100644 gcc/spellcheck.h
>   create mode 100644 gcc/testsuite/gcc.dg/plugin/levenshtein-test-1.c
>   create mode 100644 gcc/testsuite/gcc.dg/plugin/levenshtein_plugin.c
>   create mode 100644 gcc/testsuite/gcc.dg/spellcheck-fields.c
I'm going to assume you got levenshtein's algorithm reasonably correct.

This is OK for the trunk.  Obviously I'd like to see it extend into the 
other front-ends (C++ in particular).  Then I'd like to see it extend 
beyond just misspelled field names.

jeff
diff mbox

Patch

diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 3d1c1e5..73a29b4 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1391,6 +1391,7 @@  OBJS = \
 	shrink-wrap.o \
 	simplify-rtx.o \
 	sparseset.o \
+	spellcheck.o \
 	sreal.o \
 	stack-ptr-mod.o \
 	statistics.o \
diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h
index 74d1bc1..e5f867c 100644
--- a/gcc/c-family/c-common.h
+++ b/gcc/c-family/c-common.h
@@ -971,6 +971,7 @@  extern tree finish_label_address_expr (tree, location_t);
    different implementations.  Used in c-common.c.  */
 extern tree lookup_label (tree);
 extern tree lookup_name (tree);
+extern tree lookup_name_fuzzy (tree);
 extern bool lvalue_p (const_tree);
 
 extern bool vector_targets_convertible_p (const_tree t1, const_tree t2);
diff --git a/gcc/c/c-decl.c b/gcc/c/c-decl.c
index b83c584..d919019 100644
--- a/gcc/c/c-decl.c
+++ b/gcc/c/c-decl.c
@@ -64,6 +64,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "c-family/c-ada-spec.h"
 #include "cilk.h"
 #include "builtins.h"
+#include "spellcheck.h"
 
 /* In grokdeclarator, distinguish syntactic contexts of declarators.  */
 enum decl_context
@@ -3900,6 +3901,50 @@  lookup_name_in_scope (tree name, struct c_scope *scope)
       return b->decl;
   return 0;
 }
+
+/* Look for the closest match for NAME within the currently valid
+   scopes.
+
+   This finds the identifier with the lowest Levenshtein distance to
+   NAME.  If there are multiple candidates with equal minimal distance,
+   the first one found is returned.  Scopes are searched from innermost
+   outwards, and within a scope in reverse order of declaration, thus
+   benefiting candidates "near" to the current scope.  */
+
+tree
+lookup_name_fuzzy (tree name)
+{
+  gcc_assert (TREE_CODE (name) == IDENTIFIER_NODE);
+
+  c_binding *best_binding = NULL;
+  edit_distance_t best_distance = MAX_EDIT_DISTANCE;
+
+  for (c_scope *scope = current_scope; scope; scope = scope->outer)
+    for (c_binding *binding = scope->bindings; binding; binding = binding->prev)
+      {
+	if (!binding->id)
+	  continue;
+	if (TREE_CODE (binding->decl) != TYPE_DECL)
+	  continue;
+	edit_distance_t dist = levenshtein_distance (name, binding->id);
+	if (dist < best_distance)
+	  {
+	    best_distance = dist;
+	    best_binding = binding;
+	  }
+      }
+
+  if (!best_binding)
+    return NULL;
+
+  /* If more than half of the letters were misspelled, the suggestion is
+     likely to be meaningless.  */
+  if (best_distance > IDENTIFIER_LENGTH (name) / 2 )
+    return NULL;
+
+  return best_binding->id;
+}
+
 
 /* Create the predefined scalar types of C,
    and some nodes representing standard constants (0, 1, (void *) 0).
diff --git a/gcc/c/c-parser.c b/gcc/c/c-parser.c
index 11a2b0f..f04c88b 100644
--- a/gcc/c/c-parser.c
+++ b/gcc/c/c-parser.c
@@ -66,6 +66,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "builtins.h"
 #include "gomp-constants.h"
 #include "c-family/c-indentation.h"
+#include "spellcheck.h"
 
 
 /* Initialization routine for this file.  */
@@ -1539,8 +1540,14 @@  c_parser_declaration_or_fndef (c_parser *parser, bool fndef_ok,
           || c_parser_peek_2nd_token (parser)->type == CPP_MULT)
       && (!nested || !lookup_name (c_parser_peek_token (parser)->value)))
     {
-      error_at (here, "unknown type name %qE",
-                c_parser_peek_token (parser)->value);
+      tree hint = lookup_name_fuzzy (c_parser_peek_token (parser)->value);
+      if (hint)
+	error_at (here, "unknown type name %qE; did you mean %qE?",
+		  c_parser_peek_token (parser)->value,
+		  hint);
+      else
+	error_at (here, "unknown type name %qE",
+		  c_parser_peek_token (parser)->value);
 
       /* Parse declspecs normally to get a correct pointer type, but avoid
          a further "fails to be a type name" error.  Refuse nested functions
diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c
index dc22396..3dded26 100644
--- a/gcc/c/c-typeck.c
+++ b/gcc/c/c-typeck.c
@@ -54,6 +54,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "c-family/c-ubsan.h"
 #include "cilk.h"
 #include "gomp-constants.h"
+#include "spellcheck.h"
 
 /* Possible cases of implicit bad conversions.  Used to select
    diagnostic messages in convert_for_assignment.  */
@@ -2249,6 +2250,64 @@  lookup_field (tree type, tree component)
   return tree_cons (NULL_TREE, field, NULL_TREE);
 }
 
+/* Recursively append candidate IDENTIFIER_NODEs to CANDIDATES.  */
+
+static void
+lookup_field_fuzzy_find_candidates (tree type, tree component,
+				    vec<tree> *candidates)
+{
+  tree field;
+  for (field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
+    {
+      if (DECL_NAME (field) == NULL_TREE
+	  && (TREE_CODE (TREE_TYPE (field)) == RECORD_TYPE
+	      || TREE_CODE (TREE_TYPE (field)) == UNION_TYPE))
+	{
+	  lookup_field_fuzzy_find_candidates (TREE_TYPE (field),
+					      component,
+					      candidates);
+	}
+
+      if (DECL_NAME (field))
+	candidates->safe_push (DECL_NAME (field));
+    }
+}
+
+/* Like "lookup_field", but find the closest matching IDENTIFIER_NODE,
+   rather than returning a TREE_LIST for an exact match.  */
+
+static tree
+lookup_field_fuzzy (tree type, tree component)
+{
+  gcc_assert (TREE_CODE (component) == IDENTIFIER_NODE);
+
+  /* FIXME: move this to a unittest suite. */
+  levenshtein_distance_unit_tests ();
+
+  /* First, gather a list of candidates.  */
+  auto_vec <tree> candidates;
+
+  lookup_field_fuzzy_find_candidates (type, component,
+				      &candidates);
+
+  /* Now determine which is closest.  */
+  int i;
+  tree identifier;
+  tree best_identifier = NULL;
+  edit_distance_t best_distance = MAX_EDIT_DISTANCE;
+  FOR_EACH_VEC_ELT (candidates, i, identifier)
+    {
+      edit_distance_t dist = levenshtein_distance (component, identifier);
+      if (dist < best_distance)
+	{
+	  best_distance = dist;
+	  best_identifier = identifier;
+	}
+    }
+
+  return best_identifier;
+}
+
 /* Make an expression to refer to the COMPONENT field of structure or
    union value DATUM.  COMPONENT is an IDENTIFIER_NODE.  LOC is the
    location of the COMPONENT_REF.  */
@@ -2284,7 +2343,12 @@  build_component_ref (location_t loc, tree datum, tree component)
 
       if (!field)
 	{
-	  error_at (loc, "%qT has no member named %qE", type, component);
+	  tree guessed_id = lookup_field_fuzzy (type, component);
+	  if (guessed_id)
+	    error_at (loc, "%qT has no member named %qE; did you mean %qE?",
+		      type, component, guessed_id);
+	  else
+	    error_at (loc, "%qT has no member named %qE", type, component);
 	  return error_mark_node;
 	}
 
diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c
new file mode 100644
index 0000000..c407aa0
--- /dev/null
+++ b/gcc/spellcheck.c
@@ -0,0 +1,166 @@ 
+/* Find near-matches for strings and identifiers.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "tree.h"
+#include "spellcheck.h"
+
+/* The Levenshtein distance is an "edit-distance": the minimal
+   number of one-character insertions, removals or substitutions
+   that are needed to change one string into another.
+
+   This implementation uses the Wagner-Fischer algorithm.  */
+
+static edit_distance_t
+levenshtein_distance (const char *s, int m,
+		      const char *t, int n)
+{
+  const bool debug = false;
+
+  if (debug)
+    {
+      printf ("s: \"%s\" (m=%i)\n", s, m);
+      printf ("t: \"%s\" (n=%i)\n", t, n);
+    }
+
+  if (m == 0)
+    return n;
+  if (n == 0)
+    return m;
+
+  /* We effectively build a matrix where each (i, j) contains the
+     Levenshtein distance between the prefix strings s[0:i]
+     and t[0:j].
+     Rather than actually build an (m + 1) * (n + 1) matrix,
+     we simply keep track of the last row, v0 and a new row, v1,
+     which avoids an (m + 1) * (n + 1) allocation and memory accesses
+     in favor of two (m + 1) allocations.  These could potentially be
+     statically-allocated if we impose a maximum length on the
+     strings of interest.  */
+  edit_distance_t *v0 = new edit_distance_t[m + 1];
+  edit_distance_t *v1 = new edit_distance_t[m + 1];
+
+  /* The first row is for the case of an empty target string, which
+     we can reach by deleting every character in the source string.  */
+  for (int i = 0; i < m + 1; i++)
+    v0[i] = i;
+
+  /* Build successive rows.  */
+  for (int i = 0; i < n; i++)
+    {
+      if (debug)
+	{
+	  printf ("i:%i v0 = ", i);
+	  for (int j = 0; j < m + 1; j++)
+	    printf ("%i ", v0[j]);
+	  printf ("\n");
+	}
+
+      /* The initial column is for the case of an empty source string; we
+	 can reach prefixes of the target string of length i
+	 by inserting i characters.  */
+      v1[0] = i + 1;
+
+      /* Build the rest of the row by considering neighbours to
+	 the north, west and northwest.  */
+      for (int j = 0; j < m; j++)
+	{
+	  edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
+	  edit_distance_t deletion     = v1[j] + 1;
+	  edit_distance_t insertion    = v0[j + 1] + 1;
+	  edit_distance_t substitution = v0[j] + cost;
+	  edit_distance_t cheapest = MIN (deletion, insertion);
+	  cheapest = MIN (cheapest, substitution);
+	  v1[j + 1] = cheapest;
+	}
+
+      /* Prepare to move on to next row.  */
+      for (int j = 0; j < m + 1; j++)
+	v0[j] = v1[j];
+    }
+
+  if (debug)
+    {
+      printf ("final v1 = ");
+      for (int j = 0; j < m + 1; j++)
+	printf ("%i ", v1[j]);
+      printf ("\n");
+    }
+
+  edit_distance_t result = v1[m];
+  delete[] v0;
+  delete[] v1;
+  return result;
+}
+
+/* Calculate Levenshtein distance between two nil-terminated strings.
+   This exists purely for the unit tests.  */
+
+edit_distance_t
+levenshtein_distance (const char *s, const char *t)
+{
+  return levenshtein_distance (s, strlen (s), t, strlen (t));
+}
+
+/* Unit tests for levenshtein_distance.  */
+
+static void
+levenshtein_distance_unit_test (const char *a, const char *b,
+				edit_distance_t expected)
+{
+  /* Run every test both ways to ensure it's symmetric.  */
+  gcc_assert (levenshtein_distance (a, b) == expected);
+  gcc_assert (levenshtein_distance (b, a) == expected);
+}
+
+void
+levenshtein_distance_unit_tests (void)
+{
+  levenshtein_distance_unit_test ("", "nonempty", strlen ("nonempty"));
+  levenshtein_distance_unit_test ("saturday", "sunday", 3);
+  levenshtein_distance_unit_test ("foo", "m_foo", 2);
+  levenshtein_distance_unit_test ("hello_world", "HelloWorld", 3);
+  levenshtein_distance_unit_test
+    ("the quick brown fox jumps over the lazy dog", "dog", 40);
+  levenshtein_distance_unit_test
+    ("the quick brown fox jumps over the lazy dog",
+     "the quick brown dog jumps over the lazy fox",
+     4);
+  levenshtein_distance_unit_test
+    ("Lorem ipsum dolor sit amet, consectetur adipiscing elit,",
+     "All your base are belong to us",
+     44);
+}
+
+/* Calculate Levenshtein distance between two identifiers.  */
+
+edit_distance_t
+levenshtein_distance (tree ident_s, tree ident_t)
+{
+  gcc_assert (TREE_CODE (ident_s) == IDENTIFIER_NODE);
+  gcc_assert (TREE_CODE (ident_t) == IDENTIFIER_NODE);
+
+  return levenshtein_distance (IDENTIFIER_POINTER (ident_s),
+			       IDENTIFIER_LENGTH (ident_s),
+			       IDENTIFIER_POINTER (ident_t),
+			       IDENTIFIER_LENGTH (ident_t));
+}
diff --git a/gcc/spellcheck.h b/gcc/spellcheck.h
new file mode 100644
index 0000000..6f50b71
--- /dev/null
+++ b/gcc/spellcheck.h
@@ -0,0 +1,35 @@ 
+/* Find near-matches for strings and identifiers.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_SPELLCHECK_H
+#define GCC_SPELLCHECK_H
+
+typedef unsigned int edit_distance_t;
+const edit_distance_t MAX_EDIT_DISTANCE = UINT_MAX;
+
+extern void
+levenshtein_distance_unit_tests (void);
+
+extern edit_distance_t
+levenshtein_distance (const char *s, const char *t);
+
+extern edit_distance_t
+levenshtein_distance (tree ident_s, tree ident_t);
+
+#endif  /* GCC_SPELLCHECK_H  */
diff --git a/gcc/testsuite/gcc.dg/spellcheck.c b/gcc/testsuite/gcc.dg/spellcheck.c
new file mode 100644
index 0000000..53ebb86
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/spellcheck.c
@@ -0,0 +1,49 @@ 
+/* { dg-do compile } */
+
+struct foo
+{
+  int foo;
+  int bar;
+  int baz;
+};
+
+int test (struct foo *ptr)
+{
+  return ptr->m_bar; /* { dg-error "'struct foo' has no member named 'm_bar'; did you mean 'bar'?" } */
+}
+
+int test2 (void)
+{
+  struct foo instance = {0, 0, 0};
+  return instance.m_bar; /* { dg-error "'struct foo' has no member named 'm_bar'; did you mean 'bar'?" } */
+}
+
+#include <inttypes.h>
+int64 i; /* { dg-error "unknown type name 'int64'; did you mean 'int64_t'?" } */
+
+typedef int ins;
+int test3 (void)
+{
+    int inr;
+    inp x; /* { dg-error "unknown type name 'inp'; did you mean 'ins'?" } */
+}
+
+struct s {
+    struct j { int aa; } kk;
+    int ab;
+};
+
+void test4 (struct s x)
+{
+  x.ac;  /* { dg-error "'struct s' has no member named 'ac'; did you mean 'ab'?" } */
+}
+
+int test5 (struct foo *ptr)
+{
+  return sizeof (ptr->foa); /* { dg-error "'struct foo' has no member named 'foa'; did you mean 'foo'?" } */
+}
+
+/* Verify that gcc doesn't offer nonsensical suggestions.  */
+
+nonsensical_suggestion_t var; /* { dg-bogus "did you mean" } */
+/* { dg-error "unknown type name" "" { target { *-*-* } } 48 } */