diff mbox

[C] Ignore invisible bindings for misspelling hints (PR c/71858)

Message ID 1468516947.13104.50.camel@redhat.com
State New
Headers show

Commit Message

David Malcolm July 14, 2016, 5:22 p.m. UTC
On Thu, 2016-07-14 at 16:50 +0200, Jakub Jelinek wrote:
> Hi!
> 
> As mentioned in the PR, anticipated decls should be ignored from
> fuzzy
> lookups, unless the corresponding decl is declared first.

I wrote a patch for this, similar to yours, but with a slightly
different approach for handling builtins.

I think it's reasonable to offer suggestions for misspellings of e.g.
"snprintf" even if the pertinent header hasn't been included, since it
will help guide a novice developer to the missing header.  For example,
if the user has:

void
test_7 (int i, int j)
{
  int buffer[100];
  snprint (buffer, 100, "%i of %i", i, j);
}

...then they get:

test.c: In function ‘test_7’:
test.c:5:3: warning: implicit declaration of function ‘snprint’; did
you mean ‘snprintf’? [-Wimplicit-function-declaration]
   snprint (buffer, 100, "%i of %i", i, j);
   ^~~~~~~
   snprintf

...and on correcting it, they get the "you forgot <stdio.h>" warning:

test.c: In function ‘test_7’:
test.c:5:3: warning: implicit declaration of function ‘snprintf’ [
-Wimplicit-function-declaration]
   snprintf (buffer, 100, "%i of %i", i, j);
   ^~~~~~~~
test.c:5:3: warning: incompatible implicit declaration of built-in
function ‘snprintf’
test.c:5:3: note: include ‘<stdio.h>’ or provide a declaration of
‘snprintf’

(and more errors, it's missing a &size argument).


What I find surprising is the suggestion of "carg"; I've seen this a
few times as a suggestion.   I confess to having to look up carg; IMHO
"carg" is much less likely to be a useful suggestion than, say,
"printf".

Attached is the patch I wrote (not tested yet, and no ChangeLog,
sorry), which keeps the anticipated decls, but uses a crude heuristic
to reject short ones.

As noted in the patch, a more sophisticated approach might be to
analyze a large corpus of real-world code and see how often each
builtin is used, and filter out the less-frequently-used ones.

But I felt a simple "reject any builtin name with length <= 4" was good
enough as a first iteration.

Thoughts?

> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> 
> 2016-07-13  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR c/71858
> 	* c-decl.c (lookup_name_fuzzy): Ignore binding->invisible.
> 
> 	* gcc.dg/spellcheck-identifiers.c (snprintf): Declare.
> 	* gcc.dg/spellcheck-identifiers-2.c: New test.
> 	* gcc.dg/diagnostic-token-ranges.c (nanl): Declare.
> 	* c-c++-common/attributes-1.c: Adjust dg-prune-output.
> 
> --- gcc/c/c-decl.c.jj	2016-06-24 12:59:22.000000000 +0200
> +++ gcc/c/c-decl.c	2016-07-13 22:40:23.410658411 +0200
> @@ -4021,7 +4021,7 @@ lookup_name_fuzzy (tree name, enum looku
>    for (c_scope *scope = current_scope; scope; scope = scope->outer)
>      for (c_binding *binding = scope->bindings; binding; binding =
> binding->prev)
>        {
> -	if (!binding->id)
> +	if (!binding->id || binding->invisible)
>  	  continue;
>  	/* Don't use bindings from implicitly declared functions,
>  	   as they were likely misspellings themselves.  */
> --- gcc/testsuite/gcc.dg/spellcheck-identifiers.c.jj	2016-06
> -24 12:59:12.000000000 +0200
> +++ gcc/testsuite/gcc.dg/spellcheck-identifiers.c	2016-07-14
> 10:03:36.147466813 +0200
> @@ -121,7 +121,7 @@ test_6 (enum foo f)
>      }
>  }
>  
> -/* Verify that we offer names of builtins as suggestions.  */
> +int snprintf (char *, __SIZE_TYPE__, const char *, ...);
>  void
>  test_7 (int i, int j)
> --- gcc/testsuite/gcc.dg/spellcheck-identifiers-2.c.jj	2016-07
> -14 09:44:16.351537449 +0200
> +++ gcc/testsuite/gcc.dg/spellcheck-identifiers-2.c	2016-07-14
> 10:02:21.965426567 +0200
> @@ -0,0 +1,33 @@
> +/* PR c/71858 */
> +/* Make sure anticipated builtins are not considered before they are
> declared.  */
> +/* { dg-do compile } */
> +/* { dg-options "-Wimplicit-function-declaration -fdiagnostics-show
> -caret" } */
> +
> +int sscafn (const char *, const char *, ...);
> +
> +int
> +test_1 (const char *p)
> +{
> +  int i;
> +  return ssacnf (p, "%d", &i); /* { dg-warning "10: implicit
> declaration of function .ssacnf.; did you mean .sscafn.?" } */
> +  /* { dg-begin-multiline-output "" }
> +   return ssacnf (p, "%d", &i);
> +          ^~~~~~
> +          sscafn
> +   { dg-end-multiline-output "" } */
> +}
> +
> +int scafn (const char *, ...);
> +int scanf (const char *, ...);
> +
> +int
> +test_2 (void)
> +{
> +  int i;
> +  return sacnf ("%d", &i); /* { dg-warning "10: implicit declaration
> of function .sacnf.; did you mean .scanf.?" } */
> +  /* { dg-begin-multiline-output "" }
> +   return sacnf ("%d", &i);
> +          ^~~~~
> +          scanf
> +   { dg-end-multiline-output "" } */
> +}
> --- gcc/testsuite/gcc.dg/diagnostic-token-ranges.c.jj	2016-06
> -24 12:59:12.000000000 +0200
> +++ gcc/testsuite/gcc.dg/diagnostic-token-ranges.c	2016-07-14
> 11:06:23.013803011 +0200
> @@ -2,6 +2,8 @@
>  
>  /* Verify that various diagnostics show source code ranges.  */
>  
> +long double nanl (const char *);
> +
>  /* These ones merely use token ranges; they don't use tree ranges. 
>  */
>  
>  void undeclared_identifier (void)
> --- gcc/testsuite/c-c++-common/attributes-1.c.jj	2016-06-23
> 14:31:57.000000000 +0200
> +++ gcc/testsuite/c-c++-common/attributes-1.c	2016-07-14
> 14:51:34.871006659 +0200
> @@ -1,5 +1,5 @@
>  /* { dg-do compile } */
> -/* { dg-prune-output "undeclared here \\(not in a function\\); did
> you mean .carg..|\[^\n\r\]* was not declared in this scope" } */
> +/* { dg-prune-output "undeclared here \\(not in a function\\); did
> you mean .char..|\[^\n\r\]* was not declared in this scope" } */
>  
>  void* my_calloc(unsigned, unsigned)
> __attribute__((alloc_size(1,bar))); /* { dg-warning "outside range" }
> */
>  void* my_realloc(void*, unsigned) __attribute__((alloc_size(bar)));
> /* { dg-warning "outside range" } */
> 
> 	Jakub

Comments

Jakub Jelinek July 14, 2016, 5:47 p.m. UTC | #1
On Thu, Jul 14, 2016 at 01:22:27PM -0400, David Malcolm wrote:
> I wrote a patch for this, similar to yours, but with a slightly
> different approach for handling builtins.
> 
> I think it's reasonable to offer suggestions for misspellings of e.g.
> "snprintf" even if the pertinent header hasn't been included, since it
> will help guide a novice developer to the missing header.  For example,
> if the user has:
> 
> void
> test_7 (int i, int j)
> {
>   int buffer[100];
>   snprint (buffer, 100, "%i of %i", i, j);
> }
> 
> ...then they get:
> 
> test.c: In function ‘test_7’:
> test.c:5:3: warning: implicit declaration of function ‘snprint’; did
> you mean ‘snprintf’? [-Wimplicit-function-declaration]
>    snprint (buffer, 100, "%i of %i", i, j);
>    ^~~~~~~
>    snprintf
> 
> ...and on correcting it, they get the "you forgot <stdio.h>" warning:

I find it weird.  What is special on the builtins?  Lots of standard headers
contain lots of functions that aren't backed by builtins, some of them used
more often than the builtins; the reason for builtins is often just that gcc
wants to be able to optimize them somehow.

E.g. note fopen is not a builtin, so you still will not suggest it if
stdio.h is not included.

Not all the builtins out there are even enabled in all compilation modes
when the containing header is included, it can depend on the language
version, feature test macros, etc.  So, by considering invisible builtins
you could e.g. suggest C99 builtins in strict C89 compilation mode where the
header would not define it, etc.

In any case, this is your baby, so it is up to the FE maintainers and you to
get agreement on, I won't fight on this.

> What I find surprising is the suggestion of "carg"; I've seen this a
> few times as a suggestion.   I confess to having to look up carg; IMHO
> "carg" is much less likely to be a useful suggestion than, say,
> "printf".

For the short identifiers, the question is if the maximum Damerau-Levenshtein
distance (do you handle transpositions now?) shouldn't be lowered (say for at
most 4 chars identifiers only consider levenshtein distance 1 and not more,
e.g. for 3 character identifiers allowing distance of 2 or 3 means you pretty
much suggest any other 3 char identifier), regardless of if it is a builtin or not.

> @@ -3992,6 +3995,26 @@ lookup_name_fuzzy (tree name, enum lookup_name_fuzzy_kind kind)
>  	if (kind == FUZZY_LOOKUP_TYPENAME)
>  	  if (TREE_CODE (binding->decl) != TYPE_DECL)
>  	    continue;
> +	if (kind == FUZZY_LOOKUP_FUNCTION_NAME)
> +	  if (TREE_CODE (binding->decl) != FUNCTION_DECL)
> +	    continue;

Note this is handled in the second patch I've posted, and this one doesn't
handle function pointer variables/parameters.

	Jakub
David Malcolm July 14, 2016, 6:22 p.m. UTC | #2
On Thu, 2016-07-14 at 19:47 +0200, Jakub Jelinek wrote:
> On Thu, Jul 14, 2016 at 01:22:27PM -0400, David Malcolm wrote:
> > I wrote a patch for this, similar to yours, but with a slightly
> > different approach for handling builtins.
> > 
> > I think it's reasonable to offer suggestions for misspellings of
> > e.g.
> > "snprintf" even if the pertinent header hasn't been included, since
> > it
> > will help guide a novice developer to the missing header.  For
> > example,
> > if the user has:
> > 
> > void
> > test_7 (int i, int j)
> > {
> >   int buffer[100];
> >   snprint (buffer, 100, "%i of %i", i, j);
> > }
> > 
> > ...then they get:
> > 
> > test.c: In function ‘test_7’:
> > test.c:5:3: warning: implicit declaration of function ‘snprint’;
> > did
> > you mean ‘snprintf’? [-Wimplicit-function-declaration]
> >    snprint (buffer, 100, "%i of %i", i, j);
> >    ^~~~~~~
> >    snprintf
> > 
> > ...and on correcting it, they get the "you forgot <stdio.h>"
> > warning:
> 
> I find it weird.  What is special on the builtins?  Lots of standard
> headers
> contain lots of functions that aren't backed by builtins, some of
> them used
> more often than the builtins; the reason for builtins is often just
> that gcc
> wants to be able to optimize them somehow.
> 
> E.g. note fopen is not a builtin, so you still will not suggest it if
> stdio.h is not included.
> 
> Not all the builtins out there are even enabled in all compilation
> modes
> when the containing header is included, it can depend on the language
> version, feature test macros, etc.  So, by considering invisible
> builtins
> you could e.g. suggest C99 builtins in strict C89 compilation mode
> where the
> header would not define it, etc.
> 
> In any case, this is your baby, so it is up to the FE maintainers and
> you to
> get agreement on, I won't fight on this.

Thanks.

I agree with the points you raise; in that light I'm fine with your
patch.

> > What I find surprising is the suggestion of "carg"; I've seen this
> > a
> > few times as a suggestion.   I confess to having to look up carg;
> > IMHO
> > "carg" is much less likely to be a useful suggestion than, say,
> > "printf".
> 
> For the short identifiers, the question is if the maximum Damerau
> -Levenshtein
> distance (do you handle transpositions now?) 

We don't currently handle transpositions, but it would be good to do so
(PR other/69968).

> shouldn't be lowered (say for at
> most 4 chars identifiers only consider levenshtein distance 1 and not
> more,
> e.g. for 3 character identifiers allowing distance of 2 or 3 means
> you pretty
> much suggest any other 3 char identifier), regardless of if it is a
> builtin or not.

The logic in get_best_meaningful_candidate is currently:

  unsigned int cutoff = MAX (m_goal_len, m_best_candidate_len) / 2;
  if (m_best_distance > cutoff)
     return NULL;

For a pair of 3 character identifiers, cutoff will be 3 / 2 == 1, so it
will suggest within a distance of 1 and reject if the distance is 2 or
3.   Or is there a bug?

That cutoff could definitely be improved.  Perhaps the MAX should be
changed to a MIN?  Or just base it off the goal length e.g. 
  unsigned int cutoff = m_goal_len / 2;
thus implicitly rejecting
suggestions when m_goal_len == 1.
That would eliminate the "bar" to
"char" suggestion in the PR, since m_goal_len == 3 would have a cutoff
of 1.

> 
> > @@ -3992,6 +3995,26 @@ lookup_name_fuzzy (tree name, enum
> > lookup_name_fuzzy_kind kind)
> >  	if (kind == FUZZY_LOOKUP_TYPENAME)
> >  	  if (TREE_CODE (binding->decl) != TYPE_DECL)
> >  	    continue;
> > +	if (kind == FUZZY_LOOKUP_FUNCTION_NAME)
> > +	  if (TREE_CODE (binding->decl) != FUNCTION_DECL)
> > +	    continue;
> 
> Note this is handled in the second patch I've posted, and this one
> doesn't
> handle function pointer variables/parameters.
> 
> 	Jakub
Jakub Jelinek July 14, 2016, 6:49 p.m. UTC | #3
On Thu, Jul 14, 2016 at 02:22:36PM -0400, David Malcolm wrote:
> I agree with the points you raise; in that light I'm fine with your
> patch.

Ok, Marek has approved it earlier, so I went ahead and committed.

> > shouldn't be lowered (say for at
> > most 4 chars identifiers only consider levenshtein distance 1 and not
> > more,
> > e.g. for 3 character identifiers allowing distance of 2 or 3 means
> > you pretty
> > much suggest any other 3 char identifier), regardless of if it is a
> > builtin or not.
> 
> The logic in get_best_meaningful_candidate is currently:
> 
>   unsigned int cutoff = MAX (m_goal_len, m_best_candidate_len) / 2;
>   if (m_best_distance > cutoff)
>      return NULL;
> 
> For a pair of 3 character identifiers, cutoff will be 3 / 2 == 1, so it
> will suggest within a distance of 1 and reject if the distance is 2 or
> 3.   Or is there a bug?
> 
> That cutoff could definitely be improved.  Perhaps the MAX should be
> changed to a MIN?  Or just base it off the goal length e.g. 
>   unsigned int cutoff = m_goal_len / 2;
> thus implicitly rejecting
> suggestions when m_goal_len == 1.
> That would eliminate the "bar" to
> "char" suggestion in the PR, since m_goal_len == 3 would have a cutoff
> of 1.

I'm afraid I don't have enough experience, so we'll need to look at lost of
suggestions and find the right tuning.

	Jakub
diff mbox

Patch

From 7f3cd32fd4338db7b8b8f09b16429975c5907cb6 Mon Sep 17 00:00:00 2001
From: David Malcolm <dmalcolm@redhat.com>
Date: Wed, 13 Jul 2016 13:40:22 -0400
Subject: [PATCH] FIXME: PR c/71858 - Don't always offer names of builtins as
 misspellings

---
 gcc/c-family/c-common.h                       |  3 +++
 gcc/c/c-decl.c                                | 25 ++++++++++++++++++++++++-
 gcc/testsuite/gcc.dg/spellcheck-identifiers.c | 15 +++++++++++++++
 3 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/gcc/c-family/c-common.h b/gcc/c-family/c-common.h
index 44d98d1..32e68d0 100644
--- a/gcc/c-family/c-common.h
+++ b/gcc/c-family/c-common.h
@@ -994,6 +994,9 @@  enum lookup_name_fuzzy_kind {
   /* Names of types.  */
   FUZZY_LOOKUP_TYPENAME,
 
+  /* Names of functions.  */
+  FUZZY_LOOKUP_FUNCTION_NAME,
+
   /* Any name.  */
   FUZZY_LOOKUP_NAME
 };
diff --git a/gcc/c/c-decl.c b/gcc/c/c-decl.c
index dc14485..de03ad4 100644
--- a/gcc/c/c-decl.c
+++ b/gcc/c/c-decl.c
@@ -3090,7 +3090,7 @@  implicit_decl_warning (location_t loc, tree id, tree olddecl)
       bool warned;
       const char *hint = NULL;
       if (!olddecl)
-	hint = lookup_name_fuzzy (id, FUZZY_LOOKUP_NAME);
+	hint = lookup_name_fuzzy (id, FUZZY_LOOKUP_FUNCTION_NAME);
 
       if (flag_isoc99)
 	if (hint)
@@ -3984,6 +3984,9 @@  lookup_name_fuzzy (tree name, enum lookup_name_fuzzy_kind kind)
       {
 	if (!binding->id)
 	  continue;
+
+	gcc_assert (TREE_CODE (binding->id) == IDENTIFIER_NODE);
+
 	/* Don't use bindings from implicitly declared functions,
 	   as they were likely misspellings themselves.  */
 	if (TREE_CODE (binding->decl) == FUNCTION_DECL)
@@ -3992,6 +3995,26 @@  lookup_name_fuzzy (tree name, enum lookup_name_fuzzy_kind kind)
 	if (kind == FUZZY_LOOKUP_TYPENAME)
 	  if (TREE_CODE (binding->decl) != TYPE_DECL)
 	    continue;
+	if (kind == FUZZY_LOOKUP_FUNCTION_NAME)
+	  if (TREE_CODE (binding->decl) != FUNCTION_DECL)
+	    continue;
+
+	/* There are numerous builtin functions with short names, such as
+	   "carg" which are prone to being suggested, and are generally
+	   unhelpful as suggestions. (PR c/71858)
+	   Avoid suggesting them by imposing a minimum length on builtin
+	   functions as suggestions.
+	   A minimum length of 5 is rather arbitrary, but it allows for
+	   "fopen", but prevents "carg".  A more sophisticated approach
+	   might be to analyze how often each builtin is used in real
+	   code, and filter out the less-frequently-used ones.  */
+	const int builtin_length_threshold = 5;
+	if (TREE_CODE (binding->decl) == FUNCTION_DECL)
+	  if (DECL_IS_BUILTIN (binding->decl))
+	    if (IDENTIFIER_LENGTH (binding->id) < builtin_length_threshold)
+	      continue;
+
+	/* Passed all tests; consider this as a suggestion.  */
 	bm.consider (binding->id);
       }
 
diff --git a/gcc/testsuite/gcc.dg/spellcheck-identifiers.c b/gcc/testsuite/gcc.dg/spellcheck-identifiers.c
index 22a12d0..2111499 100644
--- a/gcc/testsuite/gcc.dg/spellcheck-identifiers.c
+++ b/gcc/testsuite/gcc.dg/spellcheck-identifiers.c
@@ -134,3 +134,18 @@  test_7 (int i, int j)
    snprintf
    { dg-end-multiline-output "" } */
 }
+
+/* PR c/71858 - Don't always offer names of builtins as misspellings;
+   in particular, don't offer "carg" as a suggestion for "bar".  */
+
+int
+test_8 (void)
+{
+  return bar ();
+  /* { dg-bogus "did you mean" "" { target *-*-* } 144 } */
+  /* { dg-warning "implicit declaration" "" { target *-*-* } 144 } */
+  /* { dg-begin-multiline-output "" }
+   return bar ();
+          ^~~
+   { dg-end-multiline-output "" } */
+}
-- 
1.8.5.3