diff mbox

[3/4] spellcheck.h: add best_match template; implement early-reject

Message ID 1465917353-61387-3-git-send-email-dmalcolm@redhat.com
State New
Headers show

Commit Message

David Malcolm June 14, 2016, 3:15 p.m. UTC
There's a lot of repetition between find_closest_string and
find_closest_identifier, and the next patch adds more, so this
patch moves the logic into a new template class "best_match"
for locating the closest string from a sequence of candidates.

The patch also introduces a pair of early-reject optimizations
that weren't present in the older implementations, reducing the
number of calls to levenshtein_distance.
Introducing class best_match allows for these optimizations to be
in one place (best_match::consider), rather than having to implement
them twice.

Successfully bootstrapped&regrtested in combination with the rest of
the kit on x86_64-pc-linux-gnu
Successful -fself-test of stage1 on powerpc-ibm-aix7.1.3.0 of
just this patch (on top of patches 1 and 2).

OK for trunk if it passes individual bootstrap&regrtest?

gcc/c/ChangeLog:
	* c-typeck.c: Include spellcheck-tree.h rather than spellcheck.h.

gcc/cp/ChangeLog:
	* search.c: Include spellcheck-tree.h rather than spellcheck.h.

gcc/ChangeLog:
	* spellcheck-tree.c: Include spellcheck-tree.h rather than
	spellcheck.h.
	(find_closest_identifier): Reimplement in terms of
	best_match<tree,tree>.
	* spellcheck-tree.h: New file.
	* spellcheck.c (struct edit_distance_traits<const char *>): New
	struct.
	(find_closest_string): Reimplement in terms of
	best_match<const char *, const char *>.
	* spellcheck.h (levenshtein_distance): Move prototype of tree-based
	overload to spellcheck-tree.h.
	(find_closest_identifier): Likewise.
	(struct edit_distance_traits<T>): New template.
	(class best_match): New class.
---
 gcc/c/c-typeck.c      |   2 +-
 gcc/cp/search.c       |   2 +-
 gcc/spellcheck-tree.c |  24 ++---------
 gcc/spellcheck-tree.h |  51 +++++++++++++++++++++++
 gcc/spellcheck.c      |  42 +++++++++----------
 gcc/spellcheck.h      | 110 +++++++++++++++++++++++++++++++++++++++++++++++---
 6 files changed, 183 insertions(+), 48 deletions(-)
 create mode 100644 gcc/spellcheck-tree.h

Comments

Jeff Law June 14, 2016, 9:05 p.m. UTC | #1
On 06/14/2016 09:15 AM, David Malcolm wrote:
> There's a lot of repetition between find_closest_string and
> find_closest_identifier, and the next patch adds more, so this
> patch moves the logic into a new template class "best_match"
> for locating the closest string from a sequence of candidates.
>
> The patch also introduces a pair of early-reject optimizations
> that weren't present in the older implementations, reducing the
> number of calls to levenshtein_distance.
> Introducing class best_match allows for these optimizations to be
> in one place (best_match::consider), rather than having to implement
> them twice.
>
> Successfully bootstrapped&regrtested in combination with the rest of
> the kit on x86_64-pc-linux-gnu
> Successful -fself-test of stage1 on powerpc-ibm-aix7.1.3.0 of
> just this patch (on top of patches 1 and 2).
>
> OK for trunk if it passes individual bootstrap&regrtest?
>
> gcc/c/ChangeLog:
> 	* c-typeck.c: Include spellcheck-tree.h rather than spellcheck.h.
>
> gcc/cp/ChangeLog:
> 	* search.c: Include spellcheck-tree.h rather than spellcheck.h.
>
> gcc/ChangeLog:
> 	* spellcheck-tree.c: Include spellcheck-tree.h rather than
> 	spellcheck.h.
> 	(find_closest_identifier): Reimplement in terms of
> 	best_match<tree,tree>.
> 	* spellcheck-tree.h: New file.
> 	* spellcheck.c (struct edit_distance_traits<const char *>): New
> 	struct.
> 	(find_closest_string): Reimplement in terms of
> 	best_match<const char *, const char *>.
> 	* spellcheck.h (levenshtein_distance): Move prototype of tree-based
> 	overload to spellcheck-tree.h.
> 	(find_closest_identifier): Likewise.
> 	(struct edit_distance_traits<T>): New template.
> 	(class best_match): New class.
OK.
jeff
diff mbox

Patch

diff --git a/gcc/c/c-typeck.c b/gcc/c/c-typeck.c
index f987508..f03c07b 100644
--- a/gcc/c/c-typeck.c
+++ b/gcc/c/c-typeck.c
@@ -47,7 +47,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "c-family/c-ubsan.h"
 #include "cilk.h"
 #include "gomp-constants.h"
-#include "spellcheck.h"
+#include "spellcheck-tree.h"
 #include "gcc-rich-location.h"
 
 /* Possible cases of implicit bad conversions.  Used to select
diff --git a/gcc/cp/search.c b/gcc/cp/search.c
index f47833f..990c3fe 100644
--- a/gcc/cp/search.c
+++ b/gcc/cp/search.c
@@ -27,7 +27,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "cp-tree.h"
 #include "intl.h"
 #include "toplev.h"
-#include "spellcheck.h"
+#include "spellcheck-tree.h"
 
 static int is_subobject_of_p (tree, tree);
 static tree dfs_lookup_base (tree, void *);
diff --git a/gcc/spellcheck-tree.c b/gcc/spellcheck-tree.c
index 2d73b774..63fb1a8 100644
--- a/gcc/spellcheck-tree.c
+++ b/gcc/spellcheck-tree.c
@@ -22,7 +22,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "coretypes.h"
 #include "tm.h"
 #include "tree.h"
-#include "spellcheck.h"
+#include "spellcheck-tree.h"
 #include "selftest.h"
 #include "stringpool.h"
 
@@ -53,32 +53,16 @@  find_closest_identifier (tree target, const auto_vec<tree> *candidates)
 {
   gcc_assert (TREE_CODE (target) == IDENTIFIER_NODE);
 
+  best_match<tree, tree> bm (target);
   int i;
   tree identifier;
-  tree best_identifier = NULL_TREE;
-  edit_distance_t best_distance = MAX_EDIT_DISTANCE;
   FOR_EACH_VEC_ELT (*candidates, i, identifier)
     {
       gcc_assert (TREE_CODE (identifier) == IDENTIFIER_NODE);
-      edit_distance_t dist = levenshtein_distance (target, identifier);
-      if (dist < best_distance)
-	{
-	  best_distance = dist;
-	  best_identifier = identifier;
-	}
+      bm.consider (identifier);
     }
 
-  /* If more than half of the letters were misspelled, the suggestion is
-     likely to be meaningless.  */
-  if (best_identifier)
-    {
-      unsigned int cutoff = MAX (IDENTIFIER_LENGTH (target),
-				 IDENTIFIER_LENGTH (best_identifier)) / 2;
-      if (best_distance > cutoff)
-	return NULL_TREE;
-    }
-
-  return best_identifier;
+  return bm.get_best_meaningful_candidate ();
 }
 
 #if CHECKING_P
diff --git a/gcc/spellcheck-tree.h b/gcc/spellcheck-tree.h
new file mode 100644
index 0000000..0d5e253
--- /dev/null
+++ b/gcc/spellcheck-tree.h
@@ -0,0 +1,51 @@ 
+/* Find near-matches for identifiers.
+   Copyright (C) 2015-2016 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef GCC_SPELLCHECK_TREE_H
+#define GCC_SPELLCHECK_TREE_H
+
+#include "spellcheck.h"
+
+/* spellcheck-tree.c  */
+
+extern edit_distance_t
+levenshtein_distance (tree ident_s, tree ident_t);
+
+extern tree
+find_closest_identifier (tree target, const auto_vec<tree> *candidates);
+
+/* Specialization of edit_distance_traits for identifiers.  */
+
+template <>
+struct edit_distance_traits<tree>
+{
+  static size_t get_length (tree id)
+  {
+    gcc_assert (TREE_CODE (id) == IDENTIFIER_NODE);
+    return IDENTIFIER_LENGTH (id);
+  }
+
+  static const char *get_string (tree id)
+  {
+    gcc_assert (TREE_CODE (id) == IDENTIFIER_NODE);
+    return IDENTIFIER_POINTER (id);
+  }
+};
+
+#endif  /* GCC_SPELLCHECK_TREE_H  */
diff --git a/gcc/spellcheck.c b/gcc/spellcheck.c
index e03f484..2648f3a 100644
--- a/gcc/spellcheck.c
+++ b/gcc/spellcheck.c
@@ -121,6 +121,24 @@  levenshtein_distance (const char *s, const char *t)
   return levenshtein_distance (s, strlen (s), t, strlen (t));
 }
 
+/* Specialization of edit_distance_traits for C-style strings.  */
+
+template <>
+struct edit_distance_traits<const char *>
+{
+  static size_t get_length (const char *str)
+  {
+    gcc_assert (str);
+    return strlen (str);
+  }
+
+  static const char *get_string (const char *str)
+  {
+    gcc_assert (str);
+    return str;
+  }
+};
+
 /* Given TARGET, a non-NULL string, and CANDIDATES, a non-NULL ptr to
    an autovec of non-NULL strings, determine which element within
    CANDIDATES has the lowest edit distance to TARGET.  If there are
@@ -139,32 +157,14 @@  find_closest_string (const char *target,
 
   int i;
   const char *candidate;
-  const char *best_candidate = NULL;
-  edit_distance_t best_distance = MAX_EDIT_DISTANCE;
-  size_t len_target = strlen (target);
+  best_match<const char *, const char *> bm (target);
   FOR_EACH_VEC_ELT (*candidates, i, candidate)
     {
       gcc_assert (candidate);
-      edit_distance_t dist
-	= levenshtein_distance (target, len_target,
-				candidate, strlen (candidate));
-      if (dist < best_distance)
-	{
-	  best_distance = dist;
-	  best_candidate = candidate;
-	}
-    }
-
-  /* If more than half of the letters were misspelled, the suggestion is
-     likely to be meaningless.  */
-  if (best_candidate)
-    {
-      unsigned int cutoff = MAX (len_target, strlen (best_candidate)) / 2;
-      if (best_distance > cutoff)
-	return NULL;
+      bm.consider (candidate);
     }
 
-  return best_candidate;
+  return bm.get_best_meaningful_candidate ();
 }
 
 #if CHECKING_P
diff --git a/gcc/spellcheck.h b/gcc/spellcheck.h
index 040c33e..7379399 100644
--- a/gcc/spellcheck.h
+++ b/gcc/spellcheck.h
@@ -35,12 +35,112 @@  extern const char *
 find_closest_string (const char *target,
 		     const auto_vec<const char *> *candidates);
 
-/* spellcheck-tree.c  */
+/* A traits class for describing a string-like type usable by
+   class best_match.
+   Specializations should provide the implementations of the following:
 
-extern edit_distance_t
-levenshtein_distance (tree ident_s, tree ident_t);
+     static size_t get_length (TYPE);
+     static const char *get_string (TYPE);
+
+   get_string should return a non-NULL ptr, which does not need to be
+   0-terminated.  */
+
+template <typename TYPE>
+struct edit_distance_traits {};
+
+/* A type for use when determining the best match against a string,
+   expressed as a template so that we can match against various
+   string-like types (const char *, frontend identifiers, and preprocessor
+   macros).
+
+   This type accumulates the best possible match against GOAL_TYPE for
+   a sequence of elements of CANDIDATE_TYPE, whilst minimizing the
+   number of calls to levenshtein_distance and to
+   edit_distance_traits<T>::get_length.  */
+
+template <typename GOAL_TYPE, typename CANDIDATE_TYPE>
+class best_match
+{
+ public:
+  typedef GOAL_TYPE goal_t;
+  typedef CANDIDATE_TYPE candidate_t;
+  typedef edit_distance_traits<goal_t> goal_traits;
+  typedef edit_distance_traits<candidate_t> candidate_traits;
+
+  /* Constructor.  */
+
+  best_match (goal_t goal)
+  : m_goal (goal_traits::get_string (goal)),
+    m_goal_len (goal_traits::get_length (goal)),
+    m_best_candidate (NULL),
+    m_best_distance (MAX_EDIT_DISTANCE)
+  {}
+
+  /* Compare the edit distance between CANDIDATE and m_goal,
+     and if it's the best so far, record it.  */
+
+  void consider (candidate_t candidate)
+  {
+    size_t candidate_len = candidate_traits::get_length (candidate);
+
+    /* Calculate a lower bound on the candidate's distance to the goal,
+       based on the difference in lengths; it will require at least
+       this many insertions/deletions.  */
+    edit_distance_t min_candidate_distance
+      = abs ((ssize_t)candidate_len - (ssize_t)m_goal_len);
+
+    /* If the candidate's length is sufficiently different to that
+       of the goal string, then the number of insertions/deletions
+       may be >= the best distance so far.  If so, we can reject
+       the candidate immediately without needing to compute
+       the exact distance, since it won't be an improvement.  */
+    if (min_candidate_distance >= m_best_distance)
+      return;
+
+    /* If the candidate will be unable to beat the criterion in
+       get_best_meaningful_candidate, reject it without computing
+       the exact distance.  */
+    unsigned int cutoff = MAX (m_goal_len, candidate_len) / 2;
+    if (min_candidate_distance > cutoff)
+      return;
+
+    /* Otherwise, compute the distance and see if the candidate
+       has beaten the previous best value.  */
+    edit_distance_t dist
+      = levenshtein_distance (m_goal, m_goal_len,
+			      candidate_traits::get_string (candidate),
+			      candidate_len);
+    if (dist < m_best_distance)
+      {
+	m_best_distance = dist;
+	m_best_candidate = candidate;
+	m_best_candidate_len = candidate_len;
+      }
+  }
+
+  /* Get the best candidate so far, but applying a filter to ensure
+     that we return NULL if none of the candidates are close to the goal,
+     to avoid offering nonsensical suggestions to the user.  */
+
+  candidate_t get_best_meaningful_candidate () const
+  {
+    /* If more than half of the letters were misspelled, the suggestion is
+       likely to be meaningless.  */
+    if (m_best_candidate)
+      {
+	unsigned int cutoff = MAX (m_goal_len, m_best_candidate_len) / 2;
+	if (m_best_distance > cutoff)
+	  return NULL;
+    }
+    return m_best_candidate;
+  }
 
-extern tree
-find_closest_identifier (tree target, const auto_vec<tree> *candidates);
+ private:
+  const char *m_goal;
+  size_t m_goal_len;
+  candidate_t m_best_candidate;
+  edit_distance_t m_best_distance;
+  size_t m_best_candidate_len;
+};
 
 #endif  /* GCC_SPELLCHECK_H  */