diff mbox series

[1/3] rhashtable: further improve stability of rhashtable_walk

Message ID 153086109256.2825.15329014177598382684.stgit@noble
State Changes Requested, archived
Delegated to: David Miller
Headers show
Series rhashtable: replace rhashtable_walk_peek implementation | expand

Commit Message

NeilBrown July 6, 2018, 7:11 a.m. UTC
If the sequence:
   obj = rhashtable_walk_next(iter);
   rhashtable_walk_stop(iter);
   rhashtable_remove_fast(ht, &obj->head, params);
   rhashtable_walk_start(iter);

 races with another thread inserting or removing
 an object on the same hash chain, a subsequent
 rhashtable_walk_next() is not guaranteed to get the "next"
 object. It is possible that an object could be
 repeated, or missed.

 This can be made more reliable by keeping the objects in a hash chain
 sorted by memory address.  A subsequent rhashtable_walk_next()
 call can reliably find the correct position in the list, and thus
 find the 'next' object.

 It is not possible to take this approach with an rhltable as keeping
 the hash chain in order is not so easy.  When the first object with a
 given key is removed, it is replaced in the chain with the next
 object with the same key, and the address of that object may not be
 correctly ordered.
 I have not yet found any way to achieve the same stability
 with rhltables, that doesn't have a major impact on lookup
 or insert.  No code currently in Linux would benefit from
 such extra stability.

 With this patch:
 - a new object is always inserted after the last object with a
   smaller address, or at the start.  This preserves the property,
   important when allowing objects to be removed and re-added, that
   an object is never inserted *after* a position that it previously
   held in the list.
 - when rhashtable_walk_start() is called, it records that 'p' is not
   'safe', meaning that it cannot be dereferenced.  The revalidation
   that was previously done here is moved to rhashtable_walk_next()
 - when rhashtable_walk_next() is called while p is not NULL and not
   safe, it walks the chain looking for the first object with an
   address greater than p and returns that.  If there is none, it moves
   to the next hash chain.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 include/linux/rhashtable-types.h |    1 
 include/linux/rhashtable.h       |   10 ++++-
 lib/rhashtable.c                 |   82 +++++++++++++++++++++++++-------------
 3 files changed, 62 insertions(+), 31 deletions(-)

Comments

kernel test robot July 6, 2018, 8:24 a.m. UTC | #1
Hi NeilBrown,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net-next/master]
[also build test ERROR on next-20180705]
[cannot apply to linus/master linux-sof-driver/master v4.18-rc3]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/NeilBrown/rhashtable-replace-rhashtable_walk_peek-implementation/20180706-153705
config: i386-randconfig-x078-201826 (attached as .config)
compiler: gcc-7 (Debian 7.3.0-16) 7.3.0
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   In file included from ipc/util.c:66:0:
   include/linux/rhashtable.h: In function '__rhashtable_insert_fast':
>> include/linux/rhashtable.h:624:6: error: 'headp' undeclared (first use in this function); did you mean 'head'?
         headp = &head->next;
         ^~~~~
         head
   include/linux/rhashtable.h:624:6: note: each undeclared identifier is reported only once for each function it appears in

vim +624 include/linux/rhashtable.h

   570	
   571	/* Internal function, please use rhashtable_insert_fast() instead. This
   572	 * function returns the existing element already in hashes in there is a clash,
   573	 * otherwise it returns an error via ERR_PTR().
   574	 */
   575	static inline void *__rhashtable_insert_fast(
   576		struct rhashtable *ht, const void *key, struct rhash_head *obj,
   577		const struct rhashtable_params params, bool rhlist)
   578	{
   579		struct rhashtable_compare_arg arg = {
   580			.ht = ht,
   581			.key = key,
   582		};
   583		struct rhash_head __rcu **pprev;
   584		struct bucket_table *tbl;
   585		struct rhash_head *head;
   586		spinlock_t *lock;
   587		unsigned int hash;
   588		int elasticity;
   589		void *data;
   590	
   591		rcu_read_lock();
   592	
   593		tbl = rht_dereference_rcu(ht->tbl, ht);
   594		hash = rht_head_hashfn(ht, tbl, obj, params);
   595		lock = rht_bucket_lock(tbl, hash);
   596		spin_lock_bh(lock);
   597	
   598		if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
   599	slow_path:
   600			spin_unlock_bh(lock);
   601			rcu_read_unlock();
   602			return rhashtable_insert_slow(ht, key, obj);
   603		}
   604	
   605		elasticity = RHT_ELASTICITY;
   606		pprev = rht_bucket_insert(ht, tbl, hash);
   607		data = ERR_PTR(-ENOMEM);
   608		if (!pprev)
   609			goto out;
   610	
   611		rht_for_each_continue(head, *pprev, tbl, hash) {
   612			struct rhlist_head *plist;
   613			struct rhlist_head *list;
   614	
   615			elasticity--;
   616			if (!key ||
   617			    (params.obj_cmpfn ?
   618			     params.obj_cmpfn(&arg, rht_obj(ht, head)) :
   619			     rhashtable_compare(&arg, rht_obj(ht, head)))) {
   620				if (rhlist) {
   621					pprev = &head->next;
   622				} else {
   623					if (head < obj)
 > 624						headp = &head->next;
   625				}
   626				continue;
   627			}
   628	
   629			data = rht_obj(ht, head);
   630	
   631			if (!rhlist)
   632				goto out;
   633	
   634	
   635			list = container_of(obj, struct rhlist_head, rhead);
   636			plist = container_of(head, struct rhlist_head, rhead);
   637	
   638			RCU_INIT_POINTER(list->next, plist);
   639			head = rht_dereference_bucket(head->next, tbl, hash);
   640			RCU_INIT_POINTER(list->rhead.next, head);
   641			rcu_assign_pointer(*pprev, obj);
   642	
   643			goto good;
   644		}
   645	
   646		if (elasticity <= 0)
   647			goto slow_path;
   648	
   649		data = ERR_PTR(-E2BIG);
   650		if (unlikely(rht_grow_above_max(ht, tbl)))
   651			goto out;
   652	
   653		if (unlikely(rht_grow_above_100(ht, tbl)))
   654			goto slow_path;
   655	
   656		head = rht_dereference_bucket(*pprev, tbl, hash);
   657	
   658		RCU_INIT_POINTER(obj->next, head);
   659		if (rhlist) {
   660			struct rhlist_head *list;
   661	
   662			list = container_of(obj, struct rhlist_head, rhead);
   663			RCU_INIT_POINTER(list->next, NULL);
   664		}
   665	
   666		rcu_assign_pointer(*pprev, obj);
   667	
   668		atomic_inc(&ht->nelems);
   669		if (rht_grow_above_75(ht, tbl))
   670			schedule_work(&ht->run_work);
   671	
   672	good:
   673		data = NULL;
   674	
   675	out:
   676		spin_unlock_bh(lock);
   677		rcu_read_unlock();
   678	
   679		return data;
   680	}
   681	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Paolo Abeni July 6, 2018, 8:59 a.m. UTC | #2
On Fri, 2018-07-06 at 17:11 +1000, NeilBrown wrote:
> If the sequence:
>    obj = rhashtable_walk_next(iter);
>    rhashtable_walk_stop(iter);
>    rhashtable_remove_fast(ht, &obj->head, params);
>    rhashtable_walk_start(iter);
> 
>  races with another thread inserting or removing
>  an object on the same hash chain, a subsequent
>  rhashtable_walk_next() is not guaranteed to get the "next"
>  object. It is possible that an object could be
>  repeated, or missed.

The above scenario is very similar to the one I'm running:

   rhashtable_walk_next(iter);
   rhashtable_walk_stop(iter);     
   // rhashtable change not yet identified, could be either
   // remove, insert or even rehash
   rhashtable_walk_start(iter);
   rhashtable_walk_next(iter);

but I'm seeing use-after-free there. I'll try this patch to see if it
solves my issue.

Note: the code under test is a pending new patch I'm holding due to the
above issue, I can send it as RFC to share the code if you think it may
help.

> @@ -867,15 +866,39 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
>  	bool rhlist = ht->rhlist;
>  
>  	if (p) {
> -		if (!rhlist || !(list = rcu_dereference(list->next))) {
> -			p = rcu_dereference(p->next);
> -			list = container_of(p, struct rhlist_head, rhead);
> -		}
> -		if (!rht_is_a_nulls(p)) {
> -			iter->skip++;
> -			iter->p = p;
> -			iter->list = list;
> -			return rht_obj(ht, rhlist ? &list->rhead : p);
> +		if (!rhlist && iter->p_is_unsafe) {
> +			/*
> +			 * First time next() was called after start().
> +			 * Need to find location of 'p' in the list.
> +			 */
> +			struct rhash_head *p;
> +
> +			iter->skip = 0;
> +			rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
> +				iter->skip++;
> +				if (p <= iter->p)
> +					continue;

Out of sheer ignorance, I really don't understand the goal of the above
conditional ?!?

Should it possibly be something like:
				if (p != iter->p->next)

instead? 
But I think we can't safely dereference 'p' yet ?!?

I'm sorry for the possibly dumb comments, rhashtable internals are
somewhat obscure to me, but I'm really interested in this topic.

Cheers,

Paolo
kernel test robot July 6, 2018, 9:25 a.m. UTC | #3
Hi NeilBrown,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on net-next/master]
[also build test ERROR on next-20180706]
[cannot apply to linus/master linux-sof-driver/master v4.18-rc3]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/NeilBrown/rhashtable-replace-rhashtable_walk_peek-implementation/20180706-153705
config: i386-randconfig-a0-07060846 (attached as .config)
compiler: gcc-4.9 (Debian 4.9.4-2) 4.9.4
reproduce:
        # save the attached .config to linux build tree
        make ARCH=i386 

All errors (new ones prefixed by >>):

   In file included from net/core/xdp.c:10:0:
   include/linux/rhashtable.h: In function '__rhashtable_insert_fast':
>> include/linux/rhashtable.h:624:6: error: 'headp' undeclared (first use in this function)
         headp = &head->next;
         ^
   include/linux/rhashtable.h:624:6: note: each undeclared identifier is reported only once for each function it appears in

vim +/headp +624 include/linux/rhashtable.h

   570	
   571	/* Internal function, please use rhashtable_insert_fast() instead. This
   572	 * function returns the existing element already in hashes in there is a clash,
   573	 * otherwise it returns an error via ERR_PTR().
   574	 */
   575	static inline void *__rhashtable_insert_fast(
   576		struct rhashtable *ht, const void *key, struct rhash_head *obj,
   577		const struct rhashtable_params params, bool rhlist)
   578	{
   579		struct rhashtable_compare_arg arg = {
   580			.ht = ht,
   581			.key = key,
   582		};
   583		struct rhash_head __rcu **pprev;
   584		struct bucket_table *tbl;
   585		struct rhash_head *head;
   586		spinlock_t *lock;
   587		unsigned int hash;
   588		int elasticity;
   589		void *data;
   590	
   591		rcu_read_lock();
   592	
   593		tbl = rht_dereference_rcu(ht->tbl, ht);
   594		hash = rht_head_hashfn(ht, tbl, obj, params);
   595		lock = rht_bucket_lock(tbl, hash);
   596		spin_lock_bh(lock);
   597	
   598		if (unlikely(rcu_access_pointer(tbl->future_tbl))) {
   599	slow_path:
   600			spin_unlock_bh(lock);
   601			rcu_read_unlock();
   602			return rhashtable_insert_slow(ht, key, obj);
   603		}
   604	
   605		elasticity = RHT_ELASTICITY;
   606		pprev = rht_bucket_insert(ht, tbl, hash);
   607		data = ERR_PTR(-ENOMEM);
   608		if (!pprev)
   609			goto out;
   610	
   611		rht_for_each_continue(head, *pprev, tbl, hash) {
   612			struct rhlist_head *plist;
   613			struct rhlist_head *list;
   614	
   615			elasticity--;
   616			if (!key ||
   617			    (params.obj_cmpfn ?
   618			     params.obj_cmpfn(&arg, rht_obj(ht, head)) :
   619			     rhashtable_compare(&arg, rht_obj(ht, head)))) {
   620				if (rhlist) {
   621					pprev = &head->next;
   622				} else {
   623					if (head < obj)
 > 624						headp = &head->next;
   625				}
   626				continue;
   627			}
   628	
   629			data = rht_obj(ht, head);
   630	
   631			if (!rhlist)
   632				goto out;
   633	
   634	
   635			list = container_of(obj, struct rhlist_head, rhead);
   636			plist = container_of(head, struct rhlist_head, rhead);
   637	
   638			RCU_INIT_POINTER(list->next, plist);
   639			head = rht_dereference_bucket(head->next, tbl, hash);
   640			RCU_INIT_POINTER(list->rhead.next, head);
   641			rcu_assign_pointer(*pprev, obj);
   642	
   643			goto good;
   644		}
   645	
   646		if (elasticity <= 0)
   647			goto slow_path;
   648	
   649		data = ERR_PTR(-E2BIG);
   650		if (unlikely(rht_grow_above_max(ht, tbl)))
   651			goto out;
   652	
   653		if (unlikely(rht_grow_above_100(ht, tbl)))
   654			goto slow_path;
   655	
   656		head = rht_dereference_bucket(*pprev, tbl, hash);
   657	
   658		RCU_INIT_POINTER(obj->next, head);
   659		if (rhlist) {
   660			struct rhlist_head *list;
   661	
   662			list = container_of(obj, struct rhlist_head, rhead);
   663			RCU_INIT_POINTER(list->next, NULL);
   664		}
   665	
   666		rcu_assign_pointer(*pprev, obj);
   667	
   668		atomic_inc(&ht->nelems);
   669		if (rht_grow_above_75(ht, tbl))
   670			schedule_work(&ht->run_work);
   671	
   672	good:
   673		data = NULL;
   674	
   675	out:
   676		spin_unlock_bh(lock);
   677		rcu_read_unlock();
   678	
   679		return data;
   680	}
   681	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
NeilBrown July 6, 2018, 9:50 a.m. UTC | #4
On Fri, Jul 06 2018, kbuild test robot wrote:

> Hi NeilBrown,
>
> Thank you for the patch! Yet something to improve:
>
> [auto build test ERROR on net-next/master]
> [also build test ERROR on next-20180705]
> [cannot apply to linus/master linux-sof-driver/master v4.18-rc3]
> [if your patch is applied to the wrong git tree, please drop us a note
> to help improve the system]

Patch is against net-next, plus "rhashtable: detect when object movement
might have invalidated a lookup" which was posted earlier (this
dependency wasn't made explicit).

Thanks,
NeilBrown
NeilBrown July 6, 2018, 9:55 a.m. UTC | #5
On Fri, Jul 06 2018, Paolo Abeni wrote:

> On Fri, 2018-07-06 at 17:11 +1000, NeilBrown wrote:
>> If the sequence:
>>    obj = rhashtable_walk_next(iter);
>>    rhashtable_walk_stop(iter);
>>    rhashtable_remove_fast(ht, &obj->head, params);
>>    rhashtable_walk_start(iter);
>> 
>>  races with another thread inserting or removing
>>  an object on the same hash chain, a subsequent
>>  rhashtable_walk_next() is not guaranteed to get the "next"
>>  object. It is possible that an object could be
>>  repeated, or missed.
>
> The above scenario is very similar to the one I'm running:
>
>    rhashtable_walk_next(iter);
>    rhashtable_walk_stop(iter);     
>    // rhashtable change not yet identified, could be either
>    // remove, insert or even rehash
>    rhashtable_walk_start(iter);
>    rhashtable_walk_next(iter);
>
> but I'm seeing use-after-free there. I'll try this patch to see if it
> solves my issue.
>
> Note: the code under test is a pending new patch I'm holding due to the
> above issue, I can send it as RFC to share the code if you think it may
> help.

I'd suggest posting it.  I may not get a chance to look at it, but if you
don't post it, then I definitely won't :-)

>
>> @@ -867,15 +866,39 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
>>  	bool rhlist = ht->rhlist;
>>  
>>  	if (p) {
>> -		if (!rhlist || !(list = rcu_dereference(list->next))) {
>> -			p = rcu_dereference(p->next);
>> -			list = container_of(p, struct rhlist_head, rhead);
>> -		}
>> -		if (!rht_is_a_nulls(p)) {
>> -			iter->skip++;
>> -			iter->p = p;
>> -			iter->list = list;
>> -			return rht_obj(ht, rhlist ? &list->rhead : p);
>> +		if (!rhlist && iter->p_is_unsafe) {
>> +			/*
>> +			 * First time next() was called after start().
>> +			 * Need to find location of 'p' in the list.
>> +			 */
>> +			struct rhash_head *p;
>> +
>> +			iter->skip = 0;
>> +			rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
>> +				iter->skip++;
>> +				if (p <= iter->p)
>> +					continue;
>
> Out of sheer ignorance, I really don't understand the goal of the above
> conditional ?!?

I hoped the patch description would cover that:
     With this patch:
     - a new object is always inserted after the last object with a
       smaller address, or at the start.  This preserves the property,
       important when allowing objects to be removed and re-added, that
       an object is never inserted *after* a position that it previously
       held in the list.

The items in each table slot are stored in order of the address of the
item.  So to find the first item in a slot that was not before the
previously returned item (iter->p), we step forward while this item is
<= that one. 

Does that help at all?

NeilBrown


>
> Should it possibly be something like:
> 				if (p != iter->p->next)
>
> instead? 
> But I think we can't safely dereference 'p' yet ?!?
>
> I'm sorry for the possibly dumb comments, rhashtable internals are
> somewhat obscure to me, but I'm really interested in this topic.
>
> Cheers,
>
> Paolo
Paolo Abeni July 6, 2018, 10:12 a.m. UTC | #6
On Fri, 2018-07-06 at 19:55 +1000, NeilBrown wrote:
> On Fri, Jul 06 2018, Paolo Abeni wrote:
> 
> > Note: the code under test is a pending new patch I'm holding due to the
> > above issue, I can send it as RFC to share the code if you think it may
> > help.
> 
> I'd suggest posting it.  I may not get a chance to look at it, but if you
> don't post it, then I definitely won't :-)

OK, thanks, I just spammed the list (and you ;)

> > > @@ -867,15 +866,39 @@ void *rhashtable_walk_next(struct rhashtable_iter *iter)
> > >  	bool rhlist = ht->rhlist;
> > >  
> > >  	if (p) {
> > > -		if (!rhlist || !(list = rcu_dereference(list->next))) {
> > > -			p = rcu_dereference(p->next);
> > > -			list = container_of(p, struct rhlist_head, rhead);
> > > -		}
> > > -		if (!rht_is_a_nulls(p)) {
> > > -			iter->skip++;
> > > -			iter->p = p;
> > > -			iter->list = list;
> > > -			return rht_obj(ht, rhlist ? &list->rhead : p);
> > > +		if (!rhlist && iter->p_is_unsafe) {
> > > +			/*
> > > +			 * First time next() was called after start().
> > > +			 * Need to find location of 'p' in the list.
> > > +			 */
> > > +			struct rhash_head *p;
> > > +
> > > +			iter->skip = 0;
> > > +			rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
> > > +				iter->skip++;
> > > +				if (p <= iter->p)
> > > +					continue;
> > 
> > Out of sheer ignorance, I really don't understand the goal of the above
> > conditional ?!?
> 
> I hoped the patch description would cover that:
>      With this patch:
>      - a new object is always inserted after the last object with a
>        smaller address, or at the start.  This preserves the property,
>        important when allowing objects to be removed and re-added, that
>        an object is never inserted *after* a position that it previously
>        held in the list.
> 
> The items in each table slot are stored in order of the address of the
> item.  So to find the first item in a slot that was not before the
> previously returned item (iter->p), we step forward while this item is
> <= that one. 
> 
> Does that help at all?

Yes, it's very clear. Earlier I had dumbly skipped over some parts of the patch.

Thanks,

Paolo
diff mbox series

Patch

diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index 763d613ce2c2..bc3e84547ba7 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -126,6 +126,7 @@  struct rhashtable_iter {
 	struct rhashtable_walker walker;
 	unsigned int slot;
 	unsigned int skip;
+	bool p_is_unsafe;
 	bool end_of_table;
 };
 
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 10435a77b156..657e37ae314c 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -628,7 +628,12 @@  static inline void *__rhashtable_insert_fast(
 		    (params.obj_cmpfn ?
 		     params.obj_cmpfn(&arg, rht_obj(ht, head)) :
 		     rhashtable_compare(&arg, rht_obj(ht, head)))) {
-			pprev = &head->next;
+			if (rhlist) {
+				pprev = &head->next;
+			} else {
+				if (head < obj)
+					headp = &head->next;
+			}
 			continue;
 		}
 
@@ -1124,7 +1129,8 @@  static inline int rhashtable_walk_init(struct rhashtable *ht,
  * Note that if you restart a walk after rhashtable_walk_stop you
  * may see the same object twice.  Also, you may miss objects if
  * there are removals in between rhashtable_walk_stop and the next
- * call to rhashtable_walk_start.
+ * call to rhashtable_walk_start.  Note that this is different to
+ * rhashtable_walk_enter() which misses objects.
  *
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index f87af707f086..36f97d0c69ce 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -228,6 +228,7 @@  static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
 	struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
 	struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
+	struct rhash_head __rcu **inspos;
 	int err = -EAGAIN;
 	struct rhash_head *head, *next, *entry;
 	spinlock_t *new_bucket_lock;
@@ -256,12 +257,15 @@  static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
 	new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);
 
 	spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
-	head = rht_dereference_bucket(new_tbl->buckets[new_hash],
-				      new_tbl, new_hash);
-
+	inspos = &new_tbl->buckets[new_hash];
+	head = rht_dereference_bucket(*inspos, new_tbl, new_hash);
+	while (!rht_is_a_nulls(head) && head < entry) {
+		inspos = &head->next;
+		head = rht_dereference_bucket(*inspos, new_tbl, new_hash);
+	}
 	RCU_INIT_POINTER(entry->next, head);
 
-	rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
+	rcu_assign_pointer(*inspos, entry);
 	spin_unlock(new_bucket_lock);
 
 	rcu_assign_pointer(*pprev, next);
@@ -557,6 +561,10 @@  static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
 		return ERR_PTR(-ENOMEM);
 
 	head = rht_dereference_bucket(*pprev, tbl, hash);
+	while (!ht->rhlist && !rht_is_a_nulls(head) && head < obj) {
+		pprev = &head->next;
+		head = rht_dereference_bucket(*pprev, tbl, hash);
+	}
 
 	RCU_INIT_POINTER(obj->next, head);
 	if (ht->rhlist) {
@@ -651,10 +659,10 @@  EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
  *
  * This function prepares a hash table walk.
  *
- * Note that if you restart a walk after rhashtable_walk_stop you
- * may see the same object twice.  Also, you may miss objects if
- * there are removals in between rhashtable_walk_stop and the next
- * call to rhashtable_walk_start.
+ * A walk is guaranteed to return every object that was in
+ * the table before this call, and is still in the table when
+ * rhashtable_walk_next() returns NULL.  Duplicates can be
+ * seen, but only if there is a rehash event during the walk.
  *
  * For a completely stable walk you should construct your own data
  * structure outside the hash table.
@@ -738,19 +746,10 @@  int rhashtable_walk_start_check(struct rhashtable_iter *iter)
 
 	if (iter->p && !rhlist) {
 		/*
-		 * We need to validate that 'p' is still in the table, and
-		 * if so, update 'skip'
+		 * 'p' will be revalidated when rhashtable_walk_next()
+		 * is called.
 		 */
-		struct rhash_head *p;
-		int skip = 0;
-		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
-			skip++;
-			if (p == iter->p) {
-				iter->skip = skip;
-				goto found;
-			}
-		}
-		iter->p = NULL;
+		iter->p_is_unsafe = true;
 	} else if (iter->p && rhlist) {
 		/* Need to validate that 'list' is still in the table, and
 		 * if so, update 'skip' and 'p'.
@@ -867,15 +866,39 @@  void *rhashtable_walk_next(struct rhashtable_iter *iter)
 	bool rhlist = ht->rhlist;
 
 	if (p) {
-		if (!rhlist || !(list = rcu_dereference(list->next))) {
-			p = rcu_dereference(p->next);
-			list = container_of(p, struct rhlist_head, rhead);
-		}
-		if (!rht_is_a_nulls(p)) {
-			iter->skip++;
-			iter->p = p;
-			iter->list = list;
-			return rht_obj(ht, rhlist ? &list->rhead : p);
+		if (!rhlist && iter->p_is_unsafe) {
+			/*
+			 * First time next() was called after start().
+			 * Need to find location of 'p' in the list.
+			 */
+			struct rhash_head *p;
+
+			iter->skip = 0;
+			rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
+				iter->skip++;
+				if (p <= iter->p)
+					continue;
+
+				/* p is the next object after iter->p */
+				iter->p = p;
+				iter->p_is_unsafe = false;
+				return rht_obj(ht, p);
+			}
+			/* There is no "next" object in the list, move
+			 * to next hash chain.
+			 */
+		} else {
+			if (!rhlist || !(list = rcu_dereference(list->next))) {
+				p = rcu_dereference(p->next);
+				list = container_of(p, struct rhlist_head,
+						    rhead);
+			}
+			if (!rht_is_a_nulls(p)) {
+				iter->skip++;
+				iter->p = p;
+				iter->list = list;
+				return rht_obj(ht, rhlist ? &list->rhead : p);
+			}
 		}
 
 		/* At the end of this slot, switch to next one and then find
@@ -885,6 +908,7 @@  void *rhashtable_walk_next(struct rhashtable_iter *iter)
 		iter->slot++;
 	}
 
+	iter->p_is_unsafe = false;
 	return __rhashtable_walk_find_next(iter);
 }
 EXPORT_SYMBOL_GPL(rhashtable_walk_next);