rhashtable: Move hash_rnd into bucket_table

Message ID: 20150309095833.GA4028@gondor.apana.org.au
State: Changes Requested, archived
Delegated to: David Miller

Commit Message

Herbert Xu March 9, 2015, 9:58 a.m. UTC
Currently hash_rnd is a parameter that users can set.  However,
no existing users set this parameter.  It is also something that
people are unlikely to want to set directly since it's just a
random number.

In preparation for allowing the reseeding/rehashing of rhashtable,
this patch moves hash_rnd into bucket_table so that it's now an
internal state rather than a parameter.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Comments

Daniel Borkmann March 9, 2015, 4:26 p.m. UTC | #1
On 03/09/2015 10:58 AM, Herbert Xu wrote:
> Currently hash_rnd is a parameter that users can set.  However,
> no existing users set this parameter.  It is also something that
> people are unlikely to want to set directly since it's just a
> random number.

I believe the original purpose was to have reproducibility, but
yeah, there are no users of it currently.

> In preparation for allowing the reseeding/rehashing of rhashtable,
> this patch moves hash_rnd into bucket_table so that it's now an
> internal state rather than a parameter.
>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
> index d438eeb..5ef8ea5 100644
> --- a/include/linux/rhashtable.h
> +++ b/include/linux/rhashtable.h
> @@ -49,12 +49,14 @@ struct rhash_head {
>   /**
>    * struct bucket_table - Table of hash buckets
>    * @size: Number of hash buckets
> + * @hash_rnd: Random seed to fold into hash
>    * @locks_mask: Mask to apply before accessing locks[]
>    * @locks: Array of spinlocks protecting individual buckets
>    * @buckets: size * hash buckets
>    */
>   struct bucket_table {
>   	size_t			size;
> +	u32			hash_rnd;
>   	unsigned int		locks_mask;
>   	spinlock_t		*locks;
>
> @@ -72,7 +74,6 @@ struct rhashtable;
>    * @key_len: Length of key
>    * @key_offset: Offset of key in struct to be hashed
>    * @head_offset: Offset of rhash_head in struct to be hashed
> - * @hash_rnd: Seed to use while hashing
>    * @max_shift: Maximum number of shifts while expanding
>    * @min_shift: Minimum number of shifts while shrinking
>    * @nulls_base: Base value to generate nulls marker
> @@ -85,7 +86,6 @@ struct rhashtable_params {
>   	size_t			key_len;
>   	size_t			key_offset;
>   	size_t			head_offset;
> -	u32			hash_rnd;
>   	size_t			max_shift;
>   	size_t			min_shift;
>   	u32			nulls_base;
> diff --git a/lib/rhashtable.c b/lib/rhashtable.c
> index b5344ef..4ce267f 100644
> --- a/lib/rhashtable.c
> +++ b/lib/rhashtable.c
> @@ -66,25 +66,28 @@ static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash)
>   	return hash & (tbl->size - 1);
>   }
>
> -static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr)
> +static u32 obj_raw_hashfn(struct rhashtable *ht, const void *ptr)
>   {
> +	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);

const

>   	u32 hash;
>
>   	if (unlikely(!ht->p.key_len))
> -		hash = ht->p.obj_hashfn(ptr, ht->p.hash_rnd);
> +		hash = ht->p.obj_hashfn(ptr, tbl->hash_rnd);
>   	else
>   		hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len,
> -				    ht->p.hash_rnd);
> +				    tbl->hash_rnd);
>
>   	return hash >> HASH_RESERVED_SPACE;
>   }
>
>   static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len)
>   {
> -	return ht->p.hashfn(key, len, ht->p.hash_rnd) >> HASH_RESERVED_SPACE;
> +	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
> +

const

> +	return ht->p.hashfn(key, len, tbl->hash_rnd) >> HASH_RESERVED_SPACE;
>   }
>
> -static u32 head_hashfn(const struct rhashtable *ht,
> +static u32 head_hashfn(struct rhashtable *ht,
>   		       const struct bucket_table *tbl,
>   		       const struct rhash_head *he)
>   {
> @@ -92,7 +95,7 @@ static u32 head_hashfn(const struct rhashtable *ht,
>   }
>
>   #ifdef CONFIG_PROVE_LOCKING
> -static void debug_dump_buckets(const struct rhashtable *ht,
> +static void debug_dump_buckets(struct rhashtable *ht,
>   			       const struct bucket_table *tbl)
>   {
>   	struct rhash_head *he;
> @@ -1099,14 +1102,13 @@ int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
>   	if (tbl == NULL)
>   		return -ENOMEM;
>
> +	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
> +

Have you tested this patch? ;-)

If I see this correctly, the first time you shrink or expand, your hash_rnd
from then onwards will constantly be 0.

You need to copy the old hash_rnd over into the newly allocated tbl.
So, bucket_table_alloc() needs a change as well. Of course, with
rehashing, the get_random_bytes() would move directly there.
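
A minimal sketch of the carry-over Daniel describes, assuming
bucket_table_alloc() keeps the two-argument form used in lib/rhashtable.c
at this point; bucket_table_realloc() is an illustrative helper name, not
something in the patch:

/* Illustrative only: carry the seed from the live table into a freshly
 * allocated replacement.  bucket_table_alloc() zeroes the new table,
 * so without this copy hash_rnd silently becomes 0 on the first
 * expand or shrink. */
static struct bucket_table *bucket_table_realloc(struct rhashtable *ht,
						 const struct bucket_table *old_tbl,
						 size_t nbuckets)
{
	struct bucket_table *new_tbl = bucket_table_alloc(ht, nbuckets);

	if (new_tbl != NULL)
		new_tbl->hash_rnd = old_tbl->hash_rnd;

	return new_tbl;
}

The same effect could be had by assigning new_tbl->hash_rnd =
old_tbl->hash_rnd directly in the expand and shrink paths.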

>   	atomic_set(&ht->nelems, 0);
>   	atomic_set(&ht->shift, ilog2(tbl->size));
>   	RCU_INIT_POINTER(ht->tbl, tbl);
>   	RCU_INIT_POINTER(ht->future_tbl, tbl);
>
> -	if (!ht->p.hash_rnd)
> -		get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd));
> -
>   	INIT_WORK(&ht->run_work, rht_deferred_worker);
>
>   	return 0;
>

David Miller March 9, 2015, 7:51 p.m. UTC | #2
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 9 Mar 2015 20:58:33 +1100

> Currently hash_rnd is a parameter that users can set.  However,
> no existing users set this parameter.  It is also something that
> people are unlikely to want to set directly since it's just a
> random number.
> 
> In preparation for allowing the reseeding/rehashing of rhashtable,
> this patch moves hash_rnd into bucket_table so that it's now an
> internal state rather than a parameter.
> 
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

Thomas is right, you're going to have to move the hash_rnd initialization
to where we allocate the bucket tables, otherwise the first shrink/expand
will set it to zero.
David Miller March 9, 2015, 7:55 p.m. UTC | #3
From: David Miller <davem@redhat.com>
Date: Mon, 09 Mar 2015 15:51:37 -0400 (EDT)

> From: Herbert Xu <herbert@gondor.apana.org.au>
> Date: Mon, 9 Mar 2015 20:58:33 +1100
> 
>> Currently hash_rnd is a parameter that users can set.  However,
>> no existing users set this parameter.  It is also something that
>> people are unlikely to want to set directly since it's just a
>> random number.
>> 
>> In preparation for allowing the reseeding/rehashing of rhashtable,
>> this patch moves hash_rnd into bucket_table so that it's now an
>> internal state rather than a parameter.
>> 
>> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> 
> Thomas is right, you're going to have to move the hash_rnd initialization
> to where we allocate the bucket tables, otherwise the first shrink/expand
> will set it to zero.

My bad, Daniel pointed this out, not Thomas.

Sorry about that.
Herbert Xu March 9, 2015, 10:21 p.m. UTC | #4
On Mon, Mar 09, 2015 at 05:26:52PM +0100, Daniel Borkmann wrote:
> 
> I believe the original purpose was to have reproducibility, but
> yeah, there are no users of it currently.

If anyone ever needed that, we could always provide a way to set
the random value, through a rehash.
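
Purely as an illustration of that idea; rhashtable_reseed() and
rhashtable_rehash() are hypothetical names here, not existing APIs:

/* Hypothetical sketch: reseed by rehashing into a same-sized table
 * whose seed is chosen by the caller rather than get_random_bytes().
 * rhashtable_rehash() stands in for "move every entry into new_tbl
 * and publish it". */
int rhashtable_reseed(struct rhashtable *ht, u32 seed)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	struct bucket_table *new_tbl = bucket_table_alloc(ht, old_tbl->size);

	if (new_tbl == NULL)
		return -ENOMEM;

	new_tbl->hash_rnd = seed;
	return rhashtable_rehash(ht, new_tbl);
}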
 
> Have you tested this patch? ;-)

Clearly not :)

> If I see this correctly, the first time you shrink or expand, your hash_rnd
> from then onwards will constantly be 0.
> 
> You need to copy the old hash_rnd over into the newly allocated tbl.
> So, bucket_table_alloc() needs a change as well. Of course, with
> rehashing, the get_random_bytes() would move directly there.

Yes, it should copy it over.  I've fixed this in my tree now and it
actually works!

Cheers,
Herbert Xu March 9, 2015, 10:26 p.m. UTC | #5
Hi Dave:

These two patches implement arbitrary rehashing in rhashtable.  The
idea is based on the hashtable in net/bridge/br_multicast.c plus
the suggestion by Josh Triplett to avoid using two linked lists.

I have tested it using netlink but obviously more testing, especially
with netfilter, would be welcome.

After this I would like to explore the idea of allowing multiple
rehashes in parallel.  This would allow us to not have to rehash
synchronously when the table gets too big and the previous rehash
hasn't finished.

Cheers,
Josh Triplett March 9, 2015, 10:32 p.m. UTC | #6
On Tue, Mar 10, 2015 at 09:26:31AM +1100, Herbert Xu wrote:
> Hi Dave:
> 
> These two patches implement arbitrary rehashing in rhashtable.  The
> idea is based on the hashtable in net/bridge/br_multicast.c plus
> the suggestion by Josh Triplett to avoid using two linked lists.

Awesome.

> I have tested it using netlink but obviously more testing, especially
> with netfilter, would be welcome.
> 
> After this I would like to explore the idea of allowing multiple
> rehashes in parallel.  This would allow us to not have to rehash
> synchronously when the table gets too big and the previous rehash
> hasn't finished.

Aieee.

- Josh Triplett
Thomas Graf March 10, 2015, 6:20 p.m. UTC | #7
On 03/10/15 at 09:26am, Herbert Xu wrote:
> After this I would like to explore the idea of allowing multiple
> rehashes in parallel.  This would allow us to not have to rehash
> synchronously when the table gets too big and the previous rehash
> hasn't finished.

Another interesting idea was to allow growing faster than 2x. It
actually became easier with this generic rehashing code because
the number of bucket locks can grow quicker than 2x again.
Thomas Graf March 12, 2015, 4:46 p.m. UTC | #8
On 03/10/15 at 09:26am, Herbert Xu wrote:
> I have tested it using netlink but obviously more testing, especially
> with netfilter, would be welcome.

I ran the attached bind_netlink test written by Ying Xue and it
currently crashes with:

[  108.693908] BUG: unable to handle kernel paging request at fffffffffffffe68

I'm running bind_netlink like this:

while true; do
    ./bind_netlink 9000 324234324&
    ./bind_netlink 9000 888448422&
    ./bind_netlink 9000 324
done

I tested with the synchronize_rcu() fix on top as well and that
didn't help, so it must be something else.

#include <sys/types.h>
#include <sys/socket.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <netinet/in.h>
#include <linux/netlink.h>

void diep(char *err)
{
	perror(err);
	exit(1);
}

int main(int argc, char *argv[])
{
	int i;
	int *fds;
	int num_ports;
	struct sockaddr_nl local;

	if (argc < 3) {
		fprintf(stderr, "Usage: %s <PORTS> <SEED>\n", argv[0]);
		exit(1);
	}

	srand(strtoul(argv[2], NULL, 0));

	num_ports = atoi(argv[1]);

	printf("Create %d ports\n", num_ports);

	fds = malloc(sizeof(int) * num_ports);
	for (i = 0; i < num_ports; i++) {
		/* socket() returns -1 on error, not 0 */
		if ((fds[i] = socket(PF_NETLINK, SOCK_RAW, 0)) < 0)
			diep("socket");

		/* bind each socket to a random port ID to exercise
		 * the netlink socket hash table */
		memset(&local, 0, sizeof(local));
		local.nl_family = AF_NETLINK;
		local.nl_pid = rand();
		local.nl_groups = 0;
		if (bind(fds[i], (struct sockaddr *)&local, sizeof(local)) != 0)
			diep("bind");

		if (!((i + 1) % 1000))
			printf("Created %d ports\n", i + 1);
	}
	printf("Ports successfully created, terminating\n");

	return 0;
}

Patch

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index d438eeb..5ef8ea5 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -49,12 +49,14 @@  struct rhash_head {
 /**
  * struct bucket_table - Table of hash buckets
  * @size: Number of hash buckets
+ * @hash_rnd: Random seed to fold into hash
  * @locks_mask: Mask to apply before accessing locks[]
  * @locks: Array of spinlocks protecting individual buckets
  * @buckets: size * hash buckets
  */
 struct bucket_table {
 	size_t			size;
+	u32			hash_rnd;
 	unsigned int		locks_mask;
 	spinlock_t		*locks;
 
@@ -72,7 +74,6 @@  struct rhashtable;
  * @key_len: Length of key
  * @key_offset: Offset of key in struct to be hashed
  * @head_offset: Offset of rhash_head in struct to be hashed
- * @hash_rnd: Seed to use while hashing
  * @max_shift: Maximum number of shifts while expanding
  * @min_shift: Minimum number of shifts while shrinking
  * @nulls_base: Base value to generate nulls marker
@@ -85,7 +86,6 @@  struct rhashtable_params {
 	size_t			key_len;
 	size_t			key_offset;
 	size_t			head_offset;
-	u32			hash_rnd;
 	size_t			max_shift;
 	size_t			min_shift;
 	u32			nulls_base;
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index b5344ef..4ce267f 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -66,25 +66,28 @@  static u32 rht_bucket_index(const struct bucket_table *tbl, u32 hash)
 	return hash & (tbl->size - 1);
 }
 
-static u32 obj_raw_hashfn(const struct rhashtable *ht, const void *ptr)
+static u32 obj_raw_hashfn(struct rhashtable *ht, const void *ptr)
 {
+	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
 	u32 hash;
 
 	if (unlikely(!ht->p.key_len))
-		hash = ht->p.obj_hashfn(ptr, ht->p.hash_rnd);
+		hash = ht->p.obj_hashfn(ptr, tbl->hash_rnd);
 	else
 		hash = ht->p.hashfn(ptr + ht->p.key_offset, ht->p.key_len,
-				    ht->p.hash_rnd);
+				    tbl->hash_rnd);
 
 	return hash >> HASH_RESERVED_SPACE;
 }
 
 static u32 key_hashfn(struct rhashtable *ht, const void *key, u32 len)
 {
-	return ht->p.hashfn(key, len, ht->p.hash_rnd) >> HASH_RESERVED_SPACE;
+	struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+
+	return ht->p.hashfn(key, len, tbl->hash_rnd) >> HASH_RESERVED_SPACE;
 }
 
-static u32 head_hashfn(const struct rhashtable *ht,
+static u32 head_hashfn(struct rhashtable *ht,
 		       const struct bucket_table *tbl,
 		       const struct rhash_head *he)
 {
@@ -92,7 +95,7 @@  static u32 head_hashfn(const struct rhashtable *ht,
 }
 
 #ifdef CONFIG_PROVE_LOCKING
-static void debug_dump_buckets(const struct rhashtable *ht,
+static void debug_dump_buckets(struct rhashtable *ht,
 			       const struct bucket_table *tbl)
 {
 	struct rhash_head *he;
@@ -1099,14 +1102,13 @@  int rhashtable_init(struct rhashtable *ht, struct rhashtable_params *params)
 	if (tbl == NULL)
 		return -ENOMEM;
 
+	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
+
 	atomic_set(&ht->nelems, 0);
 	atomic_set(&ht->shift, ilog2(tbl->size));
 	RCU_INIT_POINTER(ht->tbl, tbl);
 	RCU_INIT_POINTER(ht->future_tbl, tbl);
 
-	if (!ht->p.hash_rnd)
-		get_random_bytes(&ht->p.hash_rnd, sizeof(ht->p.hash_rnd));
-
 	INIT_WORK(&ht->run_work, rht_deferred_worker);
 
 	return 0;