diff mbox

[v3,net-next,1/5] net: Get skb hash over flow_keys structure

Message ID 1431444182-3935169-2-git-send-email-tom@herbertland.com
State Changes Requested, archived
Delegated to: David Miller
Headers show

Commit Message

Tom Herbert May 12, 2015, 3:22 p.m. UTC
This patch changes flow hashing to use jhash2 over the flow_keys
structure instead just doing jhash_3words over src, dst, and ports.
This method will allow us take more input into the hashing function
so that we can include full IPv6 addresses, VLAN, flow labels etc.
without needing to resort to xor'ing which makes for a poor hash.

Signed-off-by: Tom Herbert <tom@herbertland.com>
---
 include/net/flow_keys.h   | 13 ++++++++++---
 net/core/flow_dissector.c | 21 +++++++++++++++------
 2 files changed, 25 insertions(+), 9 deletions(-)

Comments

David Miller May 13, 2015, 7:30 p.m. UTC | #1
From: Tom Herbert <tom@herbertland.com>
Date: Tue, 12 May 2015 08:22:58 -0700

> @@ -15,6 +15,13 @@
>   * All the members, except thoff, are in network byte order.
>   */
>  struct flow_keys {
> +	u16	thoff;
> +	u16	padding1;
> +#define FLOW_KEYS_HASH_START_FIELD	n_proto
> +	__be16	n_proto;
> +	u8	ip_proto;
> +	u8	padding;
> +

This padding works if everyone consistently zero initializes the whole
key structure, but for whatever reason (performance, unintentional
oversight, etc.) not all paths do.

So, for example, inet_set_txhash() is going to have random crap in
keys.padding, so the hashes computed are not stable for a given flow
key tuple.

That's just the first code path I found with this issue, there are
probably several others.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Herbert May 13, 2015, 7:37 p.m. UTC | #2
On Wed, May 13, 2015 at 3:30 PM, David Miller <davem@davemloft.net> wrote:
> From: Tom Herbert <tom@herbertland.com>
> Date: Tue, 12 May 2015 08:22:58 -0700
>
>> @@ -15,6 +15,13 @@
>>   * All the members, except thoff, are in network byte order.
>>   */
>>  struct flow_keys {
>> +     u16     thoff;
>> +     u16     padding1;
>> +#define FLOW_KEYS_HASH_START_FIELD   n_proto
>> +     __be16  n_proto;
>> +     u8      ip_proto;
>> +     u8      padding;
>> +
>
> This padding works if everyone consistently zero initializes the whole
> key structure, but for whatever reason (performance, unintentional
> oversight, etc.) not all paths do.
>
> So, for example, inet_set_txhash() is going to have random crap in
> keys.padding, so the hashes computed are not stable for a given flow
> key tuple.
>
> That's just the first code path I found with this issue, there are
> probably several others.

memset zero is in the second patch for inet_set_txhash and
ip6_set_txhash. I can respin so those are in the first patch.

Tom
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller May 13, 2015, 8:02 p.m. UTC | #3
From: Tom Herbert <tom@herbertland.com>
Date: Wed, 13 May 2015 15:37:50 -0400

> On Wed, May 13, 2015 at 3:30 PM, David Miller <davem@davemloft.net> wrote:
>> From: Tom Herbert <tom@herbertland.com>
>> Date: Tue, 12 May 2015 08:22:58 -0700
>>
>>> @@ -15,6 +15,13 @@
>>>   * All the members, except thoff, are in network byte order.
>>>   */
>>>  struct flow_keys {
>>> +     u16     thoff;
>>> +     u16     padding1;
>>> +#define FLOW_KEYS_HASH_START_FIELD   n_proto
>>> +     __be16  n_proto;
>>> +     u8      ip_proto;
>>> +     u8      padding;
>>> +
>>
>> This padding works if everyone consistently zero initializes the whole
>> key structure, but for whatever reason (performance, unintentional
>> oversight, etc.) not all paths do.
>>
>> So, for example, inet_set_txhash() is going to have random crap in
>> keys.padding, so the hashes computed are not stable for a given flow
>> key tuple.
>>
>> That's just the first code path I found with this issue, there are
>> probably several others.
> 
> memset zero is in the second patch for inet_set_txhash and
> ip6_set_txhash. I can respin so those are in the first patch.

Yes, for bisectability you should probably do that.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h
index 6d6ef62..8a15f17 100644
--- a/include/net/flow_keys.h
+++ b/include/net/flow_keys.h
@@ -15,6 +15,13 @@ 
  * All the members, except thoff, are in network byte order.
  */
 struct flow_keys {
+	u16	thoff;
+	u16	padding1;
+#define FLOW_KEYS_HASH_START_FIELD	n_proto
+	__be16	n_proto;
+	u8	ip_proto;
+	u8	padding;
+
 	/* (src,dst) must be grouped, in the same way than in IP header */
 	__be32 src;
 	__be32 dst;
@@ -22,11 +29,11 @@  struct flow_keys {
 		__be32 ports;
 		__be16 port16[2];
 	};
-	u16	thoff;
-	__be16	n_proto;
-	u8	ip_proto;
 };
 
+#define FLOW_KEYS_HASH_OFFSET		\
+	offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD)
+
 bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
 			void *data, __be16 proto, int nhoff, int hlen);
 static inline bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d3acc4d..7007c18 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -267,9 +267,20 @@  static __always_inline void __flow_hash_secret_init(void)
 	net_get_random_once(&hashrnd, sizeof(hashrnd));
 }
 
-static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c, u32 keyval)
+static __always_inline u32 __flow_hash_words(u32 *words, u32 length, u32 keyval)
 {
-	return jhash_3words(a, b, c, keyval);
+	return jhash2(words, length, keyval);
+}
+
+static inline void *flow_keys_hash_start(struct flow_keys *flow)
+{
+	BUILD_BUG_ON(FLOW_KEYS_HASH_OFFSET % 4);
+	return (void *)flow + FLOW_KEYS_HASH_OFFSET;
+}
+
+static inline size_t flow_keys_hash_length(struct flow_keys *flow)
+{
+	return (sizeof(*flow) - FLOW_KEYS_HASH_OFFSET) / sizeof(u32);
 }
 
 static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
@@ -284,10 +295,8 @@  static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
 		swap(keys->port16[0], keys->port16[1]);
 	}
 
-	hash = __flow_hash_3words((__force u32)keys->dst,
-				  (__force u32)keys->src,
-				  (__force u32)keys->ports,
-				  keyval);
+	hash = __flow_hash_words((u32 *)flow_keys_hash_start(keys),
+				 flow_keys_hash_length(keys), keyval);
 	if (!hash)
 		hash = 1;