
[net-next,v2,4/4] bonding: balance ICMP echoes in layer3+4 mode

Message ID 20191029135053.10055-5-mcroce@redhat.com
State Accepted
Delegated to: David Miller
Series ICMP flow improvements

Commit Message

Matteo Croce Oct. 29, 2019, 1:50 p.m. UTC
The bonding driver uses the L4 ports to balance flows between slaves. As the
ICMP protocol has no ports, those packets are all sent to the same device:

    # tcpdump -qltnni veth0 ip |sed 's/^/0: /' &
    # tcpdump -qltnni veth1 ip |sed 's/^/1: /' &
    # ping -qc1 192.168.0.2
    1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 315, seq 1, length 64
    1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 315, seq 1, length 64
    # ping -qc1 192.168.0.2
    1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 316, seq 1, length 64
    1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 316, seq 1, length 64
    # ping -qc1 192.168.0.2
    1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 317, seq 1, length 64
    1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 317, seq 1, length 64

But some ICMP packets have an Identifier field which is used to match packets
within a session. Let's use this value in the hash function to balance these
packets between bond slaves:

    # ping -qc1 192.168.0.2
    0: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 303, seq 1, length 64
    0: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 303, seq 1, length 64
    # ping -qc1 192.168.0.2
    1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 304, seq 1, length 64
    1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 304, seq 1, length 64

Also, let's use a flow_dissector_key which defines FLOW_DISSECTOR_KEY_ICMP,
so we can balance pings encapsulated in a tunnel when using mode encap3+4:

    # ping -q 192.168.1.2 -c1
    0: IP 192.168.0.1 > 192.168.0.2: GREv0, length 102: IP 192.168.1.1 > 192.168.1.2: ICMP echo request, id 585, seq 1, length 64
    0: IP 192.168.0.2 > 192.168.0.1: GREv0, length 102: IP 192.168.1.2 > 192.168.1.1: ICMP echo reply, id 585, seq 1, length 64
    # ping -q 192.168.1.2 -c1
    1: IP 192.168.0.1 > 192.168.0.2: GREv0, length 102: IP 192.168.1.1 > 192.168.1.2: ICMP echo request, id 586, seq 1, length 64
    1: IP 192.168.0.2 > 192.168.0.1: GREv0, length 102: IP 192.168.1.2 > 192.168.1.1: ICMP echo reply, id 586, seq 1, length 64

Signed-off-by: Matteo Croce <mcroce@redhat.com>
---
 drivers/net/bonding/bond_main.c | 77 ++++++++++++++++++++++++++++++---
 1 file changed, 70 insertions(+), 7 deletions(-)
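
For readers skimming the thread, here is the idea of the series reduced to a
standalone sketch. This is editor-added illustration, not the driver code: the
real bond_xmit_hash() change at the bottom of this page folds the whole
dissected ICMP key from struct flow_keys, while the sketch below only shows
the core point, namely that the Echo Identifier stands in for the L4 ports
when a flow has none.

    /* Editor's sketch, not the actual driver code. The struct is
     * illustrative; the kernel uses struct flow_keys filled in by the
     * flow dissector.
     */
    #include <stdint.h>

    struct flow_sketch {
            uint32_t saddr;
            uint32_t daddr;
            uint16_t sport, dport;  /* zero for ICMP */
            uint16_t icmp_id;       /* Echo Identifier, zero otherwise */
    };

    static uint32_t layer34_hash(const struct flow_sketch *fk)
    {
            /* An echo reply carries the request's Identifier, so in this
             * sketch both directions of a ping session hash alike, while
             * different sessions (different ids) can land on different
             * slaves.
             */
            uint32_t l4 = fk->icmp_id ? fk->icmp_id
                          : ((uint32_t)fk->sport << 16 | fk->dport);
            uint32_t hash = l4 ^ fk->saddr ^ fk->daddr;

            hash ^= hash >> 16;
            hash ^= hash >> 8;
            return hash;
    }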

Comments

Nikolay Aleksandrov Oct. 29, 2019, 6:35 p.m. UTC | #1
On 29/10/2019 15:50, Matteo Croce wrote:
> The bonding uses the L4 ports to balance flows between slaves. As the ICMP
> protocol has no ports, those packets are sent all to the same device:
> 
>     # tcpdump -qltnni veth0 ip |sed 's/^/0: /' &
>     # tcpdump -qltnni veth1 ip |sed 's/^/1: /' &
>     # ping -qc1 192.168.0.2
>     1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 315, seq 1, length 64
>     1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 315, seq 1, length 64
>     # ping -qc1 192.168.0.2
>     1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 316, seq 1, length 64
>     1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 316, seq 1, length 64
>     # ping -qc1 192.168.0.2
>     1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 317, seq 1, length 64
>     1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 317, seq 1, length 64
> 
> But some ICMP packets have an Identifier field which is
> used to match packets within sessions, let's use this value in the hash
> function to balance these packets between bond slaves:
> 
>     # ping -qc1 192.168.0.2
>     0: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 303, seq 1, length 64
>     0: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 303, seq 1, length 64
>     # ping -qc1 192.168.0.2
>     1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 304, seq 1, length 64
>     1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 304, seq 1, length 64
> 
> Aso, let's use a flow_dissector_key which defines FLOW_DISSECTOR_KEY_ICMP,

Also ?

> so we can balance pings encapsulated in a tunnel when using mode encap3+4:
> 
>     # ping -q 192.168.1.2 -c1
>     0: IP 192.168.0.1 > 192.168.0.2: GREv0, length 102: IP 192.168.1.1 > 192.168.1.2: ICMP echo request, id 585, seq 1, length 64
>     0: IP 192.168.0.2 > 192.168.0.1: GREv0, length 102: IP 192.168.1.2 > 192.168.1.1: ICMP echo reply, id 585, seq 1, length 64
>     # ping -q 192.168.1.2 -c1
>     1: IP 192.168.0.1 > 192.168.0.2: GREv0, length 102: IP 192.168.1.1 > 192.168.1.2: ICMP echo request, id 586, seq 1, length 64
>     1: IP 192.168.0.2 > 192.168.0.1: GREv0, length 102: IP 192.168.1.2 > 192.168.1.1: ICMP echo reply, id 586, seq 1, length 64
> 
> Signed-off-by: Matteo Croce <mcroce@redhat.com>
> ---
>  drivers/net/bonding/bond_main.c | 77 ++++++++++++++++++++++++++++++---
>  1 file changed, 70 insertions(+), 7 deletions(-)
> 

Hi Matteo,
Wouldn't it be more useful and simpler to use some field to choose the slave (override the hash
completely) in a deterministic way from user-space ?
For example the mark can be interpreted as a slave id in the bonding (should be
optional, to avoid breaking existing setups). ping already supports -m and
anything else can set it, this way it can be used to do monitoring for a specific
slave with any protocol and would be a much simpler change.
User-space can then implement any logic for the monitoring case and as a minor bonus
can monitor the slaves in parallel. And the opposite as well - if people don't want
these balanced for some reason, they wouldn't enable it.

Or maybe I've misunderstood why this change is needed. :)
It would actually be nice to include the use-case which brought this on
in the commit message.

Cheers,
 Nik
Nikolay Aleksandrov Oct. 29, 2019, 6:41 p.m. UTC | #2
On 29/10/2019 20:35, Nikolay Aleksandrov wrote:
> On 29/10/2019 15:50, Matteo Croce wrote:
>> The bonding uses the L4 ports to balance flows between slaves. As the ICMP
>> protocol has no ports, those packets are sent all to the same device:
>>
>>     # tcpdump -qltnni veth0 ip |sed 's/^/0: /' &
>>     # tcpdump -qltnni veth1 ip |sed 's/^/1: /' &
>>     # ping -qc1 192.168.0.2
>>     1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 315, seq 1, length 64
>>     1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 315, seq 1, length 64
>>     # ping -qc1 192.168.0.2
>>     1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 316, seq 1, length 64
>>     1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 316, seq 1, length 64
>>     # ping -qc1 192.168.0.2
>>     1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 317, seq 1, length 64
>>     1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 317, seq 1, length 64
>>
>> But some ICMP packets have an Identifier field which is
>> used to match packets within sessions, let's use this value in the hash
>> function to balance these packets between bond slaves:
>>
>>     # ping -qc1 192.168.0.2
>>     0: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 303, seq 1, length 64
>>     0: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 303, seq 1, length 64
>>     # ping -qc1 192.168.0.2
>>     1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 304, seq 1, length 64
>>     1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 304, seq 1, length 64
>>
>> Aso, let's use a flow_dissector_key which defines FLOW_DISSECTOR_KEY_ICMP,
> 
> Also ?
> 
>> so we can balance pings encapsulated in a tunnel when using mode encap3+4:
>>
>>     # ping -q 192.168.1.2 -c1
>>     0: IP 192.168.0.1 > 192.168.0.2: GREv0, length 102: IP 192.168.1.1 > 192.168.1.2: ICMP echo request, id 585, seq 1, length 64
>>     0: IP 192.168.0.2 > 192.168.0.1: GREv0, length 102: IP 192.168.1.2 > 192.168.1.1: ICMP echo reply, id 585, seq 1, length 64
>>     # ping -q 192.168.1.2 -c1
>>     1: IP 192.168.0.1 > 192.168.0.2: GREv0, length 102: IP 192.168.1.1 > 192.168.1.2: ICMP echo request, id 586, seq 1, length 64
>>     1: IP 192.168.0.2 > 192.168.0.1: GREv0, length 102: IP 192.168.1.2 > 192.168.1.1: ICMP echo reply, id 586, seq 1, length 64
>>
>> Signed-off-by: Matteo Croce <mcroce@redhat.com>
>> ---
>>  drivers/net/bonding/bond_main.c | 77 ++++++++++++++++++++++++++++++---
>>  1 file changed, 70 insertions(+), 7 deletions(-)
>>
> 
> Hi Matteo,
> Wouldn't it be more useful and simpler to use some field to choose the slave (override the hash
> completely) in a deterministic way from user-space ?
> For example the mark can be interpreted as a slave id in the bonding (should be
> optional, to avoid breaking existing setups). ping already supports -m and
> anything else can set it, this way it can be used to do monitoring for a specific
> slave with any protocol and would be a much simpler change.
> User-space can then implement any logic for the monitoring case and as a minor bonus
> can monitor the slaves in parallel. And the opposite as well - if people don't want
> these balanced for some reason, they wouldn't enable it.
> 

Ooh I just noticed you'd like to balance replies as well. Nevermind

> Or maybe I've misunderstood why this change is needed. :)
> It would actually be nice to include the use-case which brought this on
> in the commit message.
> 
> Cheers,
>  Nik
>
Matteo Croce Oct. 29, 2019, 7:45 p.m. UTC | #3
On Tue, Oct 29, 2019 at 7:41 PM Nikolay Aleksandrov
<nikolay@cumulusnetworks.com> wrote:
>
> On 29/10/2019 20:35, Nikolay Aleksandrov wrote:
> > Hi Matteo,
> > Wouldn't it be more useful and simpler to use some field to choose the slave (override the hash
> > completely) in a deterministic way from user-space ?
> > For example the mark can be interpreted as a slave id in the bonding (should be
> > optional, to avoid breaking existing setups). ping already supports -m and
> > anything else can set it, this way it can be used to do monitoring for a specific
> > slave with any protocol and would be a much simpler change.
> > User-space can then implement any logic for the monitoring case and as a minor bonus
> > can monitor the slaves in parallel. And the opposite as well - if people don't want
> > these balanced for some reason, they wouldn't enable it.
> >
>
> Ooh I just noticed you'd like to balance replies as well. Nevermind
>

Also, the bonding could be in a router in the middle so no way to read the mark.
Nikolay Aleksandrov Oct. 29, 2019, 8:07 p.m. UTC | #4
On 29/10/2019 21:45, Matteo Croce wrote:
> On Tue, Oct 29, 2019 at 7:41 PM Nikolay Aleksandrov
> <nikolay@cumulusnetworks.com> wrote:
>>
>> On 29/10/2019 20:35, Nikolay Aleksandrov wrote:
>>> Hi Matteo,
>>> Wouldn't it be more useful and simpler to use some field to choose the slave (override the hash
>>> completely) in a deterministic way from user-space ?
>>> For example the mark can be interpreted as a slave id in the bonding (should be
>>> optional, to avoid breaking existing setups). ping already supports -m and
>>> anything else can set it, this way it can be used to do monitoring for a specific
>>> slave with any protocol and would be a much simpler change.
>>> User-space can then implement any logic for the monitoring case and as a minor bonus
>>> can monitor the slaves in parallel. And the opposite as well - if people don't want
>>> these balanced for some reason, they wouldn't enable it.
>>>
>>
>> Ooh I just noticed you'd like to balance replies as well. Nevermind
>>
> 
> Also, the bonding could be in a router in the middle so no way to read the mark.
> 

Yeah, of course. I was just thinking from the host monitoring POV as I thought
that was the initial intent (reading the last set's discussion).

Anyway the patch looks good to me,
Reviewed-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Eric Dumazet Oct. 29, 2019, 9:03 p.m. UTC | #5
On 10/29/19 11:35 AM, Nikolay Aleksandrov wrote:

> Hi Matteo,
> Wouldn't it be more useful and simpler to use some field to choose the slave (override the hash
> completely) in a deterministic way from user-space ?
> For example the mark can be interpreted as a slave id in the bonding (should be
> optional, to avoid breaking existing setups). ping already supports -m and
> anything else can set it, this way it can be used to do monitoring for a specific
> slave with any protocol and would be a much simpler change.
> User-space can then implement any logic for the monitoring case and as a minor bonus
> can monitor the slaves in parallel. And the opposite as well - if people don't want
> these balanced for some reason, they wouldn't enable it.
> 

I kind of agree giving user more control. But I do not believe we need to use the mark
(this might be already used by other layers)

TCP uses sk->sk_hash to feed skb->hash.

Anything using skb_set_owner_w() is also using sk->sk_hash if set.

So presumably we could add a generic SO_TXHASH socket option to let user space
read/set this field.
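
A rough sketch of how user space might use such an option, assuming the
SO_TXHASH name and value from Eric's draft patch later in this thread; the
option is only a proposal here, not an existing sockopt, so the helper below
is hypothetical.

    /* Sketch only: SO_TXHASH is a proposed option in this thread, not a
     * mainline sockopt; the value 69 is taken from the draft patch below.
     */
    #include <stdio.h>
    #include <sys/socket.h>

    #ifndef SO_TXHASH
    #define SO_TXHASH 69
    #endif

    static int pin_txhash(int fd, unsigned int hash)
    {
            /* Pin skb->hash for this socket so bonding/ECMP keeps using
             * the same transmit path for all of its packets.
             */
            if (setsockopt(fd, SOL_SOCKET, SO_TXHASH, &hash, sizeof(hash)) < 0) {
                    perror("setsockopt(SO_TXHASH)");
                    return -1;
            }
            return 0;
    }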
Nikolay Aleksandrov Oct. 29, 2019, 9:50 p.m. UTC | #6
On 29/10/2019 23:03, Eric Dumazet wrote:
> 
> 
> On 10/29/19 11:35 AM, Nikolay Aleksandrov wrote:
> 
>> Hi Matteo,
>> Wouldn't it be more useful and simpler to use some field to choose the slave (override the hash
>> completely) in a deterministic way from user-space ?
>> For example the mark can be interpreted as a slave id in the bonding (should be
>> optional, to avoid breaking existing setups). ping already supports -m and
>> anything else can set it, this way it can be used to do monitoring for a specific
>> slave with any protocol and would be a much simpler change.
>> User-space can then implement any logic for the monitoring case and as a minor bonus
>> can monitor the slaves in parallel. And the opposite as well - if people don't want
>> these balanced for some reason, they wouldn't enable it.
>>
> 
> I kind of agree giving user more control. But I do not believe we need to use the mark
> (this might be already used by other layers)
> 
> TCP uses sk->sk_hash to feed skb->hash.
> 
> Anything using skb_set_owner_w() is also using sk->sk_hash if set.
> 
> So presumably we could add a generic SO_TXHASH socket option to let user space
> read/set this field.
> 

Right, I was just giving it as an example. Your suggestion sounds much better and
wouldn't interfere with other layers, plus we already use skb->hash in bond_xmit_hash()
and skb_set_owner_w() sets l4_hash if txhash is present which is perfect.

One thing - how do we deal with sk_rethink_txhash() ? I guess we'll need some way to
signal that the user specified the txhash and it is not to be recomputed ?
That can also be used to avoid the connect txhash set as well if SO_TXHASH was set prior
to the connect. It's quite late here, I'll look into it more tomorrow. :)

Thanks,
 Nik
Matteo Croce Oct. 29, 2019, 11:03 p.m. UTC | #7
On Tue, Oct 29, 2019 at 10:03 PM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
>
> On 10/29/19 11:35 AM, Nikolay Aleksandrov wrote:
>
> > Hi Matteo,
> > Wouldn't it be more useful and simpler to use some field to choose the slave (override the hash
> > completely) in a deterministic way from user-space ?
> > For example the mark can be interpreted as a slave id in the bonding (should be
> > optional, to avoid breaking existing setups). ping already supports -m and
> > anything else can set it, this way it can be used to do monitoring for a specific
> > slave with any protocol and would be a much simpler change.
> > User-space can then implement any logic for the monitoring case and as a minor bonus
> > can monitor the slaves in parallel. And the opposite as well - if people don't want
> > these balanced for some reason, they wouldn't enable it.
> >
>
> I kind of agree giving user more control. But I do not believe we need to use the mark
> (this might be already used by other layers)
>
> TCP uses sk->sk_hash to feed skb->hash.
>
> Anything using skb_set_owner_w() is also using sk->sk_hash if set.
>
> So presumably we could add a generic SO_TXHASH socket option to let user space
> read/set this field.
>

Hi Eric,

this would work for locally generated echoes, but what about forwarded packets?
The point behind my changeset is to provide consistent results within
a session by using the same path for request and response,
while avoiding all sessions flowing to the same path.
This should resemble what happens with TCP and UDP: different
connections, different ports, probably a different path. And by doing
this in the flow dissector, other applications could benefit from it.

Also, this should somewhat balance the traffic of a router forwarding
those packets. Maybe it's not much in percentage terms, but on some
gateways it can be a considerable volume.

Regards,
Eric Dumazet Oct. 29, 2019, 11:04 p.m. UTC | #8
On 10/29/19 2:50 PM, Nikolay Aleksandrov wrote:

> Right, I was just giving it as an example. Your suggestion sounds much better and
> wouldn't interfere with other layers, plus we already use skb->hash in bond_xmit_hash()
> and skb_set_owner_w() sets l4_hash if txhash is present which is perfect.
> 
> One thing - how do we deal with sk_rethink_txhash() ? I guess we'll need some way to
> signal that the user specified the txhash and it is not to be recomputed ?
> That can also be used to avoid the connect txhash set as well if SO_TXHASH was set prior
> to the connect. It's quite late here, I'll look into it more tomorrow. :)

I guess that we have something similar with SO_RCVBUF/SO_SNDBUF

autotuning is disabled when/if they are used :

 SOCK_RCVBUF_LOCK & SOCK_SNDBUF_LOCK

We could add a SOCK_TXHASH_LOCK so that sk_rethink_txhash() does nothing if
user forced a TXHASH value.

Something like the following (probably not complete) patch.

diff --git a/include/net/sock.h b/include/net/sock.h
index 380312cc67a9d9ee8720eb2db82b1f7f8a5615ab..a8882738710eaa9d9d629e1207837a798401a594 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1354,6 +1354,7 @@ static inline int __sk_prot_rehash(struct sock *sk)
 #define SOCK_RCVBUF_LOCK       2
 #define SOCK_BINDADDR_LOCK     4
 #define SOCK_BINDPORT_LOCK     8
+#define SOCK_TXHASH_LOCK       16
 
 struct socket_alloc {
        struct socket socket;
@@ -1852,7 +1853,8 @@ static inline u32 net_tx_rndhash(void)
 
 static inline void sk_set_txhash(struct sock *sk)
 {
-       sk->sk_txhash = net_tx_rndhash();
+       if (!(sk->sk_userlocks & SOCK_TXHASH_LOCK))
+               sk->sk_txhash = net_tx_rndhash();
 }
 
 static inline void sk_rethink_txhash(struct sock *sk)
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 77f7c1638eb1ce7d3e143bbffd60056e472b1122..998be6ee7991de3a76d4ad33df3a38dbe791eae8 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -118,6 +118,7 @@
 #define SO_SNDTIMEO_NEW         67
 
 #define SO_DETACH_REUSEPORT_BPF 68
+#define SO_TXHASH              69
 
 #if !defined(__KERNEL__)
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 997b352c2a72ee39f00b102a553ac1191202b74f..85b85dffd462bc3b497e0432100ff24b759832e0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -770,6 +770,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
        case SO_BROADCAST:
                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                break;
+       case SO_TXHASH:
+               sk->sk_txhash = val;
+               sk->sk_userlocks |= SOCK_TXHASH_LOCK;
+               break;
        case SO_SNDBUF:
                /* Don't error on this BSD doesn't and if you think
                 * about it this is right. Otherwise apps have to
@@ -1249,6 +1253,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
                v.val = sock_flag(sk, SOCK_BROADCAST);
                break;
 
+       case SO_TXHASH:
+               v.val = sk->sk_txhash;
+               break;
+
        case SO_SNDBUF:
                v.val = sk->sk_sndbuf;
                break;
Eric Dumazet Oct. 29, 2019, 11:14 p.m. UTC | #9
On 10/29/19 4:03 PM, Matteo Croce wrote:

> Hi Eric,
> 
> this would work for locally generated echoes, but what about forwarded packets?
> The point behind my changeset is to provide consistent results within
> a session by using the same path for request and response,
> but avoid all sessions flowing to the same path.
> This should resemble what happens with TCP and UDP: different
> connections, different port, probably a different path. And by doing
> this in the flow dissector, other applications could benefit it.

In principle it is fine, but I was not sure of overall impact of your change
on performance for 99.9% of packets that are not ICMP :)
Matteo Croce Oct. 29, 2019, 11:19 p.m. UTC | #10
On Wed, Oct 30, 2019 at 12:14 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
>
>
>
> On 10/29/19 4:03 PM, Matteo Croce wrote:
>
> > Hi Eric,
> >
> > this would work for locally generated echoes, but what about forwarded packets?
> > The point behind my changeset is to provide consistent results within
> > a session by using the same path for request and response,
> > but avoid all sessions flowing to the same path.
> > This should resemble what happens with TCP and UDP: different
> > connections, different port, probably a different path. And by doing
> > this in the flow dissector, other applications could benefit it.
>
> In principle it is fine, but I was not sure of overall impact of your change
> on performance for 99.9% of packets that are not ICMP :)
>

Good point. I didn't measure it (I will) but all the code additions
are under some if (proto == ICMP) or similar.
My guess is that performance shouldn't change for non ICMP traffic,
but I'm curious to test it.
Nikolay Aleksandrov Oct. 30, 2019, 12:48 p.m. UTC | #11
On 30/10/2019 01:04, Eric Dumazet wrote:
> 
> 
> On 10/29/19 2:50 PM, Nikolay Aleksandrov wrote:
> 
>> Right, I was just giving it as an example. Your suggestion sounds much better and
>> wouldn't interfere with other layers, plus we already use skb->hash in bond_xmit_hash()
>> and skb_set_owner_w() sets l4_hash if txhash is present which is perfect.
>>
>> One thing - how do we deal with sk_rethink_txhash() ? I guess we'll need some way to
>> signal that the user specified the txhash and it is not to be recomputed ?
>> That can also be used to avoid the connect txhash set as well if SO_TXHASH was set prior
>> to the connect. It's quite late here, I'll look into it more tomorrow. :)
> 
> I guess that we have something similar with SO_RCVBUF/SO_SNDBUF
> 
> autotuning is disabled when/if they are used :
> 
>  SOCK_RCVBUF_LOCK & SOCK_SNDBUF_LOCK
> 
> We could add a SOCK_TXHASH_LOCK so that sk_rethink_txhash() does nothing if
> user forced a TXHASH value.
> 
> Something like the following (probably not complete) patch.
> 

Actually I think it's ok. I had a similar change last night sans the userlocks.
I just built and tested a kernel with it successfully using the bonding.
The only case that doesn't seem to work is a raw socket without hdrincl, IIUC due to
the direct alloc_skb() (transhdrlen == 0) in ip_append_data().
Unless you have other concerns could you please submit it formally ?

> diff --git a/include/net/sock.h b/include/net/sock.h
> index 380312cc67a9d9ee8720eb2db82b1f7f8a5615ab..a8882738710eaa9d9d629e1207837a798401a594 100644
> --- a/include/net/sock.h
> +++ b/include/net/sock.h
> @@ -1354,6 +1354,7 @@ static inline int __sk_prot_rehash(struct sock *sk)
>  #define SOCK_RCVBUF_LOCK       2
>  #define SOCK_BINDADDR_LOCK     4
>  #define SOCK_BINDPORT_LOCK     8
> +#define SOCK_TXHASH_LOCK       16
>  
>  struct socket_alloc {
>         struct socket socket;
> @@ -1852,7 +1853,8 @@ static inline u32 net_tx_rndhash(void)
>  
>  static inline void sk_set_txhash(struct sock *sk)
>  {
> -       sk->sk_txhash = net_tx_rndhash();
> +       if (!(sk->sk_userlocks & SOCK_TXHASH_LOCK))
> +               sk->sk_txhash = net_tx_rndhash();
>  }
>  
>  static inline void sk_rethink_txhash(struct sock *sk)
> diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
> index 77f7c1638eb1ce7d3e143bbffd60056e472b1122..998be6ee7991de3a76d4ad33df3a38dbe791eae8 100644
> --- a/include/uapi/asm-generic/socket.h
> +++ b/include/uapi/asm-generic/socket.h
> @@ -118,6 +118,7 @@
>  #define SO_SNDTIMEO_NEW         67
>  
>  #define SO_DETACH_REUSEPORT_BPF 68
> +#define SO_TXHASH              69
>  
>  #if !defined(__KERNEL__)
>  
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 997b352c2a72ee39f00b102a553ac1191202b74f..85b85dffd462bc3b497e0432100ff24b759832e0 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -770,6 +770,10 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
>         case SO_BROADCAST:
>                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
>                 break;
> +       case SO_TXHASH:
> +               sk->sk_txhash = val;
> +               sk->sk_userlocks |= SOCK_TXHASH_LOCK;
> +               break;
>         case SO_SNDBUF:
>                 /* Don't error on this BSD doesn't and if you think
>                  * about it this is right. Otherwise apps have to
> @@ -1249,6 +1253,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
>                 v.val = sock_flag(sk, SOCK_BROADCAST);
>                 break;
>  
> +       case SO_TXHASH:
> +               v.val = sk->sk_txhash;
> +               break;
> +
>         case SO_SNDBUF:
>                 v.val = sk->sk_sndbuf;
>                 break;
>
Matteo Croce Oct. 31, 2019, 4:22 p.m. UTC | #12
On Wed, Oct 30, 2019 at 12:19 AM Matteo Croce <mcroce@redhat.com> wrote:
>
> On Wed, Oct 30, 2019 at 12:14 AM Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >
> >
> >
> > On 10/29/19 4:03 PM, Matteo Croce wrote:
> >
> > > Hi Eric,
> > >
> > > this would work for locally generated echoes, but what about forwarded packets?
> > > The point behind my changeset is to provide consistent results within
> > > a session by using the same path for request and response,
> > > but avoid all sessions flowing to the same path.
> > > This should resemble what happens with TCP and UDP: different
> > > connections, different port, probably a different path. And by doing
> > > this in the flow dissector, other applications could benefit it.
> >
> > In principle it is fine, but I was not sure of overall impact of your change
> > on performance for 99.9% of packets that are not ICMP :)
> >
>
> Good point. I didn't measure it (I will) but all the code additions
> are under some if (proto == ICMP) or similar.
> My guess is that performance shouldn't change for non ICMP traffic,
> but I'm curious to test it.
>

Indeed, if there is some impact, it's way below the measurement uncertainty.
I've bonded two veth pairs and added a tc drop to the peers, then
started mausezahn to generate UDP traffic.
Traffic is measured on the veth peers:

Stock 5.4-rc5:

rx: 261.5 Mbps 605.4 Kpps
rx: 261.2 Mbps 604.6 Kpps
rx: 261.6 Mbps 605.5 Kpps

patched:

rx: 261.4 Mbps 605.1 Kpps
rx: 261.1 Mbps 604.4 Kpps
rx: 260.3 Mbps 602.5 Kpps

perf top shows no significant change in bond* and skb_flow* functions

Regards,

Patch

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 21d8fcc83c9c..3e496e746cc6 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -200,6 +200,51 @@  atomic_t netpoll_block_tx = ATOMIC_INIT(0);
 
 unsigned int bond_net_id __read_mostly;
 
+static const struct flow_dissector_key flow_keys_bonding_keys[] = {
+	{
+		.key_id = FLOW_DISSECTOR_KEY_CONTROL,
+		.offset = offsetof(struct flow_keys, control),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_BASIC,
+		.offset = offsetof(struct flow_keys, basic),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+		.offset = offsetof(struct flow_keys, addrs.v4addrs),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+		.offset = offsetof(struct flow_keys, addrs.v6addrs),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_TIPC,
+		.offset = offsetof(struct flow_keys, addrs.tipckey),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_PORTS,
+		.offset = offsetof(struct flow_keys, ports),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_ICMP,
+		.offset = offsetof(struct flow_keys, icmp),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_VLAN,
+		.offset = offsetof(struct flow_keys, vlan),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL,
+		.offset = offsetof(struct flow_keys, tags),
+	},
+	{
+		.key_id = FLOW_DISSECTOR_KEY_GRE_KEYID,
+		.offset = offsetof(struct flow_keys, keyid),
+	},
+};
+
+static struct flow_dissector flow_keys_bonding __read_mostly;
+
 /*-------------------------- Forward declarations ---------------------------*/
 
 static int bond_init(struct net_device *bond_dev);
@@ -3263,10 +3308,14 @@  static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	const struct iphdr *iph;
 	int noff, proto = -1;
 
-	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23)
-		return skb_flow_dissect_flow_keys(skb, fk, 0);
+	if (bond->params.xmit_policy > BOND_XMIT_POLICY_LAYER23) {
+		memset(fk, 0, sizeof(*fk));
+		return __skb_flow_dissect(NULL, skb, &flow_keys_bonding,
+					  fk, NULL, 0, 0, 0, 0);
+	}
 
 	fk->ports.ports = 0;
+	memset(&fk->icmp, 0, sizeof(fk->icmp));
 	noff = skb_network_offset(skb);
 	if (skb->protocol == htons(ETH_P_IP)) {
 		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph))))
@@ -3286,8 +3335,14 @@  static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	} else {
 		return false;
 	}
-	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0)
-		fk->ports.ports = skb_flow_get_ports(skb, noff, proto);
+	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0) {
+		if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
+			skb_flow_get_icmp_tci(skb, &fk->icmp, skb->data,
+					      skb_transport_offset(skb),
+					      skb_headlen(skb));
+		else
+			fk->ports.ports = skb_flow_get_ports(skb, noff, proto);
+	}
 
 	return true;
 }
@@ -3314,10 +3369,14 @@  u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
 		return bond_eth_hash(skb);
 
 	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 ||
-	    bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23)
+	    bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) {
 		hash = bond_eth_hash(skb);
-	else
-		hash = (__force u32)flow.ports.ports;
+	} else {
+		if (flow.icmp.id)
+			memcpy(&hash, &flow.icmp, sizeof(hash));
+		else
+			memcpy(&hash, &flow.ports.ports, sizeof(hash));
+	}
 	hash ^= (__force u32)flow_get_u32_dst(&flow) ^
 		(__force u32)flow_get_u32_src(&flow);
 	hash ^= (hash >> 16);
@@ -4901,6 +4960,10 @@  static int __init bonding_init(void)
 			goto err;
 	}
 
+	skb_flow_dissector_init(&flow_keys_bonding,
+				flow_keys_bonding_keys,
+				ARRAY_SIZE(flow_keys_bonding_keys));
+
 	register_netdevice_notifier(&bond_netdev_notifier);
 out:
 	return res;