
[net-next,4/4] bonding: balance ICMP echoes in layer3+4 mode

Message ID 20191021200948.23775-5-mcroce@redhat.com
State Changes Requested
Delegated to: David Miller
Series ICMP flow improvements

Commit Message

Matteo Croce Oct. 21, 2019, 8:09 p.m. UTC
The bonding driver uses the L4 ports to balance flows between slaves.
As the ICMP protocol has no ports, those packets are all sent to the
same device:

    # tcpdump -qltnni veth0 ip |sed 's/^/0: /' &
    # tcpdump -qltnni veth1 ip |sed 's/^/1: /' &
    # ping -qc1 192.168.0.2
    1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 315, seq 1, length 64
    1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 315, seq 1, length 64
    # ping -qc1 192.168.0.2
    1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 316, seq 1, length 64
    1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 316, seq 1, length 64
    # ping -qc1 192.168.0.2
    1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 317, seq 1, length 64
    1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 317, seq 1, length 64

Some ICMP packets, however, carry an Identifier field which is used
to match packets within a session. Let's use this value in the hash
function to balance these packets between bond slaves:

    # ping -qc1 192.168.0.2
    0: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 303, seq 1, length 64
    0: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 303, seq 1, length 64
    # ping -qc1 192.168.0.2
    1: IP 192.168.0.1 > 192.168.0.2: ICMP echo request, id 304, seq 1, length 64
    1: IP 192.168.0.2 > 192.168.0.1: ICMP echo reply, id 304, seq 1, length 64
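
To illustrate, below is a stand-alone toy model of the layer3+4 hash
with the Identifier folded in where the L4 ports would normally go.
It is a simplification of bond_xmit_hash() (the real change is in the
patch at the bottom of this page), not the kernel code itself:

    #include <stdint.h>
    #include <stdio.h>

    /* Toy model of the bonding layer3+4 hash: the ICMP Identifier takes
     * the place of the L4 ports. Simplified from bond_xmit_hash(); the
     * kernel also folds in the ICMP type/code via struct flow_keys.
     */
    static uint32_t l34_hash(uint32_t saddr, uint32_t daddr, uint32_t icmp_id)
    {
        uint32_t hash = icmp_id;  /* ports for TCP/UDP, Identifier for ICMP */

        hash ^= daddr ^ saddr;
        hash ^= hash >> 16;
        hash ^= hash >> 8;
        return hash;
    }

    int main(void)
    {
        uint32_t saddr = 0xc0a80001;  /* 192.168.0.1 */
        uint32_t daddr = 0xc0a80002;  /* 192.168.0.2 */
        uint32_t id;

        /* with two slaves, print which one each Identifier hashes to */
        for (id = 303; id <= 306; id++)
            printf("id %u -> slave %u\n", (unsigned)id,
                   (unsigned)(l34_hash(saddr, daddr, id) % 2));

        return 0;
    }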

Signed-off-by: Matteo Croce <mcroce@redhat.com>
---
 drivers/net/bonding/bond_main.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

Comments

Simon Horman Oct. 23, 2019, 10:01 a.m. UTC | #1
On Mon, Oct 21, 2019 at 10:09:48PM +0200, Matteo Croce wrote:
> The bonding driver uses the L4 ports to balance flows between slaves.
> As the ICMP protocol has no ports, those packets are all sent to the
> same device:
> 
> [...]

I see where this patch is going but it is unclear to me what problem it is
solving. I would expect ICMP traffic to be low volume and thus able to be
handled by a single lower-device of a bond.

...
Matteo Croce Oct. 23, 2019, 4:58 p.m. UTC | #2
On Wed, Oct 23, 2019 at 12:01 PM Simon Horman
<simon.horman@netronome.com> wrote:
>
> On Mon, Oct 21, 2019 at 10:09:48PM +0200, Matteo Croce wrote:
> > [...]
>
> I see where this patch is going but it is unclear to me what problem it is
> solving. I would expect ICMP traffic to be low volume and thus able to be
> handled by a single lower-device of a bond.
>
> ...

Hi,

The problem is not balancing the volume, even if it could grow due
to IoT devices pinging some well known DNS servers to check for
connectivity.
If a bonding slave is down, people using ping to check for
connectivity could fail to detect a broken link, because all the
packets are sent over the slave which is still alive.
Anyway, although I didn't measure it, the computational overhead of
this changeset should be minimal, and it only affects ICMP packets
when the ICMP dissector is used.
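
Roughly, the extra work per ICMP packet is a couple of loads from the
transport header. A stand-alone sketch of that dissection step (an
approximation, not the kernel's skb_flow_get_icmp_tci()):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Sketch of the ICMP dissection step: read type/code and, for echo
     * request/reply only, the Identifier. An approximation of what the
     * helper added earlier in this series does, not the kernel source.
     */
    struct icmp_key {
        uint8_t  type;
        uint8_t  code;
        uint16_t id;    /* 0 means "no Identifier in this message" */
    };

    #define ICMP_ECHOREPLY 0
    #define ICMP_ECHO      8

    static void icmp_dissect(const uint8_t *th, size_t len,
                             struct icmp_key *key)
    {
        memset(key, 0, sizeof(*key));
        if (len < 8)    /* ICMP header is 8 bytes */
            return;

        key->type = th[0];
        key->code = th[1];
        /* only echo request/reply carry an Identifier, in bytes 4-5 */
        if (key->type == ICMP_ECHO || key->type == ICMP_ECHOREPLY)
            memcpy(&key->id, th + 4, sizeof(key->id));
    }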

Regards,
Simon Horman Oct. 23, 2019, 6 p.m. UTC | #3
On Wed, Oct 23, 2019 at 06:58:16PM +0200, Matteo Croce wrote:
> On Wed, Oct 23, 2019 at 12:01 PM Simon Horman
> <simon.horman@netronome.com> wrote:
> >
> > On Mon, Oct 21, 2019 at 10:09:48PM +0200, Matteo Croce wrote:
> > > [...]
> >
> > I see where this patch is going but it is unclear to me what problem it is
> > solving. I would expect ICMP traffic to be low volume and thus able to be
> > handled by a single lower-device of a bond.
> >
> > ...
> 
> Hi,
> 
> The problem is not balancing the volume, even if it could grow due
> to IoT devices pinging some well known DNS servers to check for
> connectivity.
> If a bonding slave is down, people using ping to check for
> connectivity could fail to detect a broken link, because all the
> packets are sent over the slave which is still alive.
> Anyway, although I didn't measure it, the computational overhead of
> this changeset should be minimal, and it only affects ICMP packets
> when the ICMP dissector is used.

So the idea is that, by using different id values, ping could be used
to probe all lower-devices of a bond? If so, then I understand why
you want this and have no particular objection.

Patch

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 21d8fcc83c9c..83afb03f4d07 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3267,6 +3267,8 @@  static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 		return skb_flow_dissect_flow_keys(skb, fk, 0);
 
 	fk->ports.ports = 0;
+	fk->icmp.icmp = 0;
+	fk->icmp.id = 0;
 	noff = skb_network_offset(skb);
 	if (skb->protocol == htons(ETH_P_IP)) {
 		if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph))))
@@ -3286,8 +3288,14 @@  static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb,
 	} else {
 		return false;
 	}
-	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0)
-		fk->ports.ports = skb_flow_get_ports(skb, noff, proto);
+	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34 && proto >= 0) {
+		if (proto == IPPROTO_ICMP || proto == IPPROTO_ICMPV6)
+			skb_flow_get_icmp_tci(skb, &fk->icmp, skb->data,
+					      skb_transport_offset(skb),
+					      skb_headlen(skb));
+		else
+			fk->ports.ports = skb_flow_get_ports(skb, noff, proto);
+	}
 
 	return true;
 }
@@ -3314,10 +3322,14 @@  u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb)
 		return bond_eth_hash(skb);
 
 	if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 ||
-	    bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23)
+	    bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) {
 		hash = bond_eth_hash(skb);
-	else
-		hash = (__force u32)flow.ports.ports;
+	} else {
+		if (flow.icmp.id)
+			memcpy(&hash, &flow.icmp, sizeof(hash));
+		else
+			memcpy(&hash, &flow.ports.ports, sizeof(hash));
+	}
 	hash ^= (__force u32)flow_get_u32_dst(&flow) ^
 		(__force u32)flow_get_u32_src(&flow);
 	hash ^= (hash >> 16);