
[RFC,2/2] veth: propagate bridge GSO to peer

Message ID 20171126181749.19288-3-sthemmin@microsoft.com
State RFC, archived
Delegated to: David Miller
Series: None

Commit Message

Stephen Hemminger Nov. 26, 2017, 6:17 p.m. UTC
This allows veth devices in containers to see the GSO maximum
settings of the actual device being used for output.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 drivers/net/veth.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

Comments

David Ahern Nov. 27, 2017, 3:13 a.m. UTC | #1
On 11/26/17 11:17 AM, Stephen Hemminger wrote:
> This allows veth device in containers to see the GSO maximum
> settings of the actual device being used for output.
> 
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> ---
>  drivers/net/veth.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 72 insertions(+)
> 
> diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> index f5438d0978ca..0c9ce156943b 100644
> --- a/drivers/net/veth.c
> +++ b/drivers/net/veth.c
> @@ -511,17 +511,89 @@ static struct rtnl_link_ops veth_link_ops = {
>  	.get_link_net	= veth_get_link_net,
>  };
>  
> +/* When veth device is added to a bridge or other master device
> + * then reflect the GSO max values from the upper device
> + * to the other end of veth pair.
> + */
> +static void veth_change_upper(struct net_device *dev,
> +		      const struct netdev_notifier_changeupper_info *info)
> +{
> +	struct net_device *upper = info->upper_dev;
> +	struct net_device *peer;
> +	struct veth_priv *priv;
> +
> +	if (dev->netdev_ops != &veth_netdev_ops)
> +		return;
> +
> +	priv = netdev_priv(dev);
> +	peer = rtnl_dereference(priv->peer);
> +	if (!peer)
> +		return;
> +
> +	if (upper) {
> +		peer->gso_max_segs = upper->gso_max_segs;
> +		peer->gso_max_size = upper->gso_max_size;
> +	} else {
> +		peer->gso_max_segs = GSO_MAX_SEGS;
> +		peer->gso_max_size = GSO_MAX_SIZE;
> +	}

veth devices can be added to a VRF instead of a bridge, and I do not
believe the gso propagation works for L3 master devices.

From a quick grep, team devices do not appear to handle gso changes either.
Stephen Hemminger Nov. 27, 2017, 7:07 a.m. UTC | #2
On Sun, 26 Nov 2017 20:13:39 -0700
David Ahern <dsahern@gmail.com> wrote:

> On 11/26/17 11:17 AM, Stephen Hemminger wrote:
> > This allows veth device in containers to see the GSO maximum
> > settings of the actual device being used for output.
> > 
> > Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> > ---
> >  drivers/net/veth.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 72 insertions(+)
> > 
> > diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> > index f5438d0978ca..0c9ce156943b 100644
> > --- a/drivers/net/veth.c
> > +++ b/drivers/net/veth.c
> > @@ -511,17 +511,89 @@ static struct rtnl_link_ops veth_link_ops = {
> >  	.get_link_net	= veth_get_link_net,
> >  };
> >  
> > +/* When veth device is added to a bridge or other master device
> > + * then reflect the GSO max values from the upper device
> > + * to the other end of veth pair.
> > + */
> > +static void veth_change_upper(struct net_device *dev,
> > +		      const struct netdev_notifier_changeupper_info *info)
> > +{
> > +	struct net_device *upper = info->upper_dev;
> > +	struct net_device *peer;
> > +	struct veth_priv *priv;
> > +
> > +	if (dev->netdev_ops != &veth_netdev_ops)
> > +		return;
> > +
> > +	priv = netdev_priv(dev);
> > +	peer = rtnl_dereference(priv->peer);
> > +	if (!peer)
> > +		return;
> > +
> > +	if (upper) {
> > +		peer->gso_max_segs = upper->gso_max_segs;
> > +		peer->gso_max_size = upper->gso_max_size;
> > +	} else {
> > +		peer->gso_max_segs = GSO_MAX_SEGS;
> > +		peer->gso_max_size = GSO_MAX_SIZE;
> > +	}  
> 
> veth devices can be added to a VRF instead of a bridge, and I do not
> believe the gso propagation works for L3 master devices.
> 
> From a quick grep, team devices do not appear to handle gso changes either.

This code should still work correctly, but no optimization would happen.
The gso_max_size of the VRF or team will
still be GSO_MAX_SIZE so there would be no change. If VRF or Team ever got smart
enough to handle GSO limits, then the algorithm would handle it.
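For reference, every net_device starts out with the unrestricted
defaults, which is why an upper device that never adjusts them (VRF and
team today) effectively propagates "no limit".  Roughly, from
alloc_netdev_mqs() in net/core/dev.c (paraphrased; exact placement may
differ by kernel version):

	/* net/core/dev.c, alloc_netdev_mqs() */
	dev->gso_max_size = GSO_MAX_SIZE;	/* 65536 */
	dev->gso_max_segs = GSO_MAX_SEGS;	/* 65535 */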
Solio Sarabia Nov. 27, 2017, 8:14 p.m. UTC | #3
On Sun, Nov 26, 2017 at 11:07:25PM -0800, Stephen Hemminger wrote:
> On Sun, 26 Nov 2017 20:13:39 -0700
> David Ahern <dsahern@gmail.com> wrote:
> 
> > On 11/26/17 11:17 AM, Stephen Hemminger wrote:
> > > This allows veth device in containers to see the GSO maximum
> > > settings of the actual device being used for output.
> > 
> > veth devices can be added to a VRF instead of a bridge, and I do not
> > believe the gso propagation works for L3 master devices.
> > 
> > From a quick grep, team devices do not appear to handle gso changes either.
> 
> This code should still work correctly, but no optimization would happen.
> The gso_max_size of the VRF or team will
> still be GSO_MAX_SIZE so there would be no change. If VRF or Team ever got smart
> enough to handle GSO limits, then the algorithm would handle it.

This patch propagates the gso value from the bridge to its veth endpoints.
However, since the bridge is never aware of the GSO limit of the underlying
interfaces, the bridge/veth still have a larger GSO size.

In the docker case, the bridge is not linked directly to physical or
synthetic interfaces; it relies on iptables to decide which interface to
forward packets to.
Stephen Hemminger Nov. 27, 2017, 9:15 p.m. UTC | #4
On Mon, 27 Nov 2017 12:14:19 -0800
Solio Sarabia <solio.sarabia@intel.com> wrote:

> On Sun, Nov 26, 2017 at 11:07:25PM -0800, Stephen Hemminger wrote:
> > On Sun, 26 Nov 2017 20:13:39 -0700
> > David Ahern <dsahern@gmail.com> wrote:
> >   
> > > On 11/26/17 11:17 AM, Stephen Hemminger wrote:  
> > > > This allows veth device in containers to see the GSO maximum
> > > > settings of the actual device being used for output.  
> > > 
> > > veth devices can be added to a VRF instead of a bridge, and I do not
> > > believe the gso propagation works for L3 master devices.
> > > 
> > > From a quick grep, team devices do not appear to handle gso changes either.  
> > 
> > This code should still work correctly, but no optimization would happen.
> > The gso_max_size of the VRF or team will
> > still be GSO_MAX_SIZE so there would be no change. If VRF or Team ever got smart
> > enough to handle GSO limits, then the algorithm would handle it.  
> 
> This patch propagates gso value from bridge to its veth endpoints.
> However, since bridge is never aware of the GSO limit from underlying
> interfaces, bridge/veth still have larger GSO size.
> 
> In the docker case, bridge is not linked directly to physical or
> synthetic interfaces; it relies on iptables to decide which interface to
> forward packets to.

So for the docker case, direct control of GSO values via netlink (i.e. ip link set)
seems like the better solution.
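A rough sketch of what the kernel side of that could look like, assuming
do_setlink() in net/core/rtnetlink.c were taught to accept the existing
IFLA_GSO_MAX_SIZE / IFLA_GSO_MAX_SEGS attributes for writing.  This is
illustrative only (the helper name is made up), and it only clamps to the
global maximums; enforcing the limits of lower devices is the open
question discussed below:

static int rtnl_set_gso_limits(struct net_device *dev, struct nlattr **tb)
{
	if (tb[IFLA_GSO_MAX_SIZE]) {
		u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]);

		if (max_size > GSO_MAX_SIZE)
			return -EINVAL;
		netif_set_gso_max_size(dev, max_size);
	}

	if (tb[IFLA_GSO_MAX_SEGS]) {
		u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]);

		if (max_segs > GSO_MAX_SEGS)
			return -EINVAL;
		dev->gso_max_segs = max_segs;
	}

	return 0;
}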
Solio Sarabia Nov. 28, 2017, 1:42 a.m. UTC | #5
On Mon, Nov 27, 2017 at 01:15:02PM -0800, Stephen Hemminger wrote:
> On Mon, 27 Nov 2017 12:14:19 -0800
> Solio Sarabia <solio.sarabia@intel.com> wrote:
> 
> > On Sun, Nov 26, 2017 at 11:07:25PM -0800, Stephen Hemminger wrote:
> > > On Sun, 26 Nov 2017 20:13:39 -0700
> > > David Ahern <dsahern@gmail.com> wrote:
> > >   
> > > > On 11/26/17 11:17 AM, Stephen Hemminger wrote:  
> > > > > This allows veth device in containers to see the GSO maximum
> > > > > settings of the actual device being used for output.  
> > > > 
> > > > veth devices can be added to a VRF instead of a bridge, and I do not
> > > > believe the gso propagation works for L3 master devices.
> > > > 
> > > > From a quick grep, team devices do not appear to handle gso changes either.  
> > > 
> > > This code should still work correctly, but no optimization would happen.
> > > The gso_max_size of the VRF or team will
> > > still be GSO_MAX_SIZE so there would be no change. If VRF or Team ever got smart
> > > enough to handle GSO limits, then the algorithm would handle it.  
> > 
> > This patch propagates gso value from bridge to its veth endpoints.
> > However, since bridge is never aware of the GSO limit from underlying
> > interfaces, bridge/veth still have larger GSO size.
> > 
> > In the docker case, bridge is not linked directly to physical or
> > synthetic interfaces; it relies on iptables to decide which interface to
> > forward packets to.
> 
> So for the docker case, then direct control of GSO values via netlink (ie ip link set)
> seems like the better solution.

Adding ioctl support for 'ip link set' would work. I'm still concerned
about how to enforce that the upper limit does not exceed that of the
lower devices.

Consider a system with three NICs, each reporting values in the range
[60,000 - 62,780]. Users could set virtual interfaces' gso to 65,536,
exceeding the limit and making the host do sw GSO (VM settings must
not affect host performance).

Looping through interfaces?  With the difference that now it'd be
triggered upon the user's request, not every time a veth is created
(like one previous patch discussed).
David Ahern Nov. 28, 2017, 2:02 a.m. UTC | #6
On 11/27/17 6:42 PM, Solio Sarabia wrote:
> On Mon, Nov 27, 2017 at 01:15:02PM -0800, Stephen Hemminger wrote:
>> On Mon, 27 Nov 2017 12:14:19 -0800
>> Solio Sarabia <solio.sarabia@intel.com> wrote:
>>
>>> On Sun, Nov 26, 2017 at 11:07:25PM -0800, Stephen Hemminger wrote:
>>>> On Sun, 26 Nov 2017 20:13:39 -0700
>>>> David Ahern <dsahern@gmail.com> wrote:
>>>>   
>>>>> On 11/26/17 11:17 AM, Stephen Hemminger wrote:  
>>>>>> This allows veth device in containers to see the GSO maximum
>>>>>> settings of the actual device being used for output.  
>>>>>
>>>>> veth devices can be added to a VRF instead of a bridge, and I do not
>>>>> believe the gso propagation works for L3 master devices.
>>>>>
>>>>> From a quick grep, team devices do not appear to handle gso changes either.  
>>>>
>>>> This code should still work correctly, but no optimization would happen.
>>>> The gso_max_size of the VRF or team will
>>>> still be GSO_MAX_SIZE so there would be no change. If VRF or Team ever got smart
>>>> enough to handle GSO limits, then the algorithm would handle it.  
>>>
>>> This patch propagates gso value from bridge to its veth endpoints.
>>> However, since bridge is never aware of the GSO limit from underlying
>>> interfaces, bridge/veth still have larger GSO size.
>>>
>>> In the docker case, bridge is not linked directly to physical or
>>> synthetic interfaces; it relies on iptables to decide which interface to
>>> forward packets to.
>>
>> So for the docker case, then direct control of GSO values via netlink (ie ip link set)
>> seems like the better solution.
> 
> Adding ioctl support for 'ip link set' would work. I'm still concerned
> how to enforce the upper limit to not exceed that of the lower devices.
> 
> Consider a system with three NICs, each reporting values in the range
> [60,000 - 62,780]. Users could set virtual interfaces' gso to 65,536,
> exceeding the limit, and having the host do sw gso (vms settings must
> not affect host performance.)
> 
> Looping through interfaces?  With the difference that now it'd be
> trigger upon user's request, not every time a veth is created (like one
> previous patch discussed.)
> 

You are concerned about the routed case right? One option is to have VRF
devices propagate gso sizes to all devices (veth, vlan, etc) enslaved to
it. VRF devices are Layer 3 master devices so an L3 parallel to a bridge.
Solio Sarabia Nov. 30, 2017, 12:35 a.m. UTC | #7
On Mon, Nov 27, 2017 at 07:02:01PM -0700, David Ahern wrote:
> On 11/27/17 6:42 PM, Solio Sarabia wrote:
> > Adding ioctl support for 'ip link set' would work. I'm still concerned
> > how to enforce the upper limit to not exceed that of the lower devices.
> > 
Actually, giving the user control to change gso doesn't solve the issue.
In a VM, the user could simply ignore setting the gso, still hurting host
perf. We need to enforce the lower gso on the bridge/veth.

Should this issue be fixed at the hv_netvsc level? Why is the driver passing
down gso buffer sizes greater than what the synthetic interface allows?
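For what it's worth, the synthetic NIC itself already advertises a
reduced limit on its own device; roughly, from
drivers/net/hyperv/netvsc_drv.c (paraphrased from memory; the exact
constant and call site may differ):

	/* the Hyper-V host can only offload roughly 62 kB at a time,
	 * so the synthetic device lowers its own limit at probe time
	 */
	netif_set_gso_max_size(net, NETVSC_GSO_MAX_SIZE);

The gap is that nothing carries that value over to the bridge or to the
veth pair the container actually transmits on.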

> > Consider a system with three NICs, each reporting values in the range
> > [60,000 - 62,780]. Users could set virtual interfaces' gso to 65,536,
> > exceeding the limit, and having the host do sw gso (vms settings must
> > not affect host performance.)
> > 
> > Looping through interfaces?  With the difference that now it'd be
> > trigger upon user's request, not every time a veth is created (like one
> > previous patch discussed.)
> > 
> 
> You are concerned about the routed case right? One option is to have VRF
> devices propagate gso sizes to all devices (veth, vlan, etc) enslaved to
> it. VRF devices are Layer 3 master devices so an L3 parallel to a bridge.
Having the VRF device propagate the gso to its slaves is the opposite of
what we do now: get the minimum of all ports and assign it to the bridge
(net/bridge/br_if.c, br_min_mtu, br_set_gso_limits).
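For reference, that bridge-side logic is roughly the following
(paraphrased from net/bridge/br_if.c; names may differ slightly between
kernel versions):

static void br_set_gso_limits(struct net_bridge *br)
{
	unsigned int gso_max_size = GSO_MAX_SIZE;
	u16 gso_max_segs = GSO_MAX_SEGS;
	const struct net_bridge_port *p;

	/* take the minimum of all enslaved ports ... */
	list_for_each_entry(p, &br->port_list, list) {
		gso_max_size = min(gso_max_size, p->dev->gso_max_size);
		gso_max_segs = min(gso_max_segs, p->dev->gso_max_segs);
	}

	/* ... and apply it to the bridge device itself */
	br->dev->gso_max_size = gso_max_size;
	br->dev->gso_max_segs = gso_max_segs;
}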

Would it be right to change the logic flow? If so, this could work:

(1) the bridge gets gso from the lower devices upon init/setup
(2) when a new device is attached to the bridge, the bridge sets gso for
    this new slave (and its peer if it's a veth)
(3) as the code is now, there's an optimization opportunity: for every
    new interface attached to the bridge, the bridge loops through all
    ports to set gso and mtu. That's not necessary, as the bridge already
    has the minimum from the previously attached interfaces. It could be
    O(1) instead of O(n) (see the sketch below).
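A hypothetical helper to make the O(1) observation in (3) concrete for
the add-port path; removing a port would still need the full scan, since
the departing port may be the one that held the minimum down:

static void br_port_added_gso(struct net_bridge *br,
			      const struct net_device *port_dev)
{
	/* only the new port can lower the current minimum */
	br->dev->gso_max_size = min(br->dev->gso_max_size,
				    port_dev->gso_max_size);
	br->dev->gso_max_segs = min(br->dev->gso_max_segs,
				    port_dev->gso_max_segs);
}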
Stephen Hemminger Nov. 30, 2017, 5:10 p.m. UTC | #8
On Wed, 29 Nov 2017 16:35:37 -0800
Solio Sarabia <solio.sarabia@intel.com> wrote:

> On Mon, Nov 27, 2017 at 07:02:01PM -0700, David Ahern wrote:
> > On 11/27/17 6:42 PM, Solio Sarabia wrote:  
> > > Adding ioctl support for 'ip link set' would work. I'm still concerned
> > > how to enforce the upper limit to not exceed that of the lower devices.
> > >   
> Actually, giving the user control to change gso doesn't solve the issue.
> In a VM, user could simple ignore setting the gso, still hurting host
> perf. We need to enforce the lower gso on the bridge/veth.
> 
> Should this issue be fixed at hv_netvsc level? Why is the driver passing
> down gso buffer sizes greater than what synthetic interface allows.
> 
> > > Consider a system with three NICs, each reporting values in the range
> > > [60,000 - 62,780]. Users could set virtual interfaces' gso to 65,536,
> > > exceeding the limit, and having the host do sw gso (vms settings must
> > > not affect host performance.)
> > > 
> > > Looping through interfaces?  With the difference that now it'd be
> > > trigger upon user's request, not every time a veth is created (like one
> > > previous patch discussed.)
> > >   
> > 
> > You are concerned about the routed case right? One option is to have VRF
> > devices propagate gso sizes to all devices (veth, vlan, etc) enslaved to
> > it. VRF devices are Layer 3 master devices so an L3 parallel to a bridge.  
> Having the VRF device propagate the gso to its slaves is opposite of
> what we do now: get minimum of all ports and assign it to bridge
> (net/bridge/br_if.c, br_min_mtu, br_set_gso_limits.)
> 
> Would it be right to change the logic flow? If so, this this could work:
> 
> (1) bridge gets gso from lower devices upon init/setup
> (2) when new device is attached to bridge, bridge sets gso for this new
>     slave (and its peer if it's veth.)
> (3) as the code is now, there's an optimization opportunity: for every
>     new interface attached to bridge, bridge loops through all ports to
>     set gso, mtu. It's not necessary as bridge already has the minimum
>     from previous interfaces attached. Could be O(1) instead of O(n).


The problem goes back into the core GSO networking code.
Something like this is needed.

static inline bool netif_needs_gso(struct sk_buff *skb,
				   const struct net_device *dev,
				   netdev_features_t features)
{
	return skb_is_gso(skb) &&
		(!skb_gso_ok(skb, features) ||
		 unlikely(skb_shinfo(skb)->gso_segs > dev->gso_max_segs) ||  << new
		 unlikely(skb_shinfo(skb)->gso_size > dev->gso_max_size) ||  << new
		 unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
			  (skb->ip_summed != CHECKSUM_UNNECESSARY)));
}

What that will do is split up the monster GSO packets if they ever bleed
across from one device to another through the twisty mazes of packet
processing paths.
Eric Dumazet Nov. 30, 2017, 5:26 p.m. UTC | #9
On Thu, 2017-11-30 at 09:10 -0800, Stephen Hemminger wrote:
> 
> 
> The problem goes back into the core GSO networking code.
> Something like this is needed.
> 
> static inline bool netif_needs_gso(struct sk_buff *skb,
> 				   const struct net_device *dev,
> 				   netdev_features_t features)
> {
> 	return skb_is_gso(skb) &&
> 		(!skb_gso_ok(skb, features) ||
> 		 unlikely(skb_shinfo(skb)->gso_segs > dev-
> >gso_max_segs) ||  << new
> 		 unlikely(skb_shinfo(skb)->gso_size > dev-
> >gso_max_size) ||  << new
> 		 unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
> 			  (skb->ip_summed != CHECKSUM_UNNECESSARY)));
> }
> 
> What that will do is split up the monster GSO packets if they ever
> bleed
> across from one device to another through the twisty mazes of packet
> processing paths.


Since very few drivers have these gso_max_segs / gso_max_size, check
could be done in their ndo_features_check()
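A minimal sketch of that, assuming a driver such as hv_netvsc wired it
up through its net_device_ops (.ndo_features_check); the function name
here is hypothetical:

static netdev_features_t netvsc_features_check(struct sk_buff *skb,
					       struct net_device *dev,
					       netdev_features_t features)
{
	/* if the GSO packet exceeds what this device can offload, drop
	 * the GSO feature bits so the stack segments it in software
	 * before handing it to the driver
	 */
	if (skb_is_gso(skb) &&
	    (skb_shinfo(skb)->gso_size > dev->gso_max_size ||
	     skb_shinfo(skb)->gso_segs > dev->gso_max_segs))
		features &= ~NETIF_F_GSO_MASK;

	return features;
}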
Stephen Hemminger Nov. 30, 2017, 5:36 p.m. UTC | #10
On Thu, 30 Nov 2017 09:26:39 -0800
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Thu, 2017-11-30 at 09:10 -0800, Stephen Hemminger wrote:
> > 
> > 
> > The problem goes back into the core GSO networking code.
> > Something like this is needed.
> > 
> > static inline bool netif_needs_gso(struct sk_buff *skb,
> > 				   const struct net_device *dev,
> > 				   netdev_features_t features)
> > {
> > 	return skb_is_gso(skb) &&
> > 		(!skb_gso_ok(skb, features) ||
> > 		 unlikely(skb_shinfo(skb)->gso_segs > dev-  
> > >gso_max_segs) ||  << new  
> > 		 unlikely(skb_shinfo(skb)->gso_size > dev-  
> > >gso_max_size) ||  << new  
> > 		 unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
> > 			  (skb->ip_summed != CHECKSUM_UNNECESSARY)));
> > }
> > 
> > What that will do is split up the monster GSO packets if they ever
> > bleed
> > across from one device to another through the twisty mazes of packet
> > processing paths.  
> 
> 
> Since very few drivers have these gso_max_segs / gso_max_size, check
> could be done in their ndo_features_check()

Agreed, we could do it there.
David Miller Nov. 30, 2017, 5:38 p.m. UTC | #11
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 30 Nov 2017 09:26:39 -0800

> On Thu, 2017-11-30 at 09:10 -0800, Stephen Hemminger wrote:
>> 
>> 
>> The problem goes back into the core GSO networking code.
>> Something like this is needed.
>> 
>> static inline bool netif_needs_gso(struct sk_buff *skb,
>> 				   const struct net_device *dev,
>> 				   netdev_features_t features)
>> {
>> 	return skb_is_gso(skb) &&
>> 		(!skb_gso_ok(skb, features) ||
>> 		 unlikely(skb_shinfo(skb)->gso_segs > dev-
>> >gso_max_segs) ||  << new
>> 		 unlikely(skb_shinfo(skb)->gso_size > dev-
>> >gso_max_size) ||  << new
>> 		 unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
>> 			  (skb->ip_summed != CHECKSUM_UNNECESSARY)));
>> }
>> 
>> What that will do is split up the monster GSO packets if they ever
>> bleed
>> across from one device to another through the twisty mazes of packet
>> processing paths.
> 
> 
> Since very few drivers have these gso_max_segs / gso_max_size, check
> could be done in their ndo_features_check()

Agreed.
Stephen Hemminger Nov. 30, 2017, 5:49 p.m. UTC | #12
On Thu, 30 Nov 2017 09:26:39 -0800
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Thu, 2017-11-30 at 09:10 -0800, Stephen Hemminger wrote:
> > 
> > 
> > The problem goes back into the core GSO networking code.
> > Something like this is needed.
> > 
> > static inline bool netif_needs_gso(struct sk_buff *skb,
> > 				   const struct net_device *dev,
> > 				   netdev_features_t features)
> > {
> > 	return skb_is_gso(skb) &&
> > 		(!skb_gso_ok(skb, features) ||
> > 		 unlikely(skb_shinfo(skb)->gso_segs > dev-  
> > >gso_max_segs) ||  << new  
> > 		 unlikely(skb_shinfo(skb)->gso_size > dev-  
> > >gso_max_size) ||  << new  
> > 		 unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
> > 			  (skb->ip_summed != CHECKSUM_UNNECESSARY)));
> > }
> > 
> > What that will do is split up the monster GSO packets if they ever
> > bleed
> > across from one device to another through the twisty mazes of packet
> > processing paths.  
> 
> 
> Since very few drivers have these gso_max_segs / gso_max_size, check
> could be done in their ndo_features_check()

Actually, we already check for max_segs, just missing the check for size here:

From 71a134f41c4aae8947241091300d21745aa237f2 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <sthemmin@microsoft.com>
Date: Thu, 30 Nov 2017 09:45:11 -0800
Subject: [PATCH] net: do not GSO if frame is too large

This adds an additional check to break up skb's that exceed a device's
GSO maximum size. The code was already checking for too many segments
but did not check size.

This has been observed to be a problem when using containers on
Hyper-V/Azure, where the allowed GSO maximum size is less than the
maximum and skb's have gone through multiple layers to arrive
at the virtual device.

Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
---
 net/core/dev.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 07ed21d64f92..0bb398f3bfa3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2918,9 +2918,11 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
 					    struct net_device *dev,
 					    netdev_features_t features)
 {
+	unsigned int gso_size = skb_shinfo(skb)->gso_size;
 	u16 gso_segs = skb_shinfo(skb)->gso_segs;
 
-	if (gso_segs > dev->gso_max_segs)
+	if (gso_segs > dev->gso_max_segs ||
+	    gso_size > dev->gso_max_size)
 		return features & ~NETIF_F_GSO_MASK;
 
 	/* Support for GSO partial features requires software
Eric Dumazet Nov. 30, 2017, 5:59 p.m. UTC | #13
On Thu, 2017-11-30 at 09:49 -0800, Stephen Hemminger wrote:
> On Thu, 30 Nov 2017 09:26:39 -0800
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> > On Thu, 2017-11-30 at 09:10 -0800, Stephen Hemminger wrote:
> > > 
> > > 
> > > The problem goes back into the core GSO networking code.
> > > Something like this is needed.
> > > 
> > > static inline bool netif_needs_gso(struct sk_buff *skb,
> > > 				   const struct net_device *dev,
> > > 				   netdev_features_t features)
> > > {
> > > 	return skb_is_gso(skb) &&
> > > 		(!skb_gso_ok(skb, features) ||
> > > 		 unlikely(skb_shinfo(skb)->gso_segs > dev-  
> > > > gso_max_segs) ||  << new  
> > > 
> > > 		 unlikely(skb_shinfo(skb)->gso_size > dev-  
> > > > gso_max_size) ||  << new  
> > > 
> > > 		 unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
> > > 			  (skb->ip_summed != CHECKSUM_UNNECESSARY)));
> > > }
> > > 
> > > What that will do is split up the monster GSO packets if they
> > > ever
> > > bleed
> > > across from one device to another through the twisty mazes of
> > > packet
> > > processing paths.  
> > 
> > 
> > Since very few drivers have these gso_max_segs / gso_max_size,
> > check
> > could be done in their ndo_features_check()
> 
> Actually, we already check for max_segs, just missing check for size
> here:
> 
> From 71a134f41c4aae8947241091300d21745aa237f2 Mon Sep 17 00:00:00
> 2001
> From: Stephen Hemminger <sthemmin@microsoft.com>
> Date: Thu, 30 Nov 2017 09:45:11 -0800
> Subject: [PATCH] net: do not GSO if frame is too large
> 
> This adds an additional check to breakup skb's that exceed a devices
> GSO maximum size. The code was already checking for too many segments
> but did not check size.
> 
> This has been observed to be a problem when using containers on
> Hyper-V/Azure where the allowed GSO maximum size is less than
> maximum and skb's have gone through multiple layers to arrive
> at the virtual device.
> 
> Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
> ---
>  net/core/dev.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 07ed21d64f92..0bb398f3bfa3 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2918,9 +2918,11 @@ static netdev_features_t
> gso_features_check(const struct sk_buff *skb,
>  					    struct net_device *dev,
>  					    netdev_features_t
> features)
>  {
> +	unsigned int gso_size = skb_shinfo(skb)->gso_size;
>  	u16 gso_segs = skb_shinfo(skb)->gso_segs;
>  
> -	if (gso_segs > dev->gso_max_segs)
> +	if (gso_segs > dev->gso_max_segs ||
> +	    gso_size > dev->gso_max_size)
>  		return features & ~NETIF_F_GSO_MASK;
>  
>  	/* Support for GSO partial features requires software


Yes, but check commit 743b03a83297690f0bd38c452a3bbb47d2be300a
("net: remove netdevice gso_min_segs")

The plan was to get rid of the existing check, not to add new ones :/
Stephen Hemminger Nov. 30, 2017, 6:08 p.m. UTC | #14
On Thu, 30 Nov 2017 09:59:23 -0800
Eric Dumazet <eric.dumazet@gmail.com> wrote:

> On Thu, 2017-11-30 at 09:49 -0800, Stephen Hemminger wrote:
> > On Thu, 30 Nov 2017 09:26:39 -0800
> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> >   
> > > On Thu, 2017-11-30 at 09:10 -0800, Stephen Hemminger wrote:  
> > > > 
> > > > 
> > > > The problem goes back into the core GSO networking code.
> > > > Something like this is needed.
> > > > 
> > > > static inline bool netif_needs_gso(struct sk_buff *skb,
> > > > 				   const struct net_device *dev,
> > > > 				   netdev_features_t features)
> > > > {
> > > > 	return skb_is_gso(skb) &&
> > > > 		(!skb_gso_ok(skb, features) ||
> > > > 		 unlikely(skb_shinfo(skb)->gso_segs > dev-    
> > > > > gso_max_segs) ||  << new    
> > > > 
> > > > 		 unlikely(skb_shinfo(skb)->gso_size > dev-    
> > > > > gso_max_size) ||  << new    
> > > > 
> > > > 		 unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
> > > > 			  (skb->ip_summed != CHECKSUM_UNNECESSARY)));
> > > > }
> > > > 
> > > > What that will do is split up the monster GSO packets if they
> > > > ever
> > > > bleed
> > > > across from one device to another through the twisty mazes of
> > > > packet
> > > > processing paths.    
> > > 
> > > 
> > > Since very few drivers have these gso_max_segs / gso_max_size,
> > > check
> > > could be done in their ndo_features_check()  
> > 
> > Actually, we already check for max_segs, just missing check for size
> > here:
> > 
> > From 71a134f41c4aae8947241091300d21745aa237f2 Mon Sep 17 00:00:00
> > 2001
> > From: Stephen Hemminger <sthemmin@microsoft.com>
> > Date: Thu, 30 Nov 2017 09:45:11 -0800
> > Subject: [PATCH] net: do not GSO if frame is too large
> > 
> > This adds an additional check to breakup skb's that exceed a devices
> > GSO maximum size. The code was already checking for too many segments
> > but did not check size.
> > 
> > This has been observed to be a problem when using containers on
> > Hyper-V/Azure where the allowed GSO maximum size is less than
> > maximum and skb's have gone through multiple layers to arrive
> > at the virtual device.
> > 
> > Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
> > ---
> >  net/core/dev.c | 4 +++-
> >  1 file changed, 3 insertions(+), 1 deletion(-)
> > 
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 07ed21d64f92..0bb398f3bfa3 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -2918,9 +2918,11 @@ static netdev_features_t
> > gso_features_check(const struct sk_buff *skb,
> >  					    struct net_device *dev,
> >  					    netdev_features_t
> > features)
> >  {
> > +	unsigned int gso_size = skb_shinfo(skb)->gso_size;
> >  	u16 gso_segs = skb_shinfo(skb)->gso_segs;
> >  
> > -	if (gso_segs > dev->gso_max_segs)
> > +	if (gso_segs > dev->gso_max_segs ||
> > +	    gso_size > dev->gso_max_size)
> >  		return features & ~NETIF_F_GSO_MASK;
> >  
> >  	/* Support for GSO partial features requires software  
> 
> 
> Yes, but check commit 743b03a83297690f0bd38c452a3bbb47d2be300a
> ("net: remove netdevice gso_min_segs")
> 
> Plan was to get rid of the existing check, not adding new ones :/

Sure, we can do it in the driver, and that has other benefits like the
ability to backport to older distributions.

We still need gso_max_size though, since we want to tell TCP to avoid
generating mega-jumbo frames.
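For context, TCP picks those limits up from the egress device when the
socket caches its route; roughly, from sk_setup_caps() in
net/core/sock.c (paraphrased, details vary by version):

	if (sk_can_gso(sk)) {
		...
		sk->sk_gso_max_size = dst->dev->gso_max_size;
		max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
	}
	sk->sk_gso_max_segs = max_segs;

So as long as the device the container transmits on (the veth, behind
the bridge) still advertises GSO_MAX_SIZE, TCP inside the container will
keep building 64 kB super-packets.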
Eric Dumazet Nov. 30, 2017, 6:10 p.m. UTC | #15
On Thu, 2017-11-30 at 10:08 -0800, Stephen Hemminger wrote:
> On Thu, 30 Nov 2017 09:59:23 -0800
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> 
> > On Thu, 2017-11-30 at 09:49 -0800, Stephen Hemminger wrote:
> > > On Thu, 30 Nov 2017 09:26:39 -0800
> > > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > >   
> > > > On Thu, 2017-11-30 at 09:10 -0800, Stephen Hemminger wrote:  
> > > > > 
> > > > > 
> > > > > The problem goes back into the core GSO networking code.
> > > > > Something like this is needed.
> > > > > 
> > > > > static inline bool netif_needs_gso(struct sk_buff *skb,
> > > > > 				   const struct net_device
> > > > > *dev,
> > > > > 				   netdev_features_t features)
> > > > > {
> > > > > 	return skb_is_gso(skb) &&
> > > > > 		(!skb_gso_ok(skb, features) ||
> > > > > 		 unlikely(skb_shinfo(skb)->gso_segs > dev-    
> > > > > > gso_max_segs) ||  << new    
> > > > > 
> > > > > 		 unlikely(skb_shinfo(skb)->gso_size > dev-    
> > > > > > gso_max_size) ||  << new    
> > > > > 
> > > > > 		 unlikely((skb->ip_summed != CHECKSUM_PARTIAL)
> > > > > &&
> > > > > 			  (skb->ip_summed !=
> > > > > CHECKSUM_UNNECESSARY)));
> > > > > }
> > > > > 
> > > > > What that will do is split up the monster GSO packets if they
> > > > > ever
> > > > > bleed
> > > > > across from one device to another through the twisty mazes of
> > > > > packet
> > > > > processing paths.    
> > > > 
> > > > 
> > > > Since very few drivers have these gso_max_segs / gso_max_size,
> > > > check
> > > > could be done in their ndo_features_check()  
> > > 
> > > Actually, we already check for max_segs, just missing check for
> > > size
> > > here:
> > > 
> > > From 71a134f41c4aae8947241091300d21745aa237f2 Mon Sep 17 00:00:00
> > > 2001
> > > From: Stephen Hemminger <sthemmin@microsoft.com>
> > > Date: Thu, 30 Nov 2017 09:45:11 -0800
> > > Subject: [PATCH] net: do not GSO if frame is too large
> > > 
> > > This adds an additional check to breakup skb's that exceed a
> > > devices
> > > GSO maximum size. The code was already checking for too many
> > > segments
> > > but did not check size.
> > > 
> > > This has been observed to be a problem when using containers on
> > > Hyper-V/Azure where the allowed GSO maximum size is less than
> > > maximum and skb's have gone through multiple layers to arrive
> > > at the virtual device.
> > > 
> > > Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
> > > ---
> > >  net/core/dev.c | 4 +++-
> > >  1 file changed, 3 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/net/core/dev.c b/net/core/dev.c
> > > index 07ed21d64f92..0bb398f3bfa3 100644
> > > --- a/net/core/dev.c
> > > +++ b/net/core/dev.c
> > > @@ -2918,9 +2918,11 @@ static netdev_features_t
> > > gso_features_check(const struct sk_buff *skb,
> > >  					    struct net_device
> > > *dev,
> > >  					    netdev_features_t
> > > features)
> > >  {
> > > +	unsigned int gso_size = skb_shinfo(skb)->gso_size;
> > >  	u16 gso_segs = skb_shinfo(skb)->gso_segs;
> > >  
> > > -	if (gso_segs > dev->gso_max_segs)
> > > +	if (gso_segs > dev->gso_max_segs ||
> > > +	    gso_size > dev->gso_max_size)
> > >  		return features & ~NETIF_F_GSO_MASK;
> > >  
> > >  	/* Support for GSO partial features requires software  
> > 
> > 
> > Yes, but check commit 743b03a83297690f0bd38c452a3bbb47d2be300a
> > ("net: remove netdevice gso_min_segs")
> > 
> > Plan was to get rid of the existing check, not adding new ones :/
> 
> Sure can do it in the driver and that has other benefits like ability
> to backport to older distributions.
> 
> Still need gso_max_size though since want to tell TCP to avoid
> generating mega-jumbo frames.
> 

Sure, the netdev->gso_max_{size|segs} are staying.

I was simply trying not to add another check in the fast path :/

Thanks.
Stephen Hemminger Dec. 1, 2017, 8:30 p.m. UTC | #16
On Mon, 27 Nov 2017 19:02:01 -0700
David Ahern <dsahern@gmail.com> wrote:

> On 11/27/17 6:42 PM, Solio Sarabia wrote:
> > On Mon, Nov 27, 2017 at 01:15:02PM -0800, Stephen Hemminger wrote:  
> >> On Mon, 27 Nov 2017 12:14:19 -0800
> >> Solio Sarabia <solio.sarabia@intel.com> wrote:
> >>  
> >>> On Sun, Nov 26, 2017 at 11:07:25PM -0800, Stephen Hemminger wrote:  
> >>>> On Sun, 26 Nov 2017 20:13:39 -0700
> >>>> David Ahern <dsahern@gmail.com> wrote:
> >>>>     
> >>>>> On 11/26/17 11:17 AM, Stephen Hemminger wrote:    
> >>>>>> This allows veth device in containers to see the GSO maximum
> >>>>>> settings of the actual device being used for output.    
> >>>>>
> >>>>> veth devices can be added to a VRF instead of a bridge, and I do not
> >>>>> believe the gso propagation works for L3 master devices.
> >>>>>
> >>>>> From a quick grep, team devices do not appear to handle gso changes either.    
> >>>>
> >>>> This code should still work correctly, but no optimization would happen.
> >>>> The gso_max_size of the VRF or team will
> >>>> still be GSO_MAX_SIZE so there would be no change. If VRF or Team ever got smart
> >>>> enough to handle GSO limits, then the algorithm would handle it.    
> >>>
> >>> This patch propagates gso value from bridge to its veth endpoints.
> >>> However, since bridge is never aware of the GSO limit from underlying
> >>> interfaces, bridge/veth still have larger GSO size.
> >>>
> >>> In the docker case, bridge is not linked directly to physical or
> >>> synthetic interfaces; it relies on iptables to decide which interface to
> >>> forward packets to.  
> >>
> >> So for the docker case, then direct control of GSO values via netlink (ie ip link set)
> >> seems like the better solution.  
> > 
> > Adding ioctl support for 'ip link set' would work. I'm still concerned
> > how to enforce the upper limit to not exceed that of the lower devices.
> > 
> > Consider a system with three NICs, each reporting values in the range
> > [60,000 - 62,780]. Users could set virtual interfaces' gso to 65,536,
> > exceeding the limit, and having the host do sw gso (vms settings must
> > not affect host performance.)
> > 
> > Looping through interfaces?  With the difference that now it'd be
> > trigger upon user's request, not every time a veth is created (like one
> > previous patch discussed.)
> >   
> 
> You are concerned about the routed case right? One option is to have VRF
> devices propagate gso sizes to all devices (veth, vlan, etc) enslaved to
> it. VRF devices are Layer 3 master devices so an L3 parallel to a bridge.

See the patch set I posted today which punts the problem to veth setup.

Patch

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index f5438d0978ca..0c9ce156943b 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -511,17 +511,89 @@  static struct rtnl_link_ops veth_link_ops = {
 	.get_link_net	= veth_get_link_net,
 };
 
+/* When veth device is added to a bridge or other master device
+ * then reflect the GSO max values from the upper device
+ * to the other end of veth pair.
+ */
+static void veth_change_upper(struct net_device *dev,
+		      const struct netdev_notifier_changeupper_info *info)
+{
+	struct net_device *upper = info->upper_dev;
+	struct net_device *peer;
+	struct veth_priv *priv;
+
+	if (dev->netdev_ops != &veth_netdev_ops)
+		return;
+
+	priv = netdev_priv(dev);
+	peer = rtnl_dereference(priv->peer);
+	if (!peer)
+		return;
+
+	if (upper) {
+		peer->gso_max_segs = upper->gso_max_segs;
+		peer->gso_max_size = upper->gso_max_size;
+	} else {
+		peer->gso_max_segs = GSO_MAX_SEGS;
+		peer->gso_max_size = GSO_MAX_SIZE;
+	}
+}
+
+static void veth_change_upper_gso(struct net_device *upper)
+{
+	struct net_device *peer, *dev;
+	struct veth_priv *priv;
+
+	for_each_netdev(dev_net(upper), dev) {
+		if (dev->netdev_ops != &veth_netdev_ops)
+			continue;
+		if (!netdev_has_upper_dev(dev, upper))
+			continue;
+
+		priv = netdev_priv(dev);
+		peer = rtnl_dereference(priv->peer);
+		if (!peer)
+			continue;
+		peer->gso_max_segs = upper->gso_max_segs;
+		peer->gso_max_size = upper->gso_max_size;
+	}
+}
+
+static int veth_netdev_event(struct notifier_block *this,
+			     unsigned long event, void *ptr)
+{
+	struct net_device *event_dev = netdev_notifier_info_to_dev(ptr);
+
+	/* Propagate the upper (bridge) device settings to peer */
+	switch (event) {
+	case NETDEV_CHANGEUPPER:
+		veth_change_upper(event_dev, ptr);
+		break;
+	case NETDEV_CHANGE_GSO_MAX:
+		veth_change_upper_gso(event_dev);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block veth_netdev_notifier = {
+	.notifier_call = veth_netdev_event,
+};
+
 /*
  * init/fini
  */
 
 static __init int veth_init(void)
 {
+	register_netdevice_notifier(&veth_netdev_notifier);
 	return rtnl_link_register(&veth_link_ops);
 }
 
 static __exit void veth_exit(void)
 {
+	unregister_netdevice_notifier(&veth_netdev_notifier);
 	rtnl_link_unregister(&veth_link_ops);
 }