
net: Add ndo_gso_check

Message ID: 1411962607-27878-1-git-send-email-therbert@google.com
State: Changes Requested, archived
Delegated to: David Miller

Commit Message

Tom Herbert Sept. 29, 2014, 3:50 a.m. UTC
Add ndo_gso_check which a device can define to indicate whether it
is capable of doing GSO on a packet. This function would be called from
the stack to determine whether software GSO needs to be done. A
driver should populate this function if it advertises GSO types for
which there are combinations that it wouldn't be able to handle. For
instance, a device that performs UDP tunneling might only implement
support for transparent Ethernet bridging type of inner packets
or might have limitations on the lengths of inner headers.

Signed-off-by: Tom Herbert <therbert@google.com>
---
 include/linux/netdevice.h | 12 +++++++++++-
 net/core/dev.c            |  2 +-
 2 files changed, 12 insertions(+), 2 deletions(-)
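
For illustration, a driver advertising GSO for UDP tunnels might
populate the new hook along the lines of the sketch below; the mydev_*
names are hypothetical, and the policy shown (only segment UDP tunnels
whose inner frame is transparent Ethernet bridging, as VXLAN and NVGRE
produce) is just one possible policy:

/* Hypothetical driver-side check: refuse hardware GSO for UDP tunnel
 * packets whose inner frame is not transparent Ethernet bridging.
 */
static bool mydev_gso_check(struct sk_buff *skb, struct net_device *dev)
{
	if (!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL))
		return true;	/* no extra restrictions on other GSO types */

	return skb->inner_protocol_type == ENCAP_TYPE_ETHER &&
	       skb->inner_protocol == htons(ETH_P_TEB);
}

static const struct net_device_ops mydev_netdev_ops = {
	.ndo_start_xmit	= mydev_start_xmit,	/* hypothetical xmit routine */
	.ndo_gso_check	= mydev_gso_check,
};

When the hook returns false, netif_needs_gso() below reports that the
packet still needs segmentation and validate_xmit_skb() falls back to
software GSO.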

Comments

Or Gerlitz Sept. 29, 2014, 7:59 p.m. UTC | #1
On Mon, Sep 29, 2014 at 6:50 AM, Tom Herbert <therbert@google.com> wrote:
> Add ndo_gso_check which a device can define to indicate whether it
> is capable of doing GSO on a packet. This function would be called from
> the stack to determine whether software GSO needs to be done.

please, no...

We should strive to have a model/architecture under which the driver
can clearly advertise up something (bits, collection of bits,
whatever) which the stack can run through some decision-making code
to decide whether to GSO, and whether to do it in SW or in HW. As in
life, this model need not be perfect and can be biased towards being
simple/robust/conservative and developed incrementally.

We certainly don't want each driver that supports some sort of HW
offloads for tunnels (today we have 4-5, tomorrow maybe 40...) to
implement their super sophisticated/efficient "dig in a packet and
tell myself if I can GSO/CSUM it or not" code, while repeating (expect
many copy/paste bugs...) the other drivers' code.

That said, while looking quickly at the fm10k driver during its
super fast upstream review cycle, I saw something which looked like a
yellow light. I drove on, but now I see it was very much a red one:
the chain of calls

fm10k_xmit_frame_ring --> fm10k_tso -->
fm10k_tx_encap_offload --> {fm10k_port_is_vxlan, fm10k_port_is_nvgre}

is pretty much a live example of what you are describing here, right?

L2 drivers are not supposed to do such heavy-duty lookups in packets,
with tons of code that could be generic.

As a band-aid for the current situation -- non-VXLAN UDP offloads
enabled upstream and a few drivers that don't really support GSO/CSUM
for UDP tunnels which are not vxlan -- I would suggest a VXLAN bit
which tells the stack "I can do HW GSO for VXLAN", and have the stack
enable HW GSO on these devices only for VXLAN. This would bring the
upstream code into a consistent/well-defined state, and we can take it
from there.
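
A rough sketch of what this band-aid could look like (the bit names
below are hypothetical): gso_type bits already map one-to-one onto
NETIF_F_GSO_* feature bits, so skb_gso_ok() would automatically force
software GSO on devices that don't advertise the new bit.

/* Hypothetical VXLAN-specific GSO bits; net_gso_ok() computes the
 * feature bit as (gso_type << NETIF_F_GSO_SHIFT), so a device that
 * does not advertise NETIF_F_GSO_UDP_VXLAN fails skb_gso_ok() and
 * the stack segments the packet in software.
 */
#define SKB_GSO_UDP_VXLAN_TUNNEL (SKB_GSO_UDP_TUNNEL << 1) /* placement illustrative */
#define NETIF_F_GSO_UDP_VXLAN \
	((netdev_features_t)SKB_GSO_UDP_VXLAN_TUNNEL << NETIF_F_GSO_SHIFT)

The vxlan xmit path would then set SKB_GSO_UDP_VXLAN_TUNNEL on the skbs
it emits instead of the generic SKB_GSO_UDP_TUNNEL.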

Or.

See below for another comment.

> A driver should populate this function if it advertises GSO types for
> which there are combinations that it wouldn't be able to handle. For
> instance a device that performs UDP tunneling might only implement
> support for transparent Ethernet bridging type of inner packets
> or might have limitations on lengths of inner headers.
>
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---
>  include/linux/netdevice.h | 12 +++++++++++-
>  net/core/dev.c            |  2 +-
>  2 files changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 9f5d293..f8c2027 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -997,6 +997,12 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
>   *     Callback to use for xmit over the accelerated station. This
>   *     is used in place of ndo_start_xmit on accelerated net
>   *     devices.
> + * bool        (*ndo_gso_check) (struct sk_buff *skb,
> + *                       struct net_device *dev);
> + *     Called by core transmit path to determine if device is capable of
> + *     performing GSO on a packet. The device returns true if it is
> + *     able to GSO the packet, false otherwise. If the return value is
> + *     false the stack will do software GSO.
>   */
>  struct net_device_ops {
>         int                     (*ndo_init)(struct net_device *dev);
> @@ -1146,6 +1152,8 @@ struct net_device_ops {
>                                                         struct net_device *dev,
>                                                         void *priv);
>         int                     (*ndo_get_lock_subclass)(struct net_device *dev);
> +       bool                    (*ndo_gso_check) (struct sk_buff *skb,
> +                                                 struct net_device *dev);
>  };
>
>  /**
> @@ -3536,10 +3544,12 @@ static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features)
>                (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST));
>  }
>
> -static inline bool netif_needs_gso(struct sk_buff *skb,
> +static inline bool netif_needs_gso(struct net_device *dev, struct sk_buff *skb,
>                                    netdev_features_t features)
>  {
>         return skb_is_gso(skb) && (!skb_gso_ok(skb, features) ||
> +               (dev->netdev_ops->ndo_gso_check &&
> +                !dev->netdev_ops->ndo_gso_check(skb, dev)) ||
>                 unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
>                          (skb->ip_summed != CHECKSUM_UNNECESSARY)));
>  }
> diff --git a/net/core/dev.c b/net/core/dev.c
> index e2ced01..8c2b9bb 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2680,7 +2680,7 @@ struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
>         if (skb->encapsulation)
>                 features &= dev->hw_enc_features;
>
> -       if (netif_needs_gso(skb, features)) {
> +       if (netif_needs_gso(dev, skb, features)) {
>                 struct sk_buff *segs;
>
>                 segs = skb_gso_segment(skb, features);

Unrelated fix? Best to put it in a different patch.
David Miller Sept. 29, 2014, 8:12 p.m. UTC | #2
From: Or Gerlitz <gerlitz.or@gmail.com>
Date: Mon, 29 Sep 2014 22:59:39 +0300

> That said, while looking quickly at the fm10k driver during its
> super fast upstream review cycle,

I call bullshit.  fm10k had been posted for review starting around 10
days before I pulled it in from Jeff's tree.

If I let most patches rot that long in patchwork there'd be a crowd
with pitchforks approaching me.
Jesse Gross Sept. 29, 2014, 8:13 p.m. UTC | #3
On Sun, Sep 28, 2014 at 8:50 PM, Tom Herbert <therbert@google.com> wrote:
> Add ndo_gso_check which a device can define to indicate whether it
> is capable of doing GSO on a packet. This function would be called from
> the stack to determine whether software GSO needs to be done. A
> driver should populate this function if it advertises GSO types for
> which there are combinations that it wouldn't be able to handle. For
> instance, a device that performs UDP tunneling might only implement
> support for transparent Ethernet bridging type of inner packets
> or might have limitations on the lengths of inner headers.
>
> Signed-off-by: Tom Herbert <therbert@google.com>

Offloads that might have limitations extend beyond GSO - checksum is
another possibility that could need something like a length check.

In addition, while I would also like to be optimistic about the
capabilities of existing NICs, it's unlikely that any of them that are
advertising SKB_GSO_UDP_TUNNEL can actually do it with full
generality. So unless we can get the driver writers to chime in about
their capabilities (maybe we can, there's only a handful of them right
now), we probably need to provide a more conservative implementation
for those drivers. I guess I would probably check for a length equal
to VXLAN's, and perhaps an Ethernet inner packet, as a balance between
the most conservative and the most optimistic.
Tom Herbert Sept. 29, 2014, 8:47 p.m. UTC | #4
On Mon, Sep 29, 2014 at 1:13 PM, Jesse Gross <jesse@nicira.com> wrote:
> On Sun, Sep 28, 2014 at 8:50 PM, Tom Herbert <therbert@google.com> wrote:
>> Add ndo_gso_check which a device can define to indicate whether it
>> is capable of doing GSO on a packet. This function would be called from
>> the stack to determine whether software GSO needs to be done. A
>> driver should populate this function if it advertises GSO types for
>> which there are combinations that it wouldn't be able to handle. For
>> instance, a device that performs UDP tunneling might only implement
>> support for transparent Ethernet bridging type of inner packets
>> or might have limitations on the lengths of inner headers.
>>
>> Signed-off-by: Tom Herbert <therbert@google.com>
>
> Offloads that might have limitations extend beyond GSO - checksum is
> another possibility that could need something like a length check.
>
There are several examples of drivers that advertise checksum offload
but then in the TX path decide that they can't do it for various
reasons -- in that case they need to do the software checksum
calculation themselves. Applying this model to GSO, that is, drivers
doing software GSO when they need to punt on a packet, seems pretty
hard to me, but maybe someone can look at it.

> In addition, while I would also like to be optimistic about the
> capabilities of existing NICs, it's unlikely that any of them that are
> advertising SKB_GSO_UDP_TUNNEL can actually do it with full
> generality. So unless we can get the driver writers to chime in about
> their capabilities (maybe we can, there's only a handful of them right
> now), we probably need to provide a more conservative implementation
> for those drivers. I guess I would probably check for a length equal
> to VXLAN's, and perhaps an Ethernet inner packet, as a balance between
> the most conservative and the most optimistic.

Checking header length in ndo_gso_check would be trivial.
Tom Herbert Sept. 29, 2014, 8:53 p.m. UTC | #5
On Mon, Sep 29, 2014 at 12:59 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
> On Mon, Sep 29, 2014 at 6:50 AM, Tom Herbert <therbert@google.com> wrote:
>> Add ndo_gso_check which a device can define to indicate whether it
>> is capable of doing GSO on a packet. This function would be called from
>> the stack to determine whether software GSO needs to be done.
>
> please, no...
>
> We should strive to have a model/architecture under which the driver
> can clearly advertise up something (bits, collection of bits,
> whatever) which the stack can run through some decision-making code
> to decide whether to GSO, and whether to do it in SW or in HW. As in
> life, this model need not be perfect and can be biased towards being
> simple/robust/conservative and developed incrementally.
>
Please make a specific proposal then.

> We certainly don't want each driver that supports some sort of HW
> offloads for tunnels (today we have 4-5, tomorrow maybe 40...) to
> implement their super sophisticated/efficient "dig in a packet and
> tell myself if I can GSO/CSUM it or not" code, while repeating (expect
> many copy/paste bugs...) the other drivers' code.
>
I would encourage vendors to implement protocol-agnostic, generic
mechanisms so that they don't need to dig into the packet.

> [...]
Or Gerlitz Sept. 29, 2014, 9:10 p.m. UTC | #6
On Mon, Sep 29, 2014 at 11:53 PM, Tom Herbert <therbert@google.com> wrote:
> On Mon, Sep 29, 2014 at 12:59 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>> On Mon, Sep 29, 2014 at 6:50 AM, Tom Herbert <therbert@google.com> wrote:
>>> Add ndo_gso_check which a device can define to indicate whether it
>>> is capable of doing GSO on a packet. This function would be called from
>>> the stack to determine whether software GSO needs to be done.
>>
>> please, no...
>>
>> We should strive to have a model/architecture under which the driver
>> can clearly advertise up something (bits, collection of bits,
>> whatever) which the stack can run through some decision-making code
>> to decide whether to GSO, and whether to do it in SW or in HW. As in
>> life, this model need not be perfect and can be biased towards being
>> simple/robust/conservative and developed incrementally.
>>
> Please make a specific proposal then.

We 1st need to bring the system back into a consistent state, see my
band-aid proposal. BTW, this band-aid might turn out to be the basis for
the longer-term solution too. I admit that saying "no" is ten (100)
times harder vs. saying "let's do it this way", but IMHO the fm10k call
chain I pointed at is what you are suggesting, more or less, and is
a no-no.


>> We certainly don't want each driver that supports some sort of HW
>> offloads for tunnels (today we have 4-5, tomorrow maybe 40...) to
>> implement their super sophisticated/efficient "dig in a packet and
>> tell myself if I can GSO/CSUM it or not" code, while repeating (expect
>> many copy/paste bugs...) the other drivers' code.
>>
> I would encourage vendors to implement protocol-agnostic, generic
> mechanisms so that they don't need to dig into the packet.

Me too.

But wait, in Linux we support 20-year-old HW, and it seems we will
likely continue doing so for maybe the next 10 or 20 years - I recall
that one of the sessions in the coming Linux Con/KVM Forum/LPC
conference series deals with practices for testing kernel changes in
driver code for which the HW is totally out of stock. Do we even have
an EOL procedure for upstream HW drivers...?

So with this in mind, new NIC HW can (and should) definitely be
developed along design guidelines such as being fully generic and
simple. But the fact is that we have HW today, and more coming up,
which is not 100% in line with this vision, and their drivers are
upstream, and we want the kernel to be generic and hopefully use a
robust/simple SW model, right?

Or.
Tom Herbert Sept. 29, 2014, 9:38 p.m. UTC | #7
On Mon, Sep 29, 2014 at 2:10 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
> On Mon, Sep 29, 2014 at 11:53 PM, Tom Herbert <therbert@google.com> wrote:
>> On Mon, Sep 29, 2014 at 12:59 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>> On Mon, Sep 29, 2014 at 6:50 AM, Tom Herbert <therbert@google.com> wrote:
>>>> Add ndo_gso_check which a device can define to indicate whether it
>>>> is capable of doing GSO on a packet. This function would be called from
>>>> the stack to determine whether software GSO needs to be done.
>>>
>>> please, no...
>>>
>>> We should strive to have a model/architecture under which the driver
>>> can clearly advertise up something (bits, collection of bits,
>>> whatever) which the stack can run through some decision-making code
>>> to decide whether to GSO, and whether to do it in SW or in HW. As in
>>> life, this model need not be perfect and can be biased towards being
>>> simple/robust/conservative and developed incrementally.
>>>
>> Please make a specific proposal then.
>
> We 1st need to bring the system back into a consistent state, see my
> band-aid proposal. BTW, this band-aid might turn out to be the basis for
> the longer-term solution too. I admit that saying "no" is ten (100)
> times harder vs. saying "let's do it this way", but IMHO the fm10k call
> chain I pointed at is what you are suggesting, more or less, and is
> a no-no.
>
I'd much rather have drivers do this than inflict the stack with more
complexity. What you describe -- "the driver can clearly advertise up
something (bits, collection of bits, whatever) which the stack can run
through some decision-making code and decide whether to GSO" -- seems
very complex to me. My proposed alternative is to just ask the driver,
and it can implement whatever policy it wants; the stack doesn't
care about the specifics, it just needs an answer. Neither does this
necessarily mean that the driver needs to inspect the packet; for
instance, I suspect that just looking at inner header lengths and
skb->protocol being TEB would be a standard check to match VXLAN.

In any case, if you could formulate your proposal as a patch, that
would be very helpful.

> [...]
Jesse Gross Sept. 30, 2014, 12:33 a.m. UTC | #8
On Mon, Sep 29, 2014 at 1:47 PM, Tom Herbert <therbert@google.com> wrote:
> On Mon, Sep 29, 2014 at 1:13 PM, Jesse Gross <jesse@nicira.com> wrote:
>> On Sun, Sep 28, 2014 at 8:50 PM, Tom Herbert <therbert@google.com> wrote:
>>> [...]
>>
>> Offloads that might have limitations extend beyond GSO - checksum is
>> another possibility that could need something like a length check.
>>
> There are several examples of drivers that advertise checksum offload
> but then in the TX path decide that they can't do it for various
> reasons -- in that case they need to do the software checksum
> calculation themselves. Applying this model to GSO, that is, drivers
> doing software GSO when they need to punt on a packet, seems pretty
> hard to me, but maybe someone can look at it.

Yes, I definitely think that it's easier to have GSO done in the
stack. I agree with you that exposing various capabilities,
limitations, etc. for different features will get messy very quickly,
so I guess this model makes sense for now.

>> In addition, while I would also like to be optimistic about the
>> capabilities of existing NICs, it's unlikely that any of them that are
>> advertising SKB_GSO_UDP_TUNNEL can actually do it with full
>> generality. So unless we can get the driver writers to chime in about
>> their capabilities (maybe we can, there's only a handful of them right
>> now), we probably need to provide a more conservative implementation
>> for those drivers. I guess I would probably check for a length equal
>> to VXLAN's, and perhaps an Ethernet inner packet, as a balance between
>> the most conservative and the most optimistic.
>
> Checking header length in ndo_gso_check would be trivial.

I agree - I just think that we need to provide some implementations in
existing drivers to do this. Otherwise, as we introduce new protocols
they will probably be broken in some situations.
Tom Herbert Sept. 30, 2014, 1:59 a.m. UTC | #9
On Mon, Sep 29, 2014 at 5:33 PM, Jesse Gross <jesse@nicira.com> wrote:
> On Mon, Sep 29, 2014 at 1:47 PM, Tom Herbert <therbert@google.com> wrote:
>> On Mon, Sep 29, 2014 at 1:13 PM, Jesse Gross <jesse@nicira.com> wrote:
>>> On Sun, Sep 28, 2014 at 8:50 PM, Tom Herbert <therbert@google.com> wrote:
>>>> [...]
>>>
>>> Offloads that might have limitations extend beyond GSO - checksum is
>>> another possibility that could need something like a length check.
>>>
>> There are several examples of drivers that advertise checksum offload
>> but then in the TX path decide that they can't do it for various
>> reasons -- in that case they need to do the software checksum
>> calculation themselves. Applying this model to GSO, that is, drivers
>> doing software GSO when they need to punt on a packet, seems pretty
>> hard to me, but maybe someone can look at it.
>
> Yes, I definitely think that it's easier to have GSO done in the
> stack. I agree with you that exposing various capabilities,
> limitations, etc. for different features will get messy very quickly,
> so I guess this model makes sense for now.
>
>>> In addition, while I would also like to be optimistic about the
>>> capabilities of existing NICs, it's unlikely that any of them that are
>>> advertising SKB_GSO_UDP_TUNNEL can actually do it with full
>>> generality. So unless we can get the driver writers to chime in about
>>> their capabilities (maybe we can, there's only a handful of them right
>>> now), we probably need to provide a more conservative implementation
>>> for those drivers. I guess I would probably check for a length equal
>>> to VXLAN's, and perhaps an Ethernet inner packet, as a balance between
>>> the most conservative and the most optimistic.
>>
>> Checking header length in ndo_gso_check would be trivial.
>
> I agree - I just think that we need to provide some implementations in
> existing drivers to do this. Otherwise, as we introduce new protocols
> they will probably be broken in some situations.

A large part of the problem is that potential driver interfaces
(around GSO, checksum) are not clearly specified, and especially with
encapsulation the meanings may not be intuitively obvious. For
instance, there's not a single comment in skbuff.h about what any of
the GSO constants mean. I think this is something that we should
address!
Or Gerlitz Sept. 30, 2014, 2:30 p.m. UTC | #10
On Tue, Sep 30, 2014 at 12:38 AM, Tom Herbert <therbert@google.com> wrote:
> On Mon, Sep 29, 2014 at 2:10 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>> On Mon, Sep 29, 2014 at 11:53 PM, Tom Herbert <therbert@google.com> wrote:
>>> On Mon, Sep 29, 2014 at 12:59 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>> On Mon, Sep 29, 2014 at 6:50 AM, Tom Herbert <therbert@google.com> wrote:
>>>>> Add ndo_gso_check which a device can define to indicate whether it
>>>>> is capable of doing GSO on a packet. This function would be called from
>>>>> the stack to determine whether software GSO needs to be done.
>>>>
>>>> please, no...
>>>>
>>>> We should strive to have a model/architecture under which the driver
>>>> can clearly advertise up something (bits, collection of bits,
>>>> whatever) which the stack can run through some decision-making code
>>>> to decide whether to GSO, and whether to do it in SW or in HW. As in
>>>> life, this model need not be perfect and can be biased towards being
>>>> simple/robust/conservative and developed incrementally.

>>> Please make a specific proposal then.

OK

>> We 1st need to bring the system back into a consistent state, see my
>> band-aid proposal. BTW, this band-aid might turn out to be the basis for
>> the longer-term solution too. I admit that saying "no" is ten (100)
>> times harder vs. saying "let's do it this way", but IMHO the fm10k call
>> chain I pointed at is what you are suggesting, more or less, and is a no-no.

> I'd much rather have drivers do this than inflict the stack with more
> complexity. What you describe -- "the driver can clearly advertise up
> something (bits, collection of bits, whatever) which the stack can run
> through some decision-making code and decide whether to GSO" -- seems
> very complex to me. My proposed alternative is to just ask the driver,

I see the point you are trying to make, but

> and it can implement whatever policy it wants; the stack doesn't
> care about the specifics, it just needs an answer. Neither does this
> necessarily mean that the driver needs to inspect the packet; for
> instance, I suspect that just looking at inner header lengths and
> skb->protocol being TEB would be a standard check to match VXLAN.

I'm not sure how exactly this (inner protocol being Ethernet and inner
header lengths) is going to work to differentiate between VXLAN and
NVGRE (or at least the GRE-ing done by OVS on guest Ethernet frames).

> In any case, if you could formulate your proposal as a patch, that
> would be very helpful.

A quick idea is the following:

It's common that when someone along the stack (e.g. OVS vxlan/gre
datapath logic) encapsulates a packet, they know what sort of
encapsulation they are doing.

So the encapsulating entity can color the packet skb, and the driver
would advertise which colors (UDP encap types) it can do GSO for. When
we come to the point where the stack has to decide whether to go for SW
or HW GSO, it attempts to match the colors.

Commit 6a93cc9052748c6355ec9d5b6c38b77f85f1cb0d ("udp-tunnel: Add a few
more UDP tunnel APIs") added an encap_type field to sockets, and commit
acbf74a763002bdc74ccfcdac22360bf18e305c5 ("vxlan: Refactor vxlan driver
to make use of the common UDP tunnel functions") sets encap_type for
vxlan sockets. We can apply the same idea to packets sent by specific
UDP encapsulation drivers.
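
To sketch the idea (the field and value names here are hypothetical,
just to illustrate the matching):

/* Hypothetical "color" set on the skb by the encapsulating entity. */
enum skb_encap_color {
	SKB_ENCAP_NONE = 0,
	SKB_ENCAP_VXLAN,
	SKB_ENCAP_NVGRE,
	SKB_ENCAP_GUE,
};

/* The encapsulator (e.g. the vxlan xmit path) marks the packet... */
static inline void skb_set_encap_color(struct sk_buff *skb,
				       enum skb_encap_color color)
{
	skb->encap_color = color;		/* hypothetical skb field */
}

/* ...and the stack matches colors when deciding between SW and HW GSO. */
static inline bool dev_encap_color_ok(const struct net_device *dev,
				      const struct sk_buff *skb)
{
	if (skb->encap_color == SKB_ENCAP_NONE)
		return true;	/* nothing encapsulated, nothing to match */

	/* gso_encap_colors: hypothetical bitmask the driver advertises */
	return dev->gso_encap_colors & BIT(skb->encap_color);
}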

Or.
Tom Herbert Sept. 30, 2014, 3:34 p.m. UTC | #11
On Tue, Sep 30, 2014 at 7:30 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
> [...]
>
> I'm not sure how exactly this (inner protocol being Ethernet and inner
> header lengths) is going to work to differentiate between VXLAN and
> NVGRE (or at least the GRE-ing done by OVS on guest Ethernet frames).
>
GSO processing for VXLAN and NVGRE should be identical. They both have
a four byte header that needs to be copied per packet and both only
carry Ethernet frames.

>> In any case, if you could formulate your proposal as a patch, that
>> would be very helpful.
>
> A quick idea is the following:
>
> It's common that when someone along the stack (e.g. OVS vxlan/gre
> datapath logic) encapsulates a packet, they know what sort of
> encapsulation they are doing.
>
> So the encapsulating entity can color the packet skb, and the driver
> would advertise which colors (UDP encap types) it can do GSO for. When
> we come to the point where the stack has to decide whether to go for SW
> or HW GSO, it attempts to match the colors.
>
This would be equivalent to adding more protocol-specific GSO feature
bits. I still don't see how this will scale. The number of protocols
that we might want to encapsulate over UDP is vast -- even before FOU
adds the possibility of encapsulating any IP protocol in UDP. And, as
already pointed out, devices might have other arbitrary limitations,
such as the length of inner headers, that wouldn't easily be
represented in features.

Also, this does not benefit the stack or drivers that already support
the generic SKB_GSO_UDP_TUNNEL mechanism.

Would any other driver maintainers like to chime in on this?

Thanks,
Tom

Stephen Hemminger Sept. 30, 2014, 9:37 p.m. UTC | #12
On Tue, 30 Sep 2014 08:34:14 -0700
Tom Herbert <therbert@google.com> wrote:

> [...]
> Would any other driver maintainers like to chime in on this?

I prefer the simplistic "yes I can do GSO" flag and let the
driver do software GSO for the cases where it detects "no, never mind
that, I can't do GSO on a Q-in-Q VLAN with NVGRE".

Software GSO at driver level is sometimes better anyway because it means
the driver can enqueue a burst of packets into the Tx ring.
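
Roughly, such a fallback in the driver's xmit path could look like the
sketch below; the mydev_* functions are placeholders, and a real driver
also has to cope with the TX ring filling up mid-burst:

/* Sketch: punt to software GSO inside the driver when the hardware
 * cannot segment this particular packet.
 */
static netdev_tx_t mydev_start_xmit(struct sk_buff *skb,
				    struct net_device *dev)
{
	if (skb_is_gso(skb) && !mydev_hw_can_gso(skb)) { /* placeholder check */
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, dev->features & ~NETIF_F_GSO_MASK);
		if (IS_ERR(segs)) {
			dev_kfree_skb_any(skb);
			return NETDEV_TX_OK;
		}
		consume_skb(skb);

		while (segs) {
			struct sk_buff *seg = segs;

			segs = segs->next;
			seg->next = NULL;
			mydev_queue_one(seg, dev); /* burst into the TX ring */
		}
		return NETDEV_TX_OK;
	}

	return mydev_queue_one(skb, dev);
}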
Eric Dumazet Sept. 30, 2014, 10:11 p.m. UTC | #13
On Tue, 2014-09-30 at 14:37 -0700, Stephen Hemminger wrote:

> Software GSO at driver level is sometimes better anyway because it means
> the driver can enqueue a burst of packets into the Tx ring.

Yes, and it can save a lot of churn (allocating sk_buff, copying ~256
bytes, bunch of atomic ops...)



Tom Herbert Oct. 1, 2014, 12:05 a.m. UTC | #14
On Tue, Sep 30, 2014 at 2:37 PM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
>
> [...]
>
> I prefer the simplistic "yes I can do GSO" flag and let the
> driver do software GSO for the cases where it detects "no, never mind
> that, I can't do GSO on a Q-in-Q VLAN with NVGRE".
>
That would be nice since it entails no change to the stack and follows
the same model used for checksum (I couldn't find any drivers that call
skb_mac_gso_segment right now). Though the driver might need to handle
the case where it is unable to queue all the segments created.

> Software GSO at driver level is sometimes better anyway because it means
> the driver can enqueue a burst of packets into the Tx ring.


Will this still be an advantage with the burst TX implementation?
Eric Dumazet Oct. 1, 2014, 12:18 a.m. UTC | #15
On Tue, 2014-09-30 at 17:05 -0700, Tom Herbert wrote:

> 
> Will this still be an advantage with the burst TX implementation?

Segmenting a 64K packet into 45 sk_buffs adds an additional 90
allocations (45 sk_buff structs, 45 skb->head buffers) and dirties a
_lot_ of memory.

We now have some helpers that might be used by drivers to perform TSO
emulation.

commit e876f208af18b074f800656e4d1b99da75b2135f ("net: Add a software
TSO helper API")

Check commit 3ae8f4e0b98b640aadf410c21185ccb6b5b02351 for reference
("net: mv643xx_eth: Implement software TSO")



Eric Dumazet Oct. 1, 2014, 6:12 a.m. UTC | #16
On Tue, 2014-09-30 at 17:18 -0700, Eric Dumazet wrote:
> On Tue, 2014-09-30 at 17:05 -0700, Tom Herbert wrote:
> 
> > 
> > Will this still be an advantage with the burst TX implementation?
> 
> Segmenting a 64K packet into 45 sk_buffs adds an additional 90
> allocations (45 sk_buff structs, 45 skb->head buffers) and dirties a
> _lot_ of memory.
> 
> We now have some helpers that might be used by drivers to perform TSO
> emulation.
> 
> commit e876f208af18b074f800656e4d1b99da75b2135f ("net: Add a software
> TSO helper API")
> 
> Check commit 3ae8f4e0b98b640aadf410c21185ccb6b5b02351 for reference
> ("net: mv643xx_eth: Implement software TSO")

BTW, even for TSO-capable NICs, it seems TSO can be slower for small
packets (like 2-MSS packets) on some NICs at least...

In this case forcing segmentation can be a win (it consumes more cpu
cycles, but reduces the stress on the NIC).

(Adding an extra check in netif_skb_features() would do as well, I
suppose.)

+       if (skb_shinfo(skb)->gso_segs < skb->dev->gso_min_segs)
+               features &= ~NETIF_F_GSO_MASK;
 


Or Gerlitz Oct. 1, 2014, 8:58 p.m. UTC | #17
On Tue, Sep 30, 2014 at 6:34 PM, Tom Herbert <therbert@google.com> wrote:
> On Tue, Sep 30, 2014 at 7:30 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>> On Tue, Sep 30, 2014 at 12:38 AM, Tom Herbert <therbert@google.com> wrote:
>>> On Mon, Sep 29, 2014 at 2:10 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>> On Mon, Sep 29, 2014 at 11:53 PM, Tom Herbert <therbert@google.com> wrote:
>>>>> On Mon, Sep 29, 2014 at 12:59 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>>> On Mon, Sep 29, 2014 at 6:50 AM, Tom Herbert <therbert@google.com> wrote:

>> I'm not sure how exactly this (inner protocol being Ethernet and inner
>> header lengths) is going to work to differentiate between VXLAN and
>> NVGRE (or at least the GRE-ing done by OVS on guest Ethernet frames).

> GSO processing for VXLAN and NVGRE should be identical. They both have
> a four byte header that needs to be copied per packet and both only
> carry Ethernet frames.

I know, and indeed in the mlx4 case the HW (ConnectX3-pro NIC) that
supports VXLAN offloads also supports NVGRE, but generally speaking I
can't commit for other vendors' HW/drivers which support VXLAN today.


> This would be equivalent to adding more protocol-specific GSO feature
> bits. I still don't see how this will scale. The number of protocols
> that we might want to encapsulate over UDP is vast -- even before FOU
> adds the possibility of encapsulating any IP protocol in UDP. And, as
> already pointed out, devices might have other arbitrary limitations,
> such as the length of inner headers, that wouldn't easily be
> represented in features.


> Also, this does not benefit the stack or drivers that already support
> the generic SKB_GSO_UDP_TUNNEL mechanism.

But again, we have 4-5 drivers that are

1. upstream
2. support GSO offloading under VXLAN
3. advertise GSO_UDP_TUNNEL

As a starting point to stabilize the system (3.18, which has FOU and
friends, with the merge window coming up while we are at LPC) we can

1. add a GSO_UDP_VXLAN_TUNNEL type

2. ask each maintainer to decide if they want to advertise the generic
tunnel or only the vxlan-specific one

3. color udp tunneled skb's with the tunnel type being vxlan or something else

and have today's code that decides whether to do SW or HW GSO choose
according to the skb color and the driver-posted value.

> Would any other driver maintainers like to chime in on this?

Alex? John?
Jesse Gross Oct. 1, 2014, 9:12 p.m. UTC | #18
On Wed, Oct 1, 2014 at 1:58 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>> Also, this does not benefit the stack or drivers that already support
>> the generic SKB_GSO_UDP_TUNNEL mechanism.
>
> But again, we have 4-5 drivers that are
>
> 1. upstream
> 2. support GSO offloading under VXLAN
> 3. advertise GSO_UDP_TUNNEL
>
> As a starting point to stabilize the system (3.18, which has FOU and
> friends, with the merge window coming up while we are at LPC) we can
>
> 1. add a GSO_UDP_VXLAN_TUNNEL type
>
> 2. ask each maintainer to decide if they want to advertise the generic
> tunnel or only the vxlan-specific one
>
> 3. color udp tunneled skb's with the tunnel type being vxlan or something else

There is no point in doing this. We should create a generic mechanism
that allows for drivers to assert some restrictions. For existing
drivers we can just provide an implementation of restrictions that is
pretty conservative and ask driver writers to take a look and loosen
it if possible. This is both cleaner and just as safe.
Tom Herbert Oct. 1, 2014, 11:06 p.m. UTC | #19
On Wed, Oct 1, 2014 at 1:58 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
> On Tue, Sep 30, 2014 at 6:34 PM, Tom Herbert <therbert@google.com> wrote:
>> On Tue, Sep 30, 2014 at 7:30 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>> On Tue, Sep 30, 2014 at 12:38 AM, Tom Herbert <therbert@google.com> wrote:
>>>> On Mon, Sep 29, 2014 at 2:10 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>> On Mon, Sep 29, 2014 at 11:53 PM, Tom Herbert <therbert@google.com> wrote:
>>>>>> On Mon, Sep 29, 2014 at 12:59 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>>>> On Mon, Sep 29, 2014 at 6:50 AM, Tom Herbert <therbert@google.com> wrote:
>
> [...]
>
> But again, we have 4-5 drivers that are
>
> 1. upstream
> 2. support GSO offloading under VXLAN
> 3. advertise GSO_UDP_TUNNEL
>
> As a starting point to stabilize the system (3.18, which has FOU and
> friends, with the merge window coming up while we are at LPC) we can
>
> 1. add a GSO_UDP_VXLAN_TUNNEL type
>
> 2. ask each maintainer to decide if they want to advertise the generic
> tunnel or only the vxlan-specific one
>
> 3. color udp tunneled skb's with the tunnel type being vxlan or something else
>
> and have today's code that decides whether to do SW or HW GSO choose
> according to the skb color and the driver-posted value.
>
Solution #4: apply this patch and implement the check functions as
needed in those 4 or 5 drivers. If a device can only do VXLAN/NVGRE
then I believe the check function is something like:

bool mydev_gso_check(struct sk_buff *skb, struct net_device *dev)
{
        if ((skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) &&
            (skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
             skb->protocol != htons(ETH_P_TEB) ||
             skb_inner_mac_header(skb) - skb_transport_header(skb) != 12))
                return false;

        return true;
}

>> Would any other driver maintainers like to chime in on this?
>
> Alex? John?
Or Gerlitz Oct. 5, 2014, 2:04 p.m. UTC | #20
On Thu, Oct 2, 2014 at 2:06 AM, Tom Herbert <therbert@google.com> wrote:
> On Wed, Oct 1, 2014 at 1:58 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>> On Tue, Sep 30, 2014 at 6:34 PM, Tom Herbert <therbert@google.com> wrote:
>>> On Tue, Sep 30, 2014 at 7:30 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
[...]
> Solution #4: apply this patch and implement the check functions as
> needed in those 4 or 5 drivers. If a device can only do VXLAN/NVGRE
> then I believe the check function is something like:
>
> bool mydev_gso_check(struct sk_buff *skb, struct net_device *dev)
> {
>         if ((skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) &&
>             (skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
>              skb->protocol != htons(ETH_P_TEB) ||
>              skb_inner_mac_header(skb) - skb_transport_header(skb) != 12))
>                 return false;
>
>         return true;
> }

Yep, such a helper can basically be made to work and lets the 4-5
drivers that can do GSO offloading for VXLAN but not for any FOU/GUE
packets signal that to the stack.

Re the 12 constant, were you referring to the UDP+VXLAN headers? It's 8+8 = 16.
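
To make the arithmetic explicit, a minimal sketch of deriving that
constant from the header structs instead of open-coding it (the macro
name here is only illustrative):

#include <linux/udp.h>
#include <net/vxlan.h>

/* Outer UDP header (8 bytes) plus VXLAN header (8 bytes) == 16 */
#define MYDEV_VXLAN_ENCAP_HLEN \
        (sizeof(struct udphdr) + sizeof(struct vxlanhdr))

so the helper above would compare skb_inner_mac_header(skb) -
skb_transport_header(skb) against MYDEV_VXLAN_ENCAP_HLEN.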

Also, we need a way for drivers that can support VXLAN or NVGRE, but
not concurrently on the same port @ the same time, to let only VXLAN
packets pass successfully through the helper.

There was that coloring scheme I suggested, but you didn't like it...

>>> Would any other driver maintainers like to chime in on this?
>> Alex? John?


Or.
Tom Herbert Oct. 5, 2014, 6:49 p.m. UTC | #21
On Sun, Oct 5, 2014 at 7:04 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
> On Thu, Oct 2, 2014 at 2:06 AM, Tom Herbert <therbert@google.com> wrote:
>> On Wed, Oct 1, 2014 at 1:58 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>> On Tue, Sep 30, 2014 at 6:34 PM, Tom Herbert <therbert@google.com> wrote:
>>>> On Tue, Sep 30, 2014 at 7:30 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
> [...]
>> Solution #4: apply this patch and implement the check functions as
>> needed in those 4 or 5 drivers. If a device can only do VXLAN/NVGRE
>> then I believe the check function is something like:
>>
>> bool mydev_gso_check(struct sk_buff *skb, struct net_device *dev)
>> {
>>         if ((skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) &&
>>             (skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
>>              skb->protocol != htons(ETH_P_TEB) ||
>>              skb_inner_mac_header(skb) - skb_transport_header(skb) != 12))
>>                 return false;
>>
>>         return true;
>> }
>
> Yep, such a helper can basically be made to work and lets the 4-5
> drivers that can do GSO offloading for VXLAN but not for any FOU/GUE
> packets signal that to the stack.
>
> Re the 12 constant, were you referring to the UDP+VXLAN headers? It's 8+8 = 16.
>
> Also, we need a way for drivers that can support VXLAN or NVGRE, but
> not concurrently on the same port @ the same time, to let only VXLAN
> packets pass successfully through the helper.
>
Or, there should be no difference in GSO processing between VXLAN and
NVGRE. Can you explain why you feel you need to differentiate them for
GSO?

Thanks,
Tom

> There was that coloring scheme I suggested, but you didn't like it...
>
>>>> Would any other driver maintainers like to chime in on this?
>>> Alex? John?
>
>
> Or.
Or Gerlitz Oct. 5, 2014, 7:13 p.m. UTC | #22
On Sun, Oct 5, 2014 at 9:49 PM, Tom Herbert <therbert@google.com> wrote:
> On Sun, Oct 5, 2014 at 7:04 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>> On Thu, Oct 2, 2014 at 2:06 AM, Tom Herbert <therbert@google.com> wrote:
>>> On Wed, Oct 1, 2014 at 1:58 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>> On Tue, Sep 30, 2014 at 6:34 PM, Tom Herbert <therbert@google.com> wrote:
>>>>> On Tue, Sep 30, 2014 at 7:30 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>> [...]
>>> Solution #4: apply this patch and implement the check functions as
>>> needed in those 4 or 5 drivers. If a device can only do VXLAN/NVGRE
>>> then I believe the check function is something like:
>>>
>>> bool mydev_gso_check(struct sk_buff *skb, struct net_device *dev)
>>> {
>>>         if ((skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) &&
>>>             (skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
>>>              skb->protocol != htons(ETH_P_TEB) ||
>>>              skb_inner_mac_header(skb) - skb_transport_header(skb) != 12))
>>>                 return false;
>>>
>>>         return true;
>>> }
>>
>> Yep, such a helper can basically be made to work and lets the 4-5
>> drivers that can do GSO offloading for VXLAN but not for any FOU/GUE
>> packets signal that to the stack.
>>
>> Re the 12 constant, were you referring to the UDP+VXLAN headers? It's 8+8 = 16.
>>
>> Also, we need a way for drivers that can support VXLAN or NVGRE, but
>> not concurrently on the same port @ the same time, to let only VXLAN
>> packets pass successfully through the helper.

> Or, there should be no difference in GSO processing between VXLAN and
> NVGRE. Can you explain why you feel you need to differentiate them for GSO?


RX wise, Linux tells the driver that UDP port X would be used for
VXLAN, right? And indeed, it's possible for some HW implementations
not to support RX offloading (checksum) for both VXLAN and NVGRE @ the
same time over the same port. But TX/GRO wise, you're probably
correct. The thing is that from the user POV they need a solution that
works for both RX and TX offloading.
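
As a hedged sketch of that RX-side flow -- the hook name and signature
follow the existing ndo_add_vxlan_port, while the "mydev" helpers are
made up:

static void mydev_add_vxlan_port(struct net_device *dev,
                                 sa_family_t sa_family, __be16 port)
{
        struct mydev_priv *priv = netdev_priv(dev);

        /* Program the HW parser: UDP datagrams to this destination
         * port are VXLAN, so inner checksums can be validated and
         * reported as CHECKSUM_UNNECESSARY on receive.
         */
        mydev_hw_set_vxlan_port(priv, be16_to_cpu(port));
}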
Tom Herbert Oct. 6, 2014, 5:59 p.m. UTC | #23
On Sun, Oct 5, 2014 at 12:13 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
> On Sun, Oct 5, 2014 at 9:49 PM, Tom Herbert <therbert@google.com> wrote:
>> On Sun, Oct 5, 2014 at 7:04 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>> On Thu, Oct 2, 2014 at 2:06 AM, Tom Herbert <therbert@google.com> wrote:
>>>> On Wed, Oct 1, 2014 at 1:58 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>> On Tue, Sep 30, 2014 at 6:34 PM, Tom Herbert <therbert@google.com> wrote:
>>>>>> On Tue, Sep 30, 2014 at 7:30 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>> [...]
>>>> Solution #4: apply this patch and implement the check functions as
>>>> needed in those 4 or 5 drivers. If a device can only do VXLAN/NVGRE
>>>> then I believe the check function is something like:
>>>>
>>>> bool mydev_gso_check(struct sk_buff *skb, struct net_device *dev)
>>>> {
>>>>         if ((skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) &&
>>>>             (skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
>>>>              skb->protocol != htons(ETH_P_TEB) ||
>>>>              skb_inner_mac_header(skb) - skb_transport_header(skb) != 12))
>>>>                 return false;
>>>>
>>>>         return true;
>>>> }
>>>
>>> Yep, such a helper can basically be made to work and lets the 4-5
>>> drivers that can do GSO offloading for VXLAN but not for any FOU/GUE
>>> packets signal that to the stack.
>>>
>>> Re the 12 constant, were you referring to the UDP+VXLAN headers? It's 8+8 = 16.
>>>
>>> Also, we need a way for drivers that can support VXLAN or NVGRE, but
>>> not concurrently on the same port @ the same time, to let only VXLAN
>>> packets pass successfully through the helper.
>
>> Or, there should be no difference in GSO processing between VXLAN and
>> NVGRE. Can you explain why you feel you need to differentiate them for GSO?
>
>
> RX wise, Linux tells the driver that UDP port X would be used for
> VXLAN, right? And indeed, it's possible for some HW implementations
> not to support RX offloading (checksum) for both VXLAN and NVGRE @ the
> same time over the same port. But TX/GRO wise, you're probably
> correct. The thing is that from the user POV they need a solution that
> works for both RX and TX offloading.

I think from a user POV we want a solution that supports RX and TX
offloading across the widest range of protocols. This is accomplished
by implementing protocol agnostic mechanisms like CHECKSUM_COMPLETE
and protocol agnostic UDP tunnel TSO like we've described. IMO, the
fact that we have devices that implement protocol specific mechanisms
for NVGRE and VXLAN should be considered legacy support in the stack;
for new UDP encapsulation protocols we should not expose specifics in
the stack, either by adding a GSO type for each protocol or an
ndo_add_foo_port for each protocol-- these things will not scale and
unnecessarily complicate the core stack.
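
As a sketch of what the protocol-agnostic RX model asks of a driver
(the hw_csum value would come from a hypothetical descriptor field; the
point is that no tunnel-header parsing is needed):

static void mydev_rx_checksum(struct sk_buff *skb, u16 hw_csum)
{
        /* hw_csum: one's-complement sum over the whole packet as
         * computed by the device. The stack can verify outer and
         * inner checksums from this without the NIC knowing the
         * encapsulation protocol.
         */
        skb->csum = (__force __wsum)hw_csum;
        skb->ip_summed = CHECKSUM_COMPLETE;
}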
Or Gerlitz Oct. 6, 2014, 8:22 p.m. UTC | #24
On Mon, Oct 6, 2014 at 8:59 PM, Tom Herbert <therbert@google.com> wrote:
> On Sun, Oct 5, 2014 at 12:13 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>> RX wise, Linux tells the driver that UDP port X would be used for
>> VXLAN, right? And indeed, it's possible for some HW implementations
>> not to support RX offloading (checksum) for both VXLAN and NVGRE @ the
>> same time over the same port. But TX/GRO wise, you're probably
>> correct. The thing is that from the user POV they need a solution that
>> works for both RX and TX offloading.

> I think from a user POV we want a solution that supports RX and TX
> offloading across the widest range of protocols. This is accomplished
> by implementing protocol agnostic mechanisms like CHECKSUM_COMPLETE
> and protocol agnostic UDP tunnel TSO like we've described. IMO, the
> fact that we have devices that implement protocol specific mechanisms
> for NVGRE and VXLAN should be considered legacy support in the stack;
> for new UDP encapsulation protocols we should not expose specifics in
> the stack, either by adding a GSO type for each protocol or an
> ndo_add_foo_port for each protocol-- these things will not scale and
> unnecessarily complicate the core stack.

I tend to generally agree with the wind that blows from your writeup, namely:

UDP encapsulation offloads wise, we should pose a few general
requirements to NICs, to be implemented by vendors in tomorrow's HW,
and treat the current generation (these 4-5 drivers with their
limitations) as legacy which should be supported but should not
dictate the stack's overall design.

Still, we should seek more ways to reduce the pain/amount of
not-well-defined configurations while these drivers are there and the
stack goes through these upside-down turnaround changes. OTOH you
didn't accept my SKB coloring suggestion for GSO inspection, and OTOH
I guess we can live with some sort of generic helper in the form of
what you suggested, but like it or not, getting rid of
ndo_add_vxlan_port will simply break things.

Are we going to have a session on the encapsulation/offloads design @ LPC?

I think a replay of your LKS presentation along with open discussion
on how to get there with the legacy requirements could be very
helpful.


Or.
Tom Herbert Oct. 6, 2014, 9:28 p.m. UTC | #25
On Mon, Oct 6, 2014 at 1:22 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
> On Mon, Oct 6, 2014 at 8:59 PM, Tom Herbert <therbert@google.com> wrote:
>> On Sun, Oct 5, 2014 at 12:13 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>> RX wise, Linux tells the driver that UDP port X would be used for
>>> VXLAN, right? And indeed, it's possible for some HW implementations
>>> not to support RX offloading (checksum) for both VXLAN and NVGRE @ the
>>> same time over the same port. But TX/GRO wise, you're probably
>>> correct. The thing is that from the user POV they need a solution that
>>> works for both RX and TX offloading.
>
>> I think from a user POV we want a solution that supports RX and TX
>> offloading across the widest range of protocols. This is accomplished
>> by implementing protocol agnostic mechanisms like CHECKSUM_COMPLETE
>> and protocol agnostic UDP tunnel TSO like we've described. IMO, the
>> fact that we have devices that implement protocol specific mechanisms
>> for NVGRE and VXLAN should be considered legacy support in the stack;
>> for new UDP encapsulation protocols we should not expose specifics in
>> the stack, either by adding a GSO type for each protocol or an
>> ndo_add_foo_port for each protocol-- these things will not scale and
>> unnecessarily complicate the core stack.
>
> I tend to generally agree with the wind that blows from your writeup, namely:
>
> UDP encapsulation offloads wise, we should pose a few general
> requirements to NICs, to be implemented by vendors in tomorrow's HW,
> and treat the current generation (these 4-5 drivers with their
> limitations) as legacy which should be supported but should not
> dictate the stack's overall design.
>
> Still, we should seek more ways to reduce the pain/amount of
> not-well-defined configurations while these drivers are there and the
> stack goes through these upside-down turnaround changes. OTOH you
> didn't accept my SKB coloring suggestion for GSO inspection, and OTOH
> I guess we can live with some sort of generic helper in the form of
> what you suggested, but like it or not, getting rid of
> ndo_add_vxlan_port will simply break things.
>
> Are we going to have a session on the encapsulation/offloads design @ LPC?
>
Yes, I will talk about the FOU and GUE implementation. You should
see the abstracts in the schedule now.

> I think a replay of your LKS presentation along with open discussion
> on how to get there with the legacy requirements could be very
> helpful.
>
>
> Or.
Jesse Gross Oct. 6, 2014, 9:51 p.m. UTC | #26
On Sun, Oct 5, 2014 at 11:49 AM, Tom Herbert <therbert@google.com> wrote:
> On Sun, Oct 5, 2014 at 7:04 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>> On Thu, Oct 2, 2014 at 2:06 AM, Tom Herbert <therbert@google.com> wrote:
>>> On Wed, Oct 1, 2014 at 1:58 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>> On Tue, Sep 30, 2014 at 6:34 PM, Tom Herbert <therbert@google.com> wrote:
>>>>> On Tue, Sep 30, 2014 at 7:30 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>> [...]
>>> Solution #4: apply this patch and implement the check functions as
>>> needed in those 4 or 5 drivers. If a device can only do VXLAN/NVGRE
>>> then I believe the check function is something like:
>>>
>>> bool mydev_gso_check(struct sk_buff *skb, struct net_device *dev)
>>> {
>>>         if ((skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) &&
>>>             (skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
>>>              skb->protocol != htons(ETH_P_TEB) ||
>>>              skb_inner_mac_header(skb) - skb_transport_header(skb) != 12))
>>>                 return false;
>>>
>>>         return true;
>>> }
>>
>> Yep, such a helper can basically be made to work and lets the 4-5
>> drivers that can do GSO offloading for VXLAN but not for any FOU/GUE
>> packets signal that to the stack.
>>
>> Re the 12 constant, were you referring to the UDP+VXLAN headers? It's 8+8 = 16.
>>
>> Also, we need a way for drivers that can support VXLAN or NVGRE, but
>> not concurrently on the same port @ the same time, to let only VXLAN
>> packets pass successfully through the helper.
>>
> Or, there should be no difference in GSO processing between VXLAN and
> NVGRE. Can you explain why you feel you need to differentiate them for
> GSO?

There is a difference in the processing that needs to happen for VXLAN
and GRE, even on transmit: at a minimum, the length field in the UDP
header needs to be updated for VXLAN. These are already broken out in
the stack between GRE and UDP tunnels anyway, though.
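
For reference, a sketch of that per-segment fixup, along the lines of
what skb_udp_tunnel_segment() already does (udp_offset being the offset
of the outer UDP header within each segment; the function name is
illustrative):

#include <linux/skbuff.h>
#include <linux/udp.h>

static void fixup_outer_udp_len(struct sk_buff *segs, unsigned int udp_offset)
{
        struct sk_buff *skb;

        for (skb = segs; skb; skb = skb->next) {
                struct udphdr *uh =
                        (struct udphdr *)(skb->data + udp_offset);

                /* Each segment's outer UDP length must be rewritten;
                 * GRE has no equivalent field to update.
                 */
                uh->len = htons(skb->len - udp_offset);
        }
}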
Jesse Gross Oct. 6, 2014, 10:33 p.m. UTC | #27
On Mon, Oct 6, 2014 at 10:59 AM, Tom Herbert <therbert@google.com> wrote:
> On Sun, Oct 5, 2014 at 12:13 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>> On Sun, Oct 5, 2014 at 9:49 PM, Tom Herbert <therbert@google.com> wrote:
>>> On Sun, Oct 5, 2014 at 7:04 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>> On Thu, Oct 2, 2014 at 2:06 AM, Tom Herbert <therbert@google.com> wrote:
>>>>> On Wed, Oct 1, 2014 at 1:58 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>>> On Tue, Sep 30, 2014 at 6:34 PM, Tom Herbert <therbert@google.com> wrote:
>>>>>>> On Tue, Sep 30, 2014 at 7:30 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>> [...]
>>>>> Solution #4: apply this patch and implement the check functions as
>>>>> needed in those 4 or 5 drivers. If a device can only do VXLAN/NVGRE
>>>>> then I believe the check function is something like:
>>>>>
>>>>> bool mydev_gso_check(struct sk_buff *skb, struct net_device *dev)
>>>>> {
>>>>>         if ((skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) &&
>>>>>             (skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
>>>>>              skb->protocol != htons(ETH_P_TEB) ||
>>>>>              skb_inner_mac_header(skb) - skb_transport_header(skb) != 12))
>>>>>                 return false;
>>>>>
>>>>>         return true;
>>>>> }
>>>>
>>>> Yep, such a helper can basically be made to work and lets the 4-5
>>>> drivers that can do GSO offloading for VXLAN but not for any FOU/GUE
>>>> packets signal that to the stack.
>>>>
>>>> Re the 12 constant, were you referring to the UDP+VXLAN headers? It's 8+8 = 16.
>>>>
>>>> Also, we need a way for drivers that can support VXLAN or NVGRE, but
>>>> not concurrently on the same port @ the same time, to let only VXLAN
>>>> packets pass successfully through the helper.
>>
>>> Or, there should be no difference in GSO processing between VXLAN and
>>> NVGRE. Can you explain why you feel you need to differentiate them for GSO?
>>
>>
>> RX wise, Linux tells the driver that UDP port X would be used for
>> VXLAN, right? And indeed, it's possible for some HW implementations
>> not to support RX offloading (checksum) for both VXLAN and NVGRE @ the
>> same time over the same port. But TX/GRO wise, you're probably
>> correct. The thing is that from the user POV they need a solution that
>> works for both RX and TX offloading.
>
> I think from a user POV we want a solution that supports RX and TX
> offloading across the widest range of protocols. This is accomplished
> by implementing protocol agnostic mechanisms like CHECKSUM_COMPLETE
> and protocol agnostic UDP tunnel TSO like we've described. IMO, the
> fact that we have devices that implement protocol specific mechanisms
> for NVGRE and VXLAN should be considered legacy support in the stack;
> for new UDP encapsulation protocols we should not expose specifics in
> the stack, either by adding a GSO type for each protocol or an
> ndo_add_foo_port for each protocol-- these things will not scale and
> unnecessarily complicate the core stack.

It's not clear to me that allowing devices to know what protocols are
running on what ports actually complicates the stack. The part that is
complicated is usually the types of operations that are being
offloaded (checksum, TSO, etc.). In all of these tunnel cases, the
operations are the same, and if you have a clean registration mechanism
then nothing in the core has to see this - only the protocol doing the
registering and the driver that is supporting it.

I have no disagreement with trying to be generic across protocols. I'm
just not convinced that it is a realistic plan. It's obvious that it
is not doable today, nor will it be in the next generation of NICs
(which are guaranteed to add support for new protocols). Furthermore,
there will be more advanced stuff coming in the future that I think
will be difficult or impossible to make protocol agnostic. Rather than
pretending that this doesn't exist or will never happen, it's better to
focus on how to integrate it cleanly.
Tom Herbert Oct. 7, 2014, 12:17 a.m. UTC | #28
On Mon, Oct 6, 2014 at 3:33 PM, Jesse Gross <jesse@nicira.com> wrote:
> On Mon, Oct 6, 2014 at 10:59 AM, Tom Herbert <therbert@google.com> wrote:
>> On Sun, Oct 5, 2014 at 12:13 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>> On Sun, Oct 5, 2014 at 9:49 PM, Tom Herbert <therbert@google.com> wrote:
>>>> On Sun, Oct 5, 2014 at 7:04 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>> On Thu, Oct 2, 2014 at 2:06 AM, Tom Herbert <therbert@google.com> wrote:
>>>>>> On Wed, Oct 1, 2014 at 1:58 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>>>> On Tue, Sep 30, 2014 at 6:34 PM, Tom Herbert <therbert@google.com> wrote:
>>>>>>>> On Tue, Sep 30, 2014 at 7:30 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>> [...]
>>>>>> Solution #4: apply this patch and implement the check functions as
>>>>>> needed in those 4 or 5 drivers. If a device can only do VXLAN/NVGRE
>>>>>> then I believe the check function is something like:
>>>>>>
>>>>>> bool mydev_gso_check(struct sk_buff *skb, struct net_device *dev)
>>>>>> {
>>>>>>         if ((skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) &&
>>>>>>             (skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
>>>>>>              skb->protocol != htons(ETH_P_TEB) ||
>>>>>>              skb_inner_mac_header(skb) - skb_transport_header(skb) != 12))
>>>>>>                 return false;
>>>>>>
>>>>>>         return true;
>>>>>> }
>>>>>
>>>>> Yep, such a helper can basically be made to work and lets the 4-5
>>>>> drivers that can do GSO offloading for VXLAN but not for any FOU/GUE
>>>>> packets signal that to the stack.
>>>>>
>>>>> Re the 12 constant, were you referring to the UDP+VXLAN headers? It's 8+8 = 16.
>>>>>
>>>>> Also, we need a way for drivers that can support VXLAN or NVGRE, but
>>>>> not concurrently on the same port @ the same time, to let only VXLAN
>>>>> packets pass successfully through the helper.
>>>
>>>> Or, there should be no difference in GSO processing between VXLAN and
>>>> NVGRE. Can you explain why you feel you need to differentiate them for GSO?
>>>
>>>
>>> RX wise, Linux tells the driver that UDP port X would be used for
>>> VXLAN, right? And indeed, it's possible for some HW implementations
>>> not to support RX offloading (checksum) for both VXLAN and NVGRE @ the
>>> same time over the same port. But TX/GRO wise, you're probably
>>> correct. The thing is that from the user POV they need a solution that
>>> works for both RX and TX offloading.
>>
>> I think from a user POV we want a solution that supports RX and TX
>> offloading across the widest range of protocols. This is accomplished
>> by implementing protocol agnostic mechanisms like CHECKSUM_COMPLETE
>> and protocol agnostic UDP tunnel TSO like we've described. IMO, the
>> fact that we have devices that implement protocol specific mechanisms
>> for NVGRE and VXLAN should be considered legacy support in the stack;
>> for new UDP encapsulation protocols we should not expose specifics in
>> the stack, either by adding a GSO type for each protocol or an
>> ndo_add_foo_port for each protocol-- these things will not scale and
>> unnecessarily complicate the core stack.
>
> It's not clear to me that allowing devices to know what protocols are
> running on what ports actually complicates the stack. The part that is
> complicated is usually the types of operations that are being
> offloaded (checksum, TSO, etc.). In all of these tunnel cases, the
> operations are the same, and if you have a clean registration mechanism
> then nothing in the core has to see this - only the protocol doing the
> registering and the driver that is supporting it.
>

We already have an ntuple filtering interface that allows configuring
a device for special processing of RX packets. I don't see why that
shouldn't apply to protocol processing for specific ports in the
encapsulation use case.

> I have no disagreement with trying to be generic across protocols. I'm
> just not convinced that it is a realistic plan. It's obvious that it
> is not doable today, nor will it be in the next generation of NICs
> (which are guaranteed to add support for new protocols). Furthermore,
> there will be more advanced stuff coming in the future that I think
> will be difficult or impossible to make protocol agnostic. Rather than
> pretending that this doesn't exist or will never happen, it's better to
> focus on how to integrate it cleanly.

Sorry, but I don't understand how supporting new protocols in a
device for the purpose of returning CHECKSUM_UNNECESSARY is better or
easier to implement than just returning CHECKSUM_COMPLETE. Same thing
for trying to use NETIF_F_IP_CSUM with encapsulation rather than
NETIF_F_HW_CSUM. I'm not a hardware guy, so it's possible I'm missing
something obvious...
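
A sketch of the two models from the driver side, assuming hypothetical
probe-time code: NETIF_F_HW_CSUM lets the stack request a checksum at
any offset it supplies (encapsulated or not), while NETIF_F_IP_CSUM
only covers plain TCP/UDP over IPv4:

static void mydev_set_features(struct net_device *dev)
{
        /* Protocol-agnostic checksum offload, usable for encapsulated
         * traffic too, instead of the IPv4-only NETIF_F_IP_CSUM.
         */
        dev->features        |= NETIF_F_HW_CSUM | NETIF_F_GSO_UDP_TUNNEL;
        dev->hw_enc_features |= NETIF_F_HW_CSUM | NETIF_F_GSO_UDP_TUNNEL;
}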

Can you be more specific about this "advanced stuff"?

Thanks,
Tom
Or Gerlitz Oct. 7, 2014, 11:07 a.m. UTC | #29
> Yes, I will talk about the FOU and GUE implementation. You should
> see the abstracts in the schedule now.

Thanks. I think it would also be good to cover the challenges we're
discussing in this thread w.r.t. today's upstream NIC HW/drivers.

>> I think a replay of your LKS presentation along with open discussion
>> on how to get there with the legacy requirements could be very
>> helpful.
Tom Herbert Oct. 7, 2014, 6:13 p.m. UTC | #30
David,

I don't think there are any outstanding objections to this patch.
Will you be able to apply it or do you need something more to be done?

Thanks,
Tom


On Sun, Sep 28, 2014 at 8:50 PM, Tom Herbert <therbert@google.com> wrote:
> Add ndo_gso_check which a device can define to indicate whether it
> is capable of doing GSO on a packet. This function would be called from
> the stack to determine whether software GSO needs to be done. A
> driver should populate this function if it advertises GSO types for
> which there are combinations that it wouldn't be able to handle. For
> instance a device that performs UDP tunneling might only implement
> support for transparent Ethernet bridging type of inner packets
> or might have limitations on lengths of inner headers.
>
> Signed-off-by: Tom Herbert <therbert@google.com>
> ---
>  include/linux/netdevice.h | 12 +++++++++++-
>  net/core/dev.c            |  2 +-
>  2 files changed, 12 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index 9f5d293..f8c2027 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -997,6 +997,12 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
>   *     Callback to use for xmit over the accelerated station. This
>   *     is used in place of ndo_start_xmit on accelerated net
>   *     devices.
> + * bool        (*ndo_gso_check) (struct sk_buff *skb,
> + *                       struct net_device *dev);
> + *     Called by core transmit path to determine if device is capable of
> + *     performing GSO on a packet. The device returns true if it is
> + *     able to GSO the packet, false otherwise. If the return value is
> + *     false the stack will do software GSO.
>   */
>  struct net_device_ops {
>         int                     (*ndo_init)(struct net_device *dev);
> @@ -1146,6 +1152,8 @@ struct net_device_ops {
>                                                         struct net_device *dev,
>                                                         void *priv);
>         int                     (*ndo_get_lock_subclass)(struct net_device *dev);
> +       bool                    (*ndo_gso_check) (struct sk_buff *skb,
> +                                                 struct net_device *dev);
>  };
>
>  /**
> @@ -3536,10 +3544,12 @@ static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features)
>                (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST));
>  }
>
> -static inline bool netif_needs_gso(struct sk_buff *skb,
> +static inline bool netif_needs_gso(struct net_device *dev, struct sk_buff *skb,
>                                    netdev_features_t features)
>  {
>         return skb_is_gso(skb) && (!skb_gso_ok(skb, features) ||
> +               (dev->netdev_ops->ndo_gso_check &&
> +                !dev->netdev_ops->ndo_gso_check(skb, dev)) ||
>                 unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
>                          (skb->ip_summed != CHECKSUM_UNNECESSARY)));
>  }
> diff --git a/net/core/dev.c b/net/core/dev.c
> index e2ced01..8c2b9bb 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2680,7 +2680,7 @@ struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
>         if (skb->encapsulation)
>                 features &= dev->hw_enc_features;
>
> -       if (netif_needs_gso(skb, features)) {
> +       if (netif_needs_gso(dev, skb, features)) {
>                 struct sk_buff *segs;
>
>                 segs = skb_gso_segment(skb, features);
> --
> 2.1.0.rc2.206.gedb03e5
>
Jesse Gross Oct. 9, 2014, 12:30 a.m. UTC | #31
On Mon, Oct 6, 2014 at 5:17 PM, Tom Herbert <therbert@google.com> wrote:
> On Mon, Oct 6, 2014 at 3:33 PM, Jesse Gross <jesse@nicira.com> wrote:
>> On Mon, Oct 6, 2014 at 10:59 AM, Tom Herbert <therbert@google.com> wrote:
>>> On Sun, Oct 5, 2014 at 12:13 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>> On Sun, Oct 5, 2014 at 9:49 PM, Tom Herbert <therbert@google.com> wrote:
>>>>> On Sun, Oct 5, 2014 at 7:04 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>>> On Thu, Oct 2, 2014 at 2:06 AM, Tom Herbert <therbert@google.com> wrote:
>>>>>>> On Wed, Oct 1, 2014 at 1:58 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>>>>> On Tue, Sep 30, 2014 at 6:34 PM, Tom Herbert <therbert@google.com> wrote:
>>>>>>>>> On Tue, Sep 30, 2014 at 7:30 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>>> [...]
>>>>>>> Solution #4: apply this patch and implement the check functions as
>>>>>>> needed in those 4 or 5 drivers. If a device can only do VXLAN/NVGRE
>>>>>>> then I believe the check function is something like:
>>>>>>>
>>>>>>> bool mydev_gso_check(struct sk_buff *skb, struct net_device *dev)
>>>>>>> {
>>>>>>>         if ((skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) &&
>>>>>>>             (skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
>>>>>>>              skb->protocol != htons(ETH_P_TEB) ||
>>>>>>>              skb_inner_mac_header(skb) - skb_transport_header(skb) != 12))
>>>>>>>                 return false;
>>>>>>>
>>>>>>>         return true;
>>>>>>> }
>>>>>>
>>>>>> Yep, such a helper can basically be made to work and lets the 4-5
>>>>>> drivers that can do GSO offloading for VXLAN but not for any FOU/GUE
>>>>>> packets signal that to the stack.
>>>>>>
>>>>>> Re the 12 constant, were you referring to the UDP+VXLAN headers? It's 8+8 = 16.
>>>>>>
>>>>>> Also, we need a way for drivers that can support VXLAN or NVGRE, but
>>>>>> not concurrently on the same port @ the same time, to let only VXLAN
>>>>>> packets pass successfully through the helper.
>>>>
>>>>> Or, there should be no difference in GSO processing between VXLAN and
>>>>> NVGRE. Can you explain why you feel you need to differentiate them for GSO?
>>>>
>>>>
>>>> RX wise, Linux tells the driver that UDP port X would be used for
>>>> VXLAN, right? and indeed, it's possible for some HW implementations
>>>> not to support RX offloading (checksum) for both VXLAN and NVGRE @ the
>>>> same time over the same port. But TX/GRO wise, you're probably
>>>> correct. The thing is that from the user POV they need solution that
>>>> works for both RX and TX offloading.
>>>
>>> I think from a user POV we want a solution that supports RX and TX
>>> offloading across the widest range of protocols. This is accomplished
>>> by implementing protocol agnostic mechanisms like CHECKSUM_COMPLETE
>>> and protocol agnostic UDP tunnel TSO like we've described. IMO, the
>>> fact that we have devices that implement protocol specific mechanisms
>>> for NVGRE and VXLAN should be considered legacy support in the stack,
>>> for new UDP encapsulation protocols we should not expose specifics in
>>> the stack in either by adding a GSO type for each protocol, nor
>>> ndo_add_foo_port for each protocol-- these things will not scale and
>>> unnecessarily complicate the core stack.
>>
>> It's not clear to me that allowing devices to know what protocols are
>> running on what ports actually complicates the stack. The part that is
>> complicated is usually the types of operations that are being
>> offloaded (checksum, TSO, etc.). In all of these tunnel cases, the
>> operations are the same, and if you have a clean registration mechanism
>> then nothing in the core has to see this - only the protocol doing the
>> registering and the driver that is supporting it.
>>
>
> We already have an ntuple filtering interface that allows configuring
> a device for special processing of RX packets. I don't see why that
> shouldn't apply to protocol processing for specific ports in the
> encapsulation use case.

You mentioned this before but I guess I don't really understand it. I
suppose it is possible to express the port number and encapsulation as
a filter but it doesn't really seem all that natural and at the end of
the day it won't be mapped to a filter in the NIC. Can you explain it
some more?

>> I have no disagreement with trying to be generic across protocols. I'm
>> just not convinced that it is a realistic plan. It's obvious that it
>> is not doable today, nor will it be in the next generation of NICs
>> (which are guaranteed to add support for new protocols). Furthermore,
>> there will be more advanced stuff coming in the future that I think
>> will be difficult or impossible to make protocol agnostic. Rather than
>> pretending that this doesn't exist or will never happen, it's better to
>> focus on how to integrate it cleanly.
>
> Sorry, but I don't understand how supporting new protocols in a
> device for the purpose of returning CHECKSUM_UNNECESSARY is better or
> easier to implement than just returning CHECKSUM_COMPLETE. Same thing
> for trying to use NETIF_F_IP_CSUM with encapsulation rather than
> NETIF_F_HW_CSUM. I'm not a hardware guy, so it's possible I'm missing
> something obvious...
>
> Can you be more specific about this "advanced stuff"?

I think checksums are really the exception, not the rule. It's great
that they have this nice property of being additive and we should use
that where we can but that doesn't apply to other types of operation
(or even other types of checksums). Encryption or CRC32 carried inside
the tunnel header can't be accelerated without some additional
knowledge of the protocol. I think there were also a few other things
that came up along these lines when we talked about this in an earlier
thread - that's what I mean by "advanced stuff".

For the basic one's complement checksums, I have no objection to
CHECKSUM_COMPLETE. However, the reality is that this is not generally
implemented today and that is unlikely to change for a few years even
in the best case.
Tom Herbert Oct. 9, 2014, 1:46 a.m. UTC | #32
On Wed, Oct 8, 2014 at 5:30 PM, Jesse Gross <jesse@nicira.com> wrote:
> On Mon, Oct 6, 2014 at 5:17 PM, Tom Herbert <therbert@google.com> wrote:
>> On Mon, Oct 6, 2014 at 3:33 PM, Jesse Gross <jesse@nicira.com> wrote:
>>> On Mon, Oct 6, 2014 at 10:59 AM, Tom Herbert <therbert@google.com> wrote:
>>>> On Sun, Oct 5, 2014 at 12:13 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>> On Sun, Oct 5, 2014 at 9:49 PM, Tom Herbert <therbert@google.com> wrote:
>>>>>> On Sun, Oct 5, 2014 at 7:04 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>>>> On Thu, Oct 2, 2014 at 2:06 AM, Tom Herbert <therbert@google.com> wrote:
>>>>>>>> On Wed, Oct 1, 2014 at 1:58 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>>>>>> On Tue, Sep 30, 2014 at 6:34 PM, Tom Herbert <therbert@google.com> wrote:
>>>>>>>>>> On Tue, Sep 30, 2014 at 7:30 AM, Or Gerlitz <gerlitz.or@gmail.com> wrote:
>>>>>>> [...]
>>>>>>>> Solution #4: apply this patch and implement the check functions as
>>>>>>>> needed in those 4 or 5 drivers. If a device can only do VXLAN/NVGRE
>>>>>>>> then I believe the check function is something like:
>>>>>>>>
>>>>>>>> bool mydev_gso_check(struct sk_buff *skb, struct net_device *dev)
>>>>>>>> {
>>>>>>>>         if ((skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) &&
>>>>>>>>             (skb->inner_protocol_type != ENCAP_TYPE_ETHER ||
>>>>>>>>              skb->protocol != htons(ETH_P_TEB) ||
>>>>>>>>              skb_inner_mac_header(skb) - skb_transport_header(skb) != 12))
>>>>>>>>                 return false;
>>>>>>>>
>>>>>>>>         return true;
>>>>>>>> }
>>>>>>>
>>>>>>> Yep, such a helper can basically be made to work and lets the 4-5
>>>>>>> drivers that can do GSO offloading for VXLAN but not for any FOU/GUE
>>>>>>> packets signal that to the stack.
>>>>>>>
>>>>>>> Re the 12 constant, were you referring to the UDP+VXLAN headers? It's 8+8 = 16.
>>>>>>>
>>>>>>> Also, we need a way for drivers that can support VXLAN or NVGRE, but
>>>>>>> not concurrently on the same port @ the same time, to let only VXLAN
>>>>>>> packets pass successfully through the helper.
>>>>>
>>>>>> Or, there should be no difference in GSO processing between VXLAN and
>>>>>> NVGRE. Can you explain why you feel you need to differentiate them for GSO?
>>>>>
>>>>>
>>>>> RX wise, Linux tells the driver that UDP port X would be used for
>>>>> VXLAN, right? And indeed, it's possible for some HW implementations
>>>>> not to support RX offloading (checksum) for both VXLAN and NVGRE @ the
>>>>> same time over the same port. But TX/GRO wise, you're probably
>>>>> correct. The thing is that from the user POV they need a solution that
>>>>> works for both RX and TX offloading.
>>>>
>>>> I think from a user POV we want a solution that supports RX and TX
>>>> offloading across the widest range of protocols. This is accomplished
>>>> by implementing protocol agnostic mechanisms like CHECKSUM_COMPLETE
>>>> and protocol agnostic UDP tunnel TSO like we've described. IMO, the
>>>> fact that we have devices that implement protocol specific mechanisms
>>>> for NVGRE and VXLAN should be considered legacy support in the stack;
>>>> for new UDP encapsulation protocols we should not expose specifics in
>>>> the stack, either by adding a GSO type for each protocol or an
>>>> ndo_add_foo_port for each protocol-- these things will not scale and
>>>> unnecessarily complicate the core stack.
>>>
>>> It's not clear to me that allowing devices to know what protocols are
>>> running on what ports actually complicates the stack. The part that is
>>> complicated is usually the types of operations that are being
>>> offloaded (checksum, TSO, etc.). In all of these tunnel cases, the
>>> operations are the same, and if you have a clean registration mechanism
>>> then nothing in the core has to see this - only the protocol doing the
>>> registering and the driver that is supporting it.
>>>
>>
>> We already have an ntuple filtering interface that allows configuring
>> a device for special processing of RX packets. I don't see why that
>> shouldn't apply to protocol processing for specific ports in the
>> encapsulation use case.
>
> You mentioned this before but I guess I don't really understand it. I
> suppose it is possible to express the port number and encapsulation as
> a filter but it doesn't really seem all that natural and at the end of
> the day it won't be mapped to a filter in the NIC. Can you explain it
> some more?
>
With n-tuple filters you should be able to configure a rule to match
packets (say by port) and assign an "action" which is understood by
the driver (say assume packets are VXLAN and return
CHECKSUM_UNNECESSARY). The interface is to the driver, so how this is
actually instantiated on the device is a private matter.

This is a far more flexible and extensible model than trying to have the
stack do port->protocol registration. We can filter on much more than
just the destination port (this is actually a problem in the current UDP
offloads, which only work when the socket is bound to INADDR_ANY). Also,
this model can be applied to many different scenarios, not just
encapsulation or those protocols that are implemented by the kernel. I
imagine someone will want to do QUIC acceleration/steering in a device
at some point; this works even if the kernel doesn't implement any part
of the protocol (a design point of QUIC ;-) ). It would be interesting
to see how the supercharged programmable protocol parsers Alexei
described might be integrated with RX filtering.
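
To sketch it concretely (illustrative only; field choices beyond the UDP
destination port, and how a driver maps the rule onto its HW, are a
private driver matter as noted above):

#include <linux/ethtool.h>
#include <linux/string.h>

static void example_vxlan_rule(struct ethtool_rx_flow_spec *fs)
{
        memset(fs, 0, sizeof(*fs));
        fs->flow_type = UDP_V4_FLOW;
        /* Match UDP datagrams to the IANA-assigned VXLAN port */
        fs->h_u.udp_ip4_spec.pdst = htons(4789);
        /* ring_cookie normally selects an RX ring; a driver could
         * equally treat the rule as "this port carries VXLAN" for
         * its offload logic.
         */
        fs->ring_cookie = 0;
}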

>>> I have no disagreement with trying to be generic across protocols. I'm
>>> just not convinced that it is a realistic plan. It's obvious that it
>>> is not doable today, nor will it be in the next generation of NICs
>>> (which are guaranteed to add support for new protocols). Furthermore,
>>> there will be more advanced stuff coming in the future that I think
>>> will be difficult or impossible to make protocol agnostic. Rather than
>>> pretending that this doesn't exist or will never happen, it's better to
>>> focus on how to integrate it cleanly.
>>
>> Sorry, but I don't understand how supporting new protocols in a
>> device for the purpose of returning CHECKSUM_UNNECESSARY is better or
>> easier to implement than just returning CHECKSUM_COMPLETE. Same thing
>> for trying to use NETIF_F_IP_CSUM with encapsulation rather than
>> NETIF_F_HW_CSUM. I'm not a hardware guy, so it's possible I'm missing
>> something obvious...
>>
>> Can you be more specific about this "advanced stuff"?
>
> I think checksums are really the exception, not the rule. It's great
> that they have this nice property of being additive and we should use
> that where we can but that doesn't apply to other types of operation
> (or even other types of checksums). Encryption or CRC32 carried inside
> the tunnel header can't be accelerated without some additional
> knowledge of the protocol. I think there were also a few other things
> that came up along these lines when we talked about this in an earlier
> thread - that's what I mean by "advanced stuff".
>
> For the basic one's complement checksums, I have no objection to
> CHECKSUM_COMPLETE. However, the reality is that this is not generally
> implemented today and that is unlikely to change for a few years even
> in the best case.
Or Gerlitz Jan. 15, 2015, 6:18 p.m. UTC | #33
On Mon, Oct 6, 2014 at 11:28 PM, Tom Herbert <therbert@google.com> wrote:
> On Mon, Oct 6, 2014 at 1:22 PM, Or Gerlitz <gerlitz.or@gmail.com> wrote:

>> Are we going to have a session on the encapsulation/offloads design @ LPC?

> Yes, I will talk about the FOU and GUE implementation. You should
> see the abstracts in the schedule now.

Hi Tom, so that LPC discussion eventually didn't end up happening...
I saw [1] you're planning a (hopefully exciting) three-part discussion @
netdev01.

I would strongly vote for this to be on the workshops/tutorials days,
i.e. not limited to 45m or the like... and is the 1st part you plan an
updated replay of your LKS preso [2] which missed LPC?

[1] https://netdev01.org/news/22
[2] http://vger.kernel.org/encapsulation_offloads.pdf

>> I think a replay of your LKS presentation along with open discussion
>> on how to get there with the legacy requirements could be very
>> helpful.
diff mbox

Patch

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9f5d293..f8c2027 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -997,6 +997,12 @@  typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
  *	Callback to use for xmit over the accelerated station. This
  *	is used in place of ndo_start_xmit on accelerated net
  *	devices.
+ * bool	(*ndo_gso_check) (struct sk_buff *skb,
+ *			  struct net_device *dev);
+ *	Called by core transmit path to determine if device is capable of
+ *	performing GSO on a packet. The device returns true if it is
+ *	able to GSO the packet, false otherwise. If the return value is
+ *	false the stack will do software GSO.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1146,6 +1152,8 @@  struct net_device_ops {
 							struct net_device *dev,
 							void *priv);
 	int			(*ndo_get_lock_subclass)(struct net_device *dev);
+	bool			(*ndo_gso_check) (struct sk_buff *skb,
+						  struct net_device *dev);
 };
 
 /**
@@ -3536,10 +3544,12 @@  static inline bool skb_gso_ok(struct sk_buff *skb, netdev_features_t features)
 	       (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST));
 }
 
-static inline bool netif_needs_gso(struct sk_buff *skb,
+static inline bool netif_needs_gso(struct net_device *dev, struct sk_buff *skb,
 				   netdev_features_t features)
 {
 	return skb_is_gso(skb) && (!skb_gso_ok(skb, features) ||
+		(dev->netdev_ops->ndo_gso_check &&
+		 !dev->netdev_ops->ndo_gso_check(skb, dev)) ||
 		unlikely((skb->ip_summed != CHECKSUM_PARTIAL) &&
 			 (skb->ip_summed != CHECKSUM_UNNECESSARY)));
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index e2ced01..8c2b9bb 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2680,7 +2680,7 @@  struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
 	if (skb->encapsulation)
 		features &= dev->hw_enc_features;
 
-	if (netif_needs_gso(skb, features)) {
+	if (netif_needs_gso(dev, skb, features)) {
 		struct sk_buff *segs;
 
 		segs = skb_gso_segment(skb, features);