diff mbox

[RFC,net-next,v3,4/7] net: use skb->csum_algo to identify packets needing crc32c

Message ID 1b776a2f30504c01cbad6a9230840dd0e79ffc0c.1491562390.git.dcaratti@redhat.com
State RFC, archived
Delegated to: David Miller
Headers show

Commit Message

Davide Caratti April 7, 2017, 2:16 p.m. UTC
skb->csum_algo carries the indication on which algorithm is needed to
compute checksum on skb in the transmit path, when skb->ip_summed is
equal to CHECKSUM_PARTIAL. If skb carries a SCTP packet and crc32c
hasn't been yet written in L4 header, skb->csum_algo is assigned to
CRC32C_CHECKSUM. In any other case, skb->csum_algo is set to
INTERNET_CHECKSUM.

Suggested-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 include/linux/skbuff.h                | 28 ++++++++++++++++++++--------
 net/core/dev.c                        |  2 +-
 net/netfilter/ipvs/ip_vs_proto_sctp.c |  2 +-
 net/netfilter/nf_nat_proto_sctp.c     |  2 +-
 net/sched/act_csum.c                  |  2 +-
 net/sctp/offload.c                    |  2 +-
 net/sctp/output.c                     |  3 ++-
 7 files changed, 27 insertions(+), 14 deletions(-)

Comments

Tom Herbert April 7, 2017, 3:43 p.m. UTC | #1
On Fri, Apr 7, 2017 at 7:16 AM, Davide Caratti <dcaratti@redhat.com> wrote:
> skb->csum_algo carries the indication on which algorithm is needed to
> compute checksum on skb in the transmit path, when skb->ip_summed is
> equal to CHECKSUM_PARTIAL. If skb carries a SCTP packet and crc32c
> hasn't been yet written in L4 header, skb->csum_algo is assigned to
> CRC32C_CHECKSUM. In any other case, skb->csum_algo is set to
> INTERNET_CHECKSUM.
>
> Suggested-by: Tom Herbert <tom@herbertland.com>
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>
> ---
>  include/linux/skbuff.h                | 28 ++++++++++++++++++++--------
>  net/core/dev.c                        |  2 +-
>  net/netfilter/ipvs/ip_vs_proto_sctp.c |  2 +-
>  net/netfilter/nf_nat_proto_sctp.c     |  2 +-
>  net/sched/act_csum.c                  |  2 +-
>  net/sctp/offload.c                    |  2 +-
>  net/sctp/output.c                     |  3 ++-
>  7 files changed, 27 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index aaf1072..527be47 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -189,12 +189,13 @@
>   *
>   *   NETIF_F_SCTP_CRC - This feature indicates that a device is capable of
>   *     offloading the SCTP CRC in a packet. To perform this offload the stack
> - *     will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset
> - *     accordingly. Note the there is no indication in the skbuff that the
> - *     CHECKSUM_PARTIAL refers to an SCTP checksum, a driver that supports
> - *     both IP checksum offload and SCTP CRC offload must verify which offload
> - *     is configured for a packet presumably by inspecting packet headers; in
> - *     case, skb_crc32c_csum_help is provided to compute CRC on SCTP packets.
> + *     will set set csum_start and csum_offset accordingly, set ip_summed to
> + *     CHECKSUM_PARTIAL and set csum_algo to CRC32C_CHECKSUM, to provide an
> + *     indication in the skbuff that the CHECKSUM_PARTIAL refers to CRC32c.
> + *     A driver that supports both IP checksum offload and SCTP CRC32c offload
> + *     must verify which offload is configured for a packet by testing the
> + *     value of skb->csum_algo; skb_crc32c_csum_help is provided to resolve
> + *     CHECKSUM_PARTIAL on skbs where csum_algo is CRC32C_CHECKSUM.
>   *
>   *   NETIF_F_FCOE_CRC - This feature indicates that a device is capable of
>   *     offloading the FCOE CRC in a packet. To perform this offload the stack
> @@ -614,6 +615,7 @@ static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
>   *     @wifi_acked_valid: wifi_acked was set
>   *     @wifi_acked: whether frame was acked on wifi or not
>   *     @no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
> + *     @csum_algo: algorithm used to compute checksum
>   *     @dst_pending_confirm: need to confirm neighbour
>    *    @napi_id: id of the NAPI struct this skb came from
>   *     @secmark: security marking
> @@ -742,8 +744,10 @@ struct sk_buff {
>         __u8                    csum_valid:1;
>         __u8                    csum_complete_sw:1;
>         __u8                    csum_level:2;
> -       __u8                    __unused:1; /* one bit hole */
> -
> +       enum {
> +               INTERNET_CHECKSUM = 0,
> +               CRC32C_CHECKSUM,
> +       }                       csum_algo:1;

I am worried this opens the door to a new open ended functionality
that will be rarely used in practice. Checksum offload is pervasive,
CRC offload is still a very narrow use case. Adding yet more
CRC/checksum variants will need more bits. It may be sufficient for
now just to make this a single bit which indicates "ones' checksum" or
indicates "other". In this case of "other" we need some analysis so
determine which checksum it is, this might be something that flow
dissector could support.

>         __u8                    dst_pending_confirm:1;
>  #ifdef CONFIG_IPV6_NDISC_NODETYPE
>         __u8                    ndisc_nodetype:2;
> @@ -3129,6 +3133,14 @@ struct skb_checksum_ops {
>
>  extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly;
>
> +static inline void skb_set_crc32c_ipsummed(struct sk_buff *skb,
> +                                          const u8 ip_summed)
> +{
> +       skb->csum_algo = ip_summed == CHECKSUM_PARTIAL ? CRC32C_CHECKSUM :
> +               INTERNET_CHECKSUM;
> +       skb->ip_summed = ip_summed;

This seems odd to me. skb->csum_algo and skb->ip_summed always end up
having the same value.

> +}
> +
>  __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
>                       __wsum csum, const struct skb_checksum_ops *ops);
>  __wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
> diff --git a/net/core/dev.c b/net/core/dev.c
> index 91ba01a..c6a4281 100644
> --- a/net/core/dev.c
> +++ b/net/core/dev.c
> @@ -2641,7 +2641,7 @@ int skb_crc32c_csum_help(struct sk_buff *skb)
>                         goto out;
>         }
>         *(__le32 *)(skb->data + offset) = crc32c_csum;
> -       skb->ip_summed = CHECKSUM_NONE;
> +       skb_set_crc32c_ipsummed(skb, CHECKSUM_NONE);
>  out:
>         return ret;
>  }
> diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
> index 56f8e4b..8800bf7 100644
> --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
> +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
> @@ -81,7 +81,7 @@ static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph,
>                           unsigned int sctphoff)
>  {
>         sctph->checksum = sctp_compute_cksum(skb, sctphoff);
> -       skb->ip_summed = CHECKSUM_UNNECESSARY;
> +       skb_set_crc32c_ipsummed(skb, CHECKSUM_UNNECESSARY);

The old code is better. CHECKSUM_UNNECESSARY already applies to non IP
checksums. There is nothing special about crc32 in this regard and
skb->csum_algo should only be valid when skb->ip_summed ==
CHECKSUM_PARTIAL so no need to set it here. This point should also be
in documentation.

>  }
>
>  static int
> diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
> index 804e8a0..82a7c4c 100644
> --- a/net/netfilter/nf_nat_proto_sctp.c
> +++ b/net/netfilter/nf_nat_proto_sctp.c
> @@ -60,7 +60,7 @@ sctp_manip_pkt(struct sk_buff *skb,
>
>         if (skb->ip_summed != CHECKSUM_PARTIAL) {
>                 hdr->checksum = sctp_compute_cksum(skb, hdroff);
> -               skb->ip_summed = CHECKSUM_NONE;
> +               skb_set_crc32c_ipsummed(skb, CHECKSUM_NONE);
>         }
>
>         return true;
> diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
> index 6c319a4..6e7e862 100644
> --- a/net/sched/act_csum.c
> +++ b/net/sched/act_csum.c
> @@ -349,7 +349,7 @@ static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl,
>
>         sctph->checksum = sctp_compute_cksum(skb,
>                                              skb_network_offset(skb) + ihl);
> -       skb->ip_summed = CHECKSUM_NONE;
> +       skb_set_crc32c_ipsummed(skb, CHECKSUM_NONE);
>
>         return 1;
>  }
> diff --git a/net/sctp/offload.c b/net/sctp/offload.c
> index 378f462..4b98339 100644
> --- a/net/sctp/offload.c
> +++ b/net/sctp/offload.c
> @@ -34,7 +34,7 @@
>
>  static __le32 sctp_gso_make_checksum(struct sk_buff *skb)
>  {
> -       skb->ip_summed = CHECKSUM_NONE;
> +       skb_set_crc32c_ipsummed(skb, CHECKSUM_NONE);
>         return sctp_compute_cksum(skb, skb_transport_offset(skb));
>  }
>
> diff --git a/net/sctp/output.c b/net/sctp/output.c
> index 1224421..386cbd8 100644
> --- a/net/sctp/output.c
> +++ b/net/sctp/output.c
> @@ -524,10 +524,11 @@ static int sctp_packet_pack(struct sctp_packet *packet,
>                 struct sctphdr *sh =
>                         (struct sctphdr *)skb_transport_header(head);
>
> +               skb_set_crc32c_ipsummed(head, CHECKSUM_NONE);
>                 sh->checksum = sctp_compute_cksum(head, 0);
>         } else {
>  chksum:
> -               head->ip_summed = CHECKSUM_PARTIAL;
> +               skb_set_crc32c_ipsummed(head, CHECKSUM_PARTIAL);
>                 head->csum_start = skb_transport_header(head) - head->head;
>                 head->csum_offset = offsetof(struct sctphdr, checksum);
>         }
> --
> 2.7.4
>
Davide Caratti April 7, 2017, 5:29 p.m. UTC | #2
hello Tom,

On Fri, 2017-04-07 at 08:43 -0700, Tom Herbert wrote:
> On Fri, Apr 7, 2017 at 7:16 AM, Davide Caratti <dcaratti@redhat.com> wrote:
> > @@ -742,8 +744,10 @@ struct sk_buff {
> >         __u8                    csum_valid:1;
> >         __u8                    csum_complete_sw:1;
> >         __u8                    csum_level:2;
> > -       __u8                    __unused:1; /* one bit hole */
> > -
> > +       enum {
> > +               INTERNET_CHECKSUM = 0,
> > +               CRC32C_CHECKSUM,
> > +       }                       csum_algo:1;
> 
> I am worried this opens the door to a new open ended functionality
> that will be rarely used in practice. Checksum offload is pervasive,
> CRC offload is still a very narrow use case.

thank you for the prompt response. I thought there was a silent
agreement on that - Alexander proposed usage of an enum bitfield to be
ready for FCoE (and I'm not against it, unless I have to find a second
free bit in struct sk_buff :-) ). But maybe I'm misunderstanding your
concern: is it the  name of the variable, (csum_algo instead of
crc32c_csum), or the usage of enum bitfield (or both?) ?

On Fri, 2017-04-07 at 08:43 -0700, Tom Herbert wrote:
> Adding yet more
> CRC/checksum variants will need more bits. It may be sufficient for
> now just to make this a single bit which indicates "ones' checksum" or
> indicates "other". In this case of "other" we need some analysis so
> determine which checksum it is, this might be something that flow
> dissector could support.

... which is my intent: by the way, from my perspective, we don't need more than 1 bit
to extend the functionality. While reviewing my code, I was also considering
extending the witdth of skb->ip_summed from 2 to 3 bit, so that it was possible
to

#define

CRC32C_PARTIAL <- for SCTP
CRC_PARTIAL <- for FCoE
CHECKSUM_PARTIAL <- for everything else

It's conceptually the same thing, and the free bit is used more
efficiently. But then I would need to check all places where
CHECKSUM_PARTIAL is used in assignments and test: so, I told myself it's
not worth doing it until somebody requests to extend this functionality to
FCoE.

> > @@ -3129,6 +3133,14 @@ struct skb_checksum_ops {
> > 
> >  extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly;
> > 
> > +static inline void skb_set_crc32c_ipsummed(struct sk_buff *skb,
> > +                                          const u8 ip_summed)
> > +{
> > +       skb->csum_algo = ip_summed == CHECKSUM_PARTIAL ? CRC32C_CHECKSUM :
> > +               INTERNET_CHECKSUM;
> > +       skb->ip_summed = ip_summed;
> 
> This seems odd to me. skb->csum_algo and skb->ip_summed always end up
> having the same value.

this is accidentally true for CHEKSUM_NONE and CHECKSUM_PARTIAL, and only
if skb carries a SCTP packet. This was my intent:

ip_summed  (2 bit)                     | csum_algo     (1 bit)
---------------------------------------+-------------------
CHEKSUM_NONE = 0                       | INTERNET_CHECKSUM = 0
CHECKSUM_PARTIAL = 1                   | CRC32C_CHECKSUM = 1
CHECKSUM_COMPLETE = 2 (not applicable) | INTERNET_CHECKSUM = 0 (don't care)
CHECKSUM_UNNECESSARY = 3               | INTERNET_CHECKSUM = 0

I can do this in a more explicit way, changing the prototype to

static inline void skb_set_crc32c_ipsummed(struct sk_buff *skb,
                                           const u8 ip_summed,
                                           const u8 csum_algo)

(with the advantage of saving a test on the value of ip_summed).
Find in the comment below the reason why I'm clearing csum_algo every time
the SCTP CRC32c is computed.

> > --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
> > +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
> > @@ -81,7 +81,7 @@ static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph,
> >                           unsigned int sctphoff)
> >  {
> >         sctph->checksum = sctp_compute_cksum(skb, sctphoff);
> > -       skb->ip_summed = CHECKSUM_UNNECESSARY;
> > +       skb_set_crc32c_ipsummed(skb, CHECKSUM_UNNECESSARY);
> 
> The old code is better. CHECKSUM_UNNECESSARY already applies to non IP
> checksums. There is nothing special about crc32 in this regard and
> skb->csum_algo should only be valid when skb->ip_summed ==
> CHECKSUM_PARTIAL so no need to set it here. This point should also be
> in documentation.

In my understanding, csum_algo needs to be set to INTERNET_CHECKSUM after the
CRC32c is computed. Otherwise, after subsequent operation on the skb (e.g. it
is encapsulated in a UDP frame), there is the possibility for skb->ip_summed
to become CHECKSUM_PARTIAL again. So, to ensure that skb_checksum_help() and
not skb_crc32c_help() will be called, csum_algo must be 0.

To minimize the impact of the patch, I substituted all assignments of skb->ip_summed,
done by SCTP-related code, with calls to skb_set_crc32c_ipsummed(). The alternative is
to explicitly set csum_algo to 0 (INTERNET_CHECKSUM) in SCTP-related code. Do you agree?

thank you in advance,
regards
--
davide
Tom Herbert April 7, 2017, 6:11 p.m. UTC | #3
On Fri, Apr 7, 2017 at 10:29 AM, Davide Caratti <dcaratti@redhat.com> wrote:
> hello Tom,
>
> On Fri, 2017-04-07 at 08:43 -0700, Tom Herbert wrote:
>> On Fri, Apr 7, 2017 at 7:16 AM, Davide Caratti <dcaratti@redhat.com> wrote:
>> > @@ -742,8 +744,10 @@ struct sk_buff {
>> >         __u8                    csum_valid:1;
>> >         __u8                    csum_complete_sw:1;
>> >         __u8                    csum_level:2;
>> > -       __u8                    __unused:1; /* one bit hole */
>> > -
>> > +       enum {
>> > +               INTERNET_CHECKSUM = 0,
>> > +               CRC32C_CHECKSUM,
>> > +       }                       csum_algo:1;
>>
>> I am worried this opens the door to a new open ended functionality
>> that will be rarely used in practice. Checksum offload is pervasive,
>> CRC offload is still a very narrow use case.
>
> thank you for the prompt response. I thought there was a silent
> agreement on that - Alexander proposed usage of an enum bitfield to be
> ready for FCoE (and I'm not against it, unless I have to find a second
> free bit in struct sk_buff :-) ). But maybe I'm misunderstanding your
> concern: is it the  name of the variable, (csum_algo instead of
> crc32c_csum), or the usage of enum bitfield (or both?) ?
>
> On Fri, 2017-04-07 at 08:43 -0700, Tom Herbert wrote:
>> Adding yet more
>> CRC/checksum variants will need more bits. It may be sufficient for
>> now just to make this a single bit which indicates "ones' checksum" or
>> indicates "other". In this case of "other" we need some analysis so
>> determine which checksum it is, this might be something that flow
>> dissector could support.
>
> ... which is my intent: by the way, from my perspective, we don't need more than 1 bit
> to extend the functionality. While reviewing my code, I was also considering
> extending the witdth of skb->ip_summed from 2 to 3 bit, so that it was possible
> to
>
Maybe just call it csum_not_ip then. Then just do "if
(unlikely(skb->csum_not_ip)) ..."

> #define
>
> CRC32C_PARTIAL <- for SCTP
> CRC_PARTIAL <- for FCoE
> CHECKSUM_PARTIAL <- for everything else
>
> It's conceptually the same thing, and the free bit is used more
> efficiently. But then I would need to check all places where
> CHECKSUM_PARTIAL is used in assignments and test: so, I told myself it's
> not worth doing it until somebody requests to extend this functionality to
> FCoE.
>
I've thought about extending ip_summed before with something like
csum_invalid. I think it opens up a can of worms since ip_summed is
being used in so many places already and the semantics of each value
have to be extremely well defined for the whole system (this is one
place where we can't tolerate any ambiguity at all and it everything
needs to be clearly documented).

>> > @@ -3129,6 +3133,14 @@ struct skb_checksum_ops {
>> >
>> >  extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly;
>> >
>> > +static inline void skb_set_crc32c_ipsummed(struct sk_buff *skb,
>> > +                                          const u8 ip_summed)
>> > +{
>> > +       skb->csum_algo = ip_summed == CHECKSUM_PARTIAL ? CRC32C_CHECKSUM :
>> > +               INTERNET_CHECKSUM;
>> > +       skb->ip_summed = ip_summed;
>>
>> This seems odd to me. skb->csum_algo and skb->ip_summed always end up
>> having the same value.
>
> this is accidentally true for CHEKSUM_NONE and CHECKSUM_PARTIAL, and only
> if skb carries a SCTP packet. This was my intent:
>
> ip_summed  (2 bit)                     | csum_algo     (1 bit)
> ---------------------------------------+-------------------
> CHEKSUM_NONE = 0                       | INTERNET_CHECKSUM = 0
> CHECKSUM_PARTIAL = 1                   | CRC32C_CHECKSUM = 1
> CHECKSUM_COMPLETE = 2 (not applicable) | INTERNET_CHECKSUM = 0 (don't care)
> CHECKSUM_UNNECESSARY = 3               | INTERNET_CHECKSUM = 0
>
> I can do this in a more explicit way, changing the prototype to
>
> static inline void skb_set_crc32c_ipsummed(struct sk_buff *skb,
>                                            const u8 ip_summed,
>                                            const u8 csum_algo)
>
> (with the advantage of saving a test on the value of ip_summed).
> Find in the comment below the reason why I'm clearing csum_algo every time
> the SCTP CRC32c is computed.
>
>> > --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
>> > +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
>> > @@ -81,7 +81,7 @@ static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph,
>> >                           unsigned int sctphoff)
>> >  {
>> >         sctph->checksum = sctp_compute_cksum(skb, sctphoff);
>> > -       skb->ip_summed = CHECKSUM_UNNECESSARY;
>> > +       skb_set_crc32c_ipsummed(skb, CHECKSUM_UNNECESSARY);
>>
>> The old code is better. CHECKSUM_UNNECESSARY already applies to non IP
>> checksums. There is nothing special about crc32 in this regard and
>> skb->csum_algo should only be valid when skb->ip_summed ==
>> CHECKSUM_PARTIAL so no need to set it here. This point should also be
>> in documentation.
>
> In my understanding, csum_algo needs to be set to INTERNET_CHECKSUM after the
> CRC32c is computed. Otherwise, after subsequent operation on the skb (e.g. it
> is encapsulated in a UDP frame), there is the possibility for skb->ip_summed
> to become CHECKSUM_PARTIAL again. So, to ensure that skb_checksum_help() and
> not skb_crc32c_help() will be called, csum_algo must be 0.
>
ip_summed should no longer be CHECKSUM_PARTIAL with CRC32c is computed.

> To minimize the impact of the patch, I substituted all assignments of skb->ip_summed,
> done by SCTP-related code, with calls to skb_set_crc32c_ipsummed(). The alternative is
> to explicitly set csum_algo to 0 (INTERNET_CHECKSUM) in SCTP-related code. Do you agree?
>
No, like I said the only case where this new bit is relevant is when
CHECKSUM_PARTIAL for a CRC is being done. When it's set for offloading
sctp crc it must be set. When CRC is resolved, in the helper for
instance, it must be cleared. If these rules are properly followed
then the bit will be zero in all other cases without needing any
additional work or conditionals.

Tom

> thank you in advance,
> regards
> --
> davide
>
Davide Caratti April 13, 2017, 10:36 a.m. UTC | #4
thank you,

On Fri, 2017-04-07 at 08:43 -0700, Tom Herbert wrote:
> Maybe just call it csum_not_ip then. Then just do "if
> (unlikely(skb->csum_not_ip)) ..."

OK, I will rename the bit, avoid the enum and use the 'unlikely'. Up to now,
this series uses the bit for SCTP only and leaves unmodified behavior of
offloaded FCoE frames: please let me know if you disagree on that.

On Fri, 2017-04-07 at 08:43 -0700, Tom Herbert wrote:
> On Fri, Apr 7, 2017 at 10:29 AM, Davide Caratti <dcaratti@redhat.com> wrote:
> > In my understanding, csum_algo needs to be set to INTERNET_CHECKSUM after the
> > CRC32c is computed. Otherwise, after subsequent operation on the skb (e.g. it
> > is encapsulated in a UDP frame), there is the possibility for skb->ip_summed
> > to become CHECKSUM_PARTIAL again. So, to ensure that skb_checksum_help() and
> > not skb_crc32c_help() will be called, csum_algo must be 0.

> ip_summed should no longer be CHECKSUM_PARTIAL with CRC32c is computed.

Even though it's uncommon, skb->ip_summed can become CHECKSUM_PARTIAL again
after the CRC32c is computed and CHECKSUM_NONE is set: for example, when a
veth and a vxlan with UDP checksums are enslaved to the same bridge, and the
NIC below vxlan has no checksumming capabilities. Here, validate_xmit_skb is
called three times on the same skb (see perf output at the bottom): 

* before transmission on the veth: here ip_summed is CHECKSUM_PARTIAL, but
the device supports CRC32c offload so the skb is (correctly) untouched.

* before vxlan encapsulation: here ip_summed is CHECKSUM_PARTIAL,
skb->csum_not_inet is 1 and NETIF_F_SCTP_CRC is not set. Here,
skb_csum_hwoffload_help() correctly fills the CRC32c and assigns ip_summed
to CHECKSUM_NONE.

* before transmission on the NIC: ip_summed is CHECKSUM_PARTIAL again (because
udp_set_csum changed csum_start and csum_offset to point to the tunnel
UDP header). No bit in NETIF_F_HW_CSUM is set: if skb->csum_not_inet is still 1,
the helper (wrongly) computes CRC32c again, thus corrupting the outer UDP
transport header. On the contrary, if skb->csum_not_inet is 0, skb_checksum_help()
correctly resolves CHECKSUM_PARTIAL.

To avoid this problem, skb->csum_not_inet must be assigned to 0 every time
the CHECKSUM_PARTIAL is resolved on skb carrying SCTP packets.

> > To minimize the impact of the patch, I substituted all assignments of skb->ip_summed,
> > done by SCTP-related code, with calls to skb_set_crc32c_ipsummed(). The alternative is
> > to explicitly set csum_algo to 0 (INTERNET_CHECKSUM) in SCTP-related code. Do you agree?

> No, like I said the only case where this new bit is relevant is when
> CHECKSUM_PARTIAL for a CRC is being done. When it's set for offloading
> sctp crc it must be set. When CRC is resolved, in the helper for
> instance, it must be cleared. If these rules are properly followed
> then the bit will be zero in all other cases without needing any
> additional work or conditionals.

At a minimum, this csum_not_inet bit needs to be cleared in three places:
1) in skb_crc32c_csum_help, to fix scenarios like veth->bridge->vxlan->NIC above.
2) in sctp_gso_make_checksum, a SCTP GSO packet is segmented and CRC32c is written
on each segment. skb->ip_summed transitions from CHECKSUM_PARTIAL to CHECKSUM_NONE.
3) in act_csum, because TC action mangling the packet are called before 
validate_xmit_skb().

It is not necessary to do it in netfilter NAT (even it is harmless), because
SCTP packets having CHECKSUM_PARTIAL are not resolved (since commit 3189a290f98d
"netfilter: nat: skip checksum on offload SCTP packets"). And it should be not
needed in IPVS code, because ip_summed is set to CHECKSUM_UNNECESSARY, so skb
is not going to be checksummed anymore.

thank you in advance for the feedback!
regards,
--
davide 


scenario:

vethA --> vethB --> bridge --> vxlan encaps --> NIC

# ./perf probe -k vmlinux  --add \
 "skb_csum_hwoffload_help csum_offset=skb->csum_offset ip_summed=skb->ip_summed:b2@7/32 skb=skb"
# ./perf record -e probe:skb_csum_hwoffload_help -aR -- ./scenario-vxlan.sh 

# ./perf script
<....>
ncat  7577 [000] 22056.548907: probe:skb_csum_hwoffload_help: (ffffffff8162a6f0) csum_offset=8 ip_summed=1 skb=0xffff880106f6bd00
ncat  7577 [000] 22056.548915: probe:skb_csum_hwoffload_help: (ffffffff8162a6f0) csum_offset=8 ip_summed=1 skb=0xffff880106f6bd00
ncat  7577 [000] 22056.548917: probe:skb_csum_hwoffload_help: (ffffffff8162a6f0) csum_offset=6 ip_summed=1 skb=0xffff880106f6bd00
<....>
Davide Caratti April 20, 2017, 1:38 p.m. UTC | #5
hello Tom,

On Fri, 2017-04-07 at 11:11 -0700, Tom Herbert wrote:
> maybe just call it csum_not_ip then. Then just do "if
> (unlikely(skb->csum_not_ip)) ..."

Ok, done. V4 uses this bit for SCTP only and leaves unmodified behavior
when offloaded FCoE frames are processed. Further work is still possible
to extend this fix for FCoE, if needed, either by using additional sk_buff
bits, or using skb->csum_not_ip and use other data (e.g. skb->csum_offset)
to distinguish SCTP from FCoE.

> the only case where this new bit is relevant is when
> CHECKSUM_PARTIAL for a CRC is being done. When it's set for offloading
> sctp crc it must be set. When CRC is resolved, in the helper for
> instance, it must be cleared.

in V4 the bit is set when SCTP packets with offloaded checksum are
generated; the bit is cleared when CRC32c is resolved for such packets
(i.e. skb->ip_summed transitions from CHECKSUM_PARTIAL to CHECKSUM_NONE).

Any feedbacks are appreciated!
thank you in advance,
--
davide


Davide Caratti (7):
  skbuff: add stub to help computing crc32c on SCTP packets
  net: introduce skb_crc32c_csum_help
  sk_buff: remove support for csum_bad in sk_buff
  net: use skb->csum_not_inet to identify packets needing crc32c
  net: more accurate checksumming in validate_xmit_skb()
  openvswitch: more accurate checksumming in queue_userspace_packet()
  sk_buff.h: improve description of CHECKSUM_{COMPLETE,UNNECESSARY}

 Documentation/networking/checksum-offloads.txt   | 11 +++--
 drivers/net/ethernet/aquantia/atlantic/aq_ring.c |  2 +-
 include/linux/netdevice.h                        |  8 +--
 include/linux/skbuff.h                           | 58 +++++++++-------------
 net/bridge/netfilter/nft_reject_bridge.c         |  5 +-
 net/core/dev.c                                   | 63 +++++++++++++++++++++---
 net/core/skbuff.c                                | 24 +++++++++
 net/ipv4/netfilter/nf_reject_ipv4.c              |  2 +-
 net/ipv6/netfilter/nf_reject_ipv6.c              |  3 --
 net/openvswitch/datapath.c                       |  2 +-
 net/sched/act_csum.c                             |  1 +
 net/sctp/offload.c                               |  8 +++
 net/sctp/output.c                                |  1 +
 13 files changed, 128 insertions(+), 60 deletions(-)
Marcelo Ricardo Leitner April 27, 2017, 12:41 p.m. UTC | #6
On Thu, Apr 20, 2017 at 03:38:06PM +0200, Davide Caratti wrote:
> hello Tom,
> 
> On Fri, 2017-04-07 at 11:11 -0700, Tom Herbert wrote:
> > maybe just call it csum_not_ip then. Then just do "if
> > (unlikely(skb->csum_not_ip)) ..."
> 
> Ok, done. V4 uses this bit for SCTP only and leaves unmodified behavior
> when offloaded FCoE frames are processed. Further work is still possible
> to extend this fix for FCoE, if needed, either by using additional sk_buff
> bits, or using skb->csum_not_ip and use other data (e.g. skb->csum_offset)
> to distinguish SCTP from FCoE.
> 
> > the only case where this new bit is relevant is when
> > CHECKSUM_PARTIAL for a CRC is being done. When it's set for offloading
> > sctp crc it must be set. When CRC is resolved, in the helper for
> > instance, it must be cleared.
> 
> in V4 the bit is set when SCTP packets with offloaded checksum are
> generated; the bit is cleared when CRC32c is resolved for such packets
> (i.e. skb->ip_summed transitions from CHECKSUM_PARTIAL to CHECKSUM_NONE).
> 
> Any feedbacks are appreciated!
> thank you in advance,
> --
> davide
> 
> 
> Davide Caratti (7):
>   skbuff: add stub to help computing crc32c on SCTP packets
>   net: introduce skb_crc32c_csum_help
>   sk_buff: remove support for csum_bad in sk_buff
>   net: use skb->csum_not_inet to identify packets needing crc32c
>   net: more accurate checksumming in validate_xmit_skb()
>   openvswitch: more accurate checksumming in queue_userspace_packet()
>   sk_buff.h: improve description of CHECKSUM_{COMPLETE,UNNECESSARY}

Other than the comments I did on patch 2, this series LGTM.


> 
>  Documentation/networking/checksum-offloads.txt   | 11 +++--
>  drivers/net/ethernet/aquantia/atlantic/aq_ring.c |  2 +-
>  include/linux/netdevice.h                        |  8 +--
>  include/linux/skbuff.h                           | 58 +++++++++-------------
>  net/bridge/netfilter/nft_reject_bridge.c         |  5 +-
>  net/core/dev.c                                   | 63 +++++++++++++++++++++---
>  net/core/skbuff.c                                | 24 +++++++++
>  net/ipv4/netfilter/nf_reject_ipv4.c              |  2 +-
>  net/ipv6/netfilter/nf_reject_ipv6.c              |  3 --
>  net/openvswitch/datapath.c                       |  2 +-
>  net/sched/act_csum.c                             |  1 +
>  net/sctp/offload.c                               |  8 +++
>  net/sctp/output.c                                |  1 +
>  13 files changed, 128 insertions(+), 60 deletions(-)
> 
> -- 
> 2.7.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
diff mbox

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index aaf1072..527be47 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -189,12 +189,13 @@ 
  *
  *   NETIF_F_SCTP_CRC - This feature indicates that a device is capable of
  *     offloading the SCTP CRC in a packet. To perform this offload the stack
- *     will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset
- *     accordingly. Note the there is no indication in the skbuff that the
- *     CHECKSUM_PARTIAL refers to an SCTP checksum, a driver that supports
- *     both IP checksum offload and SCTP CRC offload must verify which offload
- *     is configured for a packet presumably by inspecting packet headers; in
- *     case, skb_crc32c_csum_help is provided to compute CRC on SCTP packets.
+ *     will set set csum_start and csum_offset accordingly, set ip_summed to
+ *     CHECKSUM_PARTIAL and set csum_algo to CRC32C_CHECKSUM, to provide an
+ *     indication in the skbuff that the CHECKSUM_PARTIAL refers to CRC32c.
+ *     A driver that supports both IP checksum offload and SCTP CRC32c offload
+ *     must verify which offload is configured for a packet by testing the
+ *     value of skb->csum_algo; skb_crc32c_csum_help is provided to resolve
+ *     CHECKSUM_PARTIAL on skbs where csum_algo is CRC32C_CHECKSUM.
  *
  *   NETIF_F_FCOE_CRC - This feature indicates that a device is capable of
  *     offloading the FCOE CRC in a packet. To perform this offload the stack
@@ -614,6 +615,7 @@  static inline bool skb_mstamp_after(const struct skb_mstamp *t1,
  *	@wifi_acked_valid: wifi_acked was set
  *	@wifi_acked: whether frame was acked on wifi or not
  *	@no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
+ *	@csum_algo: algorithm used to compute checksum
  *	@dst_pending_confirm: need to confirm neighbour
   *	@napi_id: id of the NAPI struct this skb came from
  *	@secmark: security marking
@@ -742,8 +744,10 @@  struct sk_buff {
 	__u8			csum_valid:1;
 	__u8			csum_complete_sw:1;
 	__u8			csum_level:2;
-	__u8			__unused:1; /* one bit hole */
-
+	enum {
+		INTERNET_CHECKSUM = 0,
+		CRC32C_CHECKSUM,
+	}			csum_algo:1;
 	__u8			dst_pending_confirm:1;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
 	__u8			ndisc_nodetype:2;
@@ -3129,6 +3133,14 @@  struct skb_checksum_ops {
 
 extern const struct skb_checksum_ops *crc32c_csum_stub __read_mostly;
 
+static inline void skb_set_crc32c_ipsummed(struct sk_buff *skb,
+					   const u8 ip_summed)
+{
+	skb->csum_algo = ip_summed == CHECKSUM_PARTIAL ? CRC32C_CHECKSUM :
+		INTERNET_CHECKSUM;
+	skb->ip_summed = ip_summed;
+}
+
 __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
 		      __wsum csum, const struct skb_checksum_ops *ops);
 __wsum skb_checksum(const struct sk_buff *skb, int offset, int len,
diff --git a/net/core/dev.c b/net/core/dev.c
index 91ba01a..c6a4281 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2641,7 +2641,7 @@  int skb_crc32c_csum_help(struct sk_buff *skb)
 			goto out;
 	}
 	*(__le32 *)(skb->data + offset) = crc32c_csum;
-	skb->ip_summed = CHECKSUM_NONE;
+	skb_set_crc32c_ipsummed(skb, CHECKSUM_NONE);
 out:
 	return ret;
 }
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 56f8e4b..8800bf7 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -81,7 +81,7 @@  static void sctp_nat_csum(struct sk_buff *skb, sctp_sctphdr_t *sctph,
 			  unsigned int sctphoff)
 {
 	sctph->checksum = sctp_compute_cksum(skb, sctphoff);
-	skb->ip_summed = CHECKSUM_UNNECESSARY;
+	skb_set_crc32c_ipsummed(skb, CHECKSUM_UNNECESSARY);
 }
 
 static int
diff --git a/net/netfilter/nf_nat_proto_sctp.c b/net/netfilter/nf_nat_proto_sctp.c
index 804e8a0..82a7c4c 100644
--- a/net/netfilter/nf_nat_proto_sctp.c
+++ b/net/netfilter/nf_nat_proto_sctp.c
@@ -60,7 +60,7 @@  sctp_manip_pkt(struct sk_buff *skb,
 
 	if (skb->ip_summed != CHECKSUM_PARTIAL) {
 		hdr->checksum = sctp_compute_cksum(skb, hdroff);
-		skb->ip_summed = CHECKSUM_NONE;
+		skb_set_crc32c_ipsummed(skb, CHECKSUM_NONE);
 	}
 
 	return true;
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 6c319a4..6e7e862 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -349,7 +349,7 @@  static int tcf_csum_sctp(struct sk_buff *skb, unsigned int ihl,
 
 	sctph->checksum = sctp_compute_cksum(skb,
 					     skb_network_offset(skb) + ihl);
-	skb->ip_summed = CHECKSUM_NONE;
+	skb_set_crc32c_ipsummed(skb, CHECKSUM_NONE);
 
 	return 1;
 }
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index 378f462..4b98339 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -34,7 +34,7 @@ 
 
 static __le32 sctp_gso_make_checksum(struct sk_buff *skb)
 {
-	skb->ip_summed = CHECKSUM_NONE;
+	skb_set_crc32c_ipsummed(skb, CHECKSUM_NONE);
 	return sctp_compute_cksum(skb, skb_transport_offset(skb));
 }
 
diff --git a/net/sctp/output.c b/net/sctp/output.c
index 1224421..386cbd8 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -524,10 +524,11 @@  static int sctp_packet_pack(struct sctp_packet *packet,
 		struct sctphdr *sh =
 			(struct sctphdr *)skb_transport_header(head);
 
+		skb_set_crc32c_ipsummed(head, CHECKSUM_NONE);
 		sh->checksum = sctp_compute_cksum(head, 0);
 	} else {
 chksum:
-		head->ip_summed = CHECKSUM_PARTIAL;
+		skb_set_crc32c_ipsummed(head, CHECKSUM_PARTIAL);
 		head->csum_start = skb_transport_header(head) - head->head;
 		head->csum_offset = offsetof(struct sctphdr, checksum);
 	}