diff mbox series

udp:nat:vxlan tx after nat should recsum if vxlan tx offload on

Message ID 20230401023029.967357-1-chenwei.0515@bytedance.com
State Changes Requested
Delegated to: Pablo Neira
Headers show
Series udp:nat:vxlan tx after nat should recsum if vxlan tx offload on | expand

Commit Message

Fei Cheng April 1, 2023, 2:30 a.m. UTC
From: "chenwei.0515" <chenwei.0515@bytedance.com>

    If vxlan-dev enable tx csum offload, there are two case of CHECKSUM_PARTIAL,
    but udp->check donot have the both meanings.

    1. vxlan-dev disable tx csum offload, udp->check is just pseudo hdr.
    2. vxlan-dev enable tx csum offload, udp->check is pseudo hdr and
       csum from outter l4 to innner l4.

    Unfortunately if there is a nat process after vxlan tx,udp_manip_pkt just use
    CSUM_PARTIAL to re csum PKT, which is just right on vxlan tx csum disable offload.

    This patch use skb->csum_local flag to identify two case, which will csum lco_csum if valid.

Signed-off-by: chenwei.0515 <chenwei.0515@bytedance.com>
---
 include/linux/skbuff.h       | 1 +
 net/ipv4/udp.c               | 1 +
 net/netfilter/nf_nat_proto.c | 9 +++++++++
 3 files changed, 11 insertions(+)

Comments

Willem de Bruijn April 2, 2023, 6:18 p.m. UTC | #1
On Fri, Mar 31, 2023 at 10:31 PM Fei Cheng <chenwei.0515@bytedance.com> wrote:
>
> From: "chenwei.0515" <chenwei.0515@bytedance.com>
>
>     If vxlan-dev enable tx csum offload, there are two case of CHECKSUM_PARTIAL,
>     but udp->check donot have the both meanings.
>
>     1. vxlan-dev disable tx csum offload, udp->check is just pseudo hdr.
>     2. vxlan-dev enable tx csum offload, udp->check is pseudo hdr and
>        csum from outter l4 to innner l4.
>
>     Unfortunately if there is a nat process after vxlan tx,udp_manip_pkt just use
>     CSUM_PARTIAL to re csum PKT, which is just right on vxlan tx csum disable offload.

The issue is that for encapsulated traffic with local checksum offload,
netfilter incorrectly recomputes the outer UDP checksum as if it is an
unencapsulated CHECKSUM_PARTIAL packet, correct?

So the underlying issue is that the two types of packets are
indistinguishable after udp_set_csum:

        } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
                uh->check = 0;
                uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
                if (uh->check == 0)
                        uh->check = CSUM_MANGLED_0;
        } else {
                skb->ip_summed = CHECKSUM_PARTIAL;
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct udphdr, check);
                uh->check = ~udp_v4_check(len, saddr, daddr, 0);
        }

Clearly their ip_summed will be the same.

>
>     This patch use skb->csum_local flag to identify two case, which will csum lco_csum if valid.
>
> Signed-off-by: chenwei.0515 <chenwei.0515@bytedance.com>
> ---
>  include/linux/skbuff.h       | 1 +
>  net/ipv4/udp.c               | 1 +
>  net/netfilter/nf_nat_proto.c | 9 +++++++++
>  3 files changed, 11 insertions(+)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index ff7ad331fb82..62996d8d0b4d 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -990,6 +990,7 @@ struct sk_buff {
>         __u8                    slow_gro:1;
>         __u8                    csum_not_inet:1;
>         __u8                    scm_io_uring:1;
> +       __u8                    csum_local:1;

sk_buff are in space constrained.

I hope we can disambiguate the packets somehow without having
to resort to a new flag.

>
>  #ifdef CONFIG_NET_SCHED
>         __u16                   tc_index;       /* traffic control index */
> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> index c605d171eb2d..86bad0bbb76e 100644
> --- a/net/ipv4/udp.c
> +++ b/net/ipv4/udp.c
> @@ -889,6 +889,7 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
>                 uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
>                 if (uh->check == 0)
>                         uh->check = CSUM_MANGLED_0;
> +               skb->csum_local = 1;
>         } else {
>                 skb->ip_summed = CHECKSUM_PARTIAL;
>                 skb->csum_start = skb_transport_header(skb) - skb->head;
> diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
> index 48cc60084d28..a0261fe2d932 100644
> --- a/net/netfilter/nf_nat_proto.c
> +++ b/net/netfilter/nf_nat_proto.c
> @@ -25,6 +25,7 @@
>  #include <net/ip6_route.h>
>  #include <net/xfrm.h>
>  #include <net/ipv6.h>
> +#include <net/udp.h>
>
>  #include <net/netfilter/nf_conntrack_core.h>
>  #include <net/netfilter/nf_conntrack.h>
> @@ -75,6 +76,14 @@ static bool udp_manip_pkt(struct sk_buff *skb,
>         hdr = (struct udphdr *)(skb->data + hdroff);
>         __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, !!hdr->check);
>
> +       if (skb->csum_local) {
> +               hdr->check = 0;
> +               hdr->check = udp_v4_check(htons(hdr->len), tuple->src.u3.ip, tuple->dst.u3.ip,
> +                                         lco_csum(skb));
> +               if (hdr->check == 0)
> +                       hdr->check = CSUM_MANGLED_0;
> +       }
> +
>         return true;
>  }
>
> --
> 2.11.0
>
kernel test robot April 3, 2023, 9:12 a.m. UTC | #2
Hi Fei,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on linus/master]
[also build test WARNING on v6.3-rc5]
[cannot apply to next-20230331]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Fei-Cheng/udp-nat-vxlan-tx-after-nat-should-recsum-if-vxlan-tx-offload-on/20230401-103127
patch link:    https://lore.kernel.org/r/20230401023029.967357-1-chenwei.0515%40bytedance.com
patch subject: [PATCH]     udp:nat:vxlan tx after nat should recsum if vxlan tx offload on
config: microblaze-randconfig-s052-20230402 (https://download.01.org/0day-ci/archive/20230403/202304031611.3W21vgtb-lkp@intel.com/config)
compiler: microblaze-linux-gcc (GCC) 12.1.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # apt-get install sparse
        # sparse version: v0.6.4-39-gce1a6720-dirty
        # https://github.com/intel-lab-lkp/linux/commit/58c404cd43734d67706e3dc48c04a3e11e9dd8e8
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Fei-Cheng/udp-nat-vxlan-tx-after-nat-should-recsum-if-vxlan-tx-offload-on/20230401-103127
        git checkout 58c404cd43734d67706e3dc48c04a3e11e9dd8e8
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=microblaze olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=microblaze SHELL=/bin/bash net/netfilter/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Link: https://lore.kernel.org/oe-kbuild-all/202304031611.3W21vgtb-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
>> net/netfilter/nf_nat_proto.c:81:43: sparse: sparse: cast from restricted __be16
>> net/netfilter/nf_nat_proto.c:81:43: sparse: sparse: incorrect type in argument 1 (different base types) @@     expected int len @@     got restricted __be16 [usertype] @@
   net/netfilter/nf_nat_proto.c:81:43: sparse:     expected int len
   net/netfilter/nf_nat_proto.c:81:43: sparse:     got restricted __be16 [usertype]

vim +81 net/netfilter/nf_nat_proto.c

    65	
    66	static bool udp_manip_pkt(struct sk_buff *skb,
    67				  unsigned int iphdroff, unsigned int hdroff,
    68				  const struct nf_conntrack_tuple *tuple,
    69				  enum nf_nat_manip_type maniptype)
    70	{
    71		struct udphdr *hdr;
    72	
    73		if (skb_ensure_writable(skb, hdroff + sizeof(*hdr)))
    74			return false;
    75	
    76		hdr = (struct udphdr *)(skb->data + hdroff);
    77		__udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, !!hdr->check);
    78	
    79		if (skb->csum_local) {
    80			hdr->check = 0;
  > 81			hdr->check = udp_v4_check(htons(hdr->len), tuple->src.u3.ip, tuple->dst.u3.ip,
    82						  lco_csum(skb));
    83			if (hdr->check == 0)
    84				hdr->check = CSUM_MANGLED_0;
    85		}
    86	
    87		return true;
    88	}
    89
Edward Cree April 3, 2023, 10:56 a.m. UTC | #3
On 02/04/2023 19:18, Willem de Bruijn wrote:
> On Fri, Mar 31, 2023 at 10:31 PM Fei Cheng <chenwei.0515@bytedance.com> wrote:
>>
>> From: "chenwei.0515" <chenwei.0515@bytedance.com>
>>
>>     If vxlan-dev enable tx csum offload, there are two case of CHECKSUM_PARTIAL,
>>     but udp->check donot have the both meanings.
>>
>>     1. vxlan-dev disable tx csum offload, udp->check is just pseudo hdr.
>>     2. vxlan-dev enable tx csum offload, udp->check is pseudo hdr and
>>        csum from outter l4 to innner l4.
>>
>>     Unfortunately if there is a nat process after vxlan tx,udp_manip_pkt just use
>>     CSUM_PARTIAL to re csum PKT, which is just right on vxlan tx csum disable offload.

In case 1 csum_start should point to the (outer) UDP header, whereas in
 case 2 csum_start should point to the inner L4 header (because in the
 normal TX path w/o NAT, nothing else will ever need to touch the outer
 csum after this point).

> The issue is that for encapsulated traffic with local checksum offload,
> netfilter incorrectly recomputes the outer UDP checksum as if it is an
> unencapsulated CHECKSUM_PARTIAL packet, correct?

So if netfilter sees a packet with CHECKSUM_PARTIAL whose csum_start
 doesn't point to the header nf NAT is editing, that's exactly the case
 where it needs to use lco_csum to calculate the new outer sum.  No?

-ed

PS. Fei, your emails aren't reaching the netdev mailing list, probably
 because you're sending as HTML.  Please switch to plain text.
Fei Cheng April 4, 2023, 1:48 a.m. UTC | #4
Thank you for remind plain text.
Use csum_start to seperate these two cases, maybe a good idea.
1.Disable tx csum
skb->ip_summed ==  CHECKSUM_PARTIAL && skb_transport_header == udp
2.Enable tx csum
skb->ip_summed ==  CHECKSUM_PARTIAL && skb_transport_header != udp

Correct?

在 2023/4/3 下午6:56, Edward Cree 写道:
> On 02/04/2023 19:18, Willem de Bruijn wrote:
>> On Fri, Mar 31, 2023 at 10:31 PM Fei Cheng <chenwei.0515@bytedance.com> wrote:
>>>
>>> From: "chenwei.0515" <chenwei.0515@bytedance.com>
>>>
>>>      If vxlan-dev enable tx csum offload, there are two case of CHECKSUM_PARTIAL,
>>>      but udp->check donot have the both meanings.
>>>
>>>      1. vxlan-dev disable tx csum offload, udp->check is just pseudo hdr.
>>>      2. vxlan-dev enable tx csum offload, udp->check is pseudo hdr and
>>>         csum from outter l4 to innner l4.
>>>
>>>      Unfortunately if there is a nat process after vxlan tx,udp_manip_pkt just use
>>>      CSUM_PARTIAL to re csum PKT, which is just right on vxlan tx csum disable offload.
> 
> In case 1 csum_start should point to the (outer) UDP header, whereas in
>   case 2 csum_start should point to the inner L4 header (because in the
>   normal TX path w/o NAT, nothing else will ever need to touch the outer
>   csum after this point).
> 
>> The issue is that for encapsulated traffic with local checksum offload,
>> netfilter incorrectly recomputes the outer UDP checksum as if it is an
>> unencapsulated CHECKSUM_PARTIAL packet, correct?
> 
> So if netfilter sees a packet with CHECKSUM_PARTIAL whose csum_start
>   doesn't point to the header nf NAT is editing, that's exactly the case
>   where it needs to use lco_csum to calculate the new outer sum.  No?
> 
> -ed
> 
> PS. Fei, your emails aren't reaching the netdev mailing list, probably
>   because you're sending as HTML.  Please switch to plain text.
Edward Cree April 4, 2023, 10:26 a.m. UTC | #5
On 04/04/2023 02:48, Fei Cheng wrote:
> Thank you for remind plain text.
> Use csum_start to seperate these two cases, maybe a good idea.
> 1.Disable tx csum
> skb->ip_summed ==  CHECKSUM_PARTIAL && skb_transport_header == udp
> 2.Enable tx csum
> skb->ip_summed ==  CHECKSUM_PARTIAL && skb_transport_header != udp
> 
> Correct?

What do you mean by "skb_transport_header == udp"?  That it is a UDP
 header?  Or that it is at the offset of the UDP header?  The inner
 L4 packet could be UDP as well.
And why are you even looking at skb_transport_header when it's
 csum_start that determines what the hardware will do?
AFAICT nothing in nf_nat_proto.c looks at skb_transport_header anyway;
 indeed in nf_nat_icmp_reply_translation() we can call into this code
 with hdroff ending up pointing way deeper in the packet.

In any case, after digging deeper into the netfilter code, it looks to
 me like there's no issue in the first place: netfilter doesn't
 'recompute' the UDP checksum, it just accumulates delta into it to
 account for the changes it made to the packet, on the assumption that
 the existing checksum is correct.
Which AIUI will do the right thing whether the checksum is
* a correct checksum for an unencapsulated packet
* a pseudohdr sum for a CHECKSUM_PARTIAL (unencapsulated) packet
* a correct (LCO) checksum for an encapsulated packet, whose inner L4
  checksum may or may not be CHECKSUM_PARTIAL offloaded.

Do you have a test case that breaks with the current code?

-ed
Fei Cheng April 4, 2023, 10:58 a.m. UTC | #6
test case
1. create vxlan tunnel with tx csum offload on.
2. iptables nat the packet out from vxlan
3. vxlan remote will see the packet udp csum error.

I used to use skb->csum_valid to distinguish the two cases.
If donot add extra skb flags, maybe csum_start can seperate too.




在 2023/4/4 下午6:26, Edward Cree 写道:
> On 04/04/2023 02:48, Fei Cheng wrote:
>> Thank you for remind plain text.
>> Use csum_start to seperate these two cases, maybe a good idea.
>> 1.Disable tx csum
>> skb->ip_summed ==  CHECKSUM_PARTIAL && skb_transport_header == udp
>> 2.Enable tx csum
>> skb->ip_summed ==  CHECKSUM_PARTIAL && skb_transport_header != udp
>>
>> Correct?
> 
> What do you mean by "skb_transport_header == udp"?  That it is a UDP
>   header?  Or that it is at the offset of the UDP header?  The inner
>   L4 packet could be UDP as well.
> And why are you even looking at skb_transport_header when it's
>   csum_start that determines what the hardware will do?
> AFAICT nothing in nf_nat_proto.c looks at skb_transport_header anyway;
>   indeed in nf_nat_icmp_reply_translation() we can call into this code
>   with hdroff ending up pointing way deeper in the packet.
> 
> In any case, after digging deeper into the netfilter code, it looks to
>   me like there's no issue in the first place: netfilter doesn't
>   'recompute' the UDP checksum, it just accumulates delta into it to
>   account for the changes it made to the packet, on the assumption that
>   the existing checksum is correct.
> Which AIUI will do the right thing whether the checksum is
> * a correct checksum for an unencapsulated packet
> * a pseudohdr sum for a CHECKSUM_PARTIAL (unencapsulated) packet
> * a correct (LCO) checksum for an encapsulated packet, whose inner L4
>    checksum may or may not be CHECKSUM_PARTIAL offloaded.
> 
> Do you have a test case that breaks with the current code?
> 
> -ed
diff mbox series

Patch

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ff7ad331fb82..62996d8d0b4d 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -990,6 +990,7 @@  struct sk_buff {
 	__u8			slow_gro:1;
 	__u8			csum_not_inet:1;
 	__u8			scm_io_uring:1;
+	__u8			csum_local:1;
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index c605d171eb2d..86bad0bbb76e 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -889,6 +889,7 @@  void udp_set_csum(bool nocheck, struct sk_buff *skb,
 		uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
 		if (uh->check == 0)
 			uh->check = CSUM_MANGLED_0;
+		skb->csum_local = 1;
 	} else {
 		skb->ip_summed = CHECKSUM_PARTIAL;
 		skb->csum_start = skb_transport_header(skb) - skb->head;
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index 48cc60084d28..a0261fe2d932 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -25,6 +25,7 @@ 
 #include <net/ip6_route.h>
 #include <net/xfrm.h>
 #include <net/ipv6.h>
+#include <net/udp.h>
 
 #include <net/netfilter/nf_conntrack_core.h>
 #include <net/netfilter/nf_conntrack.h>
@@ -75,6 +76,14 @@  static bool udp_manip_pkt(struct sk_buff *skb,
 	hdr = (struct udphdr *)(skb->data + hdroff);
 	__udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, !!hdr->check);
 
+	if (skb->csum_local) {
+		hdr->check = 0;
+		hdr->check = udp_v4_check(htons(hdr->len), tuple->src.u3.ip, tuple->dst.u3.ip,
+					  lco_csum(skb));
+		if (hdr->check == 0)
+			hdr->check = CSUM_MANGLED_0;
+	}
+
 	return true;
 }