diff mbox series

[ovs-dev,v2] ipf: fix only nat the first fragment in the reass process

Message ID 1623998750-9037-1-git-send-email-wenxu@ucloud.cn
State Accepted
Headers show
Series [ovs-dev,v2] ipf: fix only nat the first fragment in the reass process | expand

Commit Message

wenxu June 18, 2021, 6:45 a.m. UTC
From: wenxu <wenxu@ucloud.cn>

The ipf collect original fragment packets and reass a new pkt
to do the conntrack logic. After finsh the conntrack things
copy the ct meta info to each orignal packet and modify the
l4 header in the first fragment. It should modify the ip src/
dst info for all the fragments.

Signed-off-by: wenxu <wenxu@ucloud.cn>
Co-authored-by: luke.li <luke.li@ucloud.cn>
Signed-off-by: luke.li <luke.li@ucloud.cn>
---
 lib/ipf.c | 82 +++++++++++++++++++++++++++++++++------------------------------
 1 file changed, 43 insertions(+), 39 deletions(-)

Comments

Aaron Conole July 8, 2021, 8:11 p.m. UTC | #1
wenxu@ucloud.cn writes:

> From: wenxu <wenxu@ucloud.cn>
>
> The ipf collect original fragment packets and reass a new pkt
> to do the conntrack logic. After finsh the conntrack things
> copy the ct meta info to each orignal packet and modify the
> l4 header in the first fragment. It should modify the ip src/
> dst info for all the fragments.
>
> Signed-off-by: wenxu <wenxu@ucloud.cn>
> Co-authored-by: luke.li <luke.li@ucloud.cn>
> Signed-off-by: luke.li <luke.li@ucloud.cn>
> ---

This looks okay.  Please post a set of sample packets that showcase the
issue.  I will write a test case that can demonstrate it that we can
check in with this fix.
Alternately, please add such a test case.

>  lib/ipf.c | 82 +++++++++++++++++++++++++++++++++------------------------------
>  1 file changed, 43 insertions(+), 39 deletions(-)
>
> diff --git a/lib/ipf.c b/lib/ipf.c
> index 9c83f19..795e801 100644
> --- a/lib/ipf.c
> +++ b/lib/ipf.c
> @@ -1149,52 +1149,56 @@ ipf_post_execute_reass_pkts(struct ipf *ipf,
>           * NETDEV_MAX_BURST. */
>          DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
>              if (rp && pkt == rp->list->reass_execute_ctx) {
> +                const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
> +                void *l4_frag = dp_packet_l4(frag_0->pkt);
> +                void *l4_reass = dp_packet_l4(pkt);
> +                memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt));
> +
>                  for (int i = 0; i <= rp->list->last_inuse_idx; i++) {
> -                    rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label;
> -                    rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark;
> -                    rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state;
> -                    rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone;
> -                    rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 =
> -                        pkt->md.ct_orig_tuple_ipv6;
> +                    const struct ipf_frag *frag_i = &rp->list->frag_list[i];
> +
> +                    frag_i->pkt->md.ct_label = pkt->md.ct_label;
> +                    frag_i->pkt->md.ct_mark = pkt->md.ct_mark;
> +                    frag_i->pkt->md.ct_state = pkt->md.ct_state;
> +                    frag_i->pkt->md.ct_zone = pkt->md.ct_zone;
> +                    frag_i->pkt->md.ct_orig_tuple_ipv6 =
> +                        pkt->md.ct_orig_tuple_ipv6;
>                      if (pkt->md.ct_orig_tuple_ipv6) {
> -                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 =
> +                        frag_i->pkt->md.ct_orig_tuple.ipv6 =
>                              pkt->md.ct_orig_tuple.ipv6;
>                      } else {
> -                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4  =
> +                        frag_i->pkt->md.ct_orig_tuple.ipv4 =
>                              pkt->md.ct_orig_tuple.ipv4;
>                      }
> -                }
> -
> -                const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
> -                void *l4_frag = dp_packet_l4(frag_0->pkt);
> -                void *l4_reass = dp_packet_l4(pkt);
> -                memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt));
> -
> -                if (v6) {
> -                    struct ovs_16aligned_ip6_hdr *l3_frag
> -                        = dp_packet_l3(frag_0->pkt);
> -                    struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt);
> -                    l3_frag->ip6_src = l3_reass->ip6_src;
> -                    l3_frag->ip6_dst = l3_reass->ip6_dst;
> -                } else {
> -                    struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt);
> -                    struct ip_header *l3_reass = dp_packet_l3(pkt);
> -                    if (!dp_packet_hwol_is_ipv4(frag_0->pkt)) {
> -                        ovs_be32 reass_ip =
> -                            get_16aligned_be32(&l3_reass->ip_src);
> -                        ovs_be32 frag_ip =
> -                            get_16aligned_be32(&l3_frag->ip_src);
> -
> -                        l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
> -                                                         frag_ip, reass_ip);
> -                        reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
> -                        frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
> -                        l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
> -                                                         frag_ip, reass_ip);
> +                    if (v6) {
> +                        struct ovs_16aligned_ip6_hdr *l3_frag
> +                            = dp_packet_l3(frag_i->pkt);
> +                        struct ovs_16aligned_ip6_hdr *l3_reass
> +                            = dp_packet_l3(pkt);
> +                        l3_frag->ip6_src = l3_reass->ip6_src;
> +                        l3_frag->ip6_dst = l3_reass->ip6_dst;
> +                    } else {
> +                        struct ip_header *l3_frag = dp_packet_l3(frag_i->pkt);
> +                        struct ip_header *l3_reass = dp_packet_l3(pkt);
> +                        if (!dp_packet_hwol_is_ipv4(frag_i->pkt)) {
> +                            ovs_be32 reass_ip =
> +                                get_16aligned_be32(&l3_reass->ip_src);
> +                            ovs_be32 frag_ip =
> +                                get_16aligned_be32(&l3_frag->ip_src);
> +
> +                            l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
> +                                                             frag_ip,
> +                                                             reass_ip);
> +                            reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
> +                            frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
> +                            l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
> +                                                             frag_ip,
> +                                                             reass_ip);
> +                        }
> +
> +                        l3_frag->ip_src = l3_reass->ip_src;
> +                        l3_frag->ip_dst = l3_reass->ip_dst;
>                      }
> -
> -                    l3_frag->ip_src = l3_reass->ip_src;
> -                    l3_frag->ip_dst = l3_reass->ip_dst;
>                  }
>  
>                  ipf_completed_list_add(&ipf->frag_complete_list, rp->list);
wenxu July 9, 2021, 4:09 a.m. UTC | #2
My case :


ovs-ofctl add-flow manbr "table=0,ct_state=-trk,ip,in_port=dpdk2, actions=ct(table=1, nat)"
ovs-ofctl add-flow manbr "table=0,ct_state=-trk,ip,in_port=dpdk3, actions=ct(table=1, nat)"
ovs-ofctl add-flow manbr "table=0,table=1,ct_state=+trk+new,ip,in_port=dpdk2, actions=ct(commit, nat(src=1.1.1.2)),dpdk3"
ovs-ofctl add-flow manbr "table=0,table=1,ct_state=+trk+est,ip,in_port=dpdk2, actions=dpdk3"
ovs-ofctl add-flow manbr "table=0,table=1,ct_state=+trk+est,ip,in_port=dpdk3, actions=dpdk2"
ovs-ofctl add-flow manbr "table=0,arp,actions=normal"



With the frag-packet in udp-frag.pcap. And the wrong nated packet. nat-packet.pcap


send the packet: 
iperf -u -c 1.1.1.3 -t 120 -i 2  -b 1 -l 1800




From: Aaron Conole <aconole@redhat.com>
Date: 2021-07-09 04:11:12
To:  wenxu@ucloud.cn
Cc:  i.maximets@ovn.org,dev@openvswitch.org
Subject: Re: [ovs-dev] [PATCH v2] ipf: fix only nat the first fragment in the reass process>wenxu@ucloud.cn writes:
>
>> From: wenxu <wenxu@ucloud.cn>
>>
>> The ipf collect original fragment packets and reass a new pkt
>> to do the conntrack logic. After finsh the conntrack things
>> copy the ct meta info to each orignal packet and modify the
>> l4 header in the first fragment. It should modify the ip src/
>> dst info for all the fragments.
>>
>> Signed-off-by: wenxu <wenxu@ucloud.cn>
>> Co-authored-by: luke.li <luke.li@ucloud.cn>
>> Signed-off-by: luke.li <luke.li@ucloud.cn>
>> ---
>
>This looks okay.  Please post a set of sample packets that showcase the
>issue.  I will write a test case that can demonstrate it that we can
>check in with this fix.
>Alternately, please add such a test case.
>
>>  lib/ipf.c | 82 +++++++++++++++++++++++++++++++++------------------------------
>>  1 file changed, 43 insertions(+), 39 deletions(-)
>>
>> diff --git a/lib/ipf.c b/lib/ipf.c
>> index 9c83f19..795e801 100644
>> --- a/lib/ipf.c
>> +++ b/lib/ipf.c
>> @@ -1149,52 +1149,56 @@ ipf_post_execute_reass_pkts(struct ipf *ipf,
>>           * NETDEV_MAX_BURST. */
>>          DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
>>              if (rp && pkt == rp->list->reass_execute_ctx) {
>> +                const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
>> +                void *l4_frag = dp_packet_l4(frag_0->pkt);
>> +                void *l4_reass = dp_packet_l4(pkt);
>> +                memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt));
>> +
>>                  for (int i = 0; i <= rp->list->last_inuse_idx; i++) {
>> -                    rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label;
>> -                    rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark;
>> -                    rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state;
>> -                    rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone;
>> -                    rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 =
>> -                        pkt->md.ct_orig_tuple_ipv6;
>> +                    const struct ipf_frag *frag_i = &rp->list->frag_list[i];
>> +
>> +                    frag_i->pkt->md.ct_label = pkt->md.ct_label;
>> +                    frag_i->pkt->md.ct_mark = pkt->md.ct_mark;
>> +                    frag_i->pkt->md.ct_state = pkt->md.ct_state;
>> +                    frag_i->pkt->md.ct_zone = pkt->md.ct_zone;
>> +                    frag_i->pkt->md.ct_orig_tuple_ipv6 =
>> +                        pkt->md.ct_orig_tuple_ipv6;
>>                      if (pkt->md.ct_orig_tuple_ipv6) {
>> -                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 =
>> +                        frag_i->pkt->md.ct_orig_tuple.ipv6 =
>>                              pkt->md.ct_orig_tuple.ipv6;
>>                      } else {
>> -                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4  =
>> +                        frag_i->pkt->md.ct_orig_tuple.ipv4 =
>>                              pkt->md.ct_orig_tuple.ipv4;
>>                      }
>> -                }
>> -
>> -                const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
>> -                void *l4_frag = dp_packet_l4(frag_0->pkt);
>> -                void *l4_reass = dp_packet_l4(pkt);
>> -                memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt));
>> -
>> -                if (v6) {
>> -                    struct ovs_16aligned_ip6_hdr *l3_frag
>> -                        = dp_packet_l3(frag_0->pkt);
>> -                    struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt);
>> -                    l3_frag->ip6_src = l3_reass->ip6_src;
>> -                    l3_frag->ip6_dst = l3_reass->ip6_dst;
>> -                } else {
>> -                    struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt);
>> -                    struct ip_header *l3_reass = dp_packet_l3(pkt);
>> -                    if (!dp_packet_hwol_is_ipv4(frag_0->pkt)) {
>> -                        ovs_be32 reass_ip =
>> -                            get_16aligned_be32(&l3_reass->ip_src);
>> -                        ovs_be32 frag_ip =
>> -                            get_16aligned_be32(&l3_frag->ip_src);
>> -
>> -                        l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
>> -                                                         frag_ip, reass_ip);
>> -                        reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
>> -                        frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
>> -                        l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
>> -                                                         frag_ip, reass_ip);
>> +                    if (v6) {
>> +                        struct ovs_16aligned_ip6_hdr *l3_frag
>> +                            = dp_packet_l3(frag_i->pkt);
>> +                        struct ovs_16aligned_ip6_hdr *l3_reass
>> +                            = dp_packet_l3(pkt);
>> +                        l3_frag->ip6_src = l3_reass->ip6_src;
>> +                        l3_frag->ip6_dst = l3_reass->ip6_dst;
>> +                    } else {
>> +                        struct ip_header *l3_frag = dp_packet_l3(frag_i->pkt);
>> +                        struct ip_header *l3_reass = dp_packet_l3(pkt);
>> +                        if (!dp_packet_hwol_is_ipv4(frag_i->pkt)) {
>> +                            ovs_be32 reass_ip =
>> +                                get_16aligned_be32(&l3_reass->ip_src);
>> +                            ovs_be32 frag_ip =
>> +                                get_16aligned_be32(&l3_frag->ip_src);
>> +
>> +                            l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
>> +                                                             frag_ip,
>> +                                                             reass_ip);
>> +                            reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
>> +                            frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
>> +                            l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
>> +                                                             frag_ip,
>> +                                                             reass_ip);
>> +                        }
>> +
>> +                        l3_frag->ip_src = l3_reass->ip_src;
>> +                        l3_frag->ip_dst = l3_reass->ip_dst;
>>                      }
>> -
>> -                    l3_frag->ip_src = l3_reass->ip_src;
>> -                    l3_frag->ip_dst = l3_reass->ip_dst;
>>                  }
>>  
>>                  ipf_completed_list_add(&ipf->frag_complete_list, rp->list);
>
ze wang Aug. 6, 2021, 8:48 a.m. UTC | #3
We had the same problem, when I send a series of fragments, only the
first piece of the packet is NATed. We reviewed and tested the patch,
it do solve the problem, and we think there are not any other bugs in
it.
Reviewed-by: wangze <wangze712@gmail.com>

wenxu <wenxu@ucloud.cn> 于2021年8月6日周五 下午4:38写道:
>
>
> My case :
>
>
> ovs-ofctl add-flow manbr "table=0,ct_state=-trk,ip,in_port=dpdk2, actions=ct(table=1, nat)"
> ovs-ofctl add-flow manbr "table=0,ct_state=-trk,ip,in_port=dpdk3, actions=ct(table=1, nat)"
> ovs-ofctl add-flow manbr "table=0,table=1,ct_state=+trk+new,ip,in_port=dpdk2, actions=ct(commit, nat(src=1.1.1.2)),dpdk3"
> ovs-ofctl add-flow manbr "table=0,table=1,ct_state=+trk+est,ip,in_port=dpdk2, actions=dpdk3"
> ovs-ofctl add-flow manbr "table=0,table=1,ct_state=+trk+est,ip,in_port=dpdk3, actions=dpdk2"
> ovs-ofctl add-flow manbr "table=0,arp,actions=normal"
>
>
>
> With the frag-packet in udp-frag.pcap. And the wrong nated packet. nat-packet.pcap
>
>
> send the packet:
> iperf -u -c 1.1.1.3 -t 120 -i 2  -b 1 -l 1800
>
>
>
>
> From: Aaron Conole <aconole@redhat.com>
> Date: 2021-07-09 04:11:12
> To:  wenxu@ucloud.cn
> Cc:  i.maximets@ovn.org,dev@openvswitch.org
> Subject: Re: [ovs-dev] [PATCH v2] ipf: fix only nat the first fragment in the reass process>wenxu@ucloud.cn writes:
> >
> >> From: wenxu <wenxu@ucloud.cn>
> >>
> >> The ipf collect original fragment packets and reass a new pkt
> >> to do the conntrack logic. After finsh the conntrack things
> >> copy the ct meta info to each orignal packet and modify the
> >> l4 header in the first fragment. It should modify the ip src/
> >> dst info for all the fragments.
> >>
> >> Signed-off-by: wenxu <wenxu@ucloud.cn>
> >> Co-authored-by: luke.li <luke.li@ucloud.cn>
> >> Signed-off-by: luke.li <luke.li@ucloud.cn>
> >> ---
> >
> >This looks okay.  Please post a set of sample packets that showcase the
> >issue.  I will write a test case that can demonstrate it that we can
> >check in with this fix.
> >Alternately, please add such a test case.
> >
> >>  lib/ipf.c | 82 +++++++++++++++++++++++++++++++++------------------------------
> >>  1 file changed, 43 insertions(+), 39 deletions(-)
> >>
> >> diff --git a/lib/ipf.c b/lib/ipf.c
> >> index 9c83f19..795e801 100644
> >> --- a/lib/ipf.c
> >> +++ b/lib/ipf.c
> >> @@ -1149,52 +1149,56 @@ ipf_post_execute_reass_pkts(struct ipf *ipf,
> >>           * NETDEV_MAX_BURST. */
> >>          DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
> >>              if (rp && pkt == rp->list->reass_execute_ctx) {
> >> +                const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
> >> +                void *l4_frag = dp_packet_l4(frag_0->pkt);
> >> +                void *l4_reass = dp_packet_l4(pkt);
> >> +                memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt));
> >> +
> >>                  for (int i = 0; i <= rp->list->last_inuse_idx; i++) {
> >> -                    rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label;
> >> -                    rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark;
> >> -                    rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state;
> >> -                    rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone;
> >> -                    rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 =
> >> -                        pkt->md.ct_orig_tuple_ipv6;
> >> +                    const struct ipf_frag *frag_i = &rp->list->frag_list[i];
> >> +
> >> +                    frag_i->pkt->md.ct_label = pkt->md.ct_label;
> >> +                    frag_i->pkt->md.ct_mark = pkt->md.ct_mark;
> >> +                    frag_i->pkt->md.ct_state = pkt->md.ct_state;
> >> +                    frag_i->pkt->md.ct_zone = pkt->md.ct_zone;
> >> +                    frag_i->pkt->md.ct_orig_tuple_ipv6 =
> >> +                        pkt->md.ct_orig_tuple_ipv6;
> >>                      if (pkt->md.ct_orig_tuple_ipv6) {
> >> -                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 =
> >> +                        frag_i->pkt->md.ct_orig_tuple.ipv6 =
> >>                              pkt->md.ct_orig_tuple.ipv6;
> >>                      } else {
> >> -                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4  =
> >> +                        frag_i->pkt->md.ct_orig_tuple.ipv4 =
> >>                              pkt->md.ct_orig_tuple.ipv4;
> >>                      }
> >> -                }
> >> -
> >> -                const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
> >> -                void *l4_frag = dp_packet_l4(frag_0->pkt);
> >> -                void *l4_reass = dp_packet_l4(pkt);
> >> -                memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt));
> >> -
> >> -                if (v6) {
> >> -                    struct ovs_16aligned_ip6_hdr *l3_frag
> >> -                        = dp_packet_l3(frag_0->pkt);
> >> -                    struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt);
> >> -                    l3_frag->ip6_src = l3_reass->ip6_src;
> >> -                    l3_frag->ip6_dst = l3_reass->ip6_dst;
> >> -                } else {
> >> -                    struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt);
> >> -                    struct ip_header *l3_reass = dp_packet_l3(pkt);
> >> -                    if (!dp_packet_hwol_is_ipv4(frag_0->pkt)) {
> >> -                        ovs_be32 reass_ip =
> >> -                            get_16aligned_be32(&l3_reass->ip_src);
> >> -                        ovs_be32 frag_ip =
> >> -                            get_16aligned_be32(&l3_frag->ip_src);
> >> -
> >> -                        l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
> >> -                                                         frag_ip, reass_ip);
> >> -                        reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
> >> -                        frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
> >> -                        l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
> >> -                                                         frag_ip, reass_ip);
> >> +                    if (v6) {
> >> +                        struct ovs_16aligned_ip6_hdr *l3_frag
> >> +                            = dp_packet_l3(frag_i->pkt);
> >> +                        struct ovs_16aligned_ip6_hdr *l3_reass
> >> +                            = dp_packet_l3(pkt);
> >> +                        l3_frag->ip6_src = l3_reass->ip6_src;
> >> +                        l3_frag->ip6_dst = l3_reass->ip6_dst;
> >> +                    } else {
> >> +                        struct ip_header *l3_frag = dp_packet_l3(frag_i->pkt);
> >> +                        struct ip_header *l3_reass = dp_packet_l3(pkt);
> >> +                        if (!dp_packet_hwol_is_ipv4(frag_i->pkt)) {
> >> +                            ovs_be32 reass_ip =
> >> +                                get_16aligned_be32(&l3_reass->ip_src);
> >> +                            ovs_be32 frag_ip =
> >> +                                get_16aligned_be32(&l3_frag->ip_src);
> >> +
> >> +                            l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
> >> +                                                             frag_ip,
> >> +                                                             reass_ip);
> >> +                            reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
> >> +                            frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
> >> +                            l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
> >> +                                                             frag_ip,
> >> +                                                             reass_ip);
> >> +                        }
> >> +
> >> +                        l3_frag->ip_src = l3_reass->ip_src;
> >> +                        l3_frag->ip_dst = l3_reass->ip_dst;
> >>                      }
> >> -
> >> -                    l3_frag->ip_src = l3_reass->ip_src;
> >> -                    l3_frag->ip_dst = l3_reass->ip_dst;
> >>                  }
> >>
> >>                  ipf_completed_list_add(&ipf->frag_complete_list, rp->list);
> >
>
>
>
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Aaron Conole Aug. 12, 2021, 4:17 p.m. UTC | #4
wenxu@ucloud.cn writes:

> From: wenxu <wenxu@ucloud.cn>
>
> The ipf collect original fragment packets and reass a new pkt
> to do the conntrack logic. After finsh the conntrack things
> copy the ct meta info to each orignal packet and modify the
> l4 header in the first fragment. It should modify the ip src/
> dst info for all the fragments.
>
> Signed-off-by: wenxu <wenxu@ucloud.cn>
> Co-authored-by: luke.li <luke.li@ucloud.cn>
> Signed-off-by: luke.li <luke.li@ucloud.cn>
> ---

Acked-by: Aaron Conole <aconole@redhat.com>

Thanks for the fix.  I see it can work for any l3 protocol.

Based on the comments you supplied, I wrote the following test case.  It
can either be folded in by you (or Ilya on apply), or I can submit as a
separate patch (in case you are worried about having my sign-off /
coauthor on this patch).

When testing 'make check-system-userspace' before this patch, I see a
failure and get the following tcpdump logged:

  12:15:31 aconole@RHTPC1VM0NT {master} ~/git/ovs$ sudo tcpdump -r tests/system-userspace-testsuite.dir/078/p1.pcap 
  reading from file tests/system-userspace-testsuite.dir/078/p1.pcap, link-type EN10MB (Ethernet), snapshot length 262144
  dropped privs to tcpdump
  12:07:21.364925 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28
  12:07:21.364928 ARP, Reply 10.2.1.2 is-at e6:45:4a:80:7c:61 (oui Unknown), length 28
  12:07:21.365095 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 40165, seq 1, length 1480
  12:07:21.365099 IP 10.2.1.1 > 10.1.1.2: icmp
  12:07:21.365101 IP 10.2.1.1 > 10.1.1.2: icmp
  12:07:21.365102 IP 10.2.1.1 > 10.1.1.2: icmp

We see the first frag correct, but subsequent frags are broken.

This test worked both for userspace and kernel datapath on my local
system.

---
 tests/system-traffic.at | 46 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index f400cfabc9..feb335c783 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -3305,6 +3305,52 @@ NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([conntrack - IPv4 Fragmentation + nat])
+AT_SKIP_IF([test $HAVE_TCPDUMP = no])
+CHECK_CONNTRACK()
+
+OVS_TRAFFIC_VSWITCHD_START(
+   [set-fail-mode br0 secure -- ])
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+ADD_VETH(p0, at_ns0, br0, "10.2.1.1/24")
+NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address 80:88:88:88:88:88])
+ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
+
+# create a dummy route for NAT
+ADD_VETH(p2, at_ns1, br0, "10.2.1.2/24")
+NS_CHECK_EXEC([at_ns0], [ip route add 10.1.1.0/24 via 10.2.1.2])
+NS_CHECK_EXEC([at_ns1], [ip neigh add 10.1.1.1 nud permanent lladdr 80:88:88:88:88:88 dev p1])
+
+# disable iptables from getting in the way
+NS_EXEC([at_ns1], [iptables --flush])
+
+# solely for debugging when things go wrong
+NS_EXEC([at_ns0], [tcpdump -i p0 -w p0.pcap -xx >tcpdump.out &])
+NS_EXEC([at_ns1], [tcpdump -i p1 -w p1.pcap -xx >tcpdump.out &])
+NS_EXEC([at_ns1], [tcpdump -i p2 -w p2.pcap -xx >tcpdump2.out &])
+
+AT_DATA([flows.txt], [dnl
+table=0,arp,actions=normal
+table=0,ct_state=-trk,ip,in_port=ovs-p0, actions=ct(table=1, nat)
+table=0,ct_state=-trk,ip,in_port=ovs-p1, actions=ct(table=1, nat)
+table=1,ct_state=+trk+new,ip,in_port=ovs-p0, actions=ct(commit, nat(src=10.1.1.1)),ovs-p1
+table=1,ct_state=+trk+est,ip,in_port=ovs-p0, actions=ovs-p1
+table=1,ct_state=+trk+est,ip,in_port=ovs-p1, actions=ovs-p0
+])
+
+AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
+
+#check connectivity
+NS_CHECK_EXEC([at_ns0], [ping -c 1 10.1.1.2 -M dont -s 4500 | FORMAT_PING], [0], [dnl
+1 packets transmitted, 1 received, 0% packet loss, time 0ms
+])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
+
+
 AT_SETUP([conntrack - resubmit to ct multiple times])
 CHECK_CONNTRACK()
Paolo Valerio Aug. 12, 2021, 5:33 p.m. UTC | #5
Hi,

thanks for working on this

wenxu@ucloud.cn writes:

> From: wenxu <wenxu@ucloud.cn>
>
> The ipf collect original fragment packets and reass a new pkt
> to do the conntrack logic. After finsh the conntrack things
> copy the ct meta info to each orignal packet and modify the
> l4 header in the first fragment. It should modify the ip src/
> dst info for all the fragments.
>
> Signed-off-by: wenxu <wenxu@ucloud.cn>
> Co-authored-by: luke.li <luke.li@ucloud.cn>
> Signed-off-by: luke.li <luke.li@ucloud.cn>
> ---

The patch LGTM, and works as expected.
Any chance to add a Fixes tag?

I think it should be:

4ea96698f667 ("Userspace datapath: Add fragmentation handling.")

besides that:

Acked-by: Paolo Valerio <pvalerio@redhat.com>
Ilya Maximets Aug. 16, 2021, 7:26 p.m. UTC | #6
On 8/12/21 6:17 PM, Aaron Conole wrote:
> wenxu@ucloud.cn writes:
> 
>> From: wenxu <wenxu@ucloud.cn>
>>
>> The ipf collect original fragment packets and reass a new pkt
>> to do the conntrack logic. After finsh the conntrack things
>> copy the ct meta info to each orignal packet and modify the
>> l4 header in the first fragment. It should modify the ip src/
>> dst info for all the fragments.
>>
>> Signed-off-by: wenxu <wenxu@ucloud.cn>
>> Co-authored-by: luke.li <luke.li@ucloud.cn>
>> Signed-off-by: luke.li <luke.li@ucloud.cn>
>> ---
> 
> Acked-by: Aaron Conole <aconole@redhat.com>
> 
> Thanks for the fix.  I see it can work for any l3 protocol.
> 
> Based on the comments you supplied, I wrote the following test case.  It
> can either be folded in by you (or Ilya on apply), or I can submit as a
> separate patch (in case you are worried about having my sign-off /
> coauthor on this patch).
> 
> When testing 'make check-system-userspace' before this patch, I see a
> failure and get the following tcpdump logged:
> 
>   12:15:31 aconole@RHTPC1VM0NT {master} ~/git/ovs$ sudo tcpdump -r tests/system-userspace-testsuite.dir/078/p1.pcap 
>   reading from file tests/system-userspace-testsuite.dir/078/p1.pcap, link-type EN10MB (Ethernet), snapshot length 262144
>   dropped privs to tcpdump
>   12:07:21.364925 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28
>   12:07:21.364928 ARP, Reply 10.2.1.2 is-at e6:45:4a:80:7c:61 (oui Unknown), length 28
>   12:07:21.365095 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 40165, seq 1, length 1480
>   12:07:21.365099 IP 10.2.1.1 > 10.1.1.2: icmp
>   12:07:21.365101 IP 10.2.1.1 > 10.1.1.2: icmp
>   12:07:21.365102 IP 10.2.1.1 > 10.1.1.2: icmp
> 
> We see the first frag correct, but subsequent frags are broken.
> 
> This test worked both for userspace and kernel datapath on my local
> system.

Hmm.  This test fails for me for both kernel and userspace:

tcpdump -r tests/system-userspace-testsuite.dir/078/p0.pcap
15:17:12.832383 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28
15:17:12.834317 ARP, Reply 10.2.1.2 is-at 46:0c:83:aa:6e:b0 (oui Unknown), length 28
15:17:12.834327 IP 10.2.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480
15:17:12.834329 IP 10.2.1.1 > 10.1.1.2: icmp
15:17:12.834330 IP 10.2.1.1 > 10.1.1.2: icmp
15:17:12.834332 IP 10.2.1.1 > 10.1.1.2: icmp

tcpdump -r tests/system-userspace-testsuite.dir/078/p1.pcap
15:17:12.833542 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28
15:17:12.834994 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480
15:17:12.834999 IP 10.1.1.1 > 10.1.1.2: icmp
15:17:12.835002 IP 10.1.1.1 > 10.1.1.2: icmp
15:17:12.835004 IP 10.1.1.1 > 10.1.1.2: icmp

ping -c 1 10.1.1.2 -M dont -s 4500 | grep "transmitted" | sed 's/time.*ms$/time 0ms/'
NS_EXEC_HEREDOC
--- -   2021-08-16 15:17:22.844535052 -0400
+++ /root/ovs/tests/system-userspace-testsuite.dir/at-groups/78/stdout
@@ -1,2 +1,2 @@
-1 packets transmitted, 1 received, 0% packet loss, time 0ms
+1 packets transmitted, 0 received, 100% packet loss, time 0ms

# uname -a
Linux rhel8 4.18.0-305.3.1.el8_4.x86_64

I'm not sure what is going on.  Could you, please, re-check?

I will not apply this patch for now until we figure out how to test it.

Best regards, Ilya Maximets.

> 
> ---
>  tests/system-traffic.at | 46 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 46 insertions(+)
> 
> diff --git a/tests/system-traffic.at b/tests/system-traffic.at
> index f400cfabc9..feb335c783 100644
> --- a/tests/system-traffic.at
> +++ b/tests/system-traffic.at
> @@ -3305,6 +3305,52 @@ NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING
>  OVS_TRAFFIC_VSWITCHD_STOP
>  AT_CLEANUP
>  
> +AT_SETUP([conntrack - IPv4 Fragmentation + nat])
> +AT_SKIP_IF([test $HAVE_TCPDUMP = no])
> +CHECK_CONNTRACK()
> +
> +OVS_TRAFFIC_VSWITCHD_START(
> +   [set-fail-mode br0 secure -- ])
> +
> +ADD_NAMESPACES(at_ns0, at_ns1)
> +
> +ADD_VETH(p0, at_ns0, br0, "10.2.1.1/24")
> +NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address 80:88:88:88:88:88])
> +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
> +
> +# create a dummy route for NAT
> +ADD_VETH(p2, at_ns1, br0, "10.2.1.2/24")
> +NS_CHECK_EXEC([at_ns0], [ip route add 10.1.1.0/24 via 10.2.1.2])
> +NS_CHECK_EXEC([at_ns1], [ip neigh add 10.1.1.1 nud permanent lladdr 80:88:88:88:88:88 dev p1])
> +
> +# disable iptables from getting in the way
> +NS_EXEC([at_ns1], [iptables --flush])
> +
> +# solely for debugging when things go wrong
> +NS_EXEC([at_ns0], [tcpdump -i p0 -w p0.pcap -xx >tcpdump.out &])
> +NS_EXEC([at_ns1], [tcpdump -i p1 -w p1.pcap -xx >tcpdump.out &])
> +NS_EXEC([at_ns1], [tcpdump -i p2 -w p2.pcap -xx >tcpdump2.out &])
> +
> +AT_DATA([flows.txt], [dnl
> +table=0,arp,actions=normal
> +table=0,ct_state=-trk,ip,in_port=ovs-p0, actions=ct(table=1, nat)
> +table=0,ct_state=-trk,ip,in_port=ovs-p1, actions=ct(table=1, nat)
> +table=1,ct_state=+trk+new,ip,in_port=ovs-p0, actions=ct(commit, nat(src=10.1.1.1)),ovs-p1
> +table=1,ct_state=+trk+est,ip,in_port=ovs-p0, actions=ovs-p1
> +table=1,ct_state=+trk+est,ip,in_port=ovs-p1, actions=ovs-p0
> +])
> +
> +AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
> +
> +#check connectivity
> +NS_CHECK_EXEC([at_ns0], [ping -c 1 10.1.1.2 -M dont -s 4500 | FORMAT_PING], [0], [dnl
> +1 packets transmitted, 1 received, 0% packet loss, time 0ms
> +])
> +
> +OVS_TRAFFIC_VSWITCHD_STOP
> +AT_CLEANUP
> +
> +
>  AT_SETUP([conntrack - resubmit to ct multiple times])
>  CHECK_CONNTRACK()
>  
>
Aaron Conole Aug. 23, 2021, 3:11 p.m. UTC | #7
Ilya Maximets <i.maximets@ovn.org> writes:

> On 8/12/21 6:17 PM, Aaron Conole wrote:
>> wenxu@ucloud.cn writes:
>> 
>>> From: wenxu <wenxu@ucloud.cn>
>>>
>>> The ipf collect original fragment packets and reass a new pkt
>>> to do the conntrack logic. After finsh the conntrack things
>>> copy the ct meta info to each orignal packet and modify the
>>> l4 header in the first fragment. It should modify the ip src/
>>> dst info for all the fragments.
>>>
>>> Signed-off-by: wenxu <wenxu@ucloud.cn>
>>> Co-authored-by: luke.li <luke.li@ucloud.cn>
>>> Signed-off-by: luke.li <luke.li@ucloud.cn>
>>> ---
>> 
>> Acked-by: Aaron Conole <aconole@redhat.com>
>> 
>> Thanks for the fix.  I see it can work for any l3 protocol.
>> 
>> Based on the comments you supplied, I wrote the following test case.  It
>> can either be folded in by you (or Ilya on apply), or I can submit as a
>> separate patch (in case you are worried about having my sign-off /
>> coauthor on this patch).
>> 
>> When testing 'make check-system-userspace' before this patch, I see a
>> failure and get the following tcpdump logged:
>> 
>>   12:15:31 aconole@RHTPC1VM0NT {master} ~/git/ovs$ sudo tcpdump -r
>> tests/system-userspace-testsuite.dir/078/p1.pcap
>>   reading from file
>> tests/system-userspace-testsuite.dir/078/p1.pcap, link-type EN10MB
>> (Ethernet), snapshot length 262144
>>   dropped privs to tcpdump
>>   12:07:21.364925 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28
>>   12:07:21.364928 ARP, Reply 10.2.1.2 is-at e6:45:4a:80:7c:61 (oui Unknown), length 28
>>   12:07:21.365095 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 40165, seq 1, length 1480
>>   12:07:21.365099 IP 10.2.1.1 > 10.1.1.2: icmp
>>   12:07:21.365101 IP 10.2.1.1 > 10.1.1.2: icmp
>>   12:07:21.365102 IP 10.2.1.1 > 10.1.1.2: icmp
>> 
>> We see the first frag correct, but subsequent frags are broken.
>> 
>> This test worked both for userspace and kernel datapath on my local
>> system.
>
> Hmm.  This test fails for me for both kernel and userspace:

Okay, I'll try it again on my system.  For reference, I was on F34,
kernel 5.12.12-300.fc34.x86_64

> tcpdump -r tests/system-userspace-testsuite.dir/078/p0.pcap
> 15:17:12.832383 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28
> 15:17:12.834317 ARP, Reply 10.2.1.2 is-at 46:0c:83:aa:6e:b0 (oui Unknown), length 28
> 15:17:12.834327 IP 10.2.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480
> 15:17:12.834329 IP 10.2.1.1 > 10.1.1.2: icmp
> 15:17:12.834330 IP 10.2.1.1 > 10.1.1.2: icmp
> 15:17:12.834332 IP 10.2.1.1 > 10.1.1.2: icmp
>
> tcpdump -r tests/system-userspace-testsuite.dir/078/p1.pcap
> 15:17:12.833542 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28
> 15:17:12.834994 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480
> 15:17:12.834999 IP 10.1.1.1 > 10.1.1.2: icmp
> 15:17:12.835002 IP 10.1.1.1 > 10.1.1.2: icmp
> 15:17:12.835004 IP 10.1.1.1 > 10.1.1.2: icmp
>
> ping -c 1 10.1.1.2 -M dont -s 4500 | grep "transmitted" | sed 's/time.*ms$/time 0ms/'
> NS_EXEC_HEREDOC
> --- -   2021-08-16 15:17:22.844535052 -0400
> +++ /root/ovs/tests/system-userspace-testsuite.dir/at-groups/78/stdout
> @@ -1,2 +1,2 @@
> -1 packets transmitted, 1 received, 0% packet loss, time 0ms
> +1 packets transmitted, 0 received, 100% packet loss, time 0ms
>
> # uname -a
> Linux rhel8 4.18.0-305.3.1.el8_4.x86_64
>
> I'm not sure what is going on.  Could you, please, re-check?

I'll boot an rhel8.4 instance and try it out.

> I will not apply this patch for now until we figure out how to test it.

Okay.

Thanks, Ilya!

> Best regards, Ilya Maximets.
>
>> 
>> ---
>>  tests/system-traffic.at | 46 +++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 46 insertions(+)
>> 
>> diff --git a/tests/system-traffic.at b/tests/system-traffic.at
>> index f400cfabc9..feb335c783 100644
>> --- a/tests/system-traffic.at
>> +++ b/tests/system-traffic.at
>> @@ -3305,6 +3305,52 @@ NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c
>> 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING
>>  OVS_TRAFFIC_VSWITCHD_STOP
>>  AT_CLEANUP
>>  
>> +AT_SETUP([conntrack - IPv4 Fragmentation + nat])
>> +AT_SKIP_IF([test $HAVE_TCPDUMP = no])
>> +CHECK_CONNTRACK()
>> +
>> +OVS_TRAFFIC_VSWITCHD_START(
>> +   [set-fail-mode br0 secure -- ])
>> +
>> +ADD_NAMESPACES(at_ns0, at_ns1)
>> +
>> +ADD_VETH(p0, at_ns0, br0, "10.2.1.1/24")
>> +NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address 80:88:88:88:88:88])
>> +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24")
>> +
>> +# create a dummy route for NAT
>> +ADD_VETH(p2, at_ns1, br0, "10.2.1.2/24")
>> +NS_CHECK_EXEC([at_ns0], [ip route add 10.1.1.0/24 via 10.2.1.2])
>> +NS_CHECK_EXEC([at_ns1], [ip neigh add 10.1.1.1 nud permanent lladdr 80:88:88:88:88:88 dev p1])
>> +
>> +# disable iptables from getting in the way
>> +NS_EXEC([at_ns1], [iptables --flush])
>> +
>> +# solely for debugging when things go wrong
>> +NS_EXEC([at_ns0], [tcpdump -i p0 -w p0.pcap -xx >tcpdump.out &])
>> +NS_EXEC([at_ns1], [tcpdump -i p1 -w p1.pcap -xx >tcpdump.out &])
>> +NS_EXEC([at_ns1], [tcpdump -i p2 -w p2.pcap -xx >tcpdump2.out &])
>> +
>> +AT_DATA([flows.txt], [dnl
>> +table=0,arp,actions=normal
>> +table=0,ct_state=-trk,ip,in_port=ovs-p0, actions=ct(table=1, nat)
>> +table=0,ct_state=-trk,ip,in_port=ovs-p1, actions=ct(table=1, nat)
>> +table=1,ct_state=+trk+new,ip,in_port=ovs-p0, actions=ct(commit, nat(src=10.1.1.1)),ovs-p1
>> +table=1,ct_state=+trk+est,ip,in_port=ovs-p0, actions=ovs-p1
>> +table=1,ct_state=+trk+est,ip,in_port=ovs-p1, actions=ovs-p0
>> +])
>> +
>> +AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
>> +
>> +#check connectivity
>> +NS_CHECK_EXEC([at_ns0], [ping -c 1 10.1.1.2 -M dont -s 4500 | FORMAT_PING], [0], [dnl
>> +1 packets transmitted, 1 received, 0% packet loss, time 0ms
>> +])
>> +
>> +OVS_TRAFFIC_VSWITCHD_STOP
>> +AT_CLEANUP
>> +
>> +
>>  AT_SETUP([conntrack - resubmit to ct multiple times])
>>  CHECK_CONNTRACK()
>>  
>>
Aaron Conole Aug. 24, 2021, 5:43 p.m. UTC | #8
Aaron Conole <aconole@redhat.com> writes:

> Ilya Maximets <i.maximets@ovn.org> writes:
>
>> On 8/12/21 6:17 PM, Aaron Conole wrote:
>>> wenxu@ucloud.cn writes:
>>> 
>>>> From: wenxu <wenxu@ucloud.cn>
>>>>
>>>> The ipf collect original fragment packets and reass a new pkt
>>>> to do the conntrack logic. After finsh the conntrack things
>>>> copy the ct meta info to each orignal packet and modify the
>>>> l4 header in the first fragment. It should modify the ip src/
>>>> dst info for all the fragments.
>>>>
>>>> Signed-off-by: wenxu <wenxu@ucloud.cn>
>>>> Co-authored-by: luke.li <luke.li@ucloud.cn>
>>>> Signed-off-by: luke.li <luke.li@ucloud.cn>
>>>> ---
>>> 
>>> Acked-by: Aaron Conole <aconole@redhat.com>
>>> 
>>> Thanks for the fix.  I see it can work for any l3 protocol.
>>> 
>>> Based on the comments you supplied, I wrote the following test case.  It
>>> can either be folded in by you (or Ilya on apply), or I can submit as a
>>> separate patch (in case you are worried about having my sign-off /
>>> coauthor on this patch).
>>> 
>>> When testing 'make check-system-userspace' before this patch, I see a
>>> failure and get the following tcpdump logged:
>>> 
>>>   12:15:31 aconole@RHTPC1VM0NT {master} ~/git/ovs$ sudo tcpdump -r
>>> tests/system-userspace-testsuite.dir/078/p1.pcap
>>>   reading from file
>>> tests/system-userspace-testsuite.dir/078/p1.pcap, link-type EN10MB
>>> (Ethernet), snapshot length 262144
>>>   dropped privs to tcpdump
>>>   12:07:21.364925 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28
>>>   12:07:21.364928 ARP, Reply 10.2.1.2 is-at e6:45:4a:80:7c:61 (oui Unknown), length 28
>>>   12:07:21.365095 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 40165, seq 1, length 1480
>>>   12:07:21.365099 IP 10.2.1.1 > 10.1.1.2: icmp
>>>   12:07:21.365101 IP 10.2.1.1 > 10.1.1.2: icmp
>>>   12:07:21.365102 IP 10.2.1.1 > 10.1.1.2: icmp
>>> 
>>> We see the first frag correct, but subsequent frags are broken.
>>> 
>>> This test worked both for userspace and kernel datapath on my local
>>> system.
>>
>> Hmm.  This test fails for me for both kernel and userspace:
>
> Okay, I'll try it again on my system.  For reference, I was on F34,
> kernel 5.12.12-300.fc34.x86_64
>
>> tcpdump -r tests/system-userspace-testsuite.dir/078/p0.pcap
>> 15:17:12.832383 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28
>> 15:17:12.834317 ARP, Reply 10.2.1.2 is-at 46:0c:83:aa:6e:b0 (oui Unknown), length 28
>> 15:17:12.834327 IP 10.2.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480
>> 15:17:12.834329 IP 10.2.1.1 > 10.1.1.2: icmp
>> 15:17:12.834330 IP 10.2.1.1 > 10.1.1.2: icmp
>> 15:17:12.834332 IP 10.2.1.1 > 10.1.1.2: icmp
>>
>> tcpdump -r tests/system-userspace-testsuite.dir/078/p1.pcap
>> 15:17:12.833542 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28
>> 15:17:12.834994 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480
>> 15:17:12.834999 IP 10.1.1.1 > 10.1.1.2: icmp
>> 15:17:12.835002 IP 10.1.1.1 > 10.1.1.2: icmp
>> 15:17:12.835004 IP 10.1.1.1 > 10.1.1.2: icmp
>>
>> ping -c 1 10.1.1.2 -M dont -s 4500 | grep "transmitted" | sed 's/time.*ms$/time 0ms/'
>> NS_EXEC_HEREDOC
>> --- -   2021-08-16 15:17:22.844535052 -0400
>> +++ /root/ovs/tests/system-userspace-testsuite.dir/at-groups/78/stdout
>> @@ -1,2 +1,2 @@
>> -1 packets transmitted, 1 received, 0% packet loss, time 0ms
>> +1 packets transmitted, 0 received, 100% packet loss, time 0ms
>>
>> # uname -a
>> Linux rhel8 4.18.0-305.3.1.el8_4.x86_64
>>
>> I'm not sure what is going on.  Could you, please, re-check?
>
> I'll boot an rhel8.4 instance and try it out.
>
>> I will not apply this patch for now until we figure out how to test it.
>
> Okay.
>

I hope this change works.  I altered the test environment to use a
single port so instead of having a dummy attached to the bridge, I now
use lo to be the receiver.

  [ 'client' ] < --- > [ bridge ] < --- > [ 'server' ][ lo ]

Before it was

  [ 'client' ] < --- > [ bridge ] < --- > [ 'server' ]
                            ^ --------- > [  'dummy' ]

And 'dummy' provided the l3 routing path.  Seems that doesn't work in all
environments, so I use 'server' to provide the routing path and 'lo' as
the actual ping endpoing.

I tested this on RHEL8 (with the latest z-stream kernel), and Fedora
34.

---
 tests/system-traffic.at | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)

diff --git a/tests/system-traffic.at b/tests/system-traffic.at
index f400cfabc9..de9108ac20 100644
--- a/tests/system-traffic.at
+++ b/tests/system-traffic.at
@@ -3305,6 +3305,47 @@ NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING
 OVS_TRAFFIC_VSWITCHD_STOP
 AT_CLEANUP
 
+AT_SETUP([conntrack - IPv4 Fragmentation + NAT])
+AT_SKIP_IF([test $HAVE_TCPDUMP = no])
+CHECK_CONNTRACK()
+
+OVS_TRAFFIC_VSWITCHD_START(
+   [set-fail-mode br0 secure -- ])
+
+ADD_NAMESPACES(at_ns0, at_ns1)
+
+ADD_VETH(p0, at_ns0, br0, "10.2.1.1/24")
+ADD_VETH(p1, at_ns1, br0, "10.2.1.2/24")
+
+# create a dummy route for NAT
+NS_CHECK_EXEC([at_ns1], [ip addr add 10.1.1.2/32 dev lo])
+NS_CHECK_EXEC([at_ns0], [ip route add 10.1.1.0/24 via 10.2.1.2])
+NS_CHECK_EXEC([at_ns1], [ip route add 10.1.1.0/24 via 10.2.1.1])
+
+# solely for debugging when things go wrong
+NS_EXEC([at_ns0], [tcpdump -i p0 -w p0.pcap -xx >tcpdump.out &])
+NS_EXEC([at_ns1], [tcpdump -i p1 -w p1.pcap -xx >tcpdump.out &])
+
+AT_DATA([flows.txt], [dnl
+table=0,arp,actions=normal
+table=0,ct_state=-trk,ip,in_port=ovs-p0, actions=ct(table=1, nat)
+table=0,ct_state=-trk,ip,in_port=ovs-p1, actions=ct(table=1, nat)
+table=1,ct_state=+trk+new,ip,in_port=ovs-p0, actions=ct(commit, nat(src=10.1.1.1)),ovs-p1
+table=1,ct_state=+trk+est,ip,in_port=ovs-p0, actions=ovs-p1
+table=1,ct_state=+trk+est,ip,in_port=ovs-p1, actions=ovs-p0
+])
+
+AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
+
+#check connectivity
+NS_CHECK_EXEC([at_ns0], [ping -c 1 10.1.1.2 -M dont -s 4500 | FORMAT_PING], [0], [dnl
+1 packets transmitted, 1 received, 0% packet loss, time 0ms
+])
+
+OVS_TRAFFIC_VSWITCHD_STOP
+AT_CLEANUP
+
+
 AT_SETUP([conntrack - resubmit to ct multiple times])
 CHECK_CONNTRACK()
Ilya Maximets Sept. 15, 2021, 9:04 p.m. UTC | #9
On 8/24/21 19:43, Aaron Conole wrote:
> Aaron Conole <aconole@redhat.com> writes:
> 
>> Ilya Maximets <i.maximets@ovn.org> writes:
>>
>>> On 8/12/21 6:17 PM, Aaron Conole wrote:
>>>> wenxu@ucloud.cn writes:
>>>>
>>>>> From: wenxu <wenxu@ucloud.cn>
>>>>>
>>>>> The ipf collect original fragment packets and reass a new pkt
>>>>> to do the conntrack logic. After finsh the conntrack things
>>>>> copy the ct meta info to each orignal packet and modify the
>>>>> l4 header in the first fragment. It should modify the ip src/
>>>>> dst info for all the fragments.
>>>>>
>>>>> Signed-off-by: wenxu <wenxu@ucloud.cn>
>>>>> Co-authored-by: luke.li <luke.li@ucloud.cn>
>>>>> Signed-off-by: luke.li <luke.li@ucloud.cn>
>>>>> ---
>>>>
>>>> Acked-by: Aaron Conole <aconole@redhat.com>
>>>>
>>>> Thanks for the fix.  I see it can work for any l3 protocol.
>>>>
>>>> Based on the comments you supplied, I wrote the following test case.  It
>>>> can either be folded in by you (or Ilya on apply), or I can submit as a
>>>> separate patch (in case you are worried about having my sign-off /
>>>> coauthor on this patch).
>>>>
>>>> When testing 'make check-system-userspace' before this patch, I see a
>>>> failure and get the following tcpdump logged:
>>>>
>>>>   12:15:31 aconole@RHTPC1VM0NT {master} ~/git/ovs$ sudo tcpdump -r
>>>> tests/system-userspace-testsuite.dir/078/p1.pcap
>>>>   reading from file
>>>> tests/system-userspace-testsuite.dir/078/p1.pcap, link-type EN10MB
>>>> (Ethernet), snapshot length 262144
>>>>   dropped privs to tcpdump
>>>>   12:07:21.364925 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28
>>>>   12:07:21.364928 ARP, Reply 10.2.1.2 is-at e6:45:4a:80:7c:61 (oui Unknown), length 28
>>>>   12:07:21.365095 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 40165, seq 1, length 1480
>>>>   12:07:21.365099 IP 10.2.1.1 > 10.1.1.2: icmp
>>>>   12:07:21.365101 IP 10.2.1.1 > 10.1.1.2: icmp
>>>>   12:07:21.365102 IP 10.2.1.1 > 10.1.1.2: icmp
>>>>
>>>> We see the first frag correct, but subsequent frags are broken.
>>>>
>>>> This test worked both for userspace and kernel datapath on my local
>>>> system.
>>>
>>> Hmm.  This test fails for me for both kernel and userspace:
>>
>> Okay, I'll try it again on my system.  For reference, I was on F34,
>> kernel 5.12.12-300.fc34.x86_64
>>
>>> tcpdump -r tests/system-userspace-testsuite.dir/078/p0.pcap
>>> 15:17:12.832383 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28
>>> 15:17:12.834317 ARP, Reply 10.2.1.2 is-at 46:0c:83:aa:6e:b0 (oui Unknown), length 28
>>> 15:17:12.834327 IP 10.2.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480
>>> 15:17:12.834329 IP 10.2.1.1 > 10.1.1.2: icmp
>>> 15:17:12.834330 IP 10.2.1.1 > 10.1.1.2: icmp
>>> 15:17:12.834332 IP 10.2.1.1 > 10.1.1.2: icmp
>>>
>>> tcpdump -r tests/system-userspace-testsuite.dir/078/p1.pcap
>>> 15:17:12.833542 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28
>>> 15:17:12.834994 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480
>>> 15:17:12.834999 IP 10.1.1.1 > 10.1.1.2: icmp
>>> 15:17:12.835002 IP 10.1.1.1 > 10.1.1.2: icmp
>>> 15:17:12.835004 IP 10.1.1.1 > 10.1.1.2: icmp
>>>
>>> ping -c 1 10.1.1.2 -M dont -s 4500 | grep "transmitted" | sed 's/time.*ms$/time 0ms/'
>>> NS_EXEC_HEREDOC
>>> --- -   2021-08-16 15:17:22.844535052 -0400
>>> +++ /root/ovs/tests/system-userspace-testsuite.dir/at-groups/78/stdout
>>> @@ -1,2 +1,2 @@
>>> -1 packets transmitted, 1 received, 0% packet loss, time 0ms
>>> +1 packets transmitted, 0 received, 100% packet loss, time 0ms
>>>
>>> # uname -a
>>> Linux rhel8 4.18.0-305.3.1.el8_4.x86_64
>>>
>>> I'm not sure what is going on.  Could you, please, re-check?
>>
>> I'll boot an rhel8.4 instance and try it out.
>>
>>> I will not apply this patch for now until we figure out how to test it.
>>
>> Okay.
>>
> 
> I hope this change works.  I altered the test environment to use a
> single port so instead of having a dummy attached to the bridge, I now
> use lo to be the receiver.

This one worked for me.  Thanks!

With the test, applied to master and backported down to 2.13.

Best regards, Ilya Maximets.
diff mbox series

Patch

diff --git a/lib/ipf.c b/lib/ipf.c
index 9c83f19..795e801 100644
--- a/lib/ipf.c
+++ b/lib/ipf.c
@@ -1149,52 +1149,56 @@  ipf_post_execute_reass_pkts(struct ipf *ipf,
          * NETDEV_MAX_BURST. */
         DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) {
             if (rp && pkt == rp->list->reass_execute_ctx) {
+                const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
+                void *l4_frag = dp_packet_l4(frag_0->pkt);
+                void *l4_reass = dp_packet_l4(pkt);
+                memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt));
+
                 for (int i = 0; i <= rp->list->last_inuse_idx; i++) {
-                    rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label;
-                    rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark;
-                    rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state;
-                    rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone;
-                    rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 =
-                        pkt->md.ct_orig_tuple_ipv6;
+                    const struct ipf_frag *frag_i = &rp->list->frag_list[i];
+
+                    frag_i->pkt->md.ct_label = pkt->md.ct_label;
+                    frag_i->pkt->md.ct_mark = pkt->md.ct_mark;
+                    frag_i->pkt->md.ct_state = pkt->md.ct_state;
+                    frag_i->pkt->md.ct_zone = pkt->md.ct_zone;
+                    frag_i->pkt->md.ct_orig_tuple_ipv6 =
+                        pkt->md.ct_orig_tuple_ipv6;
                     if (pkt->md.ct_orig_tuple_ipv6) {
-                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 =
+                        frag_i->pkt->md.ct_orig_tuple.ipv6 =
                             pkt->md.ct_orig_tuple.ipv6;
                     } else {
-                        rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4  =
+                        frag_i->pkt->md.ct_orig_tuple.ipv4 =
                             pkt->md.ct_orig_tuple.ipv4;
                     }
-                }
-
-                const struct ipf_frag *frag_0 = &rp->list->frag_list[0];
-                void *l4_frag = dp_packet_l4(frag_0->pkt);
-                void *l4_reass = dp_packet_l4(pkt);
-                memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt));
-
-                if (v6) {
-                    struct ovs_16aligned_ip6_hdr *l3_frag
-                        = dp_packet_l3(frag_0->pkt);
-                    struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt);
-                    l3_frag->ip6_src = l3_reass->ip6_src;
-                    l3_frag->ip6_dst = l3_reass->ip6_dst;
-                } else {
-                    struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt);
-                    struct ip_header *l3_reass = dp_packet_l3(pkt);
-                    if (!dp_packet_hwol_is_ipv4(frag_0->pkt)) {
-                        ovs_be32 reass_ip =
-                            get_16aligned_be32(&l3_reass->ip_src);
-                        ovs_be32 frag_ip =
-                            get_16aligned_be32(&l3_frag->ip_src);
-
-                        l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
-                                                         frag_ip, reass_ip);
-                        reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
-                        frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
-                        l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
-                                                         frag_ip, reass_ip);
+                    if (v6) {
+                        struct ovs_16aligned_ip6_hdr *l3_frag
+                            = dp_packet_l3(frag_i->pkt);
+                        struct ovs_16aligned_ip6_hdr *l3_reass
+                            = dp_packet_l3(pkt);
+                        l3_frag->ip6_src = l3_reass->ip6_src;
+                        l3_frag->ip6_dst = l3_reass->ip6_dst;
+                    } else {
+                        struct ip_header *l3_frag = dp_packet_l3(frag_i->pkt);
+                        struct ip_header *l3_reass = dp_packet_l3(pkt);
+                        if (!dp_packet_hwol_is_ipv4(frag_i->pkt)) {
+                            ovs_be32 reass_ip =
+                                get_16aligned_be32(&l3_reass->ip_src);
+                            ovs_be32 frag_ip =
+                                get_16aligned_be32(&l3_frag->ip_src);
+
+                            l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
+                                                             frag_ip,
+                                                             reass_ip);
+                            reass_ip = get_16aligned_be32(&l3_reass->ip_dst);
+                            frag_ip = get_16aligned_be32(&l3_frag->ip_dst);
+                            l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum,
+                                                             frag_ip,
+                                                             reass_ip);
+                        }
+
+                        l3_frag->ip_src = l3_reass->ip_src;
+                        l3_frag->ip_dst = l3_reass->ip_dst;
                     }
-
-                    l3_frag->ip_src = l3_reass->ip_src;
-                    l3_frag->ip_dst = l3_reass->ip_dst;
                 }
 
                 ipf_completed_list_add(&ipf->frag_complete_list, rp->list);