Message ID | 1623998750-9037-1-git-send-email-wenxu@ucloud.cn |
---|---|
State | Accepted |
Headers | show |
Series | [ovs-dev,v2] ipf: fix only nat the first fragment in the reass process | expand |
wenxu@ucloud.cn writes: > From: wenxu <wenxu@ucloud.cn> > > The ipf collect original fragment packets and reass a new pkt > to do the conntrack logic. After finsh the conntrack things > copy the ct meta info to each orignal packet and modify the > l4 header in the first fragment. It should modify the ip src/ > dst info for all the fragments. > > Signed-off-by: wenxu <wenxu@ucloud.cn> > Co-authored-by: luke.li <luke.li@ucloud.cn> > Signed-off-by: luke.li <luke.li@ucloud.cn> > --- This looks okay. Please post a set of sample packets that showcase the issue. I will write a test case that can demonstrate it that we can check in with this fix. Alternately, please add such a test case. > lib/ipf.c | 82 +++++++++++++++++++++++++++++++++------------------------------ > 1 file changed, 43 insertions(+), 39 deletions(-) > > diff --git a/lib/ipf.c b/lib/ipf.c > index 9c83f19..795e801 100644 > --- a/lib/ipf.c > +++ b/lib/ipf.c > @@ -1149,52 +1149,56 @@ ipf_post_execute_reass_pkts(struct ipf *ipf, > * NETDEV_MAX_BURST. */ > DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) { > if (rp && pkt == rp->list->reass_execute_ctx) { > + const struct ipf_frag *frag_0 = &rp->list->frag_list[0]; > + void *l4_frag = dp_packet_l4(frag_0->pkt); > + void *l4_reass = dp_packet_l4(pkt); > + memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt)); > + > for (int i = 0; i <= rp->list->last_inuse_idx; i++) { > - rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label; > - rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark; > - rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state; > - rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone; > - rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 = > - pkt->md.ct_orig_tuple_ipv6; > + const struct ipf_frag *frag_i = &rp->list->frag_list[i]; > + > + frag_i->pkt->md.ct_label = pkt->md.ct_label; > + frag_i->pkt->md.ct_mark = pkt->md.ct_mark; > + frag_i->pkt->md.ct_state = pkt->md.ct_state; > + frag_i->pkt->md.ct_zone = pkt->md.ct_zone; > + frag_i->pkt->md.ct_orig_tuple_ipv6 = > + pkt->md.ct_orig_tuple_ipv6; > if (pkt->md.ct_orig_tuple_ipv6) { > - rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 = > + frag_i->pkt->md.ct_orig_tuple.ipv6 = > pkt->md.ct_orig_tuple.ipv6; > } else { > - rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4 = > + frag_i->pkt->md.ct_orig_tuple.ipv4 = > pkt->md.ct_orig_tuple.ipv4; > } > - } > - > - const struct ipf_frag *frag_0 = &rp->list->frag_list[0]; > - void *l4_frag = dp_packet_l4(frag_0->pkt); > - void *l4_reass = dp_packet_l4(pkt); > - memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt)); > - > - if (v6) { > - struct ovs_16aligned_ip6_hdr *l3_frag > - = dp_packet_l3(frag_0->pkt); > - struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt); > - l3_frag->ip6_src = l3_reass->ip6_src; > - l3_frag->ip6_dst = l3_reass->ip6_dst; > - } else { > - struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt); > - struct ip_header *l3_reass = dp_packet_l3(pkt); > - if (!dp_packet_hwol_is_ipv4(frag_0->pkt)) { > - ovs_be32 reass_ip = > - get_16aligned_be32(&l3_reass->ip_src); > - ovs_be32 frag_ip = > - get_16aligned_be32(&l3_frag->ip_src); > - > - l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, > - frag_ip, reass_ip); > - reass_ip = get_16aligned_be32(&l3_reass->ip_dst); > - frag_ip = get_16aligned_be32(&l3_frag->ip_dst); > - l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, > - frag_ip, reass_ip); > + if (v6) { > + struct ovs_16aligned_ip6_hdr *l3_frag > + = dp_packet_l3(frag_i->pkt); > + struct ovs_16aligned_ip6_hdr *l3_reass > + = dp_packet_l3(pkt); > + l3_frag->ip6_src = l3_reass->ip6_src; > + l3_frag->ip6_dst = l3_reass->ip6_dst; > + } else { > + struct ip_header *l3_frag = dp_packet_l3(frag_i->pkt); > + struct ip_header *l3_reass = dp_packet_l3(pkt); > + if (!dp_packet_hwol_is_ipv4(frag_i->pkt)) { > + ovs_be32 reass_ip = > + get_16aligned_be32(&l3_reass->ip_src); > + ovs_be32 frag_ip = > + get_16aligned_be32(&l3_frag->ip_src); > + > + l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, > + frag_ip, > + reass_ip); > + reass_ip = get_16aligned_be32(&l3_reass->ip_dst); > + frag_ip = get_16aligned_be32(&l3_frag->ip_dst); > + l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, > + frag_ip, > + reass_ip); > + } > + > + l3_frag->ip_src = l3_reass->ip_src; > + l3_frag->ip_dst = l3_reass->ip_dst; > } > - > - l3_frag->ip_src = l3_reass->ip_src; > - l3_frag->ip_dst = l3_reass->ip_dst; > } > > ipf_completed_list_add(&ipf->frag_complete_list, rp->list);
My case : ovs-ofctl add-flow manbr "table=0,ct_state=-trk,ip,in_port=dpdk2, actions=ct(table=1, nat)" ovs-ofctl add-flow manbr "table=0,ct_state=-trk,ip,in_port=dpdk3, actions=ct(table=1, nat)" ovs-ofctl add-flow manbr "table=0,table=1,ct_state=+trk+new,ip,in_port=dpdk2, actions=ct(commit, nat(src=1.1.1.2)),dpdk3" ovs-ofctl add-flow manbr "table=0,table=1,ct_state=+trk+est,ip,in_port=dpdk2, actions=dpdk3" ovs-ofctl add-flow manbr "table=0,table=1,ct_state=+trk+est,ip,in_port=dpdk3, actions=dpdk2" ovs-ofctl add-flow manbr "table=0,arp,actions=normal" With the frag-packet in udp-frag.pcap. And the wrong nated packet. nat-packet.pcap send the packet: iperf -u -c 1.1.1.3 -t 120 -i 2 -b 1 -l 1800 From: Aaron Conole <aconole@redhat.com> Date: 2021-07-09 04:11:12 To: wenxu@ucloud.cn Cc: i.maximets@ovn.org,dev@openvswitch.org Subject: Re: [ovs-dev] [PATCH v2] ipf: fix only nat the first fragment in the reass process>wenxu@ucloud.cn writes: > >> From: wenxu <wenxu@ucloud.cn> >> >> The ipf collect original fragment packets and reass a new pkt >> to do the conntrack logic. After finsh the conntrack things >> copy the ct meta info to each orignal packet and modify the >> l4 header in the first fragment. It should modify the ip src/ >> dst info for all the fragments. >> >> Signed-off-by: wenxu <wenxu@ucloud.cn> >> Co-authored-by: luke.li <luke.li@ucloud.cn> >> Signed-off-by: luke.li <luke.li@ucloud.cn> >> --- > >This looks okay. Please post a set of sample packets that showcase the >issue. I will write a test case that can demonstrate it that we can >check in with this fix. >Alternately, please add such a test case. > >> lib/ipf.c | 82 +++++++++++++++++++++++++++++++++------------------------------ >> 1 file changed, 43 insertions(+), 39 deletions(-) >> >> diff --git a/lib/ipf.c b/lib/ipf.c >> index 9c83f19..795e801 100644 >> --- a/lib/ipf.c >> +++ b/lib/ipf.c >> @@ -1149,52 +1149,56 @@ ipf_post_execute_reass_pkts(struct ipf *ipf, >> * NETDEV_MAX_BURST. */ >> DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) { >> if (rp && pkt == rp->list->reass_execute_ctx) { >> + const struct ipf_frag *frag_0 = &rp->list->frag_list[0]; >> + void *l4_frag = dp_packet_l4(frag_0->pkt); >> + void *l4_reass = dp_packet_l4(pkt); >> + memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt)); >> + >> for (int i = 0; i <= rp->list->last_inuse_idx; i++) { >> - rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label; >> - rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark; >> - rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state; >> - rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone; >> - rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 = >> - pkt->md.ct_orig_tuple_ipv6; >> + const struct ipf_frag *frag_i = &rp->list->frag_list[i]; >> + >> + frag_i->pkt->md.ct_label = pkt->md.ct_label; >> + frag_i->pkt->md.ct_mark = pkt->md.ct_mark; >> + frag_i->pkt->md.ct_state = pkt->md.ct_state; >> + frag_i->pkt->md.ct_zone = pkt->md.ct_zone; >> + frag_i->pkt->md.ct_orig_tuple_ipv6 = >> + pkt->md.ct_orig_tuple_ipv6; >> if (pkt->md.ct_orig_tuple_ipv6) { >> - rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 = >> + frag_i->pkt->md.ct_orig_tuple.ipv6 = >> pkt->md.ct_orig_tuple.ipv6; >> } else { >> - rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4 = >> + frag_i->pkt->md.ct_orig_tuple.ipv4 = >> pkt->md.ct_orig_tuple.ipv4; >> } >> - } >> - >> - const struct ipf_frag *frag_0 = &rp->list->frag_list[0]; >> - void *l4_frag = dp_packet_l4(frag_0->pkt); >> - void *l4_reass = dp_packet_l4(pkt); >> - memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt)); >> - >> - if (v6) { >> - struct ovs_16aligned_ip6_hdr *l3_frag >> - = dp_packet_l3(frag_0->pkt); >> - struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt); >> - l3_frag->ip6_src = l3_reass->ip6_src; >> - l3_frag->ip6_dst = l3_reass->ip6_dst; >> - } else { >> - struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt); >> - struct ip_header *l3_reass = dp_packet_l3(pkt); >> - if (!dp_packet_hwol_is_ipv4(frag_0->pkt)) { >> - ovs_be32 reass_ip = >> - get_16aligned_be32(&l3_reass->ip_src); >> - ovs_be32 frag_ip = >> - get_16aligned_be32(&l3_frag->ip_src); >> - >> - l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, >> - frag_ip, reass_ip); >> - reass_ip = get_16aligned_be32(&l3_reass->ip_dst); >> - frag_ip = get_16aligned_be32(&l3_frag->ip_dst); >> - l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, >> - frag_ip, reass_ip); >> + if (v6) { >> + struct ovs_16aligned_ip6_hdr *l3_frag >> + = dp_packet_l3(frag_i->pkt); >> + struct ovs_16aligned_ip6_hdr *l3_reass >> + = dp_packet_l3(pkt); >> + l3_frag->ip6_src = l3_reass->ip6_src; >> + l3_frag->ip6_dst = l3_reass->ip6_dst; >> + } else { >> + struct ip_header *l3_frag = dp_packet_l3(frag_i->pkt); >> + struct ip_header *l3_reass = dp_packet_l3(pkt); >> + if (!dp_packet_hwol_is_ipv4(frag_i->pkt)) { >> + ovs_be32 reass_ip = >> + get_16aligned_be32(&l3_reass->ip_src); >> + ovs_be32 frag_ip = >> + get_16aligned_be32(&l3_frag->ip_src); >> + >> + l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, >> + frag_ip, >> + reass_ip); >> + reass_ip = get_16aligned_be32(&l3_reass->ip_dst); >> + frag_ip = get_16aligned_be32(&l3_frag->ip_dst); >> + l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, >> + frag_ip, >> + reass_ip); >> + } >> + >> + l3_frag->ip_src = l3_reass->ip_src; >> + l3_frag->ip_dst = l3_reass->ip_dst; >> } >> - >> - l3_frag->ip_src = l3_reass->ip_src; >> - l3_frag->ip_dst = l3_reass->ip_dst; >> } >> >> ipf_completed_list_add(&ipf->frag_complete_list, rp->list); >
We had the same problem, when I send a series of fragments, only the first piece of the packet is NATed. We reviewed and tested the patch, it do solve the problem, and we think there are not any other bugs in it. Reviewed-by: wangze <wangze712@gmail.com> wenxu <wenxu@ucloud.cn> 于2021年8月6日周五 下午4:38写道: > > > My case : > > > ovs-ofctl add-flow manbr "table=0,ct_state=-trk,ip,in_port=dpdk2, actions=ct(table=1, nat)" > ovs-ofctl add-flow manbr "table=0,ct_state=-trk,ip,in_port=dpdk3, actions=ct(table=1, nat)" > ovs-ofctl add-flow manbr "table=0,table=1,ct_state=+trk+new,ip,in_port=dpdk2, actions=ct(commit, nat(src=1.1.1.2)),dpdk3" > ovs-ofctl add-flow manbr "table=0,table=1,ct_state=+trk+est,ip,in_port=dpdk2, actions=dpdk3" > ovs-ofctl add-flow manbr "table=0,table=1,ct_state=+trk+est,ip,in_port=dpdk3, actions=dpdk2" > ovs-ofctl add-flow manbr "table=0,arp,actions=normal" > > > > With the frag-packet in udp-frag.pcap. And the wrong nated packet. nat-packet.pcap > > > send the packet: > iperf -u -c 1.1.1.3 -t 120 -i 2 -b 1 -l 1800 > > > > > From: Aaron Conole <aconole@redhat.com> > Date: 2021-07-09 04:11:12 > To: wenxu@ucloud.cn > Cc: i.maximets@ovn.org,dev@openvswitch.org > Subject: Re: [ovs-dev] [PATCH v2] ipf: fix only nat the first fragment in the reass process>wenxu@ucloud.cn writes: > > > >> From: wenxu <wenxu@ucloud.cn> > >> > >> The ipf collect original fragment packets and reass a new pkt > >> to do the conntrack logic. After finsh the conntrack things > >> copy the ct meta info to each orignal packet and modify the > >> l4 header in the first fragment. It should modify the ip src/ > >> dst info for all the fragments. > >> > >> Signed-off-by: wenxu <wenxu@ucloud.cn> > >> Co-authored-by: luke.li <luke.li@ucloud.cn> > >> Signed-off-by: luke.li <luke.li@ucloud.cn> > >> --- > > > >This looks okay. Please post a set of sample packets that showcase the > >issue. I will write a test case that can demonstrate it that we can > >check in with this fix. > >Alternately, please add such a test case. > > > >> lib/ipf.c | 82 +++++++++++++++++++++++++++++++++------------------------------ > >> 1 file changed, 43 insertions(+), 39 deletions(-) > >> > >> diff --git a/lib/ipf.c b/lib/ipf.c > >> index 9c83f19..795e801 100644 > >> --- a/lib/ipf.c > >> +++ b/lib/ipf.c > >> @@ -1149,52 +1149,56 @@ ipf_post_execute_reass_pkts(struct ipf *ipf, > >> * NETDEV_MAX_BURST. */ > >> DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) { > >> if (rp && pkt == rp->list->reass_execute_ctx) { > >> + const struct ipf_frag *frag_0 = &rp->list->frag_list[0]; > >> + void *l4_frag = dp_packet_l4(frag_0->pkt); > >> + void *l4_reass = dp_packet_l4(pkt); > >> + memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt)); > >> + > >> for (int i = 0; i <= rp->list->last_inuse_idx; i++) { > >> - rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label; > >> - rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark; > >> - rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state; > >> - rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone; > >> - rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 = > >> - pkt->md.ct_orig_tuple_ipv6; > >> + const struct ipf_frag *frag_i = &rp->list->frag_list[i]; > >> + > >> + frag_i->pkt->md.ct_label = pkt->md.ct_label; > >> + frag_i->pkt->md.ct_mark = pkt->md.ct_mark; > >> + frag_i->pkt->md.ct_state = pkt->md.ct_state; > >> + frag_i->pkt->md.ct_zone = pkt->md.ct_zone; > >> + frag_i->pkt->md.ct_orig_tuple_ipv6 = > >> + pkt->md.ct_orig_tuple_ipv6; > >> if (pkt->md.ct_orig_tuple_ipv6) { > >> - rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 = > >> + frag_i->pkt->md.ct_orig_tuple.ipv6 = > >> pkt->md.ct_orig_tuple.ipv6; > >> } else { > >> - rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4 = > >> + frag_i->pkt->md.ct_orig_tuple.ipv4 = > >> pkt->md.ct_orig_tuple.ipv4; > >> } > >> - } > >> - > >> - const struct ipf_frag *frag_0 = &rp->list->frag_list[0]; > >> - void *l4_frag = dp_packet_l4(frag_0->pkt); > >> - void *l4_reass = dp_packet_l4(pkt); > >> - memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt)); > >> - > >> - if (v6) { > >> - struct ovs_16aligned_ip6_hdr *l3_frag > >> - = dp_packet_l3(frag_0->pkt); > >> - struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt); > >> - l3_frag->ip6_src = l3_reass->ip6_src; > >> - l3_frag->ip6_dst = l3_reass->ip6_dst; > >> - } else { > >> - struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt); > >> - struct ip_header *l3_reass = dp_packet_l3(pkt); > >> - if (!dp_packet_hwol_is_ipv4(frag_0->pkt)) { > >> - ovs_be32 reass_ip = > >> - get_16aligned_be32(&l3_reass->ip_src); > >> - ovs_be32 frag_ip = > >> - get_16aligned_be32(&l3_frag->ip_src); > >> - > >> - l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, > >> - frag_ip, reass_ip); > >> - reass_ip = get_16aligned_be32(&l3_reass->ip_dst); > >> - frag_ip = get_16aligned_be32(&l3_frag->ip_dst); > >> - l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, > >> - frag_ip, reass_ip); > >> + if (v6) { > >> + struct ovs_16aligned_ip6_hdr *l3_frag > >> + = dp_packet_l3(frag_i->pkt); > >> + struct ovs_16aligned_ip6_hdr *l3_reass > >> + = dp_packet_l3(pkt); > >> + l3_frag->ip6_src = l3_reass->ip6_src; > >> + l3_frag->ip6_dst = l3_reass->ip6_dst; > >> + } else { > >> + struct ip_header *l3_frag = dp_packet_l3(frag_i->pkt); > >> + struct ip_header *l3_reass = dp_packet_l3(pkt); > >> + if (!dp_packet_hwol_is_ipv4(frag_i->pkt)) { > >> + ovs_be32 reass_ip = > >> + get_16aligned_be32(&l3_reass->ip_src); > >> + ovs_be32 frag_ip = > >> + get_16aligned_be32(&l3_frag->ip_src); > >> + > >> + l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, > >> + frag_ip, > >> + reass_ip); > >> + reass_ip = get_16aligned_be32(&l3_reass->ip_dst); > >> + frag_ip = get_16aligned_be32(&l3_frag->ip_dst); > >> + l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, > >> + frag_ip, > >> + reass_ip); > >> + } > >> + > >> + l3_frag->ip_src = l3_reass->ip_src; > >> + l3_frag->ip_dst = l3_reass->ip_dst; > >> } > >> - > >> - l3_frag->ip_src = l3_reass->ip_src; > >> - l3_frag->ip_dst = l3_reass->ip_dst; > >> } > >> > >> ipf_completed_list_add(&ipf->frag_complete_list, rp->list); > > > > > > > _______________________________________________ > dev mailing list > dev@openvswitch.org > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
wenxu@ucloud.cn writes: > From: wenxu <wenxu@ucloud.cn> > > The ipf collect original fragment packets and reass a new pkt > to do the conntrack logic. After finsh the conntrack things > copy the ct meta info to each orignal packet and modify the > l4 header in the first fragment. It should modify the ip src/ > dst info for all the fragments. > > Signed-off-by: wenxu <wenxu@ucloud.cn> > Co-authored-by: luke.li <luke.li@ucloud.cn> > Signed-off-by: luke.li <luke.li@ucloud.cn> > --- Acked-by: Aaron Conole <aconole@redhat.com> Thanks for the fix. I see it can work for any l3 protocol. Based on the comments you supplied, I wrote the following test case. It can either be folded in by you (or Ilya on apply), or I can submit as a separate patch (in case you are worried about having my sign-off / coauthor on this patch). When testing 'make check-system-userspace' before this patch, I see a failure and get the following tcpdump logged: 12:15:31 aconole@RHTPC1VM0NT {master} ~/git/ovs$ sudo tcpdump -r tests/system-userspace-testsuite.dir/078/p1.pcap reading from file tests/system-userspace-testsuite.dir/078/p1.pcap, link-type EN10MB (Ethernet), snapshot length 262144 dropped privs to tcpdump 12:07:21.364925 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28 12:07:21.364928 ARP, Reply 10.2.1.2 is-at e6:45:4a:80:7c:61 (oui Unknown), length 28 12:07:21.365095 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 40165, seq 1, length 1480 12:07:21.365099 IP 10.2.1.1 > 10.1.1.2: icmp 12:07:21.365101 IP 10.2.1.1 > 10.1.1.2: icmp 12:07:21.365102 IP 10.2.1.1 > 10.1.1.2: icmp We see the first frag correct, but subsequent frags are broken. This test worked both for userspace and kernel datapath on my local system. --- tests/system-traffic.at | 46 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index f400cfabc9..feb335c783 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -3305,6 +3305,52 @@ NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conntrack - IPv4 Fragmentation + nat]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +CHECK_CONNTRACK() + +OVS_TRAFFIC_VSWITCHD_START( + [set-fail-mode br0 secure -- ]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.2.1.1/24") +NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address 80:88:88:88:88:88]) +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +# create a dummy route for NAT +ADD_VETH(p2, at_ns1, br0, "10.2.1.2/24") +NS_CHECK_EXEC([at_ns0], [ip route add 10.1.1.0/24 via 10.2.1.2]) +NS_CHECK_EXEC([at_ns1], [ip neigh add 10.1.1.1 nud permanent lladdr 80:88:88:88:88:88 dev p1]) + +# disable iptables from getting in the way +NS_EXEC([at_ns1], [iptables --flush]) + +# solely for debugging when things go wrong +NS_EXEC([at_ns0], [tcpdump -i p0 -w p0.pcap -xx >tcpdump.out &]) +NS_EXEC([at_ns1], [tcpdump -i p1 -w p1.pcap -xx >tcpdump.out &]) +NS_EXEC([at_ns1], [tcpdump -i p2 -w p2.pcap -xx >tcpdump2.out &]) + +AT_DATA([flows.txt], [dnl +table=0,arp,actions=normal +table=0,ct_state=-trk,ip,in_port=ovs-p0, actions=ct(table=1, nat) +table=0,ct_state=-trk,ip,in_port=ovs-p1, actions=ct(table=1, nat) +table=1,ct_state=+trk+new,ip,in_port=ovs-p0, actions=ct(commit, nat(src=10.1.1.1)),ovs-p1 +table=1,ct_state=+trk+est,ip,in_port=ovs-p0, actions=ovs-p1 +table=1,ct_state=+trk+est,ip,in_port=ovs-p1, actions=ovs-p0 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +#check connectivity +NS_CHECK_EXEC([at_ns0], [ping -c 1 10.1.1.2 -M dont -s 4500 | FORMAT_PING], [0], [dnl +1 packets transmitted, 1 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + + AT_SETUP([conntrack - resubmit to ct multiple times]) CHECK_CONNTRACK()
Hi, thanks for working on this wenxu@ucloud.cn writes: > From: wenxu <wenxu@ucloud.cn> > > The ipf collect original fragment packets and reass a new pkt > to do the conntrack logic. After finsh the conntrack things > copy the ct meta info to each orignal packet and modify the > l4 header in the first fragment. It should modify the ip src/ > dst info for all the fragments. > > Signed-off-by: wenxu <wenxu@ucloud.cn> > Co-authored-by: luke.li <luke.li@ucloud.cn> > Signed-off-by: luke.li <luke.li@ucloud.cn> > --- The patch LGTM, and works as expected. Any chance to add a Fixes tag? I think it should be: 4ea96698f667 ("Userspace datapath: Add fragmentation handling.") besides that: Acked-by: Paolo Valerio <pvalerio@redhat.com>
On 8/12/21 6:17 PM, Aaron Conole wrote: > wenxu@ucloud.cn writes: > >> From: wenxu <wenxu@ucloud.cn> >> >> The ipf collect original fragment packets and reass a new pkt >> to do the conntrack logic. After finsh the conntrack things >> copy the ct meta info to each orignal packet and modify the >> l4 header in the first fragment. It should modify the ip src/ >> dst info for all the fragments. >> >> Signed-off-by: wenxu <wenxu@ucloud.cn> >> Co-authored-by: luke.li <luke.li@ucloud.cn> >> Signed-off-by: luke.li <luke.li@ucloud.cn> >> --- > > Acked-by: Aaron Conole <aconole@redhat.com> > > Thanks for the fix. I see it can work for any l3 protocol. > > Based on the comments you supplied, I wrote the following test case. It > can either be folded in by you (or Ilya on apply), or I can submit as a > separate patch (in case you are worried about having my sign-off / > coauthor on this patch). > > When testing 'make check-system-userspace' before this patch, I see a > failure and get the following tcpdump logged: > > 12:15:31 aconole@RHTPC1VM0NT {master} ~/git/ovs$ sudo tcpdump -r tests/system-userspace-testsuite.dir/078/p1.pcap > reading from file tests/system-userspace-testsuite.dir/078/p1.pcap, link-type EN10MB (Ethernet), snapshot length 262144 > dropped privs to tcpdump > 12:07:21.364925 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28 > 12:07:21.364928 ARP, Reply 10.2.1.2 is-at e6:45:4a:80:7c:61 (oui Unknown), length 28 > 12:07:21.365095 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 40165, seq 1, length 1480 > 12:07:21.365099 IP 10.2.1.1 > 10.1.1.2: icmp > 12:07:21.365101 IP 10.2.1.1 > 10.1.1.2: icmp > 12:07:21.365102 IP 10.2.1.1 > 10.1.1.2: icmp > > We see the first frag correct, but subsequent frags are broken. > > This test worked both for userspace and kernel datapath on my local > system. Hmm. This test fails for me for both kernel and userspace: tcpdump -r tests/system-userspace-testsuite.dir/078/p0.pcap 15:17:12.832383 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28 15:17:12.834317 ARP, Reply 10.2.1.2 is-at 46:0c:83:aa:6e:b0 (oui Unknown), length 28 15:17:12.834327 IP 10.2.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480 15:17:12.834329 IP 10.2.1.1 > 10.1.1.2: icmp 15:17:12.834330 IP 10.2.1.1 > 10.1.1.2: icmp 15:17:12.834332 IP 10.2.1.1 > 10.1.1.2: icmp tcpdump -r tests/system-userspace-testsuite.dir/078/p1.pcap 15:17:12.833542 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28 15:17:12.834994 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480 15:17:12.834999 IP 10.1.1.1 > 10.1.1.2: icmp 15:17:12.835002 IP 10.1.1.1 > 10.1.1.2: icmp 15:17:12.835004 IP 10.1.1.1 > 10.1.1.2: icmp ping -c 1 10.1.1.2 -M dont -s 4500 | grep "transmitted" | sed 's/time.*ms$/time 0ms/' NS_EXEC_HEREDOC --- - 2021-08-16 15:17:22.844535052 -0400 +++ /root/ovs/tests/system-userspace-testsuite.dir/at-groups/78/stdout @@ -1,2 +1,2 @@ -1 packets transmitted, 1 received, 0% packet loss, time 0ms +1 packets transmitted, 0 received, 100% packet loss, time 0ms # uname -a Linux rhel8 4.18.0-305.3.1.el8_4.x86_64 I'm not sure what is going on. Could you, please, re-check? I will not apply this patch for now until we figure out how to test it. Best regards, Ilya Maximets. > > --- > tests/system-traffic.at | 46 +++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 46 insertions(+) > > diff --git a/tests/system-traffic.at b/tests/system-traffic.at > index f400cfabc9..feb335c783 100644 > --- a/tests/system-traffic.at > +++ b/tests/system-traffic.at > @@ -3305,6 +3305,52 @@ NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING > OVS_TRAFFIC_VSWITCHD_STOP > AT_CLEANUP > > +AT_SETUP([conntrack - IPv4 Fragmentation + nat]) > +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) > +CHECK_CONNTRACK() > + > +OVS_TRAFFIC_VSWITCHD_START( > + [set-fail-mode br0 secure -- ]) > + > +ADD_NAMESPACES(at_ns0, at_ns1) > + > +ADD_VETH(p0, at_ns0, br0, "10.2.1.1/24") > +NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address 80:88:88:88:88:88]) > +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") > + > +# create a dummy route for NAT > +ADD_VETH(p2, at_ns1, br0, "10.2.1.2/24") > +NS_CHECK_EXEC([at_ns0], [ip route add 10.1.1.0/24 via 10.2.1.2]) > +NS_CHECK_EXEC([at_ns1], [ip neigh add 10.1.1.1 nud permanent lladdr 80:88:88:88:88:88 dev p1]) > + > +# disable iptables from getting in the way > +NS_EXEC([at_ns1], [iptables --flush]) > + > +# solely for debugging when things go wrong > +NS_EXEC([at_ns0], [tcpdump -i p0 -w p0.pcap -xx >tcpdump.out &]) > +NS_EXEC([at_ns1], [tcpdump -i p1 -w p1.pcap -xx >tcpdump.out &]) > +NS_EXEC([at_ns1], [tcpdump -i p2 -w p2.pcap -xx >tcpdump2.out &]) > + > +AT_DATA([flows.txt], [dnl > +table=0,arp,actions=normal > +table=0,ct_state=-trk,ip,in_port=ovs-p0, actions=ct(table=1, nat) > +table=0,ct_state=-trk,ip,in_port=ovs-p1, actions=ct(table=1, nat) > +table=1,ct_state=+trk+new,ip,in_port=ovs-p0, actions=ct(commit, nat(src=10.1.1.1)),ovs-p1 > +table=1,ct_state=+trk+est,ip,in_port=ovs-p0, actions=ovs-p1 > +table=1,ct_state=+trk+est,ip,in_port=ovs-p1, actions=ovs-p0 > +]) > + > +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) > + > +#check connectivity > +NS_CHECK_EXEC([at_ns0], [ping -c 1 10.1.1.2 -M dont -s 4500 | FORMAT_PING], [0], [dnl > +1 packets transmitted, 1 received, 0% packet loss, time 0ms > +]) > + > +OVS_TRAFFIC_VSWITCHD_STOP > +AT_CLEANUP > + > + > AT_SETUP([conntrack - resubmit to ct multiple times]) > CHECK_CONNTRACK() > >
Ilya Maximets <i.maximets@ovn.org> writes: > On 8/12/21 6:17 PM, Aaron Conole wrote: >> wenxu@ucloud.cn writes: >> >>> From: wenxu <wenxu@ucloud.cn> >>> >>> The ipf collect original fragment packets and reass a new pkt >>> to do the conntrack logic. After finsh the conntrack things >>> copy the ct meta info to each orignal packet and modify the >>> l4 header in the first fragment. It should modify the ip src/ >>> dst info for all the fragments. >>> >>> Signed-off-by: wenxu <wenxu@ucloud.cn> >>> Co-authored-by: luke.li <luke.li@ucloud.cn> >>> Signed-off-by: luke.li <luke.li@ucloud.cn> >>> --- >> >> Acked-by: Aaron Conole <aconole@redhat.com> >> >> Thanks for the fix. I see it can work for any l3 protocol. >> >> Based on the comments you supplied, I wrote the following test case. It >> can either be folded in by you (or Ilya on apply), or I can submit as a >> separate patch (in case you are worried about having my sign-off / >> coauthor on this patch). >> >> When testing 'make check-system-userspace' before this patch, I see a >> failure and get the following tcpdump logged: >> >> 12:15:31 aconole@RHTPC1VM0NT {master} ~/git/ovs$ sudo tcpdump -r >> tests/system-userspace-testsuite.dir/078/p1.pcap >> reading from file >> tests/system-userspace-testsuite.dir/078/p1.pcap, link-type EN10MB >> (Ethernet), snapshot length 262144 >> dropped privs to tcpdump >> 12:07:21.364925 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28 >> 12:07:21.364928 ARP, Reply 10.2.1.2 is-at e6:45:4a:80:7c:61 (oui Unknown), length 28 >> 12:07:21.365095 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 40165, seq 1, length 1480 >> 12:07:21.365099 IP 10.2.1.1 > 10.1.1.2: icmp >> 12:07:21.365101 IP 10.2.1.1 > 10.1.1.2: icmp >> 12:07:21.365102 IP 10.2.1.1 > 10.1.1.2: icmp >> >> We see the first frag correct, but subsequent frags are broken. >> >> This test worked both for userspace and kernel datapath on my local >> system. > > Hmm. This test fails for me for both kernel and userspace: Okay, I'll try it again on my system. For reference, I was on F34, kernel 5.12.12-300.fc34.x86_64 > tcpdump -r tests/system-userspace-testsuite.dir/078/p0.pcap > 15:17:12.832383 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28 > 15:17:12.834317 ARP, Reply 10.2.1.2 is-at 46:0c:83:aa:6e:b0 (oui Unknown), length 28 > 15:17:12.834327 IP 10.2.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480 > 15:17:12.834329 IP 10.2.1.1 > 10.1.1.2: icmp > 15:17:12.834330 IP 10.2.1.1 > 10.1.1.2: icmp > 15:17:12.834332 IP 10.2.1.1 > 10.1.1.2: icmp > > tcpdump -r tests/system-userspace-testsuite.dir/078/p1.pcap > 15:17:12.833542 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28 > 15:17:12.834994 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480 > 15:17:12.834999 IP 10.1.1.1 > 10.1.1.2: icmp > 15:17:12.835002 IP 10.1.1.1 > 10.1.1.2: icmp > 15:17:12.835004 IP 10.1.1.1 > 10.1.1.2: icmp > > ping -c 1 10.1.1.2 -M dont -s 4500 | grep "transmitted" | sed 's/time.*ms$/time 0ms/' > NS_EXEC_HEREDOC > --- - 2021-08-16 15:17:22.844535052 -0400 > +++ /root/ovs/tests/system-userspace-testsuite.dir/at-groups/78/stdout > @@ -1,2 +1,2 @@ > -1 packets transmitted, 1 received, 0% packet loss, time 0ms > +1 packets transmitted, 0 received, 100% packet loss, time 0ms > > # uname -a > Linux rhel8 4.18.0-305.3.1.el8_4.x86_64 > > I'm not sure what is going on. Could you, please, re-check? I'll boot an rhel8.4 instance and try it out. > I will not apply this patch for now until we figure out how to test it. Okay. Thanks, Ilya! > Best regards, Ilya Maximets. > >> >> --- >> tests/system-traffic.at | 46 +++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 46 insertions(+) >> >> diff --git a/tests/system-traffic.at b/tests/system-traffic.at >> index f400cfabc9..feb335c783 100644 >> --- a/tests/system-traffic.at >> +++ b/tests/system-traffic.at >> @@ -3305,6 +3305,52 @@ NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c >> 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING >> OVS_TRAFFIC_VSWITCHD_STOP >> AT_CLEANUP >> >> +AT_SETUP([conntrack - IPv4 Fragmentation + nat]) >> +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) >> +CHECK_CONNTRACK() >> + >> +OVS_TRAFFIC_VSWITCHD_START( >> + [set-fail-mode br0 secure -- ]) >> + >> +ADD_NAMESPACES(at_ns0, at_ns1) >> + >> +ADD_VETH(p0, at_ns0, br0, "10.2.1.1/24") >> +NS_CHECK_EXEC([at_ns0], [ip link set dev p0 address 80:88:88:88:88:88]) >> +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") >> + >> +# create a dummy route for NAT >> +ADD_VETH(p2, at_ns1, br0, "10.2.1.2/24") >> +NS_CHECK_EXEC([at_ns0], [ip route add 10.1.1.0/24 via 10.2.1.2]) >> +NS_CHECK_EXEC([at_ns1], [ip neigh add 10.1.1.1 nud permanent lladdr 80:88:88:88:88:88 dev p1]) >> + >> +# disable iptables from getting in the way >> +NS_EXEC([at_ns1], [iptables --flush]) >> + >> +# solely for debugging when things go wrong >> +NS_EXEC([at_ns0], [tcpdump -i p0 -w p0.pcap -xx >tcpdump.out &]) >> +NS_EXEC([at_ns1], [tcpdump -i p1 -w p1.pcap -xx >tcpdump.out &]) >> +NS_EXEC([at_ns1], [tcpdump -i p2 -w p2.pcap -xx >tcpdump2.out &]) >> + >> +AT_DATA([flows.txt], [dnl >> +table=0,arp,actions=normal >> +table=0,ct_state=-trk,ip,in_port=ovs-p0, actions=ct(table=1, nat) >> +table=0,ct_state=-trk,ip,in_port=ovs-p1, actions=ct(table=1, nat) >> +table=1,ct_state=+trk+new,ip,in_port=ovs-p0, actions=ct(commit, nat(src=10.1.1.1)),ovs-p1 >> +table=1,ct_state=+trk+est,ip,in_port=ovs-p0, actions=ovs-p1 >> +table=1,ct_state=+trk+est,ip,in_port=ovs-p1, actions=ovs-p0 >> +]) >> + >> +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) >> + >> +#check connectivity >> +NS_CHECK_EXEC([at_ns0], [ping -c 1 10.1.1.2 -M dont -s 4500 | FORMAT_PING], [0], [dnl >> +1 packets transmitted, 1 received, 0% packet loss, time 0ms >> +]) >> + >> +OVS_TRAFFIC_VSWITCHD_STOP >> +AT_CLEANUP >> + >> + >> AT_SETUP([conntrack - resubmit to ct multiple times]) >> CHECK_CONNTRACK() >> >>
Aaron Conole <aconole@redhat.com> writes: > Ilya Maximets <i.maximets@ovn.org> writes: > >> On 8/12/21 6:17 PM, Aaron Conole wrote: >>> wenxu@ucloud.cn writes: >>> >>>> From: wenxu <wenxu@ucloud.cn> >>>> >>>> The ipf collect original fragment packets and reass a new pkt >>>> to do the conntrack logic. After finsh the conntrack things >>>> copy the ct meta info to each orignal packet and modify the >>>> l4 header in the first fragment. It should modify the ip src/ >>>> dst info for all the fragments. >>>> >>>> Signed-off-by: wenxu <wenxu@ucloud.cn> >>>> Co-authored-by: luke.li <luke.li@ucloud.cn> >>>> Signed-off-by: luke.li <luke.li@ucloud.cn> >>>> --- >>> >>> Acked-by: Aaron Conole <aconole@redhat.com> >>> >>> Thanks for the fix. I see it can work for any l3 protocol. >>> >>> Based on the comments you supplied, I wrote the following test case. It >>> can either be folded in by you (or Ilya on apply), or I can submit as a >>> separate patch (in case you are worried about having my sign-off / >>> coauthor on this patch). >>> >>> When testing 'make check-system-userspace' before this patch, I see a >>> failure and get the following tcpdump logged: >>> >>> 12:15:31 aconole@RHTPC1VM0NT {master} ~/git/ovs$ sudo tcpdump -r >>> tests/system-userspace-testsuite.dir/078/p1.pcap >>> reading from file >>> tests/system-userspace-testsuite.dir/078/p1.pcap, link-type EN10MB >>> (Ethernet), snapshot length 262144 >>> dropped privs to tcpdump >>> 12:07:21.364925 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28 >>> 12:07:21.364928 ARP, Reply 10.2.1.2 is-at e6:45:4a:80:7c:61 (oui Unknown), length 28 >>> 12:07:21.365095 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 40165, seq 1, length 1480 >>> 12:07:21.365099 IP 10.2.1.1 > 10.1.1.2: icmp >>> 12:07:21.365101 IP 10.2.1.1 > 10.1.1.2: icmp >>> 12:07:21.365102 IP 10.2.1.1 > 10.1.1.2: icmp >>> >>> We see the first frag correct, but subsequent frags are broken. >>> >>> This test worked both for userspace and kernel datapath on my local >>> system. >> >> Hmm. This test fails for me for both kernel and userspace: > > Okay, I'll try it again on my system. For reference, I was on F34, > kernel 5.12.12-300.fc34.x86_64 > >> tcpdump -r tests/system-userspace-testsuite.dir/078/p0.pcap >> 15:17:12.832383 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28 >> 15:17:12.834317 ARP, Reply 10.2.1.2 is-at 46:0c:83:aa:6e:b0 (oui Unknown), length 28 >> 15:17:12.834327 IP 10.2.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480 >> 15:17:12.834329 IP 10.2.1.1 > 10.1.1.2: icmp >> 15:17:12.834330 IP 10.2.1.1 > 10.1.1.2: icmp >> 15:17:12.834332 IP 10.2.1.1 > 10.1.1.2: icmp >> >> tcpdump -r tests/system-userspace-testsuite.dir/078/p1.pcap >> 15:17:12.833542 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28 >> 15:17:12.834994 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480 >> 15:17:12.834999 IP 10.1.1.1 > 10.1.1.2: icmp >> 15:17:12.835002 IP 10.1.1.1 > 10.1.1.2: icmp >> 15:17:12.835004 IP 10.1.1.1 > 10.1.1.2: icmp >> >> ping -c 1 10.1.1.2 -M dont -s 4500 | grep "transmitted" | sed 's/time.*ms$/time 0ms/' >> NS_EXEC_HEREDOC >> --- - 2021-08-16 15:17:22.844535052 -0400 >> +++ /root/ovs/tests/system-userspace-testsuite.dir/at-groups/78/stdout >> @@ -1,2 +1,2 @@ >> -1 packets transmitted, 1 received, 0% packet loss, time 0ms >> +1 packets transmitted, 0 received, 100% packet loss, time 0ms >> >> # uname -a >> Linux rhel8 4.18.0-305.3.1.el8_4.x86_64 >> >> I'm not sure what is going on. Could you, please, re-check? > > I'll boot an rhel8.4 instance and try it out. > >> I will not apply this patch for now until we figure out how to test it. > > Okay. > I hope this change works. I altered the test environment to use a single port so instead of having a dummy attached to the bridge, I now use lo to be the receiver. [ 'client' ] < --- > [ bridge ] < --- > [ 'server' ][ lo ] Before it was [ 'client' ] < --- > [ bridge ] < --- > [ 'server' ] ^ --------- > [ 'dummy' ] And 'dummy' provided the l3 routing path. Seems that doesn't work in all environments, so I use 'server' to provide the routing path and 'lo' as the actual ping endpoing. I tested this on RHEL8 (with the latest z-stream kernel), and Fedora 34. --- tests/system-traffic.at | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/system-traffic.at b/tests/system-traffic.at index f400cfabc9..de9108ac20 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -3305,6 +3305,47 @@ NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP +AT_SETUP([conntrack - IPv4 Fragmentation + NAT]) +AT_SKIP_IF([test $HAVE_TCPDUMP = no]) +CHECK_CONNTRACK() + +OVS_TRAFFIC_VSWITCHD_START( + [set-fail-mode br0 secure -- ]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.2.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.2.1.2/24") + +# create a dummy route for NAT +NS_CHECK_EXEC([at_ns1], [ip addr add 10.1.1.2/32 dev lo]) +NS_CHECK_EXEC([at_ns0], [ip route add 10.1.1.0/24 via 10.2.1.2]) +NS_CHECK_EXEC([at_ns1], [ip route add 10.1.1.0/24 via 10.2.1.1]) + +# solely for debugging when things go wrong +NS_EXEC([at_ns0], [tcpdump -i p0 -w p0.pcap -xx >tcpdump.out &]) +NS_EXEC([at_ns1], [tcpdump -i p1 -w p1.pcap -xx >tcpdump.out &]) + +AT_DATA([flows.txt], [dnl +table=0,arp,actions=normal +table=0,ct_state=-trk,ip,in_port=ovs-p0, actions=ct(table=1, nat) +table=0,ct_state=-trk,ip,in_port=ovs-p1, actions=ct(table=1, nat) +table=1,ct_state=+trk+new,ip,in_port=ovs-p0, actions=ct(commit, nat(src=10.1.1.1)),ovs-p1 +table=1,ct_state=+trk+est,ip,in_port=ovs-p0, actions=ovs-p1 +table=1,ct_state=+trk+est,ip,in_port=ovs-p1, actions=ovs-p0 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +#check connectivity +NS_CHECK_EXEC([at_ns0], [ping -c 1 10.1.1.2 -M dont -s 4500 | FORMAT_PING], [0], [dnl +1 packets transmitted, 1 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + + AT_SETUP([conntrack - resubmit to ct multiple times]) CHECK_CONNTRACK()
On 8/24/21 19:43, Aaron Conole wrote: > Aaron Conole <aconole@redhat.com> writes: > >> Ilya Maximets <i.maximets@ovn.org> writes: >> >>> On 8/12/21 6:17 PM, Aaron Conole wrote: >>>> wenxu@ucloud.cn writes: >>>> >>>>> From: wenxu <wenxu@ucloud.cn> >>>>> >>>>> The ipf collect original fragment packets and reass a new pkt >>>>> to do the conntrack logic. After finsh the conntrack things >>>>> copy the ct meta info to each orignal packet and modify the >>>>> l4 header in the first fragment. It should modify the ip src/ >>>>> dst info for all the fragments. >>>>> >>>>> Signed-off-by: wenxu <wenxu@ucloud.cn> >>>>> Co-authored-by: luke.li <luke.li@ucloud.cn> >>>>> Signed-off-by: luke.li <luke.li@ucloud.cn> >>>>> --- >>>> >>>> Acked-by: Aaron Conole <aconole@redhat.com> >>>> >>>> Thanks for the fix. I see it can work for any l3 protocol. >>>> >>>> Based on the comments you supplied, I wrote the following test case. It >>>> can either be folded in by you (or Ilya on apply), or I can submit as a >>>> separate patch (in case you are worried about having my sign-off / >>>> coauthor on this patch). >>>> >>>> When testing 'make check-system-userspace' before this patch, I see a >>>> failure and get the following tcpdump logged: >>>> >>>> 12:15:31 aconole@RHTPC1VM0NT {master} ~/git/ovs$ sudo tcpdump -r >>>> tests/system-userspace-testsuite.dir/078/p1.pcap >>>> reading from file >>>> tests/system-userspace-testsuite.dir/078/p1.pcap, link-type EN10MB >>>> (Ethernet), snapshot length 262144 >>>> dropped privs to tcpdump >>>> 12:07:21.364925 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28 >>>> 12:07:21.364928 ARP, Reply 10.2.1.2 is-at e6:45:4a:80:7c:61 (oui Unknown), length 28 >>>> 12:07:21.365095 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 40165, seq 1, length 1480 >>>> 12:07:21.365099 IP 10.2.1.1 > 10.1.1.2: icmp >>>> 12:07:21.365101 IP 10.2.1.1 > 10.1.1.2: icmp >>>> 12:07:21.365102 IP 10.2.1.1 > 10.1.1.2: icmp >>>> >>>> We see the first frag correct, but subsequent frags are broken. >>>> >>>> This test worked both for userspace and kernel datapath on my local >>>> system. >>> >>> Hmm. This test fails for me for both kernel and userspace: >> >> Okay, I'll try it again on my system. For reference, I was on F34, >> kernel 5.12.12-300.fc34.x86_64 >> >>> tcpdump -r tests/system-userspace-testsuite.dir/078/p0.pcap >>> 15:17:12.832383 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28 >>> 15:17:12.834317 ARP, Reply 10.2.1.2 is-at 46:0c:83:aa:6e:b0 (oui Unknown), length 28 >>> 15:17:12.834327 IP 10.2.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480 >>> 15:17:12.834329 IP 10.2.1.1 > 10.1.1.2: icmp >>> 15:17:12.834330 IP 10.2.1.1 > 10.1.1.2: icmp >>> 15:17:12.834332 IP 10.2.1.1 > 10.1.1.2: icmp >>> >>> tcpdump -r tests/system-userspace-testsuite.dir/078/p1.pcap >>> 15:17:12.833542 ARP, Request who-has 10.2.1.2 tell 10.2.1.1, length 28 >>> 15:17:12.834994 IP 10.1.1.1 > 10.1.1.2: ICMP echo request, id 27759, seq 1, length 1480 >>> 15:17:12.834999 IP 10.1.1.1 > 10.1.1.2: icmp >>> 15:17:12.835002 IP 10.1.1.1 > 10.1.1.2: icmp >>> 15:17:12.835004 IP 10.1.1.1 > 10.1.1.2: icmp >>> >>> ping -c 1 10.1.1.2 -M dont -s 4500 | grep "transmitted" | sed 's/time.*ms$/time 0ms/' >>> NS_EXEC_HEREDOC >>> --- - 2021-08-16 15:17:22.844535052 -0400 >>> +++ /root/ovs/tests/system-userspace-testsuite.dir/at-groups/78/stdout >>> @@ -1,2 +1,2 @@ >>> -1 packets transmitted, 1 received, 0% packet loss, time 0ms >>> +1 packets transmitted, 0 received, 100% packet loss, time 0ms >>> >>> # uname -a >>> Linux rhel8 4.18.0-305.3.1.el8_4.x86_64 >>> >>> I'm not sure what is going on. Could you, please, re-check? >> >> I'll boot an rhel8.4 instance and try it out. >> >>> I will not apply this patch for now until we figure out how to test it. >> >> Okay. >> > > I hope this change works. I altered the test environment to use a > single port so instead of having a dummy attached to the bridge, I now > use lo to be the receiver. This one worked for me. Thanks! With the test, applied to master and backported down to 2.13. Best regards, Ilya Maximets.
diff --git a/lib/ipf.c b/lib/ipf.c index 9c83f19..795e801 100644 --- a/lib/ipf.c +++ b/lib/ipf.c @@ -1149,52 +1149,56 @@ ipf_post_execute_reass_pkts(struct ipf *ipf, * NETDEV_MAX_BURST. */ DP_PACKET_BATCH_REFILL_FOR_EACH (pb_idx, pb_cnt, pkt, pb) { if (rp && pkt == rp->list->reass_execute_ctx) { + const struct ipf_frag *frag_0 = &rp->list->frag_list[0]; + void *l4_frag = dp_packet_l4(frag_0->pkt); + void *l4_reass = dp_packet_l4(pkt); + memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt)); + for (int i = 0; i <= rp->list->last_inuse_idx; i++) { - rp->list->frag_list[i].pkt->md.ct_label = pkt->md.ct_label; - rp->list->frag_list[i].pkt->md.ct_mark = pkt->md.ct_mark; - rp->list->frag_list[i].pkt->md.ct_state = pkt->md.ct_state; - rp->list->frag_list[i].pkt->md.ct_zone = pkt->md.ct_zone; - rp->list->frag_list[i].pkt->md.ct_orig_tuple_ipv6 = - pkt->md.ct_orig_tuple_ipv6; + const struct ipf_frag *frag_i = &rp->list->frag_list[i]; + + frag_i->pkt->md.ct_label = pkt->md.ct_label; + frag_i->pkt->md.ct_mark = pkt->md.ct_mark; + frag_i->pkt->md.ct_state = pkt->md.ct_state; + frag_i->pkt->md.ct_zone = pkt->md.ct_zone; + frag_i->pkt->md.ct_orig_tuple_ipv6 = + pkt->md.ct_orig_tuple_ipv6; if (pkt->md.ct_orig_tuple_ipv6) { - rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv6 = + frag_i->pkt->md.ct_orig_tuple.ipv6 = pkt->md.ct_orig_tuple.ipv6; } else { - rp->list->frag_list[i].pkt->md.ct_orig_tuple.ipv4 = + frag_i->pkt->md.ct_orig_tuple.ipv4 = pkt->md.ct_orig_tuple.ipv4; } - } - - const struct ipf_frag *frag_0 = &rp->list->frag_list[0]; - void *l4_frag = dp_packet_l4(frag_0->pkt); - void *l4_reass = dp_packet_l4(pkt); - memcpy(l4_frag, l4_reass, dp_packet_l4_size(frag_0->pkt)); - - if (v6) { - struct ovs_16aligned_ip6_hdr *l3_frag - = dp_packet_l3(frag_0->pkt); - struct ovs_16aligned_ip6_hdr *l3_reass = dp_packet_l3(pkt); - l3_frag->ip6_src = l3_reass->ip6_src; - l3_frag->ip6_dst = l3_reass->ip6_dst; - } else { - struct ip_header *l3_frag = dp_packet_l3(frag_0->pkt); - struct ip_header *l3_reass = dp_packet_l3(pkt); - if (!dp_packet_hwol_is_ipv4(frag_0->pkt)) { - ovs_be32 reass_ip = - get_16aligned_be32(&l3_reass->ip_src); - ovs_be32 frag_ip = - get_16aligned_be32(&l3_frag->ip_src); - - l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, - frag_ip, reass_ip); - reass_ip = get_16aligned_be32(&l3_reass->ip_dst); - frag_ip = get_16aligned_be32(&l3_frag->ip_dst); - l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, - frag_ip, reass_ip); + if (v6) { + struct ovs_16aligned_ip6_hdr *l3_frag + = dp_packet_l3(frag_i->pkt); + struct ovs_16aligned_ip6_hdr *l3_reass + = dp_packet_l3(pkt); + l3_frag->ip6_src = l3_reass->ip6_src; + l3_frag->ip6_dst = l3_reass->ip6_dst; + } else { + struct ip_header *l3_frag = dp_packet_l3(frag_i->pkt); + struct ip_header *l3_reass = dp_packet_l3(pkt); + if (!dp_packet_hwol_is_ipv4(frag_i->pkt)) { + ovs_be32 reass_ip = + get_16aligned_be32(&l3_reass->ip_src); + ovs_be32 frag_ip = + get_16aligned_be32(&l3_frag->ip_src); + + l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, + frag_ip, + reass_ip); + reass_ip = get_16aligned_be32(&l3_reass->ip_dst); + frag_ip = get_16aligned_be32(&l3_frag->ip_dst); + l3_frag->ip_csum = recalc_csum32(l3_frag->ip_csum, + frag_ip, + reass_ip); + } + + l3_frag->ip_src = l3_reass->ip_src; + l3_frag->ip_dst = l3_reass->ip_dst; } - - l3_frag->ip_src = l3_reass->ip_src; - l3_frag->ip_dst = l3_reass->ip_dst; } ipf_completed_list_add(&ipf->frag_complete_list, rp->list);