Message ID | 1441850422-33750-7-git-send-email-joestringer@nicira.com |
---|---|
State | Superseded |
Headers | show |
With the few comments below, Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> The MRU is still lost if the packet is sent to the controller, but maybe sending (potentially) large packet to controller is a bad idea to begin with. Jarno > On Sep 9, 2015, at 7:00 PM, Joe Stringer <joestringer@nicira.com> wrote: > > From: Andy Zhou <azhou@nicira.com> > > User space now may receive re-assembled IP fragments. The user space > netlink handler can now accept packets with the new OVS_PACKET_ATTR_MRU > attribute. This allows the kernel to assemble fragmented packets for the > duration of OpenFlow processing, then re-fragment at output time. Most > notably this occurs for packets that are sent through the connection > tracker. > > Signed-off-by: Andy Zhou <azhou@nicira.com> > --- > datapath/linux/compat/include/linux/openvswitch.h | 5 + > lib/dpif-netlink.c | 5 + > lib/dpif.c | 2 + > lib/dpif.h | 3 + > ofproto/ofproto-dpif-upcall.c | 17 +- > ofproto/ofproto-dpif.c | 3 + > tests/system-traffic.at | 221 ++++++++++++++++++++++ > 7 files changed, 254 insertions(+), 2 deletions(-) > > diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h > index bbc6ca4..1997175 100644 > --- a/datapath/linux/compat/include/linux/openvswitch.h > +++ b/datapath/linux/compat/include/linux/openvswitch.h > @@ -187,6 +187,10 @@ enum ovs_packet_cmd { > * %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the > * output port is actually a tunnel port. Contains the output tunnel key > * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes. > + * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and > + * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment > + * size. > + * > * These attributes follow the &struct ovs_header within the Generic Netlink > * payload for %OVS_PACKET_* commands. > */ > @@ -202,6 +206,7 @@ enum ovs_packet_attr { > OVS_PACKET_ATTR_UNUSED2, > OVS_PACKET_ATTR_PROBE, /* Packet operation is a feature probe, > error logging should be suppressed. */ > + OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */ > __OVS_PACKET_ATTR_MAX > }; > > diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c > index ffeb124..007f4ec 100644 > --- a/lib/dpif-netlink.c > +++ b/lib/dpif-netlink.c > @@ -1547,6 +1547,9 @@ dpif_netlink_encode_execute(int dp_ifindex, const struct dpif_execute *d_exec, > if (d_exec->probe) { > nl_msg_put_flag(buf, OVS_PACKET_ATTR_PROBE); > } > + if (d_exec->mru) { > + nl_msg_put_u16(buf, OVS_PACKET_ATTR_MRU, d_exec->mru); > + } > } > > /* Executes, against 'dpif', up to the first 'n_ops' operations in 'ops'. > @@ -1965,6 +1968,7 @@ parse_odp_packet(const struct dpif_netlink *dpif, struct ofpbuf *buf, > [OVS_PACKET_ATTR_USERDATA] = { .type = NL_A_UNSPEC, .optional = true }, > [OVS_PACKET_ATTR_EGRESS_TUN_KEY] = { .type = NL_A_NESTED, .optional = true }, > [OVS_PACKET_ATTR_ACTIONS] = { .type = NL_A_NESTED, .optional = true }, > + [OVS_PACKET_ATTR_MRU] = { .type = NL_A_BE16, .optional = true } > }; > > struct ovs_header *ovs_header; > @@ -2002,6 +2006,7 @@ parse_odp_packet(const struct dpif_netlink *dpif, struct ofpbuf *buf, > upcall->userdata = a[OVS_PACKET_ATTR_USERDATA]; > upcall->out_tun_key = a[OVS_PACKET_ATTR_EGRESS_TUN_KEY]; > upcall->actions = a[OVS_PACKET_ATTR_ACTIONS]; > + upcall->mru = a[OVS_PACKET_ATTR_MRU]; > > /* Allow overwriting the netlink attribute header without reallocating. */ > dp_packet_use_stub(&upcall->packet, > diff --git a/lib/dpif.c b/lib/dpif.c > index c03aa1d..f56d2f2 100644 > --- a/lib/dpif.c > +++ b/lib/dpif.c > @@ -1126,6 +1126,7 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet **packets, int cnt, > execute.packet = packet; > execute.needs_help = false; > execute.probe = false; > + execute.mru = 0; > aux->error = dpif_execute(aux->dpif, &execute); > log_execute_message(aux->dpif, &execute, true, aux->error); > > @@ -1693,6 +1694,7 @@ log_execute_message(struct dpif *dpif, const struct dpif_execute *execute, > ds_put_format(&ds, " failed (%s)", ovs_strerror(error)); > } > ds_put_format(&ds, " on packet %s", packet); > + ds_put_format(&ds, " mtu %d", execute->mru); > vlog(THIS_MODULE, error ? VLL_WARN : VLL_DBG, "%s", ds_cstr(&ds)); > ds_destroy(&ds); > free(packet); > diff --git a/lib/dpif.h b/lib/dpif.h > index 1b8a7b9..76317e4 100644 > --- a/lib/dpif.h > +++ b/lib/dpif.h > @@ -697,6 +697,8 @@ struct dpif_execute { > size_t actions_len; /* Length of 'actions' in bytes. */ > bool needs_help; > bool probe; /* Suppress error messages. */ > + unsigned int mru; /* packet's Maximum receive unit. > + 0 if not a fragmented packet */ > Should this be called “mtu” instead? Execute does not receive packets… > /* Input, but possibly modified as a side effect of execution. */ > struct dp_packet *packet; /* Packet to execute. */ > @@ -780,6 +782,7 @@ struct dpif_upcall { > struct nlattr *key; /* Flow key. */ > size_t key_len; /* Length of 'key' in bytes. */ > ovs_u128 ufid; /* Unique flow identifier for 'key'. */ > + struct nlattr *mru; /* Maximum receive unit. */ > > /* DPIF_UC_ACTION only. */ > struct nlattr *userdata; /* Argument to OVS_ACTION_ATTR_USERSPACE. */ > diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c > index 8a43bbf..badb4d9 100644 > --- a/ofproto/ofproto-dpif-upcall.c > +++ b/ofproto/ofproto-dpif-upcall.c > @@ -183,6 +183,8 @@ struct upcall { > unsigned pmd_id; /* Datapath poll mode driver id. */ > const struct dp_packet *packet; /* Packet associated with this upcall. */ > ofp_port_t in_port; /* OpenFlow in port, or OFPP_NONE. */ > + unsigned int mru; /* If !0, Maximum receive unit of > + fragmented IP packet */ > Would uint16_t be a more appropriate type for mru? > enum dpif_upcall_type type; /* Datapath type of the upcall. */ > const struct nlattr *userdata; /* Userdata for DPIF_UC_ACTION Upcalls. */ > @@ -329,6 +331,7 @@ static enum upcall_type classify_upcall(enum dpif_upcall_type type, > static int upcall_receive(struct upcall *, const struct dpif_backer *, > const struct dp_packet *packet, enum dpif_upcall_type, > const struct nlattr *userdata, const struct flow *, > + const unsigned int mru, > const ovs_u128 *ufid, const unsigned pmd_id); > static void upcall_uninit(struct upcall *); > > @@ -716,6 +719,7 @@ recv_upcalls(struct handler *handler) > struct dpif_upcall *dupcall = &dupcalls[n_upcalls]; > struct upcall *upcall = &upcalls[n_upcalls]; > struct flow *flow = &flows[n_upcalls]; > + unsigned int mru; > int error; > > ofpbuf_use_stub(recv_buf, recv_stubs[n_upcalls], > @@ -730,8 +734,14 @@ recv_upcalls(struct handler *handler) > goto free_dupcall; > } > > + if (dupcall->mru) { > + mru = nl_attr_get_u16(dupcall->mru); > + } else { > + mru = 0; > + } > + > error = upcall_receive(upcall, udpif->backer, &dupcall->packet, > - dupcall->type, dupcall->userdata, flow, > + dupcall->type, dupcall->userdata, flow, mru, > &dupcall->ufid, PMD_ID_NULL); > if (error) { > if (error == ENODEV) { > @@ -977,6 +987,7 @@ static int > upcall_receive(struct upcall *upcall, const struct dpif_backer *backer, > const struct dp_packet *packet, enum dpif_upcall_type type, > const struct nlattr *userdata, const struct flow *flow, > + const unsigned int mru, > const ovs_u128 *ufid, const unsigned pmd_id) > { > int error; > @@ -1006,6 +1017,7 @@ upcall_receive(struct upcall *upcall, const struct dpif_backer *backer, > upcall->ukey = NULL; > upcall->key = NULL; > upcall->key_len = 0; > + upcall->mru = mru; > > upcall->out_tun_key = NULL; > upcall->actions = NULL; > @@ -1133,7 +1145,7 @@ upcall_cb(const struct dp_packet *packet, const struct flow *flow, ovs_u128 *ufi > atomic_read_relaxed(&udpif->flow_limit, &flow_limit); > > error = upcall_receive(&upcall, udpif->backer, packet, type, userdata, > - flow, ufid, pmd_id); > + flow, 0, ufid, pmd_id); > if (error) { > return error; > } > @@ -1355,6 +1367,7 @@ handle_upcalls(struct udpif *udpif, struct upcall *upcalls, > op->dop.u.execute.actions_len = upcall->odp_actions.size; > op->dop.u.execute.needs_help = (upcall->xout.slow & SLOW_ACTION) != 0; > op->dop.u.execute.probe = false; > + op->dop.u.execute.mru = upcall->mru; > } > } > > diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c > index ddf177a..b210881 100644 > --- a/ofproto/ofproto-dpif.c > +++ b/ofproto/ofproto-dpif.c > @@ -1116,6 +1116,7 @@ check_variable_length_userdata(struct dpif_backer *backer) > execute.packet = &packet; > execute.needs_help = false; > execute.probe = true; > + execute.mru = 0; > > error = dpif_execute(backer->dpif, &execute); > > @@ -1214,6 +1215,7 @@ check_masked_set_action(struct dpif_backer *backer) > execute.packet = &packet; > execute.needs_help = false; > execute.probe = true; > + execute.mru = 0; > > error = dpif_execute(backer->dpif, &execute); > > @@ -3722,6 +3724,7 @@ ofproto_dpif_execute_actions(struct ofproto_dpif *ofproto, > execute.packet = packet; > execute.needs_help = (xout.slow & SLOW_ACTION) != 0; > execute.probe = false; > + execute.mru = 0; > > /* Fix up in_port. */ > in_port = flow->in_port.ofp_port; > diff --git a/tests/system-traffic.at b/tests/system-traffic.at > index cdd0315..4074eaa 100644 > --- a/tests/system-traffic.at > +++ b/tests/system-traffic.at > @@ -923,3 +923,224 @@ TIME_WAIT src=10.1.1.1 dst=10.1.1.2 sport=<cleared> dport=<cleared> src=10.1.1.2 > > OVS_TRAFFIC_VSWITCHD_STOP > AT_CLEANUP > + > +AT_SETUP([fragmentation - ipv4 icmp ]) > +CHECK_CONNTRACK() > +OVS_TRAFFIC_VSWITCHD_START( > + [set-fail-mode br0 secure -- ]) > + > +ADD_NAMESPACES(at_ns0, at_ns1) > + > +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") > +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") > + > +dnl Sending ping through conntrack > +AT_DATA([flows.txt], [dnl > +priority=1,action=drop > +priority=10,arp,action=normal > +in_port=1,icmp,action=ct(commit,zone=9),2 > +in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9) > +in_port=2,ct_state=+trk+est-new,icmp,action=1 > +]) > + > +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) > + > +dnl Basic connectivity check. > +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > + > +dnl Ipv4 fragmentation connectivity check. > +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > + > +dnl Ipv4 larger fragmentation connectivity check. > +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > + > +OVS_TRAFFIC_VSWITCHD_STOP > +AT_CLEANUP > + > +AT_SETUP([fragmentation - ipv4 icmp + vlan]) > +CHECK_CONNTRACK() > +OVS_TRAFFIC_VSWITCHD_START( > + [set-fail-mode br0 secure -- ]) > + > +ADD_NAMESPACES(at_ns0, at_ns1) > + > +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") > +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") > +ADD_VLAN(p0, at_ns0, 100, "10.2.2.1/24") > +ADD_VLAN(p1, at_ns1, 100, "10.2.2.2/24") > + > +dnl Sending ping through conntrack > +AT_DATA([flows.txt], [dnl > +priority=1,action=drop > +priority=10,arp,action=normal > +in_port=1,icmp,action=ct(commit,zone=9),2 > +in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9) > +in_port=2,ct_state=+trk+est-new,icmp,action=1 > +]) > + > +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) > + > +dnl Basic connectivity check. > +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > + > +dnl Ipv4 fragmentation connectivity check. > +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > + > +dnl Ipv4 larger fragmentation connectivity check. > +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > + > +OVS_TRAFFIC_VSWITCHD_STOP > +AT_CLEANUP > + > +AT_SETUP([fragmentation - ipv6 icmp ]) > +CHECK_CONNTRACK() > +OVS_TRAFFIC_VSWITCHD_START( > + [set-fail-mode br0 secure -- ]) > + > +ADD_NAMESPACES(at_ns0, at_ns1) > + > +ADD_VETH(p0, at_ns0, br0, "fc00::1/96") > +ADD_VETH(p1, at_ns1, br0, "fc00::2/96") > + > +dnl Sending ping through conntrack > +AT_DATA([flows.txt], [dnl > +priority=1,action=drop > +priority=10,in_port=1,ipv6,action=ct(commit,zone=9),2 > +priority=10,in_port=2,ct_state=-trk,ipv6,action=ct(table=0,zone=9) > +priority=10,in_port=2,ct_state=+trk+est-new,ipv6,action=1 > +icmp6,icmp_type=135,action=normal > +icmp6,icmp_type=136,action=normal > +]) > + > +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) > + > +dnl Without this sleep, we get occasional failures due to the following error: > +dnl "connect: Cannot assign requested address" > +sleep 2; > + > +dnl Basic connectivity check. > +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > + > +dnl Ipv4 fragmentation connectivity check. > +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > + > +dnl Ipv4 larger fragmentation connectivity check. > +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > + > +OVS_TRAFFIC_VSWITCHD_STOP > +AT_CLEANUP > + > +AT_SETUP([fragmentation - ipv6 icmp + vlan]) > +CHECK_CONNTRACK() > +OVS_TRAFFIC_VSWITCHD_START( > + [set-fail-mode br0 secure -- ]) > + > +ADD_NAMESPACES(at_ns0, at_ns1) > + > +ADD_VETH(p0, at_ns0, br0, "fc00::1/96") > +ADD_VETH(p1, at_ns1, br0, "fc00::2/96") > + > +ADD_VLAN(p0, at_ns0, 100, "fc00:1::3/96") > +ADD_VLAN(p1, at_ns1, 100, "fc00:1::4/96") > + > +dnl Sending ping through conntrack > +AT_DATA([flows.txt], [dnl > +priority=1,action=drop > +priority=10,in_port=1,ipv6,action=ct(commit,zone=9),2 > +priority=10,in_port=2,ct_state=-trk,ipv6,action=ct(table=0,zone=9) > +priority=10,in_port=2,ct_state=+trk+est-new,ipv6,action=1 > +icmp6,icmp_type=135,action=normal > +icmp6,icmp_type=136,action=normal > +]) > + > +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) > + > +dnl Without this sleep, we get occasional failures due to the following error: > +dnl "connect: Cannot assign requested address" > +sleep 2; > + > +dnl Basic connectivity check. > +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > + > +dnl Ipv4 fragmentation connectivity check. > +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > + > +dnl Ipv4 larger fragmentation connectivity check. > +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > + > +OVS_TRAFFIC_VSWITCHD_STOP > +AT_CLEANUP > + > +AT_SETUP([fragmentation over vxlan tunnel]) > +AT_SKIP_IF([! ip link help 2>&1 | grep vxlan >/dev/null]) > +CHECK_CONNTRACK() > + > +OVS_TRAFFIC_VSWITCHD_START( > + [set-fail-mode br0 standalone --]) > +ADD_BR([br-underlay], [set-fail-mode br-underlay standalone]) > +ADD_NAMESPACES(at_ns0) > + > +dnl Sending ping through conntrack > +AT_DATA([flows.txt], [dnl > +priority=1,action=drop > +priority=10,arp,action=normal > +in_port=1,icmp,action=ct(commit,zone=9),LOCAL > +in_port=LOCAL,ct_state=-trk,icmp,action=ct(table=0,zone=9) > +in_port=LOCAL,ct_state=+trk+est-new,icmp,action=1 > +]) > + > +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) > + > +dnl Set up underlay link from host into the namespace using veth pair. > +ADD_VETH(p0, at_ns0, br-underlay, "172.31.1.1/24") > +AT_CHECK([ip addr add dev br-underlay "172.31.1.100/24"]) > +AT_CHECK([ip link set dev br-underlay up]) > + > +dnl Set up tunnel endpoints on OVS outside the namespace and with a native > +dnl linux device inside the namespace. > +ADD_OVS_TUNNEL([vxlan], [br0], [at_ns0], [172.31.1.1], [10.1.1.100/24]) > +ADD_NATIVE_TUNNEL([vxlan], [at_vxlan1], [at_ns0], [172.31.1.100], [10.1.1.1/24], > + [id 0 dstport 4789]) > + > +dnl First, check the underlay > +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > + > +dnl Okay, now check the overlay with different packet sizes > +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl > +3 packets transmitted, 3 received, 0% packet loss, time 0ms > +]) > + > +OVS_TRAFFIC_VSWITCHD_STOP > +AT_CLEANUP > -- > 2.1.4 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > http://openvswitch.org/mailman/listinfo/dev
On 11 September 2015 at 16:52, Jarno Rajahalme <jrajahalme@nicira.com> wrote: > With the few comments below, > > Acked-by: Jarno Rajahalme <jrajahalme@nicira.com> > > The MRU is still lost if the packet is sent to the controller, but maybe sending (potentially) large packet to controller is a bad idea to begin with. I'll add this caveat to the commit message. Fixed the remaining issues, thanks.
diff --git a/datapath/linux/compat/include/linux/openvswitch.h b/datapath/linux/compat/include/linux/openvswitch.h index bbc6ca4..1997175 100644 --- a/datapath/linux/compat/include/linux/openvswitch.h +++ b/datapath/linux/compat/include/linux/openvswitch.h @@ -187,6 +187,10 @@ enum ovs_packet_cmd { * %OVS_USERSPACE_ATTR_EGRESS_TUN_PORT attribute, which is sent only if the * output port is actually a tunnel port. Contains the output tunnel key * extracted from the packet as nested %OVS_TUNNEL_KEY_ATTR_* attributes. + * @OVS_PACKET_ATTR_MRU: Present for an %OVS_PACKET_CMD_ACTION and + * %OVS_PACKET_ATTR_USERSPACE action specify the Maximum received fragment + * size. + * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_PACKET_* commands. */ @@ -202,6 +206,7 @@ enum ovs_packet_attr { OVS_PACKET_ATTR_UNUSED2, OVS_PACKET_ATTR_PROBE, /* Packet operation is a feature probe, error logging should be suppressed. */ + OVS_PACKET_ATTR_MRU, /* Maximum received IP fragment size. */ __OVS_PACKET_ATTR_MAX }; diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c index ffeb124..007f4ec 100644 --- a/lib/dpif-netlink.c +++ b/lib/dpif-netlink.c @@ -1547,6 +1547,9 @@ dpif_netlink_encode_execute(int dp_ifindex, const struct dpif_execute *d_exec, if (d_exec->probe) { nl_msg_put_flag(buf, OVS_PACKET_ATTR_PROBE); } + if (d_exec->mru) { + nl_msg_put_u16(buf, OVS_PACKET_ATTR_MRU, d_exec->mru); + } } /* Executes, against 'dpif', up to the first 'n_ops' operations in 'ops'. @@ -1965,6 +1968,7 @@ parse_odp_packet(const struct dpif_netlink *dpif, struct ofpbuf *buf, [OVS_PACKET_ATTR_USERDATA] = { .type = NL_A_UNSPEC, .optional = true }, [OVS_PACKET_ATTR_EGRESS_TUN_KEY] = { .type = NL_A_NESTED, .optional = true }, [OVS_PACKET_ATTR_ACTIONS] = { .type = NL_A_NESTED, .optional = true }, + [OVS_PACKET_ATTR_MRU] = { .type = NL_A_BE16, .optional = true } }; struct ovs_header *ovs_header; @@ -2002,6 +2006,7 @@ parse_odp_packet(const struct dpif_netlink *dpif, struct ofpbuf *buf, upcall->userdata = a[OVS_PACKET_ATTR_USERDATA]; upcall->out_tun_key = a[OVS_PACKET_ATTR_EGRESS_TUN_KEY]; upcall->actions = a[OVS_PACKET_ATTR_ACTIONS]; + upcall->mru = a[OVS_PACKET_ATTR_MRU]; /* Allow overwriting the netlink attribute header without reallocating. */ dp_packet_use_stub(&upcall->packet, diff --git a/lib/dpif.c b/lib/dpif.c index c03aa1d..f56d2f2 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -1126,6 +1126,7 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet **packets, int cnt, execute.packet = packet; execute.needs_help = false; execute.probe = false; + execute.mru = 0; aux->error = dpif_execute(aux->dpif, &execute); log_execute_message(aux->dpif, &execute, true, aux->error); @@ -1693,6 +1694,7 @@ log_execute_message(struct dpif *dpif, const struct dpif_execute *execute, ds_put_format(&ds, " failed (%s)", ovs_strerror(error)); } ds_put_format(&ds, " on packet %s", packet); + ds_put_format(&ds, " mtu %d", execute->mru); vlog(THIS_MODULE, error ? VLL_WARN : VLL_DBG, "%s", ds_cstr(&ds)); ds_destroy(&ds); free(packet); diff --git a/lib/dpif.h b/lib/dpif.h index 1b8a7b9..76317e4 100644 --- a/lib/dpif.h +++ b/lib/dpif.h @@ -697,6 +697,8 @@ struct dpif_execute { size_t actions_len; /* Length of 'actions' in bytes. */ bool needs_help; bool probe; /* Suppress error messages. */ + unsigned int mru; /* packet's Maximum receive unit. + 0 if not a fragmented packet */ /* Input, but possibly modified as a side effect of execution. */ struct dp_packet *packet; /* Packet to execute. */ @@ -780,6 +782,7 @@ struct dpif_upcall { struct nlattr *key; /* Flow key. */ size_t key_len; /* Length of 'key' in bytes. */ ovs_u128 ufid; /* Unique flow identifier for 'key'. */ + struct nlattr *mru; /* Maximum receive unit. */ /* DPIF_UC_ACTION only. */ struct nlattr *userdata; /* Argument to OVS_ACTION_ATTR_USERSPACE. */ diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c index 8a43bbf..badb4d9 100644 --- a/ofproto/ofproto-dpif-upcall.c +++ b/ofproto/ofproto-dpif-upcall.c @@ -183,6 +183,8 @@ struct upcall { unsigned pmd_id; /* Datapath poll mode driver id. */ const struct dp_packet *packet; /* Packet associated with this upcall. */ ofp_port_t in_port; /* OpenFlow in port, or OFPP_NONE. */ + unsigned int mru; /* If !0, Maximum receive unit of + fragmented IP packet */ enum dpif_upcall_type type; /* Datapath type of the upcall. */ const struct nlattr *userdata; /* Userdata for DPIF_UC_ACTION Upcalls. */ @@ -329,6 +331,7 @@ static enum upcall_type classify_upcall(enum dpif_upcall_type type, static int upcall_receive(struct upcall *, const struct dpif_backer *, const struct dp_packet *packet, enum dpif_upcall_type, const struct nlattr *userdata, const struct flow *, + const unsigned int mru, const ovs_u128 *ufid, const unsigned pmd_id); static void upcall_uninit(struct upcall *); @@ -716,6 +719,7 @@ recv_upcalls(struct handler *handler) struct dpif_upcall *dupcall = &dupcalls[n_upcalls]; struct upcall *upcall = &upcalls[n_upcalls]; struct flow *flow = &flows[n_upcalls]; + unsigned int mru; int error; ofpbuf_use_stub(recv_buf, recv_stubs[n_upcalls], @@ -730,8 +734,14 @@ recv_upcalls(struct handler *handler) goto free_dupcall; } + if (dupcall->mru) { + mru = nl_attr_get_u16(dupcall->mru); + } else { + mru = 0; + } + error = upcall_receive(upcall, udpif->backer, &dupcall->packet, - dupcall->type, dupcall->userdata, flow, + dupcall->type, dupcall->userdata, flow, mru, &dupcall->ufid, PMD_ID_NULL); if (error) { if (error == ENODEV) { @@ -977,6 +987,7 @@ static int upcall_receive(struct upcall *upcall, const struct dpif_backer *backer, const struct dp_packet *packet, enum dpif_upcall_type type, const struct nlattr *userdata, const struct flow *flow, + const unsigned int mru, const ovs_u128 *ufid, const unsigned pmd_id) { int error; @@ -1006,6 +1017,7 @@ upcall_receive(struct upcall *upcall, const struct dpif_backer *backer, upcall->ukey = NULL; upcall->key = NULL; upcall->key_len = 0; + upcall->mru = mru; upcall->out_tun_key = NULL; upcall->actions = NULL; @@ -1133,7 +1145,7 @@ upcall_cb(const struct dp_packet *packet, const struct flow *flow, ovs_u128 *ufi atomic_read_relaxed(&udpif->flow_limit, &flow_limit); error = upcall_receive(&upcall, udpif->backer, packet, type, userdata, - flow, ufid, pmd_id); + flow, 0, ufid, pmd_id); if (error) { return error; } @@ -1355,6 +1367,7 @@ handle_upcalls(struct udpif *udpif, struct upcall *upcalls, op->dop.u.execute.actions_len = upcall->odp_actions.size; op->dop.u.execute.needs_help = (upcall->xout.slow & SLOW_ACTION) != 0; op->dop.u.execute.probe = false; + op->dop.u.execute.mru = upcall->mru; } } diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index ddf177a..b210881 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -1116,6 +1116,7 @@ check_variable_length_userdata(struct dpif_backer *backer) execute.packet = &packet; execute.needs_help = false; execute.probe = true; + execute.mru = 0; error = dpif_execute(backer->dpif, &execute); @@ -1214,6 +1215,7 @@ check_masked_set_action(struct dpif_backer *backer) execute.packet = &packet; execute.needs_help = false; execute.probe = true; + execute.mru = 0; error = dpif_execute(backer->dpif, &execute); @@ -3722,6 +3724,7 @@ ofproto_dpif_execute_actions(struct ofproto_dpif *ofproto, execute.packet = packet; execute.needs_help = (xout.slow & SLOW_ACTION) != 0; execute.probe = false; + execute.mru = 0; /* Fix up in_port. */ in_port = flow->in_port.ofp_port; diff --git a/tests/system-traffic.at b/tests/system-traffic.at index cdd0315..4074eaa 100644 --- a/tests/system-traffic.at +++ b/tests/system-traffic.at @@ -923,3 +923,224 @@ TIME_WAIT src=10.1.1.1 dst=10.1.1.2 sport=<cleared> dport=<cleared> src=10.1.1.2 OVS_TRAFFIC_VSWITCHD_STOP AT_CLEANUP + +AT_SETUP([fragmentation - ipv4 icmp ]) +CHECK_CONNTRACK() +OVS_TRAFFIC_VSWITCHD_START( + [set-fail-mode br0 secure -- ]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") + +dnl Sending ping through conntrack +AT_DATA([flows.txt], [dnl +priority=1,action=drop +priority=10,arp,action=normal +in_port=1,icmp,action=ct(commit,zone=9),2 +in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9) +in_port=2,ct_state=+trk+est-new,icmp,action=1 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Basic connectivity check. +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 larger fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([fragmentation - ipv4 icmp + vlan]) +CHECK_CONNTRACK() +OVS_TRAFFIC_VSWITCHD_START( + [set-fail-mode br0 secure -- ]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "10.1.1.1/24") +ADD_VETH(p1, at_ns1, br0, "10.1.1.2/24") +ADD_VLAN(p0, at_ns0, 100, "10.2.2.1/24") +ADD_VLAN(p1, at_ns1, 100, "10.2.2.2/24") + +dnl Sending ping through conntrack +AT_DATA([flows.txt], [dnl +priority=1,action=drop +priority=10,arp,action=normal +in_port=1,icmp,action=ct(commit,zone=9),2 +in_port=2,ct_state=-trk,icmp,action=ct(table=0,zone=9) +in_port=2,ct_state=+trk+est-new,icmp,action=1 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Basic connectivity check. +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 larger fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.2.2.2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([fragmentation - ipv6 icmp ]) +CHECK_CONNTRACK() +OVS_TRAFFIC_VSWITCHD_START( + [set-fail-mode br0 secure -- ]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "fc00::1/96") +ADD_VETH(p1, at_ns1, br0, "fc00::2/96") + +dnl Sending ping through conntrack +AT_DATA([flows.txt], [dnl +priority=1,action=drop +priority=10,in_port=1,ipv6,action=ct(commit,zone=9),2 +priority=10,in_port=2,ct_state=-trk,ipv6,action=ct(table=0,zone=9) +priority=10,in_port=2,ct_state=+trk+est-new,ipv6,action=1 +icmp6,icmp_type=135,action=normal +icmp6,icmp_type=136,action=normal +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Without this sleep, we get occasional failures due to the following error: +dnl "connect: Cannot assign requested address" +sleep 2; + +dnl Basic connectivity check. +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 larger fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00::2 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([fragmentation - ipv6 icmp + vlan]) +CHECK_CONNTRACK() +OVS_TRAFFIC_VSWITCHD_START( + [set-fail-mode br0 secure -- ]) + +ADD_NAMESPACES(at_ns0, at_ns1) + +ADD_VETH(p0, at_ns0, br0, "fc00::1/96") +ADD_VETH(p1, at_ns1, br0, "fc00::2/96") + +ADD_VLAN(p0, at_ns0, 100, "fc00:1::3/96") +ADD_VLAN(p1, at_ns1, 100, "fc00:1::4/96") + +dnl Sending ping through conntrack +AT_DATA([flows.txt], [dnl +priority=1,action=drop +priority=10,in_port=1,ipv6,action=ct(commit,zone=9),2 +priority=10,in_port=2,ct_state=-trk,ipv6,action=ct(table=0,zone=9) +priority=10,in_port=2,ct_state=+trk+est-new,ipv6,action=1 +icmp6,icmp_type=135,action=normal +icmp6,icmp_type=136,action=normal +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Without this sleep, we get occasional failures due to the following error: +dnl "connect: Cannot assign requested address" +sleep 2; + +dnl Basic connectivity check. +NS_CHECK_EXEC([at_ns0], [ping6 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping6 -s 1600 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Ipv4 larger fragmentation connectivity check. +NS_CHECK_EXEC([at_ns0], [ping6 -s 3200 -q -c 3 -i 0.3 -w 2 fc00:1::4 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP + +AT_SETUP([fragmentation over vxlan tunnel]) +AT_SKIP_IF([! ip link help 2>&1 | grep vxlan >/dev/null]) +CHECK_CONNTRACK() + +OVS_TRAFFIC_VSWITCHD_START( + [set-fail-mode br0 standalone --]) +ADD_BR([br-underlay], [set-fail-mode br-underlay standalone]) +ADD_NAMESPACES(at_ns0) + +dnl Sending ping through conntrack +AT_DATA([flows.txt], [dnl +priority=1,action=drop +priority=10,arp,action=normal +in_port=1,icmp,action=ct(commit,zone=9),LOCAL +in_port=LOCAL,ct_state=-trk,icmp,action=ct(table=0,zone=9) +in_port=LOCAL,ct_state=+trk+est-new,icmp,action=1 +]) + +AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) + +dnl Set up underlay link from host into the namespace using veth pair. +ADD_VETH(p0, at_ns0, br-underlay, "172.31.1.1/24") +AT_CHECK([ip addr add dev br-underlay "172.31.1.100/24"]) +AT_CHECK([ip link set dev br-underlay up]) + +dnl Set up tunnel endpoints on OVS outside the namespace and with a native +dnl linux device inside the namespace. +ADD_OVS_TUNNEL([vxlan], [br0], [at_ns0], [172.31.1.1], [10.1.1.100/24]) +ADD_NATIVE_TUNNEL([vxlan], [at_vxlan1], [at_ns0], [172.31.1.100], [10.1.1.1/24], + [id 0 dstport 4789]) + +dnl First, check the underlay +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 172.31.1.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +dnl Okay, now check the overlay with different packet sizes +NS_CHECK_EXEC([at_ns0], [ping -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) +NS_CHECK_EXEC([at_ns0], [ping -s 1600 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) +NS_CHECK_EXEC([at_ns0], [ping -s 3200 -q -c 3 -i 0.3 -w 2 10.1.1.100 | FORMAT_PING], [0], [dnl +3 packets transmitted, 3 received, 0% packet loss, time 0ms +]) + +OVS_TRAFFIC_VSWITCHD_STOP +AT_CLEANUP