Message ID | 1366596740-18246-1-git-send-email-horms@verge.net.au |
---|---|
State | Not Applicable, archived |
Delegated to: | David Miller |
Headers | show |
On Apr 22, 2013, at 5:12 , ext Simon Horman wrote: > diff --git a/datapath/actions.c b/datapath/actions.c ... > +static int push_mpls(struct sk_buff *skb, > + const struct ovs_action_push_mpls *mpls, > + unsigned *mpls_stack_depth) > +{ > + __be32 *new_mpls_lse; > + int err; > + > + err = make_writable(skb, skb->mac_len + MPLS_HLEN); > + if (unlikely(err)) > + return err; > + > + skb_push(skb, MPLS_HLEN); > + memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), > + skb->mac_len); > + skb_reset_mac_header(skb); > + skb_set_network_header(skb, skb->mac_len); > + > + new_mpls_lse = (__be32 *)skb_network_header(skb); > + *new_mpls_lse = mpls->mpls_lse; > + Note: Now the skb_network_header points to the MPLS header. But see below... > + if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) > + skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse, > + MPLS_HLEN, 0)); > + > + set_ethertype(skb, mpls->mpls_ethertype); > + if (!eth_p_mpls(skb->protocol)) > + (*mpls_stack_depth)++; Here mpls_stack_depth is increased only conditionally. What would be the case where mpls_stack_depth would remain unchanged here? And how does that relate to what follows below... > + return 0; > +} > + > +static int pop_mpls(struct sk_buff *skb, const __be16 *ethertype, > + unsigned *mpls_stack_depth) > +{ > + int err; > + > + err = make_writable(skb, skb->mac_len + MPLS_HLEN); > + if (unlikely(err)) > + return err; > + > + if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) > + skb->csum = csum_sub(skb->csum, > + csum_partial(skb_network_header(skb), > + MPLS_HLEN, 0)); > + > + memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), > + skb->mac_len); > + > + skb_pull(skb, MPLS_HLEN); > + skb_reset_mac_header(skb); > + skb_set_network_header(skb, skb->mac_len); > + > + set_ethertype(skb, *ethertype); > + if (!eth_p_mpls(skb->protocol)) > + (*mpls_stack_depth)--; Ditto? > + return 0; > +} > + > +static int set_mpls(struct sk_buff *skb, const __be32 *mpls_lse) > +{ > + __be32 *stack = (__be32 *)skb_network_header(skb); > + int err; > + > + err = make_writable(skb, skb->mac_len + MPLS_HLEN); > + if (unlikely(err)) > + return err; > + > + if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) { > + __be32 diff[] = { ~(*stack), *mpls_lse }; > + skb->csum = ~csum_partial((char *)diff, sizeof(diff), > + ~skb->csum); > + } > + > + *stack = *mpls_lse; > + > + return 0; > +} > + > /* remove VLAN header from packet and update csum accordingly. */ > static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) > { > @@ -115,6 +210,9 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla > if (!__vlan_put_tag(skb, current_tag)) > return -ENOMEM; > > + /* update mac_len for MPLS functions */ > + skb_reset_mac_len(skb); > + > if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) > skb->csum = csum_add(skb->csum, csum_partial(skb->data > + (2 * ETH_ALEN), VLAN_HLEN, 0)); > @@ -352,13 +450,31 @@ static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key) > return 0; > } > > -static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) > +static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port, > + unsigned mpls_stack_depth) > { > struct vport *vport; > > if (unlikely(!skb)) > return -ENOMEM; > > + /* During action execution the network_header, and mac_len, > + * correspondingly, have tracked the end of the L2 frame (including > + * any VLAN headers), but proper skb output processing of GSO skbs > + * requires the network_header (and mac_len) to track the start of > + * the L3 header instead. These differ in the presence of MPLS > + * headers. > + * > + * mpls_stack_depth is only non-zero if a non-MPLS skb is turned > + * into an MPLS skb via an MPLS push action. This is the only case > + * where an MPLS skb may be a GSO skb. > + */ > + if (mpls_stack_depth) { > + skb->mac_len += MPLS_HLEN * mpls_stack_depth; > + skb_set_network_header(skb, skb->mac_len); > + skb_set_encapsulation_features(skb); > + } > + Here the skb_network_header is changed to point to the L3 header. Is it significant that in some cases (?) mpls_stack_depth may remain at zero, even when a MPLS header was in fact added? (See above). > vport = ovs_vport_rcu(dp, out_port); > if (unlikely(!vport)) { > kfree_skb(skb); -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Mon, Apr 22, 2013 at 01:55:43PM +0000, Rajahalme, Jarno (NSN - FI/Espoo) wrote: > On Apr 22, 2013, at 5:12 , ext Simon Horman wrote: > > > diff --git a/datapath/actions.c b/datapath/actions.c > ... > > +static int push_mpls(struct sk_buff *skb, > > + const struct ovs_action_push_mpls *mpls, > > + unsigned *mpls_stack_depth) > > +{ > > + __be32 *new_mpls_lse; > > + int err; > > + > > + err = make_writable(skb, skb->mac_len + MPLS_HLEN); > > + if (unlikely(err)) > > + return err; > > + > > + skb_push(skb, MPLS_HLEN); > > + memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), > > + skb->mac_len); > > + skb_reset_mac_header(skb); > > + skb_set_network_header(skb, skb->mac_len); > > + > > + new_mpls_lse = (__be32 *)skb_network_header(skb); > > + *new_mpls_lse = mpls->mpls_lse; > > + > > Note: Now the skb_network_header points to the MPLS header. But see below... The reason for this is that pop_mpls(), push_mpls() and set_ethertype() each rely on this to locate the end of the L2 headers and/or the top of the MPLS stack. > > + if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) > > + skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse, > > + MPLS_HLEN, 0)); > > + > > + set_ethertype(skb, mpls->mpls_ethertype); > > + if (!eth_p_mpls(skb->protocol)) > > + (*mpls_stack_depth)++; > > Here mpls_stack_depth is increased only conditionally. What would be the case where mpls_stack_depth would remain unchanged here? And how does that relate to what follows below... It would be unchanged if the packet was originally MPLS. See below... > > > + return 0; > > +} > > + > > +static int pop_mpls(struct sk_buff *skb, const __be16 *ethertype, > > + unsigned *mpls_stack_depth) > > +{ > > + int err; > > + > > + err = make_writable(skb, skb->mac_len + MPLS_HLEN); > > + if (unlikely(err)) > > + return err; > > + > > + if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) > > + skb->csum = csum_sub(skb->csum, > > + csum_partial(skb_network_header(skb), > > + MPLS_HLEN, 0)); > > + > > + memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), > > + skb->mac_len); > > + > > + skb_pull(skb, MPLS_HLEN); > > + skb_reset_mac_header(skb); > > + skb_set_network_header(skb, skb->mac_len); > > + > > + set_ethertype(skb, *ethertype); > > + if (!eth_p_mpls(skb->protocol)) > > + (*mpls_stack_depth)--; > > Ditto? > > > + return 0; > > +} > > + > > +static int set_mpls(struct sk_buff *skb, const __be32 *mpls_lse) > > +{ > > + __be32 *stack = (__be32 *)skb_network_header(skb); > > + int err; > > + > > + err = make_writable(skb, skb->mac_len + MPLS_HLEN); > > + if (unlikely(err)) > > + return err; > > + > > + if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) { > > + __be32 diff[] = { ~(*stack), *mpls_lse }; > > + skb->csum = ~csum_partial((char *)diff, sizeof(diff), > > + ~skb->csum); > > + } > > + > > + *stack = *mpls_lse; > > + > > + return 0; > > +} > > + > > /* remove VLAN header from packet and update csum accordingly. */ > > static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) > > { > > @@ -115,6 +210,9 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla > > if (!__vlan_put_tag(skb, current_tag)) > > return -ENOMEM; > > > > + /* update mac_len for MPLS functions */ > > + skb_reset_mac_len(skb); > > + > > if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) > > skb->csum = csum_add(skb->csum, csum_partial(skb->data > > + (2 * ETH_ALEN), VLAN_HLEN, 0)); > > @@ -352,13 +450,31 @@ static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key) > > return 0; > > } > > > > -static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) > > +static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port, > > + unsigned mpls_stack_depth) > > { > > struct vport *vport; > > > > if (unlikely(!skb)) > > return -ENOMEM; > > > > + /* During action execution the network_header, and mac_len, > > + * correspondingly, have tracked the end of the L2 frame (including > > + * any VLAN headers), but proper skb output processing of GSO skbs > > + * requires the network_header (and mac_len) to track the start of > > + * the L3 header instead. These differ in the presence of MPLS > > + * headers. > > + * > > + * mpls_stack_depth is only non-zero if a non-MPLS skb is turned > > + * into an MPLS skb via an MPLS push action. This is the only case > > + * where an MPLS skb may be a GSO skb. > > + */ > > + if (mpls_stack_depth) { > > + skb->mac_len += MPLS_HLEN * mpls_stack_depth; > > + skb_set_network_header(skb, skb->mac_len); > > + skb_set_encapsulation_features(skb); > > + } > > + > > Here the skb_network_header is changed to point to the L3 header. Is it > significant that in some cases (?) mpls_stack_depth may remain at zero, > even when a MPLS header was in fact added? (See above). With the current code I believe there are the following cases: Input: non-MPLS skb: Output: network header and mac_len correspond to the beginning of the L3 headers Input: MPLS: Output: network header and mac_len correspond to the end of the L2 headers. In the case of MPLS output the end of the L2 headers and the beginning of the L3 headers will differ. As far as I know the network header and mac_len only need to correspond to the beginning of the L3 header if GSO segmentation will occur (actually, some proposed changes to the network stack are required, see "[PATCH 0/2] Small Modifications to GSO to allow segmentation of MPLS"). That only occurs if the skb is GSO. Which in turn can only occur if the recieved packet is non-MPLS. This is because the linux kernel doesn't support MPLS offloads on receive (or anywhere else for that matter). In the case that we have a non-MPLS skb the stack depth starts at zero and is tracked. This is used to update the network header and mac_len. Otherwise the stack depth is unknown and the network header and mac_len are left as-is, corresponding to the end of the L2 headers. Actually, it is possible to tighten up the if clause to be the following, as it is only necessary to update the network header and mac_len for GSO skbs. if (mpls_stack_depth && skb_is_gso(skb)) { ... } It is possible for us to find and track the MPLS stack depth for all cases and to update the network header and mac_len. However I don't think that there is any run-time benefit and it seems expensive to find out what the original stack depth was - I believe it would require parsing the MPLS entire stack for each packet. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Apr 23, 2013, at 4:51 , ext Simon Horman wrote: > On Mon, Apr 22, 2013 at 01:55:43PM +0000, Rajahalme, Jarno (NSN - FI/Espoo) wrote: >> >> Here the skb_network_header is changed to point to the L3 header. Is it >> significant that in some cases (?) mpls_stack_depth may remain at zero, >> even when a MPLS header was in fact added? (See above). > > With the current code I believe there are the following cases: > > Input: non-MPLS skb: Output: network header and mac_len correspond to the > beginning of the L3 headers > Input: MPLS: Output: network header and mac_len correspond to the > end of the L2 headers. > > In the case of MPLS output the end of the L2 headers and the beginning > of the L3 headers will differ. > > > As far as I know the network header and mac_len only need to correspond to > the beginning of the L3 header if GSO segmentation will occur (actually, > some proposed changes to the network stack are required, see "[PATCH 0/2] > Small Modifications to GSO to allow segmentation of MPLS"). That only > occurs if the skb is GSO. Which in turn can only occur if the recieved > packet is non-MPLS. This is because the linux kernel doesn't support > MPLS offloads on receive (or anywhere else for that matter). > > In the case that we have a non-MPLS skb the stack depth starts at zero and > is tracked. This is used to update the network header and mac_len. > Otherwise the stack depth is unknown and the network header and mac_len are > left as-is, corresponding to the end of the L2 headers. > > Actually, it is possible to tighten up the if clause to be the following, > as it is only necessary to update the network header and mac_len for GSO skbs. > > if (mpls_stack_depth && skb_is_gso(skb)) { > ... > } > > It is possible for us to find and track the MPLS stack depth for all cases > and to update the network header and mac_len. However I don't think that > there is any run-time benefit and it seems expensive to find out what the > original stack depth was - I believe it would require parsing the MPLS > entire stack for each packet. > Thanks for explaining this. I think it would be better to keep updating the the network_header and mac_len for the Non-MPLS input packets regardless of the GSO status of the skb. It would be more consistent and less surprising. Jarno -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Apr 23, 2013 at 07:41:37AM +0000, Rajahalme, Jarno (NSN - FI/Espoo) wrote: > > On Apr 23, 2013, at 4:51 , ext Simon Horman wrote: > > > On Mon, Apr 22, 2013 at 01:55:43PM +0000, Rajahalme, Jarno (NSN - FI/Espoo) wrote: > >> > >> Here the skb_network_header is changed to point to the L3 header. Is it > >> significant that in some cases (?) mpls_stack_depth may remain at zero, > >> even when a MPLS header was in fact added? (See above). > > > > With the current code I believe there are the following cases: > > > > Input: non-MPLS skb: Output: network header and mac_len correspond to the > > beginning of the L3 headers > > Input: MPLS: Output: network header and mac_len correspond to the > > end of the L2 headers. > > > > In the case of MPLS output the end of the L2 headers and the beginning > > of the L3 headers will differ. > > > > > > As far as I know the network header and mac_len only need to correspond to > > the beginning of the L3 header if GSO segmentation will occur (actually, > > some proposed changes to the network stack are required, see "[PATCH 0/2] > > Small Modifications to GSO to allow segmentation of MPLS"). That only > > occurs if the skb is GSO. Which in turn can only occur if the recieved > > packet is non-MPLS. This is because the linux kernel doesn't support > > MPLS offloads on receive (or anywhere else for that matter). > > > > In the case that we have a non-MPLS skb the stack depth starts at zero and > > is tracked. This is used to update the network header and mac_len. > > Otherwise the stack depth is unknown and the network header and mac_len are > > left as-is, corresponding to the end of the L2 headers. > > > > Actually, it is possible to tighten up the if clause to be the following, > > as it is only necessary to update the network header and mac_len for GSO skbs. > > > > if (mpls_stack_depth && skb_is_gso(skb)) { > > ... > > } > > > > It is possible for us to find and track the MPLS stack depth for all cases > > and to update the network header and mac_len. However I don't think that > > there is any run-time benefit and it seems expensive to find out what the > > original stack depth was - I believe it would require parsing the MPLS > > entire stack for each packet. > > > > Thanks for explaining this. > > I think it would be better to keep updating the the network_header and mac_len for the Non-MPLS input packets regardless of the GSO status of the skb. It would be more consistent and less surprising. I agree entirely that it would be more consistent and less surprising. But I'm not sure if the cost is worth it. Jesse, do you have an opinion on this? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Apr 23, 2013 at 12:58 AM, Simon Horman <horms@verge.net.au> wrote: > On Tue, Apr 23, 2013 at 07:41:37AM +0000, Rajahalme, Jarno (NSN - FI/Espoo) wrote: >> >> On Apr 23, 2013, at 4:51 , ext Simon Horman wrote: >> >> > On Mon, Apr 22, 2013 at 01:55:43PM +0000, Rajahalme, Jarno (NSN - FI/Espoo) wrote: >> >> >> >> Here the skb_network_header is changed to point to the L3 header. Is it >> >> significant that in some cases (?) mpls_stack_depth may remain at zero, >> >> even when a MPLS header was in fact added? (See above). >> > >> > With the current code I believe there are the following cases: >> > >> > Input: non-MPLS skb: Output: network header and mac_len correspond to the >> > beginning of the L3 headers >> > Input: MPLS: Output: network header and mac_len correspond to the >> > end of the L2 headers. >> > >> > In the case of MPLS output the end of the L2 headers and the beginning >> > of the L3 headers will differ. >> > >> > >> > As far as I know the network header and mac_len only need to correspond to >> > the beginning of the L3 header if GSO segmentation will occur (actually, >> > some proposed changes to the network stack are required, see "[PATCH 0/2] >> > Small Modifications to GSO to allow segmentation of MPLS"). That only >> > occurs if the skb is GSO. Which in turn can only occur if the recieved >> > packet is non-MPLS. This is because the linux kernel doesn't support >> > MPLS offloads on receive (or anywhere else for that matter). >> > >> > In the case that we have a non-MPLS skb the stack depth starts at zero and >> > is tracked. This is used to update the network header and mac_len. >> > Otherwise the stack depth is unknown and the network header and mac_len are >> > left as-is, corresponding to the end of the L2 headers. >> > >> > Actually, it is possible to tighten up the if clause to be the following, >> > as it is only necessary to update the network header and mac_len for GSO skbs. >> > >> > if (mpls_stack_depth && skb_is_gso(skb)) { >> > ... >> > } >> > >> > It is possible for us to find and track the MPLS stack depth for all cases >> > and to update the network header and mac_len. However I don't think that >> > there is any run-time benefit and it seems expensive to find out what the >> > original stack depth was - I believe it would require parsing the MPLS >> > entire stack for each packet. >> > >> >> Thanks for explaining this. >> >> I think it would be better to keep updating the the network_header and mac_len for the Non-MPLS input packets regardless of the GSO status of the skb. It would be more consistent and less surprising. > > I agree entirely that it would be more consistent and less surprising. > But I'm not sure if the cost is worth it. > > Jesse, do you have an opinion on this? In general, I would tend to agree with Jarno that keeping this consistent would be significantly easier to understand. I think the cost is probably not particularly high. However, I also think that having different meanings for the layer pointers inside and outside of OVS is not particularly ideal since it makes the overall system harder to understand. Using network header for the start of the MPLS stack might not be great since it means that we couldn't really take advantage of any actual hardware offloading in the future. Maybe we could use mac_len for that purpose and that would keep things more consistent? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Apr 23, 2013 at 06:47:36PM -0700, Jesse Gross wrote: > On Tue, Apr 23, 2013 at 12:58 AM, Simon Horman <horms@verge.net.au> wrote: > > On Tue, Apr 23, 2013 at 07:41:37AM +0000, Rajahalme, Jarno (NSN - FI/Espoo) wrote: > >> > >> On Apr 23, 2013, at 4:51 , ext Simon Horman wrote: > >> > >> > On Mon, Apr 22, 2013 at 01:55:43PM +0000, Rajahalme, Jarno (NSN - FI/Espoo) wrote: > >> >> > >> >> Here the skb_network_header is changed to point to the L3 header. Is it > >> >> significant that in some cases (?) mpls_stack_depth may remain at zero, > >> >> even when a MPLS header was in fact added? (See above). > >> > > >> > With the current code I believe there are the following cases: > >> > > >> > Input: non-MPLS skb: Output: network header and mac_len correspond to the > >> > beginning of the L3 headers > >> > Input: MPLS: Output: network header and mac_len correspond to the > >> > end of the L2 headers. > >> > > >> > In the case of MPLS output the end of the L2 headers and the beginning > >> > of the L3 headers will differ. > >> > > >> > > >> > As far as I know the network header and mac_len only need to correspond to > >> > the beginning of the L3 header if GSO segmentation will occur (actually, > >> > some proposed changes to the network stack are required, see "[PATCH 0/2] > >> > Small Modifications to GSO to allow segmentation of MPLS"). That only > >> > occurs if the skb is GSO. Which in turn can only occur if the recieved > >> > packet is non-MPLS. This is because the linux kernel doesn't support > >> > MPLS offloads on receive (or anywhere else for that matter). > >> > > >> > In the case that we have a non-MPLS skb the stack depth starts at zero and > >> > is tracked. This is used to update the network header and mac_len. > >> > Otherwise the stack depth is unknown and the network header and mac_len are > >> > left as-is, corresponding to the end of the L2 headers. > >> > > >> > Actually, it is possible to tighten up the if clause to be the following, > >> > as it is only necessary to update the network header and mac_len for GSO skbs. > >> > > >> > if (mpls_stack_depth && skb_is_gso(skb)) { > >> > ... > >> > } > >> > > >> > It is possible for us to find and track the MPLS stack depth for all cases > >> > and to update the network header and mac_len. However I don't think that > >> > there is any run-time benefit and it seems expensive to find out what the > >> > original stack depth was - I believe it would require parsing the MPLS > >> > entire stack for each packet. > >> > > >> > >> Thanks for explaining this. > >> > >> I think it would be better to keep updating the the network_header and mac_len for the Non-MPLS input packets regardless of the GSO status of the skb. It would be more consistent and less surprising. > > > > I agree entirely that it would be more consistent and less surprising. > > But I'm not sure if the cost is worth it. > > > > Jesse, do you have an opinion on this? > > In general, I would tend to agree with Jarno that keeping this > consistent would be significantly easier to understand. I think the > cost is probably not particularly high. I think it would be become high for large MPLS stack depths. But I'm happy to wear that if you are. > However, I also think that having different meanings for the layer > pointers inside and outside of OVS is not particularly ideal since it > makes the overall system harder to understand. Using network header > for the start of the MPLS stack might not be great since it means that > we couldn't really take advantage of any actual hardware offloading in > the future. Maybe we could use mac_len for that purpose and that would > keep things more consistent? To clarify, your suggestion is: mac_len: corresponds to the top of the MPLS stack network_header: corresponds to the bottom of the MPLS stack If so, yes I think that could work and I will see about making it so. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Apr 24, 2013 at 2:12 AM, Simon Horman <horms@verge.net.au> wrote: > On Tue, Apr 23, 2013 at 06:47:36PM -0700, Jesse Gross wrote: >> On Tue, Apr 23, 2013 at 12:58 AM, Simon Horman <horms@verge.net.au> wrote: >> > On Tue, Apr 23, 2013 at 07:41:37AM +0000, Rajahalme, Jarno (NSN - FI/Espoo) wrote: >> >> >> >> On Apr 23, 2013, at 4:51 , ext Simon Horman wrote: >> >> >> >> > On Mon, Apr 22, 2013 at 01:55:43PM +0000, Rajahalme, Jarno (NSN - FI/Espoo) wrote: >> >> >> >> >> >> Here the skb_network_header is changed to point to the L3 header. Is it >> >> >> significant that in some cases (?) mpls_stack_depth may remain at zero, >> >> >> even when a MPLS header was in fact added? (See above). >> >> > >> >> > With the current code I believe there are the following cases: >> >> > >> >> > Input: non-MPLS skb: Output: network header and mac_len correspond to the >> >> > beginning of the L3 headers >> >> > Input: MPLS: Output: network header and mac_len correspond to the >> >> > end of the L2 headers. >> >> > >> >> > In the case of MPLS output the end of the L2 headers and the beginning >> >> > of the L3 headers will differ. >> >> > >> >> > >> >> > As far as I know the network header and mac_len only need to correspond to >> >> > the beginning of the L3 header if GSO segmentation will occur (actually, >> >> > some proposed changes to the network stack are required, see "[PATCH 0/2] >> >> > Small Modifications to GSO to allow segmentation of MPLS"). That only >> >> > occurs if the skb is GSO. Which in turn can only occur if the recieved >> >> > packet is non-MPLS. This is because the linux kernel doesn't support >> >> > MPLS offloads on receive (or anywhere else for that matter). >> >> > >> >> > In the case that we have a non-MPLS skb the stack depth starts at zero and >> >> > is tracked. This is used to update the network header and mac_len. >> >> > Otherwise the stack depth is unknown and the network header and mac_len are >> >> > left as-is, corresponding to the end of the L2 headers. >> >> > >> >> > Actually, it is possible to tighten up the if clause to be the following, >> >> > as it is only necessary to update the network header and mac_len for GSO skbs. >> >> > >> >> > if (mpls_stack_depth && skb_is_gso(skb)) { >> >> > ... >> >> > } >> >> > >> >> > It is possible for us to find and track the MPLS stack depth for all cases >> >> > and to update the network header and mac_len. However I don't think that >> >> > there is any run-time benefit and it seems expensive to find out what the >> >> > original stack depth was - I believe it would require parsing the MPLS >> >> > entire stack for each packet. >> >> > >> >> >> >> Thanks for explaining this. >> >> >> >> I think it would be better to keep updating the the network_header and mac_len for the Non-MPLS input packets regardless of the GSO status of the skb. It would be more consistent and less surprising. >> > >> > I agree entirely that it would be more consistent and less surprising. >> > But I'm not sure if the cost is worth it. >> > >> > Jesse, do you have an opinion on this? >> >> In general, I would tend to agree with Jarno that keeping this >> consistent would be significantly easier to understand. I think the >> cost is probably not particularly high. > > I think it would be become high for large MPLS stack depths. > But I'm happy to wear that if you are. I guess, although in practice having more than 2 or 3 MPLS labels would be extremely rare so I'm not sure that it's worth optimizing at this point. >> However, I also think that having different meanings for the layer >> pointers inside and outside of OVS is not particularly ideal since it >> makes the overall system harder to understand. Using network header >> for the start of the MPLS stack might not be great since it means that >> we couldn't really take advantage of any actual hardware offloading in >> the future. Maybe we could use mac_len for that purpose and that would >> keep things more consistent? > > To clarify, your suggestion is: > > mac_len: corresponds to the top of the MPLS stack > network_header: corresponds to the bottom of the MPLS stack > > If so, yes I think that could work and I will see about making it so. Yes, that's what I was thinking. (Although to be more precise, network_header would point to the start of the L3 header, as it does currently. In practice, that's the same as the bottom of the MPLS stack in the vast majority of situations and in all cases that OVS supports.) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Apr 24, 2013 at 10:51:37AM -0700, Jesse Gross wrote: > On Wed, Apr 24, 2013 at 2:12 AM, Simon Horman <horms@verge.net.au> wrote: > > On Tue, Apr 23, 2013 at 06:47:36PM -0700, Jesse Gross wrote: > >> On Tue, Apr 23, 2013 at 12:58 AM, Simon Horman <horms@verge.net.au> wrote: > >> > On Tue, Apr 23, 2013 at 07:41:37AM +0000, Rajahalme, Jarno (NSN - FI/Espoo) wrote: > >> >> > >> >> On Apr 23, 2013, at 4:51 , ext Simon Horman wrote: > >> >> > >> >> > On Mon, Apr 22, 2013 at 01:55:43PM +0000, Rajahalme, Jarno (NSN - FI/Espoo) wrote: > >> >> >> > >> >> >> Here the skb_network_header is changed to point to the L3 header. Is it > >> >> >> significant that in some cases (?) mpls_stack_depth may remain at zero, > >> >> >> even when a MPLS header was in fact added? (See above). > >> >> > > >> >> > With the current code I believe there are the following cases: > >> >> > > >> >> > Input: non-MPLS skb: Output: network header and mac_len correspond to the > >> >> > beginning of the L3 headers > >> >> > Input: MPLS: Output: network header and mac_len correspond to the > >> >> > end of the L2 headers. > >> >> > > >> >> > In the case of MPLS output the end of the L2 headers and the beginning > >> >> > of the L3 headers will differ. > >> >> > > >> >> > > >> >> > As far as I know the network header and mac_len only need to correspond to > >> >> > the beginning of the L3 header if GSO segmentation will occur (actually, > >> >> > some proposed changes to the network stack are required, see "[PATCH 0/2] > >> >> > Small Modifications to GSO to allow segmentation of MPLS"). That only > >> >> > occurs if the skb is GSO. Which in turn can only occur if the recieved > >> >> > packet is non-MPLS. This is because the linux kernel doesn't support > >> >> > MPLS offloads on receive (or anywhere else for that matter). > >> >> > > >> >> > In the case that we have a non-MPLS skb the stack depth starts at zero and > >> >> > is tracked. This is used to update the network header and mac_len. > >> >> > Otherwise the stack depth is unknown and the network header and mac_len are > >> >> > left as-is, corresponding to the end of the L2 headers. > >> >> > > >> >> > Actually, it is possible to tighten up the if clause to be the following, > >> >> > as it is only necessary to update the network header and mac_len for GSO skbs. > >> >> > > >> >> > if (mpls_stack_depth && skb_is_gso(skb)) { > >> >> > ... > >> >> > } > >> >> > > >> >> > It is possible for us to find and track the MPLS stack depth for all cases > >> >> > and to update the network header and mac_len. However I don't think that > >> >> > there is any run-time benefit and it seems expensive to find out what the > >> >> > original stack depth was - I believe it would require parsing the MPLS > >> >> > entire stack for each packet. > >> >> > > >> >> > >> >> Thanks for explaining this. > >> >> > >> >> I think it would be better to keep updating the the network_header and mac_len for the Non-MPLS input packets regardless of the GSO status of the skb. It would be more consistent and less surprising. > >> > > >> > I agree entirely that it would be more consistent and less surprising. > >> > But I'm not sure if the cost is worth it. > >> > > >> > Jesse, do you have an opinion on this? > >> > >> In general, I would tend to agree with Jarno that keeping this > >> consistent would be significantly easier to understand. I think the > >> cost is probably not particularly high. > > > > I think it would be become high for large MPLS stack depths. > > But I'm happy to wear that if you are. > > I guess, although in practice having more than 2 or 3 MPLS labels > would be extremely rare so I'm not sure that it's worth optimizing at > this point. Ok, agreed. I have added some code accordingly. > >> However, I also think that having different meanings for the layer > >> pointers inside and outside of OVS is not particularly ideal since it > >> makes the overall system harder to understand. Using network header > >> for the start of the MPLS stack might not be great since it means that > >> we couldn't really take advantage of any actual hardware offloading in > >> the future. Maybe we could use mac_len for that purpose and that would > >> keep things more consistent? > > > > To clarify, your suggestion is: > > > > mac_len: corresponds to the top of the MPLS stack > > network_header: corresponds to the bottom of the MPLS stack > > > > If so, yes I think that could work and I will see about making it so. > > Yes, that's what I was thinking. (Although to be more precise, > network_header would point to the start of the L3 header, as it does > currently. In practice, that's the same as the bottom of the MPLS > stack in the vast majority of situations and in all cases that OVS > supports.) Yes, thats what I should have said. I seem to have this portion working. I am still struggling to get TCP checksums working without my hack to skb_segment(). -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/acinclude.m4 b/acinclude.m4 index 911a23d..04da8a4 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -254,6 +254,13 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [consume_skb]) OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [skb_frag_page]) OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [skb_reset_mac_len]) + # If encapsulation_features is merged upstream then a release number may + # be used to select compatibility code and the following check may be + # removed. This check is here for now to allow Open vSwtich to provide + # an example of how encapsulation_features may be used to provide + # GSO of non-MPLS GSO skbs that are turned into MPLS GSO skbs + # using MPLS push actions + OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [encapsulation_features]) OVS_GREP_IFELSE([$KSRC/include/linux/string.h], [kmemdup], [], [OVS_GREP_IFELSE([$KSRC/include/linux/slab.h], [kmemdup])]) diff --git a/datapath/actions.c b/datapath/actions.c index 0dac658..6e96931 100644 --- a/datapath/actions.c +++ b/datapath/actions.c @@ -38,6 +38,7 @@ #include "vport.h" static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + unsigned *mpls_stack_depth, const struct nlattr *attr, int len, bool keep_skb); static int make_writable(struct sk_buff *skb, int write_len) @@ -48,6 +49,100 @@ static int make_writable(struct sk_buff *skb, int write_len) return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); } +static void set_ethertype(struct sk_buff *skb, const __be16 ethertype) +{ + /* skb_mac_header hdr is not used to locate the ethertype to + * set as it will be incorrect in the presence of VLAN tags + */ + struct ethhdr *hdr = (struct ethhdr *)(skb_network_header(skb) - + ETH_HLEN); + if (hdr->h_proto == ethertype) + return; + hdr->h_proto = ethertype; + if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) { + __be16 diff[] = { ~hdr->h_proto, ethertype }; + skb->csum = ~csum_partial((char *)diff, sizeof(diff), + ~skb->csum); + } +} + +static int push_mpls(struct sk_buff *skb, + const struct ovs_action_push_mpls *mpls, + unsigned *mpls_stack_depth) +{ + __be32 *new_mpls_lse; + int err; + + err = make_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + skb_push(skb, MPLS_HLEN); + memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb->mac_len); + + new_mpls_lse = (__be32 *)skb_network_header(skb); + *new_mpls_lse = mpls->mpls_lse; + + if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse, + MPLS_HLEN, 0)); + + set_ethertype(skb, mpls->mpls_ethertype); + if (!eth_p_mpls(skb->protocol)) + (*mpls_stack_depth)++; + return 0; +} + +static int pop_mpls(struct sk_buff *skb, const __be16 *ethertype, + unsigned *mpls_stack_depth) +{ + int err; + + err = make_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) + skb->csum = csum_sub(skb->csum, + csum_partial(skb_network_header(skb), + MPLS_HLEN, 0)); + + memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb), + skb->mac_len); + + skb_pull(skb, MPLS_HLEN); + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb->mac_len); + + set_ethertype(skb, *ethertype); + if (!eth_p_mpls(skb->protocol)) + (*mpls_stack_depth)--; + return 0; +} + +static int set_mpls(struct sk_buff *skb, const __be32 *mpls_lse) +{ + __be32 *stack = (__be32 *)skb_network_header(skb); + int err; + + err = make_writable(skb, skb->mac_len + MPLS_HLEN); + if (unlikely(err)) + return err; + + if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) { + __be32 diff[] = { ~(*stack), *mpls_lse }; + skb->csum = ~csum_partial((char *)diff, sizeof(diff), + ~skb->csum); + } + + *stack = *mpls_lse; + + return 0; +} + /* remove VLAN header from packet and update csum accordingly. */ static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) { @@ -115,6 +210,9 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla if (!__vlan_put_tag(skb, current_tag)) return -ENOMEM; + /* update mac_len for MPLS functions */ + skb_reset_mac_len(skb); + if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) skb->csum = csum_add(skb->csum, csum_partial(skb->data + (2 * ETH_ALEN), VLAN_HLEN, 0)); @@ -352,13 +450,31 @@ static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key) return 0; } -static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port, + unsigned mpls_stack_depth) { struct vport *vport; if (unlikely(!skb)) return -ENOMEM; + /* During action execution the network_header, and mac_len, + * correspondingly, have tracked the end of the L2 frame (including + * any VLAN headers), but proper skb output processing of GSO skbs + * requires the network_header (and mac_len) to track the start of + * the L3 header instead. These differ in the presence of MPLS + * headers. + * + * mpls_stack_depth is only non-zero if a non-MPLS skb is turned + * into an MPLS skb via an MPLS push action. This is the only case + * where an MPLS skb may be a GSO skb. + */ + if (mpls_stack_depth) { + skb->mac_len += MPLS_HLEN * mpls_stack_depth; + skb_set_network_header(skb, skb->mac_len); + skb_set_encapsulation_features(skb); + } + vport = ovs_vport_rcu(dp, out_port); if (unlikely(!vport)) { kfree_skb(skb); @@ -398,7 +514,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, } static int sample(struct datapath *dp, struct sk_buff *skb, - const struct nlattr *attr) + unsigned *mpls_stack_depth, const struct nlattr *attr) { const struct nlattr *acts_list = NULL; const struct nlattr *a; @@ -418,8 +534,9 @@ static int sample(struct datapath *dp, struct sk_buff *skb, } } - return do_execute_actions(dp, skb, nla_data(acts_list), - nla_len(acts_list), true); + return do_execute_actions(dp, skb, mpls_stack_depth, + nla_data(acts_list), nla_len(acts_list), + true); } static int execute_set_action(struct sk_buff *skb, @@ -459,13 +576,23 @@ static int execute_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_UDP: err = set_udp(skb, nla_data(nested_attr)); break; + + case OVS_KEY_ATTR_MPLS: + err = set_mpls(skb, nla_data(nested_attr)); + break; } return err; } -/* Execute a list of actions against 'skb'. */ +/* Execute a list of actions against 'skb'. + * + * The stack depth is only tracked in the case of a non-MPLS packet + * that becomes MPLS via an MPLS push action. The stack depth + * is passed to do_output() in order to allow it to prepare the + * skb for possible GSO segmentation. */ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, + unsigned *mpls_stack_depth, const struct nlattr *attr, int len, bool keep_skb) { /* Every output action needs a separate clone of 'skb', but the common @@ -481,7 +608,8 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, int err = 0; if (prev_port != -1) { - do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port); + do_output(dp, skb_clone(skb, GFP_ATOMIC), + prev_port, *mpls_stack_depth); prev_port = -1; } @@ -494,6 +622,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, output_userspace(dp, skb, a); break; + case OVS_ACTION_ATTR_PUSH_MPLS: + err = push_mpls(skb, nla_data(a), mpls_stack_depth); + break; + + case OVS_ACTION_ATTR_POP_MPLS: + err = pop_mpls(skb, nla_data(a), mpls_stack_depth); + break; + case OVS_ACTION_ATTR_PUSH_VLAN: err = push_vlan(skb, nla_data(a)); if (unlikely(err)) /* skb already freed. */ @@ -509,7 +645,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, break; case OVS_ACTION_ATTR_SAMPLE: - err = sample(dp, skb, a); + err = sample(dp, skb, mpls_stack_depth, a); break; } @@ -523,7 +659,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, if (keep_skb) skb = skb_clone(skb, GFP_ATOMIC); - do_output(dp, skb, prev_port); + do_output(dp, skb, prev_port, *mpls_stack_depth); } else if (!keep_skb) consume_skb(skb); @@ -556,6 +692,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb) struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); struct loop_counter *loop; int error; + unsigned mpls_stack_depth = 0; /* Check whether we've looped too much. */ loop = &__get_cpu_var(loop_counters); @@ -568,7 +705,7 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb) } OVS_CB(skb)->tun_key = NULL; - error = do_execute_actions(dp, skb, acts->actions, + error = do_execute_actions(dp, skb, &mpls_stack_depth, acts->actions, acts->actions_len, false); /* Check whether sub-actions looped too much. */ diff --git a/datapath/datapath.c b/datapath/datapath.c index 3cb58b0..e4ed5e0 100644 --- a/datapath/datapath.c +++ b/datapath/datapath.c @@ -550,13 +550,26 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, int st_off a->nla_len = sfa->actions_len - st_offset; } -static int validate_and_copy_actions(const struct nlattr *attr, +struct eth_types { + size_t depth; + __be16 types[SAMPLE_ACTION_DEPTH]; +}; + +static void eth_types_set(struct eth_types *types, size_t depth, __be16 type) +{ + types->depth = depth; + types->types[depth] = type; +} + +static int validate_and_copy_actions__(const struct nlattr *attr, const struct sw_flow_key *key, int depth, - struct sw_flow_actions **sfa); + struct sw_flow_actions **sfa, + struct eth_types *eth_types); static int validate_and_copy_sample(const struct nlattr *attr, const struct sw_flow_key *key, int depth, - struct sw_flow_actions **sfa) + struct sw_flow_actions **sfa, + struct eth_types *eth_types) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; @@ -592,7 +605,8 @@ static int validate_and_copy_sample(const struct nlattr *attr, if (st_acts < 0) return st_acts; - err = validate_and_copy_actions(actions, key, depth + 1, sfa); + err = validate_and_copy_actions__(actions, key, depth + 1, sfa, + eth_types); if (err) return err; @@ -602,12 +616,12 @@ static int validate_and_copy_sample(const struct nlattr *attr, return 0; } -static int validate_tp_port(const struct sw_flow_key *flow_key) +static int validate_tp_port(const struct sw_flow_key *flow_key, __be16 eth_type) { - if (flow_key->eth.type == htons(ETH_P_IP)) { + if (eth_type == htons(ETH_P_IP)) { if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst) return 0; - } else if (flow_key->eth.type == htons(ETH_P_IPV6)) { + } else if (eth_type == htons(ETH_P_IPV6)) { if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst) return 0; } @@ -638,7 +652,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, static int validate_set(const struct nlattr *a, const struct sw_flow_key *flow_key, struct sw_flow_actions **sfa, - bool *set_tun) + bool *set_tun, struct eth_types *eth_types) { const struct nlattr *ovs_key = nla_data(a); int key_type = nla_type(ovs_key); @@ -675,9 +689,12 @@ static int validate_set(const struct nlattr *a, return err; break; - case OVS_KEY_ATTR_IPV4: - if (flow_key->eth.type != htons(ETH_P_IP)) - return -EINVAL; + case OVS_KEY_ATTR_IPV4: { + size_t i; + + for (i = 0; i < eth_types->depth; i++) + if (eth_types->types[i] != htons(ETH_P_IP)) + return -EINVAL; if (!flow_key->ip.proto) return -EINVAL; @@ -690,10 +707,14 @@ static int validate_set(const struct nlattr *a, return -EINVAL; break; + } - case OVS_KEY_ATTR_IPV6: - if (flow_key->eth.type != htons(ETH_P_IPV6)) - return -EINVAL; + case OVS_KEY_ATTR_IPV6: { + size_t i; + + for (i = 0; i < eth_types->depth; i++) + if (eth_types->types[i] != htons(ETH_P_IPV6)) + return -EINVAL; if (!flow_key->ip.proto) return -EINVAL; @@ -709,18 +730,37 @@ static int validate_set(const struct nlattr *a, return -EINVAL; break; + } + + case OVS_KEY_ATTR_TCP: { + size_t i; - case OVS_KEY_ATTR_TCP: if (flow_key->ip.proto != IPPROTO_TCP) return -EINVAL; - return validate_tp_port(flow_key); + for (i = 0; i < eth_types->depth; i++) + if (validate_tp_port(flow_key, eth_types->types[i])) + return -EINVAL; + } - case OVS_KEY_ATTR_UDP: + case OVS_KEY_ATTR_UDP: { + size_t i; if (flow_key->ip.proto != IPPROTO_UDP) return -EINVAL; - return validate_tp_port(flow_key); + for (i = 0; i < eth_types->depth; i++) + if (validate_tp_port(flow_key, eth_types->types[i])) + return -EINVAL; + } + + case OVS_KEY_ATTR_MPLS: { + size_t i; + + for (i = 0; i < eth_types->depth; i++) + if (!eth_p_mpls(eth_types->types[i])) + return -EINVAL; + break; + } default: return -EINVAL; @@ -764,10 +804,10 @@ static int copy_action(const struct nlattr *from, return 0; } -static int validate_and_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, - int depth, - struct sw_flow_actions **sfa) +static int validate_and_copy_actions__(const struct nlattr *attr, + const struct sw_flow_key *key, int depth, + struct sw_flow_actions **sfa, + struct eth_types *eth_types) { const struct nlattr *a; int rem, err; @@ -775,11 +815,29 @@ static int validate_and_copy_actions(const struct nlattr *attr, if (depth >= SAMPLE_ACTION_DEPTH) return -EOVERFLOW; + /* Due to the sample action there may be more than one possibility + * for the current ethernet type. They all need to be verified. + * + * This is handled by tracking a stack of ethernet types, one for + * each (sample) depth of validation. Here the ethernet type for + * the current depth is pushed onto the stack. It may be modified + * as by actions are validated. When a modification occurs the + * ethernet types for higher stack-depths are popped off the stack. + * All entries on the stack are checked when validating the + * ethernet type required by an action. + */ + if (!depth) + eth_types_set(eth_types, 0, key->eth.type); + else + eth_types_set(eth_types, depth, eth_types->types[depth - 1]); + nla_for_each_nested(a, attr, rem) { /* Expected argument lengths, (u32)-1 for variable length. */ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, + [OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls), + [OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16), [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), [OVS_ACTION_ATTR_POP_VLAN] = 0, [OVS_ACTION_ATTR_SET] = (u32)-1, @@ -810,6 +868,35 @@ static int validate_and_copy_actions(const struct nlattr *attr, return -EINVAL; break; + case OVS_ACTION_ATTR_PUSH_MPLS: { + const struct ovs_action_push_mpls *mpls = nla_data(a); + if (!eth_p_mpls(mpls->mpls_ethertype)) + return -EINVAL; + eth_types_set(eth_types, depth, mpls->mpls_ethertype); + break; + } + + case OVS_ACTION_ATTR_POP_MPLS: { + size_t i; + + for (i = 0; i < eth_types->depth; i++) + if (eth_types->types[i] != htons(ETH_P_IP)) + return -EINVAL; + + /* Disallow subsequent l2.5+ set and mpls_pop actions + * as there is no check here to ensure that the new + * eth_type is valid and thus set actions could + * write off the end of the packet or otherwise + * corrupt it. + * + * Support for these actions that after mpls_pop + * using packet recirculation is planned. + * are planned to be supported using using packet + * recirculation. + */ + eth_types_set(eth_types, depth, htons(0)); + break; + } case OVS_ACTION_ATTR_POP_VLAN: break; @@ -823,13 +910,14 @@ static int validate_and_copy_actions(const struct nlattr *attr, break; case OVS_ACTION_ATTR_SET: - err = validate_set(a, key, sfa, &skip_copy); + err = validate_set(a, key, sfa, &skip_copy, eth_types); if (err) return err; break; case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(a, key, depth, sfa); + err = validate_and_copy_sample(a, key, depth, sfa, + eth_types); if (err) return err; skip_copy = true; @@ -851,6 +939,14 @@ static int validate_and_copy_actions(const struct nlattr *attr, return 0; } +static int validate_and_copy_actions(const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa) +{ + struct eth_types eth_type; + return validate_and_copy_actions__(attr, key, 0, sfa, ð_type); +} + static void clear_stats(struct sw_flow *flow) { flow->used = 0; @@ -915,7 +1011,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(acts)) goto err_flow_free; - err = validate_and_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts); + err = validate_and_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, &acts); rcu_assign_pointer(flow->sf_acts, acts); if (err) goto err_flow_free; @@ -1251,7 +1347,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(acts)) goto error; - error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0, &acts); + error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &acts); if (error) goto err_kfree; } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) { diff --git a/datapath/datapath.h b/datapath/datapath.h index ad59a3a..4f0f4e1 100644 --- a/datapath/datapath.h +++ b/datapath/datapath.h @@ -202,4 +202,6 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 portid, u32 seq, int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb); void ovs_dp_notify_wq(struct work_struct *work); + +unsigned char *skb_cb_mpls_stack(const struct sk_buff *skb); #endif /* datapath.h */ diff --git a/datapath/flow.c b/datapath/flow.c index 3ce926e..a9d6434 100644 --- a/datapath/flow.c +++ b/datapath/flow.c @@ -648,6 +648,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, return -ENOMEM; skb_reset_network_header(skb); + skb_reset_mac_len(skb); __skb_push(skb, skb->data - skb_mac_header(skb)); /* Network layer. */ @@ -730,6 +731,13 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN); key_len = SW_FLOW_KEY_OFFSET(ipv4.arp); } + } else if (eth_p_mpls(key->eth.type)) { + error = check_header(skb, MPLS_HLEN); + if (unlikely(error)) + goto out; + + key_len = SW_FLOW_KEY_OFFSET(mpls.top_lse); + memcpy(&key->mpls.top_lse, skb_network_header(skb), MPLS_HLEN); } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ @@ -848,6 +856,9 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), [OVS_KEY_ATTR_TUNNEL] = -1, + + /* Not upstream. */ + [OVS_KEY_ATTR_MPLS] = sizeof(struct ovs_key_mpls), }; static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, @@ -1254,6 +1265,15 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, swkey->ip.proto = ntohs(arp_key->arp_op); memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN); memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN); + } else if (eth_p_mpls(swkey->eth.type)) { + const struct ovs_key_mpls *mpls_key; + if (!(attrs & (1ULL << OVS_KEY_ATTR_MPLS))) + return -EINVAL; + attrs &= ~(1ULL << OVS_KEY_ATTR_MPLS); + + key_len = SW_FLOW_KEY_OFFSET(mpls.top_lse); + mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); + swkey->mpls.top_lse = mpls_key->mpls_lse; } if (attrs) @@ -1420,6 +1440,14 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) arp_key->arp_op = htons(swkey->ip.proto); memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN); memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN); + } else if (eth_p_mpls(swkey->eth.type)) { + struct ovs_key_mpls *mpls_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key)); + if (!nla) + goto nla_put_failure; + mpls_key = nla_data(nla); + mpls_key->mpls_lse = swkey->mpls.top_lse; } if ((swkey->eth.type == htons(ETH_P_IP) || diff --git a/datapath/flow.h b/datapath/flow.h index dba66cf..2e3083b 100644 --- a/datapath/flow.h +++ b/datapath/flow.h @@ -72,12 +72,17 @@ struct sw_flow_key { __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ __be16 type; /* Ethernet frame type. */ } eth; - struct { - u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ - u8 tos; /* IP ToS. */ - u8 ttl; /* IP TTL/hop limit. */ - u8 frag; /* One of OVS_FRAG_TYPE_*. */ - } ip; + union { + struct { + __be32 top_lse; /* top label stack entry */ + } mpls; + struct { + u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ + u8 tos; /* IP ToS. */ + u8 ttl; /* IP TTL/hop limit. */ + u8 frag; /* One of OVS_FRAG_TYPE_*. */ + } ip; + }; union { struct { struct { @@ -143,6 +148,8 @@ struct arp_eth_header { unsigned char ar_tip[4]; /* target IP address */ } __packed; +#define MPLS_HLEN 4 + int ovs_flow_init(void); void ovs_flow_exit(void); @@ -204,4 +211,10 @@ int ipv4_tun_from_nlattr(const struct nlattr *attr, int ipv4_tun_to_nlattr(struct sk_buff *skb, const struct ovs_key_ipv4_tunnel *tun_key); +static inline bool eth_p_mpls(__be16 eth_type) +{ + return eth_type == htons(ETH_P_MPLS_UC) || + eth_type == htons(ETH_P_MPLS_MC); +} + #endif /* flow.h */ diff --git a/datapath/linux/compat/include/linux/skbuff.h b/datapath/linux/compat/include/linux/skbuff.h index d485b39..f620a0a 100644 --- a/datapath/linux/compat/include/linux/skbuff.h +++ b/datapath/linux/compat/include/linux/skbuff.h @@ -251,4 +251,14 @@ static inline void skb_reset_mac_len(struct sk_buff *skb) skb->mac_len = skb->network_header - skb->mac_header; } #endif + +#ifdef HAVE_ENCAPSULATION_FEATURES +static inline void skb_set_encapsulation_features(struct sk_buff *skb) +{ + skb->encapsulation_features = 1; +} +#else +/* MPLS GSO is not supported */ +static inline void skb_set_encapsulation_features(struct sk_buff *skb) { } +#endif #endif diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index bd2f05f..e890fd8 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -287,7 +287,9 @@ enum ovs_key_attr { OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ #endif - OVS_KEY_ATTR_MPLS = 62, /* struct ovs_key_mpls */ + OVS_KEY_ATTR_MPLS = 62, /* array of struct ovs_key_mpls. + * The implementation may restrict + * the accepted length of the array. */ __OVS_KEY_ATTR_MAX }; @@ -330,7 +332,7 @@ struct ovs_key_ethernet { }; struct ovs_key_mpls { - __be32 mpls_top_lse; + __be32 mpls_lse; }; struct ovs_key_ipv4 { diff --git a/lib/odp-util.c b/lib/odp-util.c index 751c1c9..3206dc9 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -906,7 +906,7 @@ format_odp_key_attr(const struct nlattr *a, struct ds *ds) case OVS_KEY_ATTR_MPLS: { const struct ovs_key_mpls *mpls_key = nl_attr_get(a); ds_put_char(ds, '('); - format_mpls_lse(ds, mpls_key->mpls_top_lse); + format_mpls_lse(ds, mpls_key->mpls_lse); ds_put_char(ds, ')'); break; } @@ -1231,7 +1231,7 @@ parse_odp_key_attr(const char *s, const struct simap *port_names, mpls = nl_msg_put_unspec_uninit(key, OVS_KEY_ATTR_MPLS, sizeof *mpls); - mpls->mpls_top_lse = mpls_lse_from_components(label, tc, ttl, bos); + mpls->mpls_lse = mpls_lse_from_components(label, tc, ttl, bos); return n; } } @@ -1594,7 +1594,7 @@ odp_flow_key_from_flow(struct ofpbuf *buf, const struct flow *flow, mpls_key = nl_msg_put_unspec_uninit(buf, OVS_KEY_ATTR_MPLS, sizeof *mpls_key); - mpls_key->mpls_top_lse = flow->mpls_lse; + mpls_key->mpls_lse = flow->mpls_lse; } if (is_ip_any(flow) && !(flow->nw_frag & FLOW_NW_FRAG_LATER)) { @@ -2250,7 +2250,7 @@ commit_mpls_action(const struct flow *flow, struct flow *base, } else { struct ovs_key_mpls mpls_key; - mpls_key.mpls_top_lse = flow->mpls_lse; + mpls_key.mpls_lse = flow->mpls_lse; commit_set_action(odp_actions, OVS_KEY_ATTR_MPLS, &mpls_key, sizeof(mpls_key)); }