@@ -256,6 +256,13 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [consume_skb])
OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [skb_frag_page])
OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [skb_reset_mac_len])
+  # If inner_protocol is merged upstream then a release number may
+  # be used to select compatibility code and the following check may be
+  # removed. This check is here for now to allow Open vSwitch to provide
+  # an example of how inner_protocol may be used to provide
+  # GSO of non-MPLS GSO skbs that are turned into MPLS GSO skbs
+  # using MPLS push actions.
+ OVS_GREP_IFELSE([$KSRC/include/linux/skbuff.h], [inner_protocol])
OVS_GREP_IFELSE([$KSRC/include/linux/string.h], [kmemdup], [],
[OVS_GREP_IFELSE([$KSRC/include/linux/slab.h], [kmemdup])])
@@ -48,6 +48,120 @@ static int make_writable(struct sk_buff *skb, int write_len)
return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
}
+/* The end of the mac header.
+ *
+ * For non-MPLS skbs this will correspond to the network header.
+ * For MPLS skbs it will be before the network_header as the MPLS
+ * label stack lies between the end of the mac header and the network
+ * header. That is, for MPLS skbs the end of the mac header
+ * is the top of the MPLS label stack.
+ */
+static inline unsigned char *skb_mac_header_end(const struct sk_buff *skb)
+{
+	return skb_mac_header(skb) + skb->mac_len;
+}
+
+/* Return a pointer to the ethertype field that immediately precedes the
+ * end of the mac header — the field rewritten by MPLS push/pop actions.
+ */
+static __be16 *get_ethertype(struct sk_buff *skb)
+{
+	/* A fixed offset from skb_mac_header() is not used to locate the
+	 * ethertype to set as it would be incorrect in the presence of
+	 * VLAN tags; instead work backwards from the end of the mac header.
+	 */
+	struct ethhdr *hdr = (struct ethhdr *)(skb_mac_header_end(skb) -
+					       ETH_HLEN);
+	return &hdr->h_proto;
+}
+
+/* Overwrite the packet's ethertype, keeping an OVS_CSUM_COMPLETE
+ * checksum consistent with the rewritten 16-bit field.
+ */
+static void set_ethertype(struct sk_buff *skb, __be16 ethertype)
+{
+	__be16 *skb_ethertype = get_ethertype(skb);
+	if (*skb_ethertype == ethertype)
+		return;
+	if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) {
+		/* diff[] holds the one's complement of the old value
+		 * followed by the new value; folding it into the
+		 * complemented csum removes the old and adds the new. */
+		__be16 diff[] = { ~*skb_ethertype, ethertype };
+		skb->csum = ~csum_partial((char *)diff, sizeof(diff),
+					  ~skb->csum);
+	}
+	*skb_ethertype = ethertype;
+}
+
+/* Push one MPLS label stack entry onto 'skb' directly after the end of
+ * the mac header, updating the ethertype field and skb->protocol to
+ * mpls->mpls_ethertype.  Returns 0 on success or a negative errno.
+ */
+static int push_mpls(struct sk_buff *skb,
+		     const struct ovs_action_push_mpls *mpls)
+{
+	__be32 *new_mpls_lse;
+	int err;
+
+	err = make_writable(skb, skb->mac_len + MPLS_HLEN);
+	if (unlikely(err))
+		return err;
+
+	/* Open up MPLS_HLEN bytes after the mac header by shifting the
+	 * mac header itself MPLS_HLEN bytes earlier. */
+	skb_push(skb, MPLS_HLEN);
+	memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
+		skb->mac_len);
+	skb_reset_mac_header(skb);
+
+	/* Write the new label stack entry into the gap just opened. */
+	new_mpls_lse = (__be32 *)skb_mac_header_end(skb);
+	*new_mpls_lse = mpls->mpls_lse;
+
+	if (get_ip_summed(skb) == OVS_CSUM_COMPLETE)
+		skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse,
+							     MPLS_HLEN, 0));
+
+	/* If the skb was non-MPLS then record the original protocol
+	 * in skb->inner_protocol. This is to allow GSO to perform
+	 * segmentation of the inner-packet.
+	 */
+	if (!eth_p_mpls(skb->protocol))
+		skb_set_inner_protocol(skb, skb->protocol);
+
+	set_ethertype(skb, mpls->mpls_ethertype);
+	skb->protocol = mpls->mpls_ethertype;
+	return 0;
+}
+
+/* Pop the outermost MPLS label stack entry from 'skb' and set the
+ * ethertype field and skb->protocol to 'ethertype'.  Returns 0 on
+ * success or a negative errno.
+ */
+static int pop_mpls(struct sk_buff *skb, const __be16 ethertype)
+{
+	int err;
+
+	err = make_writable(skb, skb->mac_len + MPLS_HLEN);
+	if (unlikely(err))
+		return err;
+
+	/* Remove the departing LSE from an OVS_CSUM_COMPLETE checksum
+	 * before the memmove below overwrites it. */
+	if (get_ip_summed(skb) == OVS_CSUM_COMPLETE)
+		skb->csum = csum_sub(skb->csum,
+				     csum_partial(skb_mac_header_end(skb),
+						  MPLS_HLEN, 0));
+
+	/* Shift the mac header MPLS_HLEN bytes later, over the LSE. */
+	memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
+		skb->mac_len);
+
+	skb_pull(skb, MPLS_HLEN);
+	skb_reset_mac_header(skb);
+
+	set_ethertype(skb, ethertype);
+	skb->protocol = ethertype;
+	return 0;
+}
+
+/* Overwrite the outermost MPLS label stack entry of 'skb' with
+ * *mpls_lse, keeping an OVS_CSUM_COMPLETE checksum consistent.
+ * Returns 0 on success or a negative errno.
+ */
+static int set_mpls(struct sk_buff *skb, const __be32 *mpls_lse)
+{
+	__be32 *stack;
+	int err;
+
+	err = make_writable(skb, skb->mac_len + MPLS_HLEN);
+	if (unlikely(err))
+		return err;
+
+	/* Locate the top of the label stack only after make_writable():
+	 * pskb_expand_head() may reallocate the skb data, which would
+	 * leave a pointer computed beforehand dangling. */
+	stack = (__be32 *)skb_mac_header_end(skb);
+
+	if (get_ip_summed(skb) == OVS_CSUM_COMPLETE) {
+		/* Fold out the old LSE and fold in the new one. */
+		__be32 diff[] = { ~(*stack), *mpls_lse };
+		skb->csum = ~csum_partial((char *)diff, sizeof(diff),
+					  ~skb->csum);
+	}
+
+	*stack = *mpls_lse;
+
+	return 0;
+}
+
/* remove VLAN header from packet and update csum accordingly. */
static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci)
{
@@ -115,6 +229,9 @@ static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vla
if (!__vlan_put_tag(skb, current_tag))
return -ENOMEM;
+ /* update mac_len for skb_mac_header_end() */
+ skb_reset_mac_len(skb);
+
if (get_ip_summed(skb) == OVS_CSUM_COMPLETE)
skb->csum = csum_add(skb->csum, csum_partial(skb->data
+ (2 * ETH_ALEN), VLAN_HLEN, 0));
@@ -459,12 +576,21 @@ static int execute_set_action(struct sk_buff *skb,
case OVS_KEY_ATTR_UDP:
err = set_udp(skb, nla_data(nested_attr));
break;
+
+ case OVS_KEY_ATTR_MPLS:
+ err = set_mpls(skb, nla_data(nested_attr));
+ break;
}
return err;
}
-/* Execute a list of actions against 'skb'. */
+/* Execute a list of actions against 'skb'.
+ *
+ * The stack depth is only tracked in the case of a non-MPLS packet
+ * that becomes MPLS via an MPLS push action. The stack depth
+ * is passed to do_output() in order to allow it to prepare the
+ * skb for possible GSO segmentation. */
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
const struct nlattr *attr, int len, bool keep_skb)
{
@@ -494,6 +620,14 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
output_userspace(dp, skb, a);
break;
+ case OVS_ACTION_ATTR_PUSH_MPLS:
+ err = push_mpls(skb, nla_data(a));
+ break;
+
+ case OVS_ACTION_ATTR_POP_MPLS:
+ err = pop_mpls(skb, nla_get_be16(a));
+ break;
+
case OVS_ACTION_ATTR_PUSH_VLAN:
err = push_vlan(skb, nla_data(a));
if (unlikely(err)) /* skb already freed. */
@@ -551,13 +551,26 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, int st_off
a->nla_len = sfa->actions_len - st_offset;
}
-static int validate_and_copy_actions(const struct nlattr *attr,
+/* Stack of candidate ethernet types, one entry per (sample) nesting
+ * level of action validation.  eth_types_set() writes index 'depth',
+ * so indices 0..depth hold the currently-recorded candidate types. */
+struct eth_types {
+	size_t depth;
+	__be16 types[SAMPLE_ACTION_DEPTH];
+};
+
+/* Record 'type' as the candidate ethernet type at nesting level
+ * 'depth', implicitly discarding entries recorded at deeper levels. */
+static void eth_types_set(struct eth_types *types, size_t depth, __be16 type)
+{
+	types->depth = depth;
+	types->types[depth] = type;
+}
+
+static int validate_and_copy_actions__(const struct nlattr *attr,
const struct sw_flow_key *key, int depth,
- struct sw_flow_actions **sfa);
+ struct sw_flow_actions **sfa,
+ struct eth_types *eth_types);
static int validate_and_copy_sample(const struct nlattr *attr,
const struct sw_flow_key *key, int depth,
- struct sw_flow_actions **sfa)
+ struct sw_flow_actions **sfa,
+ struct eth_types *eth_types)
{
const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
const struct nlattr *probability, *actions;
@@ -593,7 +606,8 @@ static int validate_and_copy_sample(const struct nlattr *attr,
if (st_acts < 0)
return st_acts;
- err = validate_and_copy_actions(actions, key, depth + 1, sfa);
+ err = validate_and_copy_actions__(actions, key, depth + 1, sfa,
+ eth_types);
if (err)
return err;
@@ -603,12 +617,12 @@ static int validate_and_copy_sample(const struct nlattr *attr,
return 0;
}
-static int validate_tp_port(const struct sw_flow_key *flow_key)
+static int validate_tp_port(const struct sw_flow_key *flow_key, __be16 eth_type)
{
- if (flow_key->eth.type == htons(ETH_P_IP)) {
+ if (eth_type == htons(ETH_P_IP)) {
if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst)
return 0;
- } else if (flow_key->eth.type == htons(ETH_P_IPV6)) {
+ } else if (eth_type == htons(ETH_P_IPV6)) {
if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst)
return 0;
}
@@ -639,7 +653,7 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
static int validate_set(const struct nlattr *a,
const struct sw_flow_key *flow_key,
struct sw_flow_actions **sfa,
- bool *set_tun)
+ bool *set_tun, struct eth_types *eth_types)
{
const struct nlattr *ovs_key = nla_data(a);
int key_type = nla_type(ovs_key);
@@ -676,9 +690,12 @@ static int validate_set(const struct nlattr *a,
return err;
break;
- case OVS_KEY_ATTR_IPV4:
- if (flow_key->eth.type != htons(ETH_P_IP))
- return -EINVAL;
+	case OVS_KEY_ATTR_IPV4: {
+		size_t i;
+
+		/* eth_types_set() stores a valid entry at index depth, so
+		 * check entries 0..depth inclusive; i < depth would check
+		 * nothing at all at the top level (depth 0). */
+		for (i = 0; i <= eth_types->depth; i++)
+			if (eth_types->types[i] != htons(ETH_P_IP))
+				return -EINVAL;
if (!flow_key->ip.proto)
return -EINVAL;
@@ -691,10 +708,14 @@ static int validate_set(const struct nlattr *a,
return -EINVAL;
break;
+ }
- case OVS_KEY_ATTR_IPV6:
- if (flow_key->eth.type != htons(ETH_P_IPV6))
- return -EINVAL;
+	case OVS_KEY_ATTR_IPV6: {
+		size_t i;
+
+		/* Check candidate ethertypes 0..depth inclusive; i < depth
+		 * would skip the current level's entry (and check nothing
+		 * at all at the top level). */
+		for (i = 0; i <= eth_types->depth; i++)
+			if (eth_types->types[i] != htons(ETH_P_IPV6))
+				return -EINVAL;
if (!flow_key->ip.proto)
return -EINVAL;
@@ -710,18 +731,37 @@ static int validate_set(const struct nlattr *a,
return -EINVAL;
break;
+ }
+
+	case OVS_KEY_ATTR_TCP: {
+		size_t i;
-	case OVS_KEY_ATTR_TCP:
 		if (flow_key->ip.proto != IPPROTO_TCP)
 			return -EINVAL;
-		return validate_tp_port(flow_key);
+		/* Validate against every candidate ethertype (entries
+		 * 0..depth inclusive).  Terminate the case with break:
+		 * falling through into the UDP case would reject every
+		 * valid TCP flow via its IPPROTO_UDP check. */
+		for (i = 0; i <= eth_types->depth; i++)
+			if (validate_tp_port(flow_key, eth_types->types[i]))
+				return -EINVAL;
+		break;
+	}
-	case OVS_KEY_ATTR_UDP:
+	case OVS_KEY_ATTR_UDP: {
+		size_t i;
 		if (flow_key->ip.proto != IPPROTO_UDP)
 			return -EINVAL;
-		return validate_tp_port(flow_key);
+		/* As above: check all candidates and break rather than
+		 * falling through into the MPLS case. */
+		for (i = 0; i <= eth_types->depth; i++)
+			if (validate_tp_port(flow_key, eth_types->types[i]))
+				return -EINVAL;
+		break;
+	}
+
+	case OVS_KEY_ATTR_MPLS: {
+		size_t i;
+
+		/* Setting an MPLS LSE requires every candidate ethertype
+		 * to be an MPLS one. */
+		for (i = 0; i <= eth_types->depth; i++)
+			if (!eth_p_mpls(eth_types->types[i]))
+				return -EINVAL;
+		break;
+	}
default:
return -EINVAL;
@@ -765,10 +805,10 @@ static int copy_action(const struct nlattr *from,
return 0;
}
-static int validate_and_copy_actions(const struct nlattr *attr,
- const struct sw_flow_key *key,
- int depth,
- struct sw_flow_actions **sfa)
+static int validate_and_copy_actions__(const struct nlattr *attr,
+ const struct sw_flow_key *key, int depth,
+ struct sw_flow_actions **sfa,
+ struct eth_types *eth_types)
{
const struct nlattr *a;
int rem, err;
@@ -776,11 +816,29 @@ static int validate_and_copy_actions(const struct nlattr *attr,
if (depth >= SAMPLE_ACTION_DEPTH)
return -EOVERFLOW;
+ /* Due to the sample action there may be more than one possibility
+ * for the current ethernet type. They all need to be verified.
+ *
+ * This is handled by tracking a stack of ethernet types, one for
+ * each (sample) depth of validation. Here the ethernet type for
+ * the current depth is pushed onto the stack. It may be modified
+ * as actions are validated. When a modification occurs the
+ * ethernet types for higher stack-depths are popped off the stack.
+ * All entries on the stack are checked when validating the
+ * ethernet type required by an action.
+ */
+ if (!depth)
+ eth_types_set(eth_types, 0, key->eth.type);
+ else
+ eth_types_set(eth_types, depth, eth_types->types[depth - 1]);
+
nla_for_each_nested(a, attr, rem) {
/* Expected argument lengths, (u32)-1 for variable length. */
static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = {
[OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
[OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
+ [OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls),
+ [OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16),
[OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
[OVS_ACTION_ATTR_POP_VLAN] = 0,
[OVS_ACTION_ATTR_SET] = (u32)-1,
@@ -811,6 +869,35 @@ static int validate_and_copy_actions(const struct nlattr *attr,
return -EINVAL;
break;
+	case OVS_ACTION_ATTR_PUSH_MPLS: {
+		const struct ovs_action_push_mpls *mpls = nla_data(a);
+
+		/* The pushed ethertype must itself be an MPLS one. */
+		if (!eth_p_mpls(mpls->mpls_ethertype))
+			return -EINVAL;
+		eth_types_set(eth_types, depth, mpls->mpls_ethertype);
+		break;
+	}
+
+	case OVS_ACTION_ATTR_POP_MPLS: {
+		size_t i;
+
+		/* Popping an MPLS LSE is only valid when every candidate
+		 * ethertype (entries 0..depth inclusive) is an MPLS one;
+		 * requiring ETH_P_IP here would reject all valid pops. */
+		for (i = 0; i <= eth_types->depth; i++)
+			if (!eth_p_mpls(eth_types->types[i]))
+				return -EINVAL;
+
+		/* Disallow subsequent l2.5+ set and mpls_pop actions
+		 * as there is no check here to ensure that the new
+		 * eth_type is valid and thus set actions could
+		 * write off the end of the packet or otherwise
+		 * corrupt it.
+		 *
+		 * Support for these actions after an mpls_pop is
+		 * planned to be added using packet recirculation.
+		 */
+		eth_types_set(eth_types, depth, htons(0));
+		break;
+	}
case OVS_ACTION_ATTR_POP_VLAN:
break;
@@ -824,13 +911,14 @@ static int validate_and_copy_actions(const struct nlattr *attr,
break;
case OVS_ACTION_ATTR_SET:
- err = validate_set(a, key, sfa, &skip_copy);
+ err = validate_set(a, key, sfa, &skip_copy, eth_types);
if (err)
return err;
break;
case OVS_ACTION_ATTR_SAMPLE:
- err = validate_and_copy_sample(a, key, depth, sfa);
+ err = validate_and_copy_sample(a, key, depth, sfa,
+ eth_types);
if (err)
return err;
skip_copy = true;
@@ -852,6 +940,14 @@ static int validate_and_copy_actions(const struct nlattr *attr,
return 0;
}
+/* Entry point for action validation: starts at sample depth 0.
+ * eth_type need not be initialised here — validate_and_copy_actions__()
+ * records the flow key's ethernet type at depth 0 before any reads. */
+static int validate_and_copy_actions(const struct nlattr *attr,
+				     const struct sw_flow_key *key,
+				     struct sw_flow_actions **sfa)
+{
+	struct eth_types eth_type;
+	return validate_and_copy_actions__(attr, key, 0, sfa, &eth_type);
+}
+
static void clear_stats(struct sw_flow *flow)
{
flow->used = 0;
@@ -916,7 +1012,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
if (IS_ERR(acts))
goto err_flow_free;
- err = validate_and_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts);
+ err = validate_and_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, &acts);
rcu_assign_pointer(flow->sf_acts, acts);
if (err)
goto err_flow_free;
@@ -1252,7 +1348,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
if (IS_ERR(acts))
goto error;
- error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0, &acts);
+ error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, &acts);
if (error)
goto err_kfree;
} else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) {
@@ -47,6 +47,8 @@
static struct kmem_cache *flow_cache;
+#define MPLS_BOS_MASK 0x00000100
+
static int check_header(struct sk_buff *skb, int len)
{
if (unlikely(skb->len < len))
@@ -648,6 +650,7 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
return -ENOMEM;
skb_reset_network_header(skb);
+ skb_reset_mac_len(skb);
__skb_push(skb, skb->data - skb_mac_header(skb));
/* Network layer. */
@@ -730,6 +733,35 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN);
key_len = SW_FLOW_KEY_OFFSET(ipv4.arp);
}
+ } else if (eth_p_mpls(key->eth.type)) {
+ size_t stack_len = MPLS_HLEN;
+
+ /* In the presence of an MPLS label stack the end of the L2
+ * header and the beginning of the L3 header differ.
+ *
+ * Advance network_header to the beginning of the L3
+ * header. mac_len corresponds to the end of the L2 header.
+ */
+ while (1) {
+ __be32 lse;
+
+ error = check_header(skb, skb->mac_len + stack_len);
+ if (unlikely(error))
+ goto out;
+
+ memcpy(&lse, skb_network_header(skb), MPLS_HLEN);
+
+ if (stack_len == MPLS_HLEN) {
+ key_len = SW_FLOW_KEY_OFFSET(mpls.top_lse);
+ memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN);
+ }
+
+ skb_set_network_header(skb, skb->mac_len + stack_len);
+ if (lse & htonl(MPLS_BOS_MASK))
+ break;
+
+ stack_len += MPLS_HLEN;
+ }
} else if (key->eth.type == htons(ETH_P_IPV6)) {
int nh_len; /* IPv6 Header + Extensions */
@@ -848,6 +880,9 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
[OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
[OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
[OVS_KEY_ATTR_TUNNEL] = -1,
+
+ /* Not upstream. */
+ [OVS_KEY_ATTR_MPLS] = sizeof(struct ovs_key_mpls),
};
static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
@@ -1254,6 +1289,15 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
swkey->ip.proto = ntohs(arp_key->arp_op);
memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN);
memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN);
+ } else if (eth_p_mpls(swkey->eth.type)) {
+ const struct ovs_key_mpls *mpls_key;
+ if (!(attrs & (1ULL << OVS_KEY_ATTR_MPLS)))
+ return -EINVAL;
+ attrs &= ~(1ULL << OVS_KEY_ATTR_MPLS);
+
+ key_len = SW_FLOW_KEY_OFFSET(mpls.top_lse);
+ mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]);
+ swkey->mpls.top_lse = mpls_key->mpls_lse;
}
if (attrs)
@@ -1420,6 +1464,14 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
arp_key->arp_op = htons(swkey->ip.proto);
memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN);
memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN);
+ } else if (eth_p_mpls(swkey->eth.type)) {
+ struct ovs_key_mpls *mpls_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key));
+ if (!nla)
+ goto nla_put_failure;
+ mpls_key = nla_data(nla);
+ mpls_key->mpls_lse = swkey->mpls.top_lse;
}
if ((swkey->eth.type == htons(ETH_P_IP) ||
@@ -72,12 +72,17 @@ struct sw_flow_key {
__be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
__be16 type; /* Ethernet frame type. */
} eth;
- struct {
- u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */
- u8 tos; /* IP ToS. */
- u8 ttl; /* IP TTL/hop limit. */
- u8 frag; /* One of OVS_FRAG_TYPE_*. */
- } ip;
+ union {
+ struct {
+ __be32 top_lse; /* top label stack entry */
+ } mpls;
+ struct {
+ u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */
+ u8 tos; /* IP ToS. */
+ u8 ttl; /* IP TTL/hop limit. */
+ u8 frag; /* One of OVS_FRAG_TYPE_*. */
+ } ip;
+ };
union {
struct {
struct {
@@ -143,6 +148,8 @@ struct arp_eth_header {
unsigned char ar_tip[4]; /* target IP address */
} __packed;
+#define MPLS_HLEN 4
+
int ovs_flow_init(void);
void ovs_flow_exit(void);
@@ -204,4 +211,10 @@ int ipv4_tun_from_nlattr(const struct nlattr *attr,
int ipv4_tun_to_nlattr(struct sk_buff *skb,
const struct ovs_key_ipv4_tunnel *tun_key);
+/* True if 'eth_type' is one of the two MPLS ethertypes (unicast or
+ * multicast). */
+static inline bool eth_p_mpls(__be16 eth_type)
+{
+	return eth_type == htons(ETH_P_MPLS_UC) ||
+		eth_type == htons(ETH_P_MPLS_MC);
+}
+
#endif /* flow.h */
@@ -251,4 +251,14 @@ static inline void skb_reset_mac_len(struct sk_buff *skb)
skb->mac_len = skb->network_header - skb->mac_header;
}
#endif
+
+#ifdef HAVE_INNER_PROTOCOL
+/* Record the inner packet's ethertype so GSO can segment the inner
+ * packet of an MPLS-encapsulated skb. */
+static inline void skb_set_inner_protocol(struct sk_buff *skb, __be16 ethertype)
+{
+	skb->inner_protocol = ethertype;
+}
+#else
+/* Compat no-op for kernels without skb->inner_protocol:
+ * MPLS GSO is not supported */
+static inline void skb_set_inner_protocol(struct sk_buff *skb, __be16 ethertype) { }
+#endif
#endif