Patchwork [net-next,4/7] openvswitch: add ipv6 'set' action

login
register
mail settings
Submitter Jesse Gross
Date Nov. 29, 2012, 6:35 p.m.
Message ID <1354214149-33651-5-git-send-email-jesse@nicira.com>
Download mbox | patch
Permalink /patch/202810/
State Accepted
Delegated to: David Miller
Headers show

Comments

Jesse Gross - Nov. 29, 2012, 6:35 p.m.
From: Ansis Atteka <aatteka@nicira.com>

This patch adds ipv6 set action functionality. It allows to change
traffic class, flow label, hop-limit, ipv6 source and destination
address fields.

Signed-off-by: Ansis Atteka <aatteka@nicira.com>
Signed-off-by: Jesse Gross <jesse@nicira.com>
---
 net/openvswitch/actions.c  |   93 ++++++++++++++++++++++++++++++++++++++++++++
 net/openvswitch/datapath.c |   20 ++++++++++
 2 files changed, 113 insertions(+)
Tom Herbert - Dec. 12, 2012, 3:14 a.m.
> This patch adds ipv6 set action functionality. It allows to change
> traffic class, flow label, hop-limit, ipv6 source and destination
> address fields.
>
I have to wonder about these patches and the underlying design
direction.  Aren't these sort of things and more already implemented
by IPtables but in a modular and extensible fashion?  Has there been
any thought into hooking OVS to IP tables to leverage all the existing
functionality?

Thanks,
Tom

> Signed-off-by: Ansis Atteka <aatteka@nicira.com>
> Signed-off-by: Jesse Gross <jesse@nicira.com>
> ---
>  net/openvswitch/actions.c  |   93 ++++++++++++++++++++++++++++++++++++++++++++
>  net/openvswitch/datapath.c |   20 ++++++++++
>  2 files changed, 113 insertions(+)
>
> diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
> index 0811447..a58ed27 100644
> --- a/net/openvswitch/actions.c
> +++ b/net/openvswitch/actions.c
> @@ -28,6 +28,7 @@
>  #include <linux/if_arp.h>
>  #include <linux/if_vlan.h>
>  #include <net/ip.h>
> +#include <net/ipv6.h>
>  #include <net/checksum.h>
>  #include <net/dsfield.h>
>
> @@ -162,6 +163,53 @@ static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh,
>         *addr = new_addr;
>  }
>
> +static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto,
> +                                __be32 addr[4], const __be32 new_addr[4])
> +{
> +       int transport_len = skb->len - skb_transport_offset(skb);
> +
> +       if (l4_proto == IPPROTO_TCP) {
> +               if (likely(transport_len >= sizeof(struct tcphdr)))
> +                       inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb,
> +                                                 addr, new_addr, 1);
> +       } else if (l4_proto == IPPROTO_UDP) {
> +               if (likely(transport_len >= sizeof(struct udphdr))) {
> +                       struct udphdr *uh = udp_hdr(skb);
> +
> +                       if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
> +                               inet_proto_csum_replace16(&uh->check, skb,
> +                                                         addr, new_addr, 1);
> +                               if (!uh->check)
> +                                       uh->check = CSUM_MANGLED_0;
> +                       }
> +               }
> +       }
> +}
> +
> +static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto,
> +                         __be32 addr[4], const __be32 new_addr[4],
> +                         bool recalculate_csum)
> +{
> +       if (recalculate_csum)
> +               update_ipv6_checksum(skb, l4_proto, addr, new_addr);
> +
> +       skb->rxhash = 0;
> +       memcpy(addr, new_addr, sizeof(__be32[4]));
> +}
> +
> +static void set_ipv6_tc(struct ipv6hdr *nh, u8 tc)
> +{
> +       nh->priority = tc >> 4;
> +       nh->flow_lbl[0] = (nh->flow_lbl[0] & 0x0F) | ((tc & 0x0F) << 4);
> +}
> +
> +static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl)
> +{
> +       nh->flow_lbl[0] = (nh->flow_lbl[0] & 0xF0) | (fl & 0x000F0000) >> 16;
> +       nh->flow_lbl[1] = (fl & 0x0000FF00) >> 8;
> +       nh->flow_lbl[2] = fl & 0x000000FF;
> +}
> +
>  static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl)
>  {
>         csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8));
> @@ -195,6 +243,47 @@ static int set_ipv4(struct sk_buff *skb, const struct ovs_key_ipv4 *ipv4_key)
>         return 0;
>  }
>
> +static int set_ipv6(struct sk_buff *skb, const struct ovs_key_ipv6 *ipv6_key)
> +{
> +       struct ipv6hdr *nh;
> +       int err;
> +       __be32 *saddr;
> +       __be32 *daddr;
> +
> +       err = make_writable(skb, skb_network_offset(skb) +
> +                           sizeof(struct ipv6hdr));
> +       if (unlikely(err))
> +               return err;
> +
> +       nh = ipv6_hdr(skb);
> +       saddr = (__be32 *)&nh->saddr;
> +       daddr = (__be32 *)&nh->daddr;
> +
> +       if (memcmp(ipv6_key->ipv6_src, saddr, sizeof(ipv6_key->ipv6_src)))
> +               set_ipv6_addr(skb, ipv6_key->ipv6_proto, saddr,
> +                             ipv6_key->ipv6_src, true);
> +
> +       if (memcmp(ipv6_key->ipv6_dst, daddr, sizeof(ipv6_key->ipv6_dst))) {
> +               unsigned int offset = 0;
> +               int flags = IP6_FH_F_SKIP_RH;
> +               bool recalc_csum = true;
> +
> +               if (ipv6_ext_hdr(nh->nexthdr))
> +                       recalc_csum = ipv6_find_hdr(skb, &offset,
> +                                                   NEXTHDR_ROUTING, NULL,
> +                                                   &flags) != NEXTHDR_ROUTING;
> +
> +               set_ipv6_addr(skb, ipv6_key->ipv6_proto, daddr,
> +                             ipv6_key->ipv6_dst, recalc_csum);
> +       }
> +
> +       set_ipv6_tc(nh, ipv6_key->ipv6_tclass);
> +       set_ipv6_fl(nh, ntohl(ipv6_key->ipv6_label));
> +       nh->hop_limit = ipv6_key->ipv6_hlimit;
> +
> +       return 0;
> +}
> +
>  /* Must follow make_writable() since that can move the skb data. */
>  static void set_tp_port(struct sk_buff *skb, __be16 *port,
>                          __be16 new_port, __sum16 *check)
> @@ -347,6 +436,10 @@ static int execute_set_action(struct sk_buff *skb,
>                 err = set_ipv4(skb, nla_data(nested_attr));
>                 break;
>
> +       case OVS_KEY_ATTR_IPV6:
> +               err = set_ipv6(skb, nla_data(nested_attr));
> +               break;
> +
>         case OVS_KEY_ATTR_TCP:
>                 err = set_tcp(skb, nla_data(nested_attr));
>                 break;
> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> index 4c4b62c..fd4a6a4 100644
> --- a/net/openvswitch/datapath.c
> +++ b/net/openvswitch/datapath.c
> @@ -479,6 +479,7 @@ static int validate_set(const struct nlattr *a,
>
>         switch (key_type) {
>         const struct ovs_key_ipv4 *ipv4_key;
> +       const struct ovs_key_ipv6 *ipv6_key;
>
>         case OVS_KEY_ATTR_PRIORITY:
>         case OVS_KEY_ATTR_ETHERNET:
> @@ -500,6 +501,25 @@ static int validate_set(const struct nlattr *a,
>
>                 break;
>
> +       case OVS_KEY_ATTR_IPV6:
> +               if (flow_key->eth.type != htons(ETH_P_IPV6))
> +                       return -EINVAL;
> +
> +               if (!flow_key->ip.proto)
> +                       return -EINVAL;
> +
> +               ipv6_key = nla_data(ovs_key);
> +               if (ipv6_key->ipv6_proto != flow_key->ip.proto)
> +                       return -EINVAL;
> +
> +               if (ipv6_key->ipv6_frag != flow_key->ip.frag)
> +                       return -EINVAL;
> +
> +               if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000)
> +                       return -EINVAL;
> +
> +               break;
> +
>         case OVS_KEY_ATTR_TCP:
>                 if (flow_key->ip.proto != IPPROTO_TCP)
>                         return -EINVAL;
> --
> 1.7.9.5
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jesse Gross - Dec. 12, 2012, 6:17 p.m.
On Tue, Dec 11, 2012 at 7:14 PM, Tom Herbert <therbert@google.com> wrote:
>> This patch adds ipv6 set action functionality. It allows to change
>> traffic class, flow label, hop-limit, ipv6 source and destination
>> address fields.
>>
> I have to wonder about these patches and the underlying design
> direction.  Aren't these sort of things and more already implemented
> by IPtables but in a modular and extensible fashion?  Has there been
> any thought into hooking OVS to IP tables to leverage all the existing
> functionality?

At an implementation level, the goal is definitely to share as much
code as possible.  Some of that was obviously done to support this
patch and I'm sure there are more areas where it could be taken
further.

At a more conceptual level we've explored this path a number of times
and it's never been attractive since it has a tendency to drag more
OVS code into other parts of the kernel and generally make things
worse for everybody.  Of course, it's hard to say without knowing what
you're thinking.  Do you have a specific proposal?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Tom Herbert - Dec. 12, 2012, 6:38 p.m.
> At an implementation level, the goal is definitely to share as much
> code as possible.  Some of that was obviously done to support this
> patch and I'm sure there are more areas where it could be taken
> further.
>
> At a more conceptual level we've explored this path a number of times
> and it's never been attractive since it has a tendency to drag more
> OVS code into other parts of the kernel and generally make things
> worse for everybody.  Of course, it's hard to say without knowing what
> you're thinking.  Do you have a specific proposal?

Where is the line drawn?  Is the intent that over the next five years
that functionality will be added ad hoc increments to make OVS have
the same functionality as IP tables, tc, routing?  Are we going to
have things like NAT, stateful firewalls, DDOS mechanisms implemented
in OVS (we already have people proposing such things!).
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jesse Gross - Dec. 12, 2012, 7:17 p.m.
On Wed, Dec 12, 2012 at 10:38 AM, Tom Herbert <therbert@google.com> wrote:
>> At an implementation level, the goal is definitely to share as much
>> code as possible.  Some of that was obviously done to support this
>> patch and I'm sure there are more areas where it could be taken
>> further.
>>
>> At a more conceptual level we've explored this path a number of times
>> and it's never been attractive since it has a tendency to drag more
>> OVS code into other parts of the kernel and generally make things
>> worse for everybody.  Of course, it's hard to say without knowing what
>> you're thinking.  Do you have a specific proposal?
>
> Where is the line drawn?  Is the intent that over the next five years
> that functionality will be added ad hoc increments to make OVS have
> the same functionality as IP tables, tc, routing?  Are we going to
> have things like NAT, stateful firewalls, DDOS mechanisms implemented
> in OVS (we already have people proposing such things!).

Definitely no to all of the above. (As an aside, years ago there was
NAT functionality in a precursor to OVS.  Everybody hated it and was
very happy when it was removed, so I wouldn't worry about that type of
thing popping up in OVS any time soon.)

The design of OVS works pretty well for the types of stateless
operations that are currently implemented because those map nicely to
flows that userspace can use to program in a fairly clean and powerful
manner.  This is much less true for things like stateful rules, QoS,
DPI, etc. because you either want to look at more information than
would usually be considered a flow or have state that changes very
quickly.  In these cases, the data plane needs to take action on its
own and the interaction with userspace is more akin to configuration
than programming.

As these types of features come up, I think you will start to see more
integration with netfilter and other tools (in fact, there are several
examples of this already - OVS QoS uses tc, the ability to interact
with skb->mark was added recently, and Pravin has been doing a lot of
work to refactor and integrate with the upstream tunneling code).
There are some definite tradeoffs to doing it this way, mostly in the
area of state management, so I don't think that it's feasible to
switch wholesale over to this model.  However, if we're careful then I
think it's possible to get the best of both worlds.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 0811447..a58ed27 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -28,6 +28,7 @@ 
 #include <linux/if_arp.h>
 #include <linux/if_vlan.h>
 #include <net/ip.h>
+#include <net/ipv6.h>
 #include <net/checksum.h>
 #include <net/dsfield.h>
 
@@ -162,6 +163,53 @@  static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh,
 	*addr = new_addr;
 }
 
+static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto,
+				 __be32 addr[4], const __be32 new_addr[4])
+{
+	int transport_len = skb->len - skb_transport_offset(skb);
+
+	if (l4_proto == IPPROTO_TCP) {
+		if (likely(transport_len >= sizeof(struct tcphdr)))
+			inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb,
+						  addr, new_addr, 1);
+	} else if (l4_proto == IPPROTO_UDP) {
+		if (likely(transport_len >= sizeof(struct udphdr))) {
+			struct udphdr *uh = udp_hdr(skb);
+
+			if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+				inet_proto_csum_replace16(&uh->check, skb,
+							  addr, new_addr, 1);
+				if (!uh->check)
+					uh->check = CSUM_MANGLED_0;
+			}
+		}
+	}
+}
+
+static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto,
+			  __be32 addr[4], const __be32 new_addr[4],
+			  bool recalculate_csum)
+{
+	if (recalculate_csum)
+		update_ipv6_checksum(skb, l4_proto, addr, new_addr);
+
+	skb->rxhash = 0;
+	memcpy(addr, new_addr, sizeof(__be32[4]));
+}
+
+static void set_ipv6_tc(struct ipv6hdr *nh, u8 tc)
+{
+	nh->priority = tc >> 4;
+	nh->flow_lbl[0] = (nh->flow_lbl[0] & 0x0F) | ((tc & 0x0F) << 4);
+}
+
+static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl)
+{
+	nh->flow_lbl[0] = (nh->flow_lbl[0] & 0xF0) | (fl & 0x000F0000) >> 16;
+	nh->flow_lbl[1] = (fl & 0x0000FF00) >> 8;
+	nh->flow_lbl[2] = fl & 0x000000FF;
+}
+
 static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl)
 {
 	csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8));
@@ -195,6 +243,47 @@  static int set_ipv4(struct sk_buff *skb, const struct ovs_key_ipv4 *ipv4_key)
 	return 0;
 }
 
+static int set_ipv6(struct sk_buff *skb, const struct ovs_key_ipv6 *ipv6_key)
+{
+	struct ipv6hdr *nh;
+	int err;
+	__be32 *saddr;
+	__be32 *daddr;
+
+	err = make_writable(skb, skb_network_offset(skb) +
+			    sizeof(struct ipv6hdr));
+	if (unlikely(err))
+		return err;
+
+	nh = ipv6_hdr(skb);
+	saddr = (__be32 *)&nh->saddr;
+	daddr = (__be32 *)&nh->daddr;
+
+	if (memcmp(ipv6_key->ipv6_src, saddr, sizeof(ipv6_key->ipv6_src)))
+		set_ipv6_addr(skb, ipv6_key->ipv6_proto, saddr,
+			      ipv6_key->ipv6_src, true);
+
+	if (memcmp(ipv6_key->ipv6_dst, daddr, sizeof(ipv6_key->ipv6_dst))) {
+		unsigned int offset = 0;
+		int flags = IP6_FH_F_SKIP_RH;
+		bool recalc_csum = true;
+
+		if (ipv6_ext_hdr(nh->nexthdr))
+			recalc_csum = ipv6_find_hdr(skb, &offset,
+						    NEXTHDR_ROUTING, NULL,
+						    &flags) != NEXTHDR_ROUTING;
+
+		set_ipv6_addr(skb, ipv6_key->ipv6_proto, daddr,
+			      ipv6_key->ipv6_dst, recalc_csum);
+	}
+
+	set_ipv6_tc(nh, ipv6_key->ipv6_tclass);
+	set_ipv6_fl(nh, ntohl(ipv6_key->ipv6_label));
+	nh->hop_limit = ipv6_key->ipv6_hlimit;
+
+	return 0;
+}
+
 /* Must follow make_writable() since that can move the skb data. */
 static void set_tp_port(struct sk_buff *skb, __be16 *port,
 			 __be16 new_port, __sum16 *check)
@@ -347,6 +436,10 @@  static int execute_set_action(struct sk_buff *skb,
 		err = set_ipv4(skb, nla_data(nested_attr));
 		break;
 
+	case OVS_KEY_ATTR_IPV6:
+		err = set_ipv6(skb, nla_data(nested_attr));
+		break;
+
 	case OVS_KEY_ATTR_TCP:
 		err = set_tcp(skb, nla_data(nested_attr));
 		break;
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 4c4b62c..fd4a6a4 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -479,6 +479,7 @@  static int validate_set(const struct nlattr *a,
 
 	switch (key_type) {
 	const struct ovs_key_ipv4 *ipv4_key;
+	const struct ovs_key_ipv6 *ipv6_key;
 
 	case OVS_KEY_ATTR_PRIORITY:
 	case OVS_KEY_ATTR_ETHERNET:
@@ -500,6 +501,25 @@  static int validate_set(const struct nlattr *a,
 
 		break;
 
+	case OVS_KEY_ATTR_IPV6:
+		if (flow_key->eth.type != htons(ETH_P_IPV6))
+			return -EINVAL;
+
+		if (!flow_key->ip.proto)
+			return -EINVAL;
+
+		ipv6_key = nla_data(ovs_key);
+		if (ipv6_key->ipv6_proto != flow_key->ip.proto)
+			return -EINVAL;
+
+		if (ipv6_key->ipv6_frag != flow_key->ip.frag)
+			return -EINVAL;
+
+		if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000)
+			return -EINVAL;
+
+		break;
+
 	case OVS_KEY_ATTR_TCP:
 		if (flow_key->ip.proto != IPPROTO_TCP)
 			return -EINVAL;