Message ID | 1354214149-33651-5-git-send-email-jesse@nicira.com |
---|---|
State | Accepted, archived |
Delegated to: | David Miller |
Headers | show |
> This patch adds ipv6 set action functionality. It allows to change > traffic class, flow label, hop-limit, ipv6 source and destination > address fields. > I have to wonder about these patches and the underlying design direction. Aren't these sort of things and more already implemented by IPtables but in a modular and extensible fashion? Has there been any thought into hooking OVS to IP tables to leverage all the existing functionality? Thanks, Tom > Signed-off-by: Ansis Atteka <aatteka@nicira.com> > Signed-off-by: Jesse Gross <jesse@nicira.com> > --- > net/openvswitch/actions.c | 93 ++++++++++++++++++++++++++++++++++++++++++++ > net/openvswitch/datapath.c | 20 ++++++++++ > 2 files changed, 113 insertions(+) > > diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c > index 0811447..a58ed27 100644 > --- a/net/openvswitch/actions.c > +++ b/net/openvswitch/actions.c > @@ -28,6 +28,7 @@ > #include <linux/if_arp.h> > #include <linux/if_vlan.h> > #include <net/ip.h> > +#include <net/ipv6.h> > #include <net/checksum.h> > #include <net/dsfield.h> > > @@ -162,6 +163,53 @@ static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, > *addr = new_addr; > } > > +static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, > + __be32 addr[4], const __be32 new_addr[4]) > +{ > + int transport_len = skb->len - skb_transport_offset(skb); > + > + if (l4_proto == IPPROTO_TCP) { > + if (likely(transport_len >= sizeof(struct tcphdr))) > + inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb, > + addr, new_addr, 1); > + } else if (l4_proto == IPPROTO_UDP) { > + if (likely(transport_len >= sizeof(struct udphdr))) { > + struct udphdr *uh = udp_hdr(skb); > + > + if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { > + inet_proto_csum_replace16(&uh->check, skb, > + addr, new_addr, 1); > + if (!uh->check) > + uh->check = CSUM_MANGLED_0; > + } > + } > + } > +} > + > +static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, > + __be32 addr[4], const __be32 
new_addr[4], > + bool recalculate_csum) > +{ > + if (recalculate_csum) > + update_ipv6_checksum(skb, l4_proto, addr, new_addr); > + > + skb->rxhash = 0; > + memcpy(addr, new_addr, sizeof(__be32[4])); > +} > + > +static void set_ipv6_tc(struct ipv6hdr *nh, u8 tc) > +{ > + nh->priority = tc >> 4; > + nh->flow_lbl[0] = (nh->flow_lbl[0] & 0x0F) | ((tc & 0x0F) << 4); > +} > + > +static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl) > +{ > + nh->flow_lbl[0] = (nh->flow_lbl[0] & 0xF0) | (fl & 0x000F0000) >> 16; > + nh->flow_lbl[1] = (fl & 0x0000FF00) >> 8; > + nh->flow_lbl[2] = fl & 0x000000FF; > +} > + > static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl) > { > csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); > @@ -195,6 +243,47 @@ static int set_ipv4(struct sk_buff *skb, const struct ovs_key_ipv4 *ipv4_key) > return 0; > } > > +static int set_ipv6(struct sk_buff *skb, const struct ovs_key_ipv6 *ipv6_key) > +{ > + struct ipv6hdr *nh; > + int err; > + __be32 *saddr; > + __be32 *daddr; > + > + err = make_writable(skb, skb_network_offset(skb) + > + sizeof(struct ipv6hdr)); > + if (unlikely(err)) > + return err; > + > + nh = ipv6_hdr(skb); > + saddr = (__be32 *)&nh->saddr; > + daddr = (__be32 *)&nh->daddr; > + > + if (memcmp(ipv6_key->ipv6_src, saddr, sizeof(ipv6_key->ipv6_src))) > + set_ipv6_addr(skb, ipv6_key->ipv6_proto, saddr, > + ipv6_key->ipv6_src, true); > + > + if (memcmp(ipv6_key->ipv6_dst, daddr, sizeof(ipv6_key->ipv6_dst))) { > + unsigned int offset = 0; > + int flags = IP6_FH_F_SKIP_RH; > + bool recalc_csum = true; > + > + if (ipv6_ext_hdr(nh->nexthdr)) > + recalc_csum = ipv6_find_hdr(skb, &offset, > + NEXTHDR_ROUTING, NULL, > + &flags) != NEXTHDR_ROUTING; > + > + set_ipv6_addr(skb, ipv6_key->ipv6_proto, daddr, > + ipv6_key->ipv6_dst, recalc_csum); > + } > + > + set_ipv6_tc(nh, ipv6_key->ipv6_tclass); > + set_ipv6_fl(nh, ntohl(ipv6_key->ipv6_label)); > + nh->hop_limit = ipv6_key->ipv6_hlimit; > + > + return 0; > +} > + > /* 
Must follow make_writable() since that can move the skb data. */ > static void set_tp_port(struct sk_buff *skb, __be16 *port, > __be16 new_port, __sum16 *check) > @@ -347,6 +436,10 @@ static int execute_set_action(struct sk_buff *skb, > err = set_ipv4(skb, nla_data(nested_attr)); > break; > > + case OVS_KEY_ATTR_IPV6: > + err = set_ipv6(skb, nla_data(nested_attr)); > + break; > + > case OVS_KEY_ATTR_TCP: > err = set_tcp(skb, nla_data(nested_attr)); > break; > diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c > index 4c4b62c..fd4a6a4 100644 > --- a/net/openvswitch/datapath.c > +++ b/net/openvswitch/datapath.c > @@ -479,6 +479,7 @@ static int validate_set(const struct nlattr *a, > > switch (key_type) { > const struct ovs_key_ipv4 *ipv4_key; > + const struct ovs_key_ipv6 *ipv6_key; > > case OVS_KEY_ATTR_PRIORITY: > case OVS_KEY_ATTR_ETHERNET: > @@ -500,6 +501,25 @@ static int validate_set(const struct nlattr *a, > > break; > > + case OVS_KEY_ATTR_IPV6: > + if (flow_key->eth.type != htons(ETH_P_IPV6)) > + return -EINVAL; > + > + if (!flow_key->ip.proto) > + return -EINVAL; > + > + ipv6_key = nla_data(ovs_key); > + if (ipv6_key->ipv6_proto != flow_key->ip.proto) > + return -EINVAL; > + > + if (ipv6_key->ipv6_frag != flow_key->ip.frag) > + return -EINVAL; > + > + if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000) > + return -EINVAL; > + > + break; > + > case OVS_KEY_ATTR_TCP: > if (flow_key->ip.proto != IPPROTO_TCP) > return -EINVAL; > -- > 1.7.9.5 > > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, Dec 11, 2012 at 7:14 PM, Tom Herbert <therbert@google.com> wrote: >> This patch adds ipv6 set action functionality. It allows to change >> traffic class, flow label, hop-limit, ipv6 source and destination >> address fields. >> > I have to wonder about these patches and the underlying design > direction. Aren't these sort of things and more already implemented > by IPtables but in a modular and extensible fashion? Has there been > any thought into hooking OVS to IP tables to leverage all the existing > functionality? At an implementation level, the goal is definitely to share as much code as possible. Some of that was obviously done to support this patch and I'm sure there are more areas where it could be taken further. At a more conceptual level we've explored this path a number of times and it's never been attractive since it has a tendency to drag more OVS code into other parts of the kernel and generally make things worse for everybody. Of course, it's hard to say without knowing what you're thinking. Do you have a specific proposal? -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
> At an implementation level, the goal is definitely to share as much > code as possible. Some of that was obviously done to support this > patch and I'm sure there are more areas where it could be taken > further. > > At a more conceptual level we've explored this path a number of times > and it's never been attractive since it has a tendency to drag more > OVS code into other parts of the kernel and generally make things > worse for everybody. Of course, it's hard to say without knowing what > you're thinking. Do you have a specific proposal? Where is the line drawn? Is the intent that, over the next five years, functionality will be added in ad hoc increments to give OVS the same functionality as iptables, tc, and routing? Are we going to have things like NAT, stateful firewalls, and DDoS mechanisms implemented in OVS? (We already have people proposing such things!) -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Wed, Dec 12, 2012 at 10:38 AM, Tom Herbert <therbert@google.com> wrote: >> At an implementation level, the goal is definitely to share as much >> code as possible. Some of that was obviously done to support this >> patch and I'm sure there are more areas where it could be taken >> further. >> >> At a more conceptual level we've explored this path a number of times >> and it's never been attractive since it has a tendency to drag more >> OVS code into other parts of the kernel and generally make things >> worse for everybody. Of course, it's hard to say without knowing what >> you're thinking. Do you have a specific proposal? > > Where is the line drawn? Is the intent that over the next five years > that functionality will be added ad hoc increments to make OVS have > the same functionality as IP tables, tc, routing? Are we going to > have things like NAT, stateful firewalls, DDOS mechanisms implemented > in OVS (we already have people proposing such things!). Definitely no to all of the above. (As an aside, years ago there was NAT functionality in a precursor to OVS. Everybody hated it and was very happy when it was removed, so I wouldn't worry about that type of thing popping up in OVS any time soon.) The design of OVS works pretty well for the types of stateless operations that are currently implemented because those map nicely to flows that userspace can program in a fairly clean and powerful manner. This is much less true for things like stateful rules, QoS, DPI, etc. because you either want to look at more information than would usually be considered a flow or have state that changes very quickly. In these cases, the data plane needs to take action on its own and the interaction with userspace is more akin to configuration than programming. 
As these types of features come up, I think you will start to see more integration with netfilter and other tools (in fact, there are several examples of this already - OVS QoS uses tc, the ability to interact with skb->mark was added recently, and Pravin has been doing a lot of work to refactor and integrate with the upstream tunneling code). There are some definite tradeoffs to doing it this way, mostly in the area of state management, so I don't think that it's feasible to switch wholesale over to this model. However, if we're careful then I think it's possible to get the best of both worlds. -- To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 0811447..a58ed27 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -28,6 +28,7 @@ #include <linux/if_arp.h> #include <linux/if_vlan.h> #include <net/ip.h> +#include <net/ipv6.h> #include <net/checksum.h> #include <net/dsfield.h> @@ -162,6 +163,53 @@ static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, *addr = new_addr; } +static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, + __be32 addr[4], const __be32 new_addr[4]) +{ + int transport_len = skb->len - skb_transport_offset(skb); + + if (l4_proto == IPPROTO_TCP) { + if (likely(transport_len >= sizeof(struct tcphdr))) + inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb, + addr, new_addr, 1); + } else if (l4_proto == IPPROTO_UDP) { + if (likely(transport_len >= sizeof(struct udphdr))) { + struct udphdr *uh = udp_hdr(skb); + + if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace16(&uh->check, skb, + addr, new_addr, 1); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + } + } +} + +static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, + __be32 addr[4], const __be32 new_addr[4], + bool recalculate_csum) +{ + if (recalculate_csum) + update_ipv6_checksum(skb, l4_proto, addr, new_addr); + + skb->rxhash = 0; + memcpy(addr, new_addr, sizeof(__be32[4])); +} + +static void set_ipv6_tc(struct ipv6hdr *nh, u8 tc) +{ + nh->priority = tc >> 4; + nh->flow_lbl[0] = (nh->flow_lbl[0] & 0x0F) | ((tc & 0x0F) << 4); +} + +static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl) +{ + nh->flow_lbl[0] = (nh->flow_lbl[0] & 0xF0) | (fl & 0x000F0000) >> 16; + nh->flow_lbl[1] = (fl & 0x0000FF00) >> 8; + nh->flow_lbl[2] = fl & 0x000000FF; +} + static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl) { csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); @@ -195,6 +243,47 @@ static int set_ipv4(struct sk_buff *skb, const struct ovs_key_ipv4 *ipv4_key) return 0; } 
+static int set_ipv6(struct sk_buff *skb, const struct ovs_key_ipv6 *ipv6_key) +{ + struct ipv6hdr *nh; + int err; + __be32 *saddr; + __be32 *daddr; + + err = make_writable(skb, skb_network_offset(skb) + + sizeof(struct ipv6hdr)); + if (unlikely(err)) + return err; + + nh = ipv6_hdr(skb); + saddr = (__be32 *)&nh->saddr; + daddr = (__be32 *)&nh->daddr; + + if (memcmp(ipv6_key->ipv6_src, saddr, sizeof(ipv6_key->ipv6_src))) + set_ipv6_addr(skb, ipv6_key->ipv6_proto, saddr, + ipv6_key->ipv6_src, true); + + if (memcmp(ipv6_key->ipv6_dst, daddr, sizeof(ipv6_key->ipv6_dst))) { + unsigned int offset = 0; + int flags = IP6_FH_F_SKIP_RH; + bool recalc_csum = true; + + if (ipv6_ext_hdr(nh->nexthdr)) + recalc_csum = ipv6_find_hdr(skb, &offset, + NEXTHDR_ROUTING, NULL, + &flags) != NEXTHDR_ROUTING; + + set_ipv6_addr(skb, ipv6_key->ipv6_proto, daddr, + ipv6_key->ipv6_dst, recalc_csum); + } + + set_ipv6_tc(nh, ipv6_key->ipv6_tclass); + set_ipv6_fl(nh, ntohl(ipv6_key->ipv6_label)); + nh->hop_limit = ipv6_key->ipv6_hlimit; + + return 0; +} + /* Must follow make_writable() since that can move the skb data. 
*/ static void set_tp_port(struct sk_buff *skb, __be16 *port, __be16 new_port, __sum16 *check) @@ -347,6 +436,10 @@ static int execute_set_action(struct sk_buff *skb, err = set_ipv4(skb, nla_data(nested_attr)); break; + case OVS_KEY_ATTR_IPV6: + err = set_ipv6(skb, nla_data(nested_attr)); + break; + case OVS_KEY_ATTR_TCP: err = set_tcp(skb, nla_data(nested_attr)); break; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 4c4b62c..fd4a6a4 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -479,6 +479,7 @@ static int validate_set(const struct nlattr *a, switch (key_type) { const struct ovs_key_ipv4 *ipv4_key; + const struct ovs_key_ipv6 *ipv6_key; case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_ETHERNET: @@ -500,6 +501,25 @@ static int validate_set(const struct nlattr *a, break; + case OVS_KEY_ATTR_IPV6: + if (flow_key->eth.type != htons(ETH_P_IPV6)) + return -EINVAL; + + if (!flow_key->ip.proto) + return -EINVAL; + + ipv6_key = nla_data(ovs_key); + if (ipv6_key->ipv6_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv6_key->ipv6_frag != flow_key->ip.frag) + return -EINVAL; + + if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000) + return -EINVAL; + + break; + case OVS_KEY_ATTR_TCP: if (flow_key->ip.proto != IPPROTO_TCP) return -EINVAL;