diff mbox

[ovs-dev,v2,03/16] datapath: compat for NAT.

Message ID 1465943137-16856-4-git-send-email-jarno@ovn.org
State Superseded
Headers show

Commit Message

Jarno Rajahalme June 14, 2016, 10:25 p.m. UTC
Compat code required to make the NAT code in the following patch
compile with Linux 3.10 - 4.3.

Signed-off-by: Jarno Rajahalme <jarno@ovn.org>
---
 acinclude.m4                                       |  3 +
 datapath/linux/Modules.mk                          |  4 +
 .../linux/compat/include/linux/netfilter/nf_nat.h  | 15 ++++
 .../include/net/netfilter/nf_conntrack_core.h      | 28 ++++++-
 .../include/net/netfilter/nf_conntrack_seqadj.h    | 30 ++++++++
 .../linux/compat/include/net/netfilter/nf_nat.h    | 44 +++++++++++
 .../compat/include/net/netfilter/nf_nat_core.h     | 88 ++++++++++++++++++++++
 7 files changed, 211 insertions(+), 1 deletion(-)
 create mode 100644 datapath/linux/compat/include/linux/netfilter/nf_nat.h
 create mode 100644 datapath/linux/compat/include/net/netfilter/nf_conntrack_seqadj.h
 create mode 100644 datapath/linux/compat/include/net/netfilter/nf_nat.h
 create mode 100644 datapath/linux/compat/include/net/netfilter/nf_nat_core.h

Comments

Jesse Gross June 16, 2016, 11:39 p.m. UTC | #1
On Tue, Jun 14, 2016 at 3:25 PM, Jarno Rajahalme <jarno@ovn.org> wrote:
> diff --git a/datapath/linux/compat/include/linux/netfilter/nf_nat.h b/datapath/linux/compat/include/linux/netfilter/nf_nat.h
> new file mode 100644
> index 0000000..210b9a7
> --- /dev/null
> +++ b/datapath/linux/compat/include/linux/netfilter/nf_nat.h
> @@ -0,0 +1,15 @@
> +#ifndef _LINUX_NF_NAT_WRAPPER_H
> +#define _LINUX_NF_NAT_WRAPPER_H
> +
> +#include_next <linux/netfilter/nf_nat.h>
> +
> +/* Linux kernel 3.13 and older do not have NF_NAT_RANGE_PROTO_RANDOM_FULLY
> + * (unless backported by the distribution), but we fake it to maintain OVS API
> + * compatibility.  In this case NAT port allocation may happen sequentially
> + * instead.
> + */
> +#ifndef NF_NAT_RANGE_PROTO_RANDOM_FULLY
> +#define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4)
> +#endif

I think that on kernels where this isn't defined this value will
likely be meaningless and ignored, right? If that's true then I think
0 would be a better choice to avoid the possibility of collision with
another backport. However, would it make sense to just define this to
NF_NAT_RANGE_PROTO_RANDOM to be a closer approximation?

> diff --git a/datapath/linux/compat/include/net/netfilter/nf_nat_core.h b/datapath/linux/compat/include/net/netfilter/nf_nat_core.h
> new file mode 100644
> index 0000000..6b17ab7
> --- /dev/null
> +++ b/datapath/linux/compat/include/net/netfilter/nf_nat_core.h
> @@ -0,0 +1,88 @@
> +#ifndef _NF_NAT_CORE_WRAPPER_H
> +#define _NF_NAT_CORE_WRAPPER_H
> +
> +#include_next <net/netfilter/nf_nat_core.h>
> +
> +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0)
> +
> +/* Linux 4.6 and newer do not depend on skb_dst being set, so this workaround
> + * is not needed there.
> + */
> +static inline unsigned int
> +rpl_nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
> +                 unsigned int hooknum, struct sk_buff *skb)
> +{
> +       /* Change skb to CHECKSUM_PARTIAL to avoid running code that
> +        * expects skb_dst being set.
> +        */

What code triggers the problem? This code kind of scares me, it seems
like it is easy to miss things. (What about protocols for which there
is no checksum offload? I suppose many of them aren't possible to NAT
but I think there might be some.)

> +       if (skb->ip_summed != CHECKSUM_PARTIAL) {
> +               switch (skb->protocol) {
> +               case ETH_P_IP:

There's a byte order issue here: skb->protocol is in network byte
order but the ETH_P_* constants are in host byte order.
Jarno Rajahalme June 18, 2016, 1:26 a.m. UTC | #2
> On Jun 16, 2016, at 4:39 PM, Jesse Gross <jesse@kernel.org> wrote:
> 
> On Tue, Jun 14, 2016 at 3:25 PM, Jarno Rajahalme <jarno@ovn.org> wrote:
>> diff --git a/datapath/linux/compat/include/linux/netfilter/nf_nat.h b/datapath/linux/compat/include/linux/netfilter/nf_nat.h
>> new file mode 100644
>> index 0000000..210b9a7
>> --- /dev/null
>> +++ b/datapath/linux/compat/include/linux/netfilter/nf_nat.h
>> @@ -0,0 +1,15 @@
>> +#ifndef _LINUX_NF_NAT_WRAPPER_H
>> +#define _LINUX_NF_NAT_WRAPPER_H
>> +
>> +#include_next <linux/netfilter/nf_nat.h>
>> +
>> +/* Linux kernel 3.13 and older do not have NF_NAT_RANGE_PROTO_RANDOM_FULLY
>> + * (unless backported by the distribution), but we fake it to maintain OVS API
>> + * compatibility.  In this case NAT port allocation may happen sequentially
>> + * instead.
>> + */
>> +#ifndef NF_NAT_RANGE_PROTO_RANDOM_FULLY
>> +#define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4)
>> +#endif
> 
> I think that on kernels where this isn't defined this value will
> likely be meaningless and ignored, right? If that's true then I think
> 0 would be a better choice to avoid the possibility of collision with
> another backport. However, would it make sense to just define this to
> NF_NAT_RANGE_PROTO_RANDOM to be a closer approximation?
> 

This was difficult without touching conntrack.c itself, but you are right, PROTO_RANDOM is a better approximation. I added the needed compat code to conntrack.c itself for v3.

>> diff --git a/datapath/linux/compat/include/net/netfilter/nf_nat_core.h b/datapath/linux/compat/include/net/netfilter/nf_nat_core.h
>> new file mode 100644
>> index 0000000..6b17ab7
>> --- /dev/null
>> +++ b/datapath/linux/compat/include/net/netfilter/nf_nat_core.h
>> @@ -0,0 +1,88 @@
>> +#ifndef _NF_NAT_CORE_WRAPPER_H
>> +#define _NF_NAT_CORE_WRAPPER_H
>> +
>> +#include_next <net/netfilter/nf_nat_core.h>
>> +
>> +#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0)
>> +
>> +/* Linux 4.6 and newer do not depend on skb_dst being set, so this workaround
>> + * is not needed there.
>> + */
>> +static inline unsigned int
>> +rpl_nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
>> +                 unsigned int hooknum, struct sk_buff *skb)
>> +{
>> +       /* Change skb to CHECKSUM_PARTIAL to avoid running code that
>> +        * expects skb_dst being set.
>> +        */
> 
> What code triggers the problem? This code kind of scares me, it seems
> like it is easy to miss things. (What about protocols for which there
> is no checksum offload? I suppose many of them aren't possible to NAT
> but I think there might be some.)

It is the checksum recalculation code for IPv4 and IPv6 that is called from NAT helper after a TCP or UDP packet payload has been mangled. As such, a better location for this is right before the helper call. Unfortunately the helper is called via a function pointer, so a simple backport in the compat directory does not work. Therefore I added this piece to conntrack.c itself, which also allowed removing code duplication for finding the transport header for IPv6.

> 
>> +       if (skb->ip_summed != CHECKSUM_PARTIAL) {
>> +               switch (skb->protocol) {
>> +               case ETH_P_IP:
> 
> There's a byte order issue here: skb->protocol is in network byte
> order but the ETH_P_* constants are in host byte order.

Oops. This is now avoided by sharing code that was already in conntrack.c.

These conntrack.c changes are now new, separate patches that follow the main NAT backport in v3, which I'll post in a moment.

  Jarno
diff mbox

Patch

diff --git a/acinclude.m4 b/acinclude.m4
index 6871ba6..3fd2e93 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -472,6 +472,9 @@  AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [
                   [nf_ct_frag6_consume_orig])
   OVS_GREP_IFELSE([$KSRC/include/net/netfilter/ipv6/nf_defrag_ipv6.h],
                   [nf_ct_frag6_output])
+  OVS_GREP_IFELSE([$KSRC/include/net/netfilter/nf_nat.h], [nf_ct_nat_ext_add])
+  OVS_GREP_IFELSE([$KSRC/include/net/netfilter/nf_nat.h], [nf_nat_alloc_null_binding])
+  OVS_GREP_IFELSE([$KSRC/include/net/netfilter/nf_conntrack_seqadj.h], [nf_ct_seq_adjust])
 
   OVS_GREP_IFELSE([$KSRC/include/linux/random.h], [prandom_u32])
   OVS_GREP_IFELSE([$KSRC/include/linux/random.h], [prandom_u32_max])
diff --git a/datapath/linux/Modules.mk b/datapath/linux/Modules.mk
index 8b65b71..55e85c4 100644
--- a/datapath/linux/Modules.mk
+++ b/datapath/linux/Modules.mk
@@ -55,6 +55,7 @@  openvswitch_headers += \
 	linux/compat/include/linux/netdevice.h \
 	linux/compat/include/linux/netdev_features.h \
 	linux/compat/include/linux/netfilter_ipv6.h \
+	linux/compat/include/linux/netfilter/nf_nat.h \
 	linux/compat/include/linux/netlink.h \
 	linux/compat/include/linux/openvswitch.h \
 	linux/compat/include/linux/poison.h \
@@ -97,7 +98,10 @@  openvswitch_headers += \
 	linux/compat/include/net/netfilter/nf_conntrack_core.h \
 	linux/compat/include/net/netfilter/nf_conntrack_expect.h \
 	linux/compat/include/net/netfilter/nf_conntrack_labels.h \
+	linux/compat/include/net/netfilter/nf_conntrack_seqadj.h \
 	linux/compat/include/net/netfilter/nf_conntrack_zones.h \
+	linux/compat/include/net/netfilter/nf_nat.h \
+	linux/compat/include/net/netfilter/nf_nat_core.h \
 	linux/compat/include/net/netfilter/ipv6/nf_defrag_ipv6.h \
 	linux/compat/include/net/sctp/checksum.h
 EXTRA_DIST += linux/compat/build-aux/export-check-whitelist
diff --git a/datapath/linux/compat/include/linux/netfilter/nf_nat.h b/datapath/linux/compat/include/linux/netfilter/nf_nat.h
new file mode 100644
index 0000000..210b9a7
--- /dev/null
+++ b/datapath/linux/compat/include/linux/netfilter/nf_nat.h
@@ -0,0 +1,15 @@ 
+#ifndef _LINUX_NF_NAT_WRAPPER_H
+#define _LINUX_NF_NAT_WRAPPER_H
+
+#include_next <linux/netfilter/nf_nat.h>
+
+/* Linux kernel 3.13 and older do not have NF_NAT_RANGE_PROTO_RANDOM_FULLY
+ * (unless backported by the distribution), but we fake it to maintain OVS API
+ * compatibility.  In this case NAT port allocation may happen sequentially
+ * instead.
+ */
+#ifndef NF_NAT_RANGE_PROTO_RANDOM_FULLY
+#define NF_NAT_RANGE_PROTO_RANDOM_FULLY (1 << 4)
+#endif
+
+#endif /* _LINUX_NF_NAT_WRAPPER_H */
diff --git a/datapath/linux/compat/include/net/netfilter/nf_conntrack_core.h b/datapath/linux/compat/include/net/netfilter/nf_conntrack_core.h
index faa219a..09a53c3 100644
--- a/datapath/linux/compat/include/net/netfilter/nf_conntrack_core.h
+++ b/datapath/linux/compat/include/net/netfilter/nf_conntrack_core.h
@@ -40,5 +40,31 @@  static void rpl_nf_ct_tmpl_free(struct nf_conn *tmpl)
 	kfree(tmpl);
 }
 #define nf_ct_tmpl_free rpl_nf_ct_tmpl_free
-#endif /* HAVE_NF_CT_TMPL_ALLOC */
+
+static inline struct nf_conntrack_tuple_hash *
+rpl_nf_conntrack_find_get(struct net *net,
+			  const struct nf_conntrack_zone *zone,
+			  const struct nf_conntrack_tuple *tuple)
+{
+	return nf_conntrack_find_get(net, zone->id, tuple);
+}
+#define nf_conntrack_find_get rpl_nf_conntrack_find_get
+#endif /* HAVE_NF_CT_TMPL_ALLOC_TAKES_STRUCT_ZONE */
+
+#ifndef HAVE_NF_CT_GET_TUPLEPR_TAKES_STRUCT_NET
+static inline bool rpl_nf_ct_get_tuple(const struct sk_buff *skb,
+				       unsigned int nhoff,
+				       unsigned int dataoff, u_int16_t l3num,
+				       u_int8_t protonum,
+				       struct net *net,
+				       struct nf_conntrack_tuple *tuple,
+				       const struct nf_conntrack_l3proto *l3proto,
+				       const struct nf_conntrack_l4proto *l4proto)
+{
+	return nf_ct_get_tuple(skb, nhoff, dataoff, l3num, protonum, tuple,
+			       l3proto, l4proto);
+}
+#define nf_ct_get_tuple rpl_nf_ct_get_tuple
+#endif /* HAVE_NF_CT_GET_TUPLEPR_TAKES_STRUCT_NET */
+
 #endif /* _NF_CONNTRACK_CORE_WRAPPER_H */
diff --git a/datapath/linux/compat/include/net/netfilter/nf_conntrack_seqadj.h b/datapath/linux/compat/include/net/netfilter/nf_conntrack_seqadj.h
new file mode 100644
index 0000000..b11d1a5
--- /dev/null
+++ b/datapath/linux/compat/include/net/netfilter/nf_conntrack_seqadj.h
@@ -0,0 +1,30 @@ 
+#ifndef _NF_CONNTRACK_SEQADJ_WRAPPER_H
+#define _NF_CONNTRACK_SEQADJ_WRAPPER_H
+
+#ifdef HAVE_NF_CT_SEQ_ADJUST
+#include_next <net/netfilter/nf_conntrack_seqadj.h>
+#else
+
+#include <net/netfilter/nf_nat_helper.h>
+
+/* TCP sequence number adjustment.  Returns 1 on success, 0 on failure */
+static inline int
+nf_ct_seq_adjust(struct sk_buff *skb,
+		 struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+		 unsigned int protoff)
+{
+	typeof(nf_nat_seq_adjust_hook) seq_adjust;
+
+	seq_adjust = rcu_dereference(nf_nat_seq_adjust_hook);
+	if (!seq_adjust ||
+	    !seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
+		NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+		return 0;
+	}
+
+	return 1;
+}
+
+#endif /* HAVE_NF_CT_SEQ_ADJUST */
+
+#endif /* _NF_CONNTRACK_SEQADJ_WRAPPER_H */
diff --git a/datapath/linux/compat/include/net/netfilter/nf_nat.h b/datapath/linux/compat/include/net/netfilter/nf_nat.h
new file mode 100644
index 0000000..773e569
--- /dev/null
+++ b/datapath/linux/compat/include/net/netfilter/nf_nat.h
@@ -0,0 +1,44 @@ 
+#ifndef _NF_NAT_WRAPPER_H
+#define _NF_NAT_WRAPPER_H
+
+#include_next <net/netfilter/nf_nat.h>
+
+#ifndef HAVE_NF_CT_NAT_EXT_ADD
+
+static inline struct nf_conn_nat *
+nf_ct_nat_ext_add(struct nf_conn *ct)
+{
+	struct nf_conn_nat *nat = nfct_nat(ct);
+	if (nat)
+		return nat;
+
+	if (!nf_ct_is_confirmed(ct))
+		nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
+
+	return nat;
+}
+#endif /* HAVE_NF_CT_NAT_EXT_ADD */
+
+#ifndef HAVE_NF_NAT_ALLOC_NULL_BINDING
+static inline unsigned int
+nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
+{
+	/* Force range to this IP; let proto decide mapping for
+	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
+	 * Use reply in case it's already been mangled (eg local packet).
+	 */
+	union nf_inet_addr ip =
+		(HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
+		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
+		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
+	struct nf_nat_range range = {
+		.flags		= NF_NAT_RANGE_MAP_IPS,
+		.min_addr	= ip,
+		.max_addr	= ip,
+	};
+	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
+}
+
+#endif /* HAVE_NF_NAT_ALLOC_NULL_BINDING */
+
+#endif /* _NF_NAT_WRAPPER_H */
diff --git a/datapath/linux/compat/include/net/netfilter/nf_nat_core.h b/datapath/linux/compat/include/net/netfilter/nf_nat_core.h
new file mode 100644
index 0000000..6b17ab7
--- /dev/null
+++ b/datapath/linux/compat/include/net/netfilter/nf_nat_core.h
@@ -0,0 +1,88 @@ 
+#ifndef _NF_NAT_CORE_WRAPPER_H
+#define _NF_NAT_CORE_WRAPPER_H
+
+#include_next <net/netfilter/nf_nat_core.h>
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0)
+
+/* Linux 4.6 and newer do not depend on skb_dst being set, so this workaround
+ * is not needed there.
+ */
+static inline unsigned int
+rpl_nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
+		  unsigned int hooknum, struct sk_buff *skb)
+{
+	/* Change skb to CHECKSUM_PARTIAL to avoid running code that
+	 * expects skb_dst being set.
+	 */
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		switch (skb->protocol) {
+		case ETH_P_IP:
+			switch (ip_hdr(skb)->protocol) {
+			case IPPROTO_TCP:
+				skb->csum_offset = offsetof(struct tcphdr,
+							    check);
+				break;
+			case IPPROTO_UDP:
+			case IPPROTO_UDPLITE:
+				/* Skip if no csum. */
+				if (!udp_hdr(skb)->check)
+					goto out;
+				skb->csum_offset = offsetof(struct udphdr,
+							    check);
+				break;
+			case IPPROTO_SCTP:
+				skb->csum_offset = offsetof(struct sctphdr,
+							    checksum);
+				break;
+			default:
+				goto out;
+			}
+			break;
+		case ETH_P_IPV6: {
+			struct ipv6hdr *nh = ipv6_hdr(skb);
+			u8 nexthdr = nh->nexthdr;
+			int payload_ofs = (u8 *)(nh + 1) - skb->data;
+			__be16 frag_off;
+
+			payload_ofs = ipv6_skip_exthdr(skb, payload_ofs,
+						       &nexthdr, &frag_off);
+			if (unlikely(payload_ofs < 0))
+				goto out;
+
+			switch (nexthdr) {
+			case NEXTHDR_TCP:
+				skb->csum_offset = offsetof(struct tcphdr,
+							    check);
+				break;
+			case NEXTHDR_UDP:
+				/* Skip if no csum. */
+				if (!udp_hdr(skb)->check)
+					goto out;
+				skb->csum_offset = offsetof(struct udphdr,
+							    check);
+				break;
+			case NEXTHDR_SCTP:
+				skb->csum_offset = offsetof(struct sctphdr,
+							    checksum);
+				break;
+			default:
+				goto out;
+			}
+			break;
+		}
+		default:
+			goto out;
+		}
+
+		skb->ip_summed = CHECKSUM_PARTIAL;
+		skb->csum_start = skb_headroom(skb) + skb_transport_offset(skb);
+	}
+ out:
+	return nf_nat_packet(ct, ctinfo, hooknum, skb);
+}
+#define nf_nat_packet rpl_nf_nat_packet
+
+#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) */
+
+#endif /* _NF_NAT_CORE_WRAPPER_H */