diff mbox

[1/1] bridge: forward IPv6 fragmented packets when passing netfilter

Message ID 1421628209-5064-1-git-send-email-bernhard.thaler@wvnet.at
State Superseded
Delegated to: Pablo Neira
Headers show

Commit Message

Bernhard Thaler Jan. 19, 2015, 12:43 a.m. UTC
Currently IPv6 fragmented packets are not forwarded on an ethernet bridge
with netfilter ip6_tables loaded. e.g. steps to reproduce

1) create a simple bridge like this

	modprobe br_netfilter
	brctl addbr br0
	brctl addif br0 eth0
	brctl addif br0 eth2
	ifconfig eth0 up
	ifconfig eth2 up
	ifconfig br0 up

2) place a host with an IPv6 address on each side of the bridge

	set IPv6 address on host A:
	ip -6 addr add fd01:2345:6789:1::1/64 dev eth0

	set IPv6 address on host B:
	ip -6 addr add fd01:2345:6789:1::2/64 dev eth0

3) run a simple ping command on host A with packets > MTU

	ping6 -s 4000 fd01:2345:6789:1::2

4) wait some time and run e.g. "ip6tables -t nat -nvL" on the bridge

IPv6 fragmented packets traverse the bridge cleanly until "iptables -t nat -nvL"
is run. As soon as it is run (and netfilter modules are loaded) IPv6 fragmented
packets do not traverse the bridge any more (you see no more responses in ping's
output).

Patch exports ip6_fragment() in include/net/ipv6.h and net/ipv6/ip6_output.c
to use it in net/bridge/br_netfilter.c's br_nf_dev_queue_xmit() for IPv6 packets
that need to be fragmented.

In net/bridge/br_netfilter.c br_nf_pre_routing_finish_ipv6() is changed to keep
track of fragment size and br_nf_dev_queue_xmit() checks for IPv6 packets that
need to be fragmented. br_parse_ip6_options() was introduced to do some validity
checks on the IPv6 packet before calling ip6_fragment() and was closely aligned
to IPv4 code as an example.
br_nf_dev_queue_xmit() is changed to contain the relevant code depending on 
CONFIG_NF_DEFRAG_IPV4 or CONFIG_NF_DEFRAG_IPV6 being used both or each for
their own.

ip6_fragment() in net/ipv6/ip6_output.c was changed due to a NULL pointer de-
reference happening when handling packets coming from br_nf_dev_queue_xmit().
When calling IP6_INC_STATS(), ip6_dst_idev(skb_dst(skb)) did crash the kernel
like this:

BUG: unable to handle kernel NULL pointer dereference at 000000000000037a
IP: [<ffffffff814cba2a>] ip6_fragment+0x99a/0x1290
PGD 3bc3f067 PUD 3bc12067 PMD 0 
Oops: 0000 [#1] SMP  
...

So in6_dev_get(skb->dev) is used to set a variable "idev" which is used to call
IP6_INC_STATS() later on. It is assumed that this also solves other occasions
where ip6_fragment() will be called that may cause the same crash. However,
a better fix would be to check for the missing element causing the NULL pointer
dereference and only setting it when it is missing.

ip6_fragment() is further changed to use nf_bridge_mtu_reduction(skb) as it is
done in the IPv4 code.

After applying this patch, in the same setup as stated above fragmented IPv6
packets traverse the bridge even after running "ip6tables -t nat -nvL" / net-
filter modules loaded. This was tested using overlong IPv6 ICMP ping packets
as described above.
Some tests were performed crafting invalid headers (e.g. ipv6 version field
set to 5) to test checks in br_parse_ip6_options(), however these packets do
not seem to reach changed code parts and seem to be dropped at an earlier
stage (more testing needed with invalid / fuzzed packets).

Signed-off-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
---
 include/net/ipv6.h        |    1 +
 net/bridge/br_netfilter.c |   87 ++++++++++++++++++++++++++++++++++++++++-----
 net/ipv6/ip6_output.c     |   30 ++++++++--------
 3 files changed, 94 insertions(+), 24 deletions(-)

Comments

Pablo Neira Ayuso Jan. 20, 2015, 5:28 p.m. UTC | #1
On Mon, Jan 19, 2015 at 01:43:29AM +0100, Bernhard Thaler wrote:
> ip6_fragment() in net/ipv6/ip6_output.c was changed due to a NULL pointer de-
> reference happening when handling packets coming from br_nf_dev_queue_xmit().
> When calling IP6_INC_STATS(), ip6_dst_idev(skb_dst(skb)) did crash the kernel
> like this:
> 
> BUG: unable to handle kernel NULL pointer dereference at 000000000000037a
> IP: [<ffffffff814cba2a>] ip6_fragment+0x99a/0x1290
> PGD 3bc3f067 PUD 3bc12067 PMD 0 
> Oops: 0000 [#1] SMP  
> ...
> 
> So in6_dev_get(skb->dev) is used to set a variable "idev" which is used to call
> IP6_INC_STATS() later on. It is assumed that this also solves other occasions
> where ip6_fragment() will be called that may cause the same crash. However,
> a better fix would be to check for the missing element causing the NULL pointer
> dereference and only setting it when it is missing.

IP6_INC_STATS() handles null idev pointers. I suspect the struct
fake_rtable in struct net_bridge (see net/bridge/br_private.h) needs
to be converted to something like:

        union {
                struct rtable   fake_rtable;
                struct rt6_info fake_rt6_info;
        };

just to allocate enough room for it.

> ip6_fragment() is further changed to use nf_bridge_mtu_reduction(skb) as it is
> done in the IPv4 code.

This specific change looks the same to what we have in IPv4, so no
objections.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 4292929..aecbead 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -495,6 +495,7 @@  struct ip6_create_arg {
 
 void ip6_frag_init(struct inet_frag_queue *q, const void *a);
 bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
+int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
 
 /*
  *	Equivalent of ipv4 struct ip
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index c190d22..fac485e 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -18,6 +18,7 @@ 
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/if_arp.h>
@@ -34,6 +35,7 @@ 
 
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/addrconf.h>
 #include <net/route.h>
 #include <net/netfilter/br_netfilter.h>
 
@@ -239,6 +241,59 @@  drop:
 	return -1;
 }
 
+/* Equivalent to br_parse_ip_options for IPv6 */
+
+static int br_parse_ip6_options(struct sk_buff *skb)
+{
+	const struct ipv6hdr *ip6h;
+	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = in6_dev_get(skb->dev);
+	u32 len;
+	u8 ip6h_len = sizeof(struct ipv6hdr);
+
+	if (!pskb_may_pull(skb, ip6h_len))
+		goto inhdr_error;
+
+	ip6h = ipv6_hdr(skb);
+
+	/* Basic sanity checks
+	 * check version
+	 * check minimum header length (40 bytes)
+	 * check total length
+	 */
+	if (ip6h->version != 6)
+		goto inhdr_error;
+
+	if (!pskb_may_pull(skb, ip6h_len))
+		goto inhdr_error;
+
+	len = ntohs(ip6h->payload_len) + ip6h_len;
+
+	if (skb->len < len) {
+		IP6_INC_STATS_BH(dev_net(dev), idev,
+				 IPSTATS_MIB_INTRUNCATEDPKTS);
+		goto drop;
+	} else if (len < ip6h_len) {
+		goto inhdr_error;
+	}
+
+	if (pskb_trim_rcsum(skb, len)) {
+		IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+
+	memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+	/* No IP options in IPv6 header; however it should be
+	 * checked if some next headers need special treatment
+	 */
+	return 0;
+
+inhdr_error:
+	IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
 /* PF_BRIDGE/PRE_ROUTING *********************************************/
 /* Undo the changes made for ip6tables PREROUTING and continue the
  * bridge PRE_ROUTING hook. */
@@ -246,6 +301,10 @@  static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 {
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 	struct rtable *rt;
+	int frag_max_size;
+
+	frag_max_size = IP6CB(skb)->frag_max_size;
+	BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
 
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
@@ -763,7 +822,6 @@  static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
 	return NF_STOLEN;
 }
 
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
 	int ret;
@@ -772,6 +830,7 @@  static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 	/* This is wrong! We should preserve the original fragment
 	 * boundaries by preserving frag_list rather than refragmenting.
 	 */
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 	if (skb->protocol == htons(ETH_P_IP) &&
 	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
 	    !skb_is_gso(skb)) {
@@ -781,17 +840,27 @@  static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 			return NF_DROP;
 		IPCB(skb)->frag_max_size = frag_max_size;
 		ret = ip_fragment(skb, br_dev_queue_push_xmit);
-	} else
-		ret = br_dev_queue_push_xmit(skb);
+		return ret;
+	}
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	if (skb->protocol == htons(ETH_P_IPV6) &&
+	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
+	    !skb_is_gso(skb)) {
+		frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+		if (br_parse_ip6_options(skb))
+			/* Drop invalid packet */
+			return NF_DROP;
+		IP6CB(skb)->frag_max_size = frag_max_size;
+		ret = ip6_fragment(skb, br_dev_queue_push_xmit);
+		return ret;
+	}
+#endif
+
+	ret = br_dev_queue_push_xmit(skb);
 
 	return ret;
 }
-#else
-static int br_nf_dev_queue_xmit(struct sk_buff *skb)
-{
-        return br_dev_queue_push_xmit(skb);
-}
-#endif
 
 /* PF_BRIDGE/POST_ROUTING ********************************************/
 static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ce69a12..f34cbb4 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -41,6 +41,7 @@ 
 
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
 
 #include <net/sock.h>
 #include <net/snmp.h>
@@ -564,6 +565,7 @@  int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	int ptr, offset = 0, err = 0;
 	u8 *prevhdr, nexthdr = 0;
 	struct net *net = dev_net(skb_dst(skb)->dev);
+	struct inet6_dev *idev = in6_dev_get(skb->dev);
 
 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
 	nexthdr = *prevhdr;
@@ -581,8 +583,7 @@  int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 		skb->dev = skb_dst(skb)->dev;
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-			      IPSTATS_MIB_FRAGFAILS);
+		IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 		kfree_skb(skb);
 		return -EMSGSIZE;
 	}
@@ -592,6 +593,10 @@  int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 			mtu = np->frag_size;
 	}
 	mtu -= hlen + sizeof(struct frag_hdr);
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+	if (skb->nf_bridge)
+		mtu -= nf_bridge_mtu_reduction(skb);
+#endif
 
 	if (skb_has_frag_list(skb)) {
 		int first_len = skb_pagelen(skb);
@@ -630,8 +635,7 @@  int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		*prevhdr = NEXTHDR_FRAGMENT;
 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 		if (!tmp_hdr) {
-			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-				      IPSTATS_MIB_FRAGFAILS);
+			IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 			return -ENOMEM;
 		}
 
@@ -681,7 +685,7 @@  int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 			err = output(skb);
 			if (!err)
-				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
+				IP6_INC_STATS(net, idev,
 					      IPSTATS_MIB_FRAGCREATES);
 
 			if (err || !frag)
@@ -695,16 +699,14 @@  int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		kfree(tmp_hdr);
 
 		if (err == 0) {
-			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
-				      IPSTATS_MIB_FRAGOKS);
+			IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGOKS);
 			ip6_rt_put(rt);
 			return 0;
 		}
 
 		kfree_skb_list(frag);
 
-		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
-			      IPSTATS_MIB_FRAGFAILS);
+		IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 		ip6_rt_put(rt);
 		return err;
 
@@ -752,8 +754,7 @@  slow_path:
 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 				 hroom + troom, GFP_ATOMIC);
 		if (!frag) {
-			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-				      IPSTATS_MIB_FRAGFAILS);
+			IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 			err = -ENOMEM;
 			goto fail;
 		}
@@ -819,17 +820,16 @@  slow_path:
 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 			      IPSTATS_MIB_FRAGCREATES);
 	}
-	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-		      IPSTATS_MIB_FRAGOKS);
+	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGOKS);
 	consume_skb(skb);
 	return err;
 
 fail:
-	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-		      IPSTATS_MIB_FRAGFAILS);
+	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 	kfree_skb(skb);
 	return err;
 }
+EXPORT_SYMBOL(ip6_fragment);
 
 static inline int ip6_rt_check(const struct rt6key *rt_key,
 			       const struct in6_addr *fl_addr,