diff mbox

[05/19] netfilter: nf_conntrack_ipv6: improve fragmentation handling

Message ID 1344542943-11588-6-git-send-email-kaber@trash.net
State Not Applicable, archived
Delegated to: David Miller
Headers show

Commit Message

Patrick McHardy Aug. 9, 2012, 8:08 p.m. UTC
From: Patrick McHardy <kaber@trash.net>

The IPv6 conntrack fragmentation currently has a couple of shortcomings.
Fragments are collected in PREROUTING/OUTPUT, are defragmented, the
defragmented packet is then passed to conntrack, the resulting conntrack
information is attached to each original fragment and the fragments then
continue their way through the stack.

Helper invocation occurs in the POSTROUTING hook, at which point only
the original fragments are available. The result of this is that
fragmented packets are never passed to helpers.

This patch improves the situation in the following way:

- If a reassembled packet belongs to a connection that has a helper
  assigned, the reassembled packet is passed through the stack instead
  of the original fragments.

- During defragmentation, the largest received fragment size is stored.
  On output, the packet is refragmented if required. If the largest
  received fragment size exceeds the outgoing MTU, a "packet too big"
  message is generated, thus behaving as if the original fragments
  were passed through the stack from an outside point of view.

- The ipv6_helper() hook function can't receive fragments anymore for
  connections using a helper, so it is switched to use ipv6_skip_exthdr()
  instead of the netfilter specific nf_ct_ipv6_skip_exthdr() and the
  reassembled packets are passed to connection tracking helpers.

The result of this is that we can properly track fragmented packets, but
still generate ICMPv6 Packet too big messages if we would have before.

This patch is also required as a precondition for IPv6 NAT, where NAT
helpers might enlarge packets up to a point that they require
fragmentation. In that case we can't generate Packet too big messages
since the proper MTU can't be calculated in all cases (f.i. when
changing textual representation of a variable amount of addresses),
so the packet is transparently fragmented iff the original packet or
fragments would have fit the outgoing MTU.

Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/linux/ipv6.h                           |    1 +
 net/ipv6/ip6_output.c                          |    7 +++-
 net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |   37 ++++++++++++++++++------
 net/ipv6/netfilter/nf_conntrack_reasm.c        |   19 ++++++++++--
 4 files changed, 50 insertions(+), 14 deletions(-)

Comments

Jesper Dangaard Brouer Aug. 17, 2012, 8:06 a.m. UTC | #1
On Thu, 2012-08-09 at 22:08 +0200, kaber@trash.net wrote:
> From: Patrick McHardy <kaber@trash.net>
> 
> The IPv6 conntrack fragmentation currently has a couple of shortcomings.
> Fragmentes are collected in PREROUTING/OUTPUT, are defragmented, the
> defragmented packet is then passed to conntrack, the resulting conntrack
> information is attached to each original fragment and the fragments then
> continue their way through the stack.
> 
> Helper invocation occurs in the POSTROUTING hook, at which point only
> the original fragments are available. The result of this is that
> fragmented packets are never passed to helpers.
> 
> This patch improves the situation in the following way:
> 
> - If a reassembled packet belongs to a connection that has a helper
>   assigned, the reassembled packet is passed through the stack instead
>   of the original fragments.

I'm working on IPv6 fragment handling for IPVS, and am taking advantage
of the "replay" by nf_ct_frag6_output() at hook prio -399
(NF_IP6_PRI_CONNTRACK_DEFRAG + 1), by registering a hook at
NF_INET_PRE_ROUTING at prio -99 (NF_IP6_PRI_NAT_DST + 1).

I can see that the code path can be changed (with this patch), if a
helper is assigned.  Then the "replay" starts at prio -199
(NF_IP6_PRI_CONNTRACK + 1), I guess I'm safe as I run at -99.

I have tested that your patchset works with my ipvs patches, but would
like to trigger the changed code path, to make sure.

Could you provide an iptables command/rule that triggers this code path?


[cut]

> @@ -199,9 +200,13 @@ static unsigned int ipv6_confirm(unsigned int hooknum,
>  static unsigned int __ipv6_conntrack_in(struct net *net,
>  					unsigned int hooknum,
>  					struct sk_buff *skb,
> +					const struct net_device *in,
> +					const struct net_device *out,
>  					int (*okfn)(struct sk_buff *))
>  {
>  	struct sk_buff *reasm = skb->nfct_reasm;
> +	struct nf_conn *ct;
> +	enum ip_conntrack_info ctinfo;
>  
>  	/* This packet is fragmented and has reassembled packet. */
>  	if (reasm) {
> @@ -213,6 +218,20 @@ static unsigned int __ipv6_conntrack_in(struct net *net,
>  			if (ret != NF_ACCEPT)
>  				return ret;
>  		}
> +
> +		/* Conntrack helpers need the entire reassembled packet in the
> +		 * POST_ROUTING hook.
> +		 */
> +		ct = nf_ct_get(reasm, &ctinfo);
> +		if (ct != NULL && test_bit(IPS_HELPER_BIT, &ct->status)) {
> +			nf_conntrack_get_reasm(skb);
> +			NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, reasm,
> +				       (struct net_device *)in,
> +				       (struct net_device *)out,
> +				       okfn, NF_IP6_PRI_CONNTRACK + 1);

Hook prio change to NF_IP6_PRI_CONNTRACK + 1


> +			return NF_DROP_ERR(-ECANCELED);
> +		}

[cut]

> @@ -592,6 +599,7 @@ void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
>  			int (*okfn)(struct sk_buff *))
>  {
>  	struct sk_buff *s, *s2;
> +	unsigned int ret = 0;
>  
>  	for (s = NFCT_FRAG6_CB(skb)->orig; s;) {
>  		nf_conntrack_put_reasm(s->nfct_reasm);
> @@ -601,8 +609,13 @@ void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
>  		s2 = s->next;
>  		s->next = NULL;
>  
> -		NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, s, in, out, okfn,
> -			       NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
> +		if (ret != -ECANCELED)
> +			ret = NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, s,
> +					     in, out, okfn,
> +					     NF_IP6_PRI_CONNTRACK_DEFRAG + 1);

Old hook prio

> +		else
> +			kfree_skb(s);
> +
>  		s = s2;
>  	}
>  	nf_conntrack_put_reasm(skb);


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso Aug. 17, 2012, 1:36 p.m. UTC | #2
On Thu, Aug 09, 2012 at 10:08:49PM +0200, kaber@trash.net wrote:
> From: Patrick McHardy <kaber@trash.net>
> 
> The IPv6 conntrack fragmentation currently has a couple of shortcomings.
> Fragmentes are collected in PREROUTING/OUTPUT, are defragmented, the
> defragmented packet is then passed to conntrack, the resulting conntrack
> information is attached to each original fragment and the fragments then
> continue their way through the stack.
> 
> Helper invocation occurs in the POSTROUTING hook, at which point only
> the original fragments are available. The result of this is that
> fragmented packets are never passed to helpers.
> 
> This patch improves the situation in the following way:
> 
> - If a reassembled packet belongs to a connection that has a helper
>   assigned, the reassembled packet is passed through the stack instead
>   of the original fragments.
> 
> - During defragmentation, the largest received fragment size is stored.
>   On output, the packet is refragmented if required. If the largest
>   received fragment size exceeds the outgoing MTU, a "packet too big"
>   message is generated, thus behaving as if the original fragments
>   were passed through the stack from an outside point of view.
> 
> - The ipv6_helper() hook function can't receive fragments anymore for
>   connections using a helper, so it is switched to use ipv6_skip_exthdr()
>   instead of the netfilter specific nf_ct_ipv6_skip_exthdr() and the
>   reassembled packets are passed to connection tracking helpers.
> 
> The result of this is that we can properly track fragmented packets, but
> still generate ICMPv6 Packet too big messages if we would have before.
> 
> This patch is also required as a precondition for IPv6 NAT, where NAT
> helpers might enlarge packets up to a point that they require
> fragmentation. In that case we can't generate Packet too big messages
> since the proper MTU can't be calculated in all cases (f.i. when
> changing textual representation of a variable amount of addresses),
> so the packet is transparently fragmented iff the original packet or
> fragments would have fit the outgoing MTU.
> 
> Signed-off-by: Patrick McHardy <kaber@trash.net>
> ---
>  include/linux/ipv6.h                           |    1 +
>  net/ipv6/ip6_output.c                          |    7 +++-
>  net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c |   37 ++++++++++++++++++------
>  net/ipv6/netfilter/nf_conntrack_reasm.c        |   19 ++++++++++--
>  4 files changed, 50 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
> index 879db26..0b94e91 100644
> --- a/include/linux/ipv6.h
> +++ b/include/linux/ipv6.h
> @@ -256,6 +256,7 @@ struct inet6_skb_parm {
>  #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
>  	__u16			dsthao;
>  #endif
> +	__u16			frag_max_size;
>  
>  #define IP6SKB_XFRM_TRANSFORMED	1
>  #define IP6SKB_FORWARDED	2
> diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
> index 5b2d63e..a4f6263 100644
> --- a/net/ipv6/ip6_output.c
> +++ b/net/ipv6/ip6_output.c
> @@ -493,7 +493,8 @@ int ip6_forward(struct sk_buff *skb)
>  	if (mtu < IPV6_MIN_MTU)
>  		mtu = IPV6_MIN_MTU;
>  
> -	if (skb->len > mtu && !skb_is_gso(skb)) {
> +	if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
> +	    (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
>  		/* Again, force OUTPUT device used as source address */
>  		skb->dev = dst->dev;
>  		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
> @@ -636,7 +637,9 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
>  	/* We must not fragment if the socket is set to force MTU discovery
>  	 * or if the skb it not generated by a local socket.
>  	 */
> -	if (unlikely(!skb->local_df && skb->len > mtu)) {
> +	if (unlikely(!skb->local_df && skb->len > mtu) ||
> +		     (IP6CB(skb)->frag_max_size &&
> +		      IP6CB(skb)->frag_max_size > mtu)) {
>  		if (skb->sk && dst_allfrag(skb_dst(skb)))
>  			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
>  
> diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
> index 4794f96..560d823 100644
> --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
> +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
> @@ -153,10 +153,10 @@ static unsigned int ipv6_helper(unsigned int hooknum,
>  	const struct nf_conn_help *help;
>  	const struct nf_conntrack_helper *helper;
>  	enum ip_conntrack_info ctinfo;
> -	unsigned int ret, protoff;
> -	unsigned int extoff = (u8 *)(ipv6_hdr(skb) + 1) - skb->data;
> -	unsigned char pnum = ipv6_hdr(skb)->nexthdr;
> -
> +	unsigned int ret;
> +	__be16 frag_off;
> +	int protoff;
> +	u8 nexthdr;
>  
>  	/* This is where we call the helper: as the packet goes out. */
>  	ct = nf_ct_get(skb, &ctinfo);
> @@ -171,9 +171,10 @@ static unsigned int ipv6_helper(unsigned int hooknum,
>  	if (!helper)
>  		return NF_ACCEPT;
>  
> -	protoff = nf_ct_ipv6_skip_exthdr(skb, extoff, &pnum,
> -					 skb->len - extoff);
> -	if (protoff > skb->len || pnum == NEXTHDR_FRAGMENT) {
> +	nexthdr = ipv6_hdr(skb)->nexthdr;
> +	protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
> +				   &frag_off);
> +	if (protoff < 0 || (frag_off & ntohs(~0x7)) != 0) {
>  		pr_debug("proto header not found\n");
>  		return NF_ACCEPT;
>  	}
> @@ -199,9 +200,13 @@ static unsigned int ipv6_confirm(unsigned int hooknum,
>  static unsigned int __ipv6_conntrack_in(struct net *net,
>  					unsigned int hooknum,
>  					struct sk_buff *skb,
> +					const struct net_device *in,
> +					const struct net_device *out,
>  					int (*okfn)(struct sk_buff *))
>  {
>  	struct sk_buff *reasm = skb->nfct_reasm;
> +	struct nf_conn *ct;
> +	enum ip_conntrack_info ctinfo;
>  
>  	/* This packet is fragmented and has reassembled packet. */
>  	if (reasm) {
> @@ -213,6 +218,20 @@ static unsigned int __ipv6_conntrack_in(struct net *net,
>  			if (ret != NF_ACCEPT)
>  				return ret;
>  		}
> +
> +		/* Conntrack helpers need the entire reassembled packet in the
> +		 * POST_ROUTING hook.
> +		 */
> +		ct = nf_ct_get(reasm, &ctinfo);
> +		if (ct != NULL && test_bit(IPS_HELPER_BIT, &ct->status)) {

Two things regarding the line above:

- I think this also needs to check for !nf_ct_is_untracked(ct)

- IPS_HELPER_BIT is only set if the CT target is used to attach
  helpers. I know, this behaviour may seem confusing, but I didn't
  find any better way to avoid that NAT removes the helper
  explicitly attached via CT.

  So basically now that status bit means: "this helper has been attached
  via CT".

  Setting it unconditionally in __nf_ct_try_assign_helper would break
  the magic auto-assign helper code.

  On the other hand, the automatic helper assignment is scheduled to
  be removed (well, it will still take at least 1.5 to 2 years
  before we do so). At that time, we'll be able to say that every
  conntrack with IPS_HELPER really has a helper. But for now I think
  you'll have to use nfct_help instead to check whether that ct has a
  helper.

> +			nf_conntrack_get_reasm(skb);
> +			NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, reasm,
> +				       (struct net_device *)in,
> +				       (struct net_device *)out,
> +				       okfn, NF_IP6_PRI_CONNTRACK + 1);
> +			return NF_DROP_ERR(-ECANCELED);
> +		}
> +
>  		nf_conntrack_get(reasm->nfct);
>  		skb->nfct = reasm->nfct;
>  		skb->nfctinfo = reasm->nfctinfo;
> @@ -228,7 +247,7 @@ static unsigned int ipv6_conntrack_in(unsigned int hooknum,
>  				      const struct net_device *out,
>  				      int (*okfn)(struct sk_buff *))
>  {
> -	return __ipv6_conntrack_in(dev_net(in), hooknum, skb, okfn);
> +	return __ipv6_conntrack_in(dev_net(in), hooknum, skb, in, out, okfn);
>  }
>  
>  static unsigned int ipv6_conntrack_local(unsigned int hooknum,
> @@ -242,7 +261,7 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum,
>  		net_notice_ratelimited("ipv6_conntrack_local: packet too short\n");
>  		return NF_ACCEPT;
>  	}
> -	return __ipv6_conntrack_in(dev_net(out), hooknum, skb, okfn);
> +	return __ipv6_conntrack_in(dev_net(out), hooknum, skb, in, out, okfn);
>  }
>  
>  static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
> index c9c78c2..f94fb3a 100644
> --- a/net/ipv6/netfilter/nf_conntrack_reasm.c
> +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
> @@ -190,6 +190,7 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
>  			     const struct frag_hdr *fhdr, int nhoff)
>  {
>  	struct sk_buff *prev, *next;
> +	unsigned int payload_len;
>  	int offset, end;
>  
>  	if (fq->q.last_in & INET_FRAG_COMPLETE) {
> @@ -197,8 +198,10 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
>  		goto err;
>  	}
>  
> +	payload_len = ntohs(ipv6_hdr(skb)->payload_len);
> +
>  	offset = ntohs(fhdr->frag_off) & ~0x7;
> -	end = offset + (ntohs(ipv6_hdr(skb)->payload_len) -
> +	end = offset + (payload_len -
>  			((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
>  
>  	if ((unsigned int)end > IPV6_MAXPLEN) {
> @@ -307,6 +310,8 @@ found:
>  	skb->dev = NULL;
>  	fq->q.stamp = skb->tstamp;
>  	fq->q.meat += skb->len;
> +	if (payload_len > fq->q.max_size)
> +		fq->q.max_size = payload_len;
>  	atomic_add(skb->truesize, &nf_init_frags.mem);
>  
>  	/* The first fragment.
> @@ -412,10 +417,12 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
>  	}
>  	atomic_sub(head->truesize, &nf_init_frags.mem);
>  
> +	head->local_df = 1;
>  	head->next = NULL;
>  	head->dev = dev;
>  	head->tstamp = fq->q.stamp;
>  	ipv6_hdr(head)->payload_len = htons(payload_len);
> +	IP6CB(head)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
>  
>  	/* Yes, and fold redundant checksum back. 8) */
>  	if (head->ip_summed == CHECKSUM_COMPLETE)
> @@ -592,6 +599,7 @@ void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
>  			int (*okfn)(struct sk_buff *))
>  {
>  	struct sk_buff *s, *s2;
> +	unsigned int ret = 0;
>  
>  	for (s = NFCT_FRAG6_CB(skb)->orig; s;) {
>  		nf_conntrack_put_reasm(s->nfct_reasm);
> @@ -601,8 +609,13 @@ void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
>  		s2 = s->next;
>  		s->next = NULL;
>  
> -		NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, s, in, out, okfn,
> -			       NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
> +		if (ret != -ECANCELED)
> +			ret = NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, s,
> +					     in, out, okfn,
> +					     NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
> +		else
> +			kfree_skb(s);
> +
>  		s = s2;
>  	}
>  	nf_conntrack_put_reasm(skb);
> -- 
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Patrick McHardy Aug. 18, 2012, 12:26 p.m. UTC | #3
On Fri, 17 Aug 2012, Jesper Dangaard Brouer wrote:

> On Thu, 2012-08-09 at 22:08 +0200, kaber@trash.net wrote:
>> From: Patrick McHardy <kaber@trash.net>
>>
>> The IPv6 conntrack fragmentation currently has a couple of shortcomings.
>> Fragmentes are collected in PREROUTING/OUTPUT, are defragmented, the
>> defragmented packet is then passed to conntrack, the resulting conntrack
>> information is attached to each original fragment and the fragments then
>> continue their way through the stack.
>>
>> Helper invocation occurs in the POSTROUTING hook, at which point only
>> the original fragments are available. The result of this is that
>> fragmented packets are never passed to helpers.
>>
>> This patch improves the situation in the following way:
>>
>> - If a reassembled packet belongs to a connection that has a helper
>>   assigned, the reassembled packet is passed through the stack instead
>>   of the original fragments.
>
> I'm working on IPv6 fragment handling for IPVS, and are taking advantage
> of the "replay" by nf_ct_frag6_output() at hook prio -399
> (NF_IP6_PRI_CONNTRACK_DEFRAG + 1).
> By making a hook at NF_INET_PRE_ROUTING at prio -99 (NF_IP6_PRI_NAT_DST
> + 1).
>
> I can see that the code path can be changed (with this patch), if a
> helper is assigned.  Then the "replay" starts at prio -199
> (NF_IP6_PRI_CONNTRACK + 1), I guess I'm safe as I run at -99.
>
> I have tested that your patchset works, with my ipvs patches, but would
> like the trigger the changed code path, to make sure.
>
> Could you provide an iptables command/rule, that trigger this code path?

The easiest way is a large ping with the NAT patches also applied,
in that case we also pass the first packet of a connection through
the stack reassembled.

>> @@ -199,9 +200,13 @@ static unsigned int ipv6_confirm(unsigned int hooknum,
>>  static unsigned int __ipv6_conntrack_in(struct net *net,
>>  					unsigned int hooknum,
>>  					struct sk_buff *skb,
>> +					const struct net_device *in,
>> +					const struct net_device *out,
>>  					int (*okfn)(struct sk_buff *))
>>  {
>>  	struct sk_buff *reasm = skb->nfct_reasm;
>> +	struct nf_conn *ct;
>> +	enum ip_conntrack_info ctinfo;
>>
>>  	/* This packet is fragmented and has reassembled packet. */
>>  	if (reasm) {
>> @@ -213,6 +218,20 @@ static unsigned int __ipv6_conntrack_in(struct net *net,
>>  			if (ret != NF_ACCEPT)
>>  				return ret;
>>  		}
>> +
>> +		/* Conntrack helpers need the entire reassembled packet in the
>> +		 * POST_ROUTING hook.
>> +		 */
>> +		ct = nf_ct_get(reasm, &ctinfo);
>> +		if (ct != NULL && test_bit(IPS_HELPER_BIT, &ct->status)) {
>> +			nf_conntrack_get_reasm(skb);
>> +			NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, reasm,
>> +				       (struct net_device *)in,
>> +				       (struct net_device *)out,
>> +				       okfn, NF_IP6_PRI_CONNTRACK + 1);
>
> Hook prio change to NF_IP6_PRI_CONNTRACK + 1

I didn't get this part, you want to change to PRE_CONNTRACK + 1? What
about raw and SELinux?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jesper Dangaard Brouer Aug. 19, 2012, 7:37 p.m. UTC | #4
On Sat, 2012-08-18 at 14:26 +0200, Patrick McHardy wrote:
> On Fri, 17 Aug 2012, Jesper Dangaard Brouer wrote:
> 
> > On Thu, 2012-08-09 at 22:08 +0200, kaber@trash.net wrote:
> >> From: Patrick McHardy <kaber@trash.net>
> >>
> >> The IPv6 conntrack fragmentation currently has a couple of shortcomings.
> >> Fragmentes are collected in PREROUTING/OUTPUT, are defragmented, the
> >> defragmented packet is then passed to conntrack, the resulting conntrack
> >> information is attached to each original fragment and the fragments then
> >> continue their way through the stack.
> >>
> >> Helper invocation occurs in the POSTROUTING hook, at which point only
> >> the original fragments are available. The result of this is that
> >> fragmented packets are never passed to helpers.
> >>
> >> This patch improves the situation in the following way:
> >>
> >> - If a reassembled packet belongs to a connection that has a helper
> >>   assigned, the reassembled packet is passed through the stack instead
> >>   of the original fragments.
> >
> > I'm working on IPv6 fragment handling for IPVS, and are taking advantage
> > of the "replay" by nf_ct_frag6_output() at hook prio -399
> > (NF_IP6_PRI_CONNTRACK_DEFRAG + 1).
> > By making a hook at NF_INET_PRE_ROUTING at prio -99 (NF_IP6_PRI_NAT_DST
> > + 1).
> >
> > I can see that the code path can be changed (with this patch), if a
> > helper is assigned.  Then the "replay" starts at prio -199
> > (NF_IP6_PRI_CONNTRACK + 1), I guess I'm safe as I run at -99.
> >
> > I have tested that your patchset works, with my ipvs patches, but would
> > like the trigger the changed code path, to make sure.
> >
> > Could you provide an iptables command/rule, that trigger this code path?
> 
> The easiest way is a large ping with the NAT patches also applied,
> in that case we also pass the first packet of a connection through
> the stack reassembled.

So, a fragmented IPv6 ICMPv6 packet, I assume?

Don't I need to load some of the helper modules, or just the
nf_conntrack_ipv6 module, or perhaps only nf_defrag_ipv6 ?


> >> @@ -199,9 +200,13 @@ static unsigned int ipv6_confirm(unsigned int hooknum,
> >>  static unsigned int __ipv6_conntrack_in(struct net *net,
> >>  					unsigned int hooknum,
> >>  					struct sk_buff *skb,
> >> +					const struct net_device *in,
> >> +					const struct net_device *out,
> >>  					int (*okfn)(struct sk_buff *))
> >>  {
> >>  	struct sk_buff *reasm = skb->nfct_reasm;
> >> +	struct nf_conn *ct;
> >> +	enum ip_conntrack_info ctinfo;
> >>
> >>  	/* This packet is fragmented and has reassembled packet. */
> >>  	if (reasm) {
> >> @@ -213,6 +218,20 @@ static unsigned int __ipv6_conntrack_in(struct net *net,
> >>  			if (ret != NF_ACCEPT)
> >>  				return ret;
> >>  		}
> >> +
> >> +		/* Conntrack helpers need the entire reassembled packet in the
> >> +		 * POST_ROUTING hook.
> >> +		 */
> >> +		ct = nf_ct_get(reasm, &ctinfo);
> >> +		if (ct != NULL && test_bit(IPS_HELPER_BIT, &ct->status)) {
> >> +			nf_conntrack_get_reasm(skb);
> >> +			NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, reasm,
> >> +				       (struct net_device *)in,
> >> +				       (struct net_device *)out,
> >> +				       okfn, NF_IP6_PRI_CONNTRACK + 1);
> >
> > Hook prio change to NF_IP6_PRI_CONNTRACK + 1
> 
> I didn't get this part, you want to change to PRE_CONNTRACK + 1? What
> about raw and SELinux?

No - I don't want any changes.

I was just pointing out *where* the changes occur in your patch. This is
just a "service" to other email readers, so they can more quickly spot
the changes I was referring to.
Patrick McHardy Aug. 19, 2012, 7:44 p.m. UTC | #5
On Sun, 19 Aug 2012, Jesper Dangaard Brouer wrote:

> On Sat, 2012-08-18 at 14:26 +0200, Patrick McHardy wrote:
>>>
>>> Could you provide an iptables command/rule, that trigger this code path?
>>
>> The easiest way is a large ping with the NAT patches also applied,
>> in that case we also pass the first packet of a connection through
>> the stack reassembled.
>
> So, a fragmented IPv6 ICMPv6 packet, I assume?

Correct.

> Don't I need to load some of the helper modules, or just the
> nf_conntrack_ipv6 module, or perhaps only nf_defrag_ipv6 ?

Not with the entire patchset, just IPv6 conntrack is enough. With IPv6 NAT
the first packet of a connection must always be defragmented, independent
of an assigned helper.

>>>> @@ -199,9 +200,13 @@ static unsigned int ipv6_confirm(unsigned int hooknum,
>>>>  static unsigned int __ipv6_conntrack_in(struct net *net,
>>>>  					unsigned int hooknum,
>>>>  					struct sk_buff *skb,
>>>> +					const struct net_device *in,
>>>> +					const struct net_device *out,
>>>>  					int (*okfn)(struct sk_buff *))
>>>>  {
>>>>  	struct sk_buff *reasm = skb->nfct_reasm;
>>>> +	struct nf_conn *ct;
>>>> +	enum ip_conntrack_info ctinfo;
>>>>
>>>>  	/* This packet is fragmented and has reassembled packet. */
>>>>  	if (reasm) {
>>>> @@ -213,6 +218,20 @@ static unsigned int __ipv6_conntrack_in(struct net *net,
>>>>  			if (ret != NF_ACCEPT)
>>>>  				return ret;
>>>>  		}
>>>> +
>>>> +		/* Conntrack helpers need the entire reassembled packet in the
>>>> +		 * POST_ROUTING hook.
>>>> +		 */
>>>> +		ct = nf_ct_get(reasm, &ctinfo);
>>>> +		if (ct != NULL && test_bit(IPS_HELPER_BIT, &ct->status)) {
>>>> +			nf_conntrack_get_reasm(skb);
>>>> +			NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, reasm,
>>>> +				       (struct net_device *)in,
>>>> +				       (struct net_device *)out,
>>>> +				       okfn, NF_IP6_PRI_CONNTRACK + 1);
>>>
>>> Hook prio change to NF_IP6_PRI_CONNTRACK + 1
>>
>> I didn't get this part, you want to change to PRE_CONNTRACK + 1? What
>> about raw and SELinux?
>
> No - I don't want any changes.
>
> I was just pointing out *where* the changes occur in your patch. This is
> just a "service" to other email readers, so they can spot the changes
> faster, I were referring to.

Could you send me your patch so I get a better picture of what you're 
doing exactly?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jesper Dangaard Brouer Aug. 20, 2012, 1:13 p.m. UTC | #6
On Sun, 2012-08-19 at 21:44 +0200, Patrick McHardy wrote:

> Could you send me your patch so I get a better picture of what you're 
> doing exactly?

Okay, just posted the patchset.

Specifically look at patch:
 [PATCH 3/3] ipvs: Complete IPv6 fragment handling for IPVS

Where I use the hook to copy the fw mark from the reasm SKB packet to
the SKB fragments. (Perhaps this could be done elsewhere in the
netfilter framework.)

--Jesper Brouer


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jesper Dangaard Brouer Aug. 21, 2012, 10:21 p.m. UTC | #7
On Sun, 2012-08-19 at 21:44 +0200, Patrick McHardy wrote:
> On Sun, 19 Aug 2012, Jesper Dangaard Brouer wrote:
> > On Sat, 2012-08-18 at 14:26 +0200, Patrick McHardy wrote:
[...]

> > Don't I need to load some of the helper modules, or just the
> > nf_conntrack_ipv6 module, or perhaps only nf_defrag_ipv6 ?
> 
> Not with the entire patchset, just IPv6 conntrack is enough. Aith IPv6 NAT
> the first packet of a connection must always be defragemented, independant
> of an assigned helper.

When loading "nf_conntrack_ipv6" I run into issues.

When sending a fragmented UDP packet with these patches applied, the IPVS
stack no longer sees the fragmented packets, but instead sees one
large SKB.  This triggers an MTU path check in e.g.
ip_vs_dr_xmit_v6() and an ICMPv6 "packet too big" message is sent back.

  IPVS: ip_vs_dr_xmit_v6(): frag needed

Perhaps we could change/fix the MTU check in IPVS?
(This would also solve issues I've seen with TSO/GSO frames, hitting
this code path).
Patrick McHardy Aug. 22, 2012, 10:21 p.m. UTC | #8
On Mon, 20 Aug 2012, Jesper Dangaard Brouer wrote:

> On Sun, 2012-08-19 at 21:44 +0200, Patrick McHardy wrote:
>
>> Could you send me your patch so I get a better picture of what you're
>> doing exactly?
>
> Okay, just posted the patchset.
>
> Specifically look at patch:
> [PATCH 3/3] ipvs: Complete IPv6 fragment handling for IPVS
>
> Where I use the hook to copy the fw mark from the reasm SKB packet to
> the SKB fragments. (Perhaps, this could be done else were in the
> netfilter framework).

Thanks, I'll have a look at this tomorrow.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Patrick McHardy Aug. 26, 2012, 9:20 p.m. UTC | #9
On Wed, 22 Aug 2012, Jesper Dangaard Brouer wrote:

> On Sun, 2012-08-19 at 21:44 +0200, Patrick McHardy wrote:
>> On Sun, 19 Aug 2012, Jesper Dangaard Brouer wrote:
>>> On Sat, 2012-08-18 at 14:26 +0200, Patrick McHardy wrote:
> [...]
>
>>> Don't I need to load some of the helper modules, or just the
>>> nf_conntrack_ipv6 module, or perhaps only nf_defrag_ipv6 ?
>>
>> Not with the entire patchset, just IPv6 conntrack is enough. Aith IPv6 NAT
>> the first packet of a connection must always be defragemented, independant
>> of an assigned helper.
>
> When loading "nf_conntrack_ipv6" I run into issues.
>
> When sending a fragmented UDP packet.  With these patches, the IPVS
> stack will no longer see the fragmented packets, but instead see one
> large SKB.  This will trigger a MTU path check in e.g.
> ip_vs_dr_xmit_v6() and an ICMPv6 too big packet is send back.
>
>  IPVS: ip_vs_dr_xmit_v6(): frag needed
>
> Perhaps we could change/fix the MTU check in IPVS?
> (This would also solve issues I've seen with TSO/GSO frames, hitting
> this code path).

I guess this should use the same check as in IPv6 output: check
whether IP6CB(skb)->frag_max_size is != 0 and > MTU and only send
an ICMPv6 error in that case.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jesper Dangaard Brouer Aug. 27, 2012, 10:13 a.m. UTC | #10
On Sun, 2012-08-26 at 23:20 +0200, Patrick McHardy wrote:
> On Wed, 22 Aug 2012, Jesper Dangaard Brouer wrote:
> 
> > On Sun, 2012-08-19 at 21:44 +0200, Patrick McHardy wrote:
> >> On Sun, 19 Aug 2012, Jesper Dangaard Brouer wrote:
> >>> On Sat, 2012-08-18 at 14:26 +0200, Patrick McHardy wrote:
> > [...]
> >
> >>> Don't I need to load some of the helper modules, or just the
> >>> nf_conntrack_ipv6 module, or perhaps only nf_defrag_ipv6 ?
> >>
> >> Not with the entire patchset, just IPv6 conntrack is enough. Aith IPv6 NAT
> >> the first packet of a connection must always be defragemented, independant
> >> of an assigned helper.
> >
> > When loading "nf_conntrack_ipv6" I run into issues.
> >
> > When sending a fragmented UDP packet.  With these patches, the IPVS
> > stack will no longer see the fragmented packets, but instead see one
> > large SKB.  This will trigger a MTU path check in e.g.
> > ip_vs_dr_xmit_v6() and an ICMPv6 too big packet is send back.
> >
> >  IPVS: ip_vs_dr_xmit_v6(): frag needed
> >
> > Perhaps we could change/fix the MTU check in IPVS?
> > (This would also solve issues I've seen with TSO/GSO frames, hitting
> > this code path).
> 
> I guess this should use the same check as in IPv6 output, check
> whether IP6CB(skb)->max_frag_size is != 0 and > MTU and only send
> an ICMPv6 error in that case.

Hans has (already) proposed this solution:

  if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
      (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {

And I have tested that it works.
But I'm not sure whether we really need the "!skb->local_df" check.


Thus, we should extend your patchset with a patch that also addresses the
MTU checks in IPVS.
Patrick McHardy Aug. 27, 2012, 10:41 a.m. UTC | #11
On Mon, 27 Aug 2012, Jesper Dangaard Brouer wrote:

>>> When loading "nf_conntrack_ipv6" I run into issues.
>>>
>>> When sending a fragmented UDP packet.  With these patches, the IPVS
>>> stack will no longer see the fragmented packets, but instead see one
>>> large SKB.  This will trigger a MTU path check in e.g.
>>> ip_vs_dr_xmit_v6() and an ICMPv6 too big packet is send back.
>>>
>>>  IPVS: ip_vs_dr_xmit_v6(): frag needed
>>>
>>> Perhaps we could change/fix the MTU check in IPVS?
>>> (This would also solve issues I've seen with TSO/GSO frames, hitting
>>> this code path).
>>
>> I guess this should use the same check as in IPv6 output, check
>> whether IP6CB(skb)->max_frag_size is != 0 and > MTU and only send
>> an ICMPv6 error in that case.
>
> Hans have (already) proposed this solution:
>
>  if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
>      (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
>
> And I have tested it works.
> But I'm not sure about, if we really need the "!skb->local_df" check ?

Not necessarily, alternatively you could use:

if ((!IP6CB(skb)->frag_max_size &&
     (skb->len > mtu && !skb_is_gso(skb))) ||
    IP6CB(skb)->frag_max_size > mtu)

We just need to make sure that if frag_max_size is set that skb->len
is ignored. Either way is fine, but I'd suggest to keep it similar to
the variant used in IPv6.

> Thus, we should extend you patchset with a patch, that also address the
> MTU checks in IPVS.

Can you send over a patch for inclusion in the patchset? Besides this, it's
ready for submission.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jesper Dangaard Brouer Aug. 27, 2012, 2:40 p.m. UTC | #12
The following patchset makes IPVS compatible with Patrick McHardy's
IPv6 NAT patchset: http://thread.gmane.org/gmane.linux.network/239615

Specifically the part that improves defragmentation.

 Patch01: Cleanup (and fixes a bug) in IPVS MTU IPv6 handling

 Patch02: Extend MTU check to account for IPv6 NAT defrag changes

This patchset is based upon:
 Horms' ipvs-next tree:
  git://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next.git

BUT with Patrick McHardy's IPv6 NAT patchset applied first, on top of
commit: 3654e61137db891f5312e6dd813b961484b5fdf3
title: "ipvs: add pmtu_disc option to disable IP DF for TUN packets"

Sent at Patrick's request:
 http://thread.gmane.org/gmane.linux.network/239615/focus=43870

---

Jesper Dangaard Brouer (2):
      ipvs: Extend MTU check to account for IPv6 NAT defrag changes
      ipvs: IPv6 MTU checking cleanup and bugfix


 net/netfilter/ipvs/ip_vs_xmit.c |   32 ++++++++++++++++++++++++++------
 1 files changed, 26 insertions(+), 6 deletions(-)


--
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 879db26..0b94e91 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -256,6 +256,7 @@  struct inet6_skb_parm {
 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
 	__u16			dsthao;
 #endif
+	__u16			frag_max_size;
 
 #define IP6SKB_XFRM_TRANSFORMED	1
 #define IP6SKB_FORWARDED	2
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 5b2d63e..a4f6263 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -493,7 +493,8 @@  int ip6_forward(struct sk_buff *skb)
 	if (mtu < IPV6_MIN_MTU)
 		mtu = IPV6_MIN_MTU;
 
-	if (skb->len > mtu && !skb_is_gso(skb)) {
+	if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
+	    (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
 		/* Again, force OUTPUT device used as source address */
 		skb->dev = dst->dev;
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
@@ -636,7 +637,9 @@  int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	/* We must not fragment if the socket is set to force MTU discovery
 	 * or if the skb it not generated by a local socket.
 	 */
-	if (unlikely(!skb->local_df && skb->len > mtu)) {
+	if (unlikely(!skb->local_df && skb->len > mtu) ||
+		     (IP6CB(skb)->frag_max_size &&
+		      IP6CB(skb)->frag_max_size > mtu)) {
 		if (skb->sk && dst_allfrag(skb_dst(skb)))
 			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
 
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
index 4794f96..560d823 100644
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
@@ -153,10 +153,10 @@  static unsigned int ipv6_helper(unsigned int hooknum,
 	const struct nf_conn_help *help;
 	const struct nf_conntrack_helper *helper;
 	enum ip_conntrack_info ctinfo;
-	unsigned int ret, protoff;
-	unsigned int extoff = (u8 *)(ipv6_hdr(skb) + 1) - skb->data;
-	unsigned char pnum = ipv6_hdr(skb)->nexthdr;
-
+	unsigned int ret;
+	__be16 frag_off;
+	int protoff;
+	u8 nexthdr;
 
 	/* This is where we call the helper: as the packet goes out. */
 	ct = nf_ct_get(skb, &ctinfo);
@@ -171,9 +171,10 @@  static unsigned int ipv6_helper(unsigned int hooknum,
 	if (!helper)
 		return NF_ACCEPT;
 
-	protoff = nf_ct_ipv6_skip_exthdr(skb, extoff, &pnum,
-					 skb->len - extoff);
-	if (protoff > skb->len || pnum == NEXTHDR_FRAGMENT) {
+	nexthdr = ipv6_hdr(skb)->nexthdr;
+	protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+				   &frag_off);
+	if (protoff < 0 || (frag_off & ntohs(~0x7)) != 0) {
 		pr_debug("proto header not found\n");
 		return NF_ACCEPT;
 	}
@@ -199,9 +200,13 @@  static unsigned int ipv6_confirm(unsigned int hooknum,
 static unsigned int __ipv6_conntrack_in(struct net *net,
 					unsigned int hooknum,
 					struct sk_buff *skb,
+					const struct net_device *in,
+					const struct net_device *out,
 					int (*okfn)(struct sk_buff *))
 {
 	struct sk_buff *reasm = skb->nfct_reasm;
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
 
 	/* This packet is fragmented and has reassembled packet. */
 	if (reasm) {
@@ -213,6 +218,20 @@  static unsigned int __ipv6_conntrack_in(struct net *net,
 			if (ret != NF_ACCEPT)
 				return ret;
 		}
+
+		/* Conntrack helpers need the entire reassembled packet in the
+		 * POST_ROUTING hook.
+		 */
+		ct = nf_ct_get(reasm, &ctinfo);
+		if (ct != NULL && test_bit(IPS_HELPER_BIT, &ct->status)) {
+			nf_conntrack_get_reasm(skb);
+			NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, reasm,
+				       (struct net_device *)in,
+				       (struct net_device *)out,
+				       okfn, NF_IP6_PRI_CONNTRACK + 1);
+			return NF_DROP_ERR(-ECANCELED);
+		}
+
 		nf_conntrack_get(reasm->nfct);
 		skb->nfct = reasm->nfct;
 		skb->nfctinfo = reasm->nfctinfo;
@@ -228,7 +247,7 @@  static unsigned int ipv6_conntrack_in(unsigned int hooknum,
 				      const struct net_device *out,
 				      int (*okfn)(struct sk_buff *))
 {
-	return __ipv6_conntrack_in(dev_net(in), hooknum, skb, okfn);
+	return __ipv6_conntrack_in(dev_net(in), hooknum, skb, in, out, okfn);
 }
 
 static unsigned int ipv6_conntrack_local(unsigned int hooknum,
@@ -242,7 +261,7 @@  static unsigned int ipv6_conntrack_local(unsigned int hooknum,
 		net_notice_ratelimited("ipv6_conntrack_local: packet too short\n");
 		return NF_ACCEPT;
 	}
-	return __ipv6_conntrack_in(dev_net(out), hooknum, skb, okfn);
+	return __ipv6_conntrack_in(dev_net(out), hooknum, skb, in, out, okfn);
 }
 
 static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = {
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index c9c78c2..f94fb3a 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -190,6 +190,7 @@  static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 			     const struct frag_hdr *fhdr, int nhoff)
 {
 	struct sk_buff *prev, *next;
+	unsigned int payload_len;
 	int offset, end;
 
 	if (fq->q.last_in & INET_FRAG_COMPLETE) {
@@ -197,8 +198,10 @@  static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb,
 		goto err;
 	}
 
+	payload_len = ntohs(ipv6_hdr(skb)->payload_len);
+
 	offset = ntohs(fhdr->frag_off) & ~0x7;
-	end = offset + (ntohs(ipv6_hdr(skb)->payload_len) -
+	end = offset + (payload_len -
 			((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1)));
 
 	if ((unsigned int)end > IPV6_MAXPLEN) {
@@ -307,6 +310,8 @@  found:
 	skb->dev = NULL;
 	fq->q.stamp = skb->tstamp;
 	fq->q.meat += skb->len;
+	if (payload_len > fq->q.max_size)
+		fq->q.max_size = payload_len;
 	atomic_add(skb->truesize, &nf_init_frags.mem);
 
 	/* The first fragment.
@@ -412,10 +417,12 @@  nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
 	}
 	atomic_sub(head->truesize, &nf_init_frags.mem);
 
+	head->local_df = 1;
 	head->next = NULL;
 	head->dev = dev;
 	head->tstamp = fq->q.stamp;
 	ipv6_hdr(head)->payload_len = htons(payload_len);
+	IP6CB(head)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
 
 	/* Yes, and fold redundant checksum back. 8) */
 	if (head->ip_summed == CHECKSUM_COMPLETE)
@@ -592,6 +599,7 @@  void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
 			int (*okfn)(struct sk_buff *))
 {
 	struct sk_buff *s, *s2;
+	unsigned int ret = 0;
 
 	for (s = NFCT_FRAG6_CB(skb)->orig; s;) {
 		nf_conntrack_put_reasm(s->nfct_reasm);
@@ -601,8 +609,13 @@  void nf_ct_frag6_output(unsigned int hooknum, struct sk_buff *skb,
 		s2 = s->next;
 		s->next = NULL;
 
-		NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, s, in, out, okfn,
-			       NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
+		if (ret != -ECANCELED)
+			ret = NF_HOOK_THRESH(NFPROTO_IPV6, hooknum, s,
+					     in, out, okfn,
+					     NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
+		else
+			kfree_skb(s);
+
 		s = s2;
 	}
 	nf_conntrack_put_reasm(skb);