diff mbox

[V2] netfilter: ctnetlink: force null nat binding on insert

Message ID 1392549343-7145-1-git-send-email-fw@strlen.de
State Accepted
Headers show

Commit Message

Florian Westphal Feb. 16, 2014, 11:15 a.m. UTC
Quoting Andrey Vagin:
  When a conntrack is created  by kernel, it is initialized (sets
  IPS_{DST,SRC}_NAT_DONE_BIT bits in nf_nat_setup_info) and only then it
  is added in hashes (__nf_conntrack_hash_insert), so one conntract
  can't be initialized from a few threads concurrently.

  ctnetlink can add an uninitialized conntrack (w/o
  IPS_{DST,SRC}_NAT_DONE_BIT) in hashes, then a few threads can look up
  this conntrack and start initialize it concurrently. It's dangerous,
  because BUG can be triggered from nf_nat_setup_info.

Fix this race by always setting up nat, even if no CTA_NAT_ attribute
was requested before inserting the ct into the hash table.

In absence of CTA_NAT_ attribute, a null binding is created.

This alters current behaviour:
Before this patch, the first packet matching the newly injected
conntrack would be run through the nat table since nf_nat_initialized()
returns false.  IOW, this forces ctnetlink users to specify the desired
nat transformation on ct creation time.

Reported-By: Andrey Vagin <avagin@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 Change since v1: fix OOPS when L3 conntrack module is not loaded.

 Caused by nf_nat_setup_info -> get_unique_tuple() which will
 call __nf_nat_l4proto_find() but thats only possible when l3 module
 is loaded, else null-ptr deref.

 For the non-ctnetlink case the l3 module must be there, else we cannot
 end up in this function.  Thus I did not want to add tests there
 and instead added a pre-check when adding null binding via ctnetlink.

 Not very nice, perhaps someone has better idea
 [ look in patch for comment in nfnetlink_attach_null_binding() ]

 net/netfilter/nf_conntrack_netlink.c | 37 +++++++++++---------------
 net/netfilter/nf_nat_core.c          | 51 +++++++++++++++++++++++++++++++-----
 2 files changed, 61 insertions(+), 27 deletions(-)

Comments

Pablo Neira Ayuso Feb. 17, 2014, 10:37 a.m. UTC | #1
Hi Florian,

On Sun, Feb 16, 2014 at 12:15:43PM +0100, Florian Westphal wrote:
> Quoting Andrey Vagin:
>   When a conntrack is created  by kernel, it is initialized (sets
>   IPS_{DST,SRC}_NAT_DONE_BIT bits in nf_nat_setup_info) and only then it
>   is added in hashes (__nf_conntrack_hash_insert), so one conntract
>   can't be initialized from a few threads concurrently.
> 
>   ctnetlink can add an uninitialized conntrack (w/o
>   IPS_{DST,SRC}_NAT_DONE_BIT) in hashes, then a few threads can look up
>   this conntrack and start initialize it concurrently. It's dangerous,
>   because BUG can be triggered from nf_nat_setup_info.
> 
> Fix this race by always setting up nat, even if no CTA_NAT_ attribute
> was requested before inserting the ct into the hash table.
> 
> In absence of CTA_NAT_ attribute, a null binding is created.
> 
> This alters current behaviour:
> Before this patch, the first packet matching the newly injected
> conntrack would be run through the nat table since nf_nat_initialized()
> returns false.  IOW, this forces ctnetlink users to specify the desired
> nat transformation on ct creation time.
> 
> Reported-By: Andrey Vagin <avagin@gmail.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  Change since v1: fix OOPS when L3 conntrack module is not loaded.
> 
>  Caused by nf_nat_setup_info -> get_unique_tuple() which will
>  call __nf_nat_l4proto_find() but thats only possible when l3 module
>  is loaded, else null-ptr deref.
> 
>  For the non-ctnetlink case the l3 module must be there, else we cannot
>  end up in this function.  Thus I did not want to add tests there
>  and instead added a pre-check when adding null binding via ctnetlink.
> 
>  Not very nice, perhaps someone has better idea
>  [ look in patch for comment in nfnetlink_attach_null_binding() ]
> 
>  net/netfilter/nf_conntrack_netlink.c | 37 +++++++++++---------------
>  net/netfilter/nf_nat_core.c          | 51 +++++++++++++++++++++++++++++++-----
>  2 files changed, 61 insertions(+), 27 deletions(-)
> 
> diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
> index bb322d0..40299a6 100644
> --- a/net/netfilter/nf_conntrack_netlink.c
> +++ b/net/netfilter/nf_conntrack_netlink.c
> @@ -1310,27 +1310,24 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
>  }
>  
>  static int
> -ctnetlink_change_nat(struct nf_conn *ct, const struct nlattr * const cda[])
> +ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])
>  {
>  #ifdef CONFIG_NF_NAT_NEEDED
>  	int ret;
>  
> -	if (cda[CTA_NAT_DST]) {
> -		ret = ctnetlink_parse_nat_setup(ct,
> -						NF_NAT_MANIP_DST,
> -						cda[CTA_NAT_DST]);
> -		if (ret < 0)
> -			return ret;
> -	}
> -	if (cda[CTA_NAT_SRC]) {
> -		ret = ctnetlink_parse_nat_setup(ct,
> -						NF_NAT_MANIP_SRC,
> -						cda[CTA_NAT_SRC]);
> -		if (ret < 0)
> -			return ret;
> -	}
> -	return 0;
> +	ret = ctnetlink_parse_nat_setup(ct,
> +					NF_NAT_MANIP_DST,
> +					cda[CTA_NAT_DST]);
> +	if (ret < 0)
> +		return ret;
> +
> +	ret = ctnetlink_parse_nat_setup(ct,
> +					NF_NAT_MANIP_SRC,
> +					cda[CTA_NAT_SRC]);
> +	return ret;
>  #else
> +	if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC])
> +		return 0;
>  	return -EOPNOTSUPP;
>  #endif
>  }
> @@ -1659,11 +1656,9 @@ ctnetlink_create_conntrack(struct net *net, u16 zone,
>  			goto err2;
>  	}
>  
> -	if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) {
> -		err = ctnetlink_change_nat(ct, cda);
> -		if (err < 0)
> -			goto err2;
> -	}
> +	err = ctnetlink_setup_nat(ct, cda);
> +	if (err < 0)
> +		goto err2;
>  
>  	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
>  	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
> diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
> index d3f5cd6..c8ba395 100644
> --- a/net/netfilter/nf_nat_core.c
> +++ b/net/netfilter/nf_nat_core.c
> @@ -432,15 +432,15 @@ nf_nat_setup_info(struct nf_conn *ct,
>  }
>  EXPORT_SYMBOL(nf_nat_setup_info);
>  
> -unsigned int
> -nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
> +static unsigned int
> +__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
>  {
>  	/* Force range to this IP; let proto decide mapping for
>  	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
>  	 * Use reply in case it's already been mangled (eg local packet).
>  	 */
>  	union nf_inet_addr ip =
> -		(HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
> +		(manip == NF_NAT_MANIP_SRC ?
>  		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
>  		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
>  	struct nf_nat_range range = {
> @@ -448,7 +448,13 @@ nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
>  		.min_addr	= ip,
>  		.max_addr	= ip,
>  	};
> -	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
> +	return nf_nat_setup_info(ct, &range, manip);
> +}
> +
> +unsigned int
> +nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
> +{
> +	return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
>  }
>  EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
>  
> @@ -734,6 +740,33 @@ out:
>  }
>  
>  static int
> +nfnetlink_attach_null_binding(struct nf_conn *ct,
> +			      enum nf_nat_manip_type manip)
> +{
> +	int ret = -EAGAIN;
> +	bool can_alloc;
> +
> +	/* This looks bogus, but its important.
> +	 *
> +	 * We cannot be sure that L3 NAT is available.
> +	 *
> +	 * If it is not, we will oops in nf_nat_setup_info when we try
> +	 * to fetch the l4 nat protocol (which would be on top of the l3 one).
> +	 *
> +	 * Normally nf_nat_setup_info cannot be called without L3 nat
> +	 * available, but this function is invoked from ctnetlink.
> +	 */
> +	rcu_read_lock();
> +
> +	can_alloc = !!__nf_nat_l3proto_find(nf_ct_l3num(ct));
> +	if (can_alloc)
> +		ret = __nf_nat_alloc_null_binding(ct, manip);
> +
> +	rcu_read_unlock();
> +	return ret;

I think we should always do the module autoloading for nf-nat and
nf-nat-ipvX modules depending on nf_ct_l3num(ct), then return EAGAIN
to give another retry. Now, this needs to happen in any case, not only
when calling ctnetlink_parse_nat_setup().
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal Feb. 17, 2014, 10:45 a.m. UTC | #2
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> >  static int
> > +nfnetlink_attach_null_binding(struct nf_conn *ct,
> > +			      enum nf_nat_manip_type manip)
> > +{
> > +	int ret = -EAGAIN;
> > +	bool can_alloc;
> > +
> > +	/* This looks bogus, but its important.
> > +	 *
> > +	 * We cannot be sure that L3 NAT is available.
> > +	 *
> > +	 * If it is not, we will oops in nf_nat_setup_info when we try
> > +	 * to fetch the l4 nat protocol (which would be on top of the l3 one).
> > +	 *
> > +	 * Normally nf_nat_setup_info cannot be called without L3 nat
> > +	 * available, but this function is invoked from ctnetlink.
> > +	 */
> > +	rcu_read_lock();
> > +
> > +	can_alloc = !!__nf_nat_l3proto_find(nf_ct_l3num(ct));
> > +	if (can_alloc)
> > +		ret = __nf_nat_alloc_null_binding(ct, manip);
> > +
> > +	rcu_read_unlock();
> > +	return ret;
> 
> I think we should always do the module autoloading for nf-nat and
> nf-nat-ipvX modules depending on nf_ct_l3num(ct), then return EAGAIN
> to give another retry. Now, this needs to happen in any case, not only
> when calling ctnetlink_parse_nat_setup().

Not sure what you mean.  If we enter nf_nat_setup_info without ctnetlink
involvement the nf-nat protocol should already be there (else, how can
we end up in nf_nat_setup_info? NAT/MASQUERADE depends on nf-nat).

What use-case did you have in mind? (or, to put it differently, where
do you think the module probing logic should be)?

Thanks!
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Pablo Neira Ayuso Feb. 17, 2014, 10:58 a.m. UTC | #3
On Mon, Feb 17, 2014 at 11:45:11AM +0100, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > >  static int
> > > +nfnetlink_attach_null_binding(struct nf_conn *ct,
> > > +			      enum nf_nat_manip_type manip)
> > > +{
> > > +	int ret = -EAGAIN;
> > > +	bool can_alloc;
> > > +
> > > +	/* This looks bogus, but its important.
> > > +	 *
> > > +	 * We cannot be sure that L3 NAT is available.
> > > +	 *
> > > +	 * If it is not, we will oops in nf_nat_setup_info when we try
> > > +	 * to fetch the l4 nat protocol (which would be on top of the l3 one).
> > > +	 *
> > > +	 * Normally nf_nat_setup_info cannot be called without L3 nat
> > > +	 * available, but this function is invoked from ctnetlink.
> > > +	 */
> > > +	rcu_read_lock();
> > > +
> > > +	can_alloc = !!__nf_nat_l3proto_find(nf_ct_l3num(ct));
> > > +	if (can_alloc)
> > > +		ret = __nf_nat_alloc_null_binding(ct, manip);
> > > +
> > > +	rcu_read_unlock();
> > > +	return ret;
> >
> > I think we should always do the module autoloading for nf-nat and
> > nf-nat-ipvX modules depending on nf_ct_l3num(ct), then return EAGAIN
> > to give another retry. Now, this needs to happen in any case, not only
> > when calling ctnetlink_parse_nat_setup().
>
> Not sure what you mean.  If we enter nf_nat_setup_info without ctnetlink
> involvement the nf-nat protocol should already be there (else, how can
> we end up in nf_nat_setup_info? NAT/MASQUERADE depends on nf-nat).
>
> What use-case did you have in mind? (or, to put it differently, where
> do you think the module probing logic should be)?

If __nf_nat_l3proto_find returns NULL before trying to attach the null
binding, I think you should call the routine to autoload the modules
before returning EAGAIN.

        proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
        if (proto == NULL) {
                ... release locks
                request_module(...);
                ... acquire locks again
                return -EAGAIN;
        }
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Florian Westphal Feb. 17, 2014, 11:15 a.m. UTC | #4
Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > I think we should always do the module autoloading for nf-nat and
> > > nf-nat-ipvX modules depending on nf_ct_l3num(ct), then return EAGAIN
> > > to give another retry. Now, this needs to happen in any case, not only
> > > when calling ctnetlink_parse_nat_setup().
> >
> > Not sure what you mean.  If we enter nf_nat_setup_info without ctnetlink
> > involvement the nf-nat protocol should already be there (else, how can
> > we end up in nf_nat_setup_info? NAT/MASQUERADE depends on nf-nat).
> >
> > What use-case did you have in mind? (or, to put it differently, where
> > do you think the module probing logic should be)?
> 
> If __nf_nat_l3proto_find returns NULL before trying to attach the null
> binding, I think you should call the routine to autoload the modules
> before returning EAGAIN.
> 
>         proto = __nf_nat_l3proto_find(nf_ct_l3num(ct));
>         if (proto == NULL) {
>                 ... release locks
>                 request_module(...);
>                 ... acquire locks again
>                 return -EAGAIN;
>         }

The patch alters ctnetlink to call ctnetlink_parse_nat_setup even when
NAT attr == NULL.

nfnetlink_attach_null_binding() returns EAGAIN; this return value is
propagated back to ctnetlink_parse_nat_setup.

That will then request_module(), nfnetlink will replay the message.

running
conntrack -I -p tcp -s 1.1.1.1 -d 2.2.2.2 --timeout 100 --state \
	ESTABLISHED --sport 10 --dport 20
on newly booted machine works, lsmod pre/post
shows:
+nf_conntrack_ipv4      14808  1 
+nf_defrag_ipv4         12702  1 nf_conntrack_ipv4
+nf_nat_ipv4            13199  0 
+nf_nat                 20926  1 nf_nat_ipv4

Which is the desired behaviour afaiu.

[ If you think calling ctnetlink_parse_nat_setup with NULL attr
  is abuse, please let me know and I will try to come up with something
  different ]
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index bb322d0..40299a6 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -1310,27 +1310,24 @@  ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[])
 }
 
 static int
-ctnetlink_change_nat(struct nf_conn *ct, const struct nlattr * const cda[])
+ctnetlink_setup_nat(struct nf_conn *ct, const struct nlattr * const cda[])
 {
 #ifdef CONFIG_NF_NAT_NEEDED
 	int ret;
 
-	if (cda[CTA_NAT_DST]) {
-		ret = ctnetlink_parse_nat_setup(ct,
-						NF_NAT_MANIP_DST,
-						cda[CTA_NAT_DST]);
-		if (ret < 0)
-			return ret;
-	}
-	if (cda[CTA_NAT_SRC]) {
-		ret = ctnetlink_parse_nat_setup(ct,
-						NF_NAT_MANIP_SRC,
-						cda[CTA_NAT_SRC]);
-		if (ret < 0)
-			return ret;
-	}
-	return 0;
+	ret = ctnetlink_parse_nat_setup(ct,
+					NF_NAT_MANIP_DST,
+					cda[CTA_NAT_DST]);
+	if (ret < 0)
+		return ret;
+
+	ret = ctnetlink_parse_nat_setup(ct,
+					NF_NAT_MANIP_SRC,
+					cda[CTA_NAT_SRC]);
+	return ret;
 #else
+	if (!cda[CTA_NAT_DST] && !cda[CTA_NAT_SRC])
+		return 0;
 	return -EOPNOTSUPP;
 #endif
 }
@@ -1659,11 +1656,9 @@  ctnetlink_create_conntrack(struct net *net, u16 zone,
 			goto err2;
 	}
 
-	if (cda[CTA_NAT_SRC] || cda[CTA_NAT_DST]) {
-		err = ctnetlink_change_nat(ct, cda);
-		if (err < 0)
-			goto err2;
-	}
+	err = ctnetlink_setup_nat(ct, cda);
+	if (err < 0)
+		goto err2;
 
 	nf_ct_acct_ext_add(ct, GFP_ATOMIC);
 	nf_ct_tstamp_ext_add(ct, GFP_ATOMIC);
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index d3f5cd6..c8ba395 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -432,15 +432,15 @@  nf_nat_setup_info(struct nf_conn *ct,
 }
 EXPORT_SYMBOL(nf_nat_setup_info);
 
-unsigned int
-nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
+static unsigned int
+__nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip)
 {
 	/* Force range to this IP; let proto decide mapping for
 	 * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
 	 * Use reply in case it's already been mangled (eg local packet).
 	 */
 	union nf_inet_addr ip =
-		(HOOK2MANIP(hooknum) == NF_NAT_MANIP_SRC ?
+		(manip == NF_NAT_MANIP_SRC ?
 		ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 :
 		ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3);
 	struct nf_nat_range range = {
@@ -448,7 +448,13 @@  nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
 		.min_addr	= ip,
 		.max_addr	= ip,
 	};
-	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
+	return nf_nat_setup_info(ct, &range, manip);
+}
+
+unsigned int
+nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
+{
+	return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum));
 }
 EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding);
 
@@ -734,6 +740,33 @@  out:
 }
 
 static int
+nfnetlink_attach_null_binding(struct nf_conn *ct,
+			      enum nf_nat_manip_type manip)
+{
+	int ret = -EAGAIN;
+	bool can_alloc;
+
+	/* This looks bogus, but its important.
+	 *
+	 * We cannot be sure that L3 NAT is available.
+	 *
+	 * If it is not, we will oops in nf_nat_setup_info when we try
+	 * to fetch the l4 nat protocol (which would be on top of the l3 one).
+	 *
+	 * Normally nf_nat_setup_info cannot be called without L3 nat
+	 * available, but this function is invoked from ctnetlink.
+	 */
+	rcu_read_lock();
+
+	can_alloc = !!__nf_nat_l3proto_find(nf_ct_l3num(ct));
+	if (can_alloc)
+		ret = __nf_nat_alloc_null_binding(ct, manip);
+
+	rcu_read_unlock();
+	return ret;
+}
+
+static int
 nfnetlink_parse_nat_setup(struct nf_conn *ct,
 			  enum nf_nat_manip_type manip,
 			  const struct nlattr *attr)
@@ -741,11 +774,17 @@  nfnetlink_parse_nat_setup(struct nf_conn *ct,
 	struct nf_nat_range range;
 	int err;
 
+	/* should not happen, restricted to creating new conntracks
+	 * via ctnetlink */
+	if (WARN_ON_ONCE(nf_nat_initialized(ct, manip)))
+		return -EEXIST;
+
+	if (attr == NULL)
+		return nfnetlink_attach_null_binding(ct, manip);
+
 	err = nfnetlink_parse_nat(attr, ct, &range);
 	if (err < 0)
 		return err;
-	if (nf_nat_initialized(ct, manip))
-		return -EEXIST;
 
 	return nf_nat_setup_info(ct, &range, manip);
 }