
[nf,1/3] netfilter: conntrack: fix race between nf_conntrack proc read and hash resize

Message ID: 1467457167-5363-2-git-send-email-zlpnobody@163.com
State: Changes Requested
Delegated to: Pablo Neira

Commit Message

Liping Zhang July 2, 2016, 10:59 a.m. UTC
From: Liping Zhang <liping.zhang@spreadtrum.com>

When we do "cat /proc/net/nf_conntrack" and meanwhile resize the conntrack
hash table via /sys/module/nf_conntrack/parameters/hashsize, a race can
happen: the reader may observe the newly allocated hash table together with
the old size (or vice versa). An oops like the following results:

  BUG: unable to handle kernel NULL pointer dereference at 0000000000000017
  IP: [<ffffffffa0418e21>] seq_print_acct+0x11/0x50 [nf_conntrack]
  Call Trace:
  [<ffffffffa0412f4e>] ? ct_seq_show+0x14e/0x340 [nf_conntrack]
  [<ffffffff81261a1c>] seq_read+0x2cc/0x390
  [<ffffffff812a8d62>] proc_reg_read+0x42/0x70
  [<ffffffff8123bee7>] __vfs_read+0x37/0x130
  [<ffffffff81347980>] ? security_file_permission+0xa0/0xc0
  [<ffffffff8123cf75>] vfs_read+0x95/0x140
  [<ffffffff8123e475>] SyS_read+0x55/0xc0
  [<ffffffff817c2572>] entry_SYSCALL_64_fastpath+0x1a/0xa4

It is very easy to reproduce this kernel crash.
1. Open one shell and run the following commands:
  while : ; do
    echo $RANDOM > hashsize
  done
2. Open more shells and run the following commands:
  while : ; do
    cat /proc/net/nf_conntrack
  done
3. Just wait a moment; the oops will happen soon.

The solution in this patch is based on Florian's commit 5e3c61f98175
("netfilter: conntrack: fix lookup race during hash resize").

Signed-off-by: Liping Zhang <liping.zhang@spreadtrum.com>
---
 include/net/netfilter/nf_conntrack_core.h            |  1 +
 .../netfilter/nf_conntrack_l3proto_ipv4_compat.c     | 20 ++++++++++++++++----
 net/netfilter/nf_conntrack_core.c                    |  4 +++-
 net/netfilter/nf_conntrack_standalone.c              | 20 +++++++++++++++-----
 4 files changed, 35 insertions(+), 10 deletions(-)

Comments

Florian Westphal July 2, 2016, 5:46 p.m. UTC | #1
Liping Zhang <zlpnobody@163.com> wrote:
> From: Liping Zhang <liping.zhang@spreadtrum.com>
> 
> When we do "cat /proc/net/nf_conntrack" and meanwhile resize the conntrack
> hash table via /sys/module/nf_conntrack/parameters/hashsize, a race can
> happen: the reader may observe the newly allocated hash table together with
> the old size (or vice versa). An oops like the following results:
> 
>   BUG: unable to handle kernel NULL pointer dereference at 0000000000000017
>   IP: [<ffffffffa0418e21>] seq_print_acct+0x11/0x50 [nf_conntrack]
>   Call Trace:
>   [<ffffffffa0412f4e>] ? ct_seq_show+0x14e/0x340 [nf_conntrack]
>   [<ffffffff81261a1c>] seq_read+0x2cc/0x390
>   [<ffffffff812a8d62>] proc_reg_read+0x42/0x70
>   [<ffffffff8123bee7>] __vfs_read+0x37/0x130
>   [<ffffffff81347980>] ? security_file_permission+0xa0/0xc0
>   [<ffffffff8123cf75>] vfs_read+0x95/0x140
>   [<ffffffff8123e475>] SyS_read+0x55/0xc0
>   [<ffffffff817c2572>] entry_SYSCALL_64_fastpath+0x1a/0xa4
> 
> It is very easy to reproduce this kernel crash.
> 1. Open one shell and run the following commands:
>   while : ; do
>     echo $RANDOM > hashsize
>   done
> 2. Open more shells and run the following commands:
>   while : ; do
>     cat /proc/net/nf_conntrack
>   done
> 3. Just wait a moment; the oops will happen soon.

Good catch, but ...

> diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
> index 3e2f332..4f6453a 100644
> --- a/include/net/netfilter/nf_conntrack_core.h
> +++ b/include/net/netfilter/nf_conntrack_core.h
> @@ -82,6 +82,7 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
>  #define CONNTRACK_LOCKS 1024
>  
>  extern struct hlist_nulls_head *nf_conntrack_hash;
> +extern seqcount_t nf_conntrack_generation;

instead of this, and the proliferation of this snippet in every reader:

> +	do {
> +		sequence = read_seqcount_begin(&nf_conntrack_generation);
> +		st->htable_size = nf_conntrack_htable_size;
> +		st->hash = nf_conntrack_hash;
> +	} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
> +
>  	return ct_get_idx(seq, *pos);
>  }

I think it might be better to do something like

/* must be called with rcu read lock held */
unsigned int nf_conntrack_get_ht(struct hlist_nulls_head **h,
				 unsigned int *buckets)
{
	struct hlist_nulls_head *ptr;
	unsigned int s, size;

	do {
		s = read_seqcount_begin(&nf_conntrack_generation);
		size = nf_conntrack_htable_size;
		ptr = nf_conntrack_hash;
	} while (read_seqcount_retry(&nf_conntrack_generation, s));

	*h = ptr;
	*buckets = size;

	return s;
}
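
With that, every proc reader's ->start() collapses to a single call
(assuming the double-pointer signature above), something like:

	static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
		__acquires(RCU)
	{
		struct ct_iter_state *st = seq->private;

		rcu_read_lock();
		/* consistent snapshot of hash pointer and table size */
		nf_conntrack_get_ht(&st->hash, &st->htable_size);
		return ct_get_idx(seq, *pos);
	}
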
Liping Zhang July 3, 2016, 2:22 a.m. UTC | #2
>Good catch, but ...
>
>> diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
>> index 3e2f332..4f6453a 100644
>> --- a/include/net/netfilter/nf_conntrack_core.h
>> +++ b/include/net/netfilter/nf_conntrack_core.h
>> @@ -82,6 +82,7 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
>>  #define CONNTRACK_LOCKS 1024
>>  
>>  extern struct hlist_nulls_head *nf_conntrack_hash;
>> +extern seqcount_t nf_conntrack_generation;
>
>instead of this and the proliferation of this:
>
>> +	do {
>> +		sequence = read_seqcount_begin(&nf_conntrack_generation);
>> +		st->htable_size = nf_conntrack_htable_size;
>> +		st->hash = nf_conntrack_hash;
>> +	} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
>> +
>>  	return ct_get_idx(seq, *pos);
>>  }
>
>I think it might be better to do something like
>
>/* must be called with rcu read lock held */
>unsigned int nf_conntrack_get_ht(struct hlist_nulls_head **h,
>				 unsigned int *buckets)
>{
>	struct hlist_nulls_head *ptr;
>	unsigned int s, size;
>
>	do {
>		s = read_seqcount_begin(&nf_conntrack_generation);
>		size = nf_conntrack_htable_size;
>		ptr = nf_conntrack_hash;
>	} while (read_seqcount_retry(&nf_conntrack_generation, s));
>
>	*h = ptr;
>	*buckets = size;
>
>	return s;
>}

Agree.

I also find that there's no need to use nf_conntrack_generation in my patches #2 and #3.
Will send a V2 later.

Thanks

Patch

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 3e2f332..4f6453a 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -82,6 +82,7 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
 #define CONNTRACK_LOCKS 1024
 
 extern struct hlist_nulls_head *nf_conntrack_hash;
+extern seqcount_t nf_conntrack_generation;
 extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
 void nf_conntrack_lock(spinlock_t *lock);
 
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index c6f3c40..584899f 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -26,6 +26,8 @@ 
 
 struct ct_iter_state {
 	struct seq_net_private p;
+	struct hlist_nulls_head *hash;
+	unsigned int htable_size;
 	unsigned int bucket;
 };
 
@@ -35,10 +37,10 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
 	struct hlist_nulls_node *n;
 
 	for (st->bucket = 0;
-	     st->bucket < nf_conntrack_htable_size;
+	     st->bucket < st->htable_size;
 	     st->bucket++) {
 		n = rcu_dereference(
-			hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
+			hlist_nulls_first_rcu(&st->hash[st->bucket]));
 		if (!is_a_nulls(n))
 			return n;
 	}
@@ -53,11 +55,11 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
 	head = rcu_dereference(hlist_nulls_next_rcu(head));
 	while (is_a_nulls(head)) {
 		if (likely(get_nulls_value(head) == st->bucket)) {
-			if (++st->bucket >= nf_conntrack_htable_size)
+			if (++st->bucket >= st->htable_size)
 				return NULL;
 		}
 		head = rcu_dereference(
-			hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
+			hlist_nulls_first_rcu(&st->hash[st->bucket]));
 	}
 	return head;
 }
@@ -75,7 +77,17 @@ static struct hlist_nulls_node *ct_get_idx(struct seq_file *seq, loff_t pos)
 static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(RCU)
 {
+	struct ct_iter_state *st = seq->private;
+	unsigned int sequence;
+
 	rcu_read_lock();
+
+	do {
+		sequence = read_seqcount_begin(&nf_conntrack_generation);
+		st->htable_size = nf_conntrack_htable_size;
+		st->hash = nf_conntrack_hash;
+	} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
 	return ct_get_idx(seq, *pos);
 }
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index f204274..1c39697 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -72,9 +72,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
 struct hlist_nulls_head *nf_conntrack_hash __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_hash);
 
+seqcount_t nf_conntrack_generation __read_mostly;
+EXPORT_SYMBOL_GPL(nf_conntrack_generation);
+
 static __read_mostly struct kmem_cache *nf_conntrack_cachep;
 static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
-static __read_mostly seqcount_t nf_conntrack_generation;
 static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock);
 static __read_mostly bool nf_conntrack_locks_all;
 
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index c026c47..2cf484b 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -48,6 +48,8 @@ EXPORT_SYMBOL_GPL(print_tuple);
 
 struct ct_iter_state {
 	struct seq_net_private p;
+	struct hlist_nulls_head *hash;
+	unsigned int htable_size;
 	unsigned int bucket;
 	u_int64_t time_now;
 };
@@ -58,9 +60,10 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
 	struct hlist_nulls_node *n;
 
 	for (st->bucket = 0;
-	     st->bucket < nf_conntrack_htable_size;
+	     st->bucket < st->htable_size;
 	     st->bucket++) {
-		n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
+		n = rcu_dereference(
+			hlist_nulls_first_rcu(&st->hash[st->bucket]));
 		if (!is_a_nulls(n))
 			return n;
 	}
@@ -75,12 +78,11 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
 	head = rcu_dereference(hlist_nulls_next_rcu(head));
 	while (is_a_nulls(head)) {
 		if (likely(get_nulls_value(head) == st->bucket)) {
-			if (++st->bucket >= nf_conntrack_htable_size)
+			if (++st->bucket >= st->htable_size)
 				return NULL;
 		}
 		head = rcu_dereference(
-				hlist_nulls_first_rcu(
-					&nf_conntrack_hash[st->bucket]));
+			hlist_nulls_first_rcu(&st->hash[st->bucket]));
 	}
 	return head;
 }
@@ -99,9 +101,17 @@ static void *ct_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(RCU)
 {
 	struct ct_iter_state *st = seq->private;
+	unsigned int sequence;
 
 	st->time_now = ktime_get_real_ns();
 	rcu_read_lock();
+
+	do {
+		sequence = read_seqcount_begin(&nf_conntrack_generation);
+		st->htable_size = nf_conntrack_htable_size;
+		st->hash = nf_conntrack_hash;
+	} while (read_seqcount_retry(&nf_conntrack_generation, sequence));
+
 	return ct_get_idx(seq, *pos);
 }