
[RFT,3/3] iptables: lock free counters

Message ID 20090204001755.808036408@vyatta.com
State RFC, archived
Delegated to: David Miller

Commit Message

stephen hemminger Feb. 4, 2009, 12:12 a.m. UTC
Make netfilter tables that use x_tables (iptables, ip6tables, arptables)
operate without locking on the receive path.
Replace the existing reader/writer lock with Read-Copy-Update to
eliminate the overhead of taking a read lock on each incoming packet.
This should reduce the overhead of iptables, especially on SMP
systems.

The previous code used a reader-writer lock for two purposes.
The first was to ensure that the xt_table_info reference was not in
the process of being changed. Since xt_table_info is only freed via one
routine, it was a direct conversion to RCU.

The other use of the reader-writer lock was to block changes
to counters while they were being read. This is handled by swapping in
a new set of counter values and then summing the old ones. The sum
is then added back on a single CPU.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 include/linux/netfilter/x_tables.h |   13 ++++
 net/ipv4/netfilter/arp_tables.c    |   92 ++++++++++++++++++++++++-----------
 net/ipv4/netfilter/ip_tables.c     |   97 ++++++++++++++++++++++++-------------
 net/ipv6/netfilter/ip6_tables.c    |   97 +++++++++++++++++++++++--------------
 net/netfilter/x_tables.c           |   67 +++++++++++++++++++++----
 5 files changed, 258 insertions(+), 108 deletions(-)
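
The counter snapshot described above boils down to the following sequence, a
condensed sketch of the new alloc_counters() in the patch below (error
handling omitted):

	counters = vmalloc_node(countersize, numa_node_id());

	tmp = xt_alloc_table_info(private->size);
	xt_zero_table_entries(tmp);		/* fresh zeroed per-cpu copies */

	mutex_lock(&table->lock);
	xt_swap_table_entries(private, tmp);	/* readers switch to the new copies */
	synchronize_net();			/* wait for old readers to drain */

	get_counters(tmp, counters);		/* sum the old, now quiescent, copies */
	put_counters(private, counters);	/* add the sums back onto one cpu */
	mutex_unlock(&table->lock);

	xt_free_table_info(tmp);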

Comments

Eric Dumazet Feb. 4, 2009, 3:10 a.m. UTC | #1
Stephen Hemminger wrote:
> Make netfilter tables that use x_tables (iptables, ip6tables, arptables)
> operate without locking on the receive path.
> Replace the existing reader/writer lock with Read-Copy-Update to
> eliminate the overhead of taking a read lock on each incoming packet.
> This should reduce the overhead of iptables, especially on SMP
> systems.
> 
> The previous code used a reader-writer lock for two purposes.
> The first was to ensure that the xt_table_info reference was not in
> the process of being changed. Since xt_table_info is only freed via one
> routine, it was a direct conversion to RCU.
> 
> The other use of the reader-writer lock was to block changes
> to counters while they were being read. This is handled by swapping in
> a new set of counter values and then summing the old ones. The sum
> is then added back on a single CPU.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> ---
>  include/linux/netfilter/x_tables.h |   13 ++++
>  net/ipv4/netfilter/arp_tables.c    |   92 ++++++++++++++++++++++++-----------
>  net/ipv4/netfilter/ip_tables.c     |   97 ++++++++++++++++++++++++-------------
>  net/ipv6/netfilter/ip6_tables.c    |   97 +++++++++++++++++++++++--------------
>  net/netfilter/x_tables.c           |   67 +++++++++++++++++++++----
>  5 files changed, 258 insertions(+), 108 deletions(-)
> 
> --- a/include/linux/netfilter/x_tables.h	2009-02-02 15:06:39.893751845 -0800
> +++ b/include/linux/netfilter/x_tables.h	2009-02-03 15:44:21.743663216 -0800
> @@ -353,7 +353,7 @@ struct xt_table
>  	unsigned int valid_hooks;
>  
>  	/* Lock for the curtain */
> -	rwlock_t lock;
> +	struct mutex lock;
>  
>  	/* Man behind the curtain... */
>  	struct xt_table_info *private;
> @@ -383,9 +383,15 @@ struct xt_table_info
>  	unsigned int hook_entry[NF_INET_NUMHOOKS];
>  	unsigned int underflow[NF_INET_NUMHOOKS];
>  
> +	/* For the dustman... */
> +	union {
> +		struct rcu_head rcu;
> +		struct work_struct work;
> +	};
> +
>  	/* ipt_entry tables: one per CPU */
>  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> -	char *entries[1];
> +	void *entries[1];
>  };
>  
>  #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> @@ -432,6 +438,9 @@ extern void xt_proto_fini(struct net *ne
>  
>  extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
>  extern void xt_free_table_info(struct xt_table_info *info);
> +extern void xt_zero_table_entries(struct xt_table_info *info);
> +extern void xt_swap_table_entries(struct xt_table_info *old,
> +				  struct xt_table_info *new);
>  
>  #ifdef CONFIG_COMPAT
>  #include <net/compat.h>
> --- a/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:06:29.684249364 -0800
> +++ b/net/ipv4/netfilter/ip_tables.c	2009-02-03 15:52:32.047583686 -0800
> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
>  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
>  	tgpar.hooknum = hook;
>  
> -	read_lock_bh(&table->lock);
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> -	private = table->private;
> -	table_base = (void *)private->entries[smp_processor_id()];
> +
> +	rcu_read_lock_bh();
> +	private = rcu_dereference(table->private);
> +	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  
>  	/* For return from builtin chain */
> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
>  		}
>  	} while (!hotdrop);
>  
> -	read_unlock_bh(&table->lock);
> +	rcu_read_unlock_bh();
>  
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -924,13 +926,45 @@ get_counters(const struct xt_table_info 
>  				  counters,
>  				  &i);
>  	}
> +
> +}
> +
> +/* We're lazy, and add to the first CPU; overflow works its fey magic
> + * and everything is OK. */
> +static int
> +add_counter_to_entry(struct ipt_entry *e,
> +		     const struct xt_counters addme[],
> +		     unsigned int *i)
> +{
> +	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
> +/* Take values from counters and add them back onto the current cpu */
> +static void put_counters(struct xt_table_info *t,
> +			 const struct xt_counters counters[])
> +{
> +	unsigned int i, cpu;
> +
> +	local_bh_disable();
> +	cpu = smp_processor_id();
> +	i = 0;
> +	IPT_ENTRY_ITERATE(t->entries[cpu],
> +			  t->size,
> +			  add_counter_to_entry,
> +			  counters,
> +			  &i);
> +	local_bh_enable();
>  }
>  
>  static struct xt_counters * alloc_counters(struct xt_table *table)
>  {
>  	unsigned int countersize;
>  	struct xt_counters *counters;
> -	const struct xt_table_info *private = table->private;
> +	struct xt_table_info *private = table->private;
> +	struct xt_table_info *tmp;
>  
>  	/* We need atomic snapshot of counters: rest doesn't change
>  	   (other than comefrom, which userspace doesn't care
> @@ -939,14 +973,30 @@ static struct xt_counters * alloc_counte
>  	counters = vmalloc_node(countersize, numa_node_id());
>  
>  	if (counters == NULL)
> -		return ERR_PTR(-ENOMEM);
> +		goto nomem;
> +
> +	tmp = xt_alloc_table_info(private->size);
> +	if (!tmp)
> +		goto free_counters;
> +

> +	xt_zero_table_entries(tmp);
This is not correct. We must copy rules and zero counters on the copied stuff.

> +
> +	mutex_lock(&table->lock);
> +	xt_swap_table_entries(private, tmp);
> +	synchronize_net();	/* Wait until smoke has cleared */
>  
> -	/* First, sum counters... */
> -	write_lock_bh(&table->lock);
> -	get_counters(private, counters);
> -	write_unlock_bh(&table->lock);
> +	get_counters(tmp, counters);

Yes, tmp now holds the previous pointers

> +	put_counters(private, counters);
> +	mutex_unlock(&table->lock);
> +
> +	xt_free_table_info(tmp);
>  
>  	return counters;
> +
> + free_counters:
> +	vfree(counters);
> + nomem:
> +	return ERR_PTR(-ENOMEM);
>  }
>  
>  static int
> @@ -1312,27 +1362,6 @@ do_replace(struct net *net, void __user 
>  	return ret;
>  }
>  
> -/* We're lazy, and add to the first CPU; overflow works its fey magic
> - * and everything is OK. */
> -static int
> -add_counter_to_entry(struct ipt_entry *e,
> -		     const struct xt_counters addme[],
> -		     unsigned int *i)
> -{
> -#if 0
> -	duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
> -		 *i,
> -		 (long unsigned int)e->counters.pcnt,
> -		 (long unsigned int)e->counters.bcnt,
> -		 (long unsigned int)addme[*i].pcnt,
> -		 (long unsigned int)addme[*i].bcnt);
> -#endif
> -
> -	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> -
> -	(*i)++;
> -	return 0;
> -}
>  
>  static int
>  do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
> @@ -1393,13 +1422,14 @@ do_add_counters(struct net *net, void __
>  		goto free;
>  	}
>  
> -	write_lock_bh(&t->lock);
> +	mutex_lock(&t->lock);
>  	private = t->private;
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
>  	}
>  
> +	preempt_disable();
>  	i = 0;
>  	/* Choose the copy that is on our node */
>  	loc_cpu_entry = private->entries[raw_smp_processor_id()];
> @@ -1408,8 +1438,9 @@ do_add_counters(struct net *net, void __
>  			  add_counter_to_entry,
>  			  paddc,
>  			  &i);
> +	preempt_enable();
>   unlock_up_free:
> -	write_unlock_bh(&t->lock);
> +	mutex_unlock(&t->lock);
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> --- a/net/netfilter/x_tables.c	2009-02-02 15:06:29.708249745 -0800
> +++ b/net/netfilter/x_tables.c	2009-02-03 15:44:21.743663216 -0800
> @@ -611,18 +611,61 @@ struct xt_table_info *xt_alloc_table_inf
>  }
>  EXPORT_SYMBOL(xt_alloc_table_info);
>  
> -void xt_free_table_info(struct xt_table_info *info)
> +/* Zero out entries */
> +void xt_zero_table_entries(struct xt_table_info *info)
>  {
> -	int cpu;
> +	unsigned int cpu;
> +
> +	for_each_possible_cpu(cpu)
> +		memset(info->entries[cpu], 0, info->size);
> +}
> +EXPORT_SYMBOL_GPL(xt_zero_table_entries);

Hmm, you forgot that entries[cpu] points to quite a complex data set: the
iptables rules themselves, which contain the counters...

Only the counters must be zeroed, one by one.

You thus need an ITERATE invocation...

I won't be able to make the incremental patch (too busy at work at the moment),
I am sorry :(
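
A rough sketch of what Eric describes: copy each per-cpu rule blob first, then
walk the copy with IPT_ENTRY_ITERATE and clear only the counters. The helper
names (zero_entry_counter, clone_counters) are illustrative, not part of the
posted patch:

static int zero_entry_counter(struct ipt_entry *e, void *arg)
{
	e->counters.bcnt = 0;
	e->counters.pcnt = 0;
	return 0;
}

static void clone_counters(struct xt_table_info *newinfo,
			   const struct xt_table_info *info)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu) {
		/* copy the rules, then zero only the counters in the copy */
		memcpy(newinfo->entries[cpu], info->entries[cpu], info->size);
		IPT_ENTRY_ITERATE(newinfo->entries[cpu], info->size,
				  zero_entry_counter, NULL);
	}
}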

Patrick McHardy Feb. 9, 2009, 3:52 p.m. UTC | #2
Eric Dumazet wrote:
> Stephen Hemminger wrote:
>> @@ -939,14 +973,30 @@ static struct xt_counters * alloc_counte
>>  	counters = vmalloc_node(countersize, numa_node_id());
>>  
>>  	if (counters == NULL)
>> -		return ERR_PTR(-ENOMEM);
>> +		goto nomem;
>> +
>> +	tmp = xt_alloc_table_info(private->size);
>> +	if (!tmp)
>> +		goto free_counters;
>> +
> 
>> +	xt_zero_table_entries(tmp);
> This is not correct. We must copy rules and zero counters on the copied stuff.

Indeed.

>>  static int
>>  do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
>> @@ -1393,13 +1422,14 @@ do_add_counters(struct net *net, void __
>>  		goto free;
>>  	}
>>  
>> -	write_lock_bh(&t->lock);
>> +	mutex_lock(&t->lock);
>>  	private = t->private;
>>  	if (private->number != num_counters) {
>>  		ret = -EINVAL;
>>  		goto unlock_up_free;
>>  	}
>>  
>> +	preempt_disable();
>>  	i = 0;
>>  	/* Choose the copy that is on our node */

This isn't actually necessary; it's merely an optimization. Since this
can take quite a while, it might be nicer not to disable preemption.
Patrick McHardy Feb. 9, 2009, 3:58 p.m. UTC | #3
Stephen Hemminger wrote:

> --- a/net/ipv4/netfilter/ip_tables.c	2009-01-29 11:09:20.720069979 -0800
> +++ b/net/ipv4/netfilter/ip_tables.c	2009-01-29 11:10:49.827070289 -0800
> @@ -347,9 +347,9 @@ ipt_do_table(struct sk_buff *skb,
>  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
>  	tgpar.hooknum = hook;
>  
> -	read_lock_bh(&table->lock);
> +	rcu_read_lock_bh();
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> -	private = table->private;
> +	private = rcu_dereference(table->private);
>  	table_base = (void *)private->entries[smp_processor_id()];
>  	e = get_entry(table_base, private->hook_entry[hook]);

I think this doesn't actually need the _bh rcu_read_lock() variant
since updates are never done in softirq context.

stephen hemminger Feb. 9, 2009, 5:14 p.m. UTC | #4
On Mon, 09 Feb 2009 16:52:59 +0100
Patrick McHardy <kaber@trash.net> wrote:

> Eric Dumazet wrote:
> > Stephen Hemminger wrote:
> >> @@ -939,14 +973,30 @@ static struct xt_counters * alloc_counte
> >>  	counters = vmalloc_node(countersize, numa_node_id());
> >>  
> >>  	if (counters == NULL)
> >> -		return ERR_PTR(-ENOMEM);
> >> +		goto nomem;
> >> +
> >> +	tmp = xt_alloc_table_info(private->size);
> >> +	if (!tmp)
> >> +		goto free_counters;
> >> +
> > 
> >> +	xt_zero_table_entries(tmp);
> > This is not correct. We must copy rules and zero counters on the copied stuff.
> 
> Indeed.
It is fixed in the next version.

> >>  static int
> >>  do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
> >> @@ -1393,13 +1422,14 @@ do_add_counters(struct net *net, void __
> >>  		goto free;
> >>  	}
> >>  
> >> -	write_lock_bh(&t->lock);
> >> +	mutex_lock(&t->lock);
> >>  	private = t->private;
> >>  	if (private->number != num_counters) {
> >>  		ret = -EINVAL;
> >>  		goto unlock_up_free;
> >>  	}
> >>  
> >> +	preempt_disable();
> >>  	i = 0;
> >>  	/* Choose the copy that is on our node */
> 
> This isn't actually necessary; it's merely an optimization. Since this
> can take quite a while, it might be nicer not to disable preemption.

We need to stay on the same CPU to avoid a race where preemption lets two
CPUs update the same counter entry (and the 64-bit counter update is not atomic).
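
For reference, ADD_COUNTER() in include/linux/netfilter/x_tables.h is roughly
the following plain read-modify-write on two 64-bit fields, so an update that
migrates or interleaves with another CPU on the same entry can be lost:

/* roughly what the kernel's ADD_COUNTER(c, b, p) expands to */
#define ADD_COUNTER(c, b, p) do {	\
	(c).bcnt += (b);		\
	(c).pcnt += (p);		\
} while (0)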

Patch

--- a/include/linux/netfilter/x_tables.h	2009-02-02 15:06:39.893751845 -0800
+++ b/include/linux/netfilter/x_tables.h	2009-02-03 15:44:21.743663216 -0800
@@ -353,7 +353,7 @@  struct xt_table
 	unsigned int valid_hooks;
 
 	/* Lock for the curtain */
-	rwlock_t lock;
+	struct mutex lock;
 
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
@@ -383,9 +383,15 @@  struct xt_table_info
 	unsigned int hook_entry[NF_INET_NUMHOOKS];
 	unsigned int underflow[NF_INET_NUMHOOKS];
 
+	/* For the dustman... */
+	union {
+		struct rcu_head rcu;
+		struct work_struct work;
+	};
+
 	/* ipt_entry tables: one per CPU */
 	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
-	char *entries[1];
+	void *entries[1];
 };
 
 #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
@@ -432,6 +438,9 @@  extern void xt_proto_fini(struct net *ne
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
+extern void xt_zero_table_entries(struct xt_table_info *info);
+extern void xt_swap_table_entries(struct xt_table_info *old,
+				  struct xt_table_info *new);
 
 #ifdef CONFIG_COMPAT
 #include <net/compat.h>
--- a/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:06:29.684249364 -0800
+++ b/net/ipv4/netfilter/ip_tables.c	2009-02-03 15:52:32.047583686 -0800
@@ -347,10 +347,12 @@  ipt_do_table(struct sk_buff *skb,
 	mtpar.family  = tgpar.family = NFPROTO_IPV4;
 	tgpar.hooknum = hook;
 
-	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+
+	rcu_read_lock_bh();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	/* For return from builtin chain */
@@ -445,7 +447,7 @@  ipt_do_table(struct sk_buff *skb,
 		}
 	} while (!hotdrop);
 
-	read_unlock_bh(&table->lock);
+	rcu_read_unlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -924,13 +926,45 @@  get_counters(const struct xt_table_info 
 				  counters,
 				  &i);
 	}
+
+}
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+			 const struct xt_counters counters[])
+{
+	unsigned int i, cpu;
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+	i = 0;
+	IPT_ENTRY_ITERATE(t->entries[cpu],
+			  t->size,
+			  add_counter_to_entry,
+			  counters,
+			  &i);
+	local_bh_enable();
 }
 
 static struct xt_counters * alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
+	struct xt_table_info *tmp;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -939,14 +973,30 @@  static struct xt_counters * alloc_counte
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		return ERR_PTR(-ENOMEM);
+		goto nomem;
+
+	tmp = xt_alloc_table_info(private->size);
+	if (!tmp)
+		goto free_counters;
+
+	xt_zero_table_entries(tmp);
+
+	mutex_lock(&table->lock);
+	xt_swap_table_entries(private, tmp);
+	synchronize_net();	/* Wait until smoke has cleared */
 
-	/* First, sum counters... */
-	write_lock_bh(&table->lock);
-	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	get_counters(tmp, counters);
+	put_counters(private, counters);
+	mutex_unlock(&table->lock);
+
+	xt_free_table_info(tmp);
 
 	return counters;
+
+ free_counters:
+	vfree(counters);
+ nomem:
+	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1312,27 +1362,6 @@  do_replace(struct net *net, void __user 
 	return ret;
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-#if 0
-	duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
-		 *i,
-		 (long unsigned int)e->counters.pcnt,
-		 (long unsigned int)e->counters.bcnt,
-		 (long unsigned int)addme[*i].pcnt,
-		 (long unsigned int)addme[*i].bcnt);
-#endif
-
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1393,13 +1422,14 @@  do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
+	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1408,8 +1438,9 @@  do_add_counters(struct net *net, void __
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
+	preempt_enable();
  unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/netfilter/x_tables.c	2009-02-02 15:06:29.708249745 -0800
+++ b/net/netfilter/x_tables.c	2009-02-03 15:44:21.743663216 -0800
@@ -611,18 +611,61 @@  struct xt_table_info *xt_alloc_table_inf
 }
 EXPORT_SYMBOL(xt_alloc_table_info);
 
-void xt_free_table_info(struct xt_table_info *info)
+/* Zero out entries */
+void xt_zero_table_entries(struct xt_table_info *info)
 {
-	int cpu;
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		memset(info->entries[cpu], 0, info->size);
+}
+EXPORT_SYMBOL_GPL(xt_zero_table_entries);
+
+/* Swap existing entries with new zeroed ones */
+void xt_swap_table_entries(struct xt_table_info *oldinfo,
+				  struct xt_table_info *newinfo)
+{
+	unsigned int cpu;
 
 	for_each_possible_cpu(cpu) {
-		if (info->size <= PAGE_SIZE)
-			kfree(info->entries[cpu]);
-		else
-			vfree(info->entries[cpu]);
+		void *p = oldinfo->entries[cpu];
+
+		rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
+		newinfo->entries[cpu] = p;
 	}
+}
+EXPORT_SYMBOL_GPL(xt_swap_table_entries);
+
+/* callback to do free for vmalloc'd case */
+static void xt_free_table_info_work(struct work_struct *arg)
+{
+	int cpu;
+	struct xt_table_info *info = container_of(arg, struct xt_table_info, work);
+
+	for_each_possible_cpu(cpu)
+		vfree(info->entries[cpu]);
 	kfree(info);
 }
+
+static void xt_free_table_info_rcu(struct rcu_head *arg)
+{
+	struct xt_table_info *info = container_of(arg, struct xt_table_info, rcu);
+	if (info->size <= PAGE_SIZE) {
+		unsigned int cpu;
+		for_each_possible_cpu(cpu)
+			kfree(info->entries[cpu]);
+		kfree(info);
+	} else {
+		/* can't safely call vfree in current context */
+		INIT_WORK(&info->work, xt_free_table_info_work);
+		schedule_work(&info->work);
+	}
+}
+
+void xt_free_table_info(struct xt_table_info *info)
+{
+	call_rcu(&info->rcu, xt_free_table_info_rcu);
+}
 EXPORT_SYMBOL(xt_free_table_info);
 
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
@@ -671,21 +714,22 @@  xt_replace_table(struct xt_table *table,
 	struct xt_table_info *oldinfo, *private;
 
 	/* Do the substitution. */
-	write_lock_bh(&table->lock);
+	mutex_lock(&table->lock);
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		write_unlock_bh(&table->lock);
+		mutex_unlock(&table->lock);
 		*error = -EAGAIN;
 		return NULL;
 	}
 	oldinfo = private;
-	table->private = newinfo;
+	rcu_assign_pointer(table->private, newinfo);
 	newinfo->initial_entries = oldinfo->initial_entries;
-	write_unlock_bh(&table->lock);
+	mutex_unlock(&table->lock);
 
+	synchronize_net();
 	return oldinfo;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -719,7 +763,8 @@  struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	rwlock_init(&table->lock);
+	mutex_init(&table->lock);
+
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
 
--- a/net/ipv4/netfilter/arp_tables.c	2009-02-02 15:06:29.696250564 -0800
+++ b/net/ipv4/netfilter/arp_tables.c	2009-02-03 15:50:16.592791631 -0800
@@ -237,9 +237,10 @@  unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	read_lock_bh(&table->lock);
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+	rcu_read_lock_bh();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
 
@@ -311,7 +312,8 @@  unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-	read_unlock_bh(&table->lock);
+
+	rcu_read_unlock_bh();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -714,11 +716,43 @@  static void get_counters(const struct xt
 	}
 }
 
-static inline struct xt_counters *alloc_counters(struct xt_table *table)
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+			 const struct xt_counters counters[])
+{
+	unsigned int i, cpu;
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+	i = 0;
+	ARPT_ENTRY_ITERATE(t->entries[cpu],
+			  t->size,
+			  add_counter_to_entry,
+			  counters,
+			  &i);
+	local_bh_enable();
+}
+
+static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
+	struct xt_table_info *tmp;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -728,14 +762,30 @@  static inline struct xt_counters *alloc_
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		return ERR_PTR(-ENOMEM);
+		goto nomem;
 
-	/* First, sum counters... */
-	write_lock_bh(&table->lock);
-	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	tmp = xt_alloc_table_info(private->size);
+	if (!tmp)
+		goto free_counters;
+
+	xt_zero_table_entries(tmp);
+
+	mutex_lock(&table->lock);
+	xt_swap_table_entries(private, tmp);
+	synchronize_net();	/* Wait until smoke has cleared */
+
+	get_counters(tmp, counters);
+	put_counters(private, counters);
+	mutex_unlock(&table->lock);
+
+	xt_free_table_info(tmp);
 
 	return counters;
+
+ free_counters:
+	vfree(counters);
+ nomem:
+	return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
@@ -1075,20 +1125,6 @@  static int do_replace(struct net *net, v
 	return ret;
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK.
- */
-static inline int add_counter_to_entry(struct arpt_entry *e,
-				       const struct xt_counters addme[],
-				       unsigned int *i)
-{
-
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
 			   int compat)
 {
@@ -1148,13 +1184,14 @@  static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
+	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[smp_processor_id()];
@@ -1164,7 +1201,8 @@  static int do_add_counters(struct net *n
 			   paddc,
 			   &i);
  unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
+
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/ipv6/netfilter/ip6_tables.c	2009-02-02 15:06:29.724250485 -0800
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-02-03 15:54:10.084493787 -0800
@@ -373,10 +373,12 @@  ip6t_do_table(struct sk_buff *skb,
 	mtpar.family  = tgpar.family = NFPROTO_IPV6;
 	tgpar.hooknum = hook;
 
-	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+
+	rcu_read_lock_bh();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	/* For return from builtin chain */
@@ -474,7 +476,7 @@  ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	read_unlock_bh(&table->lock);
+	rcu_read_unlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -955,11 +957,42 @@  get_counters(const struct xt_table_info 
 	}
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+		     const struct xt_counters addme[],
+		     unsigned int *i)
+{
+	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+			 const struct xt_counters counters[])
+{
+	unsigned int i, cpu;
+
+	local_bh_disable();
+	cpu = smp_processor_id();
+	i = 0;
+	IP6T_ENTRY_ITERATE(t->entries[cpu],
+			   t->size,
+			   add_counter_to_entry,
+			   counters,
+			   &i);
+	local_bh_enable();
+}
+
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
+	struct xt_table_info *tmp;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -968,14 +1001,28 @@  static struct xt_counters *alloc_counter
 	counters = vmalloc_node(countersize, numa_node_id());
 
 	if (counters == NULL)
-		return ERR_PTR(-ENOMEM);
+		goto nomem;
+
+	tmp = xt_alloc_table_info(private->size);
+	if (!tmp)
+		goto free_counters;
+
+	xt_zero_table_entries(tmp);
+
+	mutex_lock(&table->lock);
+	xt_swap_table_entries(private, tmp);
+	synchronize_net();	/* Wait until smoke has cleared */
+
+	get_counters(tmp, counters);
+	put_counters(private, counters);
+	mutex_unlock(&table->lock);
 
-	/* First, sum counters... */
-	write_lock_bh(&table->lock);
-	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	xt_free_table_info(tmp);
 
-	return counters;
+ free_counters:
+	vfree(counters);
+ nomem:
+	return ERR_PTR(-ENOMEM);
 }
 
 static int
@@ -1342,28 +1389,6 @@  do_replace(struct net *net, void __user 
 	return ret;
 }
 
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static inline int
-add_counter_to_entry(struct ip6t_entry *e,
-		     const struct xt_counters addme[],
-		     unsigned int *i)
-{
-#if 0
-	duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
-		 *i,
-		 (long unsigned int)e->counters.pcnt,
-		 (long unsigned int)e->counters.bcnt,
-		 (long unsigned int)addme[*i].pcnt,
-		 (long unsigned int)addme[*i].bcnt);
-#endif
-
-	ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-	(*i)++;
-	return 0;
-}
-
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
 		int compat)
@@ -1424,13 +1449,14 @@  do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
+	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1439,8 +1465,9 @@  do_add_counters(struct net *net, void __
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
+	preempt_enable();
  unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free: