Patchwork [3/3] iptables: lock free counters (alternate version)

login
register
mail settings
Submitter stephen hemminger
Date Feb. 2, 2009, 11:33 p.m.
Message ID <20090202153357.3ac6edfa@extreme>
Download mbox | patch
Permalink /patch/21663/
State Not Applicable
Delegated to: David Miller
Headers show

Comments

stephen hemminger - Feb. 2, 2009, 11:33 p.m.
This is an alternative to earlier RCU/seqcount_t version of counters.
The counters operate as usual without locking, but when counters are read
they are rotated around the CPU's entries.  RCU is used in two ways, first to handle the
counter rotation, second for replace.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 include/linux/netfilter/x_tables.h |   10 +++-
 net/ipv4/netfilter/arp_tables.c    |   73 +++++++++++++++++++++++++++---------
 net/ipv4/netfilter/ip_tables.c     |   68 ++++++++++++++++++++++++---------
 net/ipv6/netfilter/ip6_tables.c    |   75 ++++++++++++++++++++++++++-----------
 net/netfilter/x_tables.c           |   43 +++++++++++++++------
 5 files changed, 197 insertions(+), 72 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet - Feb. 3, 2009, 7 p.m.
Stephen Hemminger a écrit :
> This is an alternative to earlier RCU/seqcount_t version of counters.
> The counters operate as usual without locking, but when counters are rotated
> around the CPU's entries.  RCU is used in two ways, first to handle the
> counter rotation, second for replace.
> 

Is it a working patch or just a prototype ?

> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> ---
>  include/linux/netfilter/x_tables.h |   10 +++-
>  net/ipv4/netfilter/arp_tables.c    |   73 +++++++++++++++++++++++++++---------
>  net/ipv4/netfilter/ip_tables.c     |   68 ++++++++++++++++++++++++---------
>  net/ipv6/netfilter/ip6_tables.c    |   75 ++++++++++++++++++++++++++-----------
>  net/netfilter/x_tables.c           |   43 +++++++++++++++------
>  5 files changed, 197 insertions(+), 72 deletions(-)
> 
> --- a/include/linux/netfilter/x_tables.h	2009-02-02 15:06:39.893751845 -0800
> +++ b/include/linux/netfilter/x_tables.h	2009-02-02 15:28:10.022574005 -0800
> @@ -353,7 +353,7 @@ struct xt_table
>  	unsigned int valid_hooks;
>  
>  	/* Lock for the curtain */
> -	rwlock_t lock;
> +	struct mutex lock;
>  
>  	/* Man behind the curtain... */
>  	struct xt_table_info *private;
> @@ -383,9 +383,15 @@ struct xt_table_info
>  	unsigned int hook_entry[NF_INET_NUMHOOKS];
>  	unsigned int underflow[NF_INET_NUMHOOKS];
>  
> +	/* For the dustman... */
> +	union {
> +		struct rcu_head rcu;
> +		struct work_struct work;
> +	};
> +
>  	/* ipt_entry tables: one per CPU */
>  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> -	char *entries[1];
> +	void *entries[1];
>  };
>  
>  #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> --- a/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:06:29.684249364 -0800
> +++ b/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:14:13.256499021 -0800
> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
>  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
>  	tgpar.hooknum = hook;
>  
> -	read_lock_bh(&table->lock);
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> -	private = table->private;
> -	table_base = (void *)private->entries[smp_processor_id()];
> +
> +	rcu_read_lock_bh();
> +	private = rcu_dereference(table->private);
> +	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
>  	e = get_entry(table_base, private->hook_entry[hook]);
>  
>  	/* For return from builtin chain */
> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
>  		}
>  	} while (!hotdrop);
>  
> -	read_unlock_bh(&table->lock);
> +	rcu_read_unlock_bh();
>  
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
>  	return 0;
>  }
>  
> +static inline int
> +set_counter_to_entry(struct ipt_entry *e,
> +		     const struct ipt_counters total[],
> +		     unsigned int *i)
> +{
> +	SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
> +
> +	(*i)++;
> +	return 0;
> +}
> +
> +
>  static void
> -get_counters(const struct xt_table_info *t,
> +get_counters(struct xt_table_info *t,
>  	     struct xt_counters counters[])
>  {
>  	unsigned int cpu;
>  	unsigned int i;
>  	unsigned int curcpu;
> +	struct ipt_entry *e;
>  
> -	/* Instead of clearing (by a previous call to memset())
> -	 * the counters and using adds, we set the counters
> -	 * with data used by 'current' CPU
> -	 * We dont care about preemption here.
> -	 */
> +	preempt_disable();
>  	curcpu = raw_smp_processor_id();
> -
> +	e = t->entries[curcpu];
>  	i = 0;
> -	IPT_ENTRY_ITERATE(t->entries[curcpu],
> +	IPT_ENTRY_ITERATE(e,
>  			  t->size,
>  			  set_entry_to_counter,

Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
set_entry_to_counter() might get garbage.
I suppose I already mentioned it :)

>  			  counters,
>  			  &i);
>  
>  	for_each_possible_cpu(cpu) {
> +		void *p;
> +
>  		if (cpu == curcpu)
>  			continue;
> +
> +		/* Swizzle the values and wait */
> +		e->counters = ((struct xt_counters) { 0, 0 });

I dont see what you want to do here...

e->counters is the counter associated with rule #0

> +		p = t->entries[cpu];
> +		rcu_assign_pointer(t->entries[cpu], e);
> +		synchronize_net();


Oh well, not this synchronize_net() :)

This won't provide atomic sampling of counters across all CPUs, and introduces large delays
on big machines (NR_CPUS >= 64)

What problem do we want to solve here ?


Current iptables is able to do an atomic snapshot because of the rwlock.

If we really want to keep this feature (but get rid of rwlock), we might do the reverse
with two seqlocks + RCU

One seqlock (seqlock_counters) to protect counter updates
One seqlock (seqlock_rules) to protect rules changes

Ie :

ipt_do_table() doing :
{
rcu_read_lock 
read_seqbegin(&table->seqlock_rules);
rcu fetch priv table pointer and work on it
 do {
 for all counters updates, use 
 do {
   seq = read_seqbegin(table->seqlock_counters);
   update counters
   }
 } while (!hotdrop);
rcu_read_unlock()
}

for get_counter() (iptables -L)
writer doing a write_seqlock(&table->seqlock_counters), waiting one RCU grace period, 
{
get / sum all counters (no updates of counters are allowed)
}
write_sequnlock();


for iptables_update/replace (iptables -A|I|D|R|Z|F...)
writer doing a write_seqlock(&table->seqlock_rules), waiting one RCU grace period, 
{
change rules/counters
}
write_sequnlock();


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet - Feb. 3, 2009, 7:19 p.m.
Eric Dumazet a écrit :
> Stephen Hemminger a écrit :
>> This is an alternative to earlier RCU/seqcount_t version of counters.
>> The counters operate as usual without locking, but when counters are rotated
>> around the CPU's entries.  RCU is used in two ways, first to handle the
>> counter rotation, second for replace.
>>
> 
> Is it a working patch or just a prototype ?
> 
>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>>
>> ---
>>  include/linux/netfilter/x_tables.h |   10 +++-
>>  net/ipv4/netfilter/arp_tables.c    |   73 +++++++++++++++++++++++++++---------
>>  net/ipv4/netfilter/ip_tables.c     |   68 ++++++++++++++++++++++++---------
>>  net/ipv6/netfilter/ip6_tables.c    |   75 ++++++++++++++++++++++++++-----------
>>  net/netfilter/x_tables.c           |   43 +++++++++++++++------
>>  5 files changed, 197 insertions(+), 72 deletions(-)
>>
>> --- a/include/linux/netfilter/x_tables.h	2009-02-02 15:06:39.893751845 -0800
>> +++ b/include/linux/netfilter/x_tables.h	2009-02-02 15:28:10.022574005 -0800
>> @@ -353,7 +353,7 @@ struct xt_table
>>  	unsigned int valid_hooks;
>>  
>>  	/* Lock for the curtain */
>> -	rwlock_t lock;
>> +	struct mutex lock;
>>  
>>  	/* Man behind the curtain... */
>>  	struct xt_table_info *private;
>> @@ -383,9 +383,15 @@ struct xt_table_info
>>  	unsigned int hook_entry[NF_INET_NUMHOOKS];
>>  	unsigned int underflow[NF_INET_NUMHOOKS];
>>  
>> +	/* For the dustman... */
>> +	union {
>> +		struct rcu_head rcu;
>> +		struct work_struct work;
>> +	};
>> +
>>  	/* ipt_entry tables: one per CPU */
>>  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
>> -	char *entries[1];
>> +	void *entries[1];
>>  };
>>  
>>  #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
>> --- a/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:06:29.684249364 -0800
>> +++ b/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:14:13.256499021 -0800
>> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
>>  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
>>  	tgpar.hooknum = hook;
>>  
>> -	read_lock_bh(&table->lock);
>>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
>> -	private = table->private;
>> -	table_base = (void *)private->entries[smp_processor_id()];
>> +
>> +	rcu_read_lock_bh();
>> +	private = rcu_dereference(table->private);
>> +	table_base = rcu_dereference(private->entries[smp_processor_id()]);
>> +
>>  	e = get_entry(table_base, private->hook_entry[hook]);
>>  
>>  	/* For return from builtin chain */
>> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
>>  		}
>>  	} while (!hotdrop);
>>  
>> -	read_unlock_bh(&table->lock);
>> +	rcu_read_unlock_bh();
>>  
>>  #ifdef DEBUG_ALLOW_ALL
>>  	return NF_ACCEPT;
>> @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
>>  	return 0;
>>  }
>>  
>> +static inline int
>> +set_counter_to_entry(struct ipt_entry *e,
>> +		     const struct ipt_counters total[],
>> +		     unsigned int *i)
>> +{
>> +	SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
>> +
>> +	(*i)++;
>> +	return 0;
>> +}
>> +
>> +
>>  static void
>> -get_counters(const struct xt_table_info *t,
>> +get_counters(struct xt_table_info *t,
>>  	     struct xt_counters counters[])
>>  {
>>  	unsigned int cpu;
>>  	unsigned int i;
>>  	unsigned int curcpu;
>> +	struct ipt_entry *e;
>>  
>> -	/* Instead of clearing (by a previous call to memset())
>> -	 * the counters and using adds, we set the counters
>> -	 * with data used by 'current' CPU
>> -	 * We dont care about preemption here.
>> -	 */
>> +	preempt_disable();
>>  	curcpu = raw_smp_processor_id();
>> -
>> +	e = t->entries[curcpu];
>>  	i = 0;
>> -	IPT_ENTRY_ITERATE(t->entries[curcpu],
>> +	IPT_ENTRY_ITERATE(e,
>>  			  t->size,
>>  			  set_entry_to_counter,
> 
> Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
> set_entry_to_counter() might get garbage.
> I suppose I already mentioned it :)
> 
>>  			  counters,
>>  			  &i);
>>  
>>  	for_each_possible_cpu(cpu) {
>> +		void *p;
>> +
>>  		if (cpu == curcpu)
>>  			continue;
>> +
>> +		/* Swizzle the values and wait */
>> +		e->counters = ((struct xt_counters) { 0, 0 });
> 
> I dont see what you want to do here...
> 
> e->counters is the counter associated with rule #0
> 
>> +		p = t->entries[cpu];
>> +		rcu_assign_pointer(t->entries[cpu], e);
>> +		synchronize_net();
> 
> 
> Oh well, not this synchronize_net() :)
> 
> This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
> on big machines (NR_CPUS >= 64)
> 
> What problem do we want to solve here ?
> 
> 
> Current iptables is able to do an atomic snapshot because of the rwlock.
> 
> If we really want to keep this feature (but get rid of rwlock), we might do the reverse
> with two seqlocks + RCU
> 
> One seqlock (seqlock_counters) to protect counter updates
> One seqlock (seqlock_rules) to protect rules changes
> 
> Ie :
> 
> ipt_do_table() doing :
> {
> rcu_read_lock 
> read_seqbegin(&table->seqlock_rules);
> rcu fetch priv table pointer and work on it
>  do {
>  for all counters updates, use 
>  do {
>    seq = read_seqbegin(table->seqlock_counters);

Hum... reading my mail, this one can be done once only, at the beginning

Why then I suggested two different seqlocks, I am wondering :)

I need to think a little bit more; we might do something to allow several "iptables -L" in parallel

maybe an atomic_t to protect counters would be ok :


Ie :

One atomic_t (counters_readers) to protect counter updates
One seqlock (seqlock_rules) to protect rules changes



ipt_do_table() doing :
{
begin:
read_seqbegin(&table->seqlock_rules); // it could loop but with BH enabled
rcu_read_lock_bh();
while (atomic_read(&table->counters_readers) > 0) {
   cpu_relax();
   rcu_read_unlock_bh();
   goto begin;
}
private = rcu_dereference(table->private);
...
 do {
 for all counters updates, use 
 do {
   update counters
   }
 } while (!hotdrop);
rcu_read_unlock_bh()
}


for get_counter() (iptables -L)
atomic_inc(&table->counters_readers);
waiting one RCU grace period, 
{
get / sum all counters (no updates of counters are allowed)
}
atomic_dec(&table->counters_readers)



for iptables_update/replace (iptables -A|I|D|R|Z|F...)
writer doing a write_seqlock(&table->seqlock_rules), waiting one RCU grace period, 
{
change rules/counters
}
write_sequnlock();





--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul E. McKenney - Feb. 3, 2009, 7:32 p.m.
On Tue, Feb 03, 2009 at 08:00:16PM +0100, Eric Dumazet wrote:
> Stephen Hemminger a écrit :
> > This is an alternative to earlier RCU/seqcount_t version of counters.
> > The counters operate as usual without locking, but when counters are rotated
> > around the CPU's entries.  RCU is used in two ways, first to handle the
> > counter rotation, second for replace.
> 
> Is it a working patch or just a prototype ?
> 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> > 
> > ---
> >  include/linux/netfilter/x_tables.h |   10 +++-
> >  net/ipv4/netfilter/arp_tables.c    |   73 +++++++++++++++++++++++++++---------
> >  net/ipv4/netfilter/ip_tables.c     |   68 ++++++++++++++++++++++++---------
> >  net/ipv6/netfilter/ip6_tables.c    |   75 ++++++++++++++++++++++++++-----------
> >  net/netfilter/x_tables.c           |   43 +++++++++++++++------
> >  5 files changed, 197 insertions(+), 72 deletions(-)
> > 
> > --- a/include/linux/netfilter/x_tables.h	2009-02-02 15:06:39.893751845 -0800
> > +++ b/include/linux/netfilter/x_tables.h	2009-02-02 15:28:10.022574005 -0800
> > @@ -353,7 +353,7 @@ struct xt_table
> >  	unsigned int valid_hooks;
> >  
> >  	/* Lock for the curtain */
> > -	rwlock_t lock;
> > +	struct mutex lock;
> >  
> >  	/* Man behind the curtain... */
> >  	struct xt_table_info *private;
> > @@ -383,9 +383,15 @@ struct xt_table_info
> >  	unsigned int hook_entry[NF_INET_NUMHOOKS];
> >  	unsigned int underflow[NF_INET_NUMHOOKS];
> >  
> > +	/* For the dustman... */
> > +	union {
> > +		struct rcu_head rcu;
> > +		struct work_struct work;
> > +	};
> > +
> >  	/* ipt_entry tables: one per CPU */
> >  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> > -	char *entries[1];
> > +	void *entries[1];
> >  };
> >  
> >  #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> > --- a/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:06:29.684249364 -0800
> > +++ b/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:14:13.256499021 -0800
> > @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
> >  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
> >  	tgpar.hooknum = hook;
> >  
> > -	read_lock_bh(&table->lock);
> >  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> > -	private = table->private;
> > -	table_base = (void *)private->entries[smp_processor_id()];
> > +
> > +	rcu_read_lock_bh();
> > +	private = rcu_dereference(table->private);
> > +	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> > +
> >  	e = get_entry(table_base, private->hook_entry[hook]);
> >  
> >  	/* For return from builtin chain */
> > @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
> >  		}
> >  	} while (!hotdrop);
> >  
> > -	read_unlock_bh(&table->lock);
> > +	rcu_read_unlock_bh();
> >  
> >  #ifdef DEBUG_ALLOW_ALL
> >  	return NF_ACCEPT;
> > @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
> >  	return 0;
> >  }
> >  
> > +static inline int
> > +set_counter_to_entry(struct ipt_entry *e,
> > +		     const struct ipt_counters total[],
> > +		     unsigned int *i)
> > +{
> > +	SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
> > +
> > +	(*i)++;
> > +	return 0;
> > +}
> > +
> > +
> >  static void
> > -get_counters(const struct xt_table_info *t,
> > +get_counters(struct xt_table_info *t,
> >  	     struct xt_counters counters[])
> >  {
> >  	unsigned int cpu;
> >  	unsigned int i;
> >  	unsigned int curcpu;
> > +	struct ipt_entry *e;
> >  
> > -	/* Instead of clearing (by a previous call to memset())
> > -	 * the counters and using adds, we set the counters
> > -	 * with data used by 'current' CPU
> > -	 * We dont care about preemption here.
> > -	 */
> > +	preempt_disable();
> >  	curcpu = raw_smp_processor_id();
> > -
> > +	e = t->entries[curcpu];
> >  	i = 0;
> > -	IPT_ENTRY_ITERATE(t->entries[curcpu],
> > +	IPT_ENTRY_ITERATE(e,
> >  			  t->size,
> >  			  set_entry_to_counter,
> 
> Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
> set_entry_to_counter() might get garbage.
> I suppose I already mentioned it :)
> 
> >  			  counters,
> >  			  &i);
> >  
> >  	for_each_possible_cpu(cpu) {
> > +		void *p;
> > +
> >  		if (cpu == curcpu)
> >  			continue;
> > +
> > +		/* Swizzle the values and wait */
> > +		e->counters = ((struct xt_counters) { 0, 0 });
> 
> I dont see what you want to do here...
> 
> e->counters is the counter associated with rule #0
> 
> > +		p = t->entries[cpu];
> > +		rcu_assign_pointer(t->entries[cpu], e);
> > +		synchronize_net();
> 
> 
> Oh well, not this synchronize_net() :)
> 
> This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
> on big machines (NR_CPUS >= 64)

Why would this not provide the moral equivalent of atomic sampling?
The code above switches to another counter set, and waits for a grace
period.  Shouldn't this mean that all CPUs that were incrementing the
old set of counters have finished doing so, so that the aggregate count
covers all CPUs that started their increments before the pointer switch?
Same as acquiring a write lock, which would wait for all CPUs that
started their increments before starting the write-lock acquisition.
CPUs that started their increments after starting the write acquisition
would not be accounted for in the total, same as the RCU approach.

Steve's approach does delay reading out the counters, but it avoids
delaying any CPU trying to increment the counters.

So what am I missing here?

							Thanx, Paul

> What problem do we want to solve here ?
> 
> 
> Current iptables is able to do an atomic snapshot because of the rwlock.
> 
> If we really want to keep this feature (but get rid of rwlock), we might do the reverse
> with two seqlocks + RCU
> 
> One seqlock (seqlock_counters) to protect counter updates
> One seqlock (seqlock_rules) to protect rules changes
> 
> Ie :
> 
> ipt_do_table() doing :
> {
> rcu_read_lock 
> read_seqbegin(&table->seqlock_rules);
> rcu fetch priv table pointer and work on it
>  do {
>  for all counters updates, use 
>  do {
>    seq = read_seqbegin(table->seqlock_counters);
>    update counters
>    }
>  } while (!hotdrop);
> rcu_read_unlock()
> }
> 
> for get_counter() (iptables -L)
> writer doing a write_seqlock(&table->seqlock_counters), waiting one RCU grace period, 
> {
> get / sum all counters (no updates of counters are allowed)
> }
> write_sequnlock();
> 
> 
> for iptables_update/replace (iptables -A|I|D|R|Z|F...)
> writer doing a write_seqlock(&table->seqlock_rules), waiting one RCU grace period, 
> {
> change rules/counters
> }
> write_sequnlock();
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet - Feb. 3, 2009, 8:20 p.m.
Paul E. McKenney a écrit :
> On Tue, Feb 03, 2009 at 08:00:16PM +0100, Eric Dumazet wrote:
>> Stephen Hemminger a écrit :
>>> This is an alternative to earlier RCU/seqcount_t version of counters.
>>> The counters operate as usual without locking, but when counters are rotated
>>> around the CPU's entries.  RCU is used in two ways, first to handle the
>>> counter rotation, second for replace.
>> Is it a working patch or just a prototype ?
>>
>>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>>>
>>> ---
>>>  include/linux/netfilter/x_tables.h |   10 +++-
>>>  net/ipv4/netfilter/arp_tables.c    |   73 +++++++++++++++++++++++++++---------
>>>  net/ipv4/netfilter/ip_tables.c     |   68 ++++++++++++++++++++++++---------
>>>  net/ipv6/netfilter/ip6_tables.c    |   75 ++++++++++++++++++++++++++-----------
>>>  net/netfilter/x_tables.c           |   43 +++++++++++++++------
>>>  5 files changed, 197 insertions(+), 72 deletions(-)
>>>
>>> --- a/include/linux/netfilter/x_tables.h	2009-02-02 15:06:39.893751845 -0800
>>> +++ b/include/linux/netfilter/x_tables.h	2009-02-02 15:28:10.022574005 -0800
>>> @@ -353,7 +353,7 @@ struct xt_table
>>>  	unsigned int valid_hooks;
>>>  
>>>  	/* Lock for the curtain */
>>> -	rwlock_t lock;
>>> +	struct mutex lock;
>>>  
>>>  	/* Man behind the curtain... */
>>>  	struct xt_table_info *private;
>>> @@ -383,9 +383,15 @@ struct xt_table_info
>>>  	unsigned int hook_entry[NF_INET_NUMHOOKS];
>>>  	unsigned int underflow[NF_INET_NUMHOOKS];
>>>  
>>> +	/* For the dustman... */
>>> +	union {
>>> +		struct rcu_head rcu;
>>> +		struct work_struct work;
>>> +	};
>>> +
>>>  	/* ipt_entry tables: one per CPU */
>>>  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
>>> -	char *entries[1];
>>> +	void *entries[1];
>>>  };
>>>  
>>>  #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
>>> --- a/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:06:29.684249364 -0800
>>> +++ b/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:14:13.256499021 -0800
>>> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
>>>  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
>>>  	tgpar.hooknum = hook;
>>>  
>>> -	read_lock_bh(&table->lock);
>>>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
>>> -	private = table->private;
>>> -	table_base = (void *)private->entries[smp_processor_id()];
>>> +
>>> +	rcu_read_lock_bh();
>>> +	private = rcu_dereference(table->private);
>>> +	table_base = rcu_dereference(private->entries[smp_processor_id()]);
>>> +
>>>  	e = get_entry(table_base, private->hook_entry[hook]);
>>>  
>>>  	/* For return from builtin chain */
>>> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
>>>  		}
>>>  	} while (!hotdrop);
>>>  
>>> -	read_unlock_bh(&table->lock);
>>> +	rcu_read_unlock_bh();
>>>  
>>>  #ifdef DEBUG_ALLOW_ALL
>>>  	return NF_ACCEPT;
>>> @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
>>>  	return 0;
>>>  }
>>>  
>>> +static inline int
>>> +set_counter_to_entry(struct ipt_entry *e,
>>> +		     const struct ipt_counters total[],
>>> +		     unsigned int *i)
>>> +{
>>> +	SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
>>> +
>>> +	(*i)++;
>>> +	return 0;
>>> +}
>>> +
>>> +
>>>  static void
>>> -get_counters(const struct xt_table_info *t,
>>> +get_counters(struct xt_table_info *t,
>>>  	     struct xt_counters counters[])
>>>  {
>>>  	unsigned int cpu;
>>>  	unsigned int i;
>>>  	unsigned int curcpu;
>>> +	struct ipt_entry *e;
>>>  
>>> -	/* Instead of clearing (by a previous call to memset())
>>> -	 * the counters and using adds, we set the counters
>>> -	 * with data used by 'current' CPU
>>> -	 * We dont care about preemption here.
>>> -	 */
>>> +	preempt_disable();
>>>  	curcpu = raw_smp_processor_id();
>>> -
>>> +	e = t->entries[curcpu];
>>>  	i = 0;
>>> -	IPT_ENTRY_ITERATE(t->entries[curcpu],
>>> +	IPT_ENTRY_ITERATE(e,
>>>  			  t->size,
>>>  			  set_entry_to_counter,
>> Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
>> set_entry_to_counter() might get garbage.
>> I suppose I already mentioned it :)
>>
>>>  			  counters,
>>>  			  &i);
>>>  
>>>  	for_each_possible_cpu(cpu) {
>>> +		void *p;
>>> +
>>>  		if (cpu == curcpu)
>>>  			continue;
>>> +
>>> +		/* Swizzle the values and wait */
>>> +		e->counters = ((struct xt_counters) { 0, 0 });
>> I dont see what you want to do here...
>>
>> e->counters is the counter associated with rule #0
>>
>>> +		p = t->entries[cpu];
>>> +		rcu_assign_pointer(t->entries[cpu], e);
>>> +		synchronize_net();
>>
>> Oh well, not this synchronize_net() :)
>>
>> This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
>> on big machines (NR_CPUS >= 64)
> 
> Why would this not provide the moral equivalent of atomic sampling?
> The code above switches to another counter set, and waits for a grace
> period.  Shouldn't this mean that all CPUs that were incrementing the
> old set of counters have finished doing so, so that the aggregate count
> covers all CPUs that started their increments before the pointer switch?
> Same as acquiring a write lock, which would wait for all CPUs that
> started their increments before starting the write-lock acquisition.
> CPUs that started their increments after starting the write acquisition
> would not be accounted for in the total, same as the RCU approach.
> 
> Steve's approach does delay reading out the counters, but it avoids
> delaying any CPU trying to increment the counters.

I see your point, but this is not what Stephen implemented.

So... which counters will the CPU increment, if not delayed?

How counters will be synced again after our 'iptables -L' finished ?

"iptable -L" is not supposed to miss some counters updates (only some packets
 might be droped at NIC level because we spend time in the collection).
If packets matches some rules, we really want up2date counters.

Maybe we need for this collection an extra "cpu", to collect 
all increments that were done when CPUs were directed to a 
"secondary table/counters"

> 
> So what am I missing here?
> 

Well, I saw one synchronize_net() inside the for_each_possible_cpu(cpu) loop.
Say we have NR_CPUS=4096, how long will it takes to perform "iptables -L" ?



General/intuitive idea would be :

switch pointers to a newly allocated table (and zeroed counters)
wait one RCU grace period
collect/sum all counters of "old" table + (all cpus) into user provided table
restore previous table
wait one RCU grace period
disable_bh()
collect/sum all counters of "new and temporary" table (all cpus) and
reinject them into local cpu table (this cpu should not be interrupted)
enable_bh()

This way, "iptables -L" is not too expensive and doesnt block packet processing at all.

Thanks Paul

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger - Feb. 3, 2009, 8:44 p.m.
> >>> +	e = t->entries[curcpu];
> >>>  	i = 0;
> >>> -	IPT_ENTRY_ITERATE(t->entries[curcpu],
> >>> +	IPT_ENTRY_ITERATE(e,
> >>>  			  t->size,
> >>>  			  set_entry_to_counter,
> >> Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
> >> set_entry_to_counter() might get garbage.
> >> I suppose I already mentioned it :)


Need to change preempt_disable to local_bh_disable.

> >>>  			  counters,
> >>>  			  &i);
> >>>  
> >>>  	for_each_possible_cpu(cpu) {
> >>> +		void *p;
> >>> +
> >>>  		if (cpu == curcpu)
> >>>  			continue;
> >>> +
> >>> +		/* Swizzle the values and wait */
> >>> +		e->counters = ((struct xt_counters) { 0, 0 });
> >> I dont see what you want to do here...
> >>
> >> e->counters is the counter associated with rule #0
> >>
> >>> +		p = t->entries[cpu];
> >>> +		rcu_assign_pointer(t->entries[cpu], e);
> >>> +		synchronize_net();
> >>
> >> Oh well, not this synchronize_net() :)
> >>
> >> This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
> >> on big machines (NR_CPUS >= 64)

For a large number of CPUs it would be possible to allocate
a full set of ncpu-1 new counters, then swap them in and then do one synchronize_net().
Or instead of synchronize_net, maybe a "wait for CPU #n" to go through
grace period.

> > Why would this not provide the moral equivalent of atomic sampling?
> > The code above switches to another counter set, and waits for a grace
> > period.  Shouldn't this mean that all CPUs that were incrementing the
> > old set of counters have finished doing so, so that the aggregate count
> > covers all CPUs that started their increments before the pointer switch?
> > Same as acquiring a write lock, which would wait for all CPUs that
> > started their increments before starting the write-lock acquisition.
> > CPUs that started their increments after starting the write acquisition
> > would not be accounted for in the total, same as the RCU approach.
> > 
> > Steve's approach does delay reading out the counters, but it avoids
> > delaying any CPU trying to increment the counters.
> 
> I see your point, but this is not what Stephen implemented.
> 
> So.. CPU will increments which counters, if not delayed ?

The concept is that only the sum of all the counters matters, not
which CPU has the count.

> How counters will be synced again after our 'iptables -L' finished ?
> 
> "iptable -L" is not supposed to miss some counters updates (only some packets
>  might be droped at NIC level because we spend time in the collection).
> If packets matches some rules, we really want up2date counters.
> 
> Maybe we need for this collection an extra "cpu", to collect 
> all increments that were done when CPUs where directed to a 
> "secondary table/counters"
> 
> > 
> > So what am I missing here?
> > 
> 
> Well, I saw one synchronize_net() inside the for_each_possible_cpu(cpu) loop.
> Say we have NR_CPUS=4096, how long will it takes to perform "iptables -L" ?
> 
> 
> 
> General/intuitive idea would be :
> 
> switch pointers to a newly allocated table (and zeroed counters)
> wait one RCU grace period
> collect/sum all counters of "old" table + (all cpus) into user provided table
> restore previous table
> wait one RCU grace period
> disable_bh()
> collect/sum all counters of "new and temporary" table (all cpus) and
> reinject them into local cpu table (this cpu should not be interrupted)
> enable_bh()
> 
> This way, "iptables -L" is not too expensive and doesnt block packet processing at all.

Pretty much what I said.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eric Dumazet - Feb. 3, 2009, 9:05 p.m.
Stephen Hemminger a écrit :
>>
>> General/intuitive idea would be :
>>
>> switch pointers to a newly allocated table (and zeroed counters)
>> wait one RCU grace period
>> collect/sum all counters of "old" table + (all cpus) into user provided table
>> restore previous table
>> wait one RCU grace period
>> disable_bh()
>> collect/sum all counters of "new and temporary" table (all cpus) and
>> reinject them into local cpu table (this cpu should not be interrupted)
>> enable_bh()
>>
>> This way, "iptables -L" is not too expensive and doesnt block packet processing at all.
> 
> Pretty much what I said.


Cool then, sorry for misunderstanding your patch.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul E. McKenney - Feb. 3, 2009, 9:10 p.m.
On Tue, Feb 03, 2009 at 09:20:04PM +0100, Eric Dumazet wrote:
> Paul E. McKenney a écrit :
> > On Tue, Feb 03, 2009 at 08:00:16PM +0100, Eric Dumazet wrote:
> >> Stephen Hemminger a écrit :
> >>> This is an alternative to earlier RCU/seqcount_t version of counters.
> >>> The counters operate as usual without locking, but when counters are rotated
> >>> around the CPU's entries.  RCU is used in two ways, first to handle the
> >>> counter rotation, second for replace.
> >> Is it a working patch or just a prototype ?
> >>
> >>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> >>>
> >>> ---
> >>>  include/linux/netfilter/x_tables.h |   10 +++-
> >>>  net/ipv4/netfilter/arp_tables.c    |   73 +++++++++++++++++++++++++++---------
> >>>  net/ipv4/netfilter/ip_tables.c     |   68 ++++++++++++++++++++++++---------
> >>>  net/ipv6/netfilter/ip6_tables.c    |   75 ++++++++++++++++++++++++++-----------
> >>>  net/netfilter/x_tables.c           |   43 +++++++++++++++------
> >>>  5 files changed, 197 insertions(+), 72 deletions(-)
> >>>
> >>> --- a/include/linux/netfilter/x_tables.h	2009-02-02 15:06:39.893751845 -0800
> >>> +++ b/include/linux/netfilter/x_tables.h	2009-02-02 15:28:10.022574005 -0800
> >>> @@ -353,7 +353,7 @@ struct xt_table
> >>>  	unsigned int valid_hooks;
> >>>  
> >>>  	/* Lock for the curtain */
> >>> -	rwlock_t lock;
> >>> +	struct mutex lock;
> >>>  
> >>>  	/* Man behind the curtain... */
> >>>  	struct xt_table_info *private;
> >>> @@ -383,9 +383,15 @@ struct xt_table_info
> >>>  	unsigned int hook_entry[NF_INET_NUMHOOKS];
> >>>  	unsigned int underflow[NF_INET_NUMHOOKS];
> >>>  
> >>> +	/* For the dustman... */
> >>> +	union {
> >>> +		struct rcu_head rcu;
> >>> +		struct work_struct work;
> >>> +	};
> >>> +
> >>>  	/* ipt_entry tables: one per CPU */
> >>>  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> >>> -	char *entries[1];
> >>> +	void *entries[1];
> >>>  };
> >>>  
> >>>  #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> >>> --- a/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:06:29.684249364 -0800
> >>> +++ b/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:14:13.256499021 -0800
> >>> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
> >>>  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
> >>>  	tgpar.hooknum = hook;
> >>>  
> >>> -	read_lock_bh(&table->lock);
> >>>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> >>> -	private = table->private;
> >>> -	table_base = (void *)private->entries[smp_processor_id()];
> >>> +
> >>> +	rcu_read_lock_bh();
> >>> +	private = rcu_dereference(table->private);
> >>> +	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> >>> +
> >>>  	e = get_entry(table_base, private->hook_entry[hook]);
> >>>  
> >>>  	/* For return from builtin chain */
> >>> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
> >>>  		}
> >>>  	} while (!hotdrop);
> >>>  
> >>> -	read_unlock_bh(&table->lock);
> >>> +	rcu_read_unlock_bh();
> >>>  
> >>>  #ifdef DEBUG_ALLOW_ALL
> >>>  	return NF_ACCEPT;
> >>> @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
> >>>  	return 0;
> >>>  }
> >>>  
> >>> +static inline int
> >>> +set_counter_to_entry(struct ipt_entry *e,
> >>> +		     const struct ipt_counters total[],
> >>> +		     unsigned int *i)
> >>> +{
> >>> +	SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
> >>> +
> >>> +	(*i)++;
> >>> +	return 0;
> >>> +}
> >>> +
> >>> +
> >>>  static void
> >>> -get_counters(const struct xt_table_info *t,
> >>> +get_counters(struct xt_table_info *t,
> >>>  	     struct xt_counters counters[])
> >>>  {
> >>>  	unsigned int cpu;
> >>>  	unsigned int i;
> >>>  	unsigned int curcpu;
> >>> +	struct ipt_entry *e;
> >>>  
> >>> -	/* Instead of clearing (by a previous call to memset())
> >>> -	 * the counters and using adds, we set the counters
> >>> -	 * with data used by 'current' CPU
> >>> -	 * We dont care about preemption here.
> >>> -	 */
> >>> +	preempt_disable();
> >>>  	curcpu = raw_smp_processor_id();
> >>> -
> >>> +	e = t->entries[curcpu];
> >>>  	i = 0;
> >>> -	IPT_ENTRY_ITERATE(t->entries[curcpu],
> >>> +	IPT_ENTRY_ITERATE(e,
> >>>  			  t->size,
> >>>  			  set_entry_to_counter,
> >> Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
> >> set_entry_to_counter() might get garbage.
> >> I suppose I already mentioned it :)
> >>
> >>>  			  counters,
> >>>  			  &i);
> >>>  
> >>>  	for_each_possible_cpu(cpu) {
> >>> +		void *p;
> >>> +
> >>>  		if (cpu == curcpu)
> >>>  			continue;
> >>> +
> >>> +		/* Swizzle the values and wait */
> >>> +		e->counters = ((struct xt_counters) { 0, 0 });
> >> I dont see what you want to do here...
> >>
> >> e->counters is the counter associated with rule #0
> >>
> >>> +		p = t->entries[cpu];
> >>> +		rcu_assign_pointer(t->entries[cpu], e);
> >>> +		synchronize_net();
> >>
> >> Oh well, not this synchronize_net() :)
> >>
> >> This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
> >> on big machines (NR_CPUS >= 64)
> > 
> > Why would this not provide the moral equivalent of atomic sampling?
> > The code above switches to another counter set, and waits for a grace
> > period.  Shouldn't this mean that all CPUs that were incrementing the
> > old set of counters have finished doing so, so that the aggregate count
> > covers all CPUs that started their increments before the pointer switch?
> > Same as acquiring a write lock, which would wait for all CPUs that
> > started their increments before starting the write-lock acquisition.
> > CPUs that started their increments after starting the write acquisition
> > would not be accounted for in the total, same as the RCU approach.
> > 
> > Steve's approach does delay reading out the counters, but it avoids
> > delaying any CPU trying to increment the counters.
> 
> I see your point, but this is not what Stephen implemented.
> 
> So.. which counters will a CPU increment, if not delayed ?

The new set installed by the rcu_assign_pointer().

> How will counters be synced again after our 'iptables -L' has finished ?

The usual approach would be to have three sets of counters, one currently
being incremented, one just removed from service, and the last one holding
the cumulative value.  After a synchronize_net() following removing
a set from service, you add in the values in the previous set removed
from service.  Then you keep the new set for the next 'iptables -L'.

> "iptables -L" is not supposed to miss some counter updates (only some packets
>  might be dropped at NIC level because we spend time in the collection).
> If packets matches some rules, we really want up2date counters.

No counter updates would be lost using the above method.

> Maybe we need for this collection an extra "cpu", to collect 
> all increments that were done when CPUs were directed to a 
> "secondary table/counters"

It should be easier to maintain a third set of counters that hold the
accumulated counts from the earlier instances of 'iptables -L'.

> > So what am I missing here?
> 
> Well, I saw one synchronize_net() inside the for_each_possible_cpu(cpu) loop.
> > Say we have NR_CPUS=4096, how long will it take to perform "iptables -L" ?

Good point, the for_each_possible_cpu() was cut out -- I should have
gone back and looked at the original patch.

Seems like it should be possible to do a single synchronize_net()
after swizzling all the counters...

> General/intuitive idea would be :
> 
> switch pointers to a newly allocated table (and zeroed counters)
> wait one RCU grace period
> collect/sum all counters of "old" table + (all cpus) into user provided table
> restore previous table
> wait one RCU grace period
> disable_bh()
> collect/sum all counters of "new and temporary" table (all cpus) and
> reinject them into local cpu table (this cpu should not be interrupted)
> enable_bh()
> 
> > This way, "iptables -L" is not too expensive and doesn't block packet processing at all.

My thought would be:

o	acquire some sort of mutex.

o	switch counters to newly allocated (and zeroed) table (T1).
	The old table being switched out is T2.

o	wait one RCU grace period.

o	Sum T2 into a single global counter (G).

o	Free T2.

o	Copy G to a local variable.

o	release the mutex.

o	Return the value of the local variable.

Then you can repeat, allocating a new table again and using the new
value of G.

Which may well be what you are saying above,

							Thanx, Paul
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
stephen hemminger - Feb. 3, 2009, 9:22 p.m.
On Tue, 3 Feb 2009 13:10:00 -0800
"Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:

> On Tue, Feb 03, 2009 at 09:20:04PM +0100, Eric Dumazet wrote:
> > Paul E. McKenney a écrit :
> > > On Tue, Feb 03, 2009 at 08:00:16PM +0100, Eric Dumazet wrote:
> > >> Stephen Hemminger a écrit :
> > >>> This is an alternative to earlier RCU/seqcount_t version of counters.
> > >>> The counters operate as usual without locking, but when counters are rotated
> > >>> around the CPU's entries.  RCU is used in two ways, first to handle the
> > >>> counter rotation, second for replace.
> > >> Is it a working patch or just a prototype ?
> > >>
> > >>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> > >>>
> > >>> ---
> > >>>  include/linux/netfilter/x_tables.h |   10 +++-
> > >>>  net/ipv4/netfilter/arp_tables.c    |   73 +++++++++++++++++++++++++++---------
> > >>>  net/ipv4/netfilter/ip_tables.c     |   68 ++++++++++++++++++++++++---------
> > >>>  net/ipv6/netfilter/ip6_tables.c    |   75 ++++++++++++++++++++++++++-----------
> > >>>  net/netfilter/x_tables.c           |   43 +++++++++++++++------
> > >>>  5 files changed, 197 insertions(+), 72 deletions(-)
> > >>>
> > >>> --- a/include/linux/netfilter/x_tables.h	2009-02-02 15:06:39.893751845 -0800
> > >>> +++ b/include/linux/netfilter/x_tables.h	2009-02-02 15:28:10.022574005 -0800
> > >>> @@ -353,7 +353,7 @@ struct xt_table
> > >>>  	unsigned int valid_hooks;
> > >>>  
> > >>>  	/* Lock for the curtain */
> > >>> -	rwlock_t lock;
> > >>> +	struct mutex lock;
> > >>>  
> > >>>  	/* Man behind the curtain... */
> > >>>  	struct xt_table_info *private;
> > >>> @@ -383,9 +383,15 @@ struct xt_table_info
> > >>>  	unsigned int hook_entry[NF_INET_NUMHOOKS];
> > >>>  	unsigned int underflow[NF_INET_NUMHOOKS];
> > >>>  
> > >>> +	/* For the dustman... */
> > >>> +	union {
> > >>> +		struct rcu_head rcu;
> > >>> +		struct work_struct work;
> > >>> +	};
> > >>> +
> > >>>  	/* ipt_entry tables: one per CPU */
> > >>>  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> > >>> -	char *entries[1];
> > >>> +	void *entries[1];
> > >>>  };
> > >>>  
> > >>>  #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> > >>> --- a/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:06:29.684249364 -0800
> > >>> +++ b/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:14:13.256499021 -0800
> > >>> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
> > >>>  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
> > >>>  	tgpar.hooknum = hook;
> > >>>  
> > >>> -	read_lock_bh(&table->lock);
> > >>>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> > >>> -	private = table->private;
> > >>> -	table_base = (void *)private->entries[smp_processor_id()];
> > >>> +
> > >>> +	rcu_read_lock_bh();
> > >>> +	private = rcu_dereference(table->private);
> > >>> +	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> > >>> +
> > >>>  	e = get_entry(table_base, private->hook_entry[hook]);
> > >>>  
> > >>>  	/* For return from builtin chain */
> > >>> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
> > >>>  		}
> > >>>  	} while (!hotdrop);
> > >>>  
> > >>> -	read_unlock_bh(&table->lock);
> > >>> +	rcu_read_unlock_bh();
> > >>>  
> > >>>  #ifdef DEBUG_ALLOW_ALL
> > >>>  	return NF_ACCEPT;
> > >>> @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
> > >>>  	return 0;
> > >>>  }
> > >>>  
> > >>> +static inline int
> > >>> +set_counter_to_entry(struct ipt_entry *e,
> > >>> +		     const struct ipt_counters total[],
> > >>> +		     unsigned int *i)
> > >>> +{
> > >>> +	SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
> > >>> +
> > >>> +	(*i)++;
> > >>> +	return 0;
> > >>> +}
> > >>> +
> > >>> +
> > >>>  static void
> > >>> -get_counters(const struct xt_table_info *t,
> > >>> +get_counters(struct xt_table_info *t,
> > >>>  	     struct xt_counters counters[])
> > >>>  {
> > >>>  	unsigned int cpu;
> > >>>  	unsigned int i;
> > >>>  	unsigned int curcpu;
> > >>> +	struct ipt_entry *e;
> > >>>  
> > >>> -	/* Instead of clearing (by a previous call to memset())
> > >>> -	 * the counters and using adds, we set the counters
> > >>> -	 * with data used by 'current' CPU
> > >>> -	 * We dont care about preemption here.
> > >>> -	 */
> > >>> +	preempt_disable();
> > >>>  	curcpu = raw_smp_processor_id();
> > >>> -
> > >>> +	e = t->entries[curcpu];
> > >>>  	i = 0;
> > >>> -	IPT_ENTRY_ITERATE(t->entries[curcpu],
> > >>> +	IPT_ENTRY_ITERATE(e,
> > >>>  			  t->size,
> > >>>  			  set_entry_to_counter,
> > >> Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
> > >> set_entry_to_counter() might get garbage.
> > >> I suppose I already mentioned it :)
> > >>
> > >>>  			  counters,
> > >>>  			  &i);
> > >>>  
> > >>>  	for_each_possible_cpu(cpu) {
> > >>> +		void *p;
> > >>> +
> > >>>  		if (cpu == curcpu)
> > >>>  			continue;
> > >>> +
> > >>> +		/* Swizzle the values and wait */
> > >>> +		e->counters = ((struct xt_counters) { 0, 0 });
> > >> I dont see what you want to do here...
> > >>
> > >> e->counters is the counter associated with rule #0
> > >>
> > >>> +		p = t->entries[cpu];
> > >>> +		rcu_assign_pointer(t->entries[cpu], e);
> > >>> +		synchronize_net();
> > >>
> > >> Oh well, not this synchronize_net() :)
> > >>
> > >> This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
> > >> on big machines (NR_CPUS >= 64)
> > > 
> > > Why would this not provide the moral equivalent of atomic sampling?
> > > The code above switches to another counter set, and waits for a grace
> > > period.  Shouldn't this mean that all CPUs that were incrementing the
> > > old set of counters have finished doing so, so that the aggregate count
> > > covers all CPUs that started their increments before the pointer switch?
> > > Same as acquiring a write lock, which would wait for all CPUs that
> > > started their increments before starting the write-lock acquisition.
> > > CPUs that started their increments after starting the write acquisition
> > > would not be accounted for in the total, same as the RCU approach.
> > > 
> > > Steve's approach does delay reading out the counters, but it avoids
> > > delaying any CPU trying to increment the counters.
> > 
> > I see your point, but this is not what Stephen implemented.
> > 
> > So.. CPU will increments which counters, if not delayed ?
> 
> The new set installed by the rcu_assign_pointer().
> 
> > How counters will be synced again after our 'iptables -L' finished ?
> 
> The usual approach would be to have three sets of counters, one currently
> being incremented, one just removed from service, and the last one holding
> the cumulative value.  After a synchronize_net() following removing
> a set from service, you add in the values in the previous set removed
> from service.  Then you keep the new set for the next 'iptables -L'.
> 
> > "iptable -L" is not supposed to miss some counters updates (only some packets
> >  might be droped at NIC level because we spend time in the collection).
> > If packets matches some rules, we really want up2date counters.
> 
> No counter updates would be lost using the above method.
> 
> > Maybe we need for this collection an extra "cpu", to collect 
> > all increments that were done when CPUs where directed to a 
> > "secondary table/counters"
> 
> It should be easier to maintain a third set of counters that hold the
> accumulated counts from the earlier instances of 'iptables -L'.
> 
> > > So what am I missing here?
> > 
> > Well, I saw one synchronize_net() inside the for_each_possible_cpu(cpu) loop.
> > Say we have NR_CPUS=4096, how long will it takes to perform "iptables -L" ?
> 
> Good point, the for_each_possible_cpu() was cut out -- I should have
> gone back and looked at the original patch.
> 
> Seems like it should be possible to do a single synchronize_net()
> after swizzling all the counters...
> 
> > General/intuitive idea would be :
> > 
> > switch pointers to a newly allocated table (and zeroed counters)
> > wait one RCU grace period
> > collect/sum all counters of "old" table + (all cpus) into user provided table
> > restore previous table
> > wait one RCU grace period
> > disable_bh()
> > collect/sum all counters of "new and temporary" table (all cpus) and
> > reinject them into local cpu table (this cpu should not be interrupted)
> > enable_bh()
> > 
> > This way, "iptables -L" is not too expensive and doesnt block packet processing at all.
> 
> My thought would be:
> 
> o	acquire some sort of mutex.
> 
> o	switch counters to newly allocated (and zeroed) table (T1).
> 	The old table being switched out is T2.
> 
> o	wait one RCU grace period.
> 
> o	Sum T2 into a single global counter (G).
> 
> o	Free T2.
> 
> o	Copy G to a local variable.
> 
> o	release the mutex.
> 
> o	Return the value of the local variable.
> 
> Then you can repeat, allocating a new table again and using the new
> value of G.
> 
> Which may well be what you are saying above,
> 

I was using the current CPU counter as the global counter G.


> 							Thanx, Paul
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Rick Jones - Feb. 3, 2009, 9:27 p.m.
So, is this patch stream at a point where I should try running the 32-core 
netperf tests against it?  And if so, against which tree should I try applying 
patches?

rick jones
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paul E. McKenney - Feb. 3, 2009, 11:11 p.m.
On Tue, Feb 03, 2009 at 01:22:20PM -0800, Stephen Hemminger wrote:
> On Tue, 3 Feb 2009 13:10:00 -0800
> "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
> 
> > On Tue, Feb 03, 2009 at 09:20:04PM +0100, Eric Dumazet wrote:
> > > Paul E. McKenney a écrit :
> > > > On Tue, Feb 03, 2009 at 08:00:16PM +0100, Eric Dumazet wrote:
> > > >> Stephen Hemminger a écrit :
> > > >>> This is an alternative to earlier RCU/seqcount_t version of counters.
> > > >>> The counters operate as usual without locking, but when counters are rotated
> > > >>> around the CPU's entries.  RCU is used in two ways, first to handle the
> > > >>> counter rotation, second for replace.
> > > >> Is it a working patch or just a prototype ?
> > > >>
> > > >>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> > > >>>
> > > >>> ---
> > > >>>  include/linux/netfilter/x_tables.h |   10 +++-
> > > >>>  net/ipv4/netfilter/arp_tables.c    |   73 +++++++++++++++++++++++++++---------
> > > >>>  net/ipv4/netfilter/ip_tables.c     |   68 ++++++++++++++++++++++++---------
> > > >>>  net/ipv6/netfilter/ip6_tables.c    |   75 ++++++++++++++++++++++++++-----------
> > > >>>  net/netfilter/x_tables.c           |   43 +++++++++++++++------
> > > >>>  5 files changed, 197 insertions(+), 72 deletions(-)
> > > >>>
> > > >>> --- a/include/linux/netfilter/x_tables.h	2009-02-02 15:06:39.893751845 -0800
> > > >>> +++ b/include/linux/netfilter/x_tables.h	2009-02-02 15:28:10.022574005 -0800
> > > >>> @@ -353,7 +353,7 @@ struct xt_table
> > > >>>  	unsigned int valid_hooks;
> > > >>>  
> > > >>>  	/* Lock for the curtain */
> > > >>> -	rwlock_t lock;
> > > >>> +	struct mutex lock;
> > > >>>  
> > > >>>  	/* Man behind the curtain... */
> > > >>>  	struct xt_table_info *private;
> > > >>> @@ -383,9 +383,15 @@ struct xt_table_info
> > > >>>  	unsigned int hook_entry[NF_INET_NUMHOOKS];
> > > >>>  	unsigned int underflow[NF_INET_NUMHOOKS];
> > > >>>  
> > > >>> +	/* For the dustman... */
> > > >>> +	union {
> > > >>> +		struct rcu_head rcu;
> > > >>> +		struct work_struct work;
> > > >>> +	};
> > > >>> +
> > > >>>  	/* ipt_entry tables: one per CPU */
> > > >>>  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> > > >>> -	char *entries[1];
> > > >>> +	void *entries[1];
> > > >>>  };
> > > >>>  
> > > >>>  #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> > > >>> --- a/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:06:29.684249364 -0800
> > > >>> +++ b/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:14:13.256499021 -0800
> > > >>> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
> > > >>>  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
> > > >>>  	tgpar.hooknum = hook;
> > > >>>  
> > > >>> -	read_lock_bh(&table->lock);
> > > >>>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> > > >>> -	private = table->private;
> > > >>> -	table_base = (void *)private->entries[smp_processor_id()];
> > > >>> +
> > > >>> +	rcu_read_lock_bh();
> > > >>> +	private = rcu_dereference(table->private);
> > > >>> +	table_base = rcu_dereference(private->entries[smp_processor_id()]);
> > > >>> +
> > > >>>  	e = get_entry(table_base, private->hook_entry[hook]);
> > > >>>  
> > > >>>  	/* For return from builtin chain */
> > > >>> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
> > > >>>  		}
> > > >>>  	} while (!hotdrop);
> > > >>>  
> > > >>> -	read_unlock_bh(&table->lock);
> > > >>> +	rcu_read_unlock_bh();
> > > >>>  
> > > >>>  #ifdef DEBUG_ALLOW_ALL
> > > >>>  	return NF_ACCEPT;
> > > >>> @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
> > > >>>  	return 0;
> > > >>>  }
> > > >>>  
> > > >>> +static inline int
> > > >>> +set_counter_to_entry(struct ipt_entry *e,
> > > >>> +		     const struct ipt_counters total[],
> > > >>> +		     unsigned int *i)
> > > >>> +{
> > > >>> +	SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
> > > >>> +
> > > >>> +	(*i)++;
> > > >>> +	return 0;
> > > >>> +}
> > > >>> +
> > > >>> +
> > > >>>  static void
> > > >>> -get_counters(const struct xt_table_info *t,
> > > >>> +get_counters(struct xt_table_info *t,
> > > >>>  	     struct xt_counters counters[])
> > > >>>  {
> > > >>>  	unsigned int cpu;
> > > >>>  	unsigned int i;
> > > >>>  	unsigned int curcpu;
> > > >>> +	struct ipt_entry *e;
> > > >>>  
> > > >>> -	/* Instead of clearing (by a previous call to memset())
> > > >>> -	 * the counters and using adds, we set the counters
> > > >>> -	 * with data used by 'current' CPU
> > > >>> -	 * We dont care about preemption here.
> > > >>> -	 */
> > > >>> +	preempt_disable();
> > > >>>  	curcpu = raw_smp_processor_id();
> > > >>> -
> > > >>> +	e = t->entries[curcpu];
> > > >>>  	i = 0;
> > > >>> -	IPT_ENTRY_ITERATE(t->entries[curcpu],
> > > >>> +	IPT_ENTRY_ITERATE(e,
> > > >>>  			  t->size,
> > > >>>  			  set_entry_to_counter,
> > > >> Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
> > > >> set_entry_to_counter() might get garbage.
> > > >> I suppose I already mentioned it :)
> > > >>
> > > >>>  			  counters,
> > > >>>  			  &i);
> > > >>>  
> > > >>>  	for_each_possible_cpu(cpu) {
> > > >>> +		void *p;
> > > >>> +
> > > >>>  		if (cpu == curcpu)
> > > >>>  			continue;
> > > >>> +
> > > >>> +		/* Swizzle the values and wait */
> > > >>> +		e->counters = ((struct xt_counters) { 0, 0 });
> > > >> I dont see what you want to do here...
> > > >>
> > > >> e->counters is the counter associated with rule #0
> > > >>
> > > >>> +		p = t->entries[cpu];
> > > >>> +		rcu_assign_pointer(t->entries[cpu], e);
> > > >>> +		synchronize_net();
> > > >>
> > > >> Oh well, not this synchronize_net() :)
> > > >>
> > > >> This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
> > > >> on big machines (NR_CPUS >= 64)
> > > > 
> > > > Why would this not provide the moral equivalent of atomic sampling?
> > > > The code above switches to another counter set, and waits for a grace
> > > > period.  Shouldn't this mean that all CPUs that were incrementing the
> > > > old set of counters have finished doing so, so that the aggregate count
> > > > covers all CPUs that started their increments before the pointer switch?
> > > > Same as acquiring a write lock, which would wait for all CPUs that
> > > > started their increments before starting the write-lock acquisition.
> > > > CPUs that started their increments after starting the write acquisition
> > > > would not be accounted for in the total, same as the RCU approach.
> > > > 
> > > > Steve's approach does delay reading out the counters, but it avoids
> > > > delaying any CPU trying to increment the counters.
> > > 
> > > I see your point, but this is not what Stephen implemented.
> > > 
> > > So.. CPU will increments which counters, if not delayed ?
> > 
> > The new set installed by the rcu_assign_pointer().
> > 
> > > How counters will be synced again after our 'iptables -L' finished ?
> > 
> > The usual approach would be to have three sets of counters, one currently
> > being incremented, one just removed from service, and the last one holding
> > the cumulative value.  After a synchronize_net() following removing
> > a set from service, you add in the values in the previous set removed
> > from service.  Then you keep the new set for the next 'iptables -L'.
> > 
> > > "iptable -L" is not supposed to miss some counters updates (only some packets
> > >  might be droped at NIC level because we spend time in the collection).
> > > If packets matches some rules, we really want up2date counters.
> > 
> > No counter updates would be lost using the above method.
> > 
> > > Maybe we need for this collection an extra "cpu", to collect 
> > > all increments that were done when CPUs where directed to a 
> > > "secondary table/counters"
> > 
> > It should be easier to maintain a third set of counters that hold the
> > accumulated counts from the earlier instances of 'iptables -L'.
> > 
> > > > So what am I missing here?
> > > 
> > > Well, I saw one synchronize_net() inside the for_each_possible_cpu(cpu) loop.
> > > Say we have NR_CPUS=4096, how long will it takes to perform "iptables -L" ?
> > 
> > Good point, the for_each_possible_cpu() was cut out -- I should have
> > gone back and looked at the original patch.
> > 
> > Seems like it should be possible to do a single synchronize_net()
> > after swizzling all the counters...
> > 
> > > General/intuitive idea would be :
> > > 
> > > switch pointers to a newly allocated table (and zeroed counters)
> > > wait one RCU grace period
> > > collect/sum all counters of "old" table + (all cpus) into user provided table
> > > restore previous table
> > > wait one RCU grace period
> > > disable_bh()
> > > collect/sum all counters of "new and temporary" table (all cpus) and
> > > reinject them into local cpu table (this cpu should not be interrupted)
> > > enable_bh()
> > > 
> > > This way, "iptables -L" is not too expensive and doesnt block packet processing at all.
> > 
> > My thought would be:
> > 
> > o	acquire some sort of mutex.
> > 
> > o	switch counters to newly allocated (and zeroed) table (T1).
> > 	The old table being switched out is T2.
> > 
> > o	wait one RCU grace period.
> > 
> > o	Sum T2 into a single global counter (G).
> > 
> > o	Free T2.
> > 
> > o	Copy G to a local variable.
> > 
> > o	release the mutex.
> > 
> > o	Return the value of the local variable.
> > 
> > Then you can repeat, allocating a new table again and using the new
> > value of G.
> > 
> > Which may well be what you are saying above,
> > 
> 
> I was using the current CPU counter as the global counter G.

Sounds good, then!  ;-)

						Thanx, Paul
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch

--- a/include/linux/netfilter/x_tables.h	2009-02-02 15:06:39.893751845 -0800
+++ b/include/linux/netfilter/x_tables.h	2009-02-02 15:28:10.022574005 -0800
@@ -353,7 +353,7 @@  struct xt_table
 	unsigned int valid_hooks;
 
 	/* Lock for the curtain */
-	rwlock_t lock;
+	struct mutex lock;
 
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
@@ -383,9 +383,15 @@  struct xt_table_info
 	unsigned int hook_entry[NF_INET_NUMHOOKS];
 	unsigned int underflow[NF_INET_NUMHOOKS];
 
+	/* For the dustman... */
+	union {
+		struct rcu_head rcu;
+		struct work_struct work;
+	};
+
 	/* ipt_entry tables: one per CPU */
 	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
-	char *entries[1];
+	void *entries[1];
 };
 
 #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
--- a/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:06:29.684249364 -0800
+++ b/net/ipv4/netfilter/ip_tables.c	2009-02-02 15:14:13.256499021 -0800
@@ -347,10 +347,12 @@  ipt_do_table(struct sk_buff *skb,
 	mtpar.family  = tgpar.family = NFPROTO_IPV4;
 	tgpar.hooknum = hook;
 
-	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+
+	rcu_read_lock_bh();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	/* For return from builtin chain */
@@ -445,7 +447,7 @@  ipt_do_table(struct sk_buff *skb,
 		}
 	} while (!hotdrop);
 
-	read_unlock_bh(&table->lock);
+	rcu_read_unlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -892,45 +894,73 @@  set_entry_to_counter(const struct ipt_en
 	return 0;
 }
 
+static inline int
+set_counter_to_entry(struct ipt_entry *e,
+		     const struct ipt_counters total[],
+		     unsigned int *i)
+{
+	SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+
 static void
-get_counters(const struct xt_table_info *t,
+get_counters(struct xt_table_info *t,
 	     struct xt_counters counters[])
 {
 	unsigned int cpu;
 	unsigned int i;
 	unsigned int curcpu;
+	struct ipt_entry *e;
 
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
-	 */
+	preempt_disable();
 	curcpu = raw_smp_processor_id();
-
+	e = t->entries[curcpu];
 	i = 0;
-	IPT_ENTRY_ITERATE(t->entries[curcpu],
+	IPT_ENTRY_ITERATE(e,
 			  t->size,
 			  set_entry_to_counter,
 			  counters,
 			  &i);
 
 	for_each_possible_cpu(cpu) {
+		void *p;
+
 		if (cpu == curcpu)
 			continue;
+
+		/* Swizzle the values and wait */
+		e->counters = ((struct xt_counters) { 0, 0 });
+		p = t->entries[cpu];
+		rcu_assign_pointer(t->entries[cpu], e);
+		synchronize_net();
+		e = p;
+
 		i = 0;
-		IPT_ENTRY_ITERATE(t->entries[cpu],
+		IPT_ENTRY_ITERATE(e,
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
 	}
+	i = 0;
+	t->entries[curcpu] = e;
+	IPT_ENTRY_ITERATE(e,
+			  t->size,
+			  set_counter_to_entry,
+			  counters,
+			  &i);
+
+	preempt_enable();
 }
 
 static struct xt_counters * alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -942,9 +972,9 @@  static struct xt_counters * alloc_counte
 		return ERR_PTR(-ENOMEM);
 
 	/* First, sum counters... */
-	write_lock_bh(&table->lock);
+	mutex_lock(&table->lock);
 	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	mutex_unlock(&table->lock);
 
 	return counters;
 }
@@ -1393,13 +1423,14 @@  do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
+	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1408,8 +1439,9 @@  do_add_counters(struct net *net, void __
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
+	preempt_enable();
  unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/netfilter/x_tables.c	2009-02-02 15:06:29.708249745 -0800
+++ b/net/netfilter/x_tables.c	2009-02-02 15:30:33.718573969 -0800
@@ -611,18 +611,36 @@  struct xt_table_info *xt_alloc_table_inf
 }
 EXPORT_SYMBOL(xt_alloc_table_info);
 
-void xt_free_table_info(struct xt_table_info *info)
+/* callback to do free for vmalloc'd case */
+static void xt_free_table_info_work(struct work_struct *arg)
 {
 	int cpu;
+	struct xt_table_info *info = container_of(arg, struct xt_table_info, work);
 
-	for_each_possible_cpu(cpu) {
-		if (info->size <= PAGE_SIZE)
-			kfree(info->entries[cpu]);
-		else
-			vfree(info->entries[cpu]);
-	}
+	for_each_possible_cpu(cpu)
+		vfree(info->entries[cpu]);
 	kfree(info);
 }
+
+static void xt_free_table_info_rcu(struct rcu_head *arg)
+{
+	struct xt_table_info *info = container_of(arg, struct xt_table_info, rcu);
+	if (info->size <= PAGE_SIZE) {
+		unsigned int cpu;
+		for_each_possible_cpu(cpu)
+			kfree(info->entries[cpu]);
+		kfree(info);
+	} else {
+		/* can't safely call vfree in current context */
+		INIT_WORK(&info->work, xt_free_table_info_work);
+		schedule_work(&info->work);
+	}
+}
+
+void xt_free_table_info(struct xt_table_info *info)
+{
+	call_rcu(&info->rcu, xt_free_table_info_rcu);
+}
 EXPORT_SYMBOL(xt_free_table_info);
 
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
@@ -671,20 +689,20 @@  xt_replace_table(struct xt_table *table,
 	struct xt_table_info *oldinfo, *private;
 
 	/* Do the substitution. */
-	write_lock_bh(&table->lock);
+	mutex_lock(&table->lock);
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		write_unlock_bh(&table->lock);
+		mutex_unlock(&table->lock);
 		*error = -EAGAIN;
 		return NULL;
 	}
 	oldinfo = private;
-	table->private = newinfo;
+	rcu_assign_pointer(table->private, newinfo);
 	newinfo->initial_entries = oldinfo->initial_entries;
-	write_unlock_bh(&table->lock);
+	mutex_unlock(&table->lock);
 
 	return oldinfo;
 }
@@ -719,7 +737,8 @@  struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	rwlock_init(&table->lock);
+	mutex_init(&table->lock);
+
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
 
--- a/net/ipv4/netfilter/arp_tables.c	2009-02-02 15:06:29.696250564 -0800
+++ b/net/ipv4/netfilter/arp_tables.c	2009-02-02 15:14:45.969206521 -0800
@@ -237,9 +237,10 @@  unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	read_lock_bh(&table->lock);
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+	rcu_read_lock_bh();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
 
@@ -311,7 +312,8 @@  unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-	read_unlock_bh(&table->lock);
+
+	rcu_read_unlock_bh();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -681,44 +683,77 @@  static inline int set_entry_to_counter(c
 	return 0;
 }
 
-static void get_counters(const struct xt_table_info *t,
-			 struct xt_counters counters[])
+
+static inline int
+set_counter_to_entry(struct arpt_entry *e,
+		     const struct xt_counters total[],
+		     unsigned int *i)
+{
+	SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
+
+static void
+get_counters(struct xt_table_info *t,
+	     struct xt_counters counters[])
 {
 	unsigned int cpu;
 	unsigned int i;
 	unsigned int curcpu;
+	struct arpt_entry *e;
 
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
-	 */
+	preempt_disable();
 	curcpu = raw_smp_processor_id();
 
 	i = 0;
-	ARPT_ENTRY_ITERATE(t->entries[curcpu],
+	e = t->entries[curcpu];
+	ARPT_ENTRY_ITERATE(e,
 			   t->size,
 			   set_entry_to_counter,
 			   counters,
 			   &i);
 
 	for_each_possible_cpu(cpu) {
+		void *p;
+
 		if (cpu == curcpu)
 			continue;
+
+		/* Swizzle the values and wait */
+		e->counters.bcnt = 0;
+		e->counters.pcnt = 0;
+		p = t->entries[cpu];
+		rcu_assign_pointer(t->entries[cpu], e);
+		synchronize_net();
+		e = p;
+
 		i = 0;
-		ARPT_ENTRY_ITERATE(t->entries[cpu],
+		ARPT_ENTRY_ITERATE(e,
 				   t->size,
 				   add_entry_to_counter,
 				   counters,
 				   &i);
 	}
+
+	i = 0;
+	t->entries[curcpu] = e;
+	ARPT_ENTRY_ITERATE(e,
+			  t->size,
+			  set_counter_to_entry,
+			  counters,
+			  &i);
+
+	preempt_enable();
 }
 
 static inline struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	 * (other than comefrom, which userspace doesn't care
@@ -731,9 +766,9 @@  static inline struct xt_counters *alloc_
 		return ERR_PTR(-ENOMEM);
 
 	/* First, sum counters... */
-	write_lock_bh(&table->lock);
+	mutex_lock(&table->lock);
 	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	mutex_unlock(&table->lock);
 
 	return counters;
 }
@@ -1148,13 +1183,14 @@  static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
+	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[smp_processor_id()];
@@ -1164,7 +1200,8 @@  static int do_add_counters(struct net *n
 			   paddc,
 			   &i);
  unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
+
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/ipv6/netfilter/ip6_tables.c	2009-02-02 15:06:29.724250485 -0800
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-02-02 15:15:05.352498160 -0800
@@ -373,10 +373,12 @@  ip6t_do_table(struct sk_buff *skb,
 	mtpar.family  = tgpar.family = NFPROTO_IPV6;
 	tgpar.hooknum = hook;
 
-	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	private = table->private;
-	table_base = (void *)private->entries[smp_processor_id()];
+
+	rcu_read_lock_bh();
+	private = rcu_dereference(table->private);
+	table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	/* For return from builtin chain */
@@ -474,7 +476,7 @@  ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	read_unlock_bh(&table->lock);
+	rcu_read_unlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -921,45 +923,72 @@  set_entry_to_counter(const struct ip6t_e
 	return 0;
 }
 
+static inline int
+set_counter_to_entry(struct ip6t_entry *e,
+		     const struct ip6t_counters total[],
+		     unsigned int *i)
+{
+	SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
+
+	(*i)++;
+	return 0;
+}
+
 static void
-get_counters(const struct xt_table_info *t,
+get_counters(struct xt_table_info *t,
 	     struct xt_counters counters[])
 {
 	unsigned int cpu;
 	unsigned int i;
 	unsigned int curcpu;
+	struct ip6t_entry *e;
 
-	/* Instead of clearing (by a previous call to memset())
-	 * the counters and using adds, we set the counters
-	 * with data used by 'current' CPU
-	 * We dont care about preemption here.
-	 */
+	preempt_disable();
 	curcpu = raw_smp_processor_id();
-
+	e = t->entries[curcpu];
 	i = 0;
-	IP6T_ENTRY_ITERATE(t->entries[curcpu],
-			   t->size,
-			   set_entry_to_counter,
-			   counters,
-			   &i);
+	IP6T_ENTRY_ITERATE(e,
+			  t->size,
+			  set_entry_to_counter,
+			  counters,
+			  &i);
 
 	for_each_possible_cpu(cpu) {
+		void *p;
+
 		if (cpu == curcpu)
 			continue;
+
+		/* Swizzle the values and wait */
+		e->counters = ((struct xt_counters) { 0, 0 });
+		p = t->entries[cpu];
+		rcu_assign_pointer(t->entries[cpu], e);
+		synchronize_net();
+		e = p;
+
 		i = 0;
-		IP6T_ENTRY_ITERATE(t->entries[cpu],
+		IP6T_ENTRY_ITERATE(e,
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
 	}
+	i = 0;
+	t->entries[curcpu] = e;
+	IP6T_ENTRY_ITERATE(e,
+			  t->size,
+			  set_counter_to_entry,
+			  counters,
+			  &i);
+
+	preempt_enable();
 }
 
 static struct xt_counters *alloc_counters(struct xt_table *table)
 {
 	unsigned int countersize;
 	struct xt_counters *counters;
-	const struct xt_table_info *private = table->private;
+	struct xt_table_info *private = table->private;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -971,9 +1000,9 @@  static struct xt_counters *alloc_counter
 		return ERR_PTR(-ENOMEM);
 
 	/* First, sum counters... */
-	write_lock_bh(&table->lock);
+	mutex_lock(&table->lock);
 	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	mutex_unlock(&table->lock);
 
 	return counters;
 }
@@ -1424,13 +1453,14 @@  do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
+	mutex_lock(&t->lock);
 	private = t->private;
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
 	}
 
+	preempt_disable();
 	i = 0;
 	/* Choose the copy that is on our node */
 	loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1439,8 +1469,9 @@  do_add_counters(struct net *net, void __
 			  add_counter_to_entry,
 			  paddc,
 			  &i);
+	preempt_enable();
  unlock_up_free:
-	write_unlock_bh(&t->lock);
+	mutex_unlock(&t->lock);
 	xt_table_unlock(t);
 	module_put(t->me);
  free: