Message ID | 20090421143927.52d7d89d@nehalam
---|---
State | RFC, archived
Delegated to | David Miller
On Tue, Apr 21, 2009 at 02:39:27PM -0700, Stephen Hemminger wrote: > This version of x_tables (ip/ip6/arp) locking uses a per-cpu > recursive lock that can be nested. > > The idea for this came from an earlier version done by Eric Dumazet. > Locking is done per-cpu, the fast path locks on the current cpu > and updates counters. This reduces the contention of a > single reader lock (in 2.6.29) without the delay of synchronize_net() > (in 2.6.30-rc2). > > The mutex that was added for 2.6.30 in xt_table is unnecessary since > there already is a mutex for xt[af].mutex that is held. Looks good from a concurrency viewpoint! Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com > > --- > CHANGES > - reader and write now inline > - only acquire one cpu write lock at a time This is very good -- gets rid of problems with preemption nesting depth limitations and lockdep limitations. In addition, it gets rid of the period of time during which all packet processing might otherwise be blocked. Not a big deal on a small machine, but could be a real problem on a large one. Very cool!!! > - write lock pushed down into get_counters > > include/linux/netfilter/x_tables.h | 50 +++++++++++++-- > net/ipv4/netfilter/arp_tables.c | 121 ++++++++++--------------------------- > net/ipv4/netfilter/ip_tables.c | 120 +++++++++--------------------------- > net/ipv6/netfilter/ip6_tables.c | 120 ++++++++++-------------------------- > net/netfilter/x_tables.c | 55 +++++++++------- > 5 files changed, 174 insertions(+), 292 deletions(-) > > --- a/include/linux/netfilter/x_tables.h 2009-04-21 07:57:06.668582345 -0700 > +++ b/include/linux/netfilter/x_tables.h 2009-04-21 14:24:03.295299154 -0700 > @@ -354,9 +354,6 @@ struct xt_table > /* What hooks you will enter on */ > unsigned int valid_hooks; > > - /* Lock for the curtain */ > - struct mutex lock; > - > /* Man behind the curtain... */ > struct xt_table_info *private; > > @@ -434,8 +431,51 @@ extern void xt_proto_fini(struct net *ne > > extern struct xt_table_info *xt_alloc_table_info(unsigned int size); > extern void xt_free_table_info(struct xt_table_info *info); > -extern void xt_table_entry_swap_rcu(struct xt_table_info *old, > - struct xt_table_info *new); > + > + > +/* > + * Per-CPU read/write lock. This makes reader locking fast since > + * there is no shared variable to cause cache ping-pong; but adds an > + * additional write-side penalty since write must iterate over all > + * possible CPU's. Readers read lock their per-cpu lock, and writers > + * write lock on all CPU's. > + * > + * Read lock is used by ip/arp/ip6 tables rule processing which runs per-cpu. > + * It needs to ensure that the rules are not being changed while packet > + * is being processed. > + * > + * Write lock is used in two cases: > + * 1. reading counter values > + * all readers need to be stopped and the per-CPU values are summed. > + * > + * 2. replacing rules > + * all packets in flight have to be processed before rules are swapped, > + * then counters are read from the old (stale) info. > + * > + */ > +DECLARE_PER_CPU(rwlock_t, xt_info_locks); > + > +static inline void xt_info_rdlock_bh(void) > +{ > + local_bh_disable(); > + read_lock(&__get_cpu_var(xt_info_locks)); Good, you do indeed need to prevent migration before you acquire this CPU's lock. Otherwise, you could have more than one CPU attempting to update the same counter. 
> +} > + > +static inline void xt_info_rdunlock_bh(void) > +{ > + read_unlock_bh(&__get_cpu_var(xt_info_locks)); > +} > + > +static inline void xt_info_wrlock(unsigned int cpu) > +{ > + write_lock(&per_cpu(xt_info_locks, cpu)); > +} > + > +static inline void xt_info_wrunlock(unsigned int cpu) > +{ > + > + write_unlock(&per_cpu(xt_info_locks, cpu)); > +} > > /* > * This helper is performance critical and must be inlined > --- a/net/ipv4/netfilter/ip_tables.c 2009-04-21 07:57:06.649549203 -0700 > +++ b/net/ipv4/netfilter/ip_tables.c 2009-04-21 14:33:29.842423044 -0700 > @@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb, > tgpar.hooknum = hook; > > IP_NF_ASSERT(table->valid_hooks & (1 << hook)); > - > - rcu_read_lock_bh(); > - private = rcu_dereference(table->private); > - table_base = rcu_dereference(private->entries[smp_processor_id()]); > + xt_info_rdlock_bh(); > + private = table->private; > + table_base = private->entries[smp_processor_id()]; > > e = get_entry(table_base, private->hook_entry[hook]); > > @@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb, > e = (void *)e + e->next_offset; > } > } while (!hotdrop); > - > - rcu_read_unlock_bh(); > + xt_info_rdunlock_bh(); > > #ifdef DEBUG_ALLOW_ALL > return NF_ACCEPT; > @@ -897,89 +895,39 @@ get_counters(const struct xt_table_info > /* Instead of clearing (by a previous call to memset()) > * the counters and using adds, we set the counters > * with data used by 'current' CPU > - * We dont care about preemption here. Good. Of course, the reason we care about preemption here is that otherwise some other task could mess up this CPU's counters. Bad for real-time response, but then again, what the heck are you doing updating netfilter rules while a system is running a real-time workload? > */ > - curcpu = raw_smp_processor_id(); > + local_bh_disable(); > + curcpu = smp_processor_id(); > > i = 0; > + xt_info_wrlock(curcpu); > IPT_ENTRY_ITERATE(t->entries[curcpu], > t->size, > set_entry_to_counter, > counters, > &i); > + xt_info_wrunlock(curcpu); > > for_each_possible_cpu(cpu) { > if (cpu == curcpu) > continue; > i = 0; > + xt_info_wrlock(cpu); > IPT_ENTRY_ITERATE(t->entries[cpu], > t->size, > add_entry_to_counter, > counters, > &i); > + xt_info_wrunlock(cpu); > } > - > -} > - > -/* We're lazy, and add to the first CPU; overflow works its fey magic > - * and everything is OK. 
*/ > -static int > -add_counter_to_entry(struct ipt_entry *e, > - const struct xt_counters addme[], > - unsigned int *i) > -{ > - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); > - > - (*i)++; > - return 0; > -} > - > -/* Take values from counters and add them back onto the current cpu */ > -static void put_counters(struct xt_table_info *t, > - const struct xt_counters counters[]) > -{ > - unsigned int i, cpu; > - > - local_bh_disable(); > - cpu = smp_processor_id(); > - i = 0; > - IPT_ENTRY_ITERATE(t->entries[cpu], > - t->size, > - add_counter_to_entry, > - counters, > - &i); > local_bh_enable(); > } > > - > -static inline int > -zero_entry_counter(struct ipt_entry *e, void *arg) > -{ > - e->counters.bcnt = 0; > - e->counters.pcnt = 0; > - return 0; > -} > - > -static void > -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) > -{ > - unsigned int cpu; > - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; > - > - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); > - for_each_possible_cpu(cpu) { > - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); > - IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, > - zero_entry_counter, NULL); > - } > -} > - > static struct xt_counters * alloc_counters(struct xt_table *table) > { > unsigned int countersize; > struct xt_counters *counters; > struct xt_table_info *private = table->private; > - struct xt_table_info *info; > > /* We need atomic snapshot of counters: rest doesn't change > (other than comefrom, which userspace doesn't care > @@ -988,30 +936,11 @@ static struct xt_counters * alloc_counte > counters = vmalloc_node(countersize, numa_node_id()); > > if (counters == NULL) > - goto nomem; > + return ERR_PTR(-ENOMEM); > > - info = xt_alloc_table_info(private->size); > - if (!info) > - goto free_counters; > - > - clone_counters(info, private); > - > - mutex_lock(&table->lock); > - xt_table_entry_swap_rcu(private, info); > - synchronize_net(); /* Wait until smoke has cleared */ > - > - get_counters(info, counters); > - put_counters(private, counters); > - mutex_unlock(&table->lock); > - > - xt_free_table_info(info); > + get_counters(private, counters); > > return counters; > - > - free_counters: > - vfree(counters); > - nomem: > - return ERR_PTR(-ENOMEM); > } > > static int > @@ -1377,11 +1306,23 @@ do_replace(struct net *net, void __user > return ret; > } > > +/* We're lazy, and add to the first CPU; overflow works its fey magic > + * and everything is OK. 
*/ > +static int > +add_counter_to_entry(struct ipt_entry *e, > + const struct xt_counters addme[], > + unsigned int *i) > +{ > + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); > + > + (*i)++; > + return 0; > +} > > static int > do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) > { > - unsigned int i; > + unsigned int i, curcpu; > struct xt_counters_info tmp; > struct xt_counters *paddc; > unsigned int num_counters; > @@ -1437,25 +1378,26 @@ do_add_counters(struct net *net, void __ > goto free; > } > > - mutex_lock(&t->lock); > + local_bh_disable(); > private = t->private; > if (private->number != num_counters) { > ret = -EINVAL; > goto unlock_up_free; > } > > - preempt_disable(); > i = 0; > /* Choose the copy that is on our node */ > - loc_cpu_entry = private->entries[raw_smp_processor_id()]; > + curcpu = smp_processor_id(); > + loc_cpu_entry = private->entries[curcpu]; > + xt_info_wrlock(curcpu); > IPT_ENTRY_ITERATE(loc_cpu_entry, > private->size, > add_counter_to_entry, > paddc, > &i); > - preempt_enable(); > + xt_info_wrunlock(curcpu); > unlock_up_free: > - mutex_unlock(&t->lock); > + local_bh_enable(); > xt_table_unlock(t); > module_put(t->me); > free: > --- a/net/netfilter/x_tables.c 2009-04-21 07:57:06.605264365 -0700 > +++ b/net/netfilter/x_tables.c 2009-04-21 14:33:50.177486967 -0700 > @@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_ > } > EXPORT_SYMBOL(xt_free_table_info); > > -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo, > - struct xt_table_info *newinfo) > -{ > - unsigned int cpu; > - > - for_each_possible_cpu(cpu) { > - void *p = oldinfo->entries[cpu]; > - rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]); > - newinfo->entries[cpu] = p; > - } > - > -} > -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu); > - > /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ > struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, > const char *name) > @@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af) > EXPORT_SYMBOL_GPL(xt_compat_unlock); > #endif > > +DEFINE_PER_CPU(rwlock_t, xt_info_locks); > +EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks); Hmmm... Not sure I want to know what module uses this. Ah, that is right -- a bunch of the comms stack can be built as a module. > + > + > struct xt_table_info * > xt_replace_table(struct xt_table *table, > unsigned int num_counters, > struct xt_table_info *newinfo, > int *error) > { > - struct xt_table_info *oldinfo, *private; > + unsigned int i; > + struct xt_table_info *private; > > /* Do the substitution. */ > - mutex_lock(&table->lock); > + local_bh_disable(); Good, disabling preemption prevents other readers from messing with this CPU's local state. > private = table->private; > /* Check inside lock: is the old number correct? 
*/ > if (num_counters != private->number) { > duprintf("num_counters != table->private->number (%u/%u)\n", > num_counters, private->number); > - mutex_unlock(&table->lock); > + local_bh_enable(); > *error = -EAGAIN; > return NULL; > } > - oldinfo = private; > - rcu_assign_pointer(table->private, newinfo); > - newinfo->initial_entries = oldinfo->initial_entries; > - mutex_unlock(&table->lock); > > - synchronize_net(); > - return oldinfo; > + table->private = newinfo; > + newinfo->initial_entries = private->initial_entries; > + > + /* wait for each other cpu to see new table */ > + for_each_possible_cpu(i) > + if (i != smp_processor_id()) { > + xt_info_wrlock(i); > + xt_info_wrunlock(i); > + } And the above loop acts sort of like a lock-based synchronize_rcu(). > + local_bh_enable(); > + > + return private; > } > EXPORT_SYMBOL_GPL(xt_replace_table); > > @@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struc > > /* Simplifies replace_table code. */ > table->private = bootstrap; > - mutex_init(&table->lock); > > if (!xt_replace_table(table, 0, newinfo, &ret)) > goto unlock; > @@ -1147,7 +1143,16 @@ static struct pernet_operations xt_net_o > > static int __init xt_init(void) > { > - int i, rv; > + unsigned int i; > + int rv; > + static struct lock_class_key xt_lock_key[NR_CPUS]; > + > + for_each_possible_cpu(i) { > + rwlock_t *lock = &per_cpu(xt_info_locks, i); > + > + rwlock_init(lock); > + lockdep_set_class(lock, xt_lock_key+i); > + } > > xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); > if (!xt) > --- a/net/ipv6/netfilter/ip6_tables.c 2009-04-21 07:57:06.621260619 -0700 > +++ b/net/ipv6/netfilter/ip6_tables.c 2009-04-21 14:29:57.312236004 -0700 > @@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb, > > IP_NF_ASSERT(table->valid_hooks & (1 << hook)); > > - rcu_read_lock_bh(); > - private = rcu_dereference(table->private); > - table_base = rcu_dereference(private->entries[smp_processor_id()]); > + xt_info_rdlock_bh(); > + private = table->private; > + table_base = private->entries[smp_processor_id()]; > > e = get_entry(table_base, private->hook_entry[hook]); > > @@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb, > #ifdef CONFIG_NETFILTER_DEBUG > ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; > #endif > - rcu_read_unlock_bh(); > + xt_info_rdunlock_bh(); > > #ifdef DEBUG_ALLOW_ALL > return NF_ACCEPT; > @@ -926,87 +926,40 @@ get_counters(const struct xt_table_info > /* Instead of clearing (by a previous call to memset()) > * the counters and using adds, we set the counters > * with data used by 'current' CPU > - * We dont care about preemption here. > */ > - curcpu = raw_smp_processor_id(); > + local_bh_disable(); > + curcpu = smp_processor_id(); Same story, but for IPv6. > i = 0; > + xt_info_wrlock(curcpu); > IP6T_ENTRY_ITERATE(t->entries[curcpu], > t->size, > set_entry_to_counter, > counters, > &i); > + xt_info_wrunlock(curcpu); > + > > for_each_possible_cpu(cpu) { > if (cpu == curcpu) > continue; > i = 0; > + xt_info_wrlock(cpu); > IP6T_ENTRY_ITERATE(t->entries[cpu], > t->size, > add_entry_to_counter, > counters, > &i); > + xt_info_wrunlock(cpu); > } > -} > - > -/* We're lazy, and add to the first CPU; overflow works its fey magic > - * and everything is OK. 
*/ > -static int > -add_counter_to_entry(struct ip6t_entry *e, > - const struct xt_counters addme[], > - unsigned int *i) > -{ > - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); > - > - (*i)++; > - return 0; > -} > - > -/* Take values from counters and add them back onto the current cpu */ > -static void put_counters(struct xt_table_info *t, > - const struct xt_counters counters[]) > -{ > - unsigned int i, cpu; > - > - local_bh_disable(); > - cpu = smp_processor_id(); > - i = 0; > - IP6T_ENTRY_ITERATE(t->entries[cpu], > - t->size, > - add_counter_to_entry, > - counters, > - &i); > local_bh_enable(); > } > > -static inline int > -zero_entry_counter(struct ip6t_entry *e, void *arg) > -{ > - e->counters.bcnt = 0; > - e->counters.pcnt = 0; > - return 0; > -} > - > -static void > -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) > -{ > - unsigned int cpu; > - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; > - > - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); > - for_each_possible_cpu(cpu) { > - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); > - IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, > - zero_entry_counter, NULL); > - } > -} > - > static struct xt_counters *alloc_counters(struct xt_table *table) > { > unsigned int countersize; > struct xt_counters *counters; > struct xt_table_info *private = table->private; > - struct xt_table_info *info; > > /* We need atomic snapshot of counters: rest doesn't change > (other than comefrom, which userspace doesn't care > @@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counter > counters = vmalloc_node(countersize, numa_node_id()); > > if (counters == NULL) > - goto nomem; > - > - info = xt_alloc_table_info(private->size); > - if (!info) > - goto free_counters; > - > - clone_counters(info, private); > + return ERR_PTR(-ENOMEM); > > - mutex_lock(&table->lock); > - xt_table_entry_swap_rcu(private, info); > - synchronize_net(); /* Wait until smoke has cleared */ > - > - get_counters(info, counters); > - put_counters(private, counters); > - mutex_unlock(&table->lock); > - > - xt_free_table_info(info); > + get_counters(private, counters); > > return counters; > - > - free_counters: > - vfree(counters); > - nomem: > - return ERR_PTR(-ENOMEM); > } > > static int > @@ -1405,11 +1339,24 @@ do_replace(struct net *net, void __user > return ret; > } > > +/* We're lazy, and add to the first CPU; overflow works its fey magic > + * and everything is OK. 
*/ > +static int > +add_counter_to_entry(struct ip6t_entry *e, > + const struct xt_counters addme[], > + unsigned int *i) > +{ > + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); > + > + (*i)++; > + return 0; > +} > + > static int > do_add_counters(struct net *net, void __user *user, unsigned int len, > int compat) > { > - unsigned int i; > + unsigned int i, curcpu; > struct xt_counters_info tmp; > struct xt_counters *paddc; > unsigned int num_counters; > @@ -1465,25 +1412,28 @@ do_add_counters(struct net *net, void __ > goto free; > } > > - mutex_lock(&t->lock); > + > + local_bh_disable(); > private = t->private; > if (private->number != num_counters) { > ret = -EINVAL; > goto unlock_up_free; > } > > - preempt_disable(); > i = 0; > /* Choose the copy that is on our node */ > - loc_cpu_entry = private->entries[raw_smp_processor_id()]; > + curcpu = smp_processor_id(); > + xt_info_wrlock(curcpu); > + loc_cpu_entry = private->entries[curcpu]; > IP6T_ENTRY_ITERATE(loc_cpu_entry, > private->size, > add_counter_to_entry, > paddc, > &i); > - preempt_enable(); > + xt_info_wrunlock(curcpu); > + > unlock_up_free: > - mutex_unlock(&t->lock); > + local_bh_enable(); > xt_table_unlock(t); > module_put(t->me); > free: > --- a/net/ipv4/netfilter/arp_tables.c 2009-04-21 07:57:06.633261308 -0700 > +++ b/net/ipv4/netfilter/arp_tables.c 2009-04-21 14:34:39.026255677 -0700 > @@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buf > indev = in ? in->name : nulldevname; > outdev = out ? out->name : nulldevname; > > - rcu_read_lock_bh(); > - private = rcu_dereference(table->private); > - table_base = rcu_dereference(private->entries[smp_processor_id()]); > + xt_info_rdlock_bh(); > + private = table->private; > + table_base = private->entries[smp_processor_id()]; > > e = get_entry(table_base, private->hook_entry[hook]); > back = get_entry(table_base, private->underflow[hook]); > @@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buf > > hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + > (2 * skb->dev->addr_len); > + > ADD_COUNTER(e->counters, hdr_len, 1); > > t = arpt_get_target(e); > @@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buf > e = (void *)e + e->next_offset; > } > } while (!hotdrop); > - > - rcu_read_unlock_bh(); > + xt_info_rdunlock_bh(); > > if (hotdrop) > return NF_DROP; > @@ -711,88 +711,39 @@ static void get_counters(const struct xt > /* Instead of clearing (by a previous call to memset()) > * the counters and using adds, we set the counters > * with data used by 'current' CPU > - * We dont care about preemption here. > */ > - curcpu = raw_smp_processor_id(); > + local_bh_disable(); Same story, but for ARP. > + curcpu = smp_processor_id(); > > i = 0; > + xt_info_wrlock(curcpu); > ARPT_ENTRY_ITERATE(t->entries[curcpu], > t->size, > set_entry_to_counter, > counters, > &i); > + xt_info_wrunlock(curcpu); > > for_each_possible_cpu(cpu) { > if (cpu == curcpu) > continue; > i = 0; > + xt_info_wrlock(cpu); > ARPT_ENTRY_ITERATE(t->entries[cpu], > t->size, > add_entry_to_counter, > counters, > &i); > + xt_info_wrunlock(cpu); > } > -} > - > - > -/* We're lazy, and add to the first CPU; overflow works its fey magic > - * and everything is OK. 
*/ > -static int > -add_counter_to_entry(struct arpt_entry *e, > - const struct xt_counters addme[], > - unsigned int *i) > -{ > - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); > - > - (*i)++; > - return 0; > -} > - > -/* Take values from counters and add them back onto the current cpu */ > -static void put_counters(struct xt_table_info *t, > - const struct xt_counters counters[]) > -{ > - unsigned int i, cpu; > - > - local_bh_disable(); > - cpu = smp_processor_id(); > - i = 0; > - ARPT_ENTRY_ITERATE(t->entries[cpu], > - t->size, > - add_counter_to_entry, > - counters, > - &i); > local_bh_enable(); > } > > -static inline int > -zero_entry_counter(struct arpt_entry *e, void *arg) > -{ > - e->counters.bcnt = 0; > - e->counters.pcnt = 0; > - return 0; > -} > - > -static void > -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) > -{ > - unsigned int cpu; > - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; > - > - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); > - for_each_possible_cpu(cpu) { > - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); > - ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, > - zero_entry_counter, NULL); > - } > -} > - > static struct xt_counters *alloc_counters(struct xt_table *table) > { > unsigned int countersize; > struct xt_counters *counters; > struct xt_table_info *private = table->private; > - struct xt_table_info *info; > > /* We need atomic snapshot of counters: rest doesn't change > * (other than comefrom, which userspace doesn't care > @@ -802,30 +753,11 @@ static struct xt_counters *alloc_counter > counters = vmalloc_node(countersize, numa_node_id()); > > if (counters == NULL) > - goto nomem; > - > - info = xt_alloc_table_info(private->size); > - if (!info) > - goto free_counters; > - > - clone_counters(info, private); > + return ERR_PTR(-ENOMEM); > > - mutex_lock(&table->lock); > - xt_table_entry_swap_rcu(private, info); > - synchronize_net(); /* Wait until smoke has cleared */ > - > - get_counters(info, counters); > - put_counters(private, counters); > - mutex_unlock(&table->lock); > - > - xt_free_table_info(info); > + get_counters(private, counters); > > return counters; > - > - free_counters: > - vfree(counters); > - nomem: > - return ERR_PTR(-ENOMEM); > } > > static int copy_entries_to_user(unsigned int total_size, > @@ -1165,10 +1097,23 @@ static int do_replace(struct net *net, v > return ret; > } > > +/* We're lazy, and add to the first CPU; overflow works its fey magic > + * and everything is OK. 
*/ > +static int > +add_counter_to_entry(struct arpt_entry *e, > + const struct xt_counters addme[], > + unsigned int *i) > +{ > + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); > + > + (*i)++; > + return 0; > +} > + > static int do_add_counters(struct net *net, void __user *user, unsigned int len, > int compat) > { > - unsigned int i; > + unsigned int i, curcpu; > struct xt_counters_info tmp; > struct xt_counters *paddc; > unsigned int num_counters; > @@ -1224,26 +1169,26 @@ static int do_add_counters(struct net *n > goto free; > } > > - mutex_lock(&t->lock); > + local_bh_disable(); > private = t->private; > if (private->number != num_counters) { > ret = -EINVAL; > goto unlock_up_free; > } > > - preempt_disable(); > i = 0; > /* Choose the copy that is on our node */ > - loc_cpu_entry = private->entries[smp_processor_id()]; > + curcpu = smp_processor_id(); > + loc_cpu_entry = private->entries[curcpu]; > + xt_info_wrlock(curcpu); > ARPT_ENTRY_ITERATE(loc_cpu_entry, > private->size, > add_counter_to_entry, > paddc, > &i); > - preempt_enable(); > + xt_info_wrunlock(curcpu); > unlock_up_free: > - mutex_unlock(&t->lock); > - > + local_bh_enable(); > xt_table_unlock(t); > module_put(t->me); > free:
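Paul's observation upthread that the write-lock sweep in xt_replace_table "acts sort of like a lock-based synchronize_rcu()" is worth a concrete model. Below is a minimal userspace sketch of the same pattern using POSIX rwlocks; the names (NSLOTS, slot_lock, table_swap) and the thread-indexed slot argument are illustrative assumptions, not code from the patch, and the kernel pins a task to its slot by disabling bottom halves rather than passing an index:

#include <pthread.h>

#define NSLOTS 4				/* stand-in for the possible CPUs */

static pthread_rwlock_t slot_lock[NSLOTS];	/* stand-in for xt_info_locks */
static int *table;				/* stand-in for table->private */

static void slots_init(void)
{
	for (int i = 0; i < NSLOTS; i++)
		pthread_rwlock_init(&slot_lock[i], NULL);
}

/* Fast path: lock only your own slot, then dereference the table.  The
 * pointer seen may be the old or the new table, but never a freed one. */
static void process_packet(int self)
{
	pthread_rwlock_rdlock(&slot_lock[self]);
	int *t = table;
	(void)t;			/* ... traverse rules, bump counters ... */
	pthread_rwlock_unlock(&slot_lock[self]);
}

/* Slow path: publish the new table, then take and release every slot's
 * write lock once.  The sweep cannot finish until every reader that might
 * still be using the old table has unlocked -- the lock-based
 * synchronize_rcu() Paul describes.  (The plain pointer store mirrors the
 * patch; strictly portable C would use an atomic store here.) */
static int *table_swap(int *newtable)
{
	int *old = table;

	table = newtable;
	for (int i = 0; i < NSLOTS; i++) {
		pthread_rwlock_wrlock(&slot_lock[i]);
		pthread_rwlock_unlock(&slot_lock[i]);
	}
	return old;			/* counters in 'old' are now stable */
}

After table_swap() returns, the old table can safely be walked for its stale counters and freed, which is exactly how xt_replace_table() hands private back to its caller.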
Stephen Hemminger wrote:
> This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> recursive lock that can be nested.
>
> The idea for this came from an earlier version done by Eric Dumazet.
> Locking is done per-cpu, the fast path locks on the current cpu
> and updates counters. This reduces the contention of a
> single reader lock (in 2.6.29) without the delay of synchronize_net()
> (in 2.6.30-rc2).
>
> The mutex that was added for 2.6.30 in xt_table is unnecessary since
> there already is a mutex for xt[af].mutex that is held.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>

Tested today on various machines, and no problems so far.

tbench/oprofile results: 3.7236% of cpu time is spent in ipt_do_table, and
0.84% on read_lock/read_unlock:

c04a5c30 <ipt_do_table>: /* ipt_do_table total: 217134 3.7236 */
...
  349 0.0060 :c04a5ccc: call c04ce380 <_read_lock>
23914 0.4101 :c04a5cd1: mov 0xc(%edi),%eax
...
            :c04a5ecb: call c04ce5f0 <_read_unlock_bh>
25279 0.4335 :c04a5ed0: cmpb $0x0,-0xd(%ebp)

"iptables -L" fetches its data very fast too: 150 us on an 8-cpu machine
with small firewall rules, and 400-700 us on the same machine with a
1000-rule set (160000 bytes per cpu), depending on network traffic (from
light to flood).

Thanks

> ---
> CHANGES
> - reader and writer now inline
> - only acquire one cpu write lock at a time
> - write lock pushed down into get_counters
>
> include/linux/netfilter/x_tables.h | 50 +++++++++++++--
> net/ipv4/netfilter/arp_tables.c | 121 ++++++++++---------------------------
> net/ipv4/netfilter/ip_tables.c | 120 +++++++++---------------------------
> net/ipv6/netfilter/ip6_tables.c | 120 ++++++++++--------------------------
> net/netfilter/x_tables.c | 55 +++++++++-------
> 5 files changed, 174 insertions(+), 292 deletions(-)
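Eric's "iptables -L" timings exercise the new get_counters() path, which write-locks one cpu at a time while summing. A userspace model of that summation follows (illustrative names again, and note that real per-cpu data is padded onto separate cache lines, which a flat array does not capture):

#include <pthread.h>

#define NSLOTS 4

static pthread_rwlock_t slot_lock[NSLOTS];	/* init with pthread_rwlock_init() */
static unsigned long long slot_count[NSLOTS];	/* per-slot packet counters */

/* Fast path: bump only your own slot's counter under your own read lock;
 * no shared variable, so no cache-line ping-pong between cpus. */
static void count_packet(int self)
{
	pthread_rwlock_rdlock(&slot_lock[self]);
	slot_count[self]++;
	pthread_rwlock_unlock(&slot_lock[self]);
}

/* "iptables -L" path: write-lock one slot at a time and accumulate.
 * Readers on a slot stall only while that one slot is being read, so
 * packet processing on the other cpus continues undisturbed. */
static unsigned long long sum_counters(void)
{
	unsigned long long sum = 0;

	for (int i = 0; i < NSLOTS; i++) {
		pthread_rwlock_wrlock(&slot_lock[i]);
		sum += slot_count[i];
		pthread_rwlock_unlock(&slot_lock[i]);
	}
	return sum;
}

That one-slot-at-a-time property is what Paul praised earlier: at no point is all packet processing blocked at once.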
On Tue, 21 Apr 2009, Stephen Hemminger wrote:
>
> This version of x_tables (ip/ip6/arp) locking uses a per-cpu
> recursive lock that can be nested.

Ack on the code. But the comment is _still_ crap. Please update.

It's not a recursive lock, as clearly shown by the code itself. It's a
per-cpu read-write lock, and only the reader is "recursive" (but that's
how read-write locks work in Linux, and that has nothing to do with
anything).

So make the explanations match the code and the intent. Write it
something like

	This version of x_tables (ip/ip6/arp) locking uses a per-cpu
	reader-writer lock where the readers can nest.

and don't confuse it with incorrect commit messages. The lock is very
much not recursive - on purpose - for half the people taking it.

[ That, btw, was always true, even in the original random open-coded
  version. Because you can't actually do a real recursive lock without
  having a notion of "current ownership", either by making the count be
  <per-thread,per-lock> - like the BKL - or by saving the ownership
  information in the lock. A plain counter simply doesn't do it. ]

		Linus
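Linus's bracketed point is easy to see in code: a genuinely recursive lock has to record who owns it, which a bare counter cannot. Here is a minimal userspace sketch (a hypothetical rec_lock type, not anything from the patch; the unlocked owner test is the usual simplification and would use atomics in production code, as glibc's PTHREAD_MUTEX_RECURSIVE does):

#include <pthread.h>

struct rec_lock {
	pthread_mutex_t mutex;	/* the underlying non-recursive lock */
	pthread_t owner;	/* WHO holds it -- the essential extra state */
	int depth;		/* how many times the owner holds it */
};

static void rec_lock_init(struct rec_lock *l)
{
	pthread_mutex_init(&l->mutex, NULL);
	l->depth = 0;		/* owner is meaningless while depth == 0 */
}

static void rec_lock_acquire(struct rec_lock *l)
{
	/* owner == self can only be true if we stored it ourselves, so this
	 * unlocked test is safe for the owner; every other thread falls
	 * through and really waits.  A depth counter alone could not tell
	 * "I already hold this" apart from "somebody else holds this". */
	if (l->depth > 0 && pthread_equal(l->owner, pthread_self())) {
		l->depth++;
		return;
	}
	pthread_mutex_lock(&l->mutex);
	l->owner = pthread_self();
	l->depth = 1;
}

static void rec_lock_release(struct rec_lock *l)
{
	if (--l->depth == 0)
		pthread_mutex_unlock(&l->mutex);
}

The read side of a Linux rwlock needs none of this bookkeeping, because any number of read acquisitions, including several by the same cpu, may coexist; that is why only the readers of the per-cpu lock nest, and why calling the whole thing "recursive" misdescribes the write side.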
--- a/include/linux/netfilter/x_tables.h 2009-04-21 07:57:06.668582345 -0700 +++ b/include/linux/netfilter/x_tables.h 2009-04-21 14:24:03.295299154 -0700 @@ -354,9 +354,6 @@ struct xt_table /* What hooks you will enter on */ unsigned int valid_hooks; - /* Lock for the curtain */ - struct mutex lock; - /* Man behind the curtain... */ struct xt_table_info *private; @@ -434,8 +431,51 @@ extern void xt_proto_fini(struct net *ne extern struct xt_table_info *xt_alloc_table_info(unsigned int size); extern void xt_free_table_info(struct xt_table_info *info); -extern void xt_table_entry_swap_rcu(struct xt_table_info *old, - struct xt_table_info *new); + + +/* + * Per-CPU read/write lock. This makes reader locking fast since + * there is no shared variable to cause cache ping-pong; but adds an + * additional write-side penalty since write must iterate over all + * possible CPU's. Readers read lock their per-cpu lock, and writers + * write lock on all CPU's. + * + * Read lock is used by ip/arp/ip6 tables rule processing which runs per-cpu. + * It needs to ensure that the rules are not being changed while packet + * is being processed. + * + * Write lock is used in two cases: + * 1. reading counter values + * all readers need to be stopped and the per-CPU values are summed. + * + * 2. replacing rules + * all packets in flight have to be processed before rules are swapped, + * then counters are read from the old (stale) info. + * + */ +DECLARE_PER_CPU(rwlock_t, xt_info_locks); + +static inline void xt_info_rdlock_bh(void) +{ + local_bh_disable(); + read_lock(&__get_cpu_var(xt_info_locks)); +} + +static inline void xt_info_rdunlock_bh(void) +{ + read_unlock_bh(&__get_cpu_var(xt_info_locks)); +} + +static inline void xt_info_wrlock(unsigned int cpu) +{ + write_lock(&per_cpu(xt_info_locks, cpu)); +} + +static inline void xt_info_wrunlock(unsigned int cpu) +{ + + write_unlock(&per_cpu(xt_info_locks, cpu)); +} /* * This helper is performance critical and must be inlined --- a/net/ipv4/netfilter/ip_tables.c 2009-04-21 07:57:06.649549203 -0700 +++ b/net/ipv4/netfilter/ip_tables.c 2009-04-21 14:33:29.842423044 -0700 @@ -338,10 +338,9 @@ ipt_do_table(struct sk_buff *skb, tgpar.hooknum = hook; IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); @@ -436,8 +435,7 @@ ipt_do_table(struct sk_buff *skb, e = (void *)e + e->next_offset; } } while (!hotdrop); - - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -897,89 +895,39 @@ get_counters(const struct xt_table_info /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters * with data used by 'current' CPU - * We dont care about preemption here. */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; + xt_info_wrlock(curcpu); IPT_ENTRY_ITERATE(t->entries[curcpu], t->size, set_entry_to_counter, counters, &i); + xt_info_wrunlock(curcpu); for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); IPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } - -} - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. 
*/ -static int -add_counter_to_entry(struct ipt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - IPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } - -static inline int -zero_entry_counter(struct ipt_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters * alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -988,30 +936,11 @@ static struct xt_counters * alloc_counte counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; + return ERR_PTR(-ENOMEM); - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); - - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int @@ -1377,11 +1306,23 @@ do_replace(struct net *net, void __user return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static int +add_counter_to_entry(struct ipt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1437,25 +1378,26 @@ do_add_counters(struct net *net, void __ goto free; } - mutex_lock(&t->lock); + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + curcpu = smp_processor_id(); + loc_cpu_entry = private->entries[curcpu]; + xt_info_wrlock(curcpu); IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); unlock_up_free: - mutex_unlock(&t->lock); + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: --- a/net/netfilter/x_tables.c 2009-04-21 07:57:06.605264365 -0700 +++ b/net/netfilter/x_tables.c 2009-04-21 14:33:50.177486967 -0700 @@ -625,20 +625,6 @@ void xt_free_table_info(struct xt_table_ } EXPORT_SYMBOL(xt_free_table_info); -void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo, - struct xt_table_info *newinfo) -{ - unsigned int cpu; - - for_each_possible_cpu(cpu) { - void *p = oldinfo->entries[cpu]; - rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]); - newinfo->entries[cpu] = p; - } - -} -EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu); - /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) @@ -676,32 +662,43 @@ void xt_compat_unlock(u_int8_t af) EXPORT_SYMBOL_GPL(xt_compat_unlock); #endif +DEFINE_PER_CPU(rwlock_t, xt_info_locks); +EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks); + + struct xt_table_info * xt_replace_table(struct xt_table *table, unsigned int num_counters, struct xt_table_info *newinfo, int *error) { - struct xt_table_info *oldinfo, *private; + unsigned int i; + struct xt_table_info *private; /* Do the substitution. */ - mutex_lock(&table->lock); + local_bh_disable(); private = table->private; /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { duprintf("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - mutex_unlock(&table->lock); + local_bh_enable(); *error = -EAGAIN; return NULL; } - oldinfo = private; - rcu_assign_pointer(table->private, newinfo); - newinfo->initial_entries = oldinfo->initial_entries; - mutex_unlock(&table->lock); - synchronize_net(); - return oldinfo; + table->private = newinfo; + newinfo->initial_entries = private->initial_entries; + + /* wait for each other cpu to see new table */ + for_each_possible_cpu(i) + if (i != smp_processor_id()) { + xt_info_wrlock(i); + xt_info_wrunlock(i); + } + local_bh_enable(); + + return private; } EXPORT_SYMBOL_GPL(xt_replace_table); @@ -734,7 +731,6 @@ struct xt_table *xt_register_table(struc /* Simplifies replace_table code. 
*/ table->private = bootstrap; - mutex_init(&table->lock); if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; @@ -1147,7 +1143,16 @@ static struct pernet_operations xt_net_o static int __init xt_init(void) { - int i, rv; + unsigned int i; + int rv; + static struct lock_class_key xt_lock_key[NR_CPUS]; + + for_each_possible_cpu(i) { + rwlock_t *lock = &per_cpu(xt_info_locks, i); + + rwlock_init(lock); + lockdep_set_class(lock, xt_lock_key+i); + } xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); if (!xt) --- a/net/ipv6/netfilter/ip6_tables.c 2009-04-21 07:57:06.621260619 -0700 +++ b/net/ipv6/netfilter/ip6_tables.c 2009-04-21 14:29:57.312236004 -0700 @@ -365,9 +365,9 @@ ip6t_do_table(struct sk_buff *skb, IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); @@ -466,7 +466,7 @@ ip6t_do_table(struct sk_buff *skb, #ifdef CONFIG_NETFILTER_DEBUG ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; #endif - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -926,87 +926,40 @@ get_counters(const struct xt_table_info /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters * with data used by 'current' CPU - * We dont care about preemption here. */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; + xt_info_wrlock(curcpu); IP6T_ENTRY_ITERATE(t->entries[curcpu], t->size, set_entry_to_counter, counters, &i); + xt_info_wrunlock(curcpu); + for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); IP6T_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } -} - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. 
*/ -static int -add_counter_to_entry(struct ip6t_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - IP6T_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } -static inline int -zero_entry_counter(struct ip6t_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -1015,30 +968,11 @@ static struct xt_counters *alloc_counter counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; - - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); + return ERR_PTR(-ENOMEM); - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int @@ -1405,11 +1339,24 @@ do_replace(struct net *net, void __user return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static int +add_counter_to_entry(struct ip6t_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1465,25 +1412,28 @@ do_add_counters(struct net *net, void __ goto free; } - mutex_lock(&t->lock); + + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[raw_smp_processor_id()]; + curcpu = smp_processor_id(); + xt_info_wrlock(curcpu); + loc_cpu_entry = private->entries[curcpu]; IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); + unlock_up_free: - mutex_unlock(&t->lock); + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free: --- a/net/ipv4/netfilter/arp_tables.c 2009-04-21 07:57:06.633261308 -0700 +++ b/net/ipv4/netfilter/arp_tables.c 2009-04-21 14:34:39.026255677 -0700 @@ -253,9 +253,9 @@ unsigned int arpt_do_table(struct sk_buf indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; - rcu_read_lock_bh(); - private = rcu_dereference(table->private); - table_base = rcu_dereference(private->entries[smp_processor_id()]); + xt_info_rdlock_bh(); + private = table->private; + table_base = private->entries[smp_processor_id()]; e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -273,6 +273,7 @@ unsigned int arpt_do_table(struct sk_buf hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + (2 * skb->dev->addr_len); + ADD_COUNTER(e->counters, hdr_len, 1); t = arpt_get_target(e); @@ -328,8 +329,7 @@ unsigned int arpt_do_table(struct sk_buf e = (void *)e + e->next_offset; } } while (!hotdrop); - - rcu_read_unlock_bh(); + xt_info_rdunlock_bh(); if (hotdrop) return NF_DROP; @@ -711,88 +711,39 @@ static void get_counters(const struct xt /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters * with data used by 'current' CPU - * We dont care about preemption here. */ - curcpu = raw_smp_processor_id(); + local_bh_disable(); + curcpu = smp_processor_id(); i = 0; + xt_info_wrlock(curcpu); ARPT_ENTRY_ITERATE(t->entries[curcpu], t->size, set_entry_to_counter, counters, &i); + xt_info_wrunlock(curcpu); for_each_possible_cpu(cpu) { if (cpu == curcpu) continue; i = 0; + xt_info_wrlock(cpu); ARPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); + xt_info_wrunlock(cpu); } -} - - -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. 
*/ -static int -add_counter_to_entry(struct arpt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - -/* Take values from counters and add them back onto the current cpu */ -static void put_counters(struct xt_table_info *t, - const struct xt_counters counters[]) -{ - unsigned int i, cpu; - - local_bh_disable(); - cpu = smp_processor_id(); - i = 0; - ARPT_ENTRY_ITERATE(t->entries[cpu], - t->size, - add_counter_to_entry, - counters, - &i); local_bh_enable(); } -static inline int -zero_entry_counter(struct arpt_entry *e, void *arg) -{ - e->counters.bcnt = 0; - e->counters.pcnt = 0; - return 0; -} - -static void -clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) -{ - unsigned int cpu; - const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; - - memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); - for_each_possible_cpu(cpu) { - memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); - ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, - zero_entry_counter, NULL); - } -} - static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; struct xt_table_info *private = table->private; - struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -802,30 +753,11 @@ static struct xt_counters *alloc_counter counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - goto nomem; - - info = xt_alloc_table_info(private->size); - if (!info) - goto free_counters; - - clone_counters(info, private); + return ERR_PTR(-ENOMEM); - mutex_lock(&table->lock); - xt_table_entry_swap_rcu(private, info); - synchronize_net(); /* Wait until smoke has cleared */ - - get_counters(info, counters); - put_counters(private, counters); - mutex_unlock(&table->lock); - - xt_free_table_info(info); + get_counters(private, counters); return counters; - - free_counters: - vfree(counters); - nomem: - return ERR_PTR(-ENOMEM); } static int copy_entries_to_user(unsigned int total_size, @@ -1165,10 +1097,23 @@ static int do_replace(struct net *net, v return ret; } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct arpt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { - unsigned int i; + unsigned int i, curcpu; struct xt_counters_info tmp; struct xt_counters *paddc; unsigned int num_counters; @@ -1224,26 +1169,26 @@ static int do_add_counters(struct net *n goto free; } - mutex_lock(&t->lock); + local_bh_disable(); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } - preempt_disable(); i = 0; /* Choose the copy that is on our node */ - loc_cpu_entry = private->entries[smp_processor_id()]; + curcpu = smp_processor_id(); + loc_cpu_entry = private->entries[curcpu]; + xt_info_wrlock(curcpu); ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - preempt_enable(); + xt_info_wrunlock(curcpu); unlock_up_free: - mutex_unlock(&t->lock); - + local_bh_enable(); xt_table_unlock(t); module_put(t->me); free:
This version of x_tables (ip/ip6/arp) locking uses a per-cpu
recursive lock that can be nested.

The idea for this came from an earlier version done by Eric Dumazet.
Locking is done per-cpu, the fast path locks on the current cpu
and updates counters. This reduces the contention of a
single reader lock (in 2.6.29) without the delay of synchronize_net()
(in 2.6.30-rc2).

The mutex that was added for 2.6.30 in xt_table is unnecessary since
there already is a mutex for xt[af].mutex that is held.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
CHANGES
- reader and writer now inline
- only acquire one cpu write lock at a time
- write lock pushed down into get_counters

include/linux/netfilter/x_tables.h | 50 +++++++++++++--
net/ipv4/netfilter/arp_tables.c | 121 ++++++++++---------------------------
net/ipv4/netfilter/ip_tables.c | 120 +++++++++---------------------------
net/ipv6/netfilter/ip6_tables.c | 120 ++++++++++--------------------------
net/netfilter/x_tables.c | 55 +++++++++-------
5 files changed, 174 insertions(+), 292 deletions(-)
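The "can be nested" wording above, which Linus pushes back on earlier in the thread, comes down to the reader-preference of Linux rwlocks. A self-contained C11 model of those semantics (an illustrative rw_model type, not the kernel's arch-optimized implementation) shows why a nested read lock cannot deadlock while the write side can never recurse:

#include <stdatomic.h>

struct rw_model {
	atomic_int cnt;		/* >= 0: number of readers, -1: writer holds it */
};

static void model_read_lock(struct rw_model *l)
{
	for (;;) {
		int r = atomic_load(&l->cnt);
		/* Succeeds whenever no writer actually HOLDS the lock, even
		 * if a writer is spinning in model_write_lock() below; that
		 * is what makes a nested read lock deadlock-free. */
		if (r >= 0 && atomic_compare_exchange_weak(&l->cnt, &r, r + 1))
			return;
	}
}

static void model_read_unlock(struct rw_model *l)
{
	atomic_fetch_sub(&l->cnt, 1);
}

static void model_write_lock(struct rw_model *l)
{
	for (;;) {
		int expected = 0;	/* only acquirable with zero readers */
		if (atomic_compare_exchange_weak(&l->cnt, &expected, -1))
			return;
	}
}

static void model_write_unlock(struct rw_model *l)
{
	atomic_store(&l->cnt, 0);
}

A reader that re-enters table processing on the same cpu simply bumps the count again; a writer attempting the same thing would spin on itself forever. That asymmetry is Linus's point: the readers nest, the lock as a whole is not recursive.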