@@ -353,7 +353,7 @@ struct xt_table
unsigned int valid_hooks;
/* Lock for the curtain */
- rwlock_t lock;
+ struct mutex lock;
/* Man behind the curtain... */
struct xt_table_info *private;
@@ -383,9 +383,15 @@ struct xt_table_info
unsigned int hook_entry[NF_INET_NUMHOOKS];
unsigned int underflow[NF_INET_NUMHOOKS];
+ /* For the dustman... */
+ union {
+ struct rcu_head rcu;
+ struct work_struct work;
+ };
+
/* ipt_entry tables: one per CPU */
/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
- char *entries[1];
+ void *entries[1];
};
#define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
@@ -432,6 +438,9 @@ extern void xt_proto_fini(struct net *ne
extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
extern void xt_free_table_info(struct xt_table_info *info);
+extern void xt_zero_table_entries(struct xt_table_info *info);
+extern void xt_swap_table_entries(struct xt_table_info *old,
+ struct xt_table_info *new);
#ifdef CONFIG_COMPAT
#include <net/compat.h>
@@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
mtpar.family = tgpar.family = NFPROTO_IPV4;
tgpar.hooknum = hook;
- read_lock_bh(&table->lock);
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- private = table->private;
- table_base = (void *)private->entries[smp_processor_id()];
+
+ rcu_read_lock_bh();
+ private = rcu_dereference(table->private);
+ table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
e = get_entry(table_base, private->hook_entry[hook]);
/* For return from builtin chain */
@@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
}
} while (!hotdrop);
- read_unlock_bh(&table->lock);
+ rcu_read_unlock_bh();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -924,13 +926,45 @@ get_counters(const struct xt_table_info
counters,
&i);
}
+
+}
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+ const struct xt_counters addme[],
+ unsigned int *i)
+{
+ ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+ (*i)++;
+ return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+ const struct xt_counters counters[])
+{
+ unsigned int i, cpu;
+
+ local_bh_disable();
+ cpu = smp_processor_id();
+ i = 0;
+ IPT_ENTRY_ITERATE(t->entries[cpu],
+ t->size,
+ add_counter_to_entry,
+ counters,
+ &i);
+ local_bh_enable();
}
static struct xt_counters * alloc_counters(struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ struct xt_table_info *private = table->private;
+ struct xt_table_info *tmp;
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
@@ -939,14 +973,30 @@ static struct xt_counters * alloc_counte
counters = vmalloc_node(countersize, numa_node_id());
if (counters == NULL)
- return ERR_PTR(-ENOMEM);
+ goto nomem;
+
+ tmp = xt_alloc_table_info(private->size);
+ if (!tmp)
+ goto free_counters;
+
+ xt_zero_table_entries(tmp);
+
+ mutex_lock(&table->lock);
+ xt_swap_table_entries(private, tmp);
+ synchronize_net(); /* Wait until smoke has cleared */
- /* First, sum counters... */
- write_lock_bh(&table->lock);
- get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ get_counters(tmp, counters);
+ put_counters(private, counters);
+ mutex_unlock(&table->lock);
+
+ xt_free_table_info(tmp);
return counters;
+
+ free_counters:
+ vfree(counters);
+ nomem:
+ return ERR_PTR(-ENOMEM);
}
static int
@@ -1312,27 +1362,6 @@ do_replace(struct net *net, void __user
return ret;
}
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
- const struct xt_counters addme[],
- unsigned int *i)
-{
-#if 0
- duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
- *i,
- (long unsigned int)e->counters.pcnt,
- (long unsigned int)e->counters.bcnt,
- (long unsigned int)addme[*i].pcnt,
- (long unsigned int)addme[*i].bcnt);
-#endif
-
- ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
- (*i)++;
- return 0;
-}
static int
do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1393,13 +1422,14 @@ do_add_counters(struct net *net, void __
goto free;
}
- write_lock_bh(&t->lock);
+ mutex_lock(&t->lock);
private = t->private;
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
}
+ preempt_disable();
i = 0;
/* Choose the copy that is on our node */
loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1408,8 +1438,9 @@ do_add_counters(struct net *net, void __
add_counter_to_entry,
paddc,
&i);
+ preempt_enable();
unlock_up_free:
- write_unlock_bh(&t->lock);
+ mutex_unlock(&t->lock);
xt_table_unlock(t);
module_put(t->me);
free:
@@ -611,18 +611,61 @@ struct xt_table_info *xt_alloc_table_inf
}
EXPORT_SYMBOL(xt_alloc_table_info);
-void xt_free_table_info(struct xt_table_info *info)
+/* Zero out entries */
+void xt_zero_table_entries(struct xt_table_info *info)
{
- int cpu;
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ memset(info->entries[cpu], 0, info->size);
+}
+EXPORT_SYMBOL_GPL(xt_zero_table_entries);
+
+/* Swap existing entries with new zeroed ones */
+void xt_swap_table_entries(struct xt_table_info *oldinfo,
+ struct xt_table_info *newinfo)
+{
+ unsigned int cpu;
for_each_possible_cpu(cpu) {
- if (info->size <= PAGE_SIZE)
- kfree(info->entries[cpu]);
- else
- vfree(info->entries[cpu]);
+ void *p = oldinfo->entries[cpu];
+
+ rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
+ newinfo->entries[cpu] = p;
}
+}
+EXPORT_SYMBOL_GPL(xt_swap_table_entries);
+
+/* callback to do free for vmalloc'd case */
+static void xt_free_table_info_work(struct work_struct *arg)
+{
+ int cpu;
+ struct xt_table_info *info = container_of(arg, struct xt_table_info, work);
+
+ for_each_possible_cpu(cpu)
+ vfree(info->entries[cpu]);
kfree(info);
}
+
+static void xt_free_table_info_rcu(struct rcu_head *arg)
+{
+ struct xt_table_info *info = container_of(arg, struct xt_table_info, rcu);
+ if (info->size <= PAGE_SIZE) {
+ unsigned int cpu;
+ for_each_possible_cpu(cpu)
+ kfree(info->entries[cpu]);
+ kfree(info);
+ } else {
+ /* can't safely call vfree in current context */
+ INIT_WORK(&info->work, xt_free_table_info_work);
+ schedule_work(&info->work);
+ }
+}
+
+void xt_free_table_info(struct xt_table_info *info)
+{
+ call_rcu(&info->rcu, xt_free_table_info_rcu);
+}
EXPORT_SYMBOL(xt_free_table_info);
/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
@@ -671,21 +714,22 @@ xt_replace_table(struct xt_table *table,
struct xt_table_info *oldinfo, *private;
/* Do the substitution. */
- write_lock_bh(&table->lock);
+ mutex_lock(&table->lock);
private = table->private;
/* Check inside lock: is the old number correct? */
if (num_counters != private->number) {
duprintf("num_counters != table->private->number (%u/%u)\n",
num_counters, private->number);
- write_unlock_bh(&table->lock);
+ mutex_unlock(&table->lock);
*error = -EAGAIN;
return NULL;
}
oldinfo = private;
- table->private = newinfo;
+ rcu_assign_pointer(table->private, newinfo);
newinfo->initial_entries = oldinfo->initial_entries;
- write_unlock_bh(&table->lock);
+ mutex_unlock(&table->lock);
+ synchronize_net();
return oldinfo;
}
EXPORT_SYMBOL_GPL(xt_replace_table);
@@ -719,7 +763,8 @@ struct xt_table *xt_register_table(struc
/* Simplifies replace_table code. */
table->private = bootstrap;
- rwlock_init(&table->lock);
+ mutex_init(&table->lock);
+
if (!xt_replace_table(table, 0, newinfo, &ret))
goto unlock;
@@ -237,9 +237,10 @@ unsigned int arpt_do_table(struct sk_buf
indev = in ? in->name : nulldevname;
outdev = out ? out->name : nulldevname;
- read_lock_bh(&table->lock);
- private = table->private;
- table_base = (void *)private->entries[smp_processor_id()];
+ rcu_read_lock_bh();
+ private = rcu_dereference(table->private);
+ table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
e = get_entry(table_base, private->hook_entry[hook]);
back = get_entry(table_base, private->underflow[hook]);
@@ -311,7 +312,8 @@ unsigned int arpt_do_table(struct sk_buf
e = (void *)e + e->next_offset;
}
} while (!hotdrop);
- read_unlock_bh(&table->lock);
+
+ rcu_read_unlock_bh();
if (hotdrop)
return NF_DROP;
@@ -714,11 +716,43 @@ static void get_counters(const struct xt
}
}
-static inline struct xt_counters *alloc_counters(struct xt_table *table)
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+ const struct xt_counters addme[],
+ unsigned int *i)
+{
+ ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+ (*i)++;
+ return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+ const struct xt_counters counters[])
+{
+ unsigned int i, cpu;
+
+ local_bh_disable();
+ cpu = smp_processor_id();
+ i = 0;
+ ARPT_ENTRY_ITERATE(t->entries[cpu],
+ t->size,
+ add_counter_to_entry,
+ counters,
+ &i);
+ local_bh_enable();
+}
+
+static struct xt_counters *alloc_counters(struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ struct xt_table_info *private = table->private;
+ struct xt_table_info *tmp;
/* We need atomic snapshot of counters: rest doesn't change
* (other than comefrom, which userspace doesn't care
@@ -728,14 +762,30 @@ static inline struct xt_counters *alloc_
counters = vmalloc_node(countersize, numa_node_id());
if (counters == NULL)
- return ERR_PTR(-ENOMEM);
+ goto nomem;
- /* First, sum counters... */
- write_lock_bh(&table->lock);
- get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ tmp = xt_alloc_table_info(private->size);
+ if (!tmp)
+ goto free_counters;
+
+ xt_zero_table_entries(tmp);
+
+ mutex_lock(&table->lock);
+ xt_swap_table_entries(private, tmp);
+ synchronize_net(); /* Wait until smoke has cleared */
+
+ get_counters(tmp, counters);
+ put_counters(private, counters);
+ mutex_unlock(&table->lock);
+
+ xt_free_table_info(tmp);
return counters;
+
+ free_counters:
+ vfree(counters);
+ nomem:
+ return ERR_PTR(-ENOMEM);
}
static int copy_entries_to_user(unsigned int total_size,
@@ -1075,20 +1125,6 @@ static int do_replace(struct net *net, v
return ret;
}
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK.
- */
-static inline int add_counter_to_entry(struct arpt_entry *e,
- const struct xt_counters addme[],
- unsigned int *i)
-{
-
- ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
- (*i)++;
- return 0;
-}
-
static int do_add_counters(struct net *net, void __user *user, unsigned int len,
int compat)
{
@@ -1148,13 +1184,14 @@ static int do_add_counters(struct net *n
goto free;
}
- write_lock_bh(&t->lock);
+ mutex_lock(&t->lock);
private = t->private;
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
}
+ preempt_disable();
i = 0;
/* Choose the copy that is on our node */
loc_cpu_entry = private->entries[smp_processor_id()];
@@ -1164,7 +1201,8 @@ static int do_add_counters(struct net *n
paddc,
&i);
unlock_up_free:
- write_unlock_bh(&t->lock);
+ mutex_unlock(&t->lock);
+
xt_table_unlock(t);
module_put(t->me);
free:
@@ -373,10 +373,12 @@ ip6t_do_table(struct sk_buff *skb,
mtpar.family = tgpar.family = NFPROTO_IPV6;
tgpar.hooknum = hook;
- read_lock_bh(&table->lock);
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- private = table->private;
- table_base = (void *)private->entries[smp_processor_id()];
+
+ rcu_read_lock_bh();
+ private = rcu_dereference(table->private);
+ table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
e = get_entry(table_base, private->hook_entry[hook]);
/* For return from builtin chain */
@@ -474,7 +476,7 @@ ip6t_do_table(struct sk_buff *skb,
#ifdef CONFIG_NETFILTER_DEBUG
((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
#endif
- read_unlock_bh(&table->lock);
+ rcu_read_unlock_bh();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -955,11 +957,42 @@ get_counters(const struct xt_table_info
}
}
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+ const struct xt_counters addme[],
+ unsigned int *i)
+{
+ ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+ (*i)++;
+ return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+ const struct xt_counters counters[])
+{
+ unsigned int i, cpu;
+
+ local_bh_disable();
+ cpu = smp_processor_id();
+ i = 0;
+ IP6T_ENTRY_ITERATE(t->entries[cpu],
+ t->size,
+ add_counter_to_entry,
+ counters,
+ &i);
+ local_bh_enable();
+}
+
static struct xt_counters *alloc_counters(struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ struct xt_table_info *private = table->private;
+ struct xt_table_info *tmp;
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
@@ -968,14 +1001,28 @@ static struct xt_counters *alloc_counter
counters = vmalloc_node(countersize, numa_node_id());
if (counters == NULL)
- return ERR_PTR(-ENOMEM);
+ goto nomem;
+
+ tmp = xt_alloc_table_info(private->size);
+ if (!tmp)
+ goto free_counters;
+
+ xt_zero_table_entries(tmp);
+
+ mutex_lock(&table->lock);
+ xt_swap_table_entries(private, tmp);
+ synchronize_net(); /* Wait until smoke has cleared */
+
+ get_counters(tmp, counters);
+ put_counters(private, counters);
+ mutex_unlock(&table->lock);
- /* First, sum counters... */
- write_lock_bh(&table->lock);
- get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ xt_free_table_info(tmp);
- return counters;
+ free_counters:
+ vfree(counters);
+ nomem:
+ return ERR_PTR(-ENOMEM);
}
static int
@@ -1342,28 +1389,6 @@ do_replace(struct net *net, void __user
return ret;
}
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static inline int
-add_counter_to_entry(struct ip6t_entry *e,
- const struct xt_counters addme[],
- unsigned int *i)
-{
-#if 0
- duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
- *i,
- (long unsigned int)e->counters.pcnt,
- (long unsigned int)e->counters.bcnt,
- (long unsigned int)addme[*i].pcnt,
- (long unsigned int)addme[*i].bcnt);
-#endif
-
- ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
- (*i)++;
- return 0;
-}
-
static int
do_add_counters(struct net *net, void __user *user, unsigned int len,
int compat)
@@ -1424,13 +1449,14 @@ do_add_counters(struct net *net, void __
goto free;
}
- write_lock_bh(&t->lock);
+ mutex_lock(&t->lock);
private = t->private;
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
}
+ preempt_disable();
i = 0;
/* Choose the copy that is on our node */
loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1439,8 +1465,9 @@ do_add_counters(struct net *net, void __
add_counter_to_entry,
paddc,
&i);
+ preempt_enable();
unlock_up_free:
- write_unlock_bh(&t->lock);
+ mutex_unlock(&t->lock);
xt_table_unlock(t);
module_put(t->me);
free:
Make netfilter tables that use x_tables (iptables, ip6tables, arptables) operatate without locking on the receive path. Replace existing reader/writer lock with Read-Copy-Update to elminate the overhead of a read lock on each incoming packet. This should reduce the overhead of iptables especially on SMP systems. The previous code used a reader-writer lock for two purposes. The first was to ensure that the xt_table_info reference was not in process of being changed. Since xt_table_info is only freed via one routine, it was a direct conversion to RCU. The other use of the reader-writer lock was to to block changes to counters while they were being read. This is handled by swapping in a new set of counter values and then summing the old ones. The sum is then restored back on a single cpu. Signed-off-by: Stephen Hemminger <shemminger@vyatta.com> --- include/linux/netfilter/x_tables.h | 13 ++++ net/ipv4/netfilter/arp_tables.c | 92 ++++++++++++++++++++++++----------- net/ipv4/netfilter/ip_tables.c | 97 ++++++++++++++++++++++++------------- net/ipv6/netfilter/ip6_tables.c | 97 +++++++++++++++++++++++-------------- net/netfilter/x_tables.c | 67 +++++++++++++++++++++---- 5 files changed, 258 insertions(+), 108 deletions(-)