@@ -35,6 +35,7 @@ extern int rtnl_is_locked(void);
extern wait_queue_head_t netdev_unregistering_wq;
extern struct mutex net_mutex;
+extern struct rw_semaphore net_sem;
#ifdef CONFIG_PROVE_LOCKING
extern bool lockdep_rtnl_is_held(void);
@@ -41,6 +41,11 @@ struct net init_net = {
EXPORT_SYMBOL(init_net);
static bool init_net_initialized;
+/*
+ * net_sem: protects: pernet_list, net_generic_ids,
+ * init_net_initialized and first_* pointers.
+ */
+DECLARE_RWSEM(net_sem);
#define MIN_PERNET_OPS_ID \
((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
@@ -411,12 +416,16 @@ struct net *copy_net_ns(unsigned long flags,
net->ucounts = ucounts;
get_user_ns(user_ns);
- rv = mutex_lock_killable(&net_mutex);
+ rv = down_read_killable(&net_sem);
if (rv < 0)
goto put_userns;
-
+ rv = mutex_lock_killable(&net_mutex);
+ if (rv < 0)
+ goto up_read;
rv = setup_net(net, user_ns);
mutex_unlock(&net_mutex);
+up_read:
+ up_read(&net_sem);
if (rv < 0) {
put_userns:
put_user_ns(user_ns);
@@ -443,6 +452,7 @@ static void cleanup_net(struct work_struct *work)
list_replace_init(&cleanup_list, &net_kill_list);
spin_unlock_irq(&cleanup_list_lock);
+ down_read(&net_sem);
mutex_lock(&net_mutex);
/* Don't let anyone else find us. */
@@ -484,6 +494,7 @@ static void cleanup_net(struct work_struct *work)
ops_free_list(ops, &net_exit_list);
mutex_unlock(&net_mutex);
+ up_read(&net_sem);
/* Ensure there are no outstanding rcu callbacks using this
* network namespace.
@@ -510,8 +521,10 @@ static void cleanup_net(struct work_struct *work)
*/
void net_ns_barrier(void)
{
+ down_write(&net_sem);
mutex_lock(&net_mutex);
mutex_unlock(&net_mutex);
+ up_write(&net_sem);
}
EXPORT_SYMBOL(net_ns_barrier);
@@ -838,12 +851,12 @@ static int __init net_ns_init(void)
rcu_assign_pointer(init_net.gen, ng);
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
if (setup_net(&init_net, &init_user_ns))
panic("Could not setup the initial network namespace");
init_net_initialized = true;
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
register_pernet_subsys(&net_ns_ops);
@@ -983,9 +996,9 @@ static void unregister_pernet_operations(struct pernet_operations *ops)
int register_pernet_subsys(struct pernet_operations *ops)
{
int error;
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
error = register_pernet_operations(first_device, ops);
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
return error;
}
EXPORT_SYMBOL_GPL(register_pernet_subsys);
@@ -1001,9 +1014,9 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys);
*/
void unregister_pernet_subsys(struct pernet_operations *ops)
{
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
unregister_pernet_operations(ops);
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
}
EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
@@ -1029,11 +1042,11 @@ EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
int register_pernet_device(struct pernet_operations *ops)
{
int error;
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
error = register_pernet_operations(&pernet_list, ops);
if (!error && (first_device == &pernet_list))
first_device = &ops->list;
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
return error;
}
EXPORT_SYMBOL_GPL(register_pernet_device);
@@ -1049,11 +1062,11 @@ EXPORT_SYMBOL_GPL(register_pernet_device);
*/
void unregister_pernet_device(struct pernet_operations *ops)
{
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
if (&ops->list == first_device)
first_device = first_device->next;
unregister_pernet_operations(ops);
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
}
EXPORT_SYMBOL_GPL(unregister_pernet_device);
@@ -390,11 +390,11 @@ static void rtnl_lock_unregistering_all(void)
void rtnl_link_unregister(struct rtnl_link_ops *ops)
{
/* Close the race with cleanup_net() */
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
rtnl_lock_unregistering_all();
__rtnl_link_unregister(ops);
rtnl_unlock();
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
}
EXPORT_SYMBOL_GPL(rtnl_link_unregister);
Curently mutex is used to protect pernet operations list. It makes cleanup_net() to execute ->exit methods of the same operations set, which was used on the time of ->init, even after net namespace is unlinked from net_namespace_list. But the problem is it's need to synchronize_rcu() after net is removed from net_namespace_list(): Destroy net_ns: cleanup_net() mutex_lock(&net_mutex) list_del_rcu(&net->list) synchronize_rcu() <--- Sleep there for ages list_for_each_entry_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list) list_for_each_entry_reverse(ops, &pernet_list, list) ops_free_list(ops, &net_exit_list) mutex_unlock(&net_mutex) This primitive is not fast, especially on the systems with many processors and/or when preemptible RCU is enabled in config. So, all the time, while cleanup_net() is waiting for RCU grace period, creation of new net namespaces is not possible, the tasks, who makes it, are sleeping on the same mutex: Create net_ns: copy_net_ns() mutex_lock_killable(&net_mutex) <--- Sleep there for ages I observed 20-30 seconds hangs of "unshare -n" on ordinary 8-cpu laptop with preemptible RCU enabled. The solution is to convert net_mutex to the rw_semaphore and add small locks to really small number of pernet_operations, what really need them. Then, pernet_operations::init/::exit methods, modifying the net-related data, will require down_read() locking only, while down_write() will be used for changing pernet_list. This gives signify performance increase, like you may see here: https://www.spinics.net/lists/netdev/msg467095.html It's 4.6 times performance increase on one-thread test. Multi-thread tests increase may be close to 4.6 multiplied to number of threads. This patch starts replacing net_mutex to net_sem. It adds rw_semaphore, describes the variables it protects, and makes to use where appropriate. net_mutex is still present, and next patches will kick it out step-by-step. Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com> --- include/linux/rtnetlink.h | 1 + net/core/net_namespace.c | 37 +++++++++++++++++++++++++------------ net/core/rtnetlink.c | 4 ++-- 3 files changed, 28 insertions(+), 14 deletions(-)