diff mbox series

[SRU,Mantic] io_uring: add a sysctl to disable io_uring system-wide

Message ID 20230911184310.1937349-5-cascardo@canonical.com
State New
Headers show
Series [SRU,Mantic] io_uring: add a sysctl to disable io_uring system-wide | expand

Commit Message

Thadeu Lima de Souza Cascardo Sept. 11, 2023, 6:43 p.m. UTC
From: Matteo Rizzo <matteorizzo@google.com>

BugLink: https://bugs.launchpad.net/bugs/2035116

Introduce a new sysctl (io_uring_disabled) which can be either 0, 1, or
2. When 0 (the default), all processes are allowed to create io_uring
instances, which is the current behavior.  When 1, io_uring creation is
disabled (io_uring_setup() will fail with -EPERM) for unprivileged
processes not in the kernel.io_uring_group group.  When 2, calls to
io_uring_setup() fail with -EPERM regardless of privilege.

Signed-off-by: Matteo Rizzo <matteorizzo@google.com>
[JEM: modified to add io_uring_group]
Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Link: https://lore.kernel.org/r/x49y1i42j1z.fsf@segfault.boston.devel.redhat.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
(backported from commit 76d3ccecfa186af3120e206d62f03db1a94a535f)
[cascardo: conflict due to missing b97f96e22f051d59d07a527dbd7d90408b661ca8]
Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
---
 Documentation/admin-guide/sysctl/kernel.rst | 29 ++++++++++++
 io_uring/io_uring.c                         | 51 +++++++++++++++++++++
 2 files changed, 80 insertions(+)

Comments

Stefan Bader Oct. 4, 2023, 9:46 a.m. UTC | #1
On 11.09.23 20:43, Thadeu Lima de Souza Cascardo wrote:
> From: Matteo Rizzo <matteorizzo@google.com>
> 
> BugLink: https://bugs.launchpad.net/bugs/2035116
> 
> Introduce a new sysctl (io_uring_disabled) which can be either 0, 1, or
> 2. When 0 (the default), all processes are allowed to create io_uring
> instances, which is the current behavior.  When 1, io_uring creation is
> disabled (io_uring_setup() will fail with -EPERM) for unprivileged
> processes not in the kernel.io_uring_group group.  When 2, calls to
> io_uring_setup() fail with -EPERM regardless of privilege.
> 
> Signed-off-by: Matteo Rizzo <matteorizzo@google.com>
> [JEM: modified to add io_uring_group]
> Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
> Link: https://lore.kernel.org/r/x49y1i42j1z.fsf@segfault.boston.devel.redhat.com
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> (backported from commit 76d3ccecfa186af3120e206d62f03db1a94a535f)
> [cascardo: conflict due to missing b97f96e22f051d59d07a527dbd7d90408b661ca8]
> Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
> ---

Applied to mantic:linux/master-next. Thanks.

-Stefan

>   Documentation/admin-guide/sysctl/kernel.rst | 29 ++++++++++++
>   io_uring/io_uring.c                         | 51 +++++++++++++++++++++
>   2 files changed, 80 insertions(+)
> 
> diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
> index 3800fab1619b..0795d790cc56 100644
> --- a/Documentation/admin-guide/sysctl/kernel.rst
> +++ b/Documentation/admin-guide/sysctl/kernel.rst
> @@ -450,6 +450,35 @@ this allows system administrators to override the
>   ``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded.
>   
>   
> +io_uring_disabled
> +=================
> +
> +Prevents all processes from creating new io_uring instances. Enabling this
> +shrinks the kernel's attack surface.
> +
> += ======================================================================
> +0 All processes can create io_uring instances as normal. This is the
> +  default setting.
> +1 io_uring creation is disabled (io_uring_setup() will fail with
> +  -EPERM) for unprivileged processes not in the io_uring_group group.
> +  Existing io_uring instances can still be used.  See the
> +  documentation for io_uring_group for more information.
> +2 io_uring creation is disabled for all processes. io_uring_setup()
> +  always fails with -EPERM. Existing io_uring instances can still be
> +  used.
> += ======================================================================
> +
> +
> +io_uring_group
> +==============
> +
> +When io_uring_disabled is set to 1, a process must either be
> +privileged (CAP_SYS_ADMIN) or be in the io_uring_group group in order
> +to create an io_uring instance.  If io_uring_group is set to -1 (the
> +default), only processes with the CAP_SYS_ADMIN capability may create
> +io_uring instances.
> +
> +
>   kexec_load_disabled
>   ===================
>   
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 93db3e4e7b68..8beb362356fd 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -152,6 +152,31 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
>   
>   struct kmem_cache *req_cachep;
>   
> +static int __read_mostly sysctl_io_uring_disabled;
> +static int __read_mostly sysctl_io_uring_group = -1;
> +
> +#ifdef CONFIG_SYSCTL
> +static struct ctl_table kernel_io_uring_disabled_table[] = {
> +	{
> +		.procname	= "io_uring_disabled",
> +		.data		= &sysctl_io_uring_disabled,
> +		.maxlen		= sizeof(sysctl_io_uring_disabled),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= SYSCTL_ZERO,
> +		.extra2		= SYSCTL_TWO,
> +	},
> +	{
> +		.procname	= "io_uring_group",
> +		.data		= &sysctl_io_uring_group,
> +		.maxlen		= sizeof(gid_t),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec,
> +	},
> +	{},
> +};
> +#endif
> +
>   struct sock *io_uring_get_socket(struct file *file)
>   {
>   #if defined(CONFIG_UNIX)
> @@ -4040,9 +4065,30 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
>   	return io_uring_create(entries, &p, params);
>   }
>   
> +static inline bool io_uring_allowed(void)
> +{
> +	int disabled = READ_ONCE(sysctl_io_uring_disabled);
> +	kgid_t io_uring_group;
> +
> +	if (disabled == 2)
> +		return false;
> +
> +	if (disabled == 0 || capable(CAP_SYS_ADMIN))
> +		return true;
> +
> +	io_uring_group = make_kgid(&init_user_ns, sysctl_io_uring_group);
> +	if (!gid_valid(io_uring_group))
> +		return false;
> +
> +	return in_group_p(io_uring_group);
> +}
> +
>   SYSCALL_DEFINE2(io_uring_setup, u32, entries,
>   		struct io_uring_params __user *, params)
>   {
> +	if (!io_uring_allowed())
> +		return -EPERM;
> +
>   	return io_uring_setup(entries, params);
>   }
>   
> @@ -4617,6 +4663,11 @@ static int __init io_uring_init(void)
>   
>   	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
>   				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
> +
> +#ifdef CONFIG_SYSCTL
> +	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
> +#endif
> +
>   	return 0;
>   };
>   __initcall(io_uring_init);
diff mbox series

Patch

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 3800fab1619b..0795d790cc56 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -450,6 +450,35 @@  this allows system administrators to override the
 ``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded.
 
 
+io_uring_disabled
+=================
+
+Controls which processes may create new io_uring instances. Restricting
+creation shrinks the kernel's attack surface.
+
+= ======================================================================
+0 All processes can create io_uring instances as normal. This is the
+  default setting.
+1 io_uring creation is disabled (io_uring_setup() will fail with
+  -EPERM) for unprivileged processes not in the io_uring_group group.
+  Existing io_uring instances can still be used.  See the
+  documentation for io_uring_group for more information.
+2 io_uring creation is disabled for all processes. io_uring_setup()
+  always fails with -EPERM. Existing io_uring instances can still be
+  used.
+= ======================================================================
+
+
+io_uring_group
+==============
+
+When io_uring_disabled is set to 1, a process must either be
+privileged (CAP_SYS_ADMIN) or be in the io_uring_group group in order
+to create an io_uring instance.  If io_uring_group is set to -1 (the
+default), only processes with the CAP_SYS_ADMIN capability may create
+io_uring instances.
+
+
 kexec_load_disabled
 ===================
 
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 93db3e4e7b68..8beb362356fd 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -152,6 +152,31 @@  static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
 
 struct kmem_cache *req_cachep;
 
+static int __read_mostly sysctl_io_uring_disabled;	/* 0, 1 or 2; range enforced by the table below */
+static int __read_mostly sysctl_io_uring_group = -1;	/* -1 (default): no group is exempt */
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table kernel_io_uring_disabled_table[] = {	/* kernel.io_uring_* knobs */
+	{
+		.procname	= "io_uring_disabled",
+		.data		= &sysctl_io_uring_disabled,
+		.maxlen		= sizeof(sysctl_io_uring_disabled),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,	/* lowest accepted value */
+		.extra2		= SYSCTL_TWO,	/* highest accepted value */
+	},
+	{
+		.procname	= "io_uring_group",
+		.data		= &sysctl_io_uring_group,
+		.maxlen		= sizeof(sysctl_io_uring_group),	/* size of the int behind .data */
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,	/* no min/max, so -1 stays writable */
+	},
+	{},
+};
+#endif
+
 struct sock *io_uring_get_socket(struct file *file)
 {
 #if defined(CONFIG_UNIX)
@@ -4040,9 +4065,30 @@  static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
 	return io_uring_create(entries, &p, params);
 }
 
+static inline bool io_uring_allowed(void)	/* may the caller create a ring? */
+{
+	int disabled = READ_ONCE(sysctl_io_uring_disabled);	/* policy: 0, 1 or 2 */
+	kgid_t io_uring_group;
+
+	if (disabled == 2)	/* 2: refused for every caller */
+		return false;
+
+	if (disabled == 0 || capable(CAP_SYS_ADMIN))	/* 0, or a privileged caller */
+		return true;
+	/* Mode 1, unprivileged: READ_ONCE() as the sysctl can be rewritten concurrently. */
+	io_uring_group = make_kgid(&init_user_ns, READ_ONCE(sysctl_io_uring_group));
+	if (!gid_valid(io_uring_group))	/* no usable group set (-1 by default) */
+		return false;
+
+	return in_group_p(io_uring_group);
+}
+
 SYSCALL_DEFINE2(io_uring_setup, u32, entries,
 		struct io_uring_params __user *, params)
 {
+	if (!io_uring_allowed())	/* sysctl policy is checked before any setup work */
+		return -EPERM;
+
 	return io_uring_setup(entries, params);
 }
 
@@ -4617,6 +4663,11 @@  static int __init io_uring_init(void)
 
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
 				SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
+
+#ifdef CONFIG_SYSCTL
+	register_sysctl_init("kernel", kernel_io_uring_disabled_table);
+#endif
+
 	return 0;
 };
 __initcall(io_uring_init);