diff mbox series

[v2,bpf-next,05/21] bpf: Introduce bpf_sysctl_{get,set}_new_value helpers

Message ID 1b3f7545f3d14e2277a3206eef2c3fea6329245d.1553560620.git.rdna@fb.com
State Superseded
Delegated to: BPF Maintainers
Headers show
Series bpf: Sysctl hook | expand

Commit Message

Andrey Ignatov March 26, 2019, 12:43 a.m. UTC
Add helpers to work with new value being written to sysctl by user
space.

bpf_sysctl_get_new_value() copies value being written to sysctl into
provided buffer.

bpf_sysctl_set_new_value() overrides the new value being written by user
space with the one from the provided buffer. The buffer should contain a
string representation of the value, similar to what can be seen in /proc/sys/.

Both helpers can be used only on sysctl write.

File position matters and can be managed by an interface that will be
introduced separately. E.g. if user space calls sys_write to a file in
/proc/sys/ at file position = X, where X > 0, then the value set by
bpf_sysctl_set_new_value() will be written starting from X. If program
wants to override whole value with specified buffer, file position has
to be set to zero.

Documentation for the new helpers is provided in bpf.h UAPI.

Signed-off-by: Andrey Ignatov <rdna@fb.com>
---
 fs/proc/proc_sysctl.c      | 22 ++++++++---
 include/linux/bpf-cgroup.h |  8 ++--
 include/linux/filter.h     |  3 ++
 include/uapi/linux/bpf.h   | 38 +++++++++++++++++-
 kernel/bpf/cgroup.c        | 81 +++++++++++++++++++++++++++++++++++++-
 5 files changed, 142 insertions(+), 10 deletions(-)

Comments

Daniel Borkmann April 4, 2019, 2:37 p.m. UTC | #1
On 03/26/2019 01:43 AM, Andrey Ignatov wrote:
> Add helpers to work with new value being written to sysctl by user
> space.
> 
> bpf_sysctl_get_new_value() copies value being written to sysctl into
> provided buffer.
> 
> bpf_sysctl_set_new_value() overrides new value being written by user
> space with a one from provided buffer. Buffer should contain string
> representation of the value, similar to what can be seen in /proc/sys/.
> 
> Both helpers can be used only on sysctl write.
> 
> File position matters and can be managed by an interface that will be
> introduced separately. E.g. if user space calls sys_write to a file in
> /proc/sys/ at file position = X, where X > 0, then the value set by
> bpf_sysctl_set_new_value() will be written starting from X. If program
> wants to override whole value with specified buffer, file position has
> to be set to zero.
> 
> Documentation for the new helpers is provided in bpf.h UAPI.
> 
> Signed-off-by: Andrey Ignatov <rdna@fb.com>
> ---
>  fs/proc/proc_sysctl.c      | 22 ++++++++---
>  include/linux/bpf-cgroup.h |  8 ++--
>  include/linux/filter.h     |  3 ++
>  include/uapi/linux/bpf.h   | 38 +++++++++++++++++-
>  kernel/bpf/cgroup.c        | 81 +++++++++++++++++++++++++++++++++++++-
>  5 files changed, 142 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
> index 72f4a096c146..4d1ab22774f7 100644
> --- a/fs/proc/proc_sysctl.c
> +++ b/fs/proc/proc_sysctl.c
> @@ -570,8 +570,8 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
>  	struct inode *inode = file_inode(filp);
>  	struct ctl_table_header *head = grab_header(inode);
>  	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
> +	void *new_buf = NULL;
>  	ssize_t error;
> -	size_t res;
>  
>  	if (IS_ERR(head))
>  		return PTR_ERR(head);
> @@ -589,15 +589,27 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
>  	if (!table->proc_handler)
>  		goto out;
>  
> -	error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write);
> +	error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count,
> +					   &new_buf);
>  	if (error)
>  		goto out;
>  
>  	/* careful: calling conventions are nasty here */
> -	res = count;
> -	error = table->proc_handler(table, write, buf, &res, ppos);
> +	if (new_buf) {
> +		mm_segment_t old_fs;
> +
> +		old_fs = get_fs();
> +		set_fs(KERNEL_DS);
> +		error = table->proc_handler(table, write, (void __user *)new_buf,
> +					    &count, ppos);
> +		set_fs(old_fs);

From quick glance on the set, the above stood out. Afaik, there is an ongoing
effort by Al and other fs/core folks (as visible in the git log) to get rid of
set_fs() calls in the tree with the goal of eliminating this interface /entirely/
(more context on 'why' here: https://lwn.net/Articles/722267/). Is there a better
way to achieve the above w/o needing it?

> +		kfree(new_buf);
> +	} else {
> +		error = table->proc_handler(table, write, buf, &count, ppos);
> +	}
> +
>  	if (!error)
> -		error = res;
> +		error = count;
>  out:
>  	sysctl_head_finish(head);
>  
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index b1c45da20a26..1e97271f9a10 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -113,7 +113,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
>  
>  int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
>  				   struct ctl_table *table, int write,
> -				   enum bpf_attach_type type);
> +				   void __user *buf, size_t *pcount,
> +				   void **new_buf, enum bpf_attach_type type);
>  
>  static inline enum bpf_cgroup_storage_type cgroup_storage_type(
>  	struct bpf_map *map)
> @@ -261,11 +262,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
>  })
>  
>  
> -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write)			       \
> +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, nbuf)       \
>  ({									       \
>  	int __ret = 0;							       \
>  	if (cgroup_bpf_enabled)						       \
>  		__ret = __cgroup_bpf_run_filter_sysctl(head, table, write,     \
> +						       buf, count, nbuf,       \
>  						       BPF_CGROUP_SYSCTL);     \
>  	__ret;								       \
>  })
> @@ -338,7 +340,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
>  #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
>  #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
>  #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
> -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) ({ 0; })
> +#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,nbuf) ({ 0; })
>  
>  #define for_each_cgroup_storage_type(stype) for (; false; )
>
Andrey Ignatov April 5, 2019, 12:20 a.m. UTC | #2
Daniel Borkmann <daniel@iogearbox.net> [Thu, 2019-04-04 07:38 -0700]:
> On 03/26/2019 01:43 AM, Andrey Ignatov wrote:
> > Add helpers to work with new value being written to sysctl by user
> > space.
> > 
> > bpf_sysctl_get_new_value() copies value being written to sysctl into
> > provided buffer.
> > 
> > bpf_sysctl_set_new_value() overrides new value being written by user
> > space with a one from provided buffer. Buffer should contain string
> > representation of the value, similar to what can be seen in /proc/sys/.
> > 
> > Both helpers can be used only on sysctl write.
> > 
> > File position matters and can be managed by an interface that will be
> > introduced separately. E.g. if user space calls sys_write to a file in
> > /proc/sys/ at file position = X, where X > 0, then the value set by
> > bpf_sysctl_set_new_value() will be written starting from X. If program
> > wants to override whole value with specified buffer, file position has
> > to be set to zero.
> > 
> > Documentation for the new helpers is provided in bpf.h UAPI.
> > 
> > Signed-off-by: Andrey Ignatov <rdna@fb.com>
> > ---
> >  fs/proc/proc_sysctl.c      | 22 ++++++++---
> >  include/linux/bpf-cgroup.h |  8 ++--
> >  include/linux/filter.h     |  3 ++
> >  include/uapi/linux/bpf.h   | 38 +++++++++++++++++-
> >  kernel/bpf/cgroup.c        | 81 +++++++++++++++++++++++++++++++++++++-
> >  5 files changed, 142 insertions(+), 10 deletions(-)
> > 
> > diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
> > index 72f4a096c146..4d1ab22774f7 100644
> > --- a/fs/proc/proc_sysctl.c
> > +++ b/fs/proc/proc_sysctl.c
> > @@ -570,8 +570,8 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
> >  	struct inode *inode = file_inode(filp);
> >  	struct ctl_table_header *head = grab_header(inode);
> >  	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
> > +	void *new_buf = NULL;
> >  	ssize_t error;
> > -	size_t res;
> >  
> >  	if (IS_ERR(head))
> >  		return PTR_ERR(head);
> > @@ -589,15 +589,27 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
> >  	if (!table->proc_handler)
> >  		goto out;
> >  
> > -	error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write);
> > +	error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count,
> > +					   &new_buf);
> >  	if (error)
> >  		goto out;
> >  
> >  	/* careful: calling conventions are nasty here */
> > -	res = count;
> > -	error = table->proc_handler(table, write, buf, &res, ppos);
> > +	if (new_buf) {
> > +		mm_segment_t old_fs;
> > +
> > +		old_fs = get_fs();
> > +		set_fs(KERNEL_DS);
> > +		error = table->proc_handler(table, write, (void __user *)new_buf,
> > +					    &count, ppos);
> > +		set_fs(old_fs);
> 
> From quick glance on the set, the above stood out. Afaik, there is an ongoing
> effort by Al and other fs/core folks (as visible in the git log) to get rid of
> set_fs() calls in the tree with the goal of eliminating this interface /entirely/
> (more context on 'why' here: https://urldefense.proofpoint.com/v2/url?u=https-3A__lwn.net_Articles_722267_&d=DwICaQ&c=5VD0RTtNlTh3ycd41b3MUw&r=3jAokpHyGuCuJ834j-tttQ&m=fmn6jd1czDvp5a6GeSw0zLMxU3VRcgm1ohqwAPOKf38&s=AfbvJ91arUzm328cKzcHXeeb104boAx8NJjsoIU6Lbk&e=). Is there a better
> way to achieve the above w/o needing it?

That's a good question. I've spent quite a lot of time looking for a
better way, and the only one I'm aware of so far is to change the
proc_handler signature so that it accepts a kernel buffer, with copying
between user and kernel space happening outside of proc_handler.

But it would require changing all proc_handler implementations as well,
so that they accept a kernel buffer and don't copy data from/to user
space by themselves, and there are just too many sysctl proc_handler
implementations:

  % git grep -E '\.proc_handler\s+=\s+' | \
  	sed -Ee 's/^.*\.proc_handler\s+=\s+//' | sort -u | wc -l
  179
  % git grep -lE '\.proc_handler\s+=\s+' | wc -l
  103


That is, it's a huge refactoring that could be really hard to upstream.

Also I looked at the LWN article you mentioned and found this branch
that cleans up set_fs use cases:
http://git.infradead.org/users/hch/vfs.git/shortlog/refs/heads/setfs-elimination

I see it uses a similar approach, i.e. introducing a separate function
(or reusing an available one) that accepts a kernel buffer, then copying
data from user space and passing it to such a function. Another approach
I see in the branch is to use iovec iterators (mentioned in the LWN
article as well), but that again would require changing all proc_handler
implementations.

That being said, I don't know a better way to do this without huge
refactoring.


> > +		kfree(new_buf);
> > +	} else {
> > +		error = table->proc_handler(table, write, buf, &count, ppos);
> > +	}
> > +
> >  	if (!error)
> > -		error = res;
> > +		error = count;
> >  out:
> >  	sysctl_head_finish(head);
> >  
> > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> > index b1c45da20a26..1e97271f9a10 100644
> > --- a/include/linux/bpf-cgroup.h
> > +++ b/include/linux/bpf-cgroup.h
> > @@ -113,7 +113,8 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
> >  
> >  int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
> >  				   struct ctl_table *table, int write,
> > -				   enum bpf_attach_type type);
> > +				   void __user *buf, size_t *pcount,
> > +				   void **new_buf, enum bpf_attach_type type);
> >  
> >  static inline enum bpf_cgroup_storage_type cgroup_storage_type(
> >  	struct bpf_map *map)
> > @@ -261,11 +262,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
> >  })
> >  
> >  
> > -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write)			       \
> > +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, nbuf)       \
> >  ({									       \
> >  	int __ret = 0;							       \
> >  	if (cgroup_bpf_enabled)						       \
> >  		__ret = __cgroup_bpf_run_filter_sysctl(head, table, write,     \
> > +						       buf, count, nbuf,       \
> >  						       BPF_CGROUP_SYSCTL);     \
> >  	__ret;								       \
> >  })
> > @@ -338,7 +340,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
> >  #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
> >  #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
> >  #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
> > -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) ({ 0; })
> > +#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,nbuf) ({ 0; })
> >  
> >  #define for_each_cgroup_storage_type(stype) for (; false; )
> >
diff mbox series

Patch

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 72f4a096c146..4d1ab22774f7 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -570,8 +570,8 @@  static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 	struct inode *inode = file_inode(filp);
 	struct ctl_table_header *head = grab_header(inode);
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	void *new_buf = NULL;
 	ssize_t error;
-	size_t res;
 
 	if (IS_ERR(head))
 		return PTR_ERR(head);
@@ -589,15 +589,27 @@  static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
 	if (!table->proc_handler)
 		goto out;
 
-	error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write);
+	error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count,
+					   &new_buf);
 	if (error)
 		goto out;
 
 	/* careful: calling conventions are nasty here */
-	res = count;
-	error = table->proc_handler(table, write, buf, &res, ppos);
+	if (new_buf) {
+		mm_segment_t old_fs;
+
+		old_fs = get_fs();
+		set_fs(KERNEL_DS);
+		error = table->proc_handler(table, write, (void __user *)new_buf,
+					    &count, ppos);
+		set_fs(old_fs);
+		kfree(new_buf);
+	} else {
+		error = table->proc_handler(table, write, buf, &count, ppos);
+	}
+
 	if (!error)
-		error = res;
+		error = count;
 out:
 	sysctl_head_finish(head);
 
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index b1c45da20a26..1e97271f9a10 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -113,7 +113,8 @@  int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 
 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 				   struct ctl_table *table, int write,
-				   enum bpf_attach_type type);
+				   void __user *buf, size_t *pcount,
+				   void **new_buf, enum bpf_attach_type type);
 
 static inline enum bpf_cgroup_storage_type cgroup_storage_type(
 	struct bpf_map *map)
@@ -261,11 +262,12 @@  int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
 })
 
 
-#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write)			       \
+#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, nbuf)       \
 ({									       \
 	int __ret = 0;							       \
 	if (cgroup_bpf_enabled)						       \
 		__ret = __cgroup_bpf_run_filter_sysctl(head, table, write,     \
+						       buf, count, nbuf,       \
 						       BPF_CGROUP_SYSCTL);     \
 	__ret;								       \
 })
@@ -338,7 +340,7 @@  static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
 #define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
-#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,nbuf) ({ 0; })
 
 #define for_each_cgroup_storage_type(stype) for (; false; )
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index f254ff92819f..a23653f9460c 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1184,6 +1184,9 @@  struct bpf_sysctl_kern {
 	struct ctl_table *table;
 	void *cur_val;
 	size_t cur_len;
+	void *new_val;
+	size_t new_len;
+	int new_updated;
 	int write;
 };
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 481e66cce8dc..fed5b605449a 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2514,6 +2514,40 @@  union bpf_attr {
  *
  *		**-EINVAL** if current value was unavailable, e.g. because
  *		sysctl is uninitialized and read returns -EIO for it.
+ *
+ * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len)
+ *	Description
+ *		Get new value being written by user space to sysctl (before
+ *		the actual write happens) and copy it as a string into
+ *		provided by program buffer *buf* of size *buf_len*.
+ *
+ *		User space may write new value at file position > 0.
+ *
+ *		The buffer is always NUL terminated, unless it's zero-sized.
+ *	Return
+ *		Number of character copied (not including the trailing NUL).
+ *
+ *		**-E2BIG** if the buffer wasn't big enough (*buf* will contain
+ *		truncated name in this case).
+ *
+ *		**-EINVAL** if sysctl is being read.
+ *
+ * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len)
+ *	Description
+ *		Override new value being written by user space to sysctl with
+ *		value provided by program in buffer *buf* of size *buf_len*.
+ *
+ *		*buf* should contain a string in same form as provided by user
+ *		space on sysctl write.
+ *
+ *		User space may write new value at file position > 0. To override
+ *		the whole sysctl value file position should be set to zero.
+ *	Return
+ *		0 on success.
+ *
+ *		**-E2BIG** if the *buf_len* is too big.
+ *
+ *		**-EINVAL** if sysctl is being read.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -2618,7 +2652,9 @@  union bpf_attr {
 	FN(skc_lookup_tcp),		\
 	FN(tcp_check_syncookie),	\
 	FN(sysctl_get_name),		\
-	FN(sysctl_get_current_value),
+	FN(sysctl_get_current_value),	\
+	FN(sysctl_get_new_value),	\
+	FN(sysctl_set_new_value),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index c6b2cf29a54b..ba4e21986760 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -778,6 +778,13 @@  const struct bpf_verifier_ops cg_dev_verifier_ops = {
  * @head: sysctl table header
  * @table: sysctl table
  * @write: sysctl is being read (= 0) or written (= 1)
+ * @buf: pointer to buffer passed by user space
+ * @pcount: value-result argument: value is size of buffer pointed to by @buf,
+ *	result is size of @new_buf if program set new value, initial value
+ *	otherwise
+ * @new_buf: pointer to pointer to new buffer that will be allocated if program
+ *	overrides new value provided by user space on sysctl write
+ *	NOTE: it's caller responsibility to free *new_buf if it was set
  * @type: type of program to be executed
  *
  * Program is run when sysctl is being accessed, either read or written, and
@@ -788,7 +795,8 @@  const struct bpf_verifier_ops cg_dev_verifier_ops = {
  */
 int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 				   struct ctl_table *table, int write,
-				   enum bpf_attach_type type)
+				   void __user *buf, size_t *pcount,
+				   void **new_buf, enum bpf_attach_type type)
 {
 	struct bpf_sysctl_kern ctx = {
 		.head = head,
@@ -796,6 +804,9 @@  int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 		.write = write,
 		.cur_val = NULL,
 		.cur_len = PAGE_SIZE,
+		.new_val = NULL,
+		.new_len = 0,
+		.new_updated = 0,
 	};
 	struct cgroup *cgrp;
 	int ret;
@@ -818,6 +829,18 @@  int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 		ctx.cur_len = 0;
 	}
 
+	if (write && buf && *pcount) {
+		/* BPF program should be able to override new value with a
+		 * buffer bigger than provided by user.
+		 */
+		ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL);
+		ctx.new_len = min(PAGE_SIZE, *pcount);
+		if (!ctx.new_val ||
+		    copy_from_user(ctx.new_val, buf, ctx.new_len))
+			/* Let BPF program decide how to proceed. */
+			ctx.new_len = 0;
+	}
+
 	rcu_read_lock();
 	cgrp = task_dfl_cgroup(current);
 	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN);
@@ -825,6 +848,13 @@  int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 
 	kfree(ctx.cur_val);
 
+	if (ret == 1 && ctx.new_updated) {
+		*new_buf = ctx.new_val;
+		*pcount = ctx.new_len;
+	} else {
+		kfree(ctx.new_val);
+	}
+
 	return ret == 1 ? 0 : -EPERM;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
@@ -932,6 +962,51 @@  static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = {
 	.arg3_type	= ARG_CONST_SIZE,
 };
 
+BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf,
+	   size_t, buf_len)
+{
+	if (!ctx->write) {
+		if (buf && buf_len)
+			memset(buf, '\0', buf_len);
+		return -EINVAL;
+	}
+	return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len);
+}
+
+static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = {
+	.func		= bpf_sysctl_get_new_value,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+
+BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx,
+	   const char *, buf, size_t, buf_len)
+{
+	if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len)
+		return -EINVAL;
+
+	if (buf_len > PAGE_SIZE - 1)
+		return -E2BIG;
+
+	memcpy(ctx->new_val, buf, buf_len);
+	ctx->new_len = buf_len;
+	ctx->new_updated = 1;
+
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
+	.func		= bpf_sysctl_set_new_value,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+
 static const struct bpf_func_proto *
 sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -940,6 +1015,10 @@  sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_sysctl_get_name_proto;
 	case BPF_FUNC_sysctl_get_current_value:
 		return &bpf_sysctl_get_current_value_proto;
+	case BPF_FUNC_sysctl_get_new_value:
+		return &bpf_sysctl_get_new_value_proto;
+	case BPF_FUNC_sysctl_set_new_value:
+		return &bpf_sysctl_set_new_value_proto;
 	default:
 		return cgroup_base_func_proto(func_id, prog);
 	}