
[v8,bpf-next,1/3] bpf: sharing bpf runtime stats with BPF_ENABLE_STATS

Message ID: 20200429064543.634465-2-songliubraving@fb.com
State: Changes Requested
Delegated to: BPF Maintainers
Series: bpf: sharing bpf runtime stats with BPF_ENABLE_STATS

Commit Message

Song Liu April 29, 2020, 6:45 a.m. UTC
Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats.
Typical userspace tools use kernel.bpf_stats_enabled as follows (a minimal
sketch of this flow is shown after the list):

  1. Enable kernel.bpf_stats_enabled;
  2. Check program run_time_ns;
  3. Sleep for the monitoring period;
  4. Check program run_time_ns again, calculate the difference;
  5. Disable kernel.bpf_stats_enabled.
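
A minimal sketch of this sysctl-based flow, assuming prog_fd was obtained
elsewhere (e.g. via BPF_PROG_GET_FD_BY_ID); the helper names are illustrative
and error handling is trimmed:

#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Toggle the global knob; every tool on the system shares this state. */
static void set_bpf_stats_enabled(int on)
{
	int fd = open("/proc/sys/kernel/bpf_stats_enabled", O_WRONLY);

	if (fd >= 0) {
		write(fd, on ? "1" : "0", 1);
		close(fd);
	}
}

/* Read the program's accumulated run time via BPF_OBJ_GET_INFO_BY_FD. */
static unsigned long long prog_run_time_ns(int prog_fd)
{
	struct bpf_prog_info info = {};
	union bpf_attr attr = {};

	attr.info.bpf_fd = prog_fd;
	attr.info.info_len = sizeof(info);
	attr.info.info = (unsigned long long)(unsigned long)&info;
	if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
		return 0;
	return info.run_time_ns;
}

/* Steps 1-5: enable, sample, sleep, sample again, disable. */
static unsigned long long sample_delta_ns(int prog_fd, unsigned int secs)
{
	unsigned long long t0, t1;

	set_bpf_stats_enabled(1);
	t0 = prog_run_time_ns(prog_fd);
	sleep(secs);
	t1 = prog_run_time_ns(prog_fd);
	set_bpf_stats_enabled(0);
	return t1 - t0;
}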

The problem with this approach is that only one userspace tool can toggle
this sysctl. If multiple tools toggle the sysctl at the same time, the
measurement may be inaccurate.

To fix this problem while keeping backward compatibility, introduce a new
bpf command BPF_ENABLE_STATS. On success, this command enables stats and
returns a valid fd. BPF_ENABLE_STATS takes argument "type". Currently,
only one type, BPF_STATS_RUN_TIME, is supported. We can extend the
command to support other types of stats in the future.

With BPF_ENABLE_STATS, a userspace tool would have the following flow
(sketched in code after the list):

  1. Get a fd with BPF_ENABLE_STATS, and make sure it is valid;
  2. Check program run_time_ns;
  3. Sleep for the monitoring period;
  4. Check program run_time_ns again, calculate the difference;
  5. Close the fd.
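
With headers that include the uapi additions from this patch, the same
measurement can be sketched as below; prog_run_time_ns() is the illustrative
helper from the earlier sketch, and the other function names are again made up:

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Stats stay enabled for as long as the returned fd is kept open. */
static int enable_run_time_stats(void)
{
	union bpf_attr attr = {};

	attr.enable_stats.type = BPF_STATS_RUN_TIME;
	return syscall(__NR_bpf, BPF_ENABLE_STATS, &attr, sizeof(attr));
}

static unsigned long long sample_delta_ns(int prog_fd, unsigned int secs)
{
	unsigned long long t0, t1;
	int fd = enable_run_time_stats();	/* step 1 */

	if (fd < 0)
		return 0;
	t0 = prog_run_time_ns(prog_fd);		/* step 2 */
	sleep(secs);				/* step 3 */
	t1 = prog_run_time_ns(prog_fd);		/* step 4 */
	close(fd);				/* step 5 */
	return t1 - t0;
}

Each tool holds its own fd, and stats remain enabled while at least one such
fd (or the sysctl) keeps the static key raised, so concurrent tools no longer
disturb each other's measurements.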

Signed-off-by: Song Liu <songliubraving@fb.com>
---
 include/linux/bpf.h            |  1 +
 include/uapi/linux/bpf.h       | 11 +++++++
 kernel/bpf/syscall.c           | 57 ++++++++++++++++++++++++++++++++++
 kernel/sysctl.c                | 36 ++++++++++++++++++++-
 tools/include/uapi/linux/bpf.h | 11 +++++++
 5 files changed, 115 insertions(+), 1 deletion(-)

Comments

Daniel Borkmann April 29, 2020, 11:33 p.m. UTC | #1
On 4/29/20 8:45 AM, Song Liu wrote:
> Currently, sysctl kernel.bpf_stats_enabled controls BPF runtime stats.
> Typical userspace tools use kernel.bpf_stats_enabled as follows:
> 
>    1. Enable kernel.bpf_stats_enabled;
>    2. Check program run_time_ns;
>    3. Sleep for the monitoring period;
>    4. Check program run_time_ns again, calculate the difference;
>    5. Disable kernel.bpf_stats_enabled.
> 
> The problem with this approach is that only one userspace tool can toggle
> this sysctl. If multiple tools toggle the sysctl at the same time, the
> measurement may be inaccurate.
> 
> To fix this problem while keeping backward compatibility, introduce a new
> bpf command BPF_ENABLE_STATS. On success, this command enables stats and
> returns a valid fd. BPF_ENABLE_STATS takes argument "type". Currently,
> only one type, BPF_STATS_RUN_TIME, is supported. We can extend the
> command to support other types of stats in the future.
> 
> With BPF_ENABLE_STATS, a userspace tool would have the following flow:
> 
>    1. Get a fd with BPF_ENABLE_STATS, and make sure it is valid;
>    2. Check program run_time_ns;
>    3. Sleep for the monitoring period;
>    4. Check program run_time_ns again, calculate the difference;
>    5. Close the fd.
> 
> Signed-off-by: Song Liu <songliubraving@fb.com>
[...]
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index d23c04cbe14f..8691b2cc550d 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -3872,6 +3872,60 @@ static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
>   	return fd;
>   }
>   
> +DEFINE_MUTEX(bpf_stats_enabled_mutex);
> +
> +static int bpf_stats_release(struct inode *inode, struct file *file)
> +{
> +	mutex_lock(&bpf_stats_enabled_mutex);
> +	static_key_slow_dec(&bpf_stats_enabled_key.key);
> +	mutex_unlock(&bpf_stats_enabled_mutex);
> +	return 0;
> +}
> +
> +static const struct file_operations bpf_stats_fops = {
> +	.release = bpf_stats_release,
> +};
> +
> +static int bpf_enable_runtime_stats(void)
> +{
> +	int fd;
> +
> +	mutex_lock(&bpf_stats_enabled_mutex);
> +
> +	/* Set a very high limit to avoid overflow */
> +	if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
> +		mutex_unlock(&bpf_stats_enabled_mutex);
> +		return -EBUSY;
> +	}
> +
> +	fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, 0);

Missing O_CLOEXEC or intentional (if latter, I'd have expected a comment
here though)?

> +	if (fd >= 0)
> +		static_key_slow_inc(&bpf_stats_enabled_key.key);
> +
> +	mutex_unlock(&bpf_stats_enabled_mutex);
> +	return fd;
> +}
> +
> +#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
> +
[...]
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index e961286d0e14..af08ef0690cb 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -201,6 +201,40 @@ static int max_extfrag_threshold = 1000;
>   
>   #endif /* CONFIG_SYSCTL */
>   
> +#ifdef CONFIG_BPF_SYSCALL
> +static int bpf_stats_handler(struct ctl_table *table, int write,
> +			     void __user *buffer, size_t *lenp,
> +			     loff_t *ppos)
> +{
> +	struct static_key *key = (struct static_key *)table->data;
> +	static int saved_val;
> +	int val, ret;
> +	struct ctl_table tmp = {
> +		.data   = &val,
> +		.maxlen = sizeof(val),
> +		.mode   = table->mode,
> +		.extra1 = SYSCTL_ZERO,
> +		.extra2 = SYSCTL_ONE,
> +	};
> +
> +	if (write && !capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	mutex_lock(&bpf_stats_enabled_mutex);
> +	val = saved_val;
> +	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
> +	if (write && !ret && val != saved_val) {
> +		if (val)
> +			static_key_slow_inc(key);
> +		else
> +			static_key_slow_dec(key);
> +		saved_val = val;
> +	}
> +	mutex_unlock(&bpf_stats_enabled_mutex);
> +	return ret;
> +}

nit: I wonder if most of the logic could have been shared with
proc_do_static_key() here and only the mutex passed as an arg to
the common helper?

> +#endif
> +
>   /*
>    * /proc/sys support
>    */
> @@ -2549,7 +2583,7 @@ static struct ctl_table kern_table[] = {
>   		.data		= &bpf_stats_enabled_key.key,
>   		.maxlen		= sizeof(bpf_stats_enabled_key),
>   		.mode		= 0644,

Song Liu April 30, 2020, 12:28 a.m. UTC | #2
> On Apr 29, 2020, at 4:33 PM, Daniel Borkmann <daniel@iogearbox.net> wrote:
> 
>> 
[...]

>> +
>> +	fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, 0);
> 
> Missing O_CLOEXEC or intentional (if latter, I'd have expected a comment
> here though)?

Yeah, we should have O_CLOEXEC here. Will fix (unless you want to fix it at
commit time).
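
That is, the call would become (sketch of the planned fix):

	fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, O_CLOEXEC);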

> 
>> +	if (fd >= 0)
>> +		static_key_slow_inc(&bpf_stats_enabled_key.key);
>> +
>> +	mutex_unlock(&bpf_stats_enabled_mutex);
>> +	return fd;
>> +}
>> +
>> +#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
>> +
> [...]
>> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
>> index e961286d0e14..af08ef0690cb 100644
>> --- a/kernel/sysctl.c
>> +++ b/kernel/sysctl.c
>> @@ -201,6 +201,40 @@ static int max_extfrag_threshold = 1000;
>>    #endif /* CONFIG_SYSCTL */
>>  +#ifdef CONFIG_BPF_SYSCALL
>> +static int bpf_stats_handler(struct ctl_table *table, int write,
>> +			     void __user *buffer, size_t *lenp,
>> +			     loff_t *ppos)
>> +{
>> +	struct static_key *key = (struct static_key *)table->data;
>> +	static int saved_val;
>> +	int val, ret;
>> +	struct ctl_table tmp = {
>> +		.data   = &val,
>> +		.maxlen = sizeof(val),
>> +		.mode   = table->mode,
>> +		.extra1 = SYSCTL_ZERO,
>> +		.extra2 = SYSCTL_ONE,
>> +	};
>> +
>> +	if (write && !capable(CAP_SYS_ADMIN))
>> +		return -EPERM;
>> +
>> +	mutex_lock(&bpf_stats_enabled_mutex);
>> +	val = saved_val;
>> +	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
>> +	if (write && !ret && val != saved_val) {
>> +		if (val)
>> +			static_key_slow_inc(key);
>> +		else
>> +			static_key_slow_dec(key);
>> +		saved_val = val;
>> +	}
>> +	mutex_unlock(&bpf_stats_enabled_mutex);
>> +	return ret;
>> +}
> 
> nit: I wonder if most of the logic could have been shared with
> proc_do_static_key() here and only the mutex passed as an arg to
> the common helper?

We have a static saved_val here, so it is not easy to share the logic.
I think it is cleaner with separate functions.

Thanks,
Song
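
For context on the nit above about sharing logic with proc_do_static_key():
a purely hypothetical shared helper might look like the sketch below, with the
mutex and the saved value threaded through as parameters. This code is not
proposed anywhere in the thread; it only illustrates the extra state such a
helper would have to carry:

/* Hypothetical shared helper; not part of the patch or the review. */
static int do_static_key_sysctl(struct ctl_table *table, int write,
				void __user *buffer, size_t *lenp,
				loff_t *ppos, struct mutex *lock,
				int *saved_val)
{
	struct static_key *key = (struct static_key *)table->data;
	int val, ret;
	struct ctl_table tmp = {
		.data   = &val,
		.maxlen = sizeof(val),
		.mode   = table->mode,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,
	};

	if (write && !capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (lock)
		mutex_lock(lock);
	val = *saved_val;
	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
	if (write && !ret && val != *saved_val) {
		if (val)
			static_key_slow_inc(key);
		else
			static_key_slow_dec(key);
		*saved_val = val;
	}
	if (lock)
		mutex_unlock(lock);
	return ret;
}

The per-caller saved_val is exactly the state that makes the sharing awkward,
as noted above.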

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index c07b1d2f3824..1262ec460ab3 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -987,6 +987,7 @@  _out:							\
 
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
+extern struct mutex bpf_stats_enabled_mutex;
 
 /*
  * Block execution of BPF programs attached to instrumentation (perf,
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 0eccafae55bb..7d6024554f57 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -115,6 +115,7 @@  enum bpf_cmd {
 	BPF_LINK_UPDATE,
 	BPF_LINK_GET_FD_BY_ID,
 	BPF_LINK_GET_NEXT_ID,
+	BPF_ENABLE_STATS,
 };
 
 enum bpf_map_type {
@@ -390,6 +391,12 @@  enum {
  */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
+/* type for BPF_ENABLE_STATS */
+enum bpf_stats_type {
+	/* enabled run_time_ns and run_cnt */
+	BPF_STATS_RUN_TIME = 0,
+};
+
 enum bpf_stack_build_id_status {
 	/* user space need an empty entry to identify end of a trace */
 	BPF_STACK_BUILD_ID_EMPTY = 0,
@@ -601,6 +608,10 @@  union bpf_attr {
 		__u32		old_prog_fd;
 	} link_update;
 
+	struct { /* struct used by BPF_ENABLE_STATS command */
+		__u32		type;
+	} enable_stats;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index d23c04cbe14f..8691b2cc550d 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3872,6 +3872,60 @@  static int bpf_link_get_fd_by_id(const union bpf_attr *attr)
 	return fd;
 }
 
+DEFINE_MUTEX(bpf_stats_enabled_mutex);
+
+static int bpf_stats_release(struct inode *inode, struct file *file)
+{
+	mutex_lock(&bpf_stats_enabled_mutex);
+	static_key_slow_dec(&bpf_stats_enabled_key.key);
+	mutex_unlock(&bpf_stats_enabled_mutex);
+	return 0;
+}
+
+static const struct file_operations bpf_stats_fops = {
+	.release = bpf_stats_release,
+};
+
+static int bpf_enable_runtime_stats(void)
+{
+	int fd;
+
+	mutex_lock(&bpf_stats_enabled_mutex);
+
+	/* Set a very high limit to avoid overflow */
+	if (static_key_count(&bpf_stats_enabled_key.key) > INT_MAX / 2) {
+		mutex_unlock(&bpf_stats_enabled_mutex);
+		return -EBUSY;
+	}
+
+	fd = anon_inode_getfd("bpf-stats", &bpf_stats_fops, NULL, 0);
+	if (fd >= 0)
+		static_key_slow_inc(&bpf_stats_enabled_key.key);
+
+	mutex_unlock(&bpf_stats_enabled_mutex);
+	return fd;
+}
+
+#define BPF_ENABLE_STATS_LAST_FIELD enable_stats.type
+
+static int bpf_enable_stats(union bpf_attr *attr)
+{
+
+	if (CHECK_ATTR(BPF_ENABLE_STATS))
+		return -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	switch (attr->enable_stats.type) {
+	case BPF_STATS_RUN_TIME:
+		return bpf_enable_runtime_stats();
+	default:
+		break;
+	}
+	return -EINVAL;
+}
+
 SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
 {
 	union bpf_attr attr;
@@ -3996,6 +4050,9 @@  SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 		err = bpf_obj_get_next_id(&attr, uattr,
 					  &link_idr, &link_idr_lock);
 		break;
+	case BPF_ENABLE_STATS:
+		err = bpf_enable_stats(&attr);
+		break;
 	default:
 		err = -EINVAL;
 		break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e961286d0e14..af08ef0690cb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -201,6 +201,40 @@  static int max_extfrag_threshold = 1000;
 
 #endif /* CONFIG_SYSCTL */
 
+#ifdef CONFIG_BPF_SYSCALL
+static int bpf_stats_handler(struct ctl_table *table, int write,
+			     void __user *buffer, size_t *lenp,
+			     loff_t *ppos)
+{
+	struct static_key *key = (struct static_key *)table->data;
+	static int saved_val;
+	int val, ret;
+	struct ctl_table tmp = {
+		.data   = &val,
+		.maxlen = sizeof(val),
+		.mode   = table->mode,
+		.extra1 = SYSCTL_ZERO,
+		.extra2 = SYSCTL_ONE,
+	};
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&bpf_stats_enabled_mutex);
+	val = saved_val;
+	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+	if (write && !ret && val != saved_val) {
+		if (val)
+			static_key_slow_inc(key);
+		else
+			static_key_slow_dec(key);
+		saved_val = val;
+	}
+	mutex_unlock(&bpf_stats_enabled_mutex);
+	return ret;
+}
+#endif
+
 /*
  * /proc/sys support
  */
@@ -2549,7 +2583,7 @@  static struct ctl_table kern_table[] = {
 		.data		= &bpf_stats_enabled_key.key,
 		.maxlen		= sizeof(bpf_stats_enabled_key),
 		.mode		= 0644,
-		.proc_handler	= proc_do_static_key,
+		.proc_handler	= bpf_stats_handler,
 	},
 #endif
 #if defined(CONFIG_TREE_RCU)
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 0eccafae55bb..7d6024554f57 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -115,6 +115,7 @@  enum bpf_cmd {
 	BPF_LINK_UPDATE,
 	BPF_LINK_GET_FD_BY_ID,
 	BPF_LINK_GET_NEXT_ID,
+	BPF_ENABLE_STATS,
 };
 
 enum bpf_map_type {
@@ -390,6 +391,12 @@  enum {
  */
 #define BPF_F_QUERY_EFFECTIVE	(1U << 0)
 
+/* type for BPF_ENABLE_STATS */
+enum bpf_stats_type {
+	/* enabled run_time_ns and run_cnt */
+	BPF_STATS_RUN_TIME = 0,
+};
+
 enum bpf_stack_build_id_status {
 	/* user space need an empty entry to identify end of a trace */
 	BPF_STACK_BUILD_ID_EMPTY = 0,
@@ -601,6 +608,10 @@  union bpf_attr {
 		__u32		old_prog_fd;
 	} link_update;
 
+	struct { /* struct used by BPF_ENABLE_STATS command */
+		__u32		type;
+	} enable_stats;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF