diff mbox series

[RFC,bpf-next,09/16] bpf: add bpf_seq_printf and bpf_seq_write helpers

Message ID 20200408232531.2676134-1-yhs@fb.com
State RFC
Delegated to: BPF Maintainers
Headers show
Series bpf: implement bpf based dumping of kernel data structures | expand

Commit Message

Yonghong Song April 8, 2020, 11:25 p.m. UTC
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.

bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.

For bpf_seq_printf, a return value of 1 specifically indicates
a write failure due to buffer overflow, in order to differentiate
that failure from format-string-related failures (which return a
negative value).

For seq_file show, the show callback may be invoked twice for
the same object, and some bpf_progs might be sensitive to this.
With a return value indicating that an overflow happened, the
bpf program can react differently.

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/uapi/linux/bpf.h       |  18 +++-
 kernel/trace/bpf_trace.c       | 172 +++++++++++++++++++++++++++++++++
 scripts/bpf_helpers_doc.py     |   2 +
 tools/include/uapi/linux/bpf.h |  18 +++-
 4 files changed, 208 insertions(+), 2 deletions(-)

Comments

Alexei Starovoitov April 10, 2020, 3:26 a.m. UTC | #1
On Wed, Apr 08, 2020 at 04:25:31PM -0700, Yonghong Song wrote:
>  
> +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, u64, arg1,
> +	   u64, arg2)
> +{
> +	bool buf_used = false;
> +	int i, copy_size;
> +	int mod[2] = {};
> +	int fmt_cnt = 0;
> +	u64 unsafe_addr;
> +	char buf[64];
> +
> +	/*
> +	 * bpf_check()->check_func_arg()->check_stack_boundary()
> +	 * guarantees that fmt points to bpf program stack,
> +	 * fmt_size bytes of it were initialized and fmt_size > 0
> +	 */
> +	if (fmt[--fmt_size] != 0)
> +		return -EINVAL;
...
> +/* Horrid workaround for getting va_list handling working with different
> + * argument type combinations generically for 32 and 64 bit archs.
> + */
> +#define __BPF_SP_EMIT()	__BPF_ARG2_SP()
> +#define __BPF_SP(...)							\
> +	seq_printf(m, fmt, ##__VA_ARGS__)
> +
> +#define __BPF_ARG1_SP(...)						\
> +	((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64))	\
> +	  ? __BPF_SP(arg1, ##__VA_ARGS__)				\
> +	  : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32))	\
> +	      ? __BPF_SP((long)arg1, ##__VA_ARGS__)			\
> +	      : __BPF_SP((u32)arg1, ##__VA_ARGS__)))
> +
> +#define __BPF_ARG2_SP(...)						\
> +	((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64))	\
> +	  ? __BPF_ARG1_SP(arg2, ##__VA_ARGS__)				\
> +	  : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32))	\
> +	      ? __BPF_ARG1_SP((long)arg2, ##__VA_ARGS__)		\
> +	      : __BPF_ARG1_SP((u32)arg2, ##__VA_ARGS__)))
> +
> +	__BPF_SP_EMIT();
> +	return seq_has_overflowed(m);
> +}

This function is mostly a copy-paste of bpf_trace_printk() with difference
of printing via seq_printf vs __trace_printk.
Please find a way to share the code.
Yonghong Song April 10, 2020, 6:12 a.m. UTC | #2
On 4/9/20 8:26 PM, Alexei Starovoitov wrote:
> On Wed, Apr 08, 2020 at 04:25:31PM -0700, Yonghong Song wrote:
>>   
>> +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, u64, arg1,
>> +	   u64, arg2)
>> +{
>> +	bool buf_used = false;
>> +	int i, copy_size;
>> +	int mod[2] = {};
>> +	int fmt_cnt = 0;
>> +	u64 unsafe_addr;
>> +	char buf[64];
>> +
>> +	/*
>> +	 * bpf_check()->check_func_arg()->check_stack_boundary()
>> +	 * guarantees that fmt points to bpf program stack,
>> +	 * fmt_size bytes of it were initialized and fmt_size > 0
>> +	 */
>> +	if (fmt[--fmt_size] != 0)
>> +		return -EINVAL;
> ...
>> +/* Horrid workaround for getting va_list handling working with different
>> + * argument type combinations generically for 32 and 64 bit archs.
>> + */
>> +#define __BPF_SP_EMIT()	__BPF_ARG2_SP()
>> +#define __BPF_SP(...)							\
>> +	seq_printf(m, fmt, ##__VA_ARGS__)
>> +
>> +#define __BPF_ARG1_SP(...)						\
>> +	((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64))	\
>> +	  ? __BPF_SP(arg1, ##__VA_ARGS__)				\
>> +	  : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32))	\
>> +	      ? __BPF_SP((long)arg1, ##__VA_ARGS__)			\
>> +	      : __BPF_SP((u32)arg1, ##__VA_ARGS__)))
>> +
>> +#define __BPF_ARG2_SP(...)						\
>> +	((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64))	\
>> +	  ? __BPF_ARG1_SP(arg2, ##__VA_ARGS__)				\
>> +	  : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32))	\
>> +	      ? __BPF_ARG1_SP((long)arg2, ##__VA_ARGS__)		\
>> +	      : __BPF_ARG1_SP((u32)arg2, ##__VA_ARGS__)))
>> +
>> +	__BPF_SP_EMIT();
>> +	return seq_has_overflowed(m);
>> +}
> 
> This function is mostly a copy-paste of bpf_trace_printk() with difference
> of printing via seq_printf vs __trace_printk.
> Please find a way to share the code.

Will do.
Andrii Nakryiko April 14, 2020, 5:28 a.m. UTC | #3
On Wed, Apr 8, 2020 at 4:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> Two helpers bpf_seq_printf and bpf_seq_write, are added for
> writing data to the seq_file buffer.
>
> bpf_seq_printf supports common format string flag/width/type
> fields so at least I can get identical results for
> netlink and ipv6_route targets.
>
> For bpf_seq_printf, return value 1 specifically indicates
> a write failure due to overflow in order to differentiate
> the failure from format strings.
>
> For seq_file show, since the same object may be called
> twice, some bpf_prog might be sensitive to this. With return
> value indicating overflow happens the bpf program can
> react differently.
>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/uapi/linux/bpf.h       |  18 +++-
>  kernel/trace/bpf_trace.c       | 172 +++++++++++++++++++++++++++++++++
>  scripts/bpf_helpers_doc.py     |   2 +
>  tools/include/uapi/linux/bpf.h |  18 +++-
>  4 files changed, 208 insertions(+), 2 deletions(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index b51d56fc77f9..a245f0df53c4 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3030,6 +3030,20 @@ union bpf_attr {
>   *             * **-EOPNOTSUPP**       Unsupported operation, for example a
>   *                                     call from outside of TC ingress.
>   *             * **-ESOCKTNOSUPPORT**  Socket type not supported (reuseport).
> + *
> + * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, ...)
> + *     Description
> + *             seq_printf
> + *     Return
> + *             0 if successful, or
> + *             1 if failure due to buffer overflow, or
> + *             a negative value for format string related failures.

This encoding feels a bit arbitrary, why not stick to normal error
codes and return, for example, EAGAIN on overflow (or EOVERFLOW?..)

> + *
> + * int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
> + *     Description
> + *             seq_write
> + *     Return
> + *             0 if successful, non-zero otherwise.

Especially given that bpf_seq_write will probably return <0 on the same error?

>   */
>  #define __BPF_FUNC_MAPPER(FN)          \
>         FN(unspec),                     \
> @@ -3156,7 +3170,9 @@ union bpf_attr {
>         FN(xdp_output),                 \
>         FN(get_netns_cookie),           \
>         FN(get_current_ancestor_cgroup_id),     \
> -       FN(sk_assign),
> +       FN(sk_assign),                  \
> +       FN(seq_printf),                 \
> +       FN(seq_write),
>
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index ca1796747a77..e7d6ba7c9c51 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -457,6 +457,174 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
>         return &bpf_trace_printk_proto;
>  }
>
> +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, u64, arg1,
> +          u64, arg2)
> +{

I honestly didn't dare to look at implementation below, but this
limitation of only up to 2 arguments in bpf_seq_printf (arg1 and arg2)
seem extremely limiting. It might be ok for bpf_printk, but not for
more general and non-debugging bpf_seq_printf.

How about instead of passing arguments as 4th and 5th argument,
bpf_seq_printf would require passing a pointer to a long array, where
each item corresponds to printf argument? So on BPF program side, one
would have to do this, to printf 5 arguments;

long __tmp_arr[] = { 123, pointer_to_str, some_input_int,
some_input_long, 5 * arg_x };
return bpf_seq_printf(m, fmt, fmt_size, &__tmp_arr, sizeof(__tmp_arr));

And the bpf_seq_printf would know that 4th argument is a pointer to an
array of size provided in 5th argument and process them accordingly.
This would theoretically allow to have arbitrary number of arguments.
This local array construction can be abstracted into macro, of course.
Would something like this be possible?

[...]

> +/* Horrid workaround for getting va_list handling working with different
> + * argument type combinations generically for 32 and 64 bit archs.
> + */
> +#define __BPF_SP_EMIT()        __BPF_ARG2_SP()
> +#define __BPF_SP(...)                                                  \
> +       seq_printf(m, fmt, ##__VA_ARGS__)
> +
> +#define __BPF_ARG1_SP(...)                                             \
> +       ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64))        \
> +         ? __BPF_SP(arg1, ##__VA_ARGS__)                               \
> +         : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32))    \
> +             ? __BPF_SP((long)arg1, ##__VA_ARGS__)                     \
> +             : __BPF_SP((u32)arg1, ##__VA_ARGS__)))
> +
> +#define __BPF_ARG2_SP(...)                                             \
> +       ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64))        \
> +         ? __BPF_ARG1_SP(arg2, ##__VA_ARGS__)                          \
> +         : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32))    \
> +             ? __BPF_ARG1_SP((long)arg2, ##__VA_ARGS__)                \
> +             : __BPF_ARG1_SP((u32)arg2, ##__VA_ARGS__)))

hm... wouldn't this make it impossible to print 64-bit numbers on
32-bit arches? It seems to be truncating to 32-bit unconditionally....

> +
> +       __BPF_SP_EMIT();
> +       return seq_has_overflowed(m);
> +}
> +

[...]
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b51d56fc77f9..a245f0df53c4 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3030,6 +3030,20 @@  union bpf_attr {
  *		* **-EOPNOTSUPP**	Unsupported operation, for example a
  *					call from outside of TC ingress.
  *		* **-ESOCKTNOSUPPORT**	Socket type not supported (reuseport).
+ *
+ * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, ...)
+ * 	Description
+ * 		seq_printf
+ * 	Return
+ * 		0 if successful, or
+ * 		1 if failure due to buffer overflow, or
+ * 		a negative value for format string related failures.
+ *
+ * int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
+ * 	Description
+ * 		seq_write
+ * 	Return
+ * 		0 if successful, non-zero otherwise.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3156,7 +3170,9 @@  union bpf_attr {
 	FN(xdp_output),			\
 	FN(get_netns_cookie),		\
 	FN(get_current_ancestor_cgroup_id),	\
-	FN(sk_assign),
+	FN(sk_assign),			\
+	FN(seq_printf),			\
+	FN(seq_write),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1796747a77..e7d6ba7c9c51 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -457,6 +457,174 @@  const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }
 
+BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size, u64, arg1,
+	   u64, arg2)
+{
+	bool buf_used = false;
+	int i, copy_size;
+	int mod[2] = {};
+	int fmt_cnt = 0;
+	u64 unsafe_addr;
+	char buf[64];
+
+	/*
+	 * bpf_check()->check_func_arg()->check_stack_boundary()
+	 * guarantees that fmt points to bpf program stack,
+	 * fmt_size bytes of it were initialized and fmt_size > 0
+	 */
+	if (fmt[--fmt_size] != 0)
+		return -EINVAL;
+
+	/* check format string for allowed specifiers */
+	for (i = 0; i < fmt_size; i++) {
+		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
+			return -EINVAL;
+
+		if (fmt[i] != '%')
+			continue;
+
+		if (fmt_cnt >= 2)
+			return -EINVAL;
+
+		/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
+		i++;
+
+		/* skip optional "[0+-][num]" width formating field */
+		while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-')
+			i++;
+		if (fmt[i] >= '1' && fmt[i] <= '9') {
+			i++;
+			while (fmt[i] >= '0' && fmt[i] <= '9')
+				i++;
+		}
+
+		if (fmt[i] == 'l') {
+			mod[fmt_cnt]++;
+			i++;
+		} else if (fmt[i] == 's') {
+			mod[fmt_cnt]++;
+			fmt_cnt++;
+			/* disallow any further format extensions */
+			if (fmt[i + 1] != 0 &&
+			    !isspace(fmt[i + 1]) &&
+			    !ispunct(fmt[i + 1]))
+				return -EINVAL;
+
+			if (buf_used)
+				/* allow only one '%s'/'%p' per fmt string */
+				return -EINVAL;
+			buf_used = true;
+
+			if (fmt_cnt == 1) {
+				unsafe_addr = arg1;
+				arg1 = (long) buf;
+			} else {
+				unsafe_addr = arg2;
+				arg2 = (long) buf;
+			}
+			buf[0] = 0;
+			strncpy_from_unsafe(buf,
+					    (void *) (long) unsafe_addr,
+					    sizeof(buf));
+			continue;
+		} else if (fmt[i] == 'p') {
+			mod[fmt_cnt]++;
+			fmt_cnt++;
+			if (fmt[i + 1] == 0 ||
+			    fmt[i + 1] == 'K' ||
+			    fmt[i + 1] == 'x') {
+				/* just kernel pointers */
+				continue;
+			}
+
+			if (buf_used)
+				return -EINVAL;
+			buf_used = true;
+
+			/* only support "%pI4", "%pi4", "%pI6" and "pi6". */
+			if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I')
+				return -EINVAL;
+			if (fmt[i + 2] != '4' && fmt[i + 2] != '6')
+				return -EINVAL;
+
+			copy_size = (fmt[i + 2] == '4') ? 4 : 16;
+
+			if (fmt_cnt == 1) {
+				unsafe_addr = arg1;
+				arg1 = (long) buf;
+			} else {
+				unsafe_addr = arg2;
+				arg2 = (long) buf;
+			}
+			probe_kernel_read(buf, (void *) (long) unsafe_addr, copy_size);
+
+			i += 2;
+			continue;
+		}
+
+		if (fmt[i] == 'l') {
+			mod[fmt_cnt]++;
+			i++;
+		}
+
+		if (fmt[i] != 'i' && fmt[i] != 'd' &&
+		    fmt[i] != 'u' && fmt[i] != 'x')
+			return -EINVAL;
+		fmt_cnt++;
+	}
+
+/* Horrid workaround for getting va_list handling working with different
+ * argument type combinations generically for 32 and 64 bit archs.
+ */
+#define __BPF_SP_EMIT()	__BPF_ARG2_SP()
+#define __BPF_SP(...)							\
+	seq_printf(m, fmt, ##__VA_ARGS__)
+
+#define __BPF_ARG1_SP(...)						\
+	((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64))	\
+	  ? __BPF_SP(arg1, ##__VA_ARGS__)				\
+	  : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32))	\
+	      ? __BPF_SP((long)arg1, ##__VA_ARGS__)			\
+	      : __BPF_SP((u32)arg1, ##__VA_ARGS__)))
+
+#define __BPF_ARG2_SP(...)						\
+	((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64))	\
+	  ? __BPF_ARG1_SP(arg2, ##__VA_ARGS__)				\
+	  : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32))	\
+	      ? __BPF_ARG1_SP((long)arg2, ##__VA_ARGS__)		\
+	      : __BPF_ARG1_SP((u32)arg2, ##__VA_ARGS__)))
+
+	__BPF_SP_EMIT();
+	return seq_has_overflowed(m);
+}
+
+static int bpf_seq_printf_btf_ids[5];
+static const struct bpf_func_proto bpf_seq_printf_proto = {
+	.func		= bpf_seq_printf,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.btf_id		= bpf_seq_printf_btf_ids,
+};
+
+BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const char *, data, u32, len)
+{
+	return seq_write(m, data, len);
+}
+
+static int bpf_seq_write_btf_ids[5];
+static const struct bpf_func_proto bpf_seq_write_proto = {
+	.func		= bpf_seq_write,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.btf_id		= bpf_seq_write_btf_ids,
+};
+
 static __always_inline int
 get_map_perf_counter(struct bpf_map *map, u64 flags,
 		     u64 *value, u64 *enabled, u64 *running)
@@ -1224,6 +1392,10 @@  tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_xdp_output:
 		return &bpf_xdp_output_proto;
 #endif
+	case BPF_FUNC_seq_printf:
+		return &bpf_seq_printf_proto;
+	case BPF_FUNC_seq_write:
+		return &bpf_seq_write_proto;
 	default:
 		return raw_tp_prog_func_proto(func_id, prog);
 	}
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index f43d193aff3a..ded304c96a05 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -414,6 +414,7 @@  class PrinterHelpers(Printer):
             'struct sk_reuseport_md',
             'struct sockaddr',
             'struct tcphdr',
+            'struct seq_file',
 
             'struct __sk_buff',
             'struct sk_msg_md',
@@ -450,6 +451,7 @@  class PrinterHelpers(Printer):
             'struct sk_reuseport_md',
             'struct sockaddr',
             'struct tcphdr',
+            'struct seq_file',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index b51d56fc77f9..a245f0df53c4 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3030,6 +3030,20 @@  union bpf_attr {
  *		* **-EOPNOTSUPP**	Unsupported operation, for example a
  *					call from outside of TC ingress.
  *		* **-ESOCKTNOSUPPORT**	Socket type not supported (reuseport).
+ *
+ * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, ...)
+ * 	Description
+ * 		seq_printf
+ * 	Return
+ * 		0 if successful, or
+ * 		1 if failure due to buffer overflow, or
+ * 		a negative value for format string related failures.
+ *
+ * int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
+ * 	Description
+ * 		seq_write
+ * 	Return
+ * 		0 if successful, non-zero otherwise.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3156,7 +3170,9 @@  union bpf_attr {
 	FN(xdp_output),			\
 	FN(get_netns_cookie),		\
 	FN(get_current_ancestor_cgroup_id),	\
-	FN(sk_assign),
+	FN(sk_assign),			\
+	FN(seq_printf),			\
+	FN(seq_write),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call