diff mbox series

[bpf-next,v2,13/20] bpf: add bpf_seq_printf and bpf_seq_write helpers

Message ID 20200504062602.2048597-1-yhs@fb.com
State Changes Requested
Delegated to: BPF Maintainers
Headers show
Series bpf: implement bpf iterator for kernel data | expand

Commit Message

Yonghong Song May 4, 2020, 6:26 a.m. UTC
Two helpers bpf_seq_printf and bpf_seq_write, are added for
writing data to the seq_file buffer.

bpf_seq_printf supports common format string flag/width/type
fields so at least I can get identical results for
netlink and ipv6_route targets.

For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
specifically indicates a write failure due to overflow, which
means the object will be repeated in the next bpf invocation
if object collection stays the same. Note that if the object
collection is changed, depending how collection traversal is
done, even if the object still in the collection, it may not
be visited.

bpf_seq_printf may return -EBUSY meaning that internal percpu
buffer for memory copy of strings or other pointees is
not available. Bpf program can return 1 to indicate it
wants the same object to be repeated. Right now, this should not
happen on no-RT kernels since migrate_enable(), which guards
bpf prog call, calls preempt_enable().

Signed-off-by: Yonghong Song <yhs@fb.com>
---
 include/uapi/linux/bpf.h       |  32 +++++-
 kernel/trace/bpf_trace.c       | 195 +++++++++++++++++++++++++++++++++
 scripts/bpf_helpers_doc.py     |   2 +
 tools/include/uapi/linux/bpf.h |  32 +++++-
 4 files changed, 259 insertions(+), 2 deletions(-)

Comments

Andrii Nakryiko May 6, 2020, 5:37 p.m. UTC | #1
On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>
> Two helpers bpf_seq_printf and bpf_seq_write, are added for
> writing data to the seq_file buffer.
>
> bpf_seq_printf supports common format string flag/width/type
> fields so at least I can get identical results for
> netlink and ipv6_route targets.
>

Does seq_printf() has its own format string specification? Is there
any documentation explaining? I was confused by few different checks
below...

> For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
> specifically indicates a write failure due to overflow, which
> means the object will be repeated in the next bpf invocation
> if object collection stays the same. Note that if the object
> collection is changed, depending how collection traversal is
> done, even if the object still in the collection, it may not
> be visited.
>
> bpf_seq_printf may return -EBUSY meaning that internal percpu
> buffer for memory copy of strings or other pointees is
> not available. Bpf program can return 1 to indicate it
> wants the same object to be repeated. Right now, this should not
> happen on no-RT kernels since migrate_enable(), which guards
> bpf prog call, calls preempt_enable().

You probably meant migrate_disable()/preempt_disable(), right? But
could it still happen, at least due to NMI? E.g., perf_event BPF
program gets triggered during bpf_iter program execution? I think for
perf_event_output function, we have 3 levels, for one of each possible
"contexts"? Should we do something like that here as well?

>
> Signed-off-by: Yonghong Song <yhs@fb.com>
> ---
>  include/uapi/linux/bpf.h       |  32 +++++-
>  kernel/trace/bpf_trace.c       | 195 +++++++++++++++++++++++++++++++++
>  scripts/bpf_helpers_doc.py     |   2 +
>  tools/include/uapi/linux/bpf.h |  32 +++++-
>  4 files changed, 259 insertions(+), 2 deletions(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 97ceb0f2e539..e440a9d5cca2 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -3076,6 +3076,34 @@ union bpf_attr {
>   *             See: clock_gettime(CLOCK_BOOTTIME)
>   *     Return
>   *             Current *ktime*.
> + *

[...]

> +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
> +          const void *, data, u32, data_len)
> +{
> +       int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
> +       int i, buf_used, copy_size, num_args;
> +       u64 params[MAX_SEQ_PRINTF_VARARGS];
> +       struct bpf_seq_printf_buf *bufs;
> +       const u64 *args = data;
> +
> +       buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
> +       if (WARN_ON_ONCE(buf_used > 1)) {
> +               err = -EBUSY;
> +               goto out;
> +       }
> +
> +       bufs = this_cpu_ptr(&bpf_seq_printf_buf);
> +
> +       /*
> +        * bpf_check()->check_func_arg()->check_stack_boundary()
> +        * guarantees that fmt points to bpf program stack,
> +        * fmt_size bytes of it were initialized and fmt_size > 0
> +        */
> +       if (fmt[--fmt_size] != 0)

If we allow fmt_size == 0, this will need to be changed.

> +               goto out;
> +
> +       if (data_len & 7)
> +               goto out;
> +
> +       for (i = 0; i < fmt_size; i++) {
> +               if (fmt[i] == '%' && (!data || !data_len))

So %% escaping is not supported?

> +                       goto out;
> +       }
> +
> +       num_args = data_len / 8;
> +
> +       /* check format string for allowed specifiers */
> +       for (i = 0; i < fmt_size; i++) {
> +               if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))

why these restrictions? are they essential?

> +                       goto out;
> +
> +               if (fmt[i] != '%')
> +                       continue;
> +
> +               if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
> +                       err = -E2BIG;
> +                       goto out;
> +               }
> +
> +               if (fmt_cnt >= num_args)
> +                       goto out;
> +
> +               /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
> +               i++;
> +
> +               /* skip optional "[0+-][num]" width formating field */
> +               while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-')

There could be space as well, as an alternative to 0.

> +                       i++;
> +               if (fmt[i] >= '1' && fmt[i] <= '9') {
> +                       i++;
> +                       while (fmt[i] >= '0' && fmt[i] <= '9')
> +                               i++;
> +               }
> +
> +               if (fmt[i] == 's') {
> +                       /* disallow any further format extensions */
> +                       if (fmt[i + 1] != 0 &&
> +                           !isspace(fmt[i + 1]) &&
> +                           !ispunct(fmt[i + 1]))
> +                               goto out;

I'm not sure I follow this check either. printf("%sbla", "whatever")
is a perfectly fine format string. Unless seq_printf has some
additional restrictions?

> +
> +                       /* try our best to copy */
> +                       if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
> +                               err = -E2BIG;
> +                               goto out;
> +                       }
> +

[...]

> +
> +static int bpf_seq_printf_btf_ids[5];
> +static const struct bpf_func_proto bpf_seq_printf_proto = {
> +       .func           = bpf_seq_printf,
> +       .gpl_only       = true,
> +       .ret_type       = RET_INTEGER,
> +       .arg1_type      = ARG_PTR_TO_BTF_ID,
> +       .arg2_type      = ARG_PTR_TO_MEM,
> +       .arg3_type      = ARG_CONST_SIZE,

It feels like allowing zero shouldn't hurt too much?

> +       .arg4_type      = ARG_PTR_TO_MEM_OR_NULL,
> +       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
> +       .btf_id         = bpf_seq_printf_btf_ids,
> +};
> +
> +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
> +{
> +       return seq_write(m, data, len) ? -EOVERFLOW : 0;
> +}
> +
> +static int bpf_seq_write_btf_ids[5];
> +static const struct bpf_func_proto bpf_seq_write_proto = {
> +       .func           = bpf_seq_write,
> +       .gpl_only       = true,
> +       .ret_type       = RET_INTEGER,
> +       .arg1_type      = ARG_PTR_TO_BTF_ID,
> +       .arg2_type      = ARG_PTR_TO_MEM,
> +       .arg3_type      = ARG_CONST_SIZE,

Same, ARG_CONST_SIZE_OR_ZERO?

> +       .btf_id         = bpf_seq_write_btf_ids,
> +};
> +

[...]
Yonghong Song May 6, 2020, 9:42 p.m. UTC | #2
On 5/6/20 10:37 AM, Andrii Nakryiko wrote:
> On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
>>
>> Two helpers bpf_seq_printf and bpf_seq_write, are added for
>> writing data to the seq_file buffer.
>>
>> bpf_seq_printf supports common format string flag/width/type
>> fields so at least I can get identical results for
>> netlink and ipv6_route targets.
>>
> 
> Does seq_printf() has its own format string specification? Is there
> any documentation explaining? I was confused by few different checks
> below...

Not really. Similar to bpf_trace_printk(), since we need to
parse format string, so we may only support a subset of
what seq_printf() does. But we should not invent new
formats.

> 
>> For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
>> specifically indicates a write failure due to overflow, which
>> means the object will be repeated in the next bpf invocation
>> if object collection stays the same. Note that if the object
>> collection is changed, depending how collection traversal is
>> done, even if the object still in the collection, it may not
>> be visited.
>>
>> bpf_seq_printf may return -EBUSY meaning that internal percpu
>> buffer for memory copy of strings or other pointees is
>> not available. Bpf program can return 1 to indicate it
>> wants the same object to be repeated. Right now, this should not
>> happen on no-RT kernels since migrate_enable(), which guards
>> bpf prog call, calls preempt_enable().
> 
> You probably meant migrate_disable()/preempt_disable(), right? But

Yes, sorry for typo.

> could it still happen, at least due to NMI? E.g., perf_event BPF
> program gets triggered during bpf_iter program execution? I think for
> perf_event_output function, we have 3 levels, for one of each possible
> "contexts"? Should we do something like that here as well?

Currently bpf_seq_printf() and bpf_seq_write() helpers can
only be called by iter bpf programs. The iter bpf program can only
be run on process context as it is triggered by a read() syscall.
So one level should be enough for non-RT kernel.

For RT kernel, migrate_disable does not prevent preemption,
so it is possible task in the middle of bpf_seq_printf() might
be preempted, so I implemented the logic to return -EBUSY.
I think this case should be extremely rare so I only implemented
one level nesting.

> 
>>
>> Signed-off-by: Yonghong Song <yhs@fb.com>
>> ---
>>   include/uapi/linux/bpf.h       |  32 +++++-
>>   kernel/trace/bpf_trace.c       | 195 +++++++++++++++++++++++++++++++++
>>   scripts/bpf_helpers_doc.py     |   2 +
>>   tools/include/uapi/linux/bpf.h |  32 +++++-
>>   4 files changed, 259 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 97ceb0f2e539..e440a9d5cca2 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -3076,6 +3076,34 @@ union bpf_attr {
>>    *             See: clock_gettime(CLOCK_BOOTTIME)
>>    *     Return
>>    *             Current *ktime*.
>> + *
> 
> [...]
> 
>> +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
>> +          const void *, data, u32, data_len)
>> +{
>> +       int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
>> +       int i, buf_used, copy_size, num_args;
>> +       u64 params[MAX_SEQ_PRINTF_VARARGS];
>> +       struct bpf_seq_printf_buf *bufs;
>> +       const u64 *args = data;
>> +
>> +       buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
>> +       if (WARN_ON_ONCE(buf_used > 1)) {
>> +               err = -EBUSY;
>> +               goto out;
>> +       }
>> +
>> +       bufs = this_cpu_ptr(&bpf_seq_printf_buf);
>> +
>> +       /*
>> +        * bpf_check()->check_func_arg()->check_stack_boundary()
>> +        * guarantees that fmt points to bpf program stack,
>> +        * fmt_size bytes of it were initialized and fmt_size > 0
>> +        */
>> +       if (fmt[--fmt_size] != 0)
> 
> If we allow fmt_size == 0, this will need to be changed.

Currently, we do not support fmt_size == 0. Yes, if we allow, this
needs change.

> 
>> +               goto out;
>> +
>> +       if (data_len & 7)
>> +               goto out;
>> +
>> +       for (i = 0; i < fmt_size; i++) {
>> +               if (fmt[i] == '%' && (!data || !data_len))
> 
> So %% escaping is not supported?

Yes, have not seen a need yet my ipv6_route/netlink example.
Can certain add if there is a use case.

> 
>> +                       goto out;
>> +       }
>> +
>> +       num_args = data_len / 8;
>> +
>> +       /* check format string for allowed specifiers */
>> +       for (i = 0; i < fmt_size; i++) {
>> +               if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
> 
> why these restrictions? are they essential?

This is the same restriction in bpf_trace_printk(). I guess the purpose
is to avoid weird print. To promote bpf_iter to dump beyond asscii, I 
guess we can remove this restriction.

> 
>> +                       goto out;
>> +
>> +               if (fmt[i] != '%')
>> +                       continue;
>> +
>> +               if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
>> +                       err = -E2BIG;
>> +                       goto out;
>> +               }
>> +
>> +               if (fmt_cnt >= num_args)
>> +                       goto out;
>> +
>> +               /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
>> +               i++;
>> +
>> +               /* skip optional "[0+-][num]" width formating field */
>> +               while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-')
> 
> There could be space as well, as an alternative to 0.

We can allow space. But '0' is used more common, right?

> 
>> +                       i++;
>> +               if (fmt[i] >= '1' && fmt[i] <= '9') {
>> +                       i++;
>> +                       while (fmt[i] >= '0' && fmt[i] <= '9')
>> +                               i++;
>> +               }
>> +
>> +               if (fmt[i] == 's') {
>> +                       /* disallow any further format extensions */
>> +                       if (fmt[i + 1] != 0 &&
>> +                           !isspace(fmt[i + 1]) &&
>> +                           !ispunct(fmt[i + 1]))
>> +                               goto out;
> 
> I'm not sure I follow this check either. printf("%sbla", "whatever")
> is a perfectly fine format string. Unless seq_printf has some
> additional restrictions?

Yes, just some restriction inherited from bpf_trace_printk().
Will remove.

> 
>> +
>> +                       /* try our best to copy */
>> +                       if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
>> +                               err = -E2BIG;
>> +                               goto out;
>> +                       }
>> +
> 
> [...]
> 
>> +
>> +static int bpf_seq_printf_btf_ids[5];
>> +static const struct bpf_func_proto bpf_seq_printf_proto = {
>> +       .func           = bpf_seq_printf,
>> +       .gpl_only       = true,
>> +       .ret_type       = RET_INTEGER,
>> +       .arg1_type      = ARG_PTR_TO_BTF_ID,
>> +       .arg2_type      = ARG_PTR_TO_MEM,
>> +       .arg3_type      = ARG_CONST_SIZE,
> 
> It feels like allowing zero shouldn't hurt too much?

This is the format string, I would prefer to keep it non-zero.

> 
>> +       .arg4_type      = ARG_PTR_TO_MEM_OR_NULL,
>> +       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
>> +       .btf_id         = bpf_seq_printf_btf_ids,
>> +};
>> +
>> +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
>> +{
>> +       return seq_write(m, data, len) ? -EOVERFLOW : 0;
>> +}
>> +
>> +static int bpf_seq_write_btf_ids[5];
>> +static const struct bpf_func_proto bpf_seq_write_proto = {
>> +       .func           = bpf_seq_write,
>> +       .gpl_only       = true,
>> +       .ret_type       = RET_INTEGER,
>> +       .arg1_type      = ARG_PTR_TO_BTF_ID,
>> +       .arg2_type      = ARG_PTR_TO_MEM,
>> +       .arg3_type      = ARG_CONST_SIZE,
> 
> Same, ARG_CONST_SIZE_OR_ZERO?

This one, possible. Let me check.

> 
>> +       .btf_id         = bpf_seq_write_btf_ids,
>> +};
>> +
> 
> [...]
>
Andrii Nakryiko May 8, 2020, 6:15 p.m. UTC | #3
On Wed, May 6, 2020 at 2:42 PM Yonghong Song <yhs@fb.com> wrote:
>
>
>
> On 5/6/20 10:37 AM, Andrii Nakryiko wrote:
> > On Sun, May 3, 2020 at 11:26 PM Yonghong Song <yhs@fb.com> wrote:
> >>
> >> Two helpers bpf_seq_printf and bpf_seq_write, are added for
> >> writing data to the seq_file buffer.
> >>
> >> bpf_seq_printf supports common format string flag/width/type
> >> fields so at least I can get identical results for
> >> netlink and ipv6_route targets.
> >>
> >
> > Does seq_printf() has its own format string specification? Is there
> > any documentation explaining? I was confused by few different checks
> > below...
>
> Not really. Similar to bpf_trace_printk(), since we need to
> parse format string, so we may only support a subset of
> what seq_printf() does. But we should not invent new
> formats.
>
> >
> >> For bpf_seq_printf and bpf_seq_write, return value -EOVERFLOW
> >> specifically indicates a write failure due to overflow, which
> >> means the object will be repeated in the next bpf invocation
> >> if object collection stays the same. Note that if the object
> >> collection is changed, depending how collection traversal is
> >> done, even if the object still in the collection, it may not
> >> be visited.
> >>
> >> bpf_seq_printf may return -EBUSY meaning that internal percpu
> >> buffer for memory copy of strings or other pointees is
> >> not available. Bpf program can return 1 to indicate it
> >> wants the same object to be repeated. Right now, this should not
> >> happen on no-RT kernels since migrate_enable(), which guards
> >> bpf prog call, calls preempt_enable().
> >
> > You probably meant migrate_disable()/preempt_disable(), right? But
>
> Yes, sorry for typo.
>
> > could it still happen, at least due to NMI? E.g., perf_event BPF
> > program gets triggered during bpf_iter program execution? I think for
> > perf_event_output function, we have 3 levels, for one of each possible
> > "contexts"? Should we do something like that here as well?
>
> Currently bpf_seq_printf() and bpf_seq_write() helpers can
> only be called by iter bpf programs. The iter bpf program can only
> be run on process context as it is triggered by a read() syscall.
> So one level should be enough for non-RT kernel.
>
> For RT kernel, migrate_disable does not prevent preemption,
> so it is possible task in the middle of bpf_seq_printf() might
> be preempted, so I implemented the logic to return -EBUSY.
> I think this case should be extremely rare so I only implemented
> one level nesting.

yeah, makes sense

>
> >
> >>
> >> Signed-off-by: Yonghong Song <yhs@fb.com>
> >> ---
> >>   include/uapi/linux/bpf.h       |  32 +++++-
> >>   kernel/trace/bpf_trace.c       | 195 +++++++++++++++++++++++++++++++++
> >>   scripts/bpf_helpers_doc.py     |   2 +
> >>   tools/include/uapi/linux/bpf.h |  32 +++++-
> >>   4 files changed, 259 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >> index 97ceb0f2e539..e440a9d5cca2 100644
> >> --- a/include/uapi/linux/bpf.h
> >> +++ b/include/uapi/linux/bpf.h
> >> @@ -3076,6 +3076,34 @@ union bpf_attr {
> >>    *             See: clock_gettime(CLOCK_BOOTTIME)
> >>    *     Return
> >>    *             Current *ktime*.
> >> + *
> >
> > [...]
> >
> >> +BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
> >> +          const void *, data, u32, data_len)
> >> +{
> >> +       int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
> >> +       int i, buf_used, copy_size, num_args;
> >> +       u64 params[MAX_SEQ_PRINTF_VARARGS];
> >> +       struct bpf_seq_printf_buf *bufs;
> >> +       const u64 *args = data;
> >> +
> >> +       buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
> >> +       if (WARN_ON_ONCE(buf_used > 1)) {
> >> +               err = -EBUSY;
> >> +               goto out;
> >> +       }
> >> +
> >> +       bufs = this_cpu_ptr(&bpf_seq_printf_buf);
> >> +
> >> +       /*
> >> +        * bpf_check()->check_func_arg()->check_stack_boundary()
> >> +        * guarantees that fmt points to bpf program stack,
> >> +        * fmt_size bytes of it were initialized and fmt_size > 0
> >> +        */
> >> +       if (fmt[--fmt_size] != 0)
> >
> > If we allow fmt_size == 0, this will need to be changed.
>
> Currently, we do not support fmt_size == 0. Yes, if we allow, this
> needs change.
>
> >
> >> +               goto out;
> >> +
> >> +       if (data_len & 7)
> >> +               goto out;
> >> +
> >> +       for (i = 0; i < fmt_size; i++) {
> >> +               if (fmt[i] == '%' && (!data || !data_len))
> >
> > So %% escaping is not supported?
>
> Yes, have not seen a need yet my ipv6_route/netlink example.
> Can certain add if there is a use case.

I can imagine this being used quite often when trying to print out
percentages... Would just suck to have to upgrade kernel just to be
able to print % character :)

>
> >
> >> +                       goto out;
> >> +       }
> >> +
> >> +       num_args = data_len / 8;
> >> +
> >> +       /* check format string for allowed specifiers */
> >> +       for (i = 0; i < fmt_size; i++) {
> >> +               if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
> >
> > why these restrictions? are they essential?
>
> This is the same restriction in bpf_trace_printk(). I guess the purpose
> is to avoid weird print. To promote bpf_iter to dump beyond asscii, I
> guess we can remove this restriction.

well, if underlying seq_printf() will fail for those "more liberal"
strings, then that would be bad. Basically, we should try to not
impose additional restrictions compared to seq_printf, but also no
need to allow more, which will be rejected by it. I haven't checked
seq_printf implementation though, so don't know what are those
restrictions.

>
> >
> >> +                       goto out;
> >> +
> >> +               if (fmt[i] != '%')
> >> +                       continue;
> >> +
> >> +               if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
> >> +                       err = -E2BIG;
> >> +                       goto out;
> >> +               }
> >> +
> >> +               if (fmt_cnt >= num_args)
> >> +                       goto out;
> >> +
> >> +               /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
> >> +               i++;
> >> +
> >> +               /* skip optional "[0+-][num]" width formating field */
> >> +               while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-')
> >
> > There could be space as well, as an alternative to 0.
>
> We can allow space. But '0' is used more common, right?

I use both space and 0 quite often, space especially with aligned strings.

>
> >
> >> +                       i++;
> >> +               if (fmt[i] >= '1' && fmt[i] <= '9') {
> >> +                       i++;
> >> +                       while (fmt[i] >= '0' && fmt[i] <= '9')
> >> +                               i++;
> >> +               }
> >> +
> >> +               if (fmt[i] == 's') {
> >> +                       /* disallow any further format extensions */
> >> +                       if (fmt[i + 1] != 0 &&
> >> +                           !isspace(fmt[i + 1]) &&
> >> +                           !ispunct(fmt[i + 1]))
> >> +                               goto out;
> >
> > I'm not sure I follow this check either. printf("%sbla", "whatever")
> > is a perfectly fine format string. Unless seq_printf has some
> > additional restrictions?
>
> Yes, just some restriction inherited from bpf_trace_printk().
> Will remove.

see comment above, if we allow it here, but seq_printf() will reject
it, then there is no point

>
> >
> >> +
> >> +                       /* try our best to copy */
> >> +                       if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
> >> +                               err = -E2BIG;
> >> +                               goto out;
> >> +                       }
> >> +
> >
> > [...]
> >
> >> +
> >> +static int bpf_seq_printf_btf_ids[5];
> >> +static const struct bpf_func_proto bpf_seq_printf_proto = {
> >> +       .func           = bpf_seq_printf,
> >> +       .gpl_only       = true,
> >> +       .ret_type       = RET_INTEGER,
> >> +       .arg1_type      = ARG_PTR_TO_BTF_ID,
> >> +       .arg2_type      = ARG_PTR_TO_MEM,
> >> +       .arg3_type      = ARG_CONST_SIZE,
> >
> > It feels like allowing zero shouldn't hurt too much?
>
> This is the format string, I would prefer to keep it non-zero.

yeah, makes sense, I suppose

>
> >
> >> +       .arg4_type      = ARG_PTR_TO_MEM_OR_NULL,
> >> +       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
> >> +       .btf_id         = bpf_seq_printf_btf_ids,
> >> +};
> >> +
> >> +BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
> >> +{
> >> +       return seq_write(m, data, len) ? -EOVERFLOW : 0;
> >> +}
> >> +
> >> +static int bpf_seq_write_btf_ids[5];
> >> +static const struct bpf_func_proto bpf_seq_write_proto = {
> >> +       .func           = bpf_seq_write,
> >> +       .gpl_only       = true,
> >> +       .ret_type       = RET_INTEGER,
> >> +       .arg1_type      = ARG_PTR_TO_BTF_ID,
> >> +       .arg2_type      = ARG_PTR_TO_MEM,
> >> +       .arg3_type      = ARG_CONST_SIZE,
> >
> > Same, ARG_CONST_SIZE_OR_ZERO?
>
> This one, possible. Let me check.

I just remember how much trouble perf_event_output() was causing me
because it enforced >0 for data length. Even though my variable-sized
output was always >0, proving that to (especially older) verifier was
extremely hard. So the less unnecessary restrictions - the better.

>
> >
> >> +       .btf_id         = bpf_seq_write_btf_ids,
> >> +};
> >> +
> >
> > [...]
> >
diff mbox series

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 97ceb0f2e539..e440a9d5cca2 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3076,6 +3076,34 @@  union bpf_attr {
  * 		See: clock_gettime(CLOCK_BOOTTIME)
  * 	Return
  * 		Current *ktime*.
+ *
+ * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
+ * 	Description
+ * 		seq_printf uses seq_file seq_printf() to print out the format string.
+ * 		The *m* represents the seq_file. The *fmt* and *fmt_size* are for
+ * 		the format string itself. The *data* and *data_len* are format string
+ * 		arguments. The *data* are a u64 array and corresponding format string
+ * 		values are stored in the array. For strings and pointers where pointees
+ * 		are accessed, only the pointer values are stored in the *data* array.
+ * 		The *data_len* is the *data* size in term of bytes.
+ * 	Return
+ * 		0 on success, or a negative errno in case of failure.
+ *
+ *		* **-EBUSY**		Percpu memory copy buffer is busy, can try again
+ *					by returning 1 from bpf program.
+ *		* **-EINVAL**		Invalid arguments, or invalid/unsupported formats.
+ *		* **-E2BIG**		Too many format specifiers.
+ *		* **-EOVERFLOW**	Overflow happens, the same object will be tried again.
+ *
+ * int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
+ * 	Description
+ * 		seq_write uses seq_file seq_write() to write the data.
+ * 		The *m* represents the seq_file. The *data* and *len* represent the
+ *		data to write in bytes.
+ * 	Return
+ * 		0 on success, or a negative errno in case of failure.
+ *
+ *		* **-EOVERFLOW**	Overflow happens, the same object will be tried again.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3203,7 +3231,9 @@  union bpf_attr {
 	FN(get_netns_cookie),		\
 	FN(get_current_ancestor_cgroup_id),	\
 	FN(sk_assign),			\
-	FN(ktime_get_boot_ns),
+	FN(ktime_get_boot_ns),		\
+	FN(seq_printf),			\
+	FN(seq_write),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index e875c95d3ced..7c06eb39bacc 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -457,6 +457,193 @@  const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
 	return &bpf_trace_printk_proto;
 }
 
+#define MAX_SEQ_PRINTF_VARARGS		12
+#define MAX_SEQ_PRINTF_MAX_MEMCPY	6
+#define MAX_SEQ_PRINTF_STR_LEN		128
+
+struct bpf_seq_printf_buf {
+	char buf[MAX_SEQ_PRINTF_MAX_MEMCPY][MAX_SEQ_PRINTF_STR_LEN];
+};
+static DEFINE_PER_CPU(struct bpf_seq_printf_buf, bpf_seq_printf_buf);
+static DEFINE_PER_CPU(int, bpf_seq_printf_buf_used);
+
+BPF_CALL_5(bpf_seq_printf, struct seq_file *, m, char *, fmt, u32, fmt_size,
+	   const void *, data, u32, data_len)
+{
+	int err = -EINVAL, fmt_cnt = 0, memcpy_cnt = 0;
+	int i, buf_used, copy_size, num_args;
+	u64 params[MAX_SEQ_PRINTF_VARARGS];
+	struct bpf_seq_printf_buf *bufs;
+	const u64 *args = data;
+
+	buf_used = this_cpu_inc_return(bpf_seq_printf_buf_used);
+	if (WARN_ON_ONCE(buf_used > 1)) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	bufs = this_cpu_ptr(&bpf_seq_printf_buf);
+
+	/*
+	 * bpf_check()->check_func_arg()->check_stack_boundary()
+	 * guarantees that fmt points to bpf program stack,
+	 * fmt_size bytes of it were initialized and fmt_size > 0
+	 */
+	if (fmt[--fmt_size] != 0)
+		goto out;
+
+	if (data_len & 7)
+		goto out;
+
+	for (i = 0; i < fmt_size; i++) {
+		if (fmt[i] == '%' && (!data || !data_len))
+			goto out;
+	}
+
+	num_args = data_len / 8;
+
+	/* check format string for allowed specifiers */
+	for (i = 0; i < fmt_size; i++) {
+		if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i]))
+			goto out;
+
+		if (fmt[i] != '%')
+			continue;
+
+		if (fmt_cnt >= MAX_SEQ_PRINTF_VARARGS) {
+			err = -E2BIG;
+			goto out;
+		}
+
+		if (fmt_cnt >= num_args)
+			goto out;
+
+		/* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */
+		i++;
+
+		/* skip optional "[0+-][num]" width formating field */
+		while (fmt[i] == '0' || fmt[i] == '+'  || fmt[i] == '-')
+			i++;
+		if (fmt[i] >= '1' && fmt[i] <= '9') {
+			i++;
+			while (fmt[i] >= '0' && fmt[i] <= '9')
+				i++;
+		}
+
+		if (fmt[i] == 's') {
+			/* disallow any further format extensions */
+			if (fmt[i + 1] != 0 &&
+			    !isspace(fmt[i + 1]) &&
+			    !ispunct(fmt[i + 1]))
+				goto out;
+
+			/* try our best to copy */
+			if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
+				err = -E2BIG;
+				goto out;
+			}
+
+			bufs->buf[memcpy_cnt][0] = 0;
+			strncpy_from_unsafe(bufs->buf[memcpy_cnt],
+					    (void *) (long) args[fmt_cnt],
+					    MAX_SEQ_PRINTF_STR_LEN);
+			params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
+
+			fmt_cnt++;
+			memcpy_cnt++;
+			continue;
+		}
+
+		if (fmt[i] == 'p') {
+			if (fmt[i + 1] == 0 ||
+			    fmt[i + 1] == 'K' ||
+			    fmt[i + 1] == 'x') {
+				/* just kernel pointers */
+				params[fmt_cnt] = args[fmt_cnt];
+				fmt_cnt++;
+				continue;
+			}
+
+			/* only support "%pI4", "%pi4", "%pI6" and "pi6". */
+			if (fmt[i + 1] != 'i' && fmt[i + 1] != 'I')
+				goto out;
+			if (fmt[i + 2] != '4' && fmt[i + 2] != '6')
+				goto out;
+
+			if (memcpy_cnt >= MAX_SEQ_PRINTF_MAX_MEMCPY) {
+				err = -E2BIG;
+				goto out;
+			}
+
+
+			copy_size = (fmt[i + 2] == '4') ? 4 : 16;
+
+			probe_kernel_read(bufs->buf[memcpy_cnt],
+					  (void *) (long) args[fmt_cnt], copy_size);
+			params[fmt_cnt] = (u64)(long)bufs->buf[memcpy_cnt];
+
+			i += 2;
+			fmt_cnt++;
+			memcpy_cnt++;
+			continue;
+		}
+
+		if (fmt[i] == 'l') {
+			i++;
+			if (fmt[i] == 'l')
+				i++;
+		}
+
+		if (fmt[i] != 'i' && fmt[i] != 'd' &&
+		    fmt[i] != 'u' && fmt[i] != 'x')
+			goto out;
+
+		params[fmt_cnt] = args[fmt_cnt];
+		fmt_cnt++;
+	}
+
+	/* Maximumly we can have MAX_SEQ_PRINTF_VARARGS parameter, just give
+	 * all of them to seq_printf().
+	 */
+	seq_printf(m, fmt, params[0], params[1], params[2], params[3],
+		   params[4], params[5], params[6], params[7], params[8],
+		   params[9], params[10], params[11]);
+
+	err = seq_has_overflowed(m) ? -EOVERFLOW : 0;
+out:
+	this_cpu_dec(bpf_seq_printf_buf_used);
+	return err;
+}
+
+static int bpf_seq_printf_btf_ids[5];
+static const struct bpf_func_proto bpf_seq_printf_proto = {
+	.func		= bpf_seq_printf,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.arg4_type      = ARG_PTR_TO_MEM_OR_NULL,
+	.arg5_type      = ARG_CONST_SIZE_OR_ZERO,
+	.btf_id		= bpf_seq_printf_btf_ids,
+};
+
+BPF_CALL_3(bpf_seq_write, struct seq_file *, m, const void *, data, u32, len)
+{
+	return seq_write(m, data, len) ? -EOVERFLOW : 0;
+}
+
+static int bpf_seq_write_btf_ids[5];
+static const struct bpf_func_proto bpf_seq_write_proto = {
+	.func		= bpf_seq_write,
+	.gpl_only	= true,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_BTF_ID,
+	.arg2_type	= ARG_PTR_TO_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+	.btf_id		= bpf_seq_write_btf_ids,
+};
+
 static __always_inline int
 get_map_perf_counter(struct bpf_map *map, u64 flags,
 		     u64 *value, u64 *enabled, u64 *running)
@@ -1226,6 +1413,14 @@  tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_xdp_output:
 		return &bpf_xdp_output_proto;
 #endif
+	case BPF_FUNC_seq_printf:
+		return prog->expected_attach_type == BPF_TRACE_ITER ?
+		       &bpf_seq_printf_proto :
+		       NULL;
+	case BPF_FUNC_seq_write:
+		return prog->expected_attach_type == BPF_TRACE_ITER ?
+		       &bpf_seq_write_proto :
+		       NULL;
 	default:
 		return raw_tp_prog_func_proto(func_id, prog);
 	}
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index f43d193aff3a..ded304c96a05 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -414,6 +414,7 @@  class PrinterHelpers(Printer):
             'struct sk_reuseport_md',
             'struct sockaddr',
             'struct tcphdr',
+            'struct seq_file',
 
             'struct __sk_buff',
             'struct sk_msg_md',
@@ -450,6 +451,7 @@  class PrinterHelpers(Printer):
             'struct sk_reuseport_md',
             'struct sockaddr',
             'struct tcphdr',
+            'struct seq_file',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 97ceb0f2e539..e440a9d5cca2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3076,6 +3076,34 @@  union bpf_attr {
  * 		See: clock_gettime(CLOCK_BOOTTIME)
  * 	Return
  * 		Current *ktime*.
+ *
+ * int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
+ * 	Description
+ * 		seq_printf uses seq_file seq_printf() to print out the format string.
+ * 		The *m* represents the seq_file. The *fmt* and *fmt_size* are for
+ * 		the format string itself. The *data* and *data_len* are format string
+ * 		arguments. The *data* are a u64 array and corresponding format string
+ * 		values are stored in the array. For strings and pointers where pointees
+ * 		are accessed, only the pointer values are stored in the *data* array.
+ * 		The *data_len* is the *data* size in term of bytes.
+ * 	Return
+ * 		0 on success, or a negative errno in case of failure.
+ *
+ *		* **-EBUSY**		Percpu memory copy buffer is busy, can try again
+ *					by returning 1 from bpf program.
+ *		* **-EINVAL**		Invalid arguments, or invalid/unsupported formats.
+ *		* **-E2BIG**		Too many format specifiers.
+ *		* **-EOVERFLOW**	Overflow happens, the same object will be tried again.
+ *
+ * int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
+ * 	Description
+ * 		seq_write uses seq_file seq_write() to write the data.
+ * 		The *m* represents the seq_file. The *data* and *len* represent the
+ *		data to write in bytes.
+ * 	Return
+ * 		0 on success, or a negative errno in case of failure.
+ *
+ *		* **-EOVERFLOW**	Overflow happens, the same object will be tried again.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3203,7 +3231,9 @@  union bpf_attr {
 	FN(get_netns_cookie),		\
 	FN(get_current_ancestor_cgroup_id),	\
 	FN(sk_assign),			\
-	FN(ktime_get_boot_ns),
+	FN(ktime_get_boot_ns),		\
+	FN(seq_printf),			\
+	FN(seq_write),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call