
[v2,bpf-next,1/3] bpf: Add bpf_perf_prog_read_branches() helper

Message ID: 20200122202220.21335-2-dxu@dxuuu.xyz
State: Changes Requested
Delegated to: BPF Maintainers
Series: Add bpf_perf_prog_read_branches() helper

Commit Message

Daniel Xu Jan. 22, 2020, 8:22 p.m. UTC
Branch records are a CPU feature that can be configured to record
certain branches that are taken during code execution. This data is
particularly interesting for profile-guided optimizations. perf has had
branch record support for a while, but the data collection can be a bit
coarse-grained.

We (Facebook) have seen in experiments that associating metadata with
branch records can improve results (after postprocessing). We generally
use bpf_probe_read_*() to get metadata out of userspace. That's why bpf
support for branch records is useful.

Aside from this particular use case, having branch data available to bpf
progs can be useful to get stack traces out of userspace applications
that omit frame pointers.

Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
---
 include/uapi/linux/bpf.h | 13 ++++++++++++-
 kernel/trace/bpf_trace.c | 31 +++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 1 deletion(-)
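
For illustration, a minimal sketch (not part of the patch) of how a
perf_event BPF program might call the proposed helper. The section name,
entry count, and the hand-bound helper declaration are assumptions against
the patched uapi header; bpf_helpers.h would not yet declare the helper:

#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <linux/perf_event.h>
#include <bpf/bpf_helpers.h>

/* Old-style declaration: bind the new helper id by hand, since the
 * generated helper headers do not know about it yet.
 */
static int (*bpf_perf_prog_read_branches)(void *ctx, void *buf, __u32 size) =
	(void *) BPF_FUNC_perf_prog_read_branches;

/* 16 entries * sizeof(struct perf_branch_entry) = 384 bytes, which
 * fits within the 512-byte BPF stack limit.
 */
#define MAX_BRANCH_ENTRIES 16

SEC("perf_event")
int on_sample(struct bpf_perf_event_data *ctx)
{
	/* ARG_PTR_TO_UNINIT_MEM: the buffer does not need to be
	 * initialized before the call.
	 */
	struct perf_branch_entry entries[MAX_BRANCH_ENTRIES];
	int written;

	written = bpf_perf_prog_read_branches(ctx, entries, sizeof(entries));
	if (written < 0)
		return 0;

	/* written / sizeof(entries[0]) records are valid here; as this
	 * version of the helper behaves, the rest of the buffer has
	 * been zero-filled.
	 */
	return 0;
}

/* The helper is gpl_only, so a GPL-compatible license is required. */
char _license[] SEC("license") = "GPL";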

Comments

John Fastabend Jan. 23, 2020, 5:39 a.m. UTC | #1
Daniel Xu wrote:
> Branch records are a CPU feature that can be configured to record
> certain branches that are taken during code execution. This data is
> particularly interesting for profile-guided optimizations. perf has had
> branch record support for a while, but the data collection can be a bit
> coarse-grained.
> 
> We (Facebook) have seen in experiments that associating metadata with
> branch records can improve results (after postprocessing). We generally
> use bpf_probe_read_*() to get metadata out of userspace. That's why bpf
> support for branch records is useful.
> 
> Aside from this particular use case, having branch data available to bpf
> progs can be useful to get stack traces out of userspace applications
> that omit frame pointers.
> 
> Signed-off-by: Daniel Xu <dxu@dxuuu.xyz>
> ---
>  include/uapi/linux/bpf.h | 13 ++++++++++++-
>  kernel/trace/bpf_trace.c | 31 +++++++++++++++++++++++++++++++
>  2 files changed, 43 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 033d90a2282d..7350c5be6158 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -2885,6 +2885,16 @@ union bpf_attr {
>   *		**-EPERM** if no permission to send the *sig*.
>   *
>   *		**-EAGAIN** if bpf program can try again.
> + *
> + * int bpf_perf_prog_read_branches(struct bpf_perf_event_data *ctx, void *buf, u32 buf_size)
> + * 	Description
> + * 		For an eBPF program attached to a perf event, retrieve the
> + * 		branch records (struct perf_branch_entry) associated with *ctx*
> + * 		and store them in the buffer pointed to by *buf*, up to
> + * 		*buf_size* bytes.

It seems extra bytes in buf will be cleared. The number of bytes copied
is returned, so I don't see any reason to clear the extra bytes; I would
just let the BPF program do this if it cares. But it should at least be
noted in the description.

> + * 	Return
> + *		On success, number of bytes written to *buf*. On error, a
> + *		negative value.
>   */
>  #define __BPF_FUNC_MAPPER(FN)		\
>  	FN(unspec),			\
> @@ -3004,7 +3014,8 @@ union bpf_attr {
>  	FN(probe_read_user_str),	\
>  	FN(probe_read_kernel_str),	\
>  	FN(tcp_send_ack),		\
> -	FN(send_signal_thread),
> +	FN(send_signal_thread),		\
> +	FN(perf_prog_read_branches),
>  
>  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
>   * function eBPF program intends to call
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 19e793aa441a..24c51272a1f7 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -1028,6 +1028,35 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
>           .arg3_type      = ARG_CONST_SIZE,
>  };
>  
> +BPF_CALL_3(bpf_perf_prog_read_branches, struct bpf_perf_event_data_kern *, ctx,
> +	   void *, buf, u32, size)
> +{
> +	struct perf_branch_stack *br_stack = ctx->data->br_stack;
> +	u32 to_copy = 0, to_clear = size;
> +	int err = -EINVAL;
> +
> +	if (unlikely(!br_stack))
> +		goto clear;
> +
> +	to_copy = min_t(u32, br_stack->nr * sizeof(struct perf_branch_entry), size);
> +	to_clear -= to_copy;
> +
> +	memcpy(buf, br_stack->entries, to_copy);
> +	err = to_copy;
> +clear:
> +	memset(buf + to_copy, 0, to_clear);

Here, why do this at all? If the user cares they can clear the bytes
directly from the BPF program. I suspect it's probably going to be
wasted work in most cases. If it's needed for some reason, provide
a comment with it.

> +	return err;
> +}

[...]
Daniel Xu Jan. 23, 2020, 8:09 p.m. UTC | #2
Hi John, thanks for looking.

On Wed Jan 22, 2020 at 9:39 PM, John Fastabend wrote:
[...]
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index 033d90a2282d..7350c5be6158 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -2885,6 +2885,16 @@ union bpf_attr {
> >   *		**-EPERM** if no permission to send the *sig*.
> >   *
> >   *		**-EAGAIN** if bpf program can try again.
> > + *
> > + * int bpf_perf_prog_read_branches(struct bpf_perf_event_data *ctx, void *buf, u32 buf_size)
> > + * 	Description
> > + * 		For an eBPF program attached to a perf event, retrieve the
> > + * 		branch records (struct perf_branch_entry) associated with *ctx*
> > + * 		and store them in the buffer pointed to by *buf*, up to
> > + * 		*buf_size* bytes.
>
> 
> It seems extra bytes in buf will be cleared. The number of bytes copied
> is returned, so I don't see any reason to clear the extra bytes; I would
> just let the BPF program do this if it cares. But it should at least be
> noted in the description.

In include/linux/bpf.h:

        /* the following constraints used to prototype bpf_memcmp() and other
         * functions that access data on eBPF program stack
         */
        ARG_PTR_TO_UNINIT_MEM,  /* pointer to memory does not need to be initialized,
                                 * helper function must fill all bytes or clear
                                 * them in error case.
                                 */

I figured it would be good to clear out the stack b/c this helper
writes data on the program stack.

Also bpf_perf_prog_read_value() does something similar (fill zeros on
failure).

[...]
> > +	to_copy = min_t(u32, br_stack->nr * sizeof(struct perf_branch_entry), size);
> > +	to_clear -= to_copy;
> > +
> > +	memcpy(buf, br_stack->entries, to_copy);
> > +	err = to_copy;
> > +clear:
> > +	memset(buf + to_copy, 0, to_clear);
>
> 
> Here, why do this at all? If the user cares they can clear the bytes
> directly from the BPF program. I suspect it's probably going to be
> wasted work in most cases. If it's needed for some reason, provide
> a comment with it.

Same concern as above, right?

I can send a v3 with an updated uapi/linux/bpf.h description (and a rebase).

Thanks,
Daniel
Daniel Borkmann Jan. 23, 2020, 10:23 p.m. UTC | #3
On 1/23/20 9:09 PM, Daniel Xu wrote:
> Hi John, thanks for looking.
> 
> On Wed Jan 22, 2020 at 9:39 PM, John Fastabend wrote:
> [...]
>>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>>> index 033d90a2282d..7350c5be6158 100644
>>> --- a/include/uapi/linux/bpf.h
>>> +++ b/include/uapi/linux/bpf.h
>>> @@ -2885,6 +2885,16 @@ union bpf_attr {
>>>    *		**-EPERM** if no permission to send the *sig*.
>>>    *
>>>    *		**-EAGAIN** if bpf program can try again.
>>> + *
>>> + * int bpf_perf_prog_read_branches(struct bpf_perf_event_data *ctx, void *buf, u32 buf_size)
>>> + * 	Description
>>> + * 		For an eBPF program attached to a perf event, retrieve the
>>> + * 		branch records (struct perf_branch_entry) associated with *ctx*
>>> + * 		and store them in the buffer pointed to by *buf*, up to
>>> + * 		*buf_size* bytes.
>>
>> It seems extra bytes in buf will be cleared. The number of bytes copied
>> is returned, so I don't see any reason to clear the extra bytes; I would
>> just let the BPF program do this if it cares. But it should at least be
>> noted in the description.
> 
> In include/linux/bpf.h:
> 
>          /* the following constraints used to prototype bpf_memcmp() and other
>           * functions that access data on eBPF program stack
>           */
>          ARG_PTR_TO_UNINIT_MEM,  /* pointer to memory does not need to be initialized,
>                                   * helper function must fill all bytes or clear
>                                   * them in error case.
>                                   */
> 
> I figured it would be good to clear out the stack b/c this helper
> writes data on the program stack.
> 
> Also bpf_perf_prog_read_value() does something similar (fill zeros on
> failure).
> 
> [...]
>>> +	to_copy = min_t(u32, br_stack->nr * sizeof(struct perf_branch_entry), size);
>>> +	to_clear -= to_copy;
>>> +
>>> +	memcpy(buf, br_stack->entries, to_copy);
>>> +	err = to_copy;
>>> +clear:
>>> +	memset(buf + to_copy, 0, to_clear);
>>
>>
>> Here, why do this at all? If the user cares they can clear the bytes
>> directly from the BPF program. I suspect it's probably going to be
>> wasted work in most cases. If it's needed for some reason, provide
>> a comment with it.
> 
> Same concern as above, right?

Yes, so we've been following this practice for all the BPF helpers no matter
which program type. Though for tracing it may be up for debate whether it
still makes sense, given there's nothing to be leaked here since you can read
this data anyway via probe read if you wanted to. So we might as well get rid
of the clearing for all tracing helpers.

Different question related to your set. It looks like br_stack is only
available on x86, is that correct? For other archs this will always bail out
on the !br_stack test. Perhaps we should document this fact so users are not
surprised that their prog using this helper is not working on !x86. Wdyt?

Thanks,
Daniel
Daniel Xu Jan. 23, 2020, 10:30 p.m. UTC | #4
On Thu Jan 23, 2020 at 11:23 PM, Daniel Borkmann wrote:
[...]
> 
> Yes, so we've been following this practice for all the BPF helpers no
> matter which program type. Though for tracing it may be up for debate
> whether it still makes sense, given there's nothing to be leaked here
> since you can read this data anyway via probe read if you wanted to.
> So we might as well get rid of the clearing for all tracing helpers.

Right, that makes sense. Do you want me to leave it in for this patchset
and then remove all of them in a followup patchset?

> 
> Different question related to your set. It looks like br_stack is only
> available
> on x86, is that correct? For other archs this will always bail out on
> !br_stack
> test. Perhaps we should document this fact so users are not surprised
> why their
> prog using this helper is not working on !x86. Wdyt?

I think perf_event_open() should fail on !x86 if a user tries to configure
it with branch stack collection. So there would not be the opportunity for
the bpf prog to be attached and run. I haven't tested this, though. I'll
look through the code / install a VM and test it.
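
For reference, a userspace sketch of the configuration in question
(untested; the event choice and sample period are arbitrary). The
expectation discussed here is that the open fails where the hardware
cannot supply branch records:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static int open_branch_sampling_event(void)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_period = 100000;
	/* Ask the kernel to capture last-branch records with each sample. */
	attr.sample_type = PERF_SAMPLE_BRANCH_STACK;
	attr.branch_sample_type = PERF_SAMPLE_BRANCH_USER |
				  PERF_SAMPLE_BRANCH_ANY;

	/* pid == 0: this task; cpu == -1: any CPU */
	return syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
}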

[...]

Thanks,
Daniel
Andrii Nakryiko Jan. 23, 2020, 10:41 p.m. UTC | #5
On 1/23/20 2:30 PM, Daniel Xu wrote:
> On Thu Jan 23, 2020 at 11:23 PM, Daniel Borkmann wrote:
> [...]
>>
>> Yes, so we've been following this practice for all the BPF helpers no
>> matter which program type. Though for tracing it may be up for debate
>> whether it still makes sense, given there's nothing to be leaked here
>> since you can read this data anyway via probe read if you wanted to.
>> So we might as well get rid of the clearing for all tracing helpers.
> 
> Right, that makes sense. Do you want me to leave it in for this patchset
> and then remove all of them in a followup patchset?
> 

I don't think we can remove that for existing tracing helpers (e.g.,
bpf_probe_read). There are applications that explicitly expect
destination memory to be zeroed out on failure. It's the BPF world's
memset(0).
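
To illustrate the pattern (a hypothetical fragment; the map, probe
point, and names are made up), such programs skip the error check and
rely on the helper's zero-fill:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1024);
	__type(key, char[16]);
	__type(value, __u64);
} counts SEC(".maps");

SEC("kprobe/do_sys_open")
int count_by_comm(void *ctx)
{
	char comm[16];
	__u64 one = 1, *val;

	/* No error check: on failure the helper zero-fills comm, so we
	 * key on "" rather than on uninitialized stack bytes.
	 */
	bpf_get_current_comm(comm, sizeof(comm));

	val = bpf_map_lookup_elem(&counts, comm);
	if (val)
		__sync_fetch_and_add(val, 1);
	else
		bpf_map_update_elem(&counts, comm, &one, BPF_ANY);
	return 0;
}

char _license[] SEC("license") = "GPL";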

I also wonder if the BPF verifier has any extra assumptions for
ARG_PTR_TO_UNINIT_MEM w.r.t. it being initialized after the helper call
(e.g., for liveness tracking).

>>
>> Different question related to your set. It looks like br_stack is only
>> available on x86, is that correct? For other archs this will always
>> bail out on the !br_stack test. Perhaps we should document this fact
>> so users are not surprised that their prog using this helper is not
>> working on !x86. Wdyt?
> 
> I think perf_event_open() should fail on !x86 if a user tries to configure
> it with branch stack collection. So there would not be the opportunity for
> the bpf prog to be attached and run. I haven't tested this, though. I'll
> look through the code / install a VM and test it.
> 
> [...]
> 
> Thanks,
> Daniel
>
Daniel Borkmann Jan. 23, 2020, 10:44 p.m. UTC | #6
On 1/23/20 11:30 PM, Daniel Xu wrote:
> On Thu Jan 23, 2020 at 11:23 PM, Daniel Borkmann wrote:
> [...]
>>
>> Yes, so we've been following this practice for all the BPF helpers no
>> matter which program type. Though for tracing it may be up for debate
>> whether it still makes sense, given there's nothing to be leaked here
>> since you can read this data anyway via probe read if you wanted to.
>> So we might as well get rid of the clearing for all tracing helpers.
> 
> Right, that makes sense. Do you want me to leave it in for this patchset
> and then remove all of them in a followup patchset?

Let's leave it in, and in a different set we can clean this up for all
tracing-related helpers at once.

>> Different question related to your set. It looks like br_stack is only
>> available on x86, is that correct? For other archs this will always
>> bail out on the !br_stack test. Perhaps we should document this fact
>> so users are not surprised that their prog using this helper is not
>> working on !x86. Wdyt?
> 
> I think perf_event_open() should fail on !x86 if a user tries to configure
> it with branch stack collection. So there would not be the opportunity for
> the bpf prog to be attached and run. I haven't tested this, though. I'll
> look through the code / install a VM and test it.

As far as I can see the prog would still be attachable and runnable, just that
the helper will always return -EINVAL on these archs. Maybe the error code
should be changed to -ENOENT to avoid confusion wrt whether the user provided
some invalid input args. Should this actually bail out with -EINVAL if size is
not a multiple of sizeof(struct perf_branch_entry)? Otherwise we'd end up
copying half-broken branch entry information.

Thanks,
Daniel
Martin KaFai Lau Jan. 23, 2020, 11:07 p.m. UTC | #7
On Thu, Jan 23, 2020 at 11:44:53PM +0100, Daniel Borkmann wrote:
> On 1/23/20 11:30 PM, Daniel Xu wrote:
> > On Thu Jan 23, 2020 at 11:23 PM, Daniel Borkmann wrote:
> > [...]
> > > 
> > > Yes, so we've been following this practice for all the BPF helpers no
> > > matter which program type. Though for tracing it may be up for debate
> > > whether it still makes sense, given there's nothing to be leaked here
> > > since you can read this data anyway via probe read if you wanted to.
> > > So we might as well get rid of the clearing for all tracing helpers.
> > 
> > Right, that makes sense. Do you want me to leave it in for this patchset
> > and then remove all of them in a followup patchset?
> 
> Let's leave it in, and in a different set we can clean this up for all
> tracing-related helpers at once.
> 
> > > Different question related to your set. It looks like br_stack is only
> > > available on x86, is that correct? For other archs this will always
> > > bail out on the !br_stack test. Perhaps we should document this fact
> > > so users are not surprised that their prog using this helper is not
> > > working on !x86. Wdyt?
> > 
> > I think perf_event_open() should fail on !x86 if a user tries to configure
> > it with branch stack collection. So there would not be the opportunity for
> > the bpf prog to be attached and run. I haven't tested this, though. I'll
> > look through the code / install a VM and test it.
> 
> As far as I can see the prog would still be attachable and runnable, just that
> the helper will always return -EINVAL on these archs. Maybe the error code
> should be changed to -ENOENT to avoid confusion wrt whether the user provided
+1 on -ENOENT.

> some invalid input args. Should this actually bail out with -EINVAL if size is
> not a multiple of sizeof(struct perf_branch_entry)? Otherwise we'd end up
> copying half-broken branch entry information.
Daniel Borkmann Jan. 23, 2020, 11:09 p.m. UTC | #8
On 1/23/20 11:41 PM, Andrii Nakryiko wrote:
> On 1/23/20 2:30 PM, Daniel Xu wrote:
>> On Thu Jan 23, 2020 at 11:23 PM, Daniel Borkmann wrote:
>> [...]
>>>
>>> Yes, so we've been following this practice for all the BPF helpers no
>>> matter which program type. Though for tracing it may be up for debate
>>> whether it still makes sense, given there's nothing to be leaked here
>>> since you can read this data anyway via probe read if you wanted to.
>>> So we might as well get rid of the clearing for all tracing helpers.
>>
>> Right, that makes sense. Do you want me to leave it in for this patchset
>> and then remove all of them in a followup patchset?
> 
> I don't think we can remove that for existing tracing helpers (e.g.,
> bpf_probe_read). There are applications that explicitly expect
> destination memory to be zeroed out on failure. It's the BPF world's
> memset(0).

Due to avoiding error checks that way if the expected outcome of the buf
is non-zero anyway? Agreed, those would break, so yeah they cannot be
removed then.

> I also wonder if the BPF verifier has any extra assumptions for
> ARG_PTR_TO_UNINIT_MEM w.r.t. it being initialized after the helper call
> (e.g., for liveness tracking).

There are no extra assumptions other than memory being written after the
helper call (whether success or failure of the helper itself doesn't matter,
so there are no assumptions about the content); the data that has been
written to the buffer is marked as initialized but unknown (e.g. in
check_stack_write() the case where reg remains NULL since value_regno is
negative).

Thanks,
Daniel
Daniel Xu Jan. 23, 2020, 11:27 p.m. UTC | #9
On Thu Jan 23, 2020 at 11:44 PM, Daniel Borkmann wrote:
[...]
> >> Different question related to your set. It looks like br_stack is only
> >> available on x86, is that correct? For other archs this will always
> >> bail out on the !br_stack test. Perhaps we should document this fact
> >> so users are not surprised that their prog using this helper is not
> >> working on !x86. Wdyt?
> > 
> > I think perf_event_open() should fail on !x86 if a user tries to configure
> > it with branch stack collection. So there would not be the opportunity for
> > the bpf prog to be attached and run. I haven't tested this, though. I'll
> > look through the code / install a VM and test it.
>
> 
> As far as I can see the prog would still be attachable and runnable,
> just that the helper will always return -EINVAL on these archs. Maybe
> the error code should be changed to -ENOENT to avoid confusion wrt
> whether the user provided some invalid input args.

Ok, will add.

> Should this actually bail out with -EINVAL if size is not a multiple
> of sizeof(struct perf_branch_entry)? Otherwise we'd end up copying
> half-broken branch entry information.

Sure, makes sense.
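
One possible shape of the helper body with both suggestions applied (a
sketch, not the final v3; the clearing discussed earlier is kept for now):

BPF_CALL_3(bpf_perf_prog_read_branches, struct bpf_perf_event_data_kern *, ctx,
	   void *, buf, u32, size)
{
	struct perf_branch_stack *br_stack = ctx->data->br_stack;
	u32 to_copy = 0, to_clear = size;
	int err;

	/* Reject sizes that would split a perf_branch_entry. */
	if (unlikely(size % sizeof(struct perf_branch_entry))) {
		err = -EINVAL;
		goto clear;
	}

	/* No branch data on this arch/event: not an invalid argument. */
	if (unlikely(!br_stack)) {
		err = -ENOENT;
		goto clear;
	}

	to_copy = min_t(u32, br_stack->nr * sizeof(struct perf_branch_entry),
			size);
	to_clear -= to_copy;

	memcpy(buf, br_stack->entries, to_copy);
	err = to_copy;
clear:
	/* ARG_PTR_TO_UNINIT_MEM contract: fill all bytes or clear them. */
	memset(buf + to_copy, 0, to_clear);
	return err;
}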
>
> Thanks,
> Daniel

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 033d90a2282d..7350c5be6158 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -2885,6 +2885,16 @@ union bpf_attr {
  *		**-EPERM** if no permission to send the *sig*.
  *
  *		**-EAGAIN** if bpf program can try again.
+ *
+ * int bpf_perf_prog_read_branches(struct bpf_perf_event_data *ctx, void *buf, u32 buf_size)
+ * 	Description
+ * 		For an eBPF program attached to a perf event, retrieve the
+ * 		branch records (struct perf_branch_entry) associated with *ctx*
+ * 		and store them in the buffer pointed to by *buf*, up to
+ * 		*buf_size* bytes.
+ * 	Return
+ *		On success, number of bytes written to *buf*. On error, a
+ *		negative value.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -3004,7 +3014,8 @@ union bpf_attr {
 	FN(probe_read_user_str),	\
 	FN(probe_read_kernel_str),	\
 	FN(tcp_send_ack),		\
-	FN(send_signal_thread),
+	FN(send_signal_thread),		\
+	FN(perf_prog_read_branches),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 19e793aa441a..24c51272a1f7 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1028,6 +1028,35 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = {
          .arg3_type      = ARG_CONST_SIZE,
 };
 
+BPF_CALL_3(bpf_perf_prog_read_branches, struct bpf_perf_event_data_kern *, ctx,
+	   void *, buf, u32, size)
+{
+	struct perf_branch_stack *br_stack = ctx->data->br_stack;
+	u32 to_copy = 0, to_clear = size;
+	int err = -EINVAL;
+
+	if (unlikely(!br_stack))
+		goto clear;
+
+	to_copy = min_t(u32, br_stack->nr * sizeof(struct perf_branch_entry), size);
+	to_clear -= to_copy;
+
+	memcpy(buf, br_stack->entries, to_copy);
+	err = to_copy;
+clear:
+	memset(buf + to_copy, 0, to_clear);
+	return err;
+}
+
+static const struct bpf_func_proto bpf_perf_prog_read_branches_proto = {
+         .func           = bpf_perf_prog_read_branches,
+         .gpl_only       = true,
+         .ret_type       = RET_INTEGER,
+         .arg1_type      = ARG_PTR_TO_CTX,
+         .arg2_type      = ARG_PTR_TO_UNINIT_MEM,
+         .arg3_type      = ARG_CONST_SIZE,
+};
+
 static const struct bpf_func_proto *
 pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -1040,6 +1069,8 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_get_stack_proto_tp;
 	case BPF_FUNC_perf_prog_read_value:
 		return &bpf_perf_prog_read_value_proto;
+	case BPF_FUNC_perf_prog_read_branches:
+		return &bpf_perf_prog_read_branches_proto;
 	default:
 		return tracing_func_proto(func_id, prog);
 	}