diff mbox

[v3,net-next] bpf: fix bpf_perf_event_read() helper

Message ID 1445559014-4667-1-git-send-email-ast@kernel.org
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Alexei Starovoitov Oct. 23, 2015, 12:10 a.m. UTC
Fix safety checks for bpf_perf_event_read():
- only non-inherited events can be added to perf_event_array map
  (do this check statically at map insertion time)
- dynamically check that event is local and !pmu->count
Otherwise buggy bpf program can cause kernel splat.

Also fix error path after perf_event_attrs()
and remove redundant 'extern'.

Fixes: 35578d798400 ("bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
v2->v3:
. refactor checks based on Wangnan's and Peter's feedback
while refactoring realized that these two issues need fixes as well:
. fix perf_event_attrs() error path
. remove redundant extern

v1->v2: fix compile in case of !CONFIG_PERF_EVENTS

Even in the worst case the crash is not possible.
Only warn_on_once, so imo net-next is ok.

 include/linux/bpf.h      |    1 -
 kernel/bpf/arraymap.c    |   25 ++++++++++++++++---------
 kernel/trace/bpf_trace.c |    7 ++++++-
 3 files changed, 22 insertions(+), 11 deletions(-)

Comments

Wangnan (F) Oct. 23, 2015, 2:21 a.m. UTC | #1
On 2015/10/23 8:10, Alexei Starovoitov wrote:
> Fix safety checks for bpf_perf_event_read():
> - only non-inherited events can be added to perf_event_array map
>    (do this check statically at map insertion time)
> - dynamically check that event is local and !pmu->count
> Otherwise buggy bpf program can cause kernel splat.
>
> Also fix error path after perf_event_attrs()
> and remove redundant 'extern'.
>
> Fixes: 35578d798400 ("bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter")
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
> v2->v3:
> . refactor checks based on Wangnan's and Peter's feedback
> while refactoring realized that these two issues need fixes as well:
> . fix perf_event_attrs() error path
> . remove redundant extern
>
> v1->v2: fix compile in case of !CONFIG_PERF_EVENTS
>
> Even in the worst case the crash is not possible.
> Only warn_on_once, so imo net-next is ok.
>
>   include/linux/bpf.h      |    1 -
>   kernel/bpf/arraymap.c    |   25 ++++++++++++++++---------
>   kernel/trace/bpf_trace.c |    7 ++++++-
>   3 files changed, 22 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index e3a51b74e275..75718fa28260 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -194,7 +194,6 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
>   extern const struct bpf_func_proto bpf_map_update_elem_proto;
>   extern const struct bpf_func_proto bpf_map_delete_elem_proto;
>   
> -extern const struct bpf_func_proto bpf_perf_event_read_proto;
>   extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
>   extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
>   extern const struct bpf_func_proto bpf_tail_call_proto;
> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> index e3cfe46b074f..3f4c99e06c6b 100644
> --- a/kernel/bpf/arraymap.c
> +++ b/kernel/bpf/arraymap.c
> @@ -292,16 +292,23 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
>   
>   	attr = perf_event_attrs(event);
>   	if (IS_ERR(attr))
> -		return (void *)attr;
> +		goto err;
>   
> -	if (attr->type != PERF_TYPE_RAW &&
> -	    !(attr->type == PERF_TYPE_SOFTWARE &&
> -	      attr->config == PERF_COUNT_SW_BPF_OUTPUT) &&
> -	    attr->type != PERF_TYPE_HARDWARE) {
> -		perf_event_release_kernel(event);
> -		return ERR_PTR(-EINVAL);
> -	}
> -	return event;
> +	if (attr->inherit)
> +		goto err;
> +

Since Peter suggest it is pointless for a system-wide perf_event
has inherit bit set [1], I think it should be safe to enable
system-wide perf_event pass this check?

I'll check code to make sure.

[1] 
http://lkml.kernel.org/r/20151022124142.GQ17308@twins.programming.kicks-ass.net

> +	if (attr->type == PERF_TYPE_RAW)
> +		return event;
> +
> +	if (attr->type == PERF_TYPE_HARDWARE)
> +		return event;
> +
> +	if (attr->type == PERF_TYPE_SOFTWARE &&
> +	    attr->config == PERF_COUNT_SW_BPF_OUTPUT)
> +		return event;
> +err:
> +	perf_event_release_kernel(event);
> +	return ERR_PTR(-EINVAL);
>   }
>   
>   static void perf_event_fd_array_put_ptr(void *ptr)
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 47febbe7998e..003df3887287 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -199,6 +199,11 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
>   	if (!event)
>   		return -ENOENT;
>   
> +	/* make sure event is local and doesn't have pmu::count */
> +	if (event->oncpu != smp_processor_id() ||
> +	    event->pmu->count)
> +		return -EINVAL;
> +
>   	/*
>   	 * we don't know if the function is run successfully by the
>   	 * return value. It can be judged in other places, such as
> @@ -207,7 +212,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
>   	return perf_event_read_local(event);
>   }
>   
> -const struct bpf_func_proto bpf_perf_event_read_proto = {
> +static const struct bpf_func_proto bpf_perf_event_read_proto = {
>   	.func		= bpf_perf_event_read,
>   	.gpl_only	= false,
>   	.ret_type	= RET_INTEGER,


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Oct. 23, 2015, 2:30 a.m. UTC | #2
On 10/22/15 7:21 PM, Wangnan (F) wrote:
>> +    if (attr->inherit)
>> +        goto err;
>> +
>
> Since Peter suggest it is pointless for a system-wide perf_event
> has inherit bit set [1], I think it should be safe to enable
> system-wide perf_event pass this check?

here we don't know whether it's system wide or not, so the check
is needed.
The patch is the fix that should have been there from day one.
We must be safe first and relax later.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wangnan (F) Oct. 23, 2015, 3:47 a.m. UTC | #3
On 2015/10/23 8:10, Alexei Starovoitov wrote:
> Fix safety checks for bpf_perf_event_read():
> - only non-inherited events can be added to perf_event_array map
>    (do this check statically at map insertion time)
> - dynamically check that event is local and !pmu->count
> Otherwise buggy bpf program can cause kernel splat.
>
> Also fix error path after perf_event_attrs()
> and remove redundant 'extern'.
>
> Fixes: 35578d798400 ("bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter")
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Tested-by: Wang Nan <wangnan0@huawei.com>


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Oct. 23, 2015, 12:03 p.m. UTC | #4
On Thu, Oct 22, 2015 at 05:10:14PM -0700, Alexei Starovoitov wrote:
> +++ b/kernel/trace/bpf_trace.c
> @@ -199,6 +199,11 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
>  	if (!event)
>  		return -ENOENT;
>  
> +	/* make sure event is local and doesn't have pmu::count */
> +	if (event->oncpu != smp_processor_id() ||
> +	    event->pmu->count)
> +		return -EINVAL;
> +
>  	/*
>  	 * we don't know if the function is run successfully by the
>  	 * return value. It can be judged in other places, such as

I might want to go turn that into a helper function to keep !perf code
from poking around in the event itself, but its ok for now I suppose.

> @@ -207,7 +212,7 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
>  	return perf_event_read_local(event);
>  }

So the bpf_perf_event_read() returns the count value, does this not also
mean that returning -EINVAL here is also 'wrong'?

I mean, sure an actual count value that high is unlikely, but its still
a broken interface.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Oct. 23, 2015, 2:42 p.m. UTC | #5
On 10/23/15 5:03 AM, Peter Zijlstra wrote:
> So the bpf_perf_event_read() returns the count value, does this not also
> mean that returning -EINVAL here is also 'wrong'?
>
> I mean, sure an actual count value that high is unlikely, but its still
> a broken interface.

Agree. that's not pretty interface. I wish I looked at it more carefully
when it was introduced. Now it's too late to change.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Oct. 23, 2015, 2:54 p.m. UTC | #6
On Fri, Oct 23, 2015 at 07:42:22AM -0700, Alexei Starovoitov wrote:
> On 10/23/15 5:03 AM, Peter Zijlstra wrote:
> >So the bpf_perf_event_read() returns the count value, does this not also
> >mean that returning -EINVAL here is also 'wrong'?
> >
> >I mean, sure an actual count value that high is unlikely, but its still
> >a broken interface.
> 
> Agree. that's not pretty interface. I wish I looked at it more carefully
> when it was introduced. Now it's too late to change.

Right; and I figure changing the function signature is not done because
the eBPF stuff is ABI? Unfortunate indeed.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ingo Molnar Oct. 25, 2015, 9:21 a.m. UTC | #7
* Alexei Starovoitov <ast@plumgrid.com> wrote:

> On 10/23/15 5:03 AM, Peter Zijlstra wrote:
> >So the bpf_perf_event_read() returns the count value, does this not also
> >mean that returning -EINVAL here is also 'wrong'?
> >
> >I mean, sure an actual count value that high is unlikely, but its still
> >a broken interface.
> 
> Agree. that's not pretty interface. I wish I looked at it more carefully
> when it was introduced. Now it's too late to change.

So I really, really think eBPF needs to have an easy to use mechanism to phase out 
old ABI components and introducing new (better) ones!

Then old crap can be de-emphasised and eventually removed, instead of having to 
live with crap forever ...

Thanks,

	Ingo
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Oct. 25, 2015, 4:23 p.m. UTC | #8
On 10/25/15 2:21 AM, Ingo Molnar wrote:
> Then old crap can be de-emphasised and eventually removed, instead of having to
> live with crap forever ...

strongly disagree. none of the helpers are 'crap'.
bpf_perf_event_read() muxes of -EINVAL into return value, but it's non
ambiguous to the program whether it got an error or real counter value.
So it's not pretty, but it's a reasonable trade off.
Properly written bpf programs will not be hitting the error path (which
is there for safety and protection against buggy programs) and will
consume return value without any extra checks.
bpf_perf_event_read() could have been done via passing a pointer to
stack where counter value can be stored, but that is much slower,
since program would need to init the stack and pass pointers while
helpers are not inlined, so the cost of return via stack is higher
than returning by value. In this case bpf_perf_event_read() can be hot,
so makes sense to optimize and sacrifice 'pretty' factor.

All existing helpers have use cases behind them and none overlap,
so not a single one can be 'deprecated'.
In general I don't think it's worth to make an exception in the kernel
that some interfaces are not ABI. That will give a bad impression on
the kernel overall. Either we have generic deprecation mechanism for
everything or none and my vote is for none.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Oct. 26, 2015, 12:32 p.m. UTC | #9
On Sun, Oct 25, 2015 at 09:23:36AM -0700, Alexei Starovoitov wrote:
> bpf_perf_event_read() muxes of -EINVAL into return value, but it's non
> ambiguous to the program whether it got an error or real counter value.

How can that be, the (u64)-EINVAL value is a valid counter value..
unlikely maybe, but still quite possible.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wangnan (F) Oct. 26, 2015, 12:54 p.m. UTC | #10
On 2015/10/26 20:32, Peter Zijlstra wrote:
> On Sun, Oct 25, 2015 at 09:23:36AM -0700, Alexei Starovoitov wrote:
>> bpf_perf_event_read() muxes of -EINVAL into return value, but it's non
>> ambiguous to the program whether it got an error or real counter value.
> How can that be, the (u64)-EINVAL value is a valid counter value..
> unlikely maybe, but still quite possible.
In our real usecase we simply treat return value larger than 
0x7fffffffffffffff
as error result. We can make it even larger, for example, to 
0xffffffffffff0000.
Resuling values can be pre-processed by a script to filter potential 
error result
out so it is not a very big problem for our real usecases.

For a better interface, I suggest

  u64 bpf_perf_event_read(bool *perror);

which still returns counter value through its return value but put error 
code
to stack. Then BPF program can pass NULL to the function if BPF problem
doesn't want to deal with error code.

Thank you.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Oct. 26, 2015, 9:28 p.m. UTC | #11
On 10/26/15 5:54 AM, Wangnan (F) wrote:
>
>
> On 2015/10/26 20:32, Peter Zijlstra wrote:
>> On Sun, Oct 25, 2015 at 09:23:36AM -0700, Alexei Starovoitov wrote:
>>> bpf_perf_event_read() muxes of -EINVAL into return value, but it's non
>>> ambiguous to the program whether it got an error or real counter value.
>> How can that be, the (u64)-EINVAL value is a valid counter value..
>> unlikely maybe, but still quite possible.
> In our real usecase we simply treat return value larger than
> 0x7fffffffffffffff
> as error result. We can make it even larger, for example, to
> 0xffffffffffff0000.

either above or write the program that index is valid, then you
don't need to check for errors.

> Resuling values can be pre-processed by a script to filter potential
> error result
> out so it is not a very big problem for our real usecases.
>
> For a better interface, I suggest
>
>   u64 bpf_perf_event_read(bool *perror);
>
> which still returns counter value through its return value but put error
> code
> to stack. Then BPF program can pass NULL to the function if BPF problem
> doesn't want to deal with error code.

no. we're not going to introduce another interface for this.
The current one is fine. Don't pass incorrect index and you won't see
einval. Returning ints or bools via stack is much slower.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Miller Oct. 27, 2015, 4:50 a.m. UTC | #12
From: Alexei Starovoitov <ast@plumgrid.com>
Date: Thu, 22 Oct 2015 17:10:14 -0700

> Fix safety checks for bpf_perf_event_read():
> - only non-inherited events can be added to perf_event_array map
>   (do this check statically at map insertion time)
> - dynamically check that event is local and !pmu->count
> Otherwise buggy bpf program can cause kernel splat.
> 
> Also fix error path after perf_event_attrs()
> and remove redundant 'extern'.
> 
> Fixes: 35578d798400 ("bpf: Implement function bpf_perf_event_read() that get the selected hardware PMU conuter")
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>

Applied, although my tendancy is to agree with the sentiment that you must
respect the entire universe of valid 64-bit counter values.  I do not buy
the arguments about values overlapping error codes being unlikely or not
worth worrying about.

Just FYI...
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e3a51b74e275..75718fa28260 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -194,7 +194,6 @@  extern const struct bpf_func_proto bpf_map_lookup_elem_proto;
 extern const struct bpf_func_proto bpf_map_update_elem_proto;
 extern const struct bpf_func_proto bpf_map_delete_elem_proto;
 
-extern const struct bpf_func_proto bpf_perf_event_read_proto;
 extern const struct bpf_func_proto bpf_get_prandom_u32_proto;
 extern const struct bpf_func_proto bpf_get_smp_processor_id_proto;
 extern const struct bpf_func_proto bpf_tail_call_proto;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index e3cfe46b074f..3f4c99e06c6b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -292,16 +292,23 @@  static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
 
 	attr = perf_event_attrs(event);
 	if (IS_ERR(attr))
-		return (void *)attr;
+		goto err;
 
-	if (attr->type != PERF_TYPE_RAW &&
-	    !(attr->type == PERF_TYPE_SOFTWARE &&
-	      attr->config == PERF_COUNT_SW_BPF_OUTPUT) &&
-	    attr->type != PERF_TYPE_HARDWARE) {
-		perf_event_release_kernel(event);
-		return ERR_PTR(-EINVAL);
-	}
-	return event;
+	if (attr->inherit)
+		goto err;
+
+	if (attr->type == PERF_TYPE_RAW)
+		return event;
+
+	if (attr->type == PERF_TYPE_HARDWARE)
+		return event;
+
+	if (attr->type == PERF_TYPE_SOFTWARE &&
+	    attr->config == PERF_COUNT_SW_BPF_OUTPUT)
+		return event;
+err:
+	perf_event_release_kernel(event);
+	return ERR_PTR(-EINVAL);
 }
 
 static void perf_event_fd_array_put_ptr(void *ptr)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 47febbe7998e..003df3887287 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -199,6 +199,11 @@  static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
 	if (!event)
 		return -ENOENT;
 
+	/* make sure event is local and doesn't have pmu::count */
+	if (event->oncpu != smp_processor_id() ||
+	    event->pmu->count)
+		return -EINVAL;
+
 	/*
 	 * we don't know if the function is run successfully by the
 	 * return value. It can be judged in other places, such as
@@ -207,7 +212,7 @@  static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
 	return perf_event_read_local(event);
 }
 
-const struct bpf_func_proto bpf_perf_event_read_proto = {
+static const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.func		= bpf_perf_event_read,
 	.gpl_only	= false,
 	.ret_type	= RET_INTEGER,