diff mbox

[net-next,2/3] bpf: introduce bpf_perf_event_output() helper

Message ID 1445396556-4854-3-git-send-email-ast@kernel.org
State Accepted, archived
Delegated to: David Miller
Headers show

Commit Message

Alexei Starovoitov Oct. 21, 2015, 3:02 a.m. UTC
This helper is used to send raw data from eBPF program into
special PERF_TYPE_SOFTWARE/PERF_COUNT_SW_BPF_OUTPUT perf_event.
User space needs to perf_event_open() it (either for one or all cpus) and
store FD into perf_event_array (similar to bpf_perf_event_read() helper)
before eBPF program can send data into it.

Today the programs triggered by kprobe collect the data and either store
it into the maps or print it via bpf_trace_printk(), where the latter is a debug
facility and not suitable for streaming the data. This new helper replaces
such bpf_trace_printk() usage and allows programs to have dedicated
channel into user space for post-processing of the raw data collected.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 include/uapi/linux/bpf.h        |   11 ++++++++++
 include/uapi/linux/perf_event.h |    1 +
 kernel/bpf/arraymap.c           |    2 ++
 kernel/bpf/verifier.c           |    3 ++-
 kernel/trace/bpf_trace.c        |   46 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 62 insertions(+), 1 deletion(-)

Comments

He Kuang Oct. 21, 2015, 10:01 a.m. UTC | #1
hi, Alexei

I've tested the sample in next patch and it works well. I think more work on
the perf side needs to be done for parsing PERF_COUNT_SW_BPF_OUTPUT event type,
are you working on that?

Thank you.

On 2015/10/21 11:02, Alexei Starovoitov wrote:
> This helper is used to send raw data from eBPF program into
> special PERF_TYPE_SOFTWARE/PERF_COUNT_SW_BPF_OUTPUT perf_event.
> User space needs to perf_event_open() it (either for one or all cpus) and
> store FD into perf_event_array (similar to bpf_perf_event_read() helper)
> before eBPF program can send data into it.
>
> Today the programs triggered by kprobe collect the data and either store
> it into the maps or print it via bpf_trace_printk() where latter is the debug
> facility and not suitable to stream the data. This new helper replaces
> such bpf_trace_printk() usage and allows programs to have dedicated
> channel into user space for post-processing of the raw data collected.
>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
>   include/uapi/linux/bpf.h        |   11 ++++++++++
>   include/uapi/linux/perf_event.h |    1 +
>   kernel/bpf/arraymap.c           |    2 ++
>   kernel/bpf/verifier.c           |    3 ++-
>   kernel/trace/bpf_trace.c        |   46 +++++++++++++++++++++++++++++++++++++++
>   5 files changed, 62 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 564f1f091991..2e032426cfb7 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -287,6 +287,17 @@ enum bpf_func_id {
>   	 * Return: realm if != 0
>   	 */
>   	BPF_FUNC_get_route_realm,
> +
> +	/**
> +	 * bpf_perf_event_output(ctx, map, index, data, size) - output perf raw sample
> +	 * @ctx: struct pt_regs*
> +	 * @map: pointer to perf_event_array map
> +	 * @index: index of event in the map
> +	 * @data: data on stack to be output as raw data
> +	 * @size: size of data
> +	 * Return: 0 on success
> +	 */
> +	BPF_FUNC_perf_event_output,
>   	__BPF_FUNC_MAX_ID,
>   };
>
> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 2881145cda86..d3c417615361 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -110,6 +110,7 @@ enum perf_sw_ids {
>   	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
>   	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
>   	PERF_COUNT_SW_DUMMY			= 9,
> +	PERF_COUNT_SW_BPF_OUTPUT		= 10,
>
>   	PERF_COUNT_SW_MAX,			/* non-ABI */
>   };
> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> index f2d9e698c753..e3cfe46b074f 100644
> --- a/kernel/bpf/arraymap.c
> +++ b/kernel/bpf/arraymap.c
> @@ -295,6 +295,8 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
>   		return (void *)attr;
>
>   	if (attr->type != PERF_TYPE_RAW &&
> +	    !(attr->type == PERF_TYPE_SOFTWARE &&
> +	      attr->config == PERF_COUNT_SW_BPF_OUTPUT) &&
>   	    attr->type != PERF_TYPE_HARDWARE) {
>   		perf_event_release_kernel(event);
>   		return ERR_PTR(-EINVAL);
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 1d6b97be79e1..b56cf51f8d42 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -245,6 +245,7 @@ static const struct {
>   } func_limit[] = {
>   	{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
>   	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
> +	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
>   };
>
>   static void print_verifier_state(struct verifier_env *env)
> @@ -910,7 +911,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
>   		 * don't allow any other map type to be passed into
>   		 * the special func;
>   		 */
> -		if (bool_map != bool_func)
> +		if (bool_func && bool_map != bool_func)
>   			return -EINVAL;
>   	}
>
> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
> index 0fe96c7c8803..47febbe7998e 100644
> --- a/kernel/trace/bpf_trace.c
> +++ b/kernel/trace/bpf_trace.c
> @@ -215,6 +215,50 @@ const struct bpf_func_proto bpf_perf_event_read_proto = {
>   	.arg2_type	= ARG_ANYTHING,
>   };
>
> +static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
> +{
> +	struct pt_regs *regs = (struct pt_regs *) (long) r1;
> +	struct bpf_map *map = (struct bpf_map *) (long) r2;
> +	struct bpf_array *array = container_of(map, struct bpf_array, map);
> +	void *data = (void *) (long) r4;
> +	struct perf_sample_data sample_data;
> +	struct perf_event *event;
> +	struct perf_raw_record raw = {
> +		.size = size,
> +		.data = data,
> +	};
> +
> +	if (unlikely(index >= array->map.max_entries))
> +		return -E2BIG;
> +
> +	event = (struct perf_event *)array->ptrs[index];
> +	if (unlikely(!event))
> +		return -ENOENT;
> +
> +	if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
> +		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
> +		return -EINVAL;
> +
> +	if (unlikely(event->oncpu != smp_processor_id()))
> +		return -EOPNOTSUPP;
> +
> +	perf_sample_data_init(&sample_data, 0, 0);
> +	sample_data.raw = &raw;
> +	perf_event_output(event, &sample_data, regs);
> +	return 0;
> +}
> +
> +static const struct bpf_func_proto bpf_perf_event_output_proto = {
> +	.func		= bpf_perf_event_output,
> +	.gpl_only	= false,
> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_CONST_MAP_PTR,
> +	.arg3_type	= ARG_ANYTHING,
> +	.arg4_type	= ARG_PTR_TO_STACK,
> +	.arg5_type	= ARG_CONST_STACK_SIZE,
> +};
> +
>   static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
>   {
>   	switch (func_id) {
> @@ -242,6 +286,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
>   		return &bpf_get_smp_processor_id_proto;
>   	case BPF_FUNC_perf_event_read:
>   		return &bpf_perf_event_read_proto;
> +	case BPF_FUNC_perf_event_output:
> +		return &bpf_perf_event_output_proto;
>   	default:
>   		return NULL;
>   	}
>

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wangnan (F) Oct. 21, 2015, 11:05 a.m. UTC | #2
On 2015/10/21 18:01, He Kuang wrote:
> hi, Alexei
>
> I've tested the sample in next patch and it works well. I think more 
> work on
> the perf side needs to be done for parsing PERF_COUNT_SW_BPF_OUTPUT 
> event type,
> are you working on that?
>
> Thank you.

We need to add something into parse-event.y/l to let it know the new
software event. We can do this simple task. However, as I guessed before,
perf is unable to print this new event, so some work needs to be done to
let 'perf report' and 'perf script' know about it.

One thing I can still remember is finding a way to inject format
information through that output so 'perf script' and
'perf data convert --to-ctf' can decode the raw data without extra work.
Do you remember that we discussed a solution which connects debuginfo
and output raw data using a builtin function in LLVM/clang? We already
have a clang/LLVM patch for it, but on the perf side I haven't started
working on it. I think we can make perf output raw data in hexadecimal
at first.

I'll do the above work and put the related patches at the end of my eBPF
patchset, because they shouldn't be merged until this patchset is upstreamed.

Thank you.

>
> On 2015/10/21 11:02, Alexei Starovoitov wrote:
>> This helper is used to send raw data from eBPF program into
>> special PERF_TYPE_SOFTWARE/PERF_COUNT_SW_BPF_OUTPUT perf_event.
>> User space needs to perf_event_open() it (either for one or all cpus) 
>> and
>> store FD into perf_event_array (similar to bpf_perf_event_read() helper)
>> before eBPF program can send data into it.
>>
>> Today the programs triggered by kprobe collect the data and either store
>> it into the maps or print it via bpf_trace_printk() where latter is 
>> the debug
>> facility and not suitable to stream the data. This new helper replaces
>> such bpf_trace_printk() usage and allows programs to have dedicated
>> channel into user space for post-processing of the raw data collected.
>>
>> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
>> ---
>>   include/uapi/linux/bpf.h        |   11 ++++++++++
>>   include/uapi/linux/perf_event.h |    1 +
>>   kernel/bpf/arraymap.c           |    2 ++
>>   kernel/bpf/verifier.c           |    3 ++-
>>   kernel/trace/bpf_trace.c        |   46 
>> +++++++++++++++++++++++++++++++++++++++
>>   5 files changed, 62 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 564f1f091991..2e032426cfb7 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -287,6 +287,17 @@ enum bpf_func_id {
>>        * Return: realm if != 0
>>        */
>>       BPF_FUNC_get_route_realm,
>> +
>> +    /**
>> +     * bpf_perf_event_output(ctx, map, index, data, size) - output 
>> perf raw sample
>> +     * @ctx: struct pt_regs*
>> +     * @map: pointer to perf_event_array map
>> +     * @index: index of event in the map
>> +     * @data: data on stack to be output as raw data
>> +     * @size: size of data
>> +     * Return: 0 on success
>> +     */
>> +    BPF_FUNC_perf_event_output,
>>       __BPF_FUNC_MAX_ID,
>>   };
>>
>> diff --git a/include/uapi/linux/perf_event.h 
>> b/include/uapi/linux/perf_event.h
>> index 2881145cda86..d3c417615361 100644
>> --- a/include/uapi/linux/perf_event.h
>> +++ b/include/uapi/linux/perf_event.h
>> @@ -110,6 +110,7 @@ enum perf_sw_ids {
>>       PERF_COUNT_SW_ALIGNMENT_FAULTS        = 7,
>>       PERF_COUNT_SW_EMULATION_FAULTS        = 8,
>>       PERF_COUNT_SW_DUMMY            = 9,
>> +    PERF_COUNT_SW_BPF_OUTPUT        = 10,
>>
>>       PERF_COUNT_SW_MAX,            /* non-ABI */
>>   };
>> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
>> index f2d9e698c753..e3cfe46b074f 100644
>> --- a/kernel/bpf/arraymap.c
>> +++ b/kernel/bpf/arraymap.c
>> @@ -295,6 +295,8 @@ static void *perf_event_fd_array_get_ptr(struct 
>> bpf_map *map, int fd)
>>           return (void *)attr;
>>
>>       if (attr->type != PERF_TYPE_RAW &&
>> +        !(attr->type == PERF_TYPE_SOFTWARE &&
>> +          attr->config == PERF_COUNT_SW_BPF_OUTPUT) &&
>>           attr->type != PERF_TYPE_HARDWARE) {
>>           perf_event_release_kernel(event);
>>           return ERR_PTR(-EINVAL);
>> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
>> index 1d6b97be79e1..b56cf51f8d42 100644
>> --- a/kernel/bpf/verifier.c
>> +++ b/kernel/bpf/verifier.c
>> @@ -245,6 +245,7 @@ static const struct {
>>   } func_limit[] = {
>>       {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
>>       {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
>> +    {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
>>   };
>>
>>   static void print_verifier_state(struct verifier_env *env)
>> @@ -910,7 +911,7 @@ static int check_map_func_compatibility(struct 
>> bpf_map *map, int func_id)
>>            * don't allow any other map type to be passed into
>>            * the special func;
>>            */
>> -        if (bool_map != bool_func)
>> +        if (bool_func && bool_map != bool_func)
>>               return -EINVAL;
>>       }
>>
>> diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
>> index 0fe96c7c8803..47febbe7998e 100644
>> --- a/kernel/trace/bpf_trace.c
>> +++ b/kernel/trace/bpf_trace.c
>> @@ -215,6 +215,50 @@ const struct bpf_func_proto 
>> bpf_perf_event_read_proto = {
>>       .arg2_type    = ARG_ANYTHING,
>>   };
>>
>> +static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, 
>> u64 size)
>> +{
>> +    struct pt_regs *regs = (struct pt_regs *) (long) r1;
>> +    struct bpf_map *map = (struct bpf_map *) (long) r2;
>> +    struct bpf_array *array = container_of(map, struct bpf_array, map);
>> +    void *data = (void *) (long) r4;
>> +    struct perf_sample_data sample_data;
>> +    struct perf_event *event;
>> +    struct perf_raw_record raw = {
>> +        .size = size,
>> +        .data = data,
>> +    };
>> +
>> +    if (unlikely(index >= array->map.max_entries))
>> +        return -E2BIG;
>> +
>> +    event = (struct perf_event *)array->ptrs[index];
>> +    if (unlikely(!event))
>> +        return -ENOENT;
>> +
>> +    if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
>> +             event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
>> +        return -EINVAL;
>> +
>> +    if (unlikely(event->oncpu != smp_processor_id()))
>> +        return -EOPNOTSUPP;
>> +
>> +    perf_sample_data_init(&sample_data, 0, 0);
>> +    sample_data.raw = &raw;
>> +    perf_event_output(event, &sample_data, regs);
>> +    return 0;
>> +}
>> +
>> +static const struct bpf_func_proto bpf_perf_event_output_proto = {
>> +    .func        = bpf_perf_event_output,
>> +    .gpl_only    = false,
>> +    .ret_type    = RET_INTEGER,
>> +    .arg1_type    = ARG_PTR_TO_CTX,
>> +    .arg2_type    = ARG_CONST_MAP_PTR,
>> +    .arg3_type    = ARG_ANYTHING,
>> +    .arg4_type    = ARG_PTR_TO_STACK,
>> +    .arg5_type    = ARG_CONST_STACK_SIZE,
>> +};
>> +
>>   static const struct bpf_func_proto *kprobe_prog_func_proto(enum 
>> bpf_func_id func_id)
>>   {
>>       switch (func_id) {
>> @@ -242,6 +286,8 @@ static const struct bpf_func_proto 
>> *kprobe_prog_func_proto(enum bpf_func_id func
>>           return &bpf_get_smp_processor_id_proto;
>>       case BPF_FUNC_perf_event_read:
>>           return &bpf_perf_event_read_proto;
>> +    case BPF_FUNC_perf_event_output:
>> +        return &bpf_perf_event_output_proto;
>>       default:
>>           return NULL;
>>       }
>>
>


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Oct. 21, 2015, 12:06 p.m. UTC | #3
On Tue, Oct 20, 2015 at 08:02:34PM -0700, Alexei Starovoitov wrote:
> This helper is used to send raw data from eBPF program into
> special PERF_TYPE_SOFTWARE/PERF_COUNT_SW_BPF_OUTPUT perf_event.
> User space needs to perf_event_open() it (either for one or all cpus) and
> store FD into perf_event_array (similar to bpf_perf_event_read() helper)
> before eBPF program can send data into it.
> 

> diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
> index 2881145cda86..d3c417615361 100644
> --- a/include/uapi/linux/perf_event.h
> +++ b/include/uapi/linux/perf_event.h
> @@ -110,6 +110,7 @@ enum perf_sw_ids {
>  	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
>  	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
>  	PERF_COUNT_SW_DUMMY			= 9,
> +	PERF_COUNT_SW_BPF_OUTPUT		= 10,
>  
>  	PERF_COUNT_SW_MAX,			/* non-ABI */
>  };

Do you really need the new type? Can't you use DUMMY for this?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Oct. 21, 2015, 5:26 p.m. UTC | #4
On 10/21/15 5:06 AM, Peter Zijlstra wrote:
>>   	PERF_COUNT_SW_DUMMY			= 9,
>> >+	PERF_COUNT_SW_BPF_OUTPUT		= 10,
>> >
>> >  	PERF_COUNT_SW_MAX,			/* non-ABI */
>> >  };
> Do you really need the new type? Can't you use DUMMY for this?

It works fine with dummy, but user space code looked odd when
it perf_event_open-s dummy events and starts reading them.
So I thought either to add alias:
PERF_COUNT_SW_BPF_OUTPUT = PERF_COUNT_SW_DUMMY
or
add new number like I did above.
New number is cheap and I saw that dummy used for probing,
so went with new number to disambiguate.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Oct. 21, 2015, 7:33 p.m. UTC | #5
On Wed, Oct 21, 2015 at 10:26:21AM -0700, Alexei Starovoitov wrote:
> On 10/21/15 5:06 AM, Peter Zijlstra wrote:
> >>  	PERF_COUNT_SW_DUMMY			= 9,
> >>>+	PERF_COUNT_SW_BPF_OUTPUT		= 10,
> >>>
> >>>  	PERF_COUNT_SW_MAX,			/* non-ABI */
> >>>  };
> >Do you really need the new type? Can't you use DUMMY for this?
> 
> It works fine with dummy, but user space code looked odd when
> it perf_event_open-s dummy events and starts reading them.
> So I thought either to add alias:
> PERF_COUNT_SW_BPF_OUTPUT = PERF_COUNT_SW_DUMMY
> or
> add new number like I did above.
> New number is cheap and I saw that dummy used for probing,
> so went with new number to disambiguate.

Ah, ok. It wasn't clear to me why you needed another number.

The SW numbers add some extra (unused) data size to the kernel as a
number of arrays are sized by them, but one more should not hurt (too)
much.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Oct. 21, 2015, 8:04 p.m. UTC | #6
On 10/21/15 4:05 AM, Wangnan (F) wrote:
> I think we can
> make perf output raw data in hexadecimal at first.

that would be my suggestion as well.
print it as hex first and when llvm type info is available you can teach
perf to be smarter.
We're working on support for this feature in iovisor/bcc
without requiring extra llvm work, since we can generate
python classes on the fly from C code of bpf program.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Oct. 22, 2015, 1:59 p.m. UTC | #7
On Tue, Oct 20, 2015 at 08:02:34PM -0700, Alexei Starovoitov wrote:
> +++ b/kernel/trace/bpf_trace.c
> @@ -215,6 +215,50 @@ const struct bpf_func_proto bpf_perf_event_read_proto = {
>  	.arg2_type	= ARG_ANYTHING,
>  };
>  
> +static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
> +{
> +	struct pt_regs *regs = (struct pt_regs *) (long) r1;
> +	struct bpf_map *map = (struct bpf_map *) (long) r2;
> +	struct bpf_array *array = container_of(map, struct bpf_array, map);
> +	void *data = (void *) (long) r4;
> +	struct perf_sample_data sample_data;
> +	struct perf_event *event;
> +	struct perf_raw_record raw = {
> +		.size = size,
> +		.data = data,
> +	};
> +
> +	if (unlikely(index >= array->map.max_entries))
> +		return -E2BIG;
> +
> +	event = (struct perf_event *)array->ptrs[index];
> +	if (unlikely(!event))
> +		return -ENOENT;
> +
> +	if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
> +		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
> +		return -EINVAL;
> +
> +	if (unlikely(event->oncpu != smp_processor_id()))
> +		return -EOPNOTSUPP;
> +
> +	perf_sample_data_init(&sample_data, 0, 0);
> +	sample_data.raw = &raw;
> +	perf_event_output(event, &sample_data, regs);
> +	return 0;
> +}

Note that this function also very much requires event to be local.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Oct. 22, 2015, 3:38 p.m. UTC | #8
On 10/22/15 6:59 AM, Peter Zijlstra wrote:
>> +	if (unlikely(event->oncpu != smp_processor_id()))
>> >+		return -EOPNOTSUPP;
>> >+
>> >+	perf_sample_data_init(&sample_data, 0, 0);
>> >+	sample_data.raw = &raw;
>> >+	perf_event_output(event, &sample_data, regs);
>> >+	return 0;
>> >+}
> Note that this function also very much requires event to be local.

correct. that's why there is a dynamic check above to prevent buggy
programs causing issues.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Oct. 23, 2015, 2:39 p.m. UTC | #9
On Tue, Oct 20, 2015 at 08:02:34PM -0700, Alexei Starovoitov wrote:
> +static const struct bpf_func_proto bpf_perf_event_output_proto = {
> +	.func		= bpf_perf_event_output,
> +	.gpl_only	= false,

Oh ?

> +	.ret_type	= RET_INTEGER,
> +	.arg1_type	= ARG_PTR_TO_CTX,
> +	.arg2_type	= ARG_CONST_MAP_PTR,
> +	.arg3_type	= ARG_ANYTHING,
> +	.arg4_type	= ARG_PTR_TO_STACK,
> +	.arg5_type	= ARG_CONST_STACK_SIZE,
> +};
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Oct. 23, 2015, 3:02 p.m. UTC | #10
On 10/23/15 7:39 AM, Peter Zijlstra wrote:
> On Tue, Oct 20, 2015 at 08:02:34PM -0700, Alexei Starovoitov wrote:
>> >+static const struct bpf_func_proto bpf_perf_event_output_proto = {
>> >+	.func		= bpf_perf_event_output,
>> >+	.gpl_only	= false,
> Oh ?

no particular reason. key helper bpf_probe_read() is gpl, so all
bpf for tracing progs have to be gpl.
If you feel strongly about it, I can change it.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Peter Zijlstra Oct. 23, 2015, 4:42 p.m. UTC | #11
On Fri, Oct 23, 2015 at 08:02:00AM -0700, Alexei Starovoitov wrote:
> On 10/23/15 7:39 AM, Peter Zijlstra wrote:
> >On Tue, Oct 20, 2015 at 08:02:34PM -0700, Alexei Starovoitov wrote:
> >>>+static const struct bpf_func_proto bpf_perf_event_output_proto = {
> >>>+	.func		= bpf_perf_event_output,
> >>>+	.gpl_only	= false,
> >Oh ?
> 
> no particular reason. key helper bpf_probe_read() is gpl, so all
> bpf for tracing progs have to be gpl.
> If you feel strongly about it, I can change it.

All the perf symbols are export GPL, so I suppose this should be true.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Oct. 23, 2015, 5:25 p.m. UTC | #12
On 10/23/15 9:42 AM, Peter Zijlstra wrote:
> On Fri, Oct 23, 2015 at 08:02:00AM -0700, Alexei Starovoitov wrote:
>> On 10/23/15 7:39 AM, Peter Zijlstra wrote:
>>> On Tue, Oct 20, 2015 at 08:02:34PM -0700, Alexei Starovoitov wrote:
>>>>> +static const struct bpf_func_proto bpf_perf_event_output_proto = {
>>>>> +	.func		= bpf_perf_event_output,
>>>>> +	.gpl_only	= false,
>>> Oh ?
>>
>> no particular reason. key helper bpf_probe_read() is gpl, so all
>> bpf for tracing progs have to be gpl.
>> If you feel strongly about it, I can change it.
>
> All the perf symbols are export GPL, so I suppose this should be true.

ok. will send a patch.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wangnan (F) Oct. 26, 2015, 1:46 a.m. UTC | #13
On 2015/10/24 1:25, Alexei Starovoitov wrote:
> On 10/23/15 9:42 AM, Peter Zijlstra wrote:
>> On Fri, Oct 23, 2015 at 08:02:00AM -0700, Alexei Starovoitov wrote:
>>> On 10/23/15 7:39 AM, Peter Zijlstra wrote:
>>>> On Tue, Oct 20, 2015 at 08:02:34PM -0700, Alexei Starovoitov wrote:
>>>>>> +static const struct bpf_func_proto bpf_perf_event_output_proto = {
>>>>>> +    .func        = bpf_perf_event_output,
>>>>>> +    .gpl_only    = false,
>>>> Oh ?
>>>
>>> no particular reason. key helper bpf_probe_read() is gpl, so all
>>> bpf for tracing progs have to be gpl.
>>> If you feel strongly about it, I can change it.
>>
>> All the perf symbols are export GPL, so I suppose this should be true.
>
> ok. will send a patch.
>

Can we (or have we already) set up some rules for licensing? Which part
should be GPL? Who has the responsibility to decide it?

Thank you.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Alexei Starovoitov Oct. 26, 2015, 10:27 a.m. UTC | #14
On 10/25/15 6:46 PM, Wangnan (F) wrote:
> Can we (or have we already) set up some rules for licensing? Which part
> should be GPL? Who has the responsibility to decide it?

in my mind the rules were set long ago. See my other email.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 564f1f091991..2e032426cfb7 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -287,6 +287,17 @@  enum bpf_func_id {
 	 * Return: realm if != 0
 	 */
 	BPF_FUNC_get_route_realm,
+
+	/**
+	 * bpf_perf_event_output(ctx, map, index, data, size) - output perf raw sample
+	 * @ctx: struct pt_regs*
+	 * @map: pointer to perf_event_array map
+	 * @index: index of event in the map
+	 * @data: data on stack to be output as raw data
+	 * @size: size of data
+	 * Return: 0 on success
+	 */
+	BPF_FUNC_perf_event_output,
 	__BPF_FUNC_MAX_ID,
 };
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 2881145cda86..d3c417615361 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -110,6 +110,7 @@  enum perf_sw_ids {
 	PERF_COUNT_SW_ALIGNMENT_FAULTS		= 7,
 	PERF_COUNT_SW_EMULATION_FAULTS		= 8,
 	PERF_COUNT_SW_DUMMY			= 9,
+	PERF_COUNT_SW_BPF_OUTPUT		= 10,
 
 	PERF_COUNT_SW_MAX,			/* non-ABI */
 };
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index f2d9e698c753..e3cfe46b074f 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -295,6 +295,8 @@  static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
 		return (void *)attr;
 
 	if (attr->type != PERF_TYPE_RAW &&
+	    !(attr->type == PERF_TYPE_SOFTWARE &&
+	      attr->config == PERF_COUNT_SW_BPF_OUTPUT) &&
 	    attr->type != PERF_TYPE_HARDWARE) {
 		perf_event_release_kernel(event);
 		return ERR_PTR(-EINVAL);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1d6b97be79e1..b56cf51f8d42 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -245,6 +245,7 @@  static const struct {
 } func_limit[] = {
 	{BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
 	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
+	{BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output},
 };
 
 static void print_verifier_state(struct verifier_env *env)
@@ -910,7 +911,7 @@  static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		 * don't allow any other map type to be passed into
 		 * the special func;
 		 */
-		if (bool_map != bool_func)
+		if (bool_func && bool_map != bool_func)
 			return -EINVAL;
 	}
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 0fe96c7c8803..47febbe7998e 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -215,6 +215,50 @@  const struct bpf_func_proto bpf_perf_event_read_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
+{
+	struct pt_regs *regs = (struct pt_regs *) (long) r1;
+	struct bpf_map *map = (struct bpf_map *) (long) r2;
+	struct bpf_array *array = container_of(map, struct bpf_array, map);
+	void *data = (void *) (long) r4;
+	struct perf_sample_data sample_data;
+	struct perf_event *event;
+	struct perf_raw_record raw = {
+		.size = size,
+		.data = data,
+	};
+
+	if (unlikely(index >= array->map.max_entries))
+		return -E2BIG;
+
+	event = (struct perf_event *)array->ptrs[index];
+	if (unlikely(!event))
+		return -ENOENT;
+
+	if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
+		     event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
+		return -EINVAL;
+
+	if (unlikely(event->oncpu != smp_processor_id()))
+		return -EOPNOTSUPP;
+
+	perf_sample_data_init(&sample_data, 0, 0);
+	sample_data.raw = &raw;
+	perf_event_output(event, &sample_data, regs);
+	return 0;
+}
+
+static const struct bpf_func_proto bpf_perf_event_output_proto = {
+	.func		= bpf_perf_event_output,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_PTR_TO_CTX,
+	.arg2_type	= ARG_CONST_MAP_PTR,
+	.arg3_type	= ARG_ANYTHING,
+	.arg4_type	= ARG_PTR_TO_STACK,
+	.arg5_type	= ARG_CONST_STACK_SIZE,
+};
+
 static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
 {
 	switch (func_id) {
@@ -242,6 +286,8 @@  static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
 		return &bpf_get_smp_processor_id_proto;
 	case BPF_FUNC_perf_event_read:
 		return &bpf_perf_event_read_proto;
+	case BPF_FUNC_perf_event_output:
+		return &bpf_perf_event_output_proto;
 	default:
 		return NULL;
 	}