
[RFC,1/2] perf: Add the flag sample_disable not to output data on samples

Message ID 1444640563-159175-2-git-send-email-xiakaixu@huawei.com
State RFC, archived
Delegated to: David Miller

Commit Message

Kaixu Xia Oct. 12, 2015, 9:02 a.m. UTC
In some scenarios we don't want to output trace data when sampling, in
order to reduce overhead. This patch adds the flag sample_disable to
implement this functionality. By setting this flag and integrating it
with eBPF, we can control the data output process and get only the
samples we are most interested in.

Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>
---
 include/linux/bpf.h        | 1 +
 include/linux/perf_event.h | 2 ++
 kernel/bpf/arraymap.c      | 5 +++++
 kernel/events/core.c       | 3 +++
 4 files changed, 11 insertions(+)
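
To illustrate the intended flow, here is a minimal sketch of a helper a
BPF program could call to flip the map-wide flag so that samples start
being emitted. The helper name bpf_perf_event_sample_enable() and its
prototype are assumptions for illustration only; the real control path
belongs to patch 2/2, which is not shown here.

	/* Hypothetical helper, not part of this patch: clear the map-wide
	 * perf_sample_disable flag so __perf_event_overflow() emits samples
	 * for events stored in this PERF_EVENT_ARRAY map.
	 */
	static u64 bpf_perf_event_sample_enable(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
	{
		struct bpf_map *map = (struct bpf_map *)(unsigned long) r1;

		atomic_set(&map->perf_sample_disable, 0);
		return 0;
	}

A matching _disable() helper would set the flag back to 1.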

Comments

Peter Zijlstra Oct. 12, 2015, 12:02 p.m. UTC | #1
On Mon, Oct 12, 2015 at 09:02:42AM +0000, Kaixu Xia wrote:
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -483,6 +483,8 @@ struct perf_event {
>  	perf_overflow_handler_t		overflow_handler;
>  	void				*overflow_handler_context;
>  
> +	atomic_t			*sample_disable;
> +
>  #ifdef CONFIG_EVENT_TRACING
>  	struct trace_event_call		*tp_event;
>  	struct event_filter		*filter;

> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index b11756f..f6ef45c 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event,
>  		irq_work_queue(&event->pending);
>  	}
>  
> +	if ((event->sample_disable) && atomic_read(event->sample_disable))
> +		return ret;
> +
>  	if (event->overflow_handler)
>  		event->overflow_handler(event, data, regs);
>  	else

Try and guarantee sample_disable lives in the same cacheline as
overflow_handler.

I think we should at the very least replace the kzalloc() currently used
with a cacheline aligned alloc, and check the structure layout to verify
these two do in fact share a cacheline.
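
A minimal sketch of what a cacheline-aligned allocation could look like;
the cache name, the init hook and the exact call site are assumptions
(perf_event_alloc() currently uses a plain kzalloc()):

	/* Sketch only: back struct perf_event with a cacheline-aligned slab
	 * cache instead of kzalloc(), so that field placement within the
	 * structure translates directly into cacheline sharing.
	 */
	static struct kmem_cache *perf_event_cachep;

	static int __init perf_event_cache_init(void)
	{
		perf_event_cachep = kmem_cache_create("perf_event",
						      sizeof(struct perf_event), 0,
						      SLAB_HWCACHE_ALIGN | SLAB_PANIC,
						      NULL);
		return 0;
	}

	/* in perf_event_alloc(), instead of kzalloc(): */
	event = kmem_cache_zalloc(perf_event_cachep, GFP_KERNEL);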
--
Wangnan (F) Oct. 12, 2015, 12:05 p.m. UTC | #2
On 2015/10/12 20:02, Peter Zijlstra wrote:
> On Mon, Oct 12, 2015 at 09:02:42AM +0000, Kaixu Xia wrote:
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -483,6 +483,8 @@ struct perf_event {
>>   	perf_overflow_handler_t		overflow_handler;
>>   	void				*overflow_handler_context;
>>   
>> +	atomic_t			*sample_disable;
>> +
>>   #ifdef CONFIG_EVENT_TRACING
>>   	struct trace_event_call		*tp_event;
>>   	struct event_filter		*filter;
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index b11756f..f6ef45c 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event,
>>   		irq_work_queue(&event->pending);
>>   	}
>>   
>> +	if ((event->sample_disable) && atomic_read(event->sample_disable))
>> +		return ret;
>> +
>>   	if (event->overflow_handler)
>>   		event->overflow_handler(event, data, regs);
>>   	else
> Try and guarantee sample_disable lives in the same cacheline as
> overflow_handler.

Could you please explain why we need them to be in the same cacheline?

Thank you.

> I think we should at the very least replace the kzalloc() currently used
> with a cacheline aligned alloc, and check the structure layout to verify
> these two do in fact share a cacheline.


--
Peter Zijlstra Oct. 12, 2015, 12:12 p.m. UTC | #3
On Mon, Oct 12, 2015 at 08:05:20PM +0800, Wangnan (F) wrote:
> 
> 
> On 2015/10/12 20:02, Peter Zijlstra wrote:
> >On Mon, Oct 12, 2015 at 09:02:42AM +0000, Kaixu Xia wrote:
> >>--- a/include/linux/perf_event.h
> >>+++ b/include/linux/perf_event.h
> >>@@ -483,6 +483,8 @@ struct perf_event {
> >>  	perf_overflow_handler_t		overflow_handler;
> >>  	void				*overflow_handler_context;
> >>+	atomic_t			*sample_disable;
> >>+
> >>  #ifdef CONFIG_EVENT_TRACING
> >>  	struct trace_event_call		*tp_event;
> >>  	struct event_filter		*filter;
> >>diff --git a/kernel/events/core.c b/kernel/events/core.c
> >>index b11756f..f6ef45c 100644
> >>--- a/kernel/events/core.c
> >>+++ b/kernel/events/core.c
> >>@@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event,
> >>  		irq_work_queue(&event->pending);
> >>  	}
> >>+	if ((event->sample_disable) && atomic_read(event->sample_disable))
> >>+		return ret;
> >>+
> >>  	if (event->overflow_handler)
> >>  		event->overflow_handler(event, data, regs);
> >>  	else
> >Try and guarantee sample_disable lives in the same cacheline as
> >overflow_handler.
> 
> Could you please explain why we need them to be in a same cacheline?

Because otherwise you've just added a cacheline miss to this relatively
hot path.
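
One hedged way to verify the layout, once the allocation itself is
cacheline aligned, is a compile-time assertion placed in any perf init
function; alternatively, pahole -C perf_event vmlinux prints the field
offsets and cacheline boundaries:

	/* Sketch: fail the build if the two fields ever land in different
	 * cachelines of struct perf_event.
	 */
	BUILD_BUG_ON(offsetof(struct perf_event, sample_disable) / L1_CACHE_BYTES !=
		     offsetof(struct perf_event, overflow_handler) / L1_CACHE_BYTES);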
--
kernel test robot Oct. 12, 2015, 2:14 p.m. UTC | #4
Hi Kaixu,

[auto build test ERROR on tip/perf/core -- if it's an inappropriate base, please suggest rules for selecting a more suitable base]

url:    https://github.com/0day-ci/linux/commits/Kaixu-Xia/bpf-enable-disable-events-stored-in-PERF_EVENT_ARRAY-maps-trace-data-output-when-perf-sampling/20151012-170616
config: m68k-allyesconfig (attached as .config)
reproduce:
        wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        make.cross ARCH=m68k 

All errors (new ones prefixed by >>):

   kernel/bpf/arraymap.c: In function 'perf_event_fd_array_get_ptr':
>> kernel/bpf/arraymap.c:305:7: error: 'struct perf_event' has no member named 'sample_disable'
     event->sample_disable = &map->perf_sample_disable;
          ^

vim +305 kernel/bpf/arraymap.c

   299		if (attr->type != PERF_TYPE_RAW &&
   300		    attr->type != PERF_TYPE_HARDWARE) {
   301			perf_event_release_kernel(event);
   302			return ERR_PTR(-EINVAL);
   303		}
   304	
 > 305		event->sample_disable = &map->perf_sample_disable;
   306		return event;
   307	}
   308	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation
Alexei Starovoitov Oct. 12, 2015, 7:20 p.m. UTC | #5
On 10/12/15 2:02 AM, Kaixu Xia wrote:
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index f57d7fe..25e073d 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -39,6 +39,7 @@ struct bpf_map {
>   	u32 max_entries;
>   	const struct bpf_map_ops *ops;
>   	struct work_struct work;
> +	atomic_t perf_sample_disable;
>   };
>
>   struct bpf_map_type_list {
> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
> index 092a0e8..0606d1d 100644
> --- a/include/linux/perf_event.h
> +++ b/include/linux/perf_event.h
> @@ -483,6 +483,8 @@ struct perf_event {
>   	perf_overflow_handler_t		overflow_handler;
>   	void				*overflow_handler_context;
>
> +	atomic_t			*sample_disable;

This looks fragile and unnecessary.
Why add such a field to the generic bpf_map and carry its pointer into
perf_event? A single extra field in perf_event would have been enough.
Even better is to avoid adding any fields.
There is already event->state; why not use that?
Are the proper perf_event_enable/disable so heavy that another
mechanism is needed? cpu_function_call is probably too much to do
from a bpf program, but that can be simplified?
Based on the use case from the cover letter, it sounds like you want
something like soft_disable?
Then extending event->state would make the most sense.
Also consider the case of re-entrant event enable/disable.
So inc/dec of a flag may be needed?
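
Something along the lines of the inc/dec idea, as a hedged sketch; the
soft_disable_count field and the helper names are assumptions, not
existing code:

	/* Re-entrant soft disable/enable: a count instead of a boolean, so
	 * the event stays soft-disabled until every disable has been paired
	 * with an enable.
	 */
	static void perf_event_soft_disable(struct perf_event *event)
	{
		atomic_inc(&event->soft_disable_count);
	}

	static void perf_event_soft_enable(struct perf_event *event)
	{
		atomic_dec_if_positive(&event->soft_disable_count);
	}

	/* and in __perf_event_overflow(): */
	if (atomic_read(&event->soft_disable_count))
		return ret;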

--
Kaixu Xia Oct. 13, 2015, 2:30 a.m. UTC | #6
On 2015/10/13 3:20, Alexei Starovoitov wrote:
> On 10/12/15 2:02 AM, Kaixu Xia wrote:
>> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
>> index f57d7fe..25e073d 100644
>> --- a/include/linux/bpf.h
>> +++ b/include/linux/bpf.h
>> @@ -39,6 +39,7 @@ struct bpf_map {
>>       u32 max_entries;
>>       const struct bpf_map_ops *ops;
>>       struct work_struct work;
>> +    atomic_t perf_sample_disable;
>>   };
>>
>>   struct bpf_map_type_list {
>> diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
>> index 092a0e8..0606d1d 100644
>> --- a/include/linux/perf_event.h
>> +++ b/include/linux/perf_event.h
>> @@ -483,6 +483,8 @@ struct perf_event {
>>       perf_overflow_handler_t        overflow_handler;
>>       void                *overflow_handler_context;
>>
>> +    atomic_t            *sample_disable;
> 
> this looks fragile and unnecessary.
> Why add such field to generic bpf_map and carry its pointer into perf_event?
> Single extra field in perf_event would have been enough.
> Even better is to avoid adding any fields.
> There is already event->state why not to use that?
> The proper perf_event_enable/disable are so heavy that another
> mechanism needed? cpu_function_call is probably too much to do
> from bpf program, but that can be simplified?
> Based on the use case from cover letter, sounds like you want
> something like soft_disable?
> Then extending event->state would make the most sense.
> Also consider the case of re-entrant event enable/disable.
> So inc/dec of a flag may be needed?

Thanks for your comments!
I've tried perf_event_enable/disable, but there is a warning caused
by cpu_function_call. The main reason is as follows:
 int smp_call_function_single(...)
 {
	...
	WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
		     && !oops_in_progress);
	...
}
So I added the extra atomic flag field in order to avoid this problem.


--
Alexei Starovoitov Oct. 13, 2015, 3:10 a.m. UTC | #7
On 10/12/15 7:30 PM, xiakaixu wrote:
>> The proper perf_event_enable/disable are so heavy that another
>> >mechanism needed? cpu_function_call is probably too much to do
>> >from bpf program, but that can be simplified?
>> >Based on the use case from cover letter, sounds like you want
>> >something like soft_disable?
>> >Then extending event->state would make the most sense.
>> >Also consider the case of re-entrant event enable/disable.
>> >So inc/dec of a flag may be needed?
> Thanks for your comments!
> I've tried perf_event_enable/disable, but there is a warning caused
> by cpu_function_call. The main reason as follows,
>   int smp_call_function_single(...)
>   {
> 	...
> 	WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
> 		     && !oops_in_progress);

Of course; that's what I meant by 'cpu_function_call is too much to do
from a bpf program'. In this case it's running out of a kprobe with irqs
disabled, so you hit the warning, but even if it were a regular
tracepoint, doing an IPI from bpf would be too much. All bpf helpers
must be deterministic, without such side effects.

> So I added the extra atomic flag filed in order to avoid this problem.

that's a hammer approach. There are other ways to do it, like:
- extend event->state with this soft_disable-like functionality
  (Also consider the case of re-entrant event enable/disable.
   inc/dec may be needed)
- or tap into event->attr.sample_period
   maybe it can be temporarily set to zero to indicate soft_disabled.
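
A rough illustration of that second idea, ignoring freq-based events and
every other user of attr.sample_period that a real implementation would
have to audit (the saved_sample_period field and helper names are
assumptions):

	/* Sketch: use a zero sample_period to mean "soft disabled"; the
	 * overflow path would then skip output while the period is zero.
	 */
	static void perf_event_soft_disable(struct perf_event *event)
	{
		event->saved_sample_period = event->attr.sample_period;
		event->attr.sample_period = 0;
	}

	static void perf_event_soft_enable(struct perf_event *event)
	{
		event->attr.sample_period = event->saved_sample_period;
	}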

--
Sergei Shtylyov Oct. 13, 2015, noon UTC | #8
Hello.

On 10/12/2015 12:02 PM, Kaixu Xia wrote:

> In some scenarios we don't want to output trace data when sampling
> to reduce overhead. This patch adds the flag sample_disable to
> implement this function. By setting this flag and integrating with
> ebpf, we can control the data output process and get the samples we
> are most interested in.
>
> Signed-off-by: Kaixu Xia <xiakaixu@huawei.com>

[...]
> diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
> index 29ace10..4ae82c9 100644
> --- a/kernel/bpf/arraymap.c
> +++ b/kernel/bpf/arraymap.c
[...]
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index b11756f..f6ef45c 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -6337,6 +6337,9 @@ static int __perf_event_overflow(struct perf_event *event,
>   		irq_work_queue(&event->pending);
>   	}
>
> +	if ((event->sample_disable) && atomic_read(event->sample_disable))

    Inner parens not needed at all.

[...]

MBR, Sergei

--

Patch

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f57d7fe..25e073d 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -39,6 +39,7 @@  struct bpf_map {
 	u32 max_entries;
 	const struct bpf_map_ops *ops;
 	struct work_struct work;
+	atomic_t perf_sample_disable;
 };
 
 struct bpf_map_type_list {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 092a0e8..0606d1d 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -483,6 +483,8 @@  struct perf_event {
 	perf_overflow_handler_t		overflow_handler;
 	void				*overflow_handler_context;
 
+	atomic_t			*sample_disable;
+
 #ifdef CONFIG_EVENT_TRACING
 	struct trace_event_call		*tp_event;
 	struct event_filter		*filter;
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 29ace10..4ae82c9 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -51,6 +51,9 @@  static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
 	array->elem_size = elem_size;
 
+	if (attr->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY)
+		atomic_set(&array->map.perf_sample_disable, 1);
+
 	return &array->map;
 }
 
@@ -298,6 +301,8 @@  static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
 		perf_event_release_kernel(event);
 		return ERR_PTR(-EINVAL);
 	}
+
+	event->sample_disable = &map->perf_sample_disable;
 	return event;
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b11756f..f6ef45c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6337,6 +6337,9 @@  static int __perf_event_overflow(struct perf_event *event,
 		irq_work_queue(&event->pending);
 	}
 
+	if ((event->sample_disable) && atomic_read(event->sample_disable))
+		return ret;
+
 	if (event->overflow_handler)
 		event->overflow_handler(event, data, regs);
 	else