Message ID | 20150806121029.GQ19282@twins.programming.kicks-ass.net (mailing list archive) |
---|---|
State | Not Applicable |
Headers | show |
Peter Zijlstra [peterz@infradead.org] wrote: | On Sun, Jul 26, 2015 at 10:40:37PM -0700, Sukadev Bhattiprolu wrote: | > @@ -3743,7 +3762,13 @@ static u64 perf_event_aggregate(struct perf_event *event, u64 *enabled, | > lockdep_assert_held(&event->child_mutex); | > | > list_for_each_entry(child, &event->child_list, child_list) { | > +#if 0 | > + /* | > + * TODO: Do we need this read() for group events on PMUs that | > + * don't implement PERF_PMU_TXN_READ transactions? | > + */ | > (void)perf_event_read(child, false); | > +#endif | > total += perf_event_count(child); | > *enabled += child->total_time_enabled; | > *running += child->total_time_running; | | Aw gawd, I've been an idiot!! | | I just realized this is a _CHILD_ loop, not a _SIBLING_ loop !! | | We need to flip the loops in perf_read_group(), find attached two | patches that go on top of 1,2,4. | | After this you can add the perf_event_read() return value (just fold | patches 6,8) after which you can do patch 10 (which has a broken | Subject fwiw). Thanks for the patches. I am building and testing, but have a question on the second patch below: <snip> | Subject: perf: Invert perf_read_group() loops | From: Peter Zijlstra <peterz@infradead.org> | Date: Thu Aug 6 13:41:13 CEST 2015 | | In order to enable the use of perf_event_read(.group = true), we need | to invert the sibling-child loop nesting of perf_read_group(). | | Currently we iterate the child list for each sibling, this precludes | using group reads. Flip things around so we iterate each group for | each child. 
| | Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> | --- | kernel/events/core.c | 84 ++++++++++++++++++++++++++++++++------------------- | 1 file changed, 54 insertions(+), 30 deletions(-) | | --- a/kernel/events/core.c | +++ b/kernel/events/core.c | @@ -3809,50 +3809,74 @@ u64 perf_event_read_value(struct perf_ev | } | EXPORT_SYMBOL_GPL(perf_event_read_value); | | -static int perf_read_group(struct perf_event *event, | - u64 read_format, char __user *buf) | +static void __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) | { | - struct perf_event *leader = event->group_leader, *sub; | - struct perf_event_context *ctx = leader->ctx; | - int n = 0, size = 0, ret; | - u64 count, enabled, running; | - u64 values[5]; | + struct perf_event *sub; | + int n = 1; /* skip @nr */ This n = 1 is to skip over the values[0] = 1 + nr_siblings in the caller. Anyway, in __perf_read_group_add() we always start with n = 1, however ... | | - lockdep_assert_held(&ctx->mutex); | + perf_event_read(leader, true); | + | + /* | + * Since we co-schedule groups, {enabled,running} times of siblings | + * will be identical to those of the leader, so we only publish one | + * set. | + */ | + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { | + values[n++] += leader->total_time_enabled + | + atomic64_read(leader->child_total_time_enabled); | + } | | - count = perf_event_read_value(leader, &enabled, &running); | + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { | + values[n++] += leader->total_time_running + | + atomic64_read(leader->child_total_time_running); | + } | | - values[n++] = 1 + leader->nr_siblings; | - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) | - values[n++] = enabled; | - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) | - values[n++] = running; | - values[n++] = count; | + /* | + * Write {count,id} tuples for every sibling. 
| + */ | + values[n++] += perf_event_count(leader); | if (read_format & PERF_FORMAT_ID) | values[n++] = primary_event_id(leader); | | - size = n * sizeof(u64); | + list_for_each_entry(sub, &leader->sibling_list, group_entry) { | + values[n++] += perf_event_count(sub); | + if (read_format & PERF_FORMAT_ID) | + values[n++] = primary_event_id(sub); | + } | +} | | - if (copy_to_user(buf, values, size)) | - return -EFAULT; | +static int perf_read_group(struct perf_event *event, | + u64 read_format, char __user *buf) | +{ | + struct perf_event *leader = event->group_leader, *child; | + struct perf_event_context *ctx = leader->ctx; | + int ret = leader->read_size; | + u64 *values; | | - ret = size; | + lockdep_assert_held(&ctx->mutex); | | - list_for_each_entry(sub, &leader->sibling_list, group_entry) { | - n = 0; | + values = kzalloc(event->read_size); | + if (!values) | + return -ENOMEM; | | - values[n++] = perf_event_read_value(sub, &enabled, &running); | - if (read_format & PERF_FORMAT_ID) | - values[n++] = primary_event_id(sub); | + values[0] = 1 + leader->nr_siblings; | | - size = n * sizeof(u64); | + /* | + * By locking the child_mutex of the leader we effectively | + * lock the child list of all siblings.. XXX explain how. | + */ | + mutex_lock(&leader->child_mutex); | | - if (copy_to_user(buf + ret, values, size)) { | - return -EFAULT; | - } | + __perf_read_group_add(leader, read_format, values); ... we don't copy_to_user() here, | + list_for_each_entry(child, &leader->child_list, child_list) | + __perf_read_group_add(child, read_format, values); so won't we overwrite the values[], if we always start at n = 1 in __perf_read_group_add()? | | - ret += size; | - } | + mutex_unlock(&leader->child_mutex); | + | + if (copy_to_user(buf, values, event->read_size)) | + ret = -EFAULT; | + | + kfree(values); | | return ret; | }
On Tue, Aug 11, 2015 at 09:14:00PM -0700, Sukadev Bhattiprolu wrote: > | +static void __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) > | { > | + struct perf_event *sub; > | + int n = 1; /* skip @nr */ > > This n = 1 is to skip over the values[0] = 1 + nr_siblings in the > caller. > > Anyway, in __perf_read_group_add() we always start with n = 1, however > ... > | > | + perf_event_read(leader, true); > | + > | + /* > | + * Since we co-schedule groups, {enabled,running} times of siblings > | + * will be identical to those of the leader, so we only publish one > | + * set. > | + */ > | + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { > | + values[n++] += leader->total_time_enabled + > | + atomic64_read(leader->child_total_time_enabled); Note how this is an in-place addition, > | + } > | > | + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { > | + values[n++] += leader->total_time_running + > | + atomic64_read(leader->child_total_time_running); and here, > | + } > | > | + /* > | + * Write {count,id} tuples for every sibling. > | + */ > | + values[n++] += perf_event_count(leader); and here, > | if (read_format & PERF_FORMAT_ID) > | values[n++] = primary_event_id(leader); and this will always assign the same value. 
> | + list_for_each_entry(sub, &leader->sibling_list, group_entry) { > | + values[n++] += perf_event_count(sub); > | + if (read_format & PERF_FORMAT_ID) > | + values[n++] = primary_event_id(sub); Same for these, therefore, > | + } > | +} > | > | +static int perf_read_group(struct perf_event *event, > | + u64 read_format, char __user *buf) > | +{ > | + struct perf_event *leader = event->group_leader, *child; > | + struct perf_event_context *ctx = leader->ctx; > | + int ret = leader->read_size; > | + u64 *values; > | > | + lockdep_assert_held(&ctx->mutex); > | > | + values = kzalloc(event->read_size); > | + if (!values) > | + return -ENOMEM; > | > | + values[0] = 1 + leader->nr_siblings; > | > | + /* > | + * By locking the child_mutex of the leader we effectively > | + * lock the child list of all siblings.. XXX explain how. > | + */ > | + mutex_lock(&leader->child_mutex); > | > | + __perf_read_group_add(leader, read_format, values); > > ... we don't copy_to_user() here, > > | + list_for_each_entry(child, &leader->child_list, child_list) > | + __perf_read_group_add(child, read_format, values); > > so won't we overwrite the values[], if we always start at n = 1 > in __perf_read_group_add()? yes and no, we have to re-iterate the same values for each child as they all have the same group, but we add the time and count fields, we do not overwrite. The _add() suffix was supposed to be a hint ;-) > | + mutex_unlock(&leader->child_mutex); > | + > | + if (copy_to_user(buf, values, event->read_size)) > | + ret = -EFAULT; > | + > | + kfree(values); > | > | return ret; > | } Where previously we would iterate the group and for each member iterate/sum all the child values together before copying the value out, we now, because we need to read groups together, need to first iterate the child list and sum whole groups.
Peter Zijlstra [peterz@infradead.org] wrote: | On Tue, Aug 11, 2015 at 09:14:00PM -0700, Sukadev Bhattiprolu wrote: | > | +static void __perf_read_group_add(struct perf_event *leader, u64 read_format, u64 *values) | > | { | > | + struct perf_event *sub; | > | + int n = 1; /* skip @nr */ | > | > This n = 1 is to skip over the values[0] = 1 + nr_siblings in the | > caller. | > | > Anyway, in __perf_read_group_add() we always start with n = 1, however | > ... | > | | > | + perf_event_read(leader, true); | > | + | > | + /* | > | + * Since we co-schedule groups, {enabled,running} times of siblings | > | + * will be identical to those of the leader, so we only publish one | > | + * set. | > | + */ | > | + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { | > | + values[n++] += leader->total_time_enabled + | > | + atomic64_read(leader->child_total_time_enabled); | | Note how this is an in-place addition, Ah, yes, Sorry I missed that. It makes sense now and my tests seem to be running fine. | | > | + } | > | | > | + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { | > | + values[n++] += leader->total_time_running + | > | + atomic64_read(leader->child_total_time_running); | | and here, | | > | + } | > | | > | + /* | > | + * Write {count,id} tuples for every sibling. | > | + */ | > | + values[n++] += perf_event_count(leader); | | and here, | | | > | if (read_format & PERF_FORMAT_ID) | > | values[n++] = primary_event_id(leader); | | and this will always assign the same value. 
| | > | + list_for_each_entry(sub, &leader->sibling_list, group_entry) { | > | + values[n++] += perf_event_count(sub); | > | + if (read_format & PERF_FORMAT_ID) | > | + values[n++] = primary_event_id(sub); | | Same for these, therefore, | | > | + } | > | +} | > | | > | +static int perf_read_group(struct perf_event *event, | > | + u64 read_format, char __user *buf) | > | +{ | > | + struct perf_event *leader = event->group_leader, *child; | > | + struct perf_event_context *ctx = leader->ctx; | > | + int ret = leader->read_size; One other question, We return leader->read_size but allocate/copy_to_user the sibling's event->read_size. We consistently use read_format from the 'event' being read, rather than its 'group_leader', so we are ok in terms of what we copy into values[] for each event in the group. But, can the leader's read_format (and hence its read_size) differ from its sibling's read_size? If so, in the current code, we return the event's read_size but in the new code, we return the leader's read_size. | > | + u64 *values; | > | | > | + lockdep_assert_held(&ctx->mutex); | > | | > | + values = kzalloc(event->read_size); | > | + if (!values) | > | + return -ENOMEM; | > | | > | + values[0] = 1 + leader->nr_siblings; | > | | > | + /* | > | + * By locking the child_mutex of the leader we effectively | > | + * lock the child list of all siblings.. XXX explain how. | > | + */ | > | + mutex_lock(&leader->child_mutex); | > | | > | + __perf_read_group_add(leader, read_format, values); | > | > ... we don't copy_to_user() here, | > | > | + list_for_each_entry(child, &leader->child_list, child_list) | > | + __perf_read_group_add(child, read_format, values); | > | > so won't we overwrite the values[], if we always start at n = 1 | > in __perf_read_group_add()? | | yes and no, we have to re-iterate the same values for each child as they | all have the same group, but we add the time and count fields, we do not | overwrite. 
The _add() suffix was supposed to be a hint ;-) | | > | + mutex_unlock(&leader->child_mutex); | > | + | > | + if (copy_to_user(buf, values, event->read_size)) | > | + ret = -EFAULT; | > | + | > | + kfree(values); | > | | > | return ret; | > | } | | Where previously we would iterate the group and for each member | iterate/sum all the child values together before copying the value out, | we now, because we need to read groups together, need to first iterate | the child list and sum whole groups.
On Thu, Aug 13, 2015 at 01:04:28PM -0700, Sukadev Bhattiprolu wrote: > | > | +static int perf_read_group(struct perf_event *event, > | > | + u64 read_format, char __user *buf) > | > | +{ > | > | + struct perf_event *leader = event->group_leader, *child; > | > | + struct perf_event_context *ctx = leader->ctx; > | > | + int ret = leader->read_size; > One other question, We return leader->read_size but allocate/copy_to_user > the sibling's event->read_size. We consistently use read_format from the > 'event' being read, rather than its 'group_leader', so we are ok in terms > of what we copy into values[] for each event in the group. > > But, can the leader's read_format (and hence its read_size) differ from > its sibling's read_size? If so, in the current code, we return the event's > read_size but in the new code, we return the leader's read_size. Hmm, good spotting that. I'm fairly sure I didn't do that on purpose. I think we should use event->read_size there too and have the lot consistent. I don't think we require read_format to be uniform across siblings.
--- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3184,12 +3184,18 @@ void perf_event_exec(void) rcu_read_unlock(); } +struct perf_read_data { + struct perf_event *event; + bool group; +}; + /* * Cross CPU call to read the hardware event */ static void __perf_event_read(void *info) { - struct perf_event *event = info; + struct perf_read_data *data = info; + struct perf_event *sub, *event = data->event; struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); @@ -3208,9 +3214,21 @@ static void __perf_event_read(void *info update_context_time(ctx); update_cgrp_time_from_event(event); } + update_event_times(event); if (event->state == PERF_EVENT_STATE_ACTIVE) event->pmu->read(event); + + if (!data->group) + goto unlock; + + list_for_each_entry(sub, &event->sibling_list, group_entry) { + update_event_times(sub); + if (sub->state == PERF_EVENT_STATE_ACTIVE) + sub->pmu->read(sub); + } + +unlock: raw_spin_unlock(&ctx->lock); } @@ -3222,15 +3240,19 @@ static inline u64 perf_event_count(struc return __perf_event_count(event); } -static void perf_event_read(struct perf_event *event) +static void perf_event_read(struct perf_event *event, bool group) { /* * If event is enabled and currently active on a CPU, update the * value in the event structure: */ if (event->state == PERF_EVENT_STATE_ACTIVE) { + struct perf_read_data data = { + .event = event, + .group = group, + }; smp_call_function_single(event->oncpu, - __perf_event_read, event, 1); + __perf_event_read, &data, 1); } else if (event->state == PERF_EVENT_STATE_INACTIVE) { struct perf_event_context *ctx = event->ctx; unsigned long flags; @@ -3245,7 +3267,10 @@ static void perf_event_read(struct perf_ update_context_time(ctx); update_cgrp_time_from_event(event); } - update_event_times(event); + if (group) + update_group_times(event); + else + update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } } @@ -3764,7 +3789,7 @@ u64 
perf_event_read_value(struct perf_ev mutex_lock(&event->child_mutex); - perf_event_read(event); + perf_event_read(event, false); total += perf_event_count(event); *enabled += event->total_time_enabled + @@ -3773,7 +3798,7 @@ u64 perf_event_read_value(struct perf_ev atomic64_read(&event->child_total_time_running); list_for_each_entry(child, &event->child_list, child_list) { - perf_event_read(child); + perf_event_read(child, false); total += perf_event_count(child); *enabled += child->total_time_enabled; *running += child->total_time_running; @@ -3934,7 +3959,7 @@ static unsigned int perf_poll(struct fil static void _perf_event_reset(struct perf_event *event) { - perf_event_read(event); + perf_event_read(event, false); local64_set(&event->count, 0); perf_event_update_userpage(event); }