profiling: fix streaming of TOPN counters

Message ID 38cb2810-6cc1-8c69-4762-5f182e7342e9@suse.cz
State New
Series profiling: fix streaming of TOPN counters

Commit Message

Martin Liška Feb. 16, 2021, 6:32 p.m. UTC
Hello.

As Honza noticed, using GCC 11 for the training run of Clang leads to a
very slow training run. Apparently, it's caused by a corrupted TOP N profile.
The patch fixes that by preallocating a buffer that is later streamed out.

With the patch, I can profiledbootstrap GCC, and the instrumented clang
finishes its training run in ~6 minutes.
Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

Ready to be installed?
Thanks,
Martin

libgcc/ChangeLog:

	PR gcov-profile/99105
	* libgcov-driver.c (write_top_counters): Rename to ...
	(write_topn_counters): ... this.
	Make a temporary allocation of counters before streaming. That
	helps in a multi-threaded environment.
	(write_one_data): Use renamed function.
	* libgcov-merge.c (__gcov_merge_topn): Add assert about tracked
	number of counters.
---
  libgcc/libgcov-driver.c | 40 ++++++++++++++++++++++++++++++----------
  libgcc/libgcov-merge.c  |  1 +
  2 files changed, 31 insertions(+), 10 deletions(-)

Comments

Richard Biener Feb. 16, 2021, 8:46 p.m. UTC | #1
On February 16, 2021 7:32:16 PM GMT+01:00, "Martin Liška" <mliska@suse.cz> wrote:
>Hello.
>
>As Honza noticed, when using GCC 11 during train run of Clang leads to
>very slow training run. Apparently, it's caused by a corrupted TOP N
>profile.
>The patch fixed that by preallocating a buffer that is latter stream
>out.
>
>With the patch, I can profiledbootstrap GCC and the instrumented clang
>finished
>in ~6 minutes its training run.
>Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>
>Ready to be installed?

Looks like this can only shrink the race window. Don't you need atomic accesses or locking here? Can the number of counters change? IIRC the counters are dynamically allocated. Are they ever reallocated? 

>Thanks,
>Martin
>
>libgcc/ChangeLog:
>
>	PR gcov-profile/99105
>	* libgcov-driver.c (write_top_counters): Rename to ...
>	(write_topn_counters): ... this.
>	Make a temporary allocation of counters before streaming. That
>	helps in multi-threaded environment.
>	(write_one_data): Use renamed function.
>	* libgcov-merge.c (__gcov_merge_topn): Add assert about tracked
>	number of counters.
>---
>  libgcc/libgcov-driver.c | 40 ++++++++++++++++++++++++++++++----------
>  libgcc/libgcov-merge.c  |  1 +
>  2 files changed, 31 insertions(+), 10 deletions(-)
>
>diff --git a/libgcc/libgcov-driver.c b/libgcc/libgcov-driver.c
>index e474e032b54..74c68ab04e2 100644
>--- a/libgcc/libgcov-driver.c
>+++ b/libgcc/libgcov-driver.c
>@@ -337,7 +337,7 @@ read_error:
>  /* Store all TOP N counters where each has a dynamic length.  */
>  
>  static void
>-write_top_counters (const struct gcov_ctr_info *ci_ptr,
>+write_topn_counters (const struct gcov_ctr_info *ci_ptr,
>  		    unsigned t_ix,
>  		    gcov_unsigned_t n_counts)
>  {
>@@ -347,22 +347,42 @@ write_top_counters (const struct gcov_ctr_info *ci_ptr,
>    for (unsigned i = 0; i < counters; i++)
>      pair_total += ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 1];
>  unsigned disk_size = GCOV_TOPN_DISK_COUNTERS * counters + 2 * pair_total;
>-  gcov_write_tag_length (GCOV_TAG_FOR_COUNTER (t_ix),
>-			 GCOV_TAG_COUNTER_LENGTH (disk_size));
>+
>+  /* It can happen in multi-threaded environment that number of counters is smaller.
>+     Because of that we build a buffer which we stream latter in this function.  */
>+  gcov_type *buffer
>+    = (gcov_type *) xmalloc (disk_size * sizeof (gcov_type));
>+  gcov_type *ptr = buffer;
>  
>    for (unsigned i = 0; i < counters; i++)
>      {
>-      gcov_type pair_count = ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 1];
>-      gcov_write_counter (ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i]);
>-      gcov_write_counter (pair_count);
>+      (*ptr++) = ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i];
>+      /* Skip pair count for now.  */
>+      gcov_type *pair_count_ptr = ptr;
>+      ++ptr;
>+
>+      unsigned pair_count = 0;
>      gcov_type start = ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 2];
>       for (struct gcov_kvp *node = (struct gcov_kvp *)(intptr_t)start;
>-	   node != NULL; node = node->next)
>+	   node != NULL; node = node->next, ++pair_count)
>  	{
>-	  gcov_write_counter (node->value);
>-	  gcov_write_counter (node->count);
>+	  (*ptr++) = node->value;
>+	  (*ptr++) = node->count;
>  	}
>+
>+      *pair_count_ptr = pair_count;
>      }
>+
>+  unsigned int length = ptr - buffer;
>+  gcc_assert (length <= disk_size);
>+  gcov_write_tag_length (GCOV_TAG_FOR_COUNTER (t_ix),
>+			 GCOV_TAG_COUNTER_LENGTH (length));
>+
>+  /* FIXME: use a block store  */
>+  for (unsigned i = 0; i < length; i++)
>+    gcov_write_counter (buffer[i]);
>+
>+  free (buffer);
>  }
>  
>  /* Write counters in GI_PTR and the summary in PRG to a gcda file. In
>@@ -425,7 +445,7 @@ write_one_data (const struct gcov_info *gi_ptr,
>  	  n_counts = ci_ptr->num;
>  
>  	  if (t_ix == GCOV_COUNTER_V_TOPN || t_ix == GCOV_COUNTER_V_INDIR)
>-	    write_top_counters (ci_ptr, t_ix, n_counts);
>+	    write_topn_counters (ci_ptr, t_ix, n_counts);
>  	  else
>  	    {
>  	      /* Do not stream when all counters are zero.  */
>diff --git a/libgcc/libgcov-merge.c b/libgcc/libgcov-merge.c
>index 7db188a4f4c..3379b688128 100644
>--- a/libgcc/libgcov-merge.c
>+++ b/libgcc/libgcov-merge.c
>@@ -109,6 +109,7 @@ __gcov_merge_topn (gcov_type *counters, unsigned n_counters)
>      /* First value is number of total executions of the profiler.  */
>        gcov_type all = gcov_get_counter_ignore_scaling (-1);
>        gcov_type n = gcov_get_counter_ignore_scaling (-1);
>+      gcc_assert (n <= GCOV_TOPN_MAXIMUM_TRACKED_VALUES);
>  
>        unsigned full = all < 0;
>        gcov_type *total = &counters[GCOV_TOPN_MEM_COUNTERS * i];
Martin Liška Feb. 17, 2021, 8:36 a.m. UTC | #2
On 2/16/21 9:46 PM, Richard Biener wrote:
> Looks like this can only shrink the race window.

Yes, kind of.

> Don't you need atomic accesses or locking here? Can the number of counters change?

So what we do: we want to stream out N TOPN counters where each looks like this:

{total_count, number_of_tracked_values, pointer_to_first_tracked_value }

The tracked values live in a linked list and we always append to that list.
What happens is that the number of tracked values actually in the linked list
does not match number_of_tracked_values.
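
In libgcov terms the representation is roughly the following (simplified
sketch; the real definitions live in libgcov.h, GCOV_TOPN_MEM_COUNTERS being
the 3 slots per counter used in the diff):

struct gcov_kvp
{
  gcov_type value;        /* tracked value */
  gcov_type count;        /* how many times the value was seen */
  struct gcov_kvp *next;  /* next tracked value, nodes are appended concurrently */
};

/* Per-counter slots in ci_ptr->values:
   values[3 * i + 0]  total number of executions
   values[3 * i + 1]  number_of_tracked_values
   values[3 * i + 2]  pointer to the first gcov_kvp node (stored as gcov_type)  */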

I'll write it even more robust...

> IIRC the counters are dynamically allocated. Are they ever reallocated?

No, they live in a linked list.

Martin
Martin Liška Feb. 17, 2021, 1:16 p.m. UTC | #3
On 2/17/21 9:36 AM, Martin Liška wrote:
> I'll write it even more robust...

This is a more elegant approach that I've just tested on the instrumented clang.

Patch can bootstrap on x86_64-linux-gnu and survives regression tests.

Ready to be installed?
Thanks,
Martin
Richard Biener Feb. 18, 2021, 9:31 a.m. UTC | #4
On Wed, Feb 17, 2021 at 2:16 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 2/17/21 9:36 AM, Martin Liška wrote:
> > I'll write it even more robust...
>
> This is more elegant approach I've just tested on the instrumented clang.
>
> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>
> Ready to be installed?

Isn't it still too complicated?  We're asked to write N counters so why
don't we just write N counters?  Thus, you already do

       for (struct gcov_kvp *node = (struct gcov_kvp *)(intptr_t)start;
-          node != NULL; node = node->next)
+          node != NULL; node = node->next, j++)
        {
          gcov_write_counter (node->value);
          gcov_write_counter (node->count);
+
+         /* Stop when we reach expected number of items.  */
+         if (j + 1 == list_sizes[i])
+           break;
        }

why isn't this then only thing you need (just using pair_count as gathered
previously)?  And just count pair_count to zero as IV?  No need to
do the node != NULL check?

Note architectures with less nice memory ordering guarantees might
eventually see partially updated pointers and counters so I think
we at least want atomic_read ()s of the values with the weakest
consistency possible.  (but that can be done as followup if we agree on that)
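
For illustration, relaxed loads via the GCC built-ins in the streaming loop
could look roughly like this (just a sketch of the idea):

  gcov_type start
    = __atomic_load_n (&ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 2],
                       __ATOMIC_RELAXED);
  for (struct gcov_kvp *node = (struct gcov_kvp *)(intptr_t)start;
       node != NULL;
       node = __atomic_load_n (&node->next, __ATOMIC_RELAXED))
    {
      gcov_type value = __atomic_load_n (&node->value, __ATOMIC_RELAXED);
      gcov_type count = __atomic_load_n (&node->count, __ATOMIC_RELAXED);
      /* ... stream value and count ... */
    }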

Richard.

> Thanks,
> Martin
Martin Liška Feb. 18, 2021, 9:46 a.m. UTC | #5
On 2/18/21 10:31 AM, Richard Biener wrote:
> On Wed, Feb 17, 2021 at 2:16 PM Martin Liška <mliska@suse.cz> wrote:
>>
>> On 2/17/21 9:36 AM, Martin Liška wrote:
>>> I'll write it even more robust...
>>
>> This is more elegant approach I've just tested on the instrumented clang.
>>
>> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>>
>> Ready to be installed?
> 
> Isn't it still too complicated?  We're asked to write N counters so why
> don't we just write N counters?

Well, we are asked to stream N counters. But each has a variable length,
which makes it funny :)

   Thus, you already do
> 
>         for (struct gcov_kvp *node = (struct gcov_kvp *)(intptr_t)start;
> -          node != NULL; node = node->next)
> +          node != NULL; node = node->next, j++)
>          {
>            gcov_write_counter (node->value);
>            gcov_write_counter (node->count);
> +
> +         /* Stop when we reach expected number of items.  */
> +         if (j + 1 == list_sizes[i])
> +           break;
>          }
> 
> why isn't this then only thing you need (just using pair_count as gathered
> previously)?  And just count pair_count to zero as IV?  No need to
> do the node != NULL check?

We can't do that, because that would lead to a corrupted profile.
We can have 2 problematic situations:

1) linked list is shorter, e.g.

10 2 10000 10

Here 2 represents 2 tuples, but we stream only one {10000: 10}. That happens in the instrumented clang.

2) linked list is longer, e.g.
10 1 10000 5 222222 5

Here 1 represents 1 tuple, but we stream 2 tuples {10000: 5} and {222222: 5}.
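
For context, the merge side consumes the stream according to the recorded
length, roughly along these lines (a sketch modelled on __gcov_merge_topn;
see libgcov-merge.c for the exact helper calls):

  gcov_type all = gcov_get_counter_ignore_scaling (-1);
  gcov_type n = gcov_get_counter_ignore_scaling (-1);
  for (gcov_type k = 0; k < n; k++)
    {
      gcov_type value = gcov_get_counter_ignore_scaling (-1);
      gcov_type count = gcov_get_counter_ignore_scaling (-1);
      /* ... merge the {value: count} pair into the in-memory list ... */
    }

so any mismatch between n and the pairs actually present desynchronizes
everything that follows in the stream.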

> 
> Note architectures with less nice memory ordering guarantees might
> eventually see partially updated pointers and counters so I think
> we at least want atomic_read ()s of the values with the weakest
> consistency possible.  (but that can be done as followup if we agree on that)

Can we really see a partially updated pointer?

Thanks,
Martin

> 
> Richard.
> 
>> Thanks,
>> Martin
Richard Biener Feb. 18, 2021, 10:02 a.m. UTC | #6
On Thu, Feb 18, 2021 at 10:46 AM Martin Liška <mliska@suse.cz> wrote:
>
> On 2/18/21 10:31 AM, Richard Biener wrote:
> > On Wed, Feb 17, 2021 at 2:16 PM Martin Liška <mliska@suse.cz> wrote:
> >>
> >> On 2/17/21 9:36 AM, Martin Liška wrote:
> >>> I'll write it even more robust...
> >>
> >> This is more elegant approach I've just tested on the instrumented clang.
> >>
> >> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
> >>
> >> Ready to be installed?
> >
> > Isn't it still too complicated?  We're asked to write N counters so why
> > don't we just write N counters?
>
> Well, we are asked to stream N counters. But each has a variable length,
> which makes it funny :)
>
>    Thus, you already do
> >
> >         for (struct gcov_kvp *node = (struct gcov_kvp *)(intptr_t)start;
> > -          node != NULL; node = node->next)
> > +          node != NULL; node = node->next, j++)
> >          {
> >            gcov_write_counter (node->value);
> >            gcov_write_counter (node->count);
> > +
> > +         /* Stop when we reach expected number of items.  */
> > +         if (j + 1 == list_sizes[i])
> > +           break;
> >          }
> >
> > why isn't this then only thing you need (just using pair_count as gathered
> > previously)?  And just count pair_count to zero as IV?  No need to
> > do the node != NULL check?
>
> We can't do that, because that will lead in a corrupted profile.
> We can have 2 problematic situations:
>
> 1) linked list is shorter, e.g.
>
> 10 2 10000 10
>
> Here 2 represents 2 tuples, but we stream only one {10000: 10}. That happens in the instrumented clang.
>
> 2) linked list is longer, e.g.
> 10 1 10000 5 222222 5
>
> Here 1 represents 1 tuple, be we stream 2 tuples {10000: 10} and {222222: 5}.

I guess the problem is that for whatever reason the stream includes

   unsigned disk_size = GCOV_TOPN_DISK_COUNTERS * counters + 2 * pair_total;
   gcov_write_tag_length (GCOV_TAG_FOR_COUNTER (t_ix),
                         GCOV_TAG_COUNTER_LENGTH (disk_size));

which has the sum of the list lengths, but ...

-      gcov_type pair_count = ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 1];
       gcov_write_counter (ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i]);
-      gcov_write_counter (pair_count);

we stream something like that again.  What are the two different counters?
You save one, but the other you take async again?

The disk format here is sub-optimal at least and optimizing the read side
which has locking in place for making the write-side racy doesn't look good.

> >
> > Note architectures with less nice memory ordering guarantees might
> > eventually see partially updated pointers and counters so I think
> > we at least want atomic_read ()s of the values with the weakest
> > consistency possible.  (but that can be done as followup if we agree on that)
>
> Can we really see a partially updated pointer?

Not on x86 for aligned data.  Note the same applies to the counter values.

We've seen allocating memory from libgcov is problematic, so this is why I'm
asking.

> Thanks,
> Martin
>
> >
> > Richard.
> >
> >> Thanks,
> >> Martin
>
Martin Liška Feb. 18, 2021, 11:35 a.m. UTC | #7
On 2/18/21 11:02 AM, Richard Biener wrote:
> On Thu, Feb 18, 2021 at 10:46 AM Martin Liška <mliska@suse.cz> wrote:
>>
>> On 2/18/21 10:31 AM, Richard Biener wrote:
>>> On Wed, Feb 17, 2021 at 2:16 PM Martin Liška <mliska@suse.cz> wrote:
>>>>
>>>> On 2/17/21 9:36 AM, Martin Liška wrote:
>>>>> I'll write it even more robust...
>>>>
>>>> This is more elegant approach I've just tested on the instrumented clang.
>>>>
>>>> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
>>>>
>>>> Ready to be installed?
>>>
>>> Isn't it still too complicated?  We're asked to write N counters so why
>>> don't we just write N counters?
>>
>> Well, we are asked to stream N counters. But each has a variable length,
>> which makes it funny :)
>>
>>     Thus, you already do
>>>
>>>          for (struct gcov_kvp *node = (struct gcov_kvp *)(intptr_t)start;
>>> -          node != NULL; node = node->next)
>>> +          node != NULL; node = node->next, j++)
>>>           {
>>>             gcov_write_counter (node->value);
>>>             gcov_write_counter (node->count);
>>> +
>>> +         /* Stop when we reach expected number of items.  */
>>> +         if (j + 1 == list_sizes[i])
>>> +           break;
>>>           }
>>>
>>> why isn't this then only thing you need (just using pair_count as gathered
>>> previously)?  And just count pair_count to zero as IV?  No need to
>>> do the node != NULL check?
>>
>> We can't do that, because that will lead in a corrupted profile.
>> We can have 2 problematic situations:
>>
>> 1) linked list is shorter, e.g.
>>
>> 10 2 10000 10
>>
>> Here 2 represents 2 tuples, but we stream only one {10000: 10}. That happens in the instrumented clang.
>>
>> 2) linked list is longer, e.g.
>> 10 1 10000 5 222222 5
>>
>> Here 1 represents 1 tuple, be we stream 2 tuples {10000: 10} and {222222: 5}.
> 
> I guess the problem is that for whatever reason the stream includes
> 
>     unsigned disk_size = GCOV_TOPN_DISK_COUNTERS * counters + 2 * pair_total;
>     gcov_write_tag_length (GCOV_TAG_FOR_COUNTER (t_ix),
>                           GCOV_TAG_COUNTER_LENGTH (disk_size));
> 
> which has the sum of the list lengths, but ...

Yes, GCOV tags always contain a size and that's very handy for verifying
that a .gcda file is in a consistent state.

> 
> -      gcov_type pair_count = ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 1];
>         gcov_write_counter (ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i]);
> -      gcov_write_counter (pair_count);
> 
> we stream something like that again.  What are the two different counters?
> You save one, but the other you take async again?

ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 0] == total_number_of_executions
ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 1] == length of the linked list
ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 2] == pointer to the first item
in a linked list

> 
> The disk format here is sub-optimal at least and optimizing the read side
> which has locking in place for making the write-side racy doesn't look good.

Yes, we slightly waste some space. Per-file locking should not block
parallel streaming of profiles.

> 
>>>
>>> Note architectures with less nice memory ordering guarantees might
>>> eventually see partially updated pointers and counters so I think
>>> we at least want atomic_read ()s of the values with the weakest
>>> consistency possible.  (but that can be done as followup if we agree on that)
>>
>> Can we really see a partially updated pointer?
> 
> Not on x86 for aligned data.  Note the same applies to the counter values.
> 
> We've seen allocating memory from libgcov is problematic, so this is why I'm
> asking.

Sure.

Martin

> 
>> Thanks,
>> Martin
>>
>>>
>>> Richard.
>>>
>>>> Thanks,
>>>> Martin
>>
Richard Biener Feb. 18, 2021, 3:19 p.m. UTC | #8
On Thu, Feb 18, 2021 at 12:35 PM Martin Liška <mliska@suse.cz> wrote:
>
> On 2/18/21 11:02 AM, Richard Biener wrote:
> > On Thu, Feb 18, 2021 at 10:46 AM Martin Liška <mliska@suse.cz> wrote:
> >>
> >> On 2/18/21 10:31 AM, Richard Biener wrote:
> >>> On Wed, Feb 17, 2021 at 2:16 PM Martin Liška <mliska@suse.cz> wrote:
> >>>>
> >>>> On 2/17/21 9:36 AM, Martin Liška wrote:
> >>>>> I'll write it even more robust...
> >>>>
> >>>> This is more elegant approach I've just tested on the instrumented clang.
> >>>>
> >>>> Patch can bootstrap on x86_64-linux-gnu and survives regression tests.
> >>>>
> >>>> Ready to be installed?
> >>>
> >>> Isn't it still too complicated?  We're asked to write N counters so why
> >>> don't we just write N counters?
> >>
> >> Well, we are asked to stream N counters. But each has a variable length,
> >> which makes it funny :)
> >>
> >>     Thus, you already do
> >>>
> >>>          for (struct gcov_kvp *node = (struct gcov_kvp *)(intptr_t)start;
> >>> -          node != NULL; node = node->next)
> >>> +          node != NULL; node = node->next, j++)
> >>>           {
> >>>             gcov_write_counter (node->value);
> >>>             gcov_write_counter (node->count);
> >>> +
> >>> +         /* Stop when we reach expected number of items.  */
> >>> +         if (j + 1 == list_sizes[i])
> >>> +           break;
> >>>           }
> >>>
> >>> why isn't this then only thing you need (just using pair_count as gathered
> >>> previously)?  And just count pair_count to zero as IV?  No need to
> >>> do the node != NULL check?
> >>
> >> We can't do that, because that will lead in a corrupted profile.
> >> We can have 2 problematic situations:
> >>
> >> 1) linked list is shorter, e.g.
> >>
> >> 10 2 10000 10
> >>
> >> Here 2 represents 2 tuples, but we stream only one {10000: 10}. That happens in the instrumented clang.
> >>
> >> 2) linked list is longer, e.g.
> >> 10 1 10000 5 222222 5
> >>
> >> Here 1 represents 1 tuple, be we stream 2 tuples {10000: 10} and {222222: 5}.
> >
> > I guess the problem is that for whatever reason the stream includes
> >
> >     unsigned disk_size = GCOV_TOPN_DISK_COUNTERS * counters + 2 * pair_total;
> >     gcov_write_tag_length (GCOV_TAG_FOR_COUNTER (t_ix),
> >                           GCOV_TAG_COUNTER_LENGTH (disk_size));
> >
> > which has the sum of the list lengths, but ...
>
> Yes, GCOV tags always contain size and it's very handy in verification
> that a .gcda is in a consistent state.

So is the observed issue that the above written length is too small because
we end up streaming more counters?  As said, I think including 2 * pair_total
above only causes complication ... but I guess if that's how gcov records are
structured then so be it and your patch storing the individual lengths away
first is necessary (including obtaining temporary storage, and no, the stack
is not a good place here I think).

> >
> > -      gcov_type pair_count = ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 1];
> >         gcov_write_counter (ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i]);
> > -      gcov_write_counter (pair_count);
> >
> > we stream something like that again.  What are the two different counters?
> > You save one, but the other you take async again?
>
> ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 0] == total_number_of_executions
> ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 1] == length of the linked list
> ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 2] == pointer to the first item
> in a linked list
>
> >
> > The disk format here is sub-optimal at least and optimizing the read side
> > which has locking in place for making the write-side racy doesn't look good.
>
> Yes, we slighly miss some space. Per-file locking should not block streaming
> parallel of profiles.
>
> >
> >>>
> >>> Note architectures with less nice memory ordering guarantees might
> >>> eventually see partially updated pointers and counters so I think
> >>> we at least want atomic_read ()s of the values with the weakest
> >>> consistency possible.  (but that can be done as followup if we agree on that)
> >>
> >> Can we really see a partially updated pointer?
> >
> > Not on x86 for aligned data.  Note the same applies to the counter values.
> >
> > We've seen allocating memory from libgcov is problematic, so this is why I'm
> > asking.
>
> Sure.
>
> Martin
>
> >
> >> Thanks,
> >> Martin
> >>
> >>>
> >>> Richard.
> >>>
> >>>> Thanks,
> >>>> Martin
> >>
>
Jan Hubicka March 3, 2021, 11:26 a.m. UTC | #9
> 
> libgcc/ChangeLog:
> 
> 	PR gcov-profile/99105
> 	* libgcov-driver.c (write_top_counters): Rename to ...
> 	(write_topn_counters): ... this.
> 	(write_one_data): Pre-allocate buffer for number of items
> 	in the corresponding linked lists.
> 	* libgcov-merge.c (__gcov_merge_topn): Use renamed function.
> 
> gcc/testsuite/ChangeLog:
> 
> 	PR gcov-profile/99105
> 	* gcc.dg/tree-prof/indir-call-prof-malloc.c: Use profile
> 	correction as the wrapped malloc is called one more time
> 	from libgcov.

>    for (unsigned i = 0; i < counters; i++)
>      {
> -      gcov_type pair_count = ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 1];
>        gcov_write_counter (ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i]);
> -      gcov_write_counter (pair_count);
> +      gcov_write_counter (list_sizes[i]);
>        gcov_type start = ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 2];
> +
> +      unsigned j = 0;
>        for (struct gcov_kvp *node = (struct gcov_kvp *)(intptr_t)start;
> -	   node != NULL; node = node->next)
> +	   node != NULL; node = node->next, j++)
>  	{
>  	  gcov_write_counter (node->value);
>  	  gcov_write_counter (node->count);
> +
> +	  /* Stop when we reach expected number of items.  */
> +	  if (j + 1 == list_sizes[i])
> +	    break;

Since you counted the number of entries earlier, I would expect the loop to
always terminate here and thus the node != NULL condition in the for loop
above to be unnecessary.
>  	}
>      }
> +
> +  free (list_sizes);

We already have our own mmap allocator, so I wonder if we don't want to
avoid the malloc/free pair here.  The counters are per-function, right?  I
wonder if they happen to be large on some bigger project, but it may
reduce the risk of the user messing this up with their own malloc/free
implementation if we used alloca for counts of reasonable size.
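
A sketch of that idea (the threshold below is made up just to show the shape,
the real cut-off would need to be chosen):

  unsigned *list_sizes;
  int heap_allocated = counters > 4096;
  if (heap_allocated)
    list_sizes = (unsigned *) xmalloc (counters * sizeof (unsigned));
  else
    list_sizes = (unsigned *) __builtin_alloca (counters * sizeof (unsigned));

  /* ... count the list lengths, then stream ... */

  if (heap_allocated)
    free (list_sizes);
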
>  }
>  
>  /* Write counters in GI_PTR and the summary in PRG to a gcda file. In
> @@ -425,7 +446,7 @@ write_one_data (const struct gcov_info *gi_ptr,
>  	  n_counts = ci_ptr->num;
>  
>  	  if (t_ix == GCOV_COUNTER_V_TOPN || t_ix == GCOV_COUNTER_V_INDIR)
> -	    write_top_counters (ci_ptr, t_ix, n_counts);
> +	    write_topn_counters (ci_ptr, t_ix, n_counts);
>  	  else
>  	    {
>  	      /* Do not stream when all counters are zero.  */
> diff --git a/libgcc/libgcov-merge.c b/libgcc/libgcov-merge.c
> index 7db188a4f4c..3379b688128 100644
> --- a/libgcc/libgcov-merge.c
> +++ b/libgcc/libgcov-merge.c
> @@ -109,6 +109,7 @@ __gcov_merge_topn (gcov_type *counters, unsigned n_counters)
>        /* First value is number of total executions of the profiler.  */
>        gcov_type all = gcov_get_counter_ignore_scaling (-1);
>        gcov_type n = gcov_get_counter_ignore_scaling (-1);
> +      gcc_assert (n <= GCOV_TOPN_MAXIMUM_TRACKED_VALUES);

I think in the runtime we do not want to have asserts checking
implementation correctness since it bloats it up.  So I would leave it
out.

I wonder if we can have some testcase for parallel updating/streaming in the
testsuite?

Otherwise the patch looks good to me.
Honza
>  
>        unsigned full = all < 0;
>        gcov_type *total = &counters[GCOV_TOPN_MEM_COUNTERS * i];
> -- 
> 2.30.0
>
Martin Liška March 3, 2021, 2:51 p.m. UTC | #10
On 3/3/21 12:26 PM, Jan Hubicka wrote:
>>
>> libgcc/ChangeLog:
>>
>> 	PR gcov-profile/99105
>> 	* libgcov-driver.c (write_top_counters): Rename to ...
>> 	(write_topn_counters): ... this.
>> 	(write_one_data): Pre-allocate buffer for number of items
>> 	in the corresponding linked lists.
>> 	* libgcov-merge.c (__gcov_merge_topn): Use renamed function.
>>
>> gcc/testsuite/ChangeLog:
>>
>> 	PR gcov-profile/99105
>> 	* gcc.dg/tree-prof/indir-call-prof-malloc.c: Use profile
>> 	correction as the wrapped malloc is called one more time
>> 	from libgcov.
> 
>>     for (unsigned i = 0; i < counters; i++)
>>       {
>> -      gcov_type pair_count = ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 1];
>>         gcov_write_counter (ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i]);
>> -      gcov_write_counter (pair_count);
>> +      gcov_write_counter (list_sizes[i]);
>>         gcov_type start = ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 2];
>> +
>> +      unsigned j = 0;
>>         for (struct gcov_kvp *node = (struct gcov_kvp *)(intptr_t)start;
>> -	   node != NULL; node = node->next)
>> +	   node != NULL; node = node->next, j++)
>>   	{
>>   	  gcov_write_counter (node->value);
>>   	  gcov_write_counter (node->count);
>> +
>> +	  /* Stop when we reach expected number of items.  */
>> +	  if (j + 1 == list_sizes[i])
>> +	    break;
> 
> Since you counted number of entries earlier, I would expect loop to
> always terminate here and thus the node != NULL condition in for loop
> above to be unnecesary.

Yes, fixed that.

>>   	}
>>       }
>> +
>> +  free (list_sizes);
> 
> We already have our own mmap allocator, so I wonder if we don't want to
> avoid malloc/free pair here.  The counters are per-function, right?  I
> wonder if they happen to be large on some bigger project, but it may
> reduct risk of user messing this up with his own malloc/free
> implementation if we used alloca for counts of reasonable size.

Good idea, I've also implemented that.

>>   }
>>   
>>   /* Write counters in GI_PTR and the summary in PRG to a gcda file. In
>> @@ -425,7 +446,7 @@ write_one_data (const struct gcov_info *gi_ptr,
>>   	  n_counts = ci_ptr->num;
>>   
>>   	  if (t_ix == GCOV_COUNTER_V_TOPN || t_ix == GCOV_COUNTER_V_INDIR)
>> -	    write_top_counters (ci_ptr, t_ix, n_counts);
>> +	    write_topn_counters (ci_ptr, t_ix, n_counts);
>>   	  else
>>   	    {
>>   	      /* Do not stream when all counters are zero.  */
>> diff --git a/libgcc/libgcov-merge.c b/libgcc/libgcov-merge.c
>> index 7db188a4f4c..3379b688128 100644
>> --- a/libgcc/libgcov-merge.c
>> +++ b/libgcc/libgcov-merge.c
>> @@ -109,6 +109,7 @@ __gcov_merge_topn (gcov_type *counters, unsigned n_counters)
>>         /* First value is number of total executions of the profiler.  */
>>         gcov_type all = gcov_get_counter_ignore_scaling (-1);
>>         gcov_type n = gcov_get_counter_ignore_scaling (-1);
>> +      gcc_assert (n <= GCOV_TOPN_MAXIMUM_TRACKED_VALUES);
> 
> I think in the runtime we do not want to have asserts checking
> implementation correctness since it bloats it up.  So I would leave it
> out.

Makes sense.

> 
> I wonder if we can have some testcase for parallel updating/streaming in
> testsuite?

Well, I tried something like this:

#include <pthread.h>
#include "gcov.h"

#define LOOPS 1000000
#define THREADS 8

void *
worker (void *data)
{
   char memory[1024];
   for (unsigned i = 0; i < LOOPS; i++)
   {
       asm volatile("" ::: "memory");
       __builtin_memset (memory, 0, i % 15);
   }

   return NULL;
}

int main()
{
   pthread_t threads[THREADS];

   for (unsigned i = 0; i < THREADS; i++)
     if (pthread_create (&threads[i], NULL, worker, NULL))
       return 0;

   /* Dump the profile while the worker threads are still running so the
      TOPN counters can be updated concurrently with the streaming.  */
   __gcov_dump ();

   return 0;
}

But it's quite difficult to make a setup where we hit the point at which we append to the linked list.

I've just tested an instrumented clang:
$ make -j128 check-clang
...
Testing Time: 365.93s
   Unsupported      :   787
   Passed           : 26513
   Expectedly Failed:    25
[100%] Built target check-clang

May I install the patch?
Thanks,
Martin

> 
> Otherwise the patch looks good to me.
> Honza
>>   
>>         unsigned full = all < 0;
>>         gcov_type *total = &counters[GCOV_TOPN_MEM_COUNTERS * i];
>> -- 
>> 2.30.0
>>
>
Jan Hubicka March 4, 2021, 12:50 p.m. UTC | #11
>  .../gcc.dg/tree-prof/indir-call-prof-malloc.c |  2 +-
>  gcc/testsuite/gcc.dg/tree-prof/pr97461.c      |  2 +-
>  libgcc/libgcov-driver.c                       | 56 ++++++++++++++++---
>  3 files changed, 50 insertions(+), 10 deletions(-)
> 
> diff --git a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-malloc.c b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-malloc.c
> index 454e224c95f..7bda4aedfc8 100644
> --- a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-malloc.c
> +++ b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-malloc.c
> @@ -1,4 +1,4 @@
> -/* { dg-options "-O2 -ldl" } */
> +/* { dg-options "-O2 -ldl -fprofile-correction" } */
>  
>  #define _GNU_SOURCE
>  #include <stdio.h>
> diff --git a/gcc/testsuite/gcc.dg/tree-prof/pr97461.c b/gcc/testsuite/gcc.dg/tree-prof/pr97461.c
> index 213fac9af04..f684be4d80f 100644
> --- a/gcc/testsuite/gcc.dg/tree-prof/pr97461.c
> +++ b/gcc/testsuite/gcc.dg/tree-prof/pr97461.c
> @@ -1,5 +1,5 @@
>  /* PR gcov-profile/97461 */
> -/* { dg-options "-O2 -ldl" } */
> +/* { dg-options "-O2 -ldl -fprofile-correction" } */
>  
>  #define _GNU_SOURCE
>  
> diff --git a/libgcc/libgcov-driver.c b/libgcc/libgcov-driver.c
> index 91462350132..d2e60a5a6df 100644
> --- a/libgcc/libgcov-driver.c
> +++ b/libgcc/libgcov-driver.c
> @@ -42,6 +42,10 @@ void __gcov_init (struct gcov_info *p __attribute__ ((unused))) {}
>  #include <sys/stat.h>
>  #endif
>  
> +#if HAVE_SYS_MMAN_H
> +#include <sys/mman.h>
> +#endif
> +
>  #ifdef L_gcov
>  
>  /* A utility function for outputting errors.  */
> @@ -334,30 +338,66 @@ read_error:
>    return -1;
>  }
>  
> +#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
> +
>  /* Store all TOP N counters where each has a dynamic length.  */
>  
>  static void
> -write_top_counters (const struct gcov_ctr_info *ci_ptr,
> -		    unsigned t_ix,
> -		    gcov_unsigned_t n_counts)
> +write_topn_counters (const struct gcov_ctr_info *ci_ptr,
> +		     unsigned t_ix,
> +		     gcov_unsigned_t n_counts)
>  {
>    unsigned counters = n_counts / GCOV_TOPN_MEM_COUNTERS;
>    gcc_assert (n_counts % GCOV_TOPN_MEM_COUNTERS == 0);
> +
> +  /* It can happen in a multi-threaded environment that number of counters is
> +     different from the size of the corresponding linked lists.  */
> +#define LIST_SIZE_MIN_LENGTH 4 * 1024
> +
> +  static unsigned *list_sizes = NULL;
> +  static unsigned list_size_length = 0;
> +
> +  if (list_sizes == NULL || counters > list_size_length)
> +    {
> +      list_size_length = MAX (LIST_SIZE_MIN_LENGTH, counters);
> +#if HAVE_SYS_MMAN_H
> +      list_sizes = (unsigned *)mmap (NULL, list_size_length * sizeof (unsigned),
> +				     PROT_READ | PROT_WRITE,
> +				     MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
> +#endif
> +
> +      /* Malloc fallback.  */
> +      if (list_sizes == NULL)
> +	list_sizes = (unsigned *)xmalloc (list_size_length * sizeof (unsigned));
I see, you switched to allocating the buffer permanently.  This also works.
Given that you do not deallocate the memory, I think you want to
allocate list_size_length * 2 so we do not end up taking O(mn) memory,
where n is the largest number of counters in a file and m is the number of
sections with counters.

Can you please unify the mmap code in the list allocation and here, so there
is only one place in libgcov that Windows folks will need to update?

Otherwise the patch looks OK.
Honza
Martin Liška March 4, 2021, 3:20 p.m. UTC | #12
On 3/4/21 1:50 PM, Jan Hubicka wrote:
>>   .../gcc.dg/tree-prof/indir-call-prof-malloc.c |  2 +-
>>   gcc/testsuite/gcc.dg/tree-prof/pr97461.c      |  2 +-
>>   libgcc/libgcov-driver.c                       | 56 ++++++++++++++++---
>>   3 files changed, 50 insertions(+), 10 deletions(-)
>>
>> diff --git a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-malloc.c b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-malloc.c
>> index 454e224c95f..7bda4aedfc8 100644
>> --- a/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-malloc.c
>> +++ b/gcc/testsuite/gcc.dg/tree-prof/indir-call-prof-malloc.c
>> @@ -1,4 +1,4 @@
>> -/* { dg-options "-O2 -ldl" } */
>> +/* { dg-options "-O2 -ldl -fprofile-correction" } */
>>   
>>   #define _GNU_SOURCE
>>   #include <stdio.h>
>> diff --git a/gcc/testsuite/gcc.dg/tree-prof/pr97461.c b/gcc/testsuite/gcc.dg/tree-prof/pr97461.c
>> index 213fac9af04..f684be4d80f 100644
>> --- a/gcc/testsuite/gcc.dg/tree-prof/pr97461.c
>> +++ b/gcc/testsuite/gcc.dg/tree-prof/pr97461.c
>> @@ -1,5 +1,5 @@
>>   /* PR gcov-profile/97461 */
>> -/* { dg-options "-O2 -ldl" } */
>> +/* { dg-options "-O2 -ldl -fprofile-correction" } */
>>   
>>   #define _GNU_SOURCE
>>   
>> diff --git a/libgcc/libgcov-driver.c b/libgcc/libgcov-driver.c
>> index 91462350132..d2e60a5a6df 100644
>> --- a/libgcc/libgcov-driver.c
>> +++ b/libgcc/libgcov-driver.c
>> @@ -42,6 +42,10 @@ void __gcov_init (struct gcov_info *p __attribute__ ((unused))) {}
>>   #include <sys/stat.h>
>>   #endif
>>   
>> +#if HAVE_SYS_MMAN_H
>> +#include <sys/mman.h>
>> +#endif
>> +
>>   #ifdef L_gcov
>>   
>>   /* A utility function for outputting errors.  */
>> @@ -334,30 +338,66 @@ read_error:
>>     return -1;
>>   }
>>   
>> +#define MAX(X,Y) ((X) > (Y) ? (X) : (Y))
>> +
>>   /* Store all TOP N counters where each has a dynamic length.  */
>>   
>>   static void
>> -write_top_counters (const struct gcov_ctr_info *ci_ptr,
>> -		    unsigned t_ix,
>> -		    gcov_unsigned_t n_counts)
>> +write_topn_counters (const struct gcov_ctr_info *ci_ptr,
>> +		     unsigned t_ix,
>> +		     gcov_unsigned_t n_counts)
>>   {
>>     unsigned counters = n_counts / GCOV_TOPN_MEM_COUNTERS;
>>     gcc_assert (n_counts % GCOV_TOPN_MEM_COUNTERS == 0);
>> +
>> +  /* It can happen in a multi-threaded environment that number of counters is
>> +     different from the size of the corresponding linked lists.  */
>> +#define LIST_SIZE_MIN_LENGTH 4 * 1024
>> +
>> +  static unsigned *list_sizes = NULL;
>> +  static unsigned list_size_length = 0;
>> +
>> +  if (list_sizes == NULL || counters > list_size_length)
>> +    {
>> +      list_size_length = MAX (LIST_SIZE_MIN_LENGTH, counters);
>> +#if HAVE_SYS_MMAN_H
>> +      list_sizes = (unsigned *)mmap (NULL, list_size_length * sizeof (unsigned),
>> +				     PROT_READ | PROT_WRITE,
>> +				     MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
>> +#endif
>> +
>> +      /* Malloc fallback.  */
>> +      if (list_sizes == NULL)
>> +	list_sizes = (unsigned *)xmalloc (list_size_length * sizeof (unsigned));
> I see, you switched to allocating buffer permanently. This also works.
> Given that you do not deallocate the memory, I think you want to
> allocate list_size_length*2 so we do not end up taking O(mn) memory
> where n is the largest number of counters in a file and m is number of
> sections with counters.

Yeah, I used a minimal amount of memory there that should fit all normal functions.
I updated that to 2 * counters.
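
So the allocation in the patch becomes something like:

  list_size_length = MAX (LIST_SIZE_MIN_LENGTH, 2 * counters);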

> 
> Can you please unify the mmap code in list allocation and here, so there
> is only one place in libgcov Windows folks will need to update?
> 
> Otherwise the patch looks OK.

Thanks, pushed.
Martin

> Honza
>

Patch

diff --git a/libgcc/libgcov-driver.c b/libgcc/libgcov-driver.c
index e474e032b54..74c68ab04e2 100644
--- a/libgcc/libgcov-driver.c
+++ b/libgcc/libgcov-driver.c
@@ -337,7 +337,7 @@  read_error:
  /* Store all TOP N counters where each has a dynamic length.  */
  
  static void
-write_top_counters (const struct gcov_ctr_info *ci_ptr,
+write_topn_counters (const struct gcov_ctr_info *ci_ptr,
  		    unsigned t_ix,
  		    gcov_unsigned_t n_counts)
  {
@@ -347,22 +347,42 @@  write_top_counters (const struct gcov_ctr_info *ci_ptr,
    for (unsigned i = 0; i < counters; i++)
      pair_total += ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 1];
    unsigned disk_size = GCOV_TOPN_DISK_COUNTERS * counters + 2 * pair_total;
-  gcov_write_tag_length (GCOV_TAG_FOR_COUNTER (t_ix),
-			 GCOV_TAG_COUNTER_LENGTH (disk_size));
+
+  /* It can happen in multi-threaded environment that number of counters is smaller.
+     Because of that we build a buffer which we stream latter in this function.  */
+  gcov_type *buffer
+    = (gcov_type *) xmalloc (disk_size * sizeof (gcov_type));
+  gcov_type *ptr = buffer;
  
    for (unsigned i = 0; i < counters; i++)
      {
-      gcov_type pair_count = ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 1];
-      gcov_write_counter (ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i]);
-      gcov_write_counter (pair_count);
+      (*ptr++) = ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i];
+      /* Skip pair count for now.  */
+      gcov_type *pair_count_ptr = ptr;
+      ++ptr;
+
+      unsigned pair_count = 0;
        gcov_type start = ci_ptr->values[GCOV_TOPN_MEM_COUNTERS * i + 2];
        for (struct gcov_kvp *node = (struct gcov_kvp *)(intptr_t)start;
-	   node != NULL; node = node->next)
+	   node != NULL; node = node->next, ++pair_count)
  	{
-	  gcov_write_counter (node->value);
-	  gcov_write_counter (node->count);
+	  (*ptr++) = node->value;
+	  (*ptr++) = node->count;
  	}
+
+      *pair_count_ptr = pair_count;
      }
+
+  unsigned int length = ptr - buffer;
+  gcc_assert (length <= disk_size);
+  gcov_write_tag_length (GCOV_TAG_FOR_COUNTER (t_ix),
+			 GCOV_TAG_COUNTER_LENGTH (length));
+
+  /* FIXME: use a block store  */
+  for (unsigned i = 0; i < length; i++)
+    gcov_write_counter (buffer[i]);
+
+  free (buffer);
  }
  
  /* Write counters in GI_PTR and the summary in PRG to a gcda file. In
@@ -425,7 +445,7 @@  write_one_data (const struct gcov_info *gi_ptr,
  	  n_counts = ci_ptr->num;
  
  	  if (t_ix == GCOV_COUNTER_V_TOPN || t_ix == GCOV_COUNTER_V_INDIR)
-	    write_top_counters (ci_ptr, t_ix, n_counts);
+	    write_topn_counters (ci_ptr, t_ix, n_counts);
  	  else
  	    {
  	      /* Do not stream when all counters are zero.  */
diff --git a/libgcc/libgcov-merge.c b/libgcc/libgcov-merge.c
index 7db188a4f4c..3379b688128 100644
--- a/libgcc/libgcov-merge.c
+++ b/libgcc/libgcov-merge.c
@@ -109,6 +109,7 @@  __gcov_merge_topn (gcov_type *counters, unsigned n_counters)
        /* First value is number of total executions of the profiler.  */
        gcov_type all = gcov_get_counter_ignore_scaling (-1);
        gcov_type n = gcov_get_counter_ignore_scaling (-1);
+      gcc_assert (n <= GCOV_TOPN_MAXIMUM_TRACKED_VALUES);
  
        unsigned full = all < 0;
        gcov_type *total = &counters[GCOV_TOPN_MEM_COUNTERS * i];