diff mbox series

Come up with -flto=auto option.

Message ID 95f20c0a-81a3-2319-326b-3c5baf71e2d1@suse.cz
State New
Headers show
Series Come up with -flto=auto option. | expand

Commit Message

Martin Liška July 23, 2019, 8:30 a.m. UTC
Hi.

As we as openSUSE started using -flto, I see it very handy to have
an option value that will automatically detect number of cores
that can be used for parallel LTRANS phase.

Thoughts?

gcc/ChangeLog:

2019-07-23  Martin Liska  <mliska@suse.cz>

	* doc/invoke.texi: Document the new option value.
	* lto-wrapper.c (cpuset_popcount): New function
	is a copy of libgomp/config/linux/proc.c.
	(init_num_threads): Likewise.
	(run_gcc): Support -flto=auto.
---
 gcc/doc/invoke.texi |   3 ++
 gcc/lto-wrapper.c   | 124 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 126 insertions(+), 1 deletion(-)

Comments

Jan Hubicka July 23, 2019, 9:20 a.m. UTC | #1
> Hi.
> 
> As we as openSUSE started using -flto, I see it very handy to have
> an option value that will automatically detect number of cores
> that can be used for parallel LTRANS phase.
> 
> Thoughts?
Hi,
great you found time to make this. It should become the default for
-flto IMO.

I think we also should auto-detect the case where jobserver is available
and in that case let make to connect to the outer jobserver.  (We should
also really convince make developers to invent way to connect to it w/o
the extra + role)

Honza
> 
> gcc/ChangeLog:
> 
> 2019-07-23  Martin Liska  <mliska@suse.cz>
> 
> 	* doc/invoke.texi: Document the new option value.
> 	* lto-wrapper.c (cpuset_popcount): New function
> 	is a copy of libgomp/config/linux/proc.c.
> 	(init_num_threads): Likewise.
> 	(run_gcc): Support -flto=auto.
> ---
>  gcc/doc/invoke.texi |   3 ++
>  gcc/lto-wrapper.c   | 124 +++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 126 insertions(+), 1 deletion(-)
> 
> 

> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 77a2d561e38..58656fbe1e1 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -10398,6 +10398,9 @@ parallel jobs by utilizing an installed @command{make} program.  The
>  environment variable @env{MAKE} may be used to override the program
>  used.  The default value for @var{n} is 1.
>  
> +You can specify @var{auto} to automatically detect number of
> +cores that will determine the number of parallel jobs.
> +
>  You can also specify @option{-flto=jobserver} to use GNU make's
>  job server mode to determine the number of parallel jobs. This
>  is useful when the Makefile calling GCC is already executing in parallel.
> diff --git a/gcc/lto-wrapper.c b/gcc/lto-wrapper.c
> index 946897726d0..5451285f896 100644
> --- a/gcc/lto-wrapper.c
> +++ b/gcc/lto-wrapper.c
> @@ -1110,6 +1110,110 @@ cmp_priority (const void *a, const void *b)
>    return *((const int *)b)-*((const int *)a);
>  }
>  
> +/* Number of CPUs that can be used for parallel LTRANS phase.  */
> +
> +static unsigned long nthreads_var = 0;
> +
> +#ifdef HAVE_PTHREAD_AFFINITY_NP
> +unsigned long cpuset_size;
> +static unsigned long get_cpuset_size;
> +cpu_set_t *cpusetp;
> +
> +unsigned long
> +static cpuset_popcount (unsigned long cpusetsize, cpu_set_t *cpusetp)
> +{
> +#ifdef CPU_COUNT_S
> +  /* glibc 2.7 and above provide a macro for this.  */
> +  return CPU_COUNT_S (cpusetsize, cpusetp);
> +#else
> +#ifdef CPU_COUNT
> +  if (cpusetsize == sizeof (cpu_set_t))
> +    /* glibc 2.6 and above provide a macro for this.  */
> +    return CPU_COUNT (cpusetp);
> +#endif
> +  size_t i;
> +  unsigned long ret = 0;
> +  STATIC_ASSERT (sizeof (cpusetp->__bits[0]) == sizeof (unsigned long int));
> +  for (i = 0; i < cpusetsize / sizeof (cpusetp->__bits[0]); i++)
> +    {
> +      unsigned long int mask = cpusetp->__bits[i];
> +      if (mask == 0)
> +	continue;
> +      ret += __builtin_popcountl (mask);
> +    }
> +  return ret;
> +#endif
> +}
> +#endif
> +
> +/* At startup, determine the default number of threads.  It would seem
> +   this should be related to the number of cpus online.  */
> +
> +static void
> +init_num_threads (void)
> +{
> +#ifdef HAVE_PTHREAD_AFFINITY_NP
> +#if defined (_SC_NPROCESSORS_CONF) && defined (CPU_ALLOC_SIZE)
> +  cpuset_size = sysconf (_SC_NPROCESSORS_CONF);
> +  cpuset_size = CPU_ALLOC_SIZE (cpuset_size);
> +#else
> +  cpuset_size = sizeof (cpu_set_t);
> +#endif
> +
> +  cpusetp = (cpu_set_t *) xmalloc (gomp_cpuset_size);
> +  do
> +    {
> +      int ret = pthread_getaffinity_np (pthread_self (), gomp_cpuset_size,
> +					cpusetp);
> +      if (ret == 0)
> +	{
> +	  /* Count only the CPUs this process can use.  */
> +	  nthreads_var = cpuset_popcount (cpuset_size, cpusetp);
> +	  if (nthreads_var == 0)
> +	    break;
> +	  get_cpuset_size = cpuset_size;
> +#ifdef CPU_ALLOC_SIZE
> +	  unsigned long i;
> +	  for (i = cpuset_size * 8; i; i--)
> +	    if (CPU_ISSET_S (i - 1, cpuset_size, cpusetp))
> +	      break;
> +	  cpuset_size = CPU_ALLOC_SIZE (i);
> +#endif
> +	  return;
> +	}
> +      if (ret != EINVAL)
> +	break;
> +#ifdef CPU_ALLOC_SIZE
> +      if (cpuset_size < sizeof (cpu_set_t))
> +	cpuset_size = sizeof (cpu_set_t);
> +      else
> +	cpuset_size = cpuset_size * 2;
> +      if (cpuset_size < 8 * sizeof (cpu_set_t))
> +	cpusetp
> +	  = (cpu_set_t *) realloc (cpusetp, cpuset_size);
> +      else
> +	{
> +	  /* Avoid fatal if too large memory allocation would be
> +	     requested, e.g. kernel returning EINVAL all the time.  */
> +	  void *p = realloc (cpusetp, cpuset_size);
> +	  if (p == NULL)
> +	    break;
> +	  cpusetp = (cpu_set_t *) p;
> +	}
> +#else
> +      break;
> +#endif
> +    }
> +  while (1);
> +  cpuset_size = 0;
> +  nthreads_var = 1;
> +  free (cpusetp);
> +  cpusetp = NULL;
> +#endif
> +#ifdef _SC_NPROCESSORS_ONLN
> +  nthreads_var = sysconf (_SC_NPROCESSORS_ONLN);
> +#endif
> +}
>  
>  /* Execute gcc. ARGC is the number of arguments. ARGV contains the arguments. */
>  
> @@ -1124,6 +1228,7 @@ run_gcc (unsigned argc, char *argv[])
>    const char *collect_gcc, *collect_gcc_options;
>    int parallel = 0;
>    int jobserver = 0;
> +  int auto_parallel = 0;
>    bool no_partition = false;
>    struct cl_decoded_option *fdecoded_options = NULL;
>    struct cl_decoded_option *offload_fdecoded_options = NULL;
> @@ -1251,6 +1356,11 @@ run_gcc (unsigned argc, char *argv[])
>  	      jobserver = 1;
>  	      parallel = 1;
>  	    }
> +	  else if (strcmp (option->arg, "auto") == 0)
> +	    {
> +	      auto_parallel = 1;
> +	      parallel = 1;
> +	    }
>  	  else
>  	    {
>  	      parallel = atoi (option->arg);
> @@ -1291,6 +1401,7 @@ run_gcc (unsigned argc, char *argv[])
>      {
>        lto_mode = LTO_MODE_LTO;
>        jobserver = 0;
> +      auto_parallel = 0;
>        parallel = 0;
>      }
>  
> @@ -1485,6 +1596,16 @@ cont1:
>  
>        if (jobserver)
>  	obstack_ptr_grow (&argv_obstack, xstrdup ("-fwpa=jobserver"));
> +      else if (auto_parallel)
> +	{
> +	  char buf[256];
> +	  init_num_threads ();
> +	  if (verbose)
> +	    fprintf (stderr, "LTO parallelism level set to %ld\n",
> +		     nthreads_var);
> +	  sprintf (buf, "-fwpa=%ld", nthreads_var);
> +	  obstack_ptr_grow (&argv_obstack, xstrdup (buf));
> +	}
>        else if (parallel > 1)
>  	{
>  	  char buf[256];
> @@ -1692,7 +1813,8 @@ cont:
>  	  i = 3;
>  	  if (!jobserver)
>  	    {
> -	      snprintf (jobs, 31, "-j%d", parallel);
> +	      snprintf (jobs, 31, "-j%ld",
> +			auto_parallel ? nthreads_var : parallel);
>  	      new_argv[i++] = jobs;
>  	    }
>  	  new_argv[i++] = "all";
>
Jeff Law July 23, 2019, 1:09 p.m. UTC | #2
On 7/23/19 3:20 AM, Jan Hubicka wrote:
>> Hi.
>>
>> As we as openSUSE started using -flto, I see it very handy to have
>> an option value that will automatically detect number of cores
>> that can be used for parallel LTRANS phase.
>>
>> Thoughts?
> Hi,
> great you found time to make this. It should become the default for
> -flto IMO.
I was going to hack it into the rpm configury bits since we have access
to the # cores there.  But an auto-selector within GCC is even better.

BTW, isn't this all going to wreck havoc with reproducible builds since
partitioning can affect code generation?  That's one of our open
questions...

Jeff
Richard Biener July 23, 2019, 1:15 p.m. UTC | #3
On Tue, Jul 23, 2019 at 3:09 PM Jeff Law <law@redhat.com> wrote:
>
> On 7/23/19 3:20 AM, Jan Hubicka wrote:
> >> Hi.
> >>
> >> As we as openSUSE started using -flto, I see it very handy to have
> >> an option value that will automatically detect number of cores
> >> that can be used for parallel LTRANS phase.
> >>
> >> Thoughts?
> > Hi,
> > great you found time to make this. It should become the default for
> > -flto IMO.
> I was going to hack it into the rpm configury bits since we have access
> to the # cores there.  But an auto-selector within GCC is even better.
>
> BTW, isn't this all going to wreck havoc with reproducible builds since
> partitioning can affect code generation?  That's one of our open
> questions...

Partitioning is independent on N in -flto=N it's just dependent on
the LTO partitioning --params.

Richard.

> Jeff
Michael Matz July 23, 2019, 1:50 p.m. UTC | #4
Hi,

On Tue, 23 Jul 2019, Jeff Law wrote:

> > great you found time to make this. It should become the default for
> > -flto IMO.
> I was going to hack it into the rpm configury bits since we have access
> to the # cores there.  But an auto-selector within GCC is even better.
> 
> BTW, isn't this all going to wreck havoc with reproducible builds since 
> partitioning can affect code generation?  That's one of our open 
> questions...

See Richi for this, but the reason for -flto=auto (or just -flto, or 
whatever the endresult will be) is _also_ reproducible builds, because 
some packages like to encode the compile flags into their binaries and 
hence would change depending on build host just because of "-flto=32" vs. 
"-flto=64" even when the code remains exactly the same.


Ciao,
Michael.
Jeff Law July 23, 2019, 1:57 p.m. UTC | #5
On 7/23/19 7:50 AM, Michael Matz wrote:
> Hi,
> 
> On Tue, 23 Jul 2019, Jeff Law wrote:
> 
>>> great you found time to make this. It should become the default for
>>> -flto IMO.
>> I was going to hack it into the rpm configury bits since we have access
>> to the # cores there.  But an auto-selector within GCC is even better.
>>
>> BTW, isn't this all going to wreck havoc with reproducible builds since 
>> partitioning can affect code generation?  That's one of our open 
>> questions...
> 
> See Richi for this, but the reason for -flto=auto (or just -flto, or 
> whatever the endresult will be) is _also_ reproducible builds, because 
> some packages like to encode the compile flags into their binaries and 
> hence would change depending on build host just because of "-flto=32" vs. 
> "-flto=64" even when the code remains exactly the same.
Makes sense.

What did you end up doing with old autoconf scripts that aren't LTO
safe?  Many of the older style tests to see if a function exists are
broken by LTO.  I've seen more issues with this than anything in the LTO
space so far.

We're capturing config.h files and comparing them with and without LTO
to at least be able to enumerate where these issues are.   Sadly this
test gets lots of false positives due to flag and timestamp encodings
into the config.h files :(

jeff
Martin Liška July 23, 2019, 2:23 p.m. UTC | #6
On 7/23/19 3:57 PM, Jeff Law wrote:
> On 7/23/19 7:50 AM, Michael Matz wrote:
>> Hi,
>>
>> On Tue, 23 Jul 2019, Jeff Law wrote:
>>
>>>> great you found time to make this. It should become the default for
>>>> -flto IMO.
>>> I was going to hack it into the rpm configury bits since we have access
>>> to the # cores there.  But an auto-selector within GCC is even better.
>>>
>>> BTW, isn't this all going to wreck havoc with reproducible builds since 
>>> partitioning can affect code generation?  That's one of our open 
>>> questions...
>>
>> See Richi for this, but the reason for -flto=auto (or just -flto, or 
>> whatever the endresult will be) is _also_ reproducible builds, because 
>> some packages like to encode the compile flags into their binaries and 
>> hence would change depending on build host just because of "-flto=32" vs. 
>> "-flto=64" even when the code remains exactly the same.
> Makes sense.
> 
> What did you end up doing with old autoconf scripts that aren't LTO
> safe?  Many of the older style tests to see if a function exists are
> broken by LTO.  I've seen more issues with this than anything in the LTO
> space so far.

Well, I've seen some of these failures, but only a few.

Martin

> 
> We're capturing config.h files and comparing them with and without LTO
> to at least be able to enumerate where these issues are.   Sadly this
> test gets lots of false positives due to flag and timestamp encodings
> into the config.h files :(
> 
> jeff
>
Allan Sandfeld Jensen July 23, 2019, 10:11 p.m. UTC | #7
On Dienstag, 23. Juli 2019 10:30:07 CEST Martin Liška wrote:
> Hi.
> 
> As we as openSUSE started using -flto, I see it very handy to have
> an option value that will automatically detect number of cores
> that can be used for parallel LTRANS phase.
> 
> Thoughts?
> 
That's really nice. 

How much extra work would it be to make it support a posix make jobserver? 

As far as I understand, you would need to guess a partition size first (as 
your patch here does), but then only start each job when given a token from 
the jobserver FD.

With that the integration to existing build infrastructure would be optimal.

Cheers
'Allan
Jeff Law July 23, 2019, 10:32 p.m. UTC | #8
On 7/23/19 8:23 AM, Martin Liška wrote:
> On 7/23/19 3:57 PM, Jeff Law wrote:
>> On 7/23/19 7:50 AM, Michael Matz wrote:
>>> Hi,
>>>
>>> On Tue, 23 Jul 2019, Jeff Law wrote:
>>>
>>>>> great you found time to make this. It should become the default for
>>>>> -flto IMO.
>>>> I was going to hack it into the rpm configury bits since we have access
>>>> to the # cores there.  But an auto-selector within GCC is even better.
>>>>
>>>> BTW, isn't this all going to wreck havoc with reproducible builds since 
>>>> partitioning can affect code generation?  That's one of our open 
>>>> questions...
>>>
>>> See Richi for this, but the reason for -flto=auto (or just -flto, or 
>>> whatever the endresult will be) is _also_ reproducible builds, because 
>>> some packages like to encode the compile flags into their binaries and 
>>> hence would change depending on build host just because of "-flto=32" vs. 
>>> "-flto=64" even when the code remains exactly the same.
>> Makes sense.
>>
>> What did you end up doing with old autoconf scripts that aren't LTO
>> safe?  Many of the older style tests to see if a function exists are
>> broken by LTO.  I've seen more issues with this than anything in the LTO
>> space so far.
> 
> Well, I've seen some of these failures, but only a few.
Many appear to be silent, possibly not really affecting anything (like
all the packages that test for doprnt, but really don't care about it in
the end).    But there were enough real failures that I put in auditing
to detect any cases where we get different config.h files with LTO vs
non-LTO and that is tripping often enough to have my concerns about how
much work it's going to be to get everything fixed.


But still, overall we're moving forward.  Next step is to get everything
classified into buckets and start iterating.  Presumably you'd be open
to a google doc of some kind where we can coordinate any such efforts?

jeff

ps.  I'm on PTO July 25 to Aug 5, so not much is going to happen in the
next couple weeks :-)
Martin Liška July 24, 2019, 6:45 a.m. UTC | #9
On 7/24/19 12:11 AM, Allan Sandfeld Jensen wrote:
> On Dienstag, 23. Juli 2019 10:30:07 CEST Martin Liška wrote:
>> Hi.
>>
>> As we as openSUSE started using -flto, I see it very handy to have
>> an option value that will automatically detect number of cores
>> that can be used for parallel LTRANS phase.
>>
>> Thoughts?
>>
> That's really nice. 
> 
> How much extra work would it be to make it support a posix make jobserver? 

We do support it via -flto=jobserver:

           You can also specify -flto=jobserver to use GNU make's job server mode to determine the number of parallel jobs. This is useful when the Makefile calling GCC is already executing in parallel.  You must prepend a + to the command recipe in the parent
           Makefile for this to work.  This option likely only works if MAKE is GNU make.

Problem is that nowadays you how much more common make systems like ninja, meson and others
that probably do not support that.

> 
> As far as I understand, you would need to guess a partition size first (as 
> your patch here does), but then only start each job when given a token from 
> the jobserver FD.
> 
> With that the integration to existing build infrastructure would be optimal.

Fully agree with you. Maybe a new (more generic) infrastructure (API) would be welcome.

Martin

> 
> Cheers
> 'Allan
> 
>
Martin Liška July 24, 2019, 6:47 a.m. UTC | #10
On 7/24/19 12:32 AM, Jeff Law wrote:
> On 7/23/19 8:23 AM, Martin Liška wrote:
>> On 7/23/19 3:57 PM, Jeff Law wrote:
>>> On 7/23/19 7:50 AM, Michael Matz wrote:
>>>> Hi,
>>>>
>>>> On Tue, 23 Jul 2019, Jeff Law wrote:
>>>>
>>>>>> great you found time to make this. It should become the default for
>>>>>> -flto IMO.
>>>>> I was going to hack it into the rpm configury bits since we have access
>>>>> to the # cores there.  But an auto-selector within GCC is even better.
>>>>>
>>>>> BTW, isn't this all going to wreck havoc with reproducible builds since 
>>>>> partitioning can affect code generation?  That's one of our open 
>>>>> questions...
>>>>
>>>> See Richi for this, but the reason for -flto=auto (or just -flto, or 
>>>> whatever the endresult will be) is _also_ reproducible builds, because 
>>>> some packages like to encode the compile flags into their binaries and 
>>>> hence would change depending on build host just because of "-flto=32" vs. 
>>>> "-flto=64" even when the code remains exactly the same.
>>> Makes sense.
>>>
>>> What did you end up doing with old autoconf scripts that aren't LTO
>>> safe?  Many of the older style tests to see if a function exists are
>>> broken by LTO.  I've seen more issues with this than anything in the LTO
>>> space so far.
>>
>> Well, I've seen some of these failures, but only a few.
> Many appear to be silent, possibly not really affecting anything (like
> all the packages that test for doprnt, but really don't care about it in
> the end).    But there were enough real failures that I put in auditing
> to detect any cases where we get different config.h files with LTO vs
> non-LTO and that is tripping often enough to have my concerns about how
> much work it's going to be to get everything fixed.

I see.

> 
> 
> But still, overall we're moving forward.  Next step is to get everything
> classified into buckets and start iterating.  Presumably you'd be open
> to a google doc of some kind where we can coordinate any such efforts?

Sure, I'm open. In the meantime, I've got a META issue that I use for tracking
of LTO-related issues in openSUSE:
https://bugzilla.opensuse.org/show_bug.cgi?id=1133084

Martin

> 
> jeff
> 
> ps.  I'm on PTO July 25 to Aug 5, so not much is going to happen in the
> next couple weeks :-)
>
Allan Sandfeld Jensen July 24, 2019, 7:03 a.m. UTC | #11
On Mittwoch, 24. Juli 2019 08:45:21 CEST Martin Liška wrote:
> On 7/24/19 12:11 AM, Allan Sandfeld Jensen wrote:
> > On Dienstag, 23. Juli 2019 10:30:07 CEST Martin Liška wrote:
> >> Hi.
> >> 
> >> As we as openSUSE started using -flto, I see it very handy to have
> >> an option value that will automatically detect number of cores
> >> that can be used for parallel LTRANS phase.
> >> 
> >> Thoughts?
> > 
> > That's really nice.
> > 
> > How much extra work would it be to make it support a posix make jobserver?
> 
> We do support it via -flto=jobserver:
> 
Good to know :)

> 
> Problem is that nowadays you how much more common make systems like ninja,
> meson and others that probably do not support that.
> 
There are patches to enable it in ninja, and I know some Linux distros apply 
the patches by default. Though that is more listening, so it probably requires 
launching ninja using make, if you want to be able to pass it own to gcc.

'Allan
Martin Liška July 24, 2019, 7:12 a.m. UTC | #12
On 7/24/19 9:03 AM, Allan Sandfeld Jensen wrote:
> On Mittwoch, 24. Juli 2019 08:45:21 CEST Martin Liška wrote:
>> On 7/24/19 12:11 AM, Allan Sandfeld Jensen wrote:
>>> On Dienstag, 23. Juli 2019 10:30:07 CEST Martin Liška wrote:
>>>> Hi.
>>>>
>>>> As we as openSUSE started using -flto, I see it very handy to have
>>>> an option value that will automatically detect number of cores
>>>> that can be used for parallel LTRANS phase.
>>>>
>>>> Thoughts?
>>>
>>> That's really nice.
>>>
>>> How much extra work would it be to make it support a posix make jobserver?
>>
>> We do support it via -flto=jobserver:
>>
> Good to know :)
> 
>>
>> Problem is that nowadays you how much more common make systems like ninja,
>> meson and others that probably do not support that.
>>
> There are patches to enable it in ninja, and I know some Linux distros apply 
> the patches by default. Though that is more listening, so it probably requires 
> launching ninja using make, if you want to be able to pass it own to gcc.

Btw. I know that Nathan will need a much closer integration between a compiler
and a build system for hit C++ modules work. That can be a good opportunity
for the LTO to utilize the infrastructure as well.

Martin

> 
> 'Allan
> 
>
Nathan Sidwell July 24, 2019, 11:05 a.m. UTC | #13
On 7/24/19 3:12 AM, Martin Liška wrote:
> On 7/24/19 9:03 AM, Allan Sandfeld Jensen wrote:

>> There are patches to enable it in ninja, and I know some Linux distros apply
>> the patches by default. Though that is more listening, so it probably requires
>> launching ninja using make, if you want to be able to pass it own to gcc.
> 
> Btw. I know that Nathan will need a much closer integration between a compiler
> and a build system for hit C++ modules work. That can be a good opportunity
> for the LTO to utilize the infrastructure as well.
> 

Yes indeed, I do mention LTO requirements as a foil against 'everything 
is different' complaints about modules :)  FWIW WG21 (C++ std) Study 
Group 15 is the tooling group, which is where build system people hang 
out.  If anyone's interested in joining email me.

nathan
Jeff Law July 24, 2019, 3:12 p.m. UTC | #14
On 7/24/19 12:47 AM, Martin Liška wrote:
> On 7/24/19 12:32 AM, Jeff Law wrote:
>> On 7/23/19 8:23 AM, Martin Liška wrote:
>>> On 7/23/19 3:57 PM, Jeff Law wrote:
>>>> On 7/23/19 7:50 AM, Michael Matz wrote:
>>>>> Hi,
>>>>>
>>>>> On Tue, 23 Jul 2019, Jeff Law wrote:
>>>>>
>>>>>>> great you found time to make this. It should become the default for
>>>>>>> -flto IMO.
>>>>>> I was going to hack it into the rpm configury bits since we have access
>>>>>> to the # cores there.  But an auto-selector within GCC is even better.
>>>>>>
>>>>>> BTW, isn't this all going to wreck havoc with reproducible builds since 
>>>>>> partitioning can affect code generation?  That's one of our open 
>>>>>> questions...
>>>>>
>>>>> See Richi for this, but the reason for -flto=auto (or just -flto, or 
>>>>> whatever the endresult will be) is _also_ reproducible builds, because 
>>>>> some packages like to encode the compile flags into their binaries and 
>>>>> hence would change depending on build host just because of "-flto=32" vs. 
>>>>> "-flto=64" even when the code remains exactly the same.
>>>> Makes sense.
>>>>
>>>> What did you end up doing with old autoconf scripts that aren't LTO
>>>> safe?  Many of the older style tests to see if a function exists are
>>>> broken by LTO.  I've seen more issues with this than anything in the LTO
>>>> space so far.
>>>
>>> Well, I've seen some of these failures, but only a few.
>> Many appear to be silent, possibly not really affecting anything (like
>> all the packages that test for doprnt, but really don't care about it in
>> the end).    But there were enough real failures that I put in auditing
>> to detect any cases where we get different config.h files with LTO vs
>> non-LTO and that is tripping often enough to have my concerns about how
>> much work it's going to be to get everything fixed.
> 
> I see.
> 
>>
>>
>> But still, overall we're moving forward.  Next step is to get everything
>> classified into buckets and start iterating.  Presumably you'd be open
>> to a google doc of some kind where we can coordinate any such efforts?
> 
> Sure, I'm open. In the meantime, I've got a META issue that I use for tracking
> of LTO-related issues in openSUSE:
> https://bugzilla.opensuse.org/show_bug.cgi?id=1133084
Sounds good.  I'll try to get it populated and at least the first level
categorization done today.  Matthew B. may do some additional analysis
while I'm offline.

I've got scripts which can query the jenkins build failure plugin to
help with that first level categorization.  So, in theory, I can query
for all the configury differences or query for any cases where LTO
exposed a fatal warning,  testsuite failures, etc.

I don't want to burn a lot of time on the config.h stuff right now.  At
this stage we're more interested in the other failure categories.  But
I'm going to have to figure out something shortly after returning when I
put in the Fedora 32 change request.

I've got your META BZ in one of my tabs.  I've referred to it several
times over the last few weeks :-)

jeff
Jeff Law July 24, 2019, 3:44 p.m. UTC | #15
On 7/23/19 2:30 AM, Martin Liška wrote:
> Hi.
> 
> As we as openSUSE started using -flto, I see it very handy to have
> an option value that will automatically detect number of cores
> that can be used for parallel LTRANS phase.
> 
> Thoughts?
> 
> gcc/ChangeLog:
> 
> 2019-07-23  Martin Liska  <mliska@suse.cz>
> 
> 	* doc/invoke.texi: Document the new option value.
> 	* lto-wrapper.c (cpuset_popcount): New function
> 	is a copy of libgomp/config/linux/proc.c.
> 	(init_num_threads): Likewise.
> 	(run_gcc): Support -flto=auto.
OK.  I won't be at all surprised if this causes headaches on some hosts,
so please watch for Solaris, AIX, etc etc issues.

Jeff
diff mbox series

Patch

diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 77a2d561e38..58656fbe1e1 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -10398,6 +10398,9 @@  parallel jobs by utilizing an installed @command{make} program.  The
 environment variable @env{MAKE} may be used to override the program
 used.  The default value for @var{n} is 1.
 
+You can specify @var{auto} to automatically detect number of
+cores that will determine the number of parallel jobs.
+
 You can also specify @option{-flto=jobserver} to use GNU make's
 job server mode to determine the number of parallel jobs. This
 is useful when the Makefile calling GCC is already executing in parallel.
diff --git a/gcc/lto-wrapper.c b/gcc/lto-wrapper.c
index 946897726d0..5451285f896 100644
--- a/gcc/lto-wrapper.c
+++ b/gcc/lto-wrapper.c
@@ -1110,6 +1110,110 @@  cmp_priority (const void *a, const void *b)
   return *((const int *)b)-*((const int *)a);
 }
 
+/* Number of CPUs that can be used for parallel LTRANS phase.  */
+
+static unsigned long nthreads_var = 0;
+
+#ifdef HAVE_PTHREAD_AFFINITY_NP
+unsigned long cpuset_size;
+static unsigned long get_cpuset_size;
+cpu_set_t *cpusetp;
+
+unsigned long
+static cpuset_popcount (unsigned long cpusetsize, cpu_set_t *cpusetp)
+{
+#ifdef CPU_COUNT_S
+  /* glibc 2.7 and above provide a macro for this.  */
+  return CPU_COUNT_S (cpusetsize, cpusetp);
+#else
+#ifdef CPU_COUNT
+  if (cpusetsize == sizeof (cpu_set_t))
+    /* glibc 2.6 and above provide a macro for this.  */
+    return CPU_COUNT (cpusetp);
+#endif
+  size_t i;
+  unsigned long ret = 0;
+  STATIC_ASSERT (sizeof (cpusetp->__bits[0]) == sizeof (unsigned long int));
+  for (i = 0; i < cpusetsize / sizeof (cpusetp->__bits[0]); i++)
+    {
+      unsigned long int mask = cpusetp->__bits[i];
+      if (mask == 0)
+	continue;
+      ret += __builtin_popcountl (mask);
+    }
+  return ret;
+#endif
+}
+#endif
+
+/* At startup, determine the default number of threads.  It would seem
+   this should be related to the number of cpus online.  */
+
+static void
+init_num_threads (void)
+{
+#ifdef HAVE_PTHREAD_AFFINITY_NP
+#if defined (_SC_NPROCESSORS_CONF) && defined (CPU_ALLOC_SIZE)
+  cpuset_size = sysconf (_SC_NPROCESSORS_CONF);
+  cpuset_size = CPU_ALLOC_SIZE (cpuset_size);
+#else
+  cpuset_size = sizeof (cpu_set_t);
+#endif
+
+  cpusetp = (cpu_set_t *) xmalloc (gomp_cpuset_size);
+  do
+    {
+      int ret = pthread_getaffinity_np (pthread_self (), gomp_cpuset_size,
+					cpusetp);
+      if (ret == 0)
+	{
+	  /* Count only the CPUs this process can use.  */
+	  nthreads_var = cpuset_popcount (cpuset_size, cpusetp);
+	  if (nthreads_var == 0)
+	    break;
+	  get_cpuset_size = cpuset_size;
+#ifdef CPU_ALLOC_SIZE
+	  unsigned long i;
+	  for (i = cpuset_size * 8; i; i--)
+	    if (CPU_ISSET_S (i - 1, cpuset_size, cpusetp))
+	      break;
+	  cpuset_size = CPU_ALLOC_SIZE (i);
+#endif
+	  return;
+	}
+      if (ret != EINVAL)
+	break;
+#ifdef CPU_ALLOC_SIZE
+      if (cpuset_size < sizeof (cpu_set_t))
+	cpuset_size = sizeof (cpu_set_t);
+      else
+	cpuset_size = cpuset_size * 2;
+      if (cpuset_size < 8 * sizeof (cpu_set_t))
+	cpusetp
+	  = (cpu_set_t *) realloc (cpusetp, cpuset_size);
+      else
+	{
+	  /* Avoid fatal if too large memory allocation would be
+	     requested, e.g. kernel returning EINVAL all the time.  */
+	  void *p = realloc (cpusetp, cpuset_size);
+	  if (p == NULL)
+	    break;
+	  cpusetp = (cpu_set_t *) p;
+	}
+#else
+      break;
+#endif
+    }
+  while (1);
+  cpuset_size = 0;
+  nthreads_var = 1;
+  free (cpusetp);
+  cpusetp = NULL;
+#endif
+#ifdef _SC_NPROCESSORS_ONLN
+  nthreads_var = sysconf (_SC_NPROCESSORS_ONLN);
+#endif
+}
 
 /* Execute gcc. ARGC is the number of arguments. ARGV contains the arguments. */
 
@@ -1124,6 +1228,7 @@  run_gcc (unsigned argc, char *argv[])
   const char *collect_gcc, *collect_gcc_options;
   int parallel = 0;
   int jobserver = 0;
+  int auto_parallel = 0;
   bool no_partition = false;
   struct cl_decoded_option *fdecoded_options = NULL;
   struct cl_decoded_option *offload_fdecoded_options = NULL;
@@ -1251,6 +1356,11 @@  run_gcc (unsigned argc, char *argv[])
 	      jobserver = 1;
 	      parallel = 1;
 	    }
+	  else if (strcmp (option->arg, "auto") == 0)
+	    {
+	      auto_parallel = 1;
+	      parallel = 1;
+	    }
 	  else
 	    {
 	      parallel = atoi (option->arg);
@@ -1291,6 +1401,7 @@  run_gcc (unsigned argc, char *argv[])
     {
       lto_mode = LTO_MODE_LTO;
       jobserver = 0;
+      auto_parallel = 0;
       parallel = 0;
     }
 
@@ -1485,6 +1596,16 @@  cont1:
 
       if (jobserver)
 	obstack_ptr_grow (&argv_obstack, xstrdup ("-fwpa=jobserver"));
+      else if (auto_parallel)
+	{
+	  char buf[256];
+	  init_num_threads ();
+	  if (verbose)
+	    fprintf (stderr, "LTO parallelism level set to %ld\n",
+		     nthreads_var);
+	  sprintf (buf, "-fwpa=%ld", nthreads_var);
+	  obstack_ptr_grow (&argv_obstack, xstrdup (buf));
+	}
       else if (parallel > 1)
 	{
 	  char buf[256];
@@ -1692,7 +1813,8 @@  cont:
 	  i = 3;
 	  if (!jobserver)
 	    {
-	      snprintf (jobs, 31, "-j%d", parallel);
+	      snprintf (jobs, 31, "-j%ld",
+			auto_parallel ? nthreads_var : parallel);
 	      new_argv[i++] = jobs;
 	    }
 	  new_argv[i++] = "all";