diff mbox

[4.7,google] Support for getting CPU type and feature information at run-time. (issue4893046)

Message ID 20110816205046.44176B21AC@azwildcat.mtv.corp.google.com
State New
Headers show

Commit Message

Sriraman Tallam Aug. 16, 2011, 8:50 p.m. UTC
Support for getting CPU type and feature information at run-time.

The following patch provides support for finding the platform type at run-time, like cpu type and features supported. The multi-versioning framework will use the builtins added to dispatch the right function version. Please refer to http://gcc.gnu.org/ml/gcc/2011-08/msg00298.html for details on function multi-versioning usability.

	* tree-pass.h (pass_tree_fold_builtin_target): New pass.
	* builtins.def (BUILT_IN_TARGET_SUPPORTS_CMOV): New builtin.
	(BUILT_IN_TARGET_SUPPORTS_MMX): New builtin.
	(BUILT_IN_TARGET_SUPPORTS_POPCOUNT): New builtin.
	(BUILT_IN_TARGET_SUPPORTS_SSE): New builtin.
	(BUILT_IN_TARGET_SUPPORTS_SSE2): New builtin.
	(BUILT_IN_TARGET_SUPPORTS_SSE3): New builtin.
	(BUILT_IN_TARGET_SUPPORTS_SSSE3): New builtin.
	(BUILT_IN_TARGET_SUPPORTS_SSE4_1): New builtin.
	(BUILT_IN_TARGET_SUPPORTS_SSE4_2): New builtin.
	(BUILT_IN_TARGET_IS_AMD): New builtin.
	(BUILT_IN_TARGET_IS_INTEL): New builtin.
	(BUILT_IN_TARGET_IS_COREI7_NEHALEM): New builtin.
	(BUILT_IN_TARGET_IS_COREI7_WESTMERE): New builtin.
	(BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE): New builtin.
	(BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA): New builtin.
	(BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI): New builtin.
	(BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL): New builtin.
	* mversn-dispatch.c (do_fold_builtin_target): New function.
	(gate_fold_builtin_target): New function.
	(pass_tree_fold_builtin_target): New pass.
	* timevar.def (TV_FOLD_BUILTIN_TARGET): New var.
	* passes.c (init_optimization_passes): Add new pass to pass list.
	* config/i386/i386.c (build_struct_with_one_bit_fields): New function.
	(make_var_decl): New function.
	(get_field_from_struct): New function.
	(make_constructor_to_get_target_type): New function.
	(fold_builtin_target): New function.
	(ix86_fold_builtin): New function.
	(TARGET_FOLD_BUILTIN): New macro.

	* gcc.dg/builtin_target.c: New test.
	
	* config/i386/i386-cpuinfo.c: New file.
	* config/i386/t-cpuinfo: New file.
	* config.host: Add t-cpuinfo to link i386-cpuinfo.o with libgcc


--
This patch is available for review at http://codereview.appspot.com/4893046

Comments

H.J. Lu Aug. 16, 2011, 9:06 p.m. UTC | #1
On Tue, Aug 16, 2011 at 1:50 PM, Sriraman Tallam <tmsriram@google.com> wrote:
> Support for getting CPU type and feature information at run-time.
>
> The following patch provides support for finding the platform type at run-time, like cpu type and features supported. The multi-versioning framework will use the builtins added to dispatch the right function version. Please refer to http://gcc.gnu.org/ml/gcc/2011-08/msg00298.html for details on function multi-versioning usability.
>
>        * tree-pass.h (pass_tree_fold_builtin_target): New pass.
>        * builtins.def (BUILT_IN_TARGET_SUPPORTS_CMOV): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_MMX): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_POPCOUNT): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_SSE): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_SSE2): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_SSE3): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_SSSE3): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_SSE4_1): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_SSE4_2): New builtin.
>        (BUILT_IN_TARGET_IS_AMD): New builtin.
>        (BUILT_IN_TARGET_IS_INTEL): New builtin.
>        (BUILT_IN_TARGET_IS_COREI7_NEHALEM): New builtin.
>        (BUILT_IN_TARGET_IS_COREI7_WESTMERE): New builtin.
>        (BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE): New builtin.

Can you add Intel Atom?

>        (BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA): New builtin.
>        (BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI): New builtin.
>        (BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL): New builtin.
>        * mversn-dispatch.c (do_fold_builtin_target): New function.
>        (gate_fold_builtin_target): New function.
>        (pass_tree_fold_builtin_target): New pass.
>        * timevar.def (TV_FOLD_BUILTIN_TARGET): New var.
>        * passes.c (init_optimization_passes): Add new pass to pass list.
>        * config/i386/i386.c (build_struct_with_one_bit_fields): New function.
>        (make_var_decl): New function.
>        (get_field_from_struct): New function.
>        (make_constructor_to_get_target_type): New function.
>        (fold_builtin_target): New function.
>        (ix86_fold_builtin): New function.
>        (TARGET_FOLD_BUILTIN): New macro.
>
>        * gcc.dg/builtin_target.c: New test.
>
>        * config/i386/i386-cpuinfo.c: New file.
>        * config/i386/t-cpuinfo: New file.
>        * config.host: Add t-cpuinfo to link i386-cpuinfo.o with libgcc
>

> +static void
> +get_intel_cpu (unsigned int family, unsigned int model, unsigned int brand_id)
> +{
> +  /* Parse family and model only if brand ID is 0. */
> +  if (brand_id == 0)
> +    {
> +      switch (family)
> +       {
> +       case 0x5:
> +         __cpu_type = PROCESSOR_PENTIUM;
> +         break;
> +       case 0x6:
> +         switch (model)
> +           {
> +           case 0x1a:
> +           case 0x1e:
> +           case 0x1f:
> +           case 0x2e:
> +             /* Nehalem.  */
> +             __cpu_type = PROCESSOR_COREI7_NEHALEM;
> +             __cpu_model.__cpu_is_corei7_nehalem = 1;
> +             break;
> +           case 0x25:
> +           case 0x2c:
> +           case 0x2f:
> +             /* Westmere.  */
> +             __cpu_type = PROCESSOR_COREI7_WESTMERE;
> +             __cpu_model.__cpu_is_corei7_westmere = 1;
> +             break;
> +           case 0x2a:
> +             /* Sandy Bridge.  */
> +             __cpu_type = PROCESSOR_COREI7_SANDYBRIDGE;
> +             __cpu_model.__cpu_is_corei7_sandybridge = 1;
> +             break;
> +           case 0x17:
> +           case 0x1d:
> +             /* Penryn.  */
> +           case 0x0f:
> +             /* Merom.  */
> +             __cpu_type = PROCESSOR_CORE2;
> +             break;
> +           default:
> +             __cpu_type = PROCESSOR_INTEL_GENERIC;
> +             break;
> +           }
> +         break;
> +       default:
> +         /* We have no idea.  */
> +         __cpu_type = PROCESSOR_INTEL_GENERIC;
> +         break;
> +       }
> +    }
> +}
> +

Please see config/i386/driver-i386.c for Intel CPU detection.
I will try to make it up to date.  For example, I added
model 0x2d, 0x1c, 0x26,

Thanks.
Sriraman Tallam Aug. 16, 2011, 9:14 p.m. UTC | #2
On Tue, Aug 16, 2011 at 2:06 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Tue, Aug 16, 2011 at 1:50 PM, Sriraman Tallam <tmsriram@google.com> wrote:
>> Support for getting CPU type and feature information at run-time.
>>
>> The following patch provides support for finding the platform type at run-time, like cpu type and features supported. The multi-versioning framework will use the builtins added to dispatch the right function version. Please refer to http://gcc.gnu.org/ml/gcc/2011-08/msg00298.html for details on function multi-versioning usability.
>>
>>        * tree-pass.h (pass_tree_fold_builtin_target): New pass.
>>        * builtins.def (BUILT_IN_TARGET_SUPPORTS_CMOV): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_MMX): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_POPCOUNT): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_SSE): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_SSE2): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_SSE3): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_SSSE3): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_SSE4_1): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_SSE4_2): New builtin.
>>        (BUILT_IN_TARGET_IS_AMD): New builtin.
>>        (BUILT_IN_TARGET_IS_INTEL): New builtin.
>>        (BUILT_IN_TARGET_IS_COREI7_NEHALEM): New builtin.
>>        (BUILT_IN_TARGET_IS_COREI7_WESTMERE): New builtin.
>>        (BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE): New builtin.
>
> Can you add Intel Atom?

Yes, I will. There is probably a lot more that is interesting which I
missed and will add if somebody sees fit.

>
>>        (BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA): New builtin.
>>        (BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI): New builtin.
>>        (BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL): New builtin.
>>        * mversn-dispatch.c (do_fold_builtin_target): New function.
>>        (gate_fold_builtin_target): New function.
>>        (pass_tree_fold_builtin_target): New pass.
>>        * timevar.def (TV_FOLD_BUILTIN_TARGET): New var.
>>        * passes.c (init_optimization_passes): Add new pass to pass list.
>>        * config/i386/i386.c (build_struct_with_one_bit_fields): New function.
>>        (make_var_decl): New function.
>>        (get_field_from_struct): New function.
>>        (make_constructor_to_get_target_type): New function.
>>        (fold_builtin_target): New function.
>>        (ix86_fold_builtin): New function.
>>        (TARGET_FOLD_BUILTIN): New macro.
>>
>>        * gcc.dg/builtin_target.c: New test.
>>
>>        * config/i386/i386-cpuinfo.c: New file.
>>        * config/i386/t-cpuinfo: New file.
>>        * config.host: Add t-cpuinfo to link i386-cpuinfo.o with libgcc
>>
>
>> +static void
>> +get_intel_cpu (unsigned int family, unsigned int model, unsigned int brand_id)
>> +{
>> +  /* Parse family and model only if brand ID is 0. */
>> +  if (brand_id == 0)
>> +    {
>> +      switch (family)
>> +       {
>> +       case 0x5:
>> +         __cpu_type = PROCESSOR_PENTIUM;
>> +         break;
>> +       case 0x6:
>> +         switch (model)
>> +           {
>> +           case 0x1a:
>> +           case 0x1e:
>> +           case 0x1f:
>> +           case 0x2e:
>> +             /* Nehalem.  */
>> +             __cpu_type = PROCESSOR_COREI7_NEHALEM;
>> +             __cpu_model.__cpu_is_corei7_nehalem = 1;
>> +             break;
>> +           case 0x25:
>> +           case 0x2c:
>> +           case 0x2f:
>> +             /* Westmere.  */
>> +             __cpu_type = PROCESSOR_COREI7_WESTMERE;
>> +             __cpu_model.__cpu_is_corei7_westmere = 1;
>> +             break;
>> +           case 0x2a:
>> +             /* Sandy Bridge.  */
>> +             __cpu_type = PROCESSOR_COREI7_SANDYBRIDGE;
>> +             __cpu_model.__cpu_is_corei7_sandybridge = 1;
>> +             break;
>> +           case 0x17:
>> +           case 0x1d:
>> +             /* Penryn.  */
>> +           case 0x0f:
>> +             /* Merom.  */
>> +             __cpu_type = PROCESSOR_CORE2;
>> +             break;
>> +           default:
>> +             __cpu_type = PROCESSOR_INTEL_GENERIC;
>> +             break;
>> +           }
>> +         break;
>> +       default:
>> +         /* We have no idea.  */
>> +         __cpu_type = PROCESSOR_INTEL_GENERIC;
>> +         break;
>> +       }
>> +    }
>> +}
>> +
>
> Please see config/i386/driver-i386.c for Intel CPU detection.
> I will try to make it up to date.  For example, I added
> model 0x2d, 0x1c, 0x26,

I used the code in config/i386/driver-i386.c as a reference.

Thanks,
-Sri.

>
> Thanks.
>
> --
> H.J.
>
Andi Kleen Aug. 16, 2011, 9:52 p.m. UTC | #3
tmsriram@google.com (Sriraman Tallam) writes:

> Support for getting CPU type and feature information at run-time.
>
> The following patch provides support for finding the platform type at run-time, like cpu type and features supported. The multi-versioning framework will use the builtins added to dispatch the right function version. Please refer to http://gcc.gnu.org/ml/gcc/2011-08/msg00298.html for details on function multi-versioning usability.

It would be nice if you could share the code for handling the model
numbers with the similar code in gcc.c

-Andi
Joseph Myers Aug. 16, 2011, 10:35 p.m. UTC | #4
On Tue, 16 Aug 2011, Sriraman Tallam wrote:

> Index: libgcc/config/i386/t-cpuinfo
> ===================================================================
> --- libgcc/config/i386/t-cpuinfo	(revision 0)
> +++ libgcc/config/i386/t-cpuinfo	(revision 0)
> @@ -0,0 +1,2 @@
> +# This is an endfile
> +LIB2ADD += $(srcdir)/config/i386/i386-cpuinfo.c

What do you mean by this comment?  That it's linked in like crt*end*.o?  
It looks to me like a normal libgcc object, not an endfile.

> Index: libgcc/config/i386/i386-cpuinfo.c
> ===================================================================
> --- libgcc/config/i386/i386-cpuinfo.c	(revision 0)
> +++ libgcc/config/i386/i386-cpuinfo.c	(revision 0)
> @@ -0,0 +1,275 @@
> +/* Copyright (C) 2011 Free Software Foundation, Inc.
> + * Contributed by Sriraman Tallam <tmsriram@google.com>.

Please format in the normal way; no leading "*" on each comment line.

> +#include <string.h>

Don't include headers not provided by GCC in libgcc without checking 
inhibit_libc, to avoid bootstrap problems.  Declaring just the functions 
you need is safer here than including a system header.

> +#ifdef __GNUC__

Such a conditional does not make sense in libgcc code.

> +/* This function will be linked in to binaries that need to look up
> +   CPU information.  */
> +
> +void
> +__cpu_indicator_init(void)

Format according to the GNU Coding Standards.

You appear not to have added any symbol versions; do you have a particular 
rationale for these functions being linked separately into each executable 
and shared library needing them, rather than being exported from the 
shared libgcc?
Sriraman Tallam Aug. 17, 2011, 12:07 a.m. UTC | #5
On Tue, Aug 16, 2011 at 3:35 PM, Joseph S. Myers
<joseph@codesourcery.com> wrote:
> On Tue, 16 Aug 2011, Sriraman Tallam wrote:
>
>> Index: libgcc/config/i386/t-cpuinfo
>> ===================================================================
>> --- libgcc/config/i386/t-cpuinfo      (revision 0)
>> +++ libgcc/config/i386/t-cpuinfo      (revision 0)
>> @@ -0,0 +1,2 @@
>> +# This is an endfile
>> +LIB2ADD += $(srcdir)/config/i386/i386-cpuinfo.c
>
> What do you mean by this comment?  That it's linked in like crt*end*.o?
> It looks to me like a normal libgcc object, not an endfile.
>
>> Index: libgcc/config/i386/i386-cpuinfo.c
>> ===================================================================
>> --- libgcc/config/i386/i386-cpuinfo.c (revision 0)
>> +++ libgcc/config/i386/i386-cpuinfo.c (revision 0)
>> @@ -0,0 +1,275 @@
>> +/* Copyright (C) 2011 Free Software Foundation, Inc.
>> + * Contributed by Sriraman Tallam <tmsriram@google.com>.
>
> Please format in the normal way; no leading "*" on each comment line.
>
>> +#include <string.h>
>
> Don't include headers not provided by GCC in libgcc without checking
> inhibit_libc, to avoid bootstrap problems.  Declaring just the functions
> you need is safer here than including a system header.
>
>> +#ifdef __GNUC__
>
> Such a conditional does not make sense in libgcc code.
>
>> +/* This function will be linked in to binaries that need to look up
>> +   CPU information.  */
>> +
>> +void
>> +__cpu_indicator_init(void)
>
> Format according to the GNU Coding Standards.
>
> You appear not to have added any symbol versions; do you have a particular
> rationale for these functions being linked separately into each executable
> and shared library needing them, rather than being exported from the
> shared libgcc?

I did not realize I could just make shared libgcc export those
symbols. I will make the changes you mentioned.

Thanks,
-Sri.

>
> --
> Joseph S. Myers
> joseph@codesourcery.com
>
Richard Biener Aug. 17, 2011, 7:37 a.m. UTC | #6
On Tue, Aug 16, 2011 at 10:50 PM, Sriraman Tallam <tmsriram@google.com> wrote:
> Support for getting CPU type and feature information at run-time.
>
> The following patch provides support for finding the platform type at run-time, like cpu type and features supported. The multi-versioning framework will use the builtins added to dispatch the right function version. Please refer to http://gcc.gnu.org/ml/gcc/2011-08/msg00298.html for details on function multi-versioning usability.

Please provide an overview of why you need the new builtins, why you need
a separate pass to fold them (instead of just expanding them) and why
you are creating vars behind the back of GCC:

+  /* Set finalized to 1, otherwise it asserts in function "write_symbol" in
+     lto-streamer-out.c. */
+  vnode->finalized = 1;

where I think you are missing a varpool_finalize_node call somewhere.  Why
isn't this all done at target init time?  If you don't mark the
variable as to be preserved, like you do, cgraph will optimize it all
away if it isn't needed.

Richard.

>        * tree-pass.h (pass_tree_fold_builtin_target): New pass.
>        * builtins.def (BUILT_IN_TARGET_SUPPORTS_CMOV): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_MMX): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_POPCOUNT): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_SSE): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_SSE2): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_SSE3): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_SSSE3): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_SSE4_1): New builtin.
>        (BUILT_IN_TARGET_SUPPORTS_SSE4_2): New builtin.
>        (BUILT_IN_TARGET_IS_AMD): New builtin.
>        (BUILT_IN_TARGET_IS_INTEL): New builtin.
>        (BUILT_IN_TARGET_IS_COREI7_NEHALEM): New builtin.
>        (BUILT_IN_TARGET_IS_COREI7_WESTMERE): New builtin.
>        (BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE): New builtin.
>        (BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA): New builtin.
>        (BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI): New builtin.
>        (BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL): New builtin.
>        * mversn-dispatch.c (do_fold_builtin_target): New function.
>        (gate_fold_builtin_target): New function.
>        (pass_tree_fold_builtin_target): New pass.
>        * timevar.def (TV_FOLD_BUILTIN_TARGET): New var.
>        * passes.c (init_optimization_passes): Add new pass to pass list.
>        * config/i386/i386.c (build_struct_with_one_bit_fields): New function.
>        (make_var_decl): New function.
>        (get_field_from_struct): New function.
>        (make_constructor_to_get_target_type): New function.
>        (fold_builtin_target): New function.
>        (ix86_fold_builtin): New function.
>        (TARGET_FOLD_BUILTIN): New macro.
>
>        * gcc.dg/builtin_target.c: New test.
>
>        * config/i386/i386-cpuinfo.c: New file.
>        * config/i386/t-cpuinfo: New file.
>        * config.host: Add t-cpuinfo to link i386-cpuinfo.o with libgcc
>
> Index: libgcc/config.host
> ===================================================================
> --- libgcc/config.host  (revision 177767)
> +++ libgcc/config.host  (working copy)
> @@ -609,7 +609,7 @@ case ${host} in
>  i[34567]86-*-linux* | x86_64-*-linux* | \
>   i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | \
>   i[34567]86-*-gnu*)
> -       tmake_file="${tmake_file} t-tls"
> +       tmake_file="${tmake_file} t-tls i386/t-cpuinfo"
>        if test "$libgcc_cv_cfi" = "yes"; then
>                tmake_file="${tmake_file} t-stack i386/t-stack-i386"
>        fi
> Index: libgcc/config/i386/t-cpuinfo
> ===================================================================
> --- libgcc/config/i386/t-cpuinfo        (revision 0)
> +++ libgcc/config/i386/t-cpuinfo        (revision 0)
> @@ -0,0 +1,2 @@
> +# This is an endfile
> +LIB2ADD += $(srcdir)/config/i386/i386-cpuinfo.c
> Index: libgcc/config/i386/i386-cpuinfo.c
> ===================================================================
> --- libgcc/config/i386/i386-cpuinfo.c   (revision 0)
> +++ libgcc/config/i386/i386-cpuinfo.c   (revision 0)
> @@ -0,0 +1,275 @@
> +/* Copyright (C) 2011 Free Software Foundation, Inc.
> + * Contributed by Sriraman Tallam <tmsriram@google.com>.
> + *
> + * This file is free software; you can redistribute it and/or modify it
> + * under the terms of the GNU General Public License as published by the
> + * Free Software Foundation; either version 3, or (at your option) any
> + * later version.
> + *
> + * This file is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * Under Section 7 of GPL version 3, you are granted additional
> + * permissions described in the GCC Runtime Library Exception, version
> + * 3.1, as published by the Free Software Foundation.
> + *
> + * You should have received a copy of the GNU General Public License and
> + * a copy of the GCC Runtime Library Exception along with this program;
> + * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> + * <http://www.gnu.org/licenses/>.
> + *
> + *
> + * This code is adapted from gcc/config/i386/driver-i386.c. The CPUID
> + * instruction is used to figure out the cpu type and supported features.
> + * GCC runs __cpu_indicator_init from a constructor which sets the members
> + * of __cpu_model and __cpu_features.
> + */
> +
> +#include <string.h>
> +
> +#ifdef __GNUC__
> +#include "cpuid.h"
> +
> +enum processor_type
> +{
> +  PROCESSOR_PENTIUM = 0,
> +  PROCESSOR_CORE2,
> +  PROCESSOR_COREI7_NEHALEM,
> +  PROCESSOR_COREI7_WESTMERE,
> +  PROCESSOR_COREI7_SANDYBRIDGE,
> +  PROCESSOR_INTEL_GENERIC,
> +  PROCESSOR_AMDFAM10_BARCELONA,
> +  PROCESSOR_AMDFAM10_SHANGHAI,
> +  PROCESSOR_AMDFAM10_ISTANBUL,
> +  PROCESSOR_AMDFAM10_GENERIC,
> +  PROCESSOR_AMD_GENERIC,
> +  PROCESSOR_GENERIC,
> +  PROCESSOR_max
> +};
> +
> +enum vendor_signatures
> +{
> +  SIG_INTEL =  0x756e6547 /* Genu */,
> +  SIG_AMD =    0x68747541 /* Auth */
> +};
> +
> +
> +/* Features supported. */
> +
> +struct __processor_features
> +{
> +  unsigned int __cpu_cmov : 1;
> +  unsigned int __cpu_mmx : 1;
> +  unsigned int __cpu_popcnt : 1;
> +  unsigned int __cpu_sse : 1;
> +  unsigned int __cpu_sse2 : 1;
> +  unsigned int __cpu_sse3 : 1;
> +  unsigned int __cpu_ssse3 : 1;
> +  unsigned int __cpu_sse4_1 : 1;
> +  unsigned int __cpu_sse4_2 : 1;
> +};
> +
> +/* Flags exported. */
> +
> +struct __processor_model
> +{
> +  unsigned int __cpu_is_amd : 1;
> +  unsigned int __cpu_is_intel : 1;
> +  unsigned int __cpu_is_corei7_nehalem : 1;
> +  unsigned int __cpu_is_corei7_westmere : 1;
> +  unsigned int __cpu_is_corei7_sandybridge : 1;
> +  unsigned int __cpu_is_amdfam10_barcelona : 1;
> +  unsigned int __cpu_is_amdfam10_shanghai : 1;
> +  unsigned int __cpu_is_amdfam10_istanbul : 1;
> +};
> +
> +enum processor_type __cpu_type = PROCESSOR_GENERIC;
> +struct __processor_features __cpu_features;
> +struct __processor_model __cpu_model;
> +
> +static void
> +get_amd_cpu (unsigned int family, unsigned int model)
> +{
> +  switch (family)
> +    {
> +    case 0x10:
> +      switch (model)
> +       {
> +       case 0x2:
> +         __cpu_type = PROCESSOR_AMDFAM10_BARCELONA;
> +         __cpu_model.__cpu_is_amdfam10_barcelona = 1;
> +         break;
> +       case 0x4:
> +         __cpu_type = PROCESSOR_AMDFAM10_SHANGHAI;
> +         __cpu_model.__cpu_is_amdfam10_shanghai = 1;
> +         break;
> +       case 0x8:
> +         __cpu_type = PROCESSOR_AMDFAM10_ISTANBUL;
> +         __cpu_model.__cpu_is_amdfam10_istanbul = 1;
> +         break;
> +       default:
> +         __cpu_type = PROCESSOR_AMDFAM10_GENERIC;
> +         break;
> +       }
> +      break;
> +    default:
> +      __cpu_type = PROCESSOR_AMD_GENERIC;
> +    }
> +}
> +
> +static void
> +get_intel_cpu (unsigned int family, unsigned int model, unsigned int brand_id)
> +{
> +  /* Parse family and model only if brand ID is 0. */
> +  if (brand_id == 0)
> +    {
> +      switch (family)
> +       {
> +       case 0x5:
> +         __cpu_type = PROCESSOR_PENTIUM;
> +         break;
> +       case 0x6:
> +         switch (model)
> +           {
> +           case 0x1a:
> +           case 0x1e:
> +           case 0x1f:
> +           case 0x2e:
> +             /* Nehalem.  */
> +             __cpu_type = PROCESSOR_COREI7_NEHALEM;
> +             __cpu_model.__cpu_is_corei7_nehalem = 1;
> +             break;
> +           case 0x25:
> +           case 0x2c:
> +           case 0x2f:
> +             /* Westmere.  */
> +             __cpu_type = PROCESSOR_COREI7_WESTMERE;
> +             __cpu_model.__cpu_is_corei7_westmere = 1;
> +             break;
> +           case 0x2a:
> +             /* Sandy Bridge.  */
> +             __cpu_type = PROCESSOR_COREI7_SANDYBRIDGE;
> +             __cpu_model.__cpu_is_corei7_sandybridge = 1;
> +             break;
> +           case 0x17:
> +           case 0x1d:
> +             /* Penryn.  */
> +           case 0x0f:
> +             /* Merom.  */
> +             __cpu_type = PROCESSOR_CORE2;
> +             break;
> +           default:
> +             __cpu_type = PROCESSOR_INTEL_GENERIC;
> +             break;
> +           }
> +         break;
> +       default:
> +         /* We have no idea.  */
> +         __cpu_type = PROCESSOR_INTEL_GENERIC;
> +         break;
> +       }
> +    }
> +}
> +
> +static void
> +get_available_features (unsigned int ecx, unsigned int edx)
> +{
> +  __cpu_features.__cpu_cmov = (edx & bit_CMOV) ? 1 : 0;
> +  __cpu_features.__cpu_mmx = (edx & bit_MMX) ? 1 : 0;
> +  __cpu_features.__cpu_sse = (edx & bit_SSE) ? 1 : 0;
> +  __cpu_features.__cpu_sse2 = (edx & bit_SSE2) ? 1 : 0;
> +  __cpu_features.__cpu_popcnt = (ecx & bit_POPCNT) ? 1 : 0;
> +  __cpu_features.__cpu_sse3 = (ecx & bit_SSE3) ? 1 : 0;
> +  __cpu_features.__cpu_ssse3 = (ecx & bit_SSSE3) ? 1 : 0;
> +  __cpu_features.__cpu_sse4_1 = (ecx & bit_SSE4_1) ? 1 : 0;
> +  __cpu_features.__cpu_sse4_2 = (ecx & bit_SSE4_2) ? 1 : 0;
> +}
> +
> +/* A noinline function calling __get_cpuid. Having many calls to
> +   cpuid in one function in 32-bit mode causes GCC to complain:
> +   "can’t find a register in class ‘CLOBBERED_REGS’".  This is
> +   related to PR rtl-optimization 44174. */
> +
> +static int __attribute__ ((noinline))
> +__get_cpuid_output (unsigned int __level,
> +                   unsigned int *__eax, unsigned int *__ebx,
> +                   unsigned int *__ecx, unsigned int *__edx)
> +{
> +  return __get_cpuid (__level, __eax, __ebx, __ecx, __edx);
> +}
> +
> +/* This function will be linked in to binaries that need to look up
> +   CPU information.  */
> +
> +void
> +__cpu_indicator_init(void)
> +{
> +  unsigned int eax, ebx, ecx, edx;
> +
> +  int max_level = 5;
> +  unsigned int vendor;
> +  unsigned int model, family, brand_id;
> +
> +  memset (&__cpu_features, 0, sizeof (struct __processor_features));
> +  memset (&__cpu_model, 0, sizeof (struct __processor_model));
> +
> +  /* Assume cpuid insn present. Run in level 0 to get vendor id. */
> +  if (!__get_cpuid_output (0, &eax, &ebx, &ecx, &edx))
> +    return;
> +
> +  vendor = ebx;
> +  max_level = eax;
> +
> +  if (max_level < 1)
> +    return;
> +
> +  if (!__get_cpuid_output (1, &eax, &ebx, &ecx, &edx))
> +    return;
> +
> +  model = (eax >> 4) & 0x0f;
> +  family = (eax >> 8) & 0x0f;
> +  brand_id = ebx & 0xff;
> +
> +  /* Adjust model and family for Intel CPUS. */
> +  if (vendor == SIG_INTEL)
> +    {
> +      unsigned int extended_model, extended_family;
> +
> +      extended_model = (eax >> 12) & 0xf0;
> +      extended_family = (eax >> 20) & 0xff;
> +      if (family == 0x0f)
> +       {
> +         family += extended_family;
> +         model += extended_model;
> +       }
> +      else if (family == 0x06)
> +       model += extended_model;
> +    }
> +
> +  /* Find CPU model. */
> +
> +  if (vendor == SIG_AMD)
> +    {
> +      __cpu_model.__cpu_is_amd = 1;
> +      get_amd_cpu (family, model);
> +    }
> +  else if (vendor == SIG_INTEL)
> +    {
> +      __cpu_model.__cpu_is_intel = 1;
> +      get_intel_cpu (family, model, brand_id);
> +    }
> +
> +  /* Find available features. */
> +  get_available_features (ecx, edx);
> +}
> +
> +#else
> +
> +void
> +__cpu_indicator_init(void)
> +{
> +}
> +
> +#endif /* __GNUC__ */
> Index: gcc/tree-pass.h
> ===================================================================
> --- gcc/tree-pass.h     (revision 177767)
> +++ gcc/tree-pass.h     (working copy)
> @@ -449,6 +449,7 @@ extern struct gimple_opt_pass pass_split_functions
>  extern struct gimple_opt_pass pass_feedback_split_functions;
>  extern struct gimple_opt_pass pass_threadsafe_analyze;
>  extern struct gimple_opt_pass pass_tree_convert_builtin_dispatch;
> +extern struct gimple_opt_pass pass_tree_fold_builtin_target;
>
>  /* IPA Passes */
>  extern struct simple_ipa_opt_pass pass_ipa_lower_emutls;
> Index: gcc/testsuite/gcc.dg/builtin_target.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/builtin_target.c       (revision 0)
> +++ gcc/testsuite/gcc.dg/builtin_target.c       (revision 0)
> @@ -0,0 +1,49 @@
> +/* This test checks if the __builtin_target_* calls are recognized. */
> +
> +/* { dg-do run } */
> +
> +int
> +fn1 ()
> +{
> +  if (__builtin_target_supports_cmov () < 0)
> +    return -1;
> +  if (__builtin_target_supports_mmx () < 0)
> +    return -1;
> +  if (__builtin_target_supports_popcount () < 0)
> +    return -1;
> +  if (__builtin_target_supports_sse () < 0)
> +    return -1;
> +  if (__builtin_target_supports_sse2 () < 0)
> +    return -1;
> +  if (__builtin_target_supports_sse3 () < 0)
> +    return -1;
> +  if (__builtin_target_supports_ssse3 () < 0)
> +    return -1;
> +  if (__builtin_target_supports_sse4_1 () < 0)
> +    return -1;
> +  if (__builtin_target_supports_sse4_2 () < 0)
> +    return -1;
> +  if (__builtin_target_is_amd () < 0)
> +    return -1;
> +  if (__builtin_target_is_intel () < 0)
> +    return -1;
> +  if (__builtin_target_is_corei7_nehalem () < 0)
> +    return -1;
> +  if (__builtin_target_is_corei7_westmere () < 0)
> +    return -1;
> +  if (__builtin_target_is_corei7_sandybridge () < 0)
> +    return -1;
> +  if (__builtin_target_is_amdfam10_barcelona () < 0)
> +    return -1;
> +  if (__builtin_target_is_amdfam10_shanghai () < 0)
> +    return -1;
> +  if (__builtin_target_is_amdfam10_istanbul () < 0)
> +    return -1;
> +
> +  return 0;
> +}
> +
> +int main ()
> +{
> +  return fn1 ();
> +}
> Index: gcc/builtins.def
> ===================================================================
> --- gcc/builtins.def    (revision 177767)
> +++ gcc/builtins.def    (working copy)
> @@ -763,6 +763,25 @@ DEF_BUILTIN (BUILT_IN_EMUTLS_REGISTER_COMMON,
>  /* Multiversioning builtin dispatch hook. */
>  DEF_GCC_BUILTIN (BUILT_IN_DISPATCH, "dispatch", BT_FN_INT_PTR_FN_INT_PTR_PTR_VAR, ATTR_NULL)
>
> +/* Builtins to determine target type and features at run-time. */
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_CMOV, "target_supports_cmov", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_MMX, "target_supports_mmx", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_POPCOUNT, "target_supports_popcount", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE, "target_supports_sse", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE2, "target_supports_sse2", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE3, "target_supports_sse3", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSSE3, "target_supports_ssse3", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE4_1, "target_supports_sse4_1", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE4_2, "target_supports_sse4_2", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMD, "target_is_amd", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_INTEL, "target_is_intel", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_NEHALEM, "target_is_corei7_nehalem", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_WESTMERE, "target_is_corei7_westmere", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE, "target_is_corei7_sandybridge", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA, "target_is_amdfam10_barcelona", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI, "target_is_amdfam10_shanghai", BT_FN_INT, ATTR_NULL)
> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL, "target_is_amdfam10_istanbul", BT_FN_INT, ATTR_NULL)
> +
>  /* Exception support.  */
>  DEF_BUILTIN_STUB (BUILT_IN_UNWIND_RESUME, "__builtin_unwind_resume")
>  DEF_BUILTIN_STUB (BUILT_IN_CXA_END_CLEANUP, "__builtin_cxa_end_cleanup")
> Index: gcc/mversn-dispatch.c
> ===================================================================
> --- gcc/mversn-dispatch.c       (revision 177767)
> +++ gcc/mversn-dispatch.c       (working copy)
> @@ -135,6 +135,7 @@ along with GCC; see the file COPYING3.  If not see
>  #include "output.h"
>  #include "vecprim.h"
>  #include "gimple-pretty-print.h"
> +#include "target.h"
>
>  typedef struct cgraph_node* NODEPTR;
>  DEF_VEC_P (NODEPTR);
> @@ -1764,3 +1765,103 @@ struct gimple_opt_pass pass_tree_convert_builtin_d
>   TODO_update_ssa | TODO_verify_ssa
>  }
>  };
> +
> +/* Fold calls to __builtin_target_* */
> +
> +static unsigned int
> +do_fold_builtin_target (void)
> +{
> +  basic_block bb;
> +  gimple_stmt_iterator gsi;
> +
> +  /* Go through each stmt looking for __builtin_target_* calls */
> +  FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (current_function_decl))
> +    {
> +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
> +        {
> +         gimple stmt = gsi_stmt (gsi);
> +         gimple assign_stmt;
> +          tree call_decl;
> +         tree lhs_retval;
> +         tree folded_val;
> +
> +         tree ssa_var, tmp_var;
> +         gimple init_stmt;
> +
> +          if (!is_gimple_call (stmt))
> +            continue;
> +
> +          call_decl = gimple_call_fndecl (stmt);
> +
> +         /* Check if it is a __builtin_target_* call. */
> +
> +         if (call_decl == NULL
> +             || DECL_NAME (call_decl) == NULL_TREE
> +             || DECL_BUILT_IN_CLASS (call_decl) != BUILT_IN_NORMAL
> +             || strstr (IDENTIFIER_POINTER (DECL_NAME (call_decl)),
> +                         "__builtin_target") == NULL)
> +            continue;
> +
> +         /* If the lhs is NULL there is no need to fold the call. */
> +         lhs_retval = gimple_call_lhs(stmt);
> +         if (lhs_retval == NULL)
> +           continue;
> +
> +         /* Call the target hook to fold the builtin */
> +          folded_val = targetm.fold_builtin(call_decl, 0, NULL, false);
> +
> +         /* If the target does not support the builtin then fold it to zero. */
> +         if (folded_val == NULL_TREE)
> +           folded_val = build_zero_cst (unsigned_type_node);
> +
> +         /* Type cast unsigned value to integer */
> +         tmp_var = create_tmp_var (unsigned_type_node, NULL);
> +         init_stmt = gimple_build_assign (tmp_var, folded_val);
> +         ssa_var = make_ssa_name (tmp_var, init_stmt);
> +         gimple_assign_set_lhs (init_stmt, ssa_var);
> +         mark_symbols_for_renaming (init_stmt);
> +
> +         assign_stmt = gimple_build_assign_with_ops (NOP_EXPR, lhs_retval, ssa_var, 0);
> +         mark_symbols_for_renaming(assign_stmt);
> +
> +         gsi_insert_after_without_update (&gsi, assign_stmt, GSI_SAME_STMT);
> +         gsi_insert_after_without_update (&gsi, init_stmt, GSI_SAME_STMT);
> +         /* Delete the original call. */
> +         gsi_remove(&gsi, true);
> +       }
> +    }
> +
> +  return 0;
> +}
> +
> +static bool
> +gate_fold_builtin_target (void)
> +{
> +  return true;
> +}
> +
> +/* Pass to fold __builtin_target_* functions */
> +
> +struct gimple_opt_pass pass_tree_fold_builtin_target =
> +{
> + {
> +  GIMPLE_PASS,
> +  "fold_builtin_target",               /* name */
> +  gate_fold_builtin_target,            /* gate */
> +  do_fold_builtin_target,              /* execute */
> +  NULL,                                        /* sub */
> +  NULL,                                        /* next */
> +  0,                                   /* static_pass_number */
> +  TV_FOLD_BUILTIN_TARGET,              /* tv_id */
> +  PROP_cfg,                            /* properties_required */
> +  PROP_cfg,                            /* properties_provided */
> +  0,                                   /* properties_destroyed */
> +  0,                                   /* todo_flags_start */
> +  TODO_dump_func |                     /* todo_flags_finish */
> +  TODO_cleanup_cfg |
> +  TODO_update_ssa |
> +  TODO_verify_ssa
> + }
> +};
> +
> +
> Index: gcc/timevar.def
> ===================================================================
> --- gcc/timevar.def     (revision 177767)
> +++ gcc/timevar.def     (working copy)
> @@ -124,6 +124,7 @@ DEFTIMEVAR (TV_PARSE_INMETH          , "parser inl
>  DEFTIMEVAR (TV_TEMPLATE_INST         , "template instantiation")
>  DEFTIMEVAR (TV_INLINE_HEURISTICS     , "inline heuristics")
>  DEFTIMEVAR (TV_MVERSN_DISPATCH       , "multiversion dispatch")
> +DEFTIMEVAR (TV_FOLD_BUILTIN_TARGET   , "fold __builtin_target calls")
>  DEFTIMEVAR (TV_INTEGRATION           , "integration")
>  DEFTIMEVAR (TV_TREE_GIMPLIFY        , "tree gimplify")
>  DEFTIMEVAR (TV_TREE_EH              , "tree eh")
> Index: gcc/passes.c
> ===================================================================
> --- gcc/passes.c        (revision 177767)
> +++ gcc/passes.c        (working copy)
> @@ -1249,6 +1249,8 @@ init_optimization_passes (void)
>     {
>       struct opt_pass **p = &pass_ipa_multiversion_dispatch.pass.sub;
>       NEXT_PASS (pass_tree_convert_builtin_dispatch);
> +      /* Fold calls to __builtin_target_*. */
> +      NEXT_PASS (pass_tree_fold_builtin_target);
>       /* Rebuilding cgraph edges is necessary as the above passes change
>          the call graph.  Otherwise, future optimizations use the old
>         call graph and make wrong decisions sometimes.*/
> Index: gcc/config/i386/i386.c
> ===================================================================
> --- gcc/config/i386/i386.c      (revision 177767)
> +++ gcc/config/i386/i386.c      (working copy)
> @@ -58,6 +58,8 @@ along with GCC; see the file COPYING3.  If not see
>  #include "sched-int.h"
>  #include "sbitmap.h"
>  #include "fibheap.h"
> +#include "tree-flow.h"
> +#include "tree-pass.h"
>
>  enum upper_128bits_state
>  {
> @@ -7867,6 +7869,338 @@ ix86_build_builtin_va_list (void)
>   return ret;
>  }
>
> +/* Returns a struct type with name NAME and number of fields equal to
> +   NUM_FIELDS.  Each field is a unsigned int bit field of length 1 bit. */
> +
> +static tree
> +build_struct_with_one_bit_fields (int num_fields, const char *name)
> +{
> +  int i;
> +  char field_name [10];
> +  tree field = NULL_TREE, field_chain = NULL_TREE;
> +  tree type = make_node (RECORD_TYPE);
> +
> +  strcpy (field_name, "k_field");
> +
> +  for (i = 0; i < num_fields; i++)
> +    {
> +      /* Name the fields, 0_field, 1_field, ... */
> +      field_name [0] = '0' + i;
> +      field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
> +                         get_identifier (field_name), unsigned_type_node);
> +      DECL_BIT_FIELD (field) = 1;
> +      DECL_SIZE (field) = bitsize_one_node;
> +      if (field_chain != NULL_TREE)
> +       DECL_CHAIN (field) = field_chain;
> +      field_chain = field;
> +    }
> +  finish_builtin_struct (type, name, field_chain, NULL_TREE);
> +  return type;
> +}
> +
> +/* Returns a VAR_DECL of type TYPE and name NAME. */
> +
> +static tree
> +make_var_decl (tree type, const char *name)
> +{
> +  tree new_decl;
> +  struct varpool_node *vnode;
> +
> +  new_decl = build_decl (UNKNOWN_LOCATION,
> +                        VAR_DECL,
> +                        get_identifier(name),
> +                        type);
> +
> +  DECL_EXTERNAL (new_decl) = 1;
> +  TREE_STATIC (new_decl) = 1;
> +  TREE_PUBLIC (new_decl) = 1;
> +  DECL_INITIAL (new_decl) = 0;
> +  DECL_ARTIFICIAL (new_decl) = 0;
> +  DECL_PRESERVE_P (new_decl) = 1;
> +
> +  make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
> +  assemble_variable (new_decl, 0, 0, 0);
> +
> +  vnode = varpool_node (new_decl);
> +  gcc_assert (vnode != NULL);
> +  /* Set finalized to 1, otherwise it asserts in function "write_symbol" in
> +     lto-streamer-out.c. */
> +  vnode->finalized = 1;
> +
> +  return new_decl;
> +}
> +
> +/* Traverses the chain of fields in STRUCT_TYPE and returns the FIELD_NUM
> +   numbered field. */
> +
> +static tree
> +get_field_from_struct (tree struct_type, int field_num)
> +{
> +  int i;
> +  tree field = TYPE_FIELDS (struct_type);
> +
> +  for (i = 0; i < field_num; i++, field = DECL_CHAIN(field))
> +    {
> +      gcc_assert (field != NULL_TREE);
> +    }
> +
> +  return field;
> +}
> +
> +/* Create a new static constructor that calls __cpu_indicator_init ()
> +   function defined in libgcc/config/i386-cpuinfo.c which runs cpuid
> +   to figure out the type of the target. */
> +
> +static tree
> +make_constructor_to_get_target_type (const char *name)
> +{
> +  tree decl, type, t;
> +  gimple_seq seq;
> +  basic_block new_bb;
> +  tree old_current_function_decl;
> +
> +  tree __cpu_indicator_int_decl;
> +  gimple constructor_body;
> +
> +
> +  type = build_function_type_list (void_type_node, NULL_TREE);
> +
> +  /* Make a call stmt to __cpu_indicator_init */
> +  __cpu_indicator_int_decl = build_fn_decl ("__cpu_indicator_init", type);
> +  constructor_body = gimple_build_call (__cpu_indicator_int_decl, 0);
> +  DECL_EXTERNAL (__cpu_indicator_int_decl) = 1;
> +
> +  decl = build_fn_decl (name, type);
> +
> +  DECL_NAME (decl) = get_identifier (name);
> +  SET_DECL_ASSEMBLER_NAME (decl, DECL_NAME (decl));
> +  gcc_assert (cgraph_node (decl) != NULL);
> +
> +  TREE_USED (decl) = 1;
> +  DECL_ARTIFICIAL (decl) = 1;
> +  DECL_IGNORED_P (decl) = 0;
> +  TREE_PUBLIC (decl) = 0;
> +  DECL_UNINLINABLE (decl) = 1;
> +  DECL_EXTERNAL (decl) = 0;
> +  DECL_CONTEXT (decl) = NULL_TREE;
> +  DECL_INITIAL (decl) = make_node (BLOCK);
> +  DECL_STATIC_CONSTRUCTOR (decl) = 1;
> +  TREE_READONLY (decl) = 0;
> +  DECL_PURE_P (decl) = 0;
> +
> +  /* This is a comdat. */
> +  make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
> +
> +  /* Build result decl and add to function_decl. */
> +  t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, void_type_node);
> +  DECL_ARTIFICIAL (t) = 1;
> +  DECL_IGNORED_P (t) = 1;
> +  DECL_RESULT (decl) = t;
> +
> +  gimplify_function_tree (decl);
> +
> +  /* Build CFG for this function. */
> +
> +  old_current_function_decl = current_function_decl;
> +  push_cfun (DECL_STRUCT_FUNCTION (decl));
> +  current_function_decl = decl;
> +  init_empty_tree_cfg_for_function (DECL_STRUCT_FUNCTION (decl));
> +  cfun->curr_properties |=
> +    (PROP_gimple_lcf | PROP_gimple_leh | PROP_cfg | PROP_referenced_vars |
> +     PROP_ssa);
> +  new_bb = create_empty_bb (ENTRY_BLOCK_PTR);
> +  make_edge (ENTRY_BLOCK_PTR, new_bb, EDGE_FALLTHRU);
> +
> +  /* XXX: Not sure if the edge commented below is necessary.  If I add this
> +     edge, it fails in gimple_verify_flow_info in tree-cfg.c in condition :
> +     " if (e->flags & EDGE_FALLTHRU)"
> +     during -fprofile-generate.
> +     Otherwise, it is fine.  Deleting this edge does not break anything.
> +     Commenting this so that it is clear I am intentionally not doing this.*/
> +  /* make_edge (new_bb, EXIT_BLOCK_PTR, EDGE_FALLTHRU); */
> +
> +  seq = gimple_seq_alloc_with_stmt (constructor_body);
> +
> +  set_bb_seq (new_bb, seq);
> +  gimple_set_bb (constructor_body, new_bb);
> +
> +  /* Set the lexical block of the constructor body. Fails the inliner
> +     other wise. */
> +  gimple_set_block (constructor_body, DECL_INITIAL (decl));
> +
> +  /* This call is very important if this pass runs when the IR is in
> +     SSA form.  It breaks things in strange ways otherwise. */
> +  init_tree_ssa (DECL_STRUCT_FUNCTION (decl));
> +  /* add_referenced_var (version_selector_var); */
> +
> +  cgraph_add_new_function (decl, true);
> +  cgraph_call_function_insertion_hooks (cgraph_node (decl));
> +  cgraph_mark_needed_node (cgraph_node (decl));
> +
> +  pop_cfun ();
> +  current_function_decl = old_current_function_decl;
> +  return decl;
> +}
> +
> +/* FNDECL is a __builtin_target_* call that is folded into an integer defined
> +   in libgcc/config/i386/i386-cpuinfo.c */
> +
> +static tree
> +fold_builtin_target (tree fndecl)
> +{
> +  /* This is the order of bit-fields in __processor_features in
> +     i386-cpuinfo.c */
> +  enum processor_features
> +  {
> +    F_CMOV = 0,
> +    F_MMX,
> +    F_POPCNT,
> +    F_SSE,
> +    F_SSE2,
> +    F_SSE3,
> +    F_SSSE3,
> +    F_SSE4_1,
> +    F_SSE4_2,
> +    F_MAX
> +  };
> +
> +  /* This is the order of bit-fields in __processor_model in
> +     i386-cpuinfo.c */
> +  enum processor_model
> +  {
> +    M_AMD = 0,
> +    M_INTEL,
> +    M_COREI7_NEHALEM,
> +    M_COREI7_WESTMERE,
> +    M_COREI7_SANDYBRIDGE,
> +    M_AMDFAM10_BARCELONA,
> +    M_AMDFAM10_SHANGHAI,
> +    M_AMDFAM10_ISTANBUL,
> +    M_MAX
> +  };
> +
> +  static tree __processor_features_type = NULL_TREE;
> +  static tree __cpu_features_var = NULL_TREE;
> +  static tree __processor_model_type = NULL_TREE;
> +  static tree __cpu_model_var = NULL_TREE;
> +  static tree ctor_decl = NULL_TREE;
> +  static tree field;
> +  static tree which_struct;
> +
> +  /* Make a call to __cpu_indicatior_init in a constructor.
> +     Function __cpu_indicator_init is defined in i386-cpuinfo.c. */
> +  if (ctor_decl == NULL_TREE)
> +   ctor_decl = make_constructor_to_get_target_type
> +               ("__cpu_indicator_init_ctor");
> +
> +  if (__processor_features_type == NULL_TREE)
> +    __processor_features_type = build_struct_with_one_bit_fields (F_MAX,
> +                                 "__processor_features");
> +
> +  if (__processor_model_type == NULL_TREE)
> +    __processor_model_type = build_struct_with_one_bit_fields (M_MAX,
> +                                 "__processor_model");
> +
> +  if (__cpu_features_var == NULL_TREE)
> +    __cpu_features_var = make_var_decl (__processor_features_type,
> +                                       "__cpu_features");
> +
> +  if (__cpu_model_var == NULL_TREE)
> +    __cpu_model_var = make_var_decl (__processor_model_type,
> +                                    "__cpu_model");
> +
> +  /* Look at fndecl code to identify the field requested. */
> +  switch (DECL_FUNCTION_CODE (fndecl))
> +    {
> +    case BUILT_IN_TARGET_SUPPORTS_CMOV:
> +      field = get_field_from_struct (__processor_features_type, F_CMOV);
> +      which_struct = __cpu_features_var;
> +      break;
> +    case BUILT_IN_TARGET_SUPPORTS_MMX:
> +      field = get_field_from_struct (__processor_features_type, F_MMX);
> +      which_struct = __cpu_features_var;
> +      break;
> +    case BUILT_IN_TARGET_SUPPORTS_POPCOUNT:
> +      field = get_field_from_struct (__processor_features_type, F_POPCNT);
> +      which_struct = __cpu_features_var;
> +      break;
> +    case BUILT_IN_TARGET_SUPPORTS_SSE:
> +      field = get_field_from_struct (__processor_features_type, F_SSE);
> +      which_struct = __cpu_features_var;
> +      break;
> +    case BUILT_IN_TARGET_SUPPORTS_SSE2:
> +      field = get_field_from_struct (__processor_features_type, F_SSE2);
> +      which_struct = __cpu_features_var;
> +      break;
> +    case BUILT_IN_TARGET_SUPPORTS_SSE3:
> +      field = get_field_from_struct (__processor_features_type, F_SSE3);
> +      which_struct = __cpu_features_var;
> +      break;
> +    case BUILT_IN_TARGET_SUPPORTS_SSSE3:
> +      field = get_field_from_struct (__processor_features_type, F_SSE3);
> +      which_struct = __cpu_features_var;
> +      break;
> +    case BUILT_IN_TARGET_SUPPORTS_SSE4_1:
> +      field = get_field_from_struct (__processor_features_type, F_SSE4_1);
> +      which_struct = __cpu_features_var;
> +      break;
> +    case BUILT_IN_TARGET_SUPPORTS_SSE4_2:
> +      field = get_field_from_struct (__processor_features_type, F_SSE4_2);
> +      which_struct = __cpu_features_var;
> +      break;
> +    case BUILT_IN_TARGET_IS_AMD:
> +      field = get_field_from_struct (__processor_model_type, M_AMD);;
> +      which_struct = __cpu_model_var;
> +      break;
> +    case BUILT_IN_TARGET_IS_INTEL:
> +      field = get_field_from_struct (__processor_model_type, M_INTEL);;
> +      which_struct = __cpu_model_var;
> +      break;
> +    case BUILT_IN_TARGET_IS_COREI7_NEHALEM:
> +      field = get_field_from_struct (__processor_model_type, M_COREI7_NEHALEM);;
> +      which_struct = __cpu_model_var;
> +      break;
> +    case BUILT_IN_TARGET_IS_COREI7_WESTMERE:
> +      field = get_field_from_struct (__processor_model_type, M_COREI7_WESTMERE);;
> +      which_struct = __cpu_model_var;
> +      break;
> +    case BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE:
> +      field = get_field_from_struct (__processor_model_type, M_COREI7_SANDYBRIDGE);;
> +      which_struct = __cpu_model_var;
> +      break;
> +    case BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA:
> +      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_BARCELONA);;
> +      which_struct = __cpu_model_var;
> +      break;
> +    case BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI:
> +      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_SHANGHAI);;
> +      which_struct = __cpu_model_var;
> +      break;
> +    case BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL:
> +      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_ISTANBUL);;
> +      which_struct = __cpu_model_var;
> +      break;
> +    default:
> +      return NULL_TREE;
> +    }
> +
> +  return build3 (COMPONENT_REF, TREE_TYPE (field), which_struct, field, NULL_TREE);
> +}
> +
> +/* Folds __builtin_target_* builtins. */
> +
> +static tree
> +ix86_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED,
> +                   tree *args ATTRIBUTE_UNUSED, bool ignore ATTRIBUTE_UNUSED)
> +{
> +  const char *decl_name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
> +  if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
> +      && strstr(decl_name, "__builtin_target") != NULL)
> +    return fold_builtin_target (fndecl);
> +
> +  return NULL_TREE;
> +}
> +
>  /* Worker function for TARGET_SETUP_INCOMING_VARARGS.  */
>
>  static void
> @@ -35097,6 +35431,9 @@ ix86_autovectorize_vector_sizes (void)
>  #undef TARGET_BUILD_BUILTIN_VA_LIST
>  #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
>
> +#undef TARGET_FOLD_BUILTIN
> +#define TARGET_FOLD_BUILTIN ix86_fold_builtin
> +
>  #undef TARGET_ENUM_VA_LIST_P
>  #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
>
>
> --
> This patch is available for review at http://codereview.appspot.com/4893046
>
Sriraman Tallam Aug. 17, 2011, 5:54 p.m. UTC | #7
On Wed, Aug 17, 2011 at 12:37 AM, Richard Guenther
<richard.guenther@gmail.com> wrote:
> On Tue, Aug 16, 2011 at 10:50 PM, Sriraman Tallam <tmsriram@google.com> wrote:
>> Support for getting CPU type and feature information at run-time.
>>
>> The following patch provides support for finding the platform type at run-time, like cpu type and features supported. The multi-versioning framework will use the builtins added to dispatch the right function version. Please refer to http://gcc.gnu.org/ml/gcc/2011-08/msg00298.html for details on function multi-versioning usability.
>
> Please provide an overview why you need the new builtins,

For multi-versioning,  the compiler can call the appropriate builtin
to dispatch the right version. The builtin call will later get folded.

For example,

int  __attribute__ version ("sse4_1")
compute ()
{
   // Do sse4_1 specific implementation.
}

int
compute ()
{
  // Generic implementation
}

The compiler will check if the target supports the attribute and then
convert a call to compute ()  into  this:

if (__builtin_target_supports_sse4_1 ())
  compute_sse4_1 (); // Call to the SSE4_1 implementation
else
  compute_generic (); // Call to the generic implementation

Further, having it as a builtin function allows it to be overridden by
the programmer. For instance, the programmer can override it to
identify newer CPU types not yet supported. Having these builtins
makes it convenient to identify platform type and features in general.

why you need
> a separate pass to fold them (instead of just expanding them) and why

I can move it into builtins.c, where the other builtins are
folded, and remove the separate pass. My intention originally was to
fold them as early as possible, in this case after multi-versioning,
but I guess this is not a requirement.

> you are creating
> vars behind the back of GCC:

The flow I had in mind was to have functions in libgcc which will use
CPUID to get target features and set global vars corresponding to the
features. So, the builtin should be folded into the appropriate
variable in libgcc.

>
> +  /* Set finalized to 1, otherwise it asserts in function "write_symbol" in
> +     lto-streamer-out.c. */
> +  vnode->finalized = 1;
>
> where I think you miss a varpool_finalize_node call somewhere.  Why
> isn't this all done at target init time

I wanted to do this on demand. If none of the new builtins are called
in the program, I do not need to do this at all. In summary, libgcc
has a function called __cpu_indicator_init which does the work of
determining target features and setting the appropriate globals. If
the new builtins are called, gcc will call __cpu_indicator_init in a
constructor so that it is called exactly once. Then, gcc will fold the
builtin to the appropriate global variable.


?  If you don't mark the
> variable as to be preserved
> like you do cgraph will optimize it all away if it isn't needed.

>
> Richard.
>
>>        * tree-pass.h (pass_tree_fold_builtin_target): New pass.
>>        * builtins.def (BUILT_IN_TARGET_SUPPORTS_CMOV): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_MMX): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_POPCOUNT): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_SSE): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_SSE2): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_SSE3): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_SSSE3): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_SSE4_1): New builtin.
>>        (BUILT_IN_TARGET_SUPPORTS_SSE4_2): New builtin.
>>        (BUILT_IN_TARGET_IS_AMD): New builtin.
>>        (BUILT_IN_TARGET_IS_INTEL): New builtin.
>>        (BUILT_IN_TARGET_IS_COREI7_NEHALEM): New builtin.
>>        (BUILT_IN_TARGET_IS_COREI7_WESTMERE): New builtin.
>>        (BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE): New builtin.
>>        (BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA): New builtin.
>>        (BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI): New builtin.
>>        (BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL): New builtin.
>>        * mversn-dispatch.c (do_fold_builtin_target): New function.
>>        (gate_fold_builtin_target): New function.
>>        (pass_tree_fold_builtin_target): New pass.
>>        * timevar.def (TV_FOLD_BUILTIN_TARGET): New var.
>>        * passes.c (init_optimization_passes): Add new pass to pass list.
>>        * config/i386/i386.c (build_struct_with_one_bit_fields): New function.
>>        (make_var_decl): New function.
>>        (get_field_from_struct): New function.
>>        (make_constructor_to_get_target_type): New function.
>>        (fold_builtin_target): New function.
>>        (ix86_fold_builtin): New function.
>>        (TARGET_FOLD_BUILTIN): New macro.
>>
>>        * gcc.dg/builtin_target.c: New test.
>>
>>        * config/i386/i386-cpuinfo.c: New file.
>>        * config/i386/t-cpuinfo: New file.
>>        * config.host: Add t-cpuinfo to link i386-cpuinfo.o with libgcc
>>
>> Index: libgcc/config.host
>> ===================================================================
>> --- libgcc/config.host  (revision 177767)
>> +++ libgcc/config.host  (working copy)
>> @@ -609,7 +609,7 @@ case ${host} in
>>  i[34567]86-*-linux* | x86_64-*-linux* | \
>>   i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | \
>>   i[34567]86-*-gnu*)
>> -       tmake_file="${tmake_file} t-tls"
>> +       tmake_file="${tmake_file} t-tls i386/t-cpuinfo"
>>        if test "$libgcc_cv_cfi" = "yes"; then
>>                tmake_file="${tmake_file} t-stack i386/t-stack-i386"
>>        fi
>> Index: libgcc/config/i386/t-cpuinfo
>> ===================================================================
>> --- libgcc/config/i386/t-cpuinfo        (revision 0)
>> +++ libgcc/config/i386/t-cpuinfo        (revision 0)
>> @@ -0,0 +1,2 @@
>> +# This is an endfile
>> +LIB2ADD += $(srcdir)/config/i386/i386-cpuinfo.c
>> Index: libgcc/config/i386/i386-cpuinfo.c
>> ===================================================================
>> --- libgcc/config/i386/i386-cpuinfo.c   (revision 0)
>> +++ libgcc/config/i386/i386-cpuinfo.c   (revision 0)
>> @@ -0,0 +1,275 @@
>> +/* Copyright (C) 2011 Free Software Foundation, Inc.
>> + * Contributed by Sriraman Tallam <tmsriram@google.com>.
>> + *
>> + * This file is free software; you can redistribute it and/or modify it
>> + * under the terms of the GNU General Public License as published by the
>> + * Free Software Foundation; either version 3, or (at your option) any
>> + * later version.
>> + *
>> + * This file is distributed in the hope that it will be useful, but
>> + * WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * General Public License for more details.
>> + *
>> + * Under Section 7 of GPL version 3, you are granted additional
>> + * permissions described in the GCC Runtime Library Exception, version
>> + * 3.1, as published by the Free Software Foundation.
>> + *
>> + * You should have received a copy of the GNU General Public License and
>> + * a copy of the GCC Runtime Library Exception along with this program;
>> + * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>> + * <http://www.gnu.org/licenses/>.
>> + *
>> + *
>> + * This code is adapted from gcc/config/i386/driver-i386.c. The CPUID
>> + * instruction is used to figure out the cpu type and supported features.
>> + * GCC runs __cpu_indicator_init from a constructor which sets the members
>> + * of __cpu_model and __cpu_features.
>> + */
>> +
>> +#include <string.h>
>> +
>> +#ifdef __GNUC__
>> +#include "cpuid.h"
>> +
>> +enum processor_type
>> +{
>> +  PROCESSOR_PENTIUM = 0,
>> +  PROCESSOR_CORE2,
>> +  PROCESSOR_COREI7_NEHALEM,
>> +  PROCESSOR_COREI7_WESTMERE,
>> +  PROCESSOR_COREI7_SANDYBRIDGE,
>> +  PROCESSOR_INTEL_GENERIC,
>> +  PROCESSOR_AMDFAM10_BARCELONA,
>> +  PROCESSOR_AMDFAM10_SHANGHAI,
>> +  PROCESSOR_AMDFAM10_ISTANBUL,
>> +  PROCESSOR_AMDFAM10_GENERIC,
>> +  PROCESSOR_AMD_GENERIC,
>> +  PROCESSOR_GENERIC,
>> +  PROCESSOR_max
>> +};
>> +
>> +enum vendor_signatures
>> +{
>> +  SIG_INTEL =  0x756e6547 /* Genu */,
>> +  SIG_AMD =    0x68747541 /* Auth */
>> +};
>> +
>> +
>> +/* Features supported. */
>> +
>> +struct __processor_features
>> +{
>> +  unsigned int __cpu_cmov : 1;
>> +  unsigned int __cpu_mmx : 1;
>> +  unsigned int __cpu_popcnt : 1;
>> +  unsigned int __cpu_sse : 1;
>> +  unsigned int __cpu_sse2 : 1;
>> +  unsigned int __cpu_sse3 : 1;
>> +  unsigned int __cpu_ssse3 : 1;
>> +  unsigned int __cpu_sse4_1 : 1;
>> +  unsigned int __cpu_sse4_2 : 1;
>> +};
>> +
>> +/* Flags exported. */
>> +
>> +struct __processor_model
>> +{
>> +  unsigned int __cpu_is_amd : 1;
>> +  unsigned int __cpu_is_intel : 1;
>> +  unsigned int __cpu_is_corei7_nehalem : 1;
>> +  unsigned int __cpu_is_corei7_westmere : 1;
>> +  unsigned int __cpu_is_corei7_sandybridge : 1;
>> +  unsigned int __cpu_is_amdfam10_barcelona : 1;
>> +  unsigned int __cpu_is_amdfam10_shanghai : 1;
>> +  unsigned int __cpu_is_amdfam10_istanbul : 1;
>> +};
>> +
>> +enum processor_type __cpu_type = PROCESSOR_GENERIC;
>> +struct __processor_features __cpu_features;
>> +struct __processor_model __cpu_model;
>> +
>> +static void
>> +get_amd_cpu (unsigned int family, unsigned int model)
>> +{
>> +  switch (family)
>> +    {
>> +    case 0x10:
>> +      switch (model)
>> +       {
>> +       case 0x2:
>> +         __cpu_type = PROCESSOR_AMDFAM10_BARCELONA;
>> +         __cpu_model.__cpu_is_amdfam10_barcelona = 1;
>> +         break;
>> +       case 0x4:
>> +         __cpu_type = PROCESSOR_AMDFAM10_SHANGHAI;
>> +         __cpu_model.__cpu_is_amdfam10_shanghai = 1;
>> +         break;
>> +       case 0x8:
>> +         __cpu_type = PROCESSOR_AMDFAM10_ISTANBUL;
>> +         __cpu_model.__cpu_is_amdfam10_istanbul = 1;
>> +         break;
>> +       default:
>> +         __cpu_type = PROCESSOR_AMDFAM10_GENERIC;
>> +         break;
>> +       }
>> +      break;
>> +    default:
>> +      __cpu_type = PROCESSOR_AMD_GENERIC;
>> +    }
>> +}
>> +
>> +static void
>> +get_intel_cpu (unsigned int family, unsigned int model, unsigned int brand_id)
>> +{
>> +  /* Parse family and model only if brand ID is 0. */
>> +  if (brand_id == 0)
>> +    {
>> +      switch (family)
>> +       {
>> +       case 0x5:
>> +         __cpu_type = PROCESSOR_PENTIUM;
>> +         break;
>> +       case 0x6:
>> +         switch (model)
>> +           {
>> +           case 0x1a:
>> +           case 0x1e:
>> +           case 0x1f:
>> +           case 0x2e:
>> +             /* Nehalem.  */
>> +             __cpu_type = PROCESSOR_COREI7_NEHALEM;
>> +             __cpu_model.__cpu_is_corei7_nehalem = 1;
>> +             break;
>> +           case 0x25:
>> +           case 0x2c:
>> +           case 0x2f:
>> +             /* Westmere.  */
>> +             __cpu_type = PROCESSOR_COREI7_WESTMERE;
>> +             __cpu_model.__cpu_is_corei7_westmere = 1;
>> +             break;
>> +           case 0x2a:
>> +             /* Sandy Bridge.  */
>> +             __cpu_type = PROCESSOR_COREI7_SANDYBRIDGE;
>> +             __cpu_model.__cpu_is_corei7_sandybridge = 1;
>> +             break;
>> +           case 0x17:
>> +           case 0x1d:
>> +             /* Penryn.  */
>> +           case 0x0f:
>> +             /* Merom.  */
>> +             __cpu_type = PROCESSOR_CORE2;
>> +             break;
>> +           default:
>> +             __cpu_type = PROCESSOR_INTEL_GENERIC;
>> +             break;
>> +           }
>> +         break;
>> +       default:
>> +         /* We have no idea.  */
>> +         __cpu_type = PROCESSOR_INTEL_GENERIC;
>> +         break;
>> +       }
>> +    }
>> +}
>> +
>> +static void
>> +get_available_features (unsigned int ecx, unsigned int edx)
>> +{
>> +  __cpu_features.__cpu_cmov = (edx & bit_CMOV) ? 1 : 0;
>> +  __cpu_features.__cpu_mmx = (edx & bit_MMX) ? 1 : 0;
>> +  __cpu_features.__cpu_sse = (edx & bit_SSE) ? 1 : 0;
>> +  __cpu_features.__cpu_sse2 = (edx & bit_SSE2) ? 1 : 0;
>> +  __cpu_features.__cpu_popcnt = (ecx & bit_POPCNT) ? 1 : 0;
>> +  __cpu_features.__cpu_sse3 = (ecx & bit_SSE3) ? 1 : 0;
>> +  __cpu_features.__cpu_ssse3 = (ecx & bit_SSSE3) ? 1 : 0;
>> +  __cpu_features.__cpu_sse4_1 = (ecx & bit_SSE4_1) ? 1 : 0;
>> +  __cpu_features.__cpu_sse4_2 = (ecx & bit_SSE4_2) ? 1 : 0;
>> +}
>> +
>> +/* A noinline function calling __get_cpuid. Having many calls to
>> +   cpuid in one function in 32-bit mode causes GCC to complain:
>> +   "can’t find a register in class ‘CLOBBERED_REGS’".  This is
>> +   related to PR rtl-optimization 44174. */
>> +
>> +static int __attribute__ ((noinline))
>> +__get_cpuid_output (unsigned int __level,
>> +                   unsigned int *__eax, unsigned int *__ebx,
>> +                   unsigned int *__ecx, unsigned int *__edx)
>> +{
>> +  return __get_cpuid (__level, __eax, __ebx, __ecx, __edx);
>> +}
>> +
>> +/* This function will be linked in to binaries that need to look up
>> +   CPU information.  */
>> +
>> +void
>> +__cpu_indicator_init(void)
>> +{
>> +  unsigned int eax, ebx, ecx, edx;
>> +
>> +  int max_level = 5;
>> +  unsigned int vendor;
>> +  unsigned int model, family, brand_id;
>> +
>> +  memset (&__cpu_features, 0, sizeof (struct __processor_features));
>> +  memset (&__cpu_model, 0, sizeof (struct __processor_model));
>> +
>> +  /* Assume cpuid insn present. Run in level 0 to get vendor id. */
>> +  if (!__get_cpuid_output (0, &eax, &ebx, &ecx, &edx))
>> +    return;
>> +
>> +  vendor = ebx;
>> +  max_level = eax;
>> +
>> +  if (max_level < 1)
>> +    return;
>> +
>> +  if (!__get_cpuid_output (1, &eax, &ebx, &ecx, &edx))
>> +    return;
>> +
>> +  model = (eax >> 4) & 0x0f;
>> +  family = (eax >> 8) & 0x0f;
>> +  brand_id = ebx & 0xff;
>> +
>> +  /* Adjust model and family for Intel CPUs. */
>> +  if (vendor == SIG_INTEL)
>> +    {
>> +      unsigned int extended_model, extended_family;
>> +
>> +      extended_model = (eax >> 12) & 0xf0;
>> +      extended_family = (eax >> 20) & 0xff;
>> +      if (family == 0x0f)
>> +       {
>> +         family += extended_family;
>> +         model += extended_model;
>> +       }
>> +      else if (family == 0x06)
>> +       model += extended_model;
>> +    }
>> +
>> +  /* Find CPU model. */
>> +
>> +  if (vendor == SIG_AMD)
>> +    {
>> +      __cpu_model.__cpu_is_amd = 1;
>> +      get_amd_cpu (family, model);
>> +    }
>> +  else if (vendor == SIG_INTEL)
>> +    {
>> +      __cpu_model.__cpu_is_intel = 1;
>> +      get_intel_cpu (family, model, brand_id);
>> +    }
>> +
>> +  /* Find available features. */
>> +  get_available_features (ecx, edx);
>> +}
>> +
>> +#else
>> +
>> +void
>> +__cpu_indicator_init(void)
>> +{
>> +}
>> +
>> +#endif /* __GNUC__ */
>> Index: gcc/tree-pass.h
>> ===================================================================
>> --- gcc/tree-pass.h     (revision 177767)
>> +++ gcc/tree-pass.h     (working copy)
>> @@ -449,6 +449,7 @@ extern struct gimple_opt_pass pass_split_functions
>>  extern struct gimple_opt_pass pass_feedback_split_functions;
>>  extern struct gimple_opt_pass pass_threadsafe_analyze;
>>  extern struct gimple_opt_pass pass_tree_convert_builtin_dispatch;
>> +extern struct gimple_opt_pass pass_tree_fold_builtin_target;
>>
>>  /* IPA Passes */
>>  extern struct simple_ipa_opt_pass pass_ipa_lower_emutls;
>> Index: gcc/testsuite/gcc.dg/builtin_target.c
>> ===================================================================
>> --- gcc/testsuite/gcc.dg/builtin_target.c       (revision 0)
>> +++ gcc/testsuite/gcc.dg/builtin_target.c       (revision 0)
>> @@ -0,0 +1,49 @@
>> +/* This test checks if the __builtin_target_* calls are recognized. */
>> +
>> +/* { dg-do run } */
>> +
>> +int
>> +fn1 ()
>> +{
>> +  if (__builtin_target_supports_cmov () < 0)
>> +    return -1;
>> +  if (__builtin_target_supports_mmx () < 0)
>> +    return -1;
>> +  if (__builtin_target_supports_popcount () < 0)
>> +    return -1;
>> +  if (__builtin_target_supports_sse () < 0)
>> +    return -1;
>> +  if (__builtin_target_supports_sse2 () < 0)
>> +    return -1;
>> +  if (__builtin_target_supports_sse3 () < 0)
>> +    return -1;
>> +  if (__builtin_target_supports_ssse3 () < 0)
>> +    return -1;
>> +  if (__builtin_target_supports_sse4_1 () < 0)
>> +    return -1;
>> +  if (__builtin_target_supports_sse4_2 () < 0)
>> +    return -1;
>> +  if (__builtin_target_is_amd () < 0)
>> +    return -1;
>> +  if (__builtin_target_is_intel () < 0)
>> +    return -1;
>> +  if (__builtin_target_is_corei7_nehalem () < 0)
>> +    return -1;
>> +  if (__builtin_target_is_corei7_westmere () < 0)
>> +    return -1;
>> +  if (__builtin_target_is_corei7_sandybridge () < 0)
>> +    return -1;
>> +  if (__builtin_target_is_amdfam10_barcelona () < 0)
>> +    return -1;
>> +  if (__builtin_target_is_amdfam10_shanghai () < 0)
>> +    return -1;
>> +  if (__builtin_target_is_amdfam10_istanbul () < 0)
>> +    return -1;
>> +
>> +  return 0;
>> +}
>> +
>> +int main ()
>> +{
>> +  return fn1 ();
>> +}
>> Index: gcc/builtins.def
>> ===================================================================
>> --- gcc/builtins.def    (revision 177767)
>> +++ gcc/builtins.def    (working copy)
>> @@ -763,6 +763,25 @@ DEF_BUILTIN (BUILT_IN_EMUTLS_REGISTER_COMMON,
>>  /* Multiversioning builtin dispatch hook. */
>>  DEF_GCC_BUILTIN (BUILT_IN_DISPATCH, "dispatch", BT_FN_INT_PTR_FN_INT_PTR_PTR_VAR, ATTR_NULL)
>>
>> +/* Builtins to determine target type and features at run-time. */
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_CMOV, "target_supports_cmov", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_MMX, "target_supports_mmx", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_POPCOUNT, "target_supports_popcount", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE, "target_supports_sse", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE2, "target_supports_sse2", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE3, "target_supports_sse3", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSSE3, "target_supports_ssse3", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE4_1, "target_supports_sse4_1", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE4_2, "target_supports_sse4_2", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMD, "target_is_amd", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_INTEL, "target_is_intel", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_NEHALEM, "target_is_corei7_nehalem", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_WESTMERE, "target_is_corei7_westmere", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE, "target_is_corei7_sandybridge", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA, "target_is_amdfam10_barcelona", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI, "target_is_amdfam10_shanghai", BT_FN_INT, ATTR_NULL)
>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL, "target_is_amdfam10_istanbul", BT_FN_INT, ATTR_NULL)
>> +
>>  /* Exception support.  */
>>  DEF_BUILTIN_STUB (BUILT_IN_UNWIND_RESUME, "__builtin_unwind_resume")
>>  DEF_BUILTIN_STUB (BUILT_IN_CXA_END_CLEANUP, "__builtin_cxa_end_cleanup")
>> Index: gcc/mversn-dispatch.c
>> ===================================================================
>> --- gcc/mversn-dispatch.c       (revision 177767)
>> +++ gcc/mversn-dispatch.c       (working copy)
>> @@ -135,6 +135,7 @@ along with GCC; see the file COPYING3.  If not see
>>  #include "output.h"
>>  #include "vecprim.h"
>>  #include "gimple-pretty-print.h"
>> +#include "target.h"
>>
>>  typedef struct cgraph_node* NODEPTR;
>>  DEF_VEC_P (NODEPTR);
>> @@ -1764,3 +1765,103 @@ struct gimple_opt_pass pass_tree_convert_builtin_d
>>   TODO_update_ssa | TODO_verify_ssa
>>  }
>>  };
>> +
>> +/* Fold calls to __builtin_target_* */
>> +
>> +static unsigned int
>> +do_fold_builtin_target (void)
>> +{
>> +  basic_block bb;
>> +  gimple_stmt_iterator gsi;
>> +
>> +  /* Go through each stmt looking for __builtin_target_* calls */
>> +  FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (current_function_decl))
>> +    {
>> +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>> +        {
>> +         gimple stmt = gsi_stmt (gsi);
>> +         gimple assign_stmt;
>> +          tree call_decl;
>> +         tree lhs_retval;
>> +         tree folded_val;
>> +
>> +         tree ssa_var, tmp_var;
>> +         gimple init_stmt;
>> +
>> +          if (!is_gimple_call (stmt))
>> +            continue;
>> +
>> +          call_decl = gimple_call_fndecl (stmt);
>> +
>> +         /* Check if it is a __builtin_target_* call. */
>> +
>> +         if (call_decl == NULL
>> +             || DECL_NAME (call_decl) == NULL_TREE
>> +             || DECL_BUILT_IN_CLASS (call_decl) != BUILT_IN_NORMAL
>> +             || strstr (IDENTIFIER_POINTER (DECL_NAME (call_decl)),
>> +                         "__builtin_target") == NULL)
>> +            continue;
>> +
>> +         /* If the lhs is NULL there is no need to fold the call. */
>> +         lhs_retval = gimple_call_lhs(stmt);
>> +         if (lhs_retval == NULL)
>> +           continue;
>> +
>> +         /* Call the target hook to fold the builtin */
>> +          folded_val = targetm.fold_builtin(call_decl, 0, NULL, false);
>> +
>> +         /* If the target does not support the builtin then fold it to zero. */
>> +         if (folded_val == NULL_TREE)
>> +           folded_val = build_zero_cst (unsigned_type_node);
>> +
>> +         /* Type cast unsigned value to integer */
>> +         tmp_var = create_tmp_var (unsigned_type_node, NULL);
>> +         init_stmt = gimple_build_assign (tmp_var, folded_val);
>> +         ssa_var = make_ssa_name (tmp_var, init_stmt);
>> +         gimple_assign_set_lhs (init_stmt, ssa_var);
>> +         mark_symbols_for_renaming (init_stmt);
>> +
>> +         assign_stmt = gimple_build_assign_with_ops (NOP_EXPR, lhs_retval, ssa_var, 0);
>> +         mark_symbols_for_renaming(assign_stmt);
>> +
>> +         gsi_insert_after_without_update (&gsi, assign_stmt, GSI_SAME_STMT);
>> +         gsi_insert_after_without_update (&gsi, init_stmt, GSI_SAME_STMT);
>> +         /* Delete the original call. */
>> +         gsi_remove(&gsi, true);
>> +       }
>> +    }
>> +
>> +  return 0;
>> +}
>> +
>> +static bool
>> +gate_fold_builtin_target (void)
>> +{
>> +  return true;
>> +}
>> +
>> +/* Pass to fold __builtin_target_* functions */
>> +
>> +struct gimple_opt_pass pass_tree_fold_builtin_target =
>> +{
>> + {
>> +  GIMPLE_PASS,
>> +  "fold_builtin_target",               /* name */
>> +  gate_fold_builtin_target,            /* gate */
>> +  do_fold_builtin_target,              /* execute */
>> +  NULL,                                        /* sub */
>> +  NULL,                                        /* next */
>> +  0,                                   /* static_pass_number */
>> +  TV_FOLD_BUILTIN_TARGET,              /* tv_id */
>> +  PROP_cfg,                            /* properties_required */
>> +  PROP_cfg,                            /* properties_provided */
>> +  0,                                   /* properties_destroyed */
>> +  0,                                   /* todo_flags_start */
>> +  TODO_dump_func |                     /* todo_flags_finish */
>> +  TODO_cleanup_cfg |
>> +  TODO_update_ssa |
>> +  TODO_verify_ssa
>> + }
>> +};
>> +
>> +
>> Index: gcc/timevar.def
>> ===================================================================
>> --- gcc/timevar.def     (revision 177767)
>> +++ gcc/timevar.def     (working copy)
>> @@ -124,6 +124,7 @@ DEFTIMEVAR (TV_PARSE_INMETH          , "parser inl
>>  DEFTIMEVAR (TV_TEMPLATE_INST         , "template instantiation")
>>  DEFTIMEVAR (TV_INLINE_HEURISTICS     , "inline heuristics")
>>  DEFTIMEVAR (TV_MVERSN_DISPATCH       , "multiversion dispatch")
>> +DEFTIMEVAR (TV_FOLD_BUILTIN_TARGET   , "fold __builtin_target calls")
>>  DEFTIMEVAR (TV_INTEGRATION           , "integration")
>>  DEFTIMEVAR (TV_TREE_GIMPLIFY        , "tree gimplify")
>>  DEFTIMEVAR (TV_TREE_EH              , "tree eh")
>> Index: gcc/passes.c
>> ===================================================================
>> --- gcc/passes.c        (revision 177767)
>> +++ gcc/passes.c        (working copy)
>> @@ -1249,6 +1249,8 @@ init_optimization_passes (void)
>>     {
>>       struct opt_pass **p = &pass_ipa_multiversion_dispatch.pass.sub;
>>       NEXT_PASS (pass_tree_convert_builtin_dispatch);
>> +      /* Fold calls to __builtin_target_*. */
>> +      NEXT_PASS (pass_tree_fold_builtin_target);
>>       /* Rebuilding cgraph edges is necessary as the above passes change
>>          the call graph.  Otherwise, future optimizations use the old
>>         call graph and make wrong decisions sometimes.*/
>> Index: gcc/config/i386/i386.c
>> ===================================================================
>> --- gcc/config/i386/i386.c      (revision 177767)
>> +++ gcc/config/i386/i386.c      (working copy)
>> @@ -58,6 +58,8 @@ along with GCC; see the file COPYING3.  If not see
>>  #include "sched-int.h"
>>  #include "sbitmap.h"
>>  #include "fibheap.h"
>> +#include "tree-flow.h"
>> +#include "tree-pass.h"
>>
>>  enum upper_128bits_state
>>  {
>> @@ -7867,6 +7869,338 @@ ix86_build_builtin_va_list (void)
>>   return ret;
>>  }
>>
>> +/* Returns a struct type with name NAME and number of fields equal to
>> +   NUM_FIELDS.  Each field is an unsigned int bit field of length 1 bit. */
>> +
>> +static tree
>> +build_struct_with_one_bit_fields (int num_fields, const char *name)
>> +{
>> +  int i;
>> +  char field_name [10];
>> +  tree field = NULL_TREE, field_chain = NULL_TREE;
>> +  tree type = make_node (RECORD_TYPE);
>> +
>> +  strcpy (field_name, "k_field");
>> +
>> +  for (i = 0; i < num_fields; i++)
>> +    {
>> +      /* Name the fields, 0_field, 1_field, ... */
>> +      field_name [0] = '0' + i;
>> +      field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
>> +                         get_identifier (field_name), unsigned_type_node);
>> +      DECL_BIT_FIELD (field) = 1;
>> +      DECL_SIZE (field) = bitsize_one_node;
>> +      if (field_chain != NULL_TREE)
>> +       DECL_CHAIN (field) = field_chain;
>> +      field_chain = field;
>> +    }
>> +  finish_builtin_struct (type, name, field_chain, NULL_TREE);
>> +  return type;
>> +}
>> +
>> +/* Returns a VAR_DECL of type TYPE and name NAME. */
>> +
>> +static tree
>> +make_var_decl (tree type, const char *name)
>> +{
>> +  tree new_decl;
>> +  struct varpool_node *vnode;
>> +
>> +  new_decl = build_decl (UNKNOWN_LOCATION,
>> +                        VAR_DECL,
>> +                        get_identifier(name),
>> +                        type);
>> +
>> +  DECL_EXTERNAL (new_decl) = 1;
>> +  TREE_STATIC (new_decl) = 1;
>> +  TREE_PUBLIC (new_decl) = 1;
>> +  DECL_INITIAL (new_decl) = 0;
>> +  DECL_ARTIFICIAL (new_decl) = 0;
>> +  DECL_PRESERVE_P (new_decl) = 1;
>> +
>> +  make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
>> +  assemble_variable (new_decl, 0, 0, 0);
>> +
>> +  vnode = varpool_node (new_decl);
>> +  gcc_assert (vnode != NULL);
>> +  /* Set finalized to 1, otherwise it asserts in function "write_symbol" in
>> +     lto-streamer-out.c. */
>> +  vnode->finalized = 1;
>> +
>> +  return new_decl;
>> +}
>> +
>> +/* Traverses the chain of fields in STRUCT_TYPE and returns the FIELD_NUM
>> +   numbered field. */
>> +
>> +static tree
>> +get_field_from_struct (tree struct_type, int field_num)
>> +{
>> +  int i;
>> +  tree field = TYPE_FIELDS (struct_type);
>> +
>> +  for (i = 0; i < field_num; i++, field = DECL_CHAIN(field))
>> +    {
>> +      gcc_assert (field != NULL_TREE);
>> +    }
>> +
>> +  return field;
>> +}
>> +
>> +/* Create a new static constructor that calls __cpu_indicator_init ()
>> +   function defined in libgcc/config/i386-cpuinfo.c which runs cpuid
>> +   to figure out the type of the target. */
>> +
>> +static tree
>> +make_constructor_to_get_target_type (const char *name)
>> +{
>> +  tree decl, type, t;
>> +  gimple_seq seq;
>> +  basic_block new_bb;
>> +  tree old_current_function_decl;
>> +
>> +  tree __cpu_indicator_int_decl;
>> +  gimple constructor_body;
>> +
>> +
>> +  type = build_function_type_list (void_type_node, NULL_TREE);
>> +
>> +  /* Make a call stmt to __cpu_indicator_init */
>> +  __cpu_indicator_int_decl = build_fn_decl ("__cpu_indicator_init", type);
>> +  constructor_body = gimple_build_call (__cpu_indicator_int_decl, 0);
>> +  DECL_EXTERNAL (__cpu_indicator_int_decl) = 1;
>> +
>> +  decl = build_fn_decl (name, type);
>> +
>> +  DECL_NAME (decl) = get_identifier (name);
>> +  SET_DECL_ASSEMBLER_NAME (decl, DECL_NAME (decl));
>> +  gcc_assert (cgraph_node (decl) != NULL);
>> +
>> +  TREE_USED (decl) = 1;
>> +  DECL_ARTIFICIAL (decl) = 1;
>> +  DECL_IGNORED_P (decl) = 0;
>> +  TREE_PUBLIC (decl) = 0;
>> +  DECL_UNINLINABLE (decl) = 1;
>> +  DECL_EXTERNAL (decl) = 0;
>> +  DECL_CONTEXT (decl) = NULL_TREE;
>> +  DECL_INITIAL (decl) = make_node (BLOCK);
>> +  DECL_STATIC_CONSTRUCTOR (decl) = 1;
>> +  TREE_READONLY (decl) = 0;
>> +  DECL_PURE_P (decl) = 0;
>> +
>> +  /* This is a comdat. */
>> +  make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
>> +
>> +  /* Build result decl and add to function_decl. */
>> +  t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, void_type_node);
>> +  DECL_ARTIFICIAL (t) = 1;
>> +  DECL_IGNORED_P (t) = 1;
>> +  DECL_RESULT (decl) = t;
>> +
>> +  gimplify_function_tree (decl);
>> +
>> +  /* Build CFG for this function. */
>> +
>> +  old_current_function_decl = current_function_decl;
>> +  push_cfun (DECL_STRUCT_FUNCTION (decl));
>> +  current_function_decl = decl;
>> +  init_empty_tree_cfg_for_function (DECL_STRUCT_FUNCTION (decl));
>> +  cfun->curr_properties |=
>> +    (PROP_gimple_lcf | PROP_gimple_leh | PROP_cfg | PROP_referenced_vars |
>> +     PROP_ssa);
>> +  new_bb = create_empty_bb (ENTRY_BLOCK_PTR);
>> +  make_edge (ENTRY_BLOCK_PTR, new_bb, EDGE_FALLTHRU);
>> +
>> +  /* XXX: Not sure if the edge commented below is necessary.  If I add this
>> +     edge, it fails in gimple_verify_flow_info in tree-cfg.c in condition :
>> +     " if (e->flags & EDGE_FALLTHRU)"
>> +     during -fprofile-generate.
>> +     Otherwise, it is fine.  Deleting this edge does not break anything.
>> +     Commenting this so that it is clear I am intentionally not doing this.*/
>> +  /* make_edge (new_bb, EXIT_BLOCK_PTR, EDGE_FALLTHRU); */
>> +
>> +  seq = gimple_seq_alloc_with_stmt (constructor_body);
>> +
>> +  set_bb_seq (new_bb, seq);
>> +  gimple_set_bb (constructor_body, new_bb);
>> +
>> +  /* Set the lexical block of the constructor body. Fails the inliner
>> +     otherwise. */
>> +  gimple_set_block (constructor_body, DECL_INITIAL (decl));
>> +
>> +  /* This call is very important if this pass runs when the IR is in
>> +     SSA form.  It breaks things in strange ways otherwise. */
>> +  init_tree_ssa (DECL_STRUCT_FUNCTION (decl));
>> +  /* add_referenced_var (version_selector_var); */
>> +
>> +  cgraph_add_new_function (decl, true);
>> +  cgraph_call_function_insertion_hooks (cgraph_node (decl));
>> +  cgraph_mark_needed_node (cgraph_node (decl));
>> +
>> +  pop_cfun ();
>> +  current_function_decl = old_current_function_decl;
>> +  return decl;
>> +}
>> +
>> +/* FNDECL is a __builtin_target_* call that is folded into an integer defined
>> +   in libgcc/config/i386/i386-cpuinfo.c */
>> +
>> +static tree
>> +fold_builtin_target (tree fndecl)
>> +{
>> +  /* This is the order of bit-fields in __processor_features in
>> +     i386-cpuinfo.c */
>> +  enum processor_features
>> +  {
>> +    F_CMOV = 0,
>> +    F_MMX,
>> +    F_POPCNT,
>> +    F_SSE,
>> +    F_SSE2,
>> +    F_SSE3,
>> +    F_SSSE3,
>> +    F_SSE4_1,
>> +    F_SSE4_2,
>> +    F_MAX
>> +  };
>> +
>> +  /* This is the order of bit-fields in __processor_model in
>> +     i386-cpuinfo.c */
>> +  enum processor_model
>> +  {
>> +    M_AMD = 0,
>> +    M_INTEL,
>> +    M_COREI7_NEHALEM,
>> +    M_COREI7_WESTMERE,
>> +    M_COREI7_SANDYBRIDGE,
>> +    M_AMDFAM10_BARCELONA,
>> +    M_AMDFAM10_SHANGHAI,
>> +    M_AMDFAM10_ISTANBUL,
>> +    M_MAX
>> +  };
>> +
>> +  static tree __processor_features_type = NULL_TREE;
>> +  static tree __cpu_features_var = NULL_TREE;
>> +  static tree __processor_model_type = NULL_TREE;
>> +  static tree __cpu_model_var = NULL_TREE;
>> +  static tree ctor_decl = NULL_TREE;
>> +  static tree field;
>> +  static tree which_struct;
>> +
>> +  /* Make a call to __cpu_indicator_init in a constructor.
>> +     Function __cpu_indicator_init is defined in i386-cpuinfo.c. */
>> +  if (ctor_decl == NULL_TREE)
>> +   ctor_decl = make_constructor_to_get_target_type
>> +               ("__cpu_indicator_init_ctor");
>> +
>> +  if (__processor_features_type == NULL_TREE)
>> +    __processor_features_type = build_struct_with_one_bit_fields (F_MAX,
>> +                                 "__processor_features");
>> +
>> +  if (__processor_model_type == NULL_TREE)
>> +    __processor_model_type = build_struct_with_one_bit_fields (M_MAX,
>> +                                 "__processor_model");
>> +
>> +  if (__cpu_features_var == NULL_TREE)
>> +    __cpu_features_var = make_var_decl (__processor_features_type,
>> +                                       "__cpu_features");
>> +
>> +  if (__cpu_model_var == NULL_TREE)
>> +    __cpu_model_var = make_var_decl (__processor_model_type,
>> +                                    "__cpu_model");
>> +
>> +  /* Look at fndecl code to identify the field requested. */
>> +  switch (DECL_FUNCTION_CODE (fndecl))
>> +    {
>> +    case BUILT_IN_TARGET_SUPPORTS_CMOV:
>> +      field = get_field_from_struct (__processor_features_type, F_CMOV);
>> +      which_struct = __cpu_features_var;
>> +      break;
>> +    case BUILT_IN_TARGET_SUPPORTS_MMX:
>> +      field = get_field_from_struct (__processor_features_type, F_MMX);
>> +      which_struct = __cpu_features_var;
>> +      break;
>> +    case BUILT_IN_TARGET_SUPPORTS_POPCOUNT:
>> +      field = get_field_from_struct (__processor_features_type, F_POPCNT);
>> +      which_struct = __cpu_features_var;
>> +      break;
>> +    case BUILT_IN_TARGET_SUPPORTS_SSE:
>> +      field = get_field_from_struct (__processor_features_type, F_SSE);
>> +      which_struct = __cpu_features_var;
>> +      break;
>> +    case BUILT_IN_TARGET_SUPPORTS_SSE2:
>> +      field = get_field_from_struct (__processor_features_type, F_SSE2);
>> +      which_struct = __cpu_features_var;
>> +      break;
>> +    case BUILT_IN_TARGET_SUPPORTS_SSE3:
>> +      field = get_field_from_struct (__processor_features_type, F_SSE3);
>> +      which_struct = __cpu_features_var;
>> +      break;
>> +    case BUILT_IN_TARGET_SUPPORTS_SSSE3:
>> +      field = get_field_from_struct (__processor_features_type, F_SSE3);
>> +      which_struct = __cpu_features_var;
>> +      break;
>> +    case BUILT_IN_TARGET_SUPPORTS_SSE4_1:
>> +      field = get_field_from_struct (__processor_features_type, F_SSE4_1);
>> +      which_struct = __cpu_features_var;
>> +      break;
>> +    case BUILT_IN_TARGET_SUPPORTS_SSE4_2:
>> +      field = get_field_from_struct (__processor_features_type, F_SSE4_2);
>> +      which_struct = __cpu_features_var;
>> +      break;
>> +    case BUILT_IN_TARGET_IS_AMD:
>> +      field = get_field_from_struct (__processor_model_type, M_AMD);;
>> +      which_struct = __cpu_model_var;
>> +      break;
>> +    case BUILT_IN_TARGET_IS_INTEL:
>> +      field = get_field_from_struct (__processor_model_type, M_INTEL);;
>> +      which_struct = __cpu_model_var;
>> +      break;
>> +    case BUILT_IN_TARGET_IS_COREI7_NEHALEM:
>> +      field = get_field_from_struct (__processor_model_type, M_COREI7_NEHALEM);;
>> +      which_struct = __cpu_model_var;
>> +      break;
>> +    case BUILT_IN_TARGET_IS_COREI7_WESTMERE:
>> +      field = get_field_from_struct (__processor_model_type, M_COREI7_WESTMERE);;
>> +      which_struct = __cpu_model_var;
>> +      break;
>> +    case BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE:
>> +      field = get_field_from_struct (__processor_model_type, M_COREI7_SANDYBRIDGE);;
>> +      which_struct = __cpu_model_var;
>> +      break;
>> +    case BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA:
>> +      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_BARCELONA);;
>> +      which_struct = __cpu_model_var;
>> +      break;
>> +    case BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI:
>> +      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_SHANGHAI);;
>> +      which_struct = __cpu_model_var;
>> +      break;
>> +    case BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL:
>> +      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_ISTANBUL);;
>> +      which_struct = __cpu_model_var;
>> +      break;
>> +    default:
>> +      return NULL_TREE;
>> +    }
>> +
>> +  return build3 (COMPONENT_REF, TREE_TYPE (field), which_struct, field, NULL_TREE);
>> +}
>> +
>> +/* Folds __builtin_target_* builtins. */
>> +
>> +static tree
>> +ix86_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED,
>> +                   tree *args ATTRIBUTE_UNUSED, bool ignore ATTRIBUTE_UNUSED)
>> +{
>> +  const char *decl_name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
>> +  if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
>> +      && strstr(decl_name, "__builtin_target") != NULL)
>> +    return fold_builtin_target (fndecl);
>> +
>> +  return NULL_TREE;
>> +}
>> +
>>  /* Worker function for TARGET_SETUP_INCOMING_VARARGS.  */
>>
>>  static void
>> @@ -35097,6 +35431,9 @@ ix86_autovectorize_vector_sizes (void)
>>  #undef TARGET_BUILD_BUILTIN_VA_LIST
>>  #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
>>
>> +#undef TARGET_FOLD_BUILTIN
>> +#define TARGET_FOLD_BUILTIN ix86_fold_builtin
>> +
>>  #undef TARGET_ENUM_VA_LIST_P
>>  #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
>>
>>
>> --
>> This patch is available for review at http://codereview.appspot.com/4893046
>>
>
Hans-Peter Nilsson Aug. 17, 2011, 11:59 p.m. UTC | #8
On Tue, 16 Aug 2011, Sriraman Tallam wrote:

(I don't see anyone else making this comment, so maybe I missed
something obvious, but I don't think so...)

> Support for getting CPU type and feature information at run-time.
>
> The following patch provides support for finding the platform type at run-time, like cpu type and features supported. The multi-versioning framework will use the builtins added to dispatch the right function version. Please refer to http://gcc.gnu.org/ml/gcc/2011-08/msg00298.html for details on function multi-versioning usability.
>
> 	* tree-pass.h (pass_tree_fold_builtin_target): New pass.
> 	* builtins.def (BUILT_IN_TARGET_SUPPORTS_CMOV): New builtin.
> 	(BUILT_IN_TARGET_SUPPORTS_MMX): New builtin.
> 	(BUILT_IN_TARGET_SUPPORTS_POPCOUNT): New builtin.
> 	(BUILT_IN_TARGET_SUPPORTS_SSE): New builtin.
> 	(BUILT_IN_TARGET_SUPPORTS_SSE2): New builtin.
> 	(BUILT_IN_TARGET_SUPPORTS_SSE3): New builtin.
> 	(BUILT_IN_TARGET_SUPPORTS_SSSE3): New builtin.
> 	(BUILT_IN_TARGET_SUPPORTS_SSE4_1): New builtin.
> 	(BUILT_IN_TARGET_SUPPORTS_SSE4_2): New builtin.
> 	(BUILT_IN_TARGET_IS_AMD): New builtin.
> 	(BUILT_IN_TARGET_IS_INTEL): New builtin.
> 	(BUILT_IN_TARGET_IS_COREI7_NEHALEM): New builtin.
> 	(BUILT_IN_TARGET_IS_COREI7_WESTMERE): New builtin.
> 	(BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE): New builtin.
> 	(BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA): New builtin.
> 	(BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI): New builtin.
> 	(BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL): New builtin.
(cut)

Keep the port-specific bits in the port, please. I don't see why
this has to be in generic files as opposed to target hooks and
included target-specific file fragments like everything else
(well, most everything) in gcc.  If not, I think we'll see
cpu_ports*variants explosion here until these bits are
rewritten...

brgds, H-P
Sriraman Tallam Aug. 18, 2011, 12:27 a.m. UTC | #9
On Wed, Aug 17, 2011 at 4:59 PM, Hans-Peter Nilsson <hp@bitrange.com> wrote:
> On Tue, 16 Aug 2011, Sriraman Tallam wrote:
>
> (I don't see anyone else making this comment, so maybe I missed
> something obvious, but I don't think so...)
>
>> Support for getting CPU type and feature information at run-time.
>>
>> The following patch provides support for finding the platform type at run-time, like cpu type and features supported. The multi-versioning framework will use the builtins added to dispatch the right function version. Please refer to http://gcc.gnu.org/ml/gcc/2011-08/msg00298.html for details on function multi-versioning usability.
>>
>>       * tree-pass.h (pass_tree_fold_builtin_target): New pass.
>>       * builtins.def (BUILT_IN_TARGET_SUPPORTS_CMOV): New builtin.
>>       (BUILT_IN_TARGET_SUPPORTS_MMX): New builtin.
>>       (BUILT_IN_TARGET_SUPPORTS_POPCOUNT): New builtin.
>>       (BUILT_IN_TARGET_SUPPORTS_SSE): New builtin.
>>       (BUILT_IN_TARGET_SUPPORTS_SSE2): New builtin.
>>       (BUILT_IN_TARGET_SUPPORTS_SSE3): New builtin.
>>       (BUILT_IN_TARGET_SUPPORTS_SSSE3): New builtin.
>>       (BUILT_IN_TARGET_SUPPORTS_SSE4_1): New builtin.
>>       (BUILT_IN_TARGET_SUPPORTS_SSE4_2): New builtin.
>>       (BUILT_IN_TARGET_IS_AMD): New builtin.
>>       (BUILT_IN_TARGET_IS_INTEL): New builtin.
>>       (BUILT_IN_TARGET_IS_COREI7_NEHALEM): New builtin.
>>       (BUILT_IN_TARGET_IS_COREI7_WESTMERE): New builtin.
>>       (BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE): New builtin.
>>       (BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA): New builtin.
>>       (BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI): New builtin.
>>       (BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL): New builtin.
> (cut)
>
> Keep the port-specific bits in the port, please. I don't see why
> this has to be in generic files as opposed to target hooks and
> included target-specific file fragments like everything else
> (well, most everything) in gcc.  If not, I think we'll see
> cpu_ports*variants explosion here until these bits are
> rewritten...

Yes, this should move into the port. Sorry, I will change it.

Thanks,
-Sri.

>
> brgds, H-P
>
>
Richard Biener Aug. 18, 2011, 8:03 a.m. UTC | #10
On Wed, Aug 17, 2011 at 7:54 PM, Sriraman Tallam <tmsriram@google.com> wrote:
> On Wed, Aug 17, 2011 at 12:37 AM, Richard Guenther
> <richard.guenther@gmail.com> wrote:
>> On Tue, Aug 16, 2011 at 10:50 PM, Sriraman Tallam <tmsriram@google.com> wrote:
>>> Support for getting CPU type and feature information at run-time.
>>>
>>> The following patch provides support for finding the platform type at run-time, like cpu type and features supported. The multi-versioning framework will use the builtins added to dispatch the right function version. Please refer to http://gcc.gnu.org/ml/gcc/2011-08/msg00298.html for details on function multi-versioning usability.
>>
>> Please provide an overview why you need the new builtins,
>
> For multi-versioning,  the compiler can call the appropriate builtin
> to dispatch the right version. The builtin call will later get folded.
>
> For example,
>
> int  __attribute__ version ("sse4_1")
> compute ()
> {
> >   // Do sse4_1 specific implementation.
> }
>
> int
> compute ()
> {
>  // Generic implementation
> }
>
> The compiler will check if the target supports the attribute and then
> convert a call to compute ()  into  this:
>
> if (__builtin_target_supports_sse4_1 ())
>  compute_sse4_1 (); // Call to the SSE4_1 implementation
> else
>  compute_generic (); // Call to the generic implementation
>
> Further, having it as builtin function allows it to be overridden by
> the programmer. For instance, the programmer can override it to
> identify newer CPU types not yet supported. Having these builtins
> makes it convenient to identify platform type and features in general.
>
> why you need
>> a separate pass to fold them (instead of just expanding them) and why
>
> I can move it into builtins.c along with where other builtins are
> folded and remove the separate pass. My intention originally was to
> fold them as early as possible, in this case after multi-versioning
> but I guess this is not a requirement.

Yes, they should be folded by targetm.fold_builtin instead.  The Frontend
should simply fold the tests at the time it creates them, that's as early
as possible (gimplification will also re-fold all builtin function calls).

>> you are creating
>> vars behind the back of GCC:
>
> The flow I had in mind was to have functions in libgcc which will use
> CPUID to get target features and set global vars corresponding to the
> > features. So, the builtin should be folded into the appropriate
> variable in libgcc.

Hm, but then the variable should reside in libgcc and you'd only need
an extern variant in the varpool.  I'm not sure separate constructors
(possibly in each module ...) would be better than a single one in
libgcc that would get run unconditionally.

>>
>> +  /* Set finalized to 1, otherwise it asserts in function "write_symbol" in
>> +     lto-streamer-out.c. */
>> +  vnode->finalized = 1;
>>
>> where I think you miss a varpool_finalize_node call somewhere.  Why
>> isn't this all done at target init time
>
> I wanted to do this on demand. If none of the new builtins are called
> in the program, I do not need to do this at all. In summary, libgcc
> has a function called __cpu_indicator_init which does the work of
> determining target features and setting the appropriate globals. If
> the new builtins are called, gcc will call __cpu_indicator_init in a
> constructor so that it is called exactly once. Then, gcc will fold the
> builtin to the appropriate global variable.

I see, but this sounds like premature optimization to me, no?  Considering
you'd do this in each module and our inability to merge those constructors
at link time.  If we put __cpu_indicator, the constructor and the assorted
support into a separate module inside libgcc.a could we arrange it in a way
that if __cpu_indicator is not referenced from the program that piece isn't
linked in?  (not sure if that is possible with constructors)

Richard.

>
> ?  If you don't mark the
>> variable as to be preserved
>> like you do cgraph will optimize it all away if it isn't needed.
>
>>
>> Richard.
>>
>>>        * tree-pass.h (pass_tree_fold_builtin_target): New pass.
>>>        * builtins.def (BUILT_IN_TARGET_SUPPORTS_CMOV): New builtin.
>>>        (BUILT_IN_TARGET_SUPPORTS_MMX): New builtin.
>>>        (BUILT_IN_TARGET_SUPPORTS_POPCOUNT): New builtin.
>>>        (BUILT_IN_TARGET_SUPPORTS_SSE): New builtin.
>>>        (BUILT_IN_TARGET_SUPPORTS_SSE2): New builtin.
>>>        (BUILT_IN_TARGET_SUPPORTS_SSE3): New builtin.
>>>        (BUILT_IN_TARGET_SUPPORTS_SSSE3): New builtin.
>>>        (BUILT_IN_TARGET_SUPPORTS_SSE4_1): New builtin.
>>>        (BUILT_IN_TARGET_SUPPORTS_SSE4_2): New builtin.
>>>        (BUILT_IN_TARGET_IS_AMD): New builtin.
>>>        (BUILT_IN_TARGET_IS_INTEL): New builtin.
>>>        (BUILT_IN_TARGET_IS_COREI7_NEHALEM): New builtin.
>>>        (BUILT_IN_TARGET_IS_COREI7_WESTMERE): New builtin.
>>>        (BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE): New builtin.
>>>        (BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA): New builtin.
>>>        (BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI): New builtin.
>>>        (BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL): New builtin.
>>>        * mversn-dispatch.c (do_fold_builtin_target): New function.
>>>        (gate_fold_builtin_target): New function.
>>>        (pass_tree_fold_builtin_target): New pass.
>>>        * timevar.def (TV_FOLD_BUILTIN_TARGET): New var.
>>>        * passes.c (init_optimization_passes): Add new pass to pass list.
>>>        * config/i386/i386.c (build_struct_with_one_bit_fields): New function.
>>>        (make_var_decl): New function.
>>>        (get_field_from_struct): New function.
>>>        (make_constructor_to_get_target_type): New function.
>>>        (fold_builtin_target): New function.
>>>        (ix86_fold_builtin): New function.
>>>        (TARGET_FOLD_BUILTIN): New macro.
>>>
>>>        * gcc.dg/builtin_target.c: New test.
>>>
>>>        * config/i386/i386-cpuinfo.c: New file.
>>>        * config/i386/t-cpuinfo: New file.
>>>        * config.host: Add t-cpuinfo to link i386-cpuinfo.o with libgcc
>>>
>>> Index: libgcc/config.host
>>> ===================================================================
>>> --- libgcc/config.host  (revision 177767)
>>> +++ libgcc/config.host  (working copy)
>>> @@ -609,7 +609,7 @@ case ${host} in
>>>  i[34567]86-*-linux* | x86_64-*-linux* | \
>>>   i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | \
>>>   i[34567]86-*-gnu*)
>>> -       tmake_file="${tmake_file} t-tls"
>>> +       tmake_file="${tmake_file} t-tls i386/t-cpuinfo"
>>>        if test "$libgcc_cv_cfi" = "yes"; then
>>>                tmake_file="${tmake_file} t-stack i386/t-stack-i386"
>>>        fi
>>> Index: libgcc/config/i386/t-cpuinfo
>>> ===================================================================
>>> --- libgcc/config/i386/t-cpuinfo        (revision 0)
>>> +++ libgcc/config/i386/t-cpuinfo        (revision 0)
>>> @@ -0,0 +1,2 @@
>>> +# This is an endfile
>>> +LIB2ADD += $(srcdir)/config/i386/i386-cpuinfo.c
>>> Index: libgcc/config/i386/i386-cpuinfo.c
>>> ===================================================================
>>> --- libgcc/config/i386/i386-cpuinfo.c   (revision 0)
>>> +++ libgcc/config/i386/i386-cpuinfo.c   (revision 0)
>>> @@ -0,0 +1,275 @@
>>> +/* Copyright (C) 2011 Free Software Foundation, Inc.
>>> + * Contributed by Sriraman Tallam <tmsriram@google.com>.
>>> + *
>>> + * This file is free software; you can redistribute it and/or modify it
>>> + * under the terms of the GNU General Public License as published by the
>>> + * Free Software Foundation; either version 3, or (at your option) any
>>> + * later version.
>>> + *
>>> + * This file is distributed in the hope that it will be useful, but
>>> + * WITHOUT ANY WARRANTY; without even the implied warranty of
>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>> + * General Public License for more details.
>>> + *
>>> + * Under Section 7 of GPL version 3, you are granted additional
>>> + * permissions described in the GCC Runtime Library Exception, version
>>> + * 3.1, as published by the Free Software Foundation.
>>> + *
>>> + * You should have received a copy of the GNU General Public License and
>>> + * a copy of the GCC Runtime Library Exception along with this program;
>>> + * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>>> + * <http://www.gnu.org/licenses/>.
>>> + *
>>> + *
>>> + * This code is adapted from gcc/config/i386/driver-i386.c. The CPUID
>>> + * instruction is used to figure out the cpu type and supported features.
>>> + * GCC runs __cpu_indicator_init from a constructor which sets the members
>>> + * of __cpu_model and __cpu_features.
>>> + */
>>> +
>>> +#include <string.h>
>>> +
>>> +#ifdef __GNUC__
>>> +#include "cpuid.h"
>>> +
>>> +enum processor_type
>>> +{
>>> +  PROCESSOR_PENTIUM = 0,
>>> +  PROCESSOR_CORE2,
>>> +  PROCESSOR_COREI7_NEHALEM,
>>> +  PROCESSOR_COREI7_WESTMERE,
>>> +  PROCESSOR_COREI7_SANDYBRIDGE,
>>> +  PROCESSOR_INTEL_GENERIC,
>>> +  PROCESSOR_AMDFAM10_BARCELONA,
>>> +  PROCESSOR_AMDFAM10_SHANGHAI,
>>> +  PROCESSOR_AMDFAM10_ISTANBUL,
>>> +  PROCESSOR_AMDFAM10_GENERIC,
>>> +  PROCESSOR_AMD_GENERIC,
>>> +  PROCESSOR_GENERIC,
>>> +  PROCESSOR_max
>>> +};
>>> +
>>> +enum vendor_signatures
>>> +{
>>> +  SIG_INTEL =  0x756e6547 /* Genu */,
>>> +  SIG_AMD =    0x68747541 /* Auth */
>>> +};
>>> +
>>> +
>>> +/* Features supported. */
>>> +
>>> +struct __processor_features
>>> +{
>>> +  unsigned int __cpu_cmov : 1;
>>> +  unsigned int __cpu_mmx : 1;
>>> +  unsigned int __cpu_popcnt : 1;
>>> +  unsigned int __cpu_sse : 1;
>>> +  unsigned int __cpu_sse2 : 1;
>>> +  unsigned int __cpu_sse3 : 1;
>>> +  unsigned int __cpu_ssse3 : 1;
>>> +  unsigned int __cpu_sse4_1 : 1;
>>> +  unsigned int __cpu_sse4_2 : 1;
>>> +};
>>> +
>>> +/* Flags exported. */
>>> +
>>> +struct __processor_model
>>> +{
>>> +  unsigned int __cpu_is_amd : 1;
>>> +  unsigned int __cpu_is_intel : 1;
>>> +  unsigned int __cpu_is_corei7_nehalem : 1;
>>> +  unsigned int __cpu_is_corei7_westmere : 1;
>>> +  unsigned int __cpu_is_corei7_sandybridge : 1;
>>> +  unsigned int __cpu_is_amdfam10_barcelona : 1;
>>> +  unsigned int __cpu_is_amdfam10_shanghai : 1;
>>> +  unsigned int __cpu_is_amdfam10_istanbul : 1;
>>> +};
>>> +
>>> +enum processor_type __cpu_type = PROCESSOR_GENERIC;
>>> +struct __processor_features __cpu_features;
>>> +struct __processor_model __cpu_model;
>>> +
>>> +static void
>>> +get_amd_cpu (unsigned int family, unsigned int model)
>>> +{
>>> +  switch (family)
>>> +    {
>>> +    case 0x10:
>>> +      switch (model)
>>> +       {
>>> +       case 0x2:
>>> +         __cpu_type = PROCESSOR_AMDFAM10_BARCELONA;
>>> +         __cpu_model.__cpu_is_amdfam10_barcelona = 1;
>>> +         break;
>>> +       case 0x4:
>>> +         __cpu_type = PROCESSOR_AMDFAM10_SHANGHAI;
>>> +         __cpu_model.__cpu_is_amdfam10_shanghai = 1;
>>> +         break;
>>> +       case 0x8:
>>> +         __cpu_type = PROCESSOR_AMDFAM10_ISTANBUL;
>>> +         __cpu_model.__cpu_is_amdfam10_istanbul = 1;
>>> +         break;
>>> +       default:
>>> +         __cpu_type = PROCESSOR_AMDFAM10_GENERIC;
>>> +         break;
>>> +       }
>>> +      break;
>>> +    default:
>>> +      __cpu_type = PROCESSOR_AMD_GENERIC;
>>> +    }
>>> +}
>>> +
>>> +static void
>>> +get_intel_cpu (unsigned int family, unsigned int model, unsigned int brand_id)
>>> +{
>>> +  /* Parse family and model only if brand ID is 0. */
>>> +  if (brand_id == 0)
>>> +    {
>>> +      switch (family)
>>> +       {
>>> +       case 0x5:
>>> +         __cpu_type = PROCESSOR_PENTIUM;
>>> +         break;
>>> +       case 0x6:
>>> +         switch (model)
>>> +           {
>>> +           case 0x1a:
>>> +           case 0x1e:
>>> +           case 0x1f:
>>> +           case 0x2e:
>>> +             /* Nehalem.  */
>>> +             __cpu_type = PROCESSOR_COREI7_NEHALEM;
>>> +             __cpu_model.__cpu_is_corei7_nehalem = 1;
>>> +             break;
>>> +           case 0x25:
>>> +           case 0x2c:
>>> +           case 0x2f:
>>> +             /* Westmere.  */
>>> +             __cpu_type = PROCESSOR_COREI7_WESTMERE;
>>> +             __cpu_model.__cpu_is_corei7_westmere = 1;
>>> +             break;
>>> +           case 0x2a:
>>> +             /* Sandy Bridge.  */
>>> +             __cpu_type = PROCESSOR_COREI7_SANDYBRIDGE;
>>> +             __cpu_model.__cpu_is_corei7_sandybridge = 1;
>>> +             break;
>>> +           case 0x17:
>>> +           case 0x1d:
>>> +             /* Penryn.  */
>>> +           case 0x0f:
>>> +             /* Merom.  */
>>> +             __cpu_type = PROCESSOR_CORE2;
>>> +             break;
>>> +           default:
>>> +             __cpu_type = PROCESSOR_INTEL_GENERIC;
>>> +             break;
>>> +           }
>>> +         break;
>>> +       default:
>>> +         /* We have no idea.  */
>>> +         __cpu_type = PROCESSOR_INTEL_GENERIC;
>>> +         break;
>>> +       }
>>> +    }
>>> +}
>>> +
>>> +static void
>>> +get_available_features (unsigned int ecx, unsigned int edx)
>>> +{
>>> +  __cpu_features.__cpu_cmov = (edx & bit_CMOV) ? 1 : 0;
>>> +  __cpu_features.__cpu_mmx = (edx & bit_MMX) ? 1 : 0;
>>> +  __cpu_features.__cpu_sse = (edx & bit_SSE) ? 1 : 0;
>>> +  __cpu_features.__cpu_sse2 = (edx & bit_SSE2) ? 1 : 0;
>>> +  __cpu_features.__cpu_popcnt = (ecx & bit_POPCNT) ? 1 : 0;
>>> +  __cpu_features.__cpu_sse3 = (ecx & bit_SSE3) ? 1 : 0;
>>> +  __cpu_features.__cpu_ssse3 = (ecx & bit_SSSE3) ? 1 : 0;
>>> +  __cpu_features.__cpu_sse4_1 = (ecx & bit_SSE4_1) ? 1 : 0;
>>> +  __cpu_features.__cpu_sse4_2 = (ecx & bit_SSE4_2) ? 1 : 0;
>>> +}
>>> +
>>> +/* A noinline function calling __get_cpuid. Having many calls to
>>> +   cpuid in one function in 32-bit mode causes GCC to complain:
>>> +   "can’t find a register in class ‘CLOBBERED_REGS’".  This is
>>> +   related to PR rtl-optimization 44174. */
>>> +
>>> +static int __attribute__ ((noinline))
>>> +__get_cpuid_output (unsigned int __level,
>>> +                   unsigned int *__eax, unsigned int *__ebx,
>>> +                   unsigned int *__ecx, unsigned int *__edx)
>>> +{
>>> +  return __get_cpuid (__level, __eax, __ebx, __ecx, __edx);
>>> +}
>>> +
>>> +/* This function will be linked in to binaries that need to look up
>>> +   CPU information.  */
>>> +
>>> +void
>>> +__cpu_indicator_init(void)
>>> +{
>>> +  unsigned int eax, ebx, ecx, edx;
>>> +
>>> +  int max_level = 5;
>>> +  unsigned int vendor;
>>> +  unsigned int model, family, brand_id;
>>> +
>>> +  memset (&__cpu_features, 0, sizeof (struct __processor_features));
>>> +  memset (&__cpu_model, 0, sizeof (struct __processor_model));
>>> +
>>> +  /* Assume cpuid insn present. Run in level 0 to get vendor id. */
>>> +  if (!__get_cpuid_output (0, &eax, &ebx, &ecx, &edx))
>>> +    return;
>>> +
>>> +  vendor = ebx;
>>> +  max_level = eax;
>>> +
>>> +  if (max_level < 1)
>>> +    return;
>>> +
>>> +  if (!__get_cpuid_output (1, &eax, &ebx, &ecx, &edx))
>>> +    return;
>>> +
>>> +  model = (eax >> 4) & 0x0f;
>>> +  family = (eax >> 8) & 0x0f;
>>> +  brand_id = ebx & 0xff;
>>> +
>>> +  /* Adjust model and family for Intel CPUS. */
>>> +  if (vendor == SIG_INTEL)
>>> +    {
>>> +      unsigned int extended_model, extended_family;
>>> +
>>> +      extended_model = (eax >> 12) & 0xf0;
>>> +      extended_family = (eax >> 20) & 0xff;
>>> +      if (family == 0x0f)
>>> +       {
>>> +         family += extended_family;
>>> +         model += extended_model;
>>> +       }
>>> +      else if (family == 0x06)
>>> +       model += extended_model;
>>> +    }
>>> +
>>> +  /* Find CPU model. */
>>> +
>>> +  if (vendor == SIG_AMD)
>>> +    {
>>> +      __cpu_model.__cpu_is_amd = 1;
>>> +      get_amd_cpu (family, model);
>>> +    }
>>> +  else if (vendor == SIG_INTEL)
>>> +    {
>>> +      __cpu_model.__cpu_is_intel = 1;
>>> +      get_intel_cpu (family, model, brand_id);
>>> +    }
>>> +
>>> +  /* Find available features. */
>>> +  get_available_features (ecx, edx);
>>> +}
>>> +
>>> +#else
>>> +
>>> +void
>>> +__cpu_indicator_init(void)
>>> +{
>>> +}
>>> +
>>> +#endif /* __GNUC__ */
>>> Index: gcc/tree-pass.h
>>> ===================================================================
>>> --- gcc/tree-pass.h     (revision 177767)
>>> +++ gcc/tree-pass.h     (working copy)
>>> @@ -449,6 +449,7 @@ extern struct gimple_opt_pass pass_split_functions
>>>  extern struct gimple_opt_pass pass_feedback_split_functions;
>>>  extern struct gimple_opt_pass pass_threadsafe_analyze;
>>>  extern struct gimple_opt_pass pass_tree_convert_builtin_dispatch;
>>> +extern struct gimple_opt_pass pass_tree_fold_builtin_target;
>>>
>>>  /* IPA Passes */
>>>  extern struct simple_ipa_opt_pass pass_ipa_lower_emutls;
>>> Index: gcc/testsuite/gcc.dg/builtin_target.c
>>> ===================================================================
>>> --- gcc/testsuite/gcc.dg/builtin_target.c       (revision 0)
>>> +++ gcc/testsuite/gcc.dg/builtin_target.c       (revision 0)
>>> @@ -0,0 +1,49 @@
>>> +/* This test checks if the __builtin_target_* calls are recognized. */
>>> +
>>> +/* { dg-do run } */
>>> +
>>> +int
>>> +fn1 ()
>>> +{
>>> +  if (__builtin_target_supports_cmov () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_supports_mmx () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_supports_popcount () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_supports_sse () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_supports_sse2 () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_supports_sse3 () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_supports_ssse3 () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_supports_sse4_1 () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_supports_sse4_2 () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_is_amd () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_is_intel () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_is_corei7_nehalem () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_is_corei7_westmere () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_is_corei7_sandybridge () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_is_amdfam10_barcelona () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_is_amdfam10_shanghai () < 0)
>>> +    return -1;
>>> +  if (__builtin_target_is_amdfam10_istanbul () < 0)
>>> +    return -1;
>>> +
>>> +  return 0;
>>> +}
>>> +
>>> +int main ()
>>> +{
>>> +  return fn1 ();
>>> +}
>>> Index: gcc/builtins.def
>>> ===================================================================
>>> --- gcc/builtins.def    (revision 177767)
>>> +++ gcc/builtins.def    (working copy)
>>> @@ -763,6 +763,25 @@ DEF_BUILTIN (BUILT_IN_EMUTLS_REGISTER_COMMON,
>>>  /* Multiversioning builtin dispatch hook. */
>>>  DEF_GCC_BUILTIN (BUILT_IN_DISPATCH, "dispatch", BT_FN_INT_PTR_FN_INT_PTR_PTR_VAR, ATTR_NULL)
>>>
>>> +/* Builtins to determine target type and features at run-time. */
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_CMOV, "target_supports_cmov", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_MMX, "target_supports_mmx", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_POPCOUNT, "target_supports_popcount", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE, "target_supports_sse", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE2, "target_supports_sse2", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE3, "target_supports_sse3", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSSE3, "target_supports_ssse3", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE4_1, "target_supports_sse4_1", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE4_2, "target_supports_sse4_2", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMD, "target_is_amd", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_INTEL, "target_is_intel", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_NEHALEM, "target_is_corei7_nehalem", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_WESTMERE, "target_is_corei7_westmere", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE, "target_is_corei7_sandybridge", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA, "target_is_amdfam10_barcelona", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI, "target_is_amdfam10_shanghai", BT_FN_INT, ATTR_NULL)
>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL, "target_is_amdfam10_istanbul", BT_FN_INT, ATTR_NULL)
>>> +
>>>  /* Exception support.  */
>>>  DEF_BUILTIN_STUB (BUILT_IN_UNWIND_RESUME, "__builtin_unwind_resume")
>>>  DEF_BUILTIN_STUB (BUILT_IN_CXA_END_CLEANUP, "__builtin_cxa_end_cleanup")
>>> Index: gcc/mversn-dispatch.c
>>> ===================================================================
>>> --- gcc/mversn-dispatch.c       (revision 177767)
>>> +++ gcc/mversn-dispatch.c       (working copy)
>>> @@ -135,6 +135,7 @@ along with GCC; see the file COPYING3.  If not see
>>>  #include "output.h"
>>>  #include "vecprim.h"
>>>  #include "gimple-pretty-print.h"
>>> +#include "target.h"
>>>
>>>  typedef struct cgraph_node* NODEPTR;
>>>  DEF_VEC_P (NODEPTR);
>>> @@ -1764,3 +1765,103 @@ struct gimple_opt_pass pass_tree_convert_builtin_d
>>>   TODO_update_ssa | TODO_verify_ssa
>>>  }
>>>  };
>>> +
>>> +/* Fold calls to __builtin_target_* */
>>> +
>>> +static unsigned int
>>> +do_fold_builtin_target (void)
>>> +{
>>> +  basic_block bb;
>>> +  gimple_stmt_iterator gsi;
>>> +
>>> +  /* Go through each stmt looking for __builtin_target_* calls */
>>> +  FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (current_function_decl))
>>> +    {
>>> +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>>> +        {
>>> +         gimple stmt = gsi_stmt (gsi);
>>> +         gimple assign_stmt;
>>> +          tree call_decl;
>>> +         tree lhs_retval;
>>> +         tree folded_val;
>>> +
>>> +         tree ssa_var, tmp_var;
>>> +         gimple init_stmt;
>>> +
>>> +          if (!is_gimple_call (stmt))
>>> +            continue;
>>> +
>>> +          call_decl = gimple_call_fndecl (stmt);
>>> +
>>> +         /* Check if it is a __builtin_target_* call. */
>>> +
>>> +         if (call_decl == NULL
>>> +             || DECL_NAME (call_decl) == NULL_TREE
>>> +             || DECL_BUILT_IN_CLASS (call_decl) != BUILT_IN_NORMAL
>>> +             || strstr (IDENTIFIER_POINTER (DECL_NAME (call_decl)),
>>> +                         "__builtin_target") == NULL)
>>> +            continue;
>>> +
>>> +         /* If the lhs is NULL there is no need to fold the call. */
>>> +         lhs_retval = gimple_call_lhs(stmt);
>>> +         if (lhs_retval == NULL)
>>> +           continue;
>>> +
>>> +         /* Call the target hook to fold the builtin */
>>> +          folded_val = targetm.fold_builtin(call_decl, 0, NULL, false);
>>> +
>>> +         /* If the target does not support the builtin then fold it to zero. */
>>> +         if (folded_val == NULL_TREE)
>>> +           folded_val = build_zero_cst (unsigned_type_node);
>>> +
>>> +         /* Type cast unsigned value to integer */
>>> +         tmp_var = create_tmp_var (unsigned_type_node, NULL);
>>> +         init_stmt = gimple_build_assign (tmp_var, folded_val);
>>> +         ssa_var = make_ssa_name (tmp_var, init_stmt);
>>> +         gimple_assign_set_lhs (init_stmt, ssa_var);
>>> +         mark_symbols_for_renaming (init_stmt);
>>> +
>>> +         assign_stmt = gimple_build_assign_with_ops (NOP_EXPR, lhs_retval, ssa_var, 0);
>>> +         mark_symbols_for_renaming(assign_stmt);
>>> +
>>> +         gsi_insert_after_without_update (&gsi, assign_stmt, GSI_SAME_STMT);
>>> +         gsi_insert_after_without_update (&gsi, init_stmt, GSI_SAME_STMT);
>>> +         /* Delete the original call. */
>>> +         gsi_remove(&gsi, true);
>>> +       }
>>> +    }
>>> +
>>> +  return 0;
>>> +}
>>> +
>>> +static bool
>>> +gate_fold_builtin_target (void)
>>> +{
>>> +  return true;
>>> +}
>>> +
>>> +/* Pass to fold __builtin_target_* functions */
>>> +
>>> +struct gimple_opt_pass pass_tree_fold_builtin_target =
>>> +{
>>> + {
>>> +  GIMPLE_PASS,
>>> +  "fold_builtin_target",               /* name */
>>> +  gate_fold_builtin_target,            /* gate */
>>> +  do_fold_builtin_target,              /* execute */
>>> +  NULL,                                        /* sub */
>>> +  NULL,                                        /* next */
>>> +  0,                                   /* static_pass_number */
>>> +  TV_FOLD_BUILTIN_TARGET,              /* tv_id */
>>> +  PROP_cfg,                            /* properties_required */
>>> +  PROP_cfg,                            /* properties_provided */
>>> +  0,                                   /* properties_destroyed */
>>> +  0,                                   /* todo_flags_start */
>>> +  TODO_dump_func |                     /* todo_flags_finish */
>>> +  TODO_cleanup_cfg |
>>> +  TODO_update_ssa |
>>> +  TODO_verify_ssa
>>> + }
>>> +};
>>> +
>>> +
>>> Index: gcc/timevar.def
>>> ===================================================================
>>> --- gcc/timevar.def     (revision 177767)
>>> +++ gcc/timevar.def     (working copy)
>>> @@ -124,6 +124,7 @@ DEFTIMEVAR (TV_PARSE_INMETH          , "parser inl
>>>  DEFTIMEVAR (TV_TEMPLATE_INST         , "template instantiation")
>>>  DEFTIMEVAR (TV_INLINE_HEURISTICS     , "inline heuristics")
>>>  DEFTIMEVAR (TV_MVERSN_DISPATCH       , "multiversion dispatch")
>>> +DEFTIMEVAR (TV_FOLD_BUILTIN_TARGET   , "fold __builtin_target calls")
>>>  DEFTIMEVAR (TV_INTEGRATION           , "integration")
>>>  DEFTIMEVAR (TV_TREE_GIMPLIFY        , "tree gimplify")
>>>  DEFTIMEVAR (TV_TREE_EH              , "tree eh")
>>> Index: gcc/passes.c
>>> ===================================================================
>>> --- gcc/passes.c        (revision 177767)
>>> +++ gcc/passes.c        (working copy)
>>> @@ -1249,6 +1249,8 @@ init_optimization_passes (void)
>>>     {
>>>       struct opt_pass **p = &pass_ipa_multiversion_dispatch.pass.sub;
>>>       NEXT_PASS (pass_tree_convert_builtin_dispatch);
>>> +      /* Fold calls to __builtin_target_*. */
>>> +      NEXT_PASS (pass_tree_fold_builtin_target);
>>>       /* Rebuilding cgraph edges is necessary as the above passes change
>>>          the call graph.  Otherwise, future optimizations use the old
>>>         call graph and make wrong decisions sometimes.*/
>>> Index: gcc/config/i386/i386.c
>>> ===================================================================
>>> --- gcc/config/i386/i386.c      (revision 177767)
>>> +++ gcc/config/i386/i386.c      (working copy)
>>> @@ -58,6 +58,8 @@ along with GCC; see the file COPYING3.  If not see
>>>  #include "sched-int.h"
>>>  #include "sbitmap.h"
>>>  #include "fibheap.h"
>>> +#include "tree-flow.h"
>>> +#include "tree-pass.h"
>>>
>>>  enum upper_128bits_state
>>>  {
>>> @@ -7867,6 +7869,338 @@ ix86_build_builtin_va_list (void)
>>>   return ret;
>>>  }
>>>
>>> +/* Returns a struct type with name NAME and number of fields equal to
>>> +   NUM_FIELDS.  Each field is an unsigned int bit field of length 1 bit. */
>>> +
>>> +static tree
>>> +build_struct_with_one_bit_fields (int num_fields, const char *name)
>>> +{
>>> +  int i;
>>> +  char field_name [10];
>>> +  tree field = NULL_TREE, field_chain = NULL_TREE;
>>> +  tree type = make_node (RECORD_TYPE);
>>> +
>>> +  strcpy (field_name, "k_field");
>>> +
>>> +  for (i = 0; i < num_fields; i++)
>>> +    {
>>> +      /* Name the fields, 0_field, 1_field, ... */
>>> +      field_name [0] = '0' + i;
>>> +      field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
>>> +                         get_identifier (field_name), unsigned_type_node);
>>> +      DECL_BIT_FIELD (field) = 1;
>>> +      DECL_SIZE (field) = bitsize_one_node;
>>> +      if (field_chain != NULL_TREE)
>>> +       DECL_CHAIN (field) = field_chain;
>>> +      field_chain = field;
>>> +    }
>>> +  finish_builtin_struct (type, name, field_chain, NULL_TREE);
>>> +  return type;
>>> +}
>>> +
>>> +/* Returns a VAR_DECL of type TYPE and name NAME. */
>>> +
>>> +static tree
>>> +make_var_decl (tree type, const char *name)
>>> +{
>>> +  tree new_decl;
>>> +  struct varpool_node *vnode;
>>> +
>>> +  new_decl = build_decl (UNKNOWN_LOCATION,
>>> +                        VAR_DECL,
>>> +                        get_identifier(name),
>>> +                        type);
>>> +
>>> +  DECL_EXTERNAL (new_decl) = 1;
>>> +  TREE_STATIC (new_decl) = 1;
>>> +  TREE_PUBLIC (new_decl) = 1;
>>> +  DECL_INITIAL (new_decl) = 0;
>>> +  DECL_ARTIFICIAL (new_decl) = 0;
>>> +  DECL_PRESERVE_P (new_decl) = 1;
>>> +
>>> +  make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
>>> +  assemble_variable (new_decl, 0, 0, 0);
>>> +
>>> +  vnode = varpool_node (new_decl);
>>> +  gcc_assert (vnode != NULL);
>>> +  /* Set finalized to 1, otherwise it asserts in function "write_symbol" in
>>> +     lto-streamer-out.c. */
>>> +  vnode->finalized = 1;
>>> +
>>> +  return new_decl;
>>> +}
>>> +
>>> +/* Traverses the chain of fields in STRUCT_TYPE and returns the FIELD_NUM
>>> +   numbered field. */
>>> +
>>> +static tree
>>> +get_field_from_struct (tree struct_type, int field_num)
>>> +{
>>> +  int i;
>>> +  tree field = TYPE_FIELDS (struct_type);
>>> +
>>> +  for (i = 0; i < field_num; i++, field = DECL_CHAIN(field))
>>> +    {
>>> +      gcc_assert (field != NULL_TREE);
>>> +    }
>>> +
>>> +  return field;
>>> +}
>>> +
>>> +/* Create a new static constructor that calls __cpu_indicator_init ()
>>> +   function defined in libgcc/config/i386/i386-cpuinfo.c which runs cpuid
>>> +   to figure out the type of the target. */
>>> +
>>> +static tree
>>> +make_constructor_to_get_target_type (const char *name)
>>> +{
>>> +  tree decl, type, t;
>>> +  gimple_seq seq;
>>> +  basic_block new_bb;
>>> +  tree old_current_function_decl;
>>> +
>>> +  tree __cpu_indicator_int_decl;
>>> +  gimple constructor_body;
>>> +
>>> +
>>> +  type = build_function_type_list (void_type_node, NULL_TREE);
>>> +
>>> +  /* Make a call stmt to __cpu_indicator_init */
>>> +  __cpu_indicator_int_decl = build_fn_decl ("__cpu_indicator_init", type);
>>> +  constructor_body = gimple_build_call (__cpu_indicator_int_decl, 0);
>>> +  DECL_EXTERNAL (__cpu_indicator_int_decl) = 1;
>>> +
>>> +  decl = build_fn_decl (name, type);
>>> +
>>> +  DECL_NAME (decl) = get_identifier (name);
>>> +  SET_DECL_ASSEMBLER_NAME (decl, DECL_NAME (decl));
>>> +  gcc_assert (cgraph_node (decl) != NULL);
>>> +
>>> +  TREE_USED (decl) = 1;
>>> +  DECL_ARTIFICIAL (decl) = 1;
>>> +  DECL_IGNORED_P (decl) = 0;
>>> +  TREE_PUBLIC (decl) = 0;
>>> +  DECL_UNINLINABLE (decl) = 1;
>>> +  DECL_EXTERNAL (decl) = 0;
>>> +  DECL_CONTEXT (decl) = NULL_TREE;
>>> +  DECL_INITIAL (decl) = make_node (BLOCK);
>>> +  DECL_STATIC_CONSTRUCTOR (decl) = 1;
>>> +  TREE_READONLY (decl) = 0;
>>> +  DECL_PURE_P (decl) = 0;
>>> +
>>> +  /* This is a comdat. */
>>> +  make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
>>> +
>>> +  /* Build result decl and add to function_decl. */
>>> +  t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, void_type_node);
>>> +  DECL_ARTIFICIAL (t) = 1;
>>> +  DECL_IGNORED_P (t) = 1;
>>> +  DECL_RESULT (decl) = t;
>>> +
>>> +  gimplify_function_tree (decl);
>>> +
>>> +  /* Build CFG for this function. */
>>> +
>>> +  old_current_function_decl = current_function_decl;
>>> +  push_cfun (DECL_STRUCT_FUNCTION (decl));
>>> +  current_function_decl = decl;
>>> +  init_empty_tree_cfg_for_function (DECL_STRUCT_FUNCTION (decl));
>>> +  cfun->curr_properties |=
>>> +    (PROP_gimple_lcf | PROP_gimple_leh | PROP_cfg | PROP_referenced_vars |
>>> +     PROP_ssa);
>>> +  new_bb = create_empty_bb (ENTRY_BLOCK_PTR);
>>> +  make_edge (ENTRY_BLOCK_PTR, new_bb, EDGE_FALLTHRU);
>>> +
>>> +  /* XXX: Not sure if the edge commented below is necessary.  If I add this
>>> +     edge, it fails in gimple_verify_flow_info in tree-cfg.c in condition :
>>> +     " if (e->flags & EDGE_FALLTHRU)"
>>> +     during -fprofile-generate.
>>> +     Otherwise, it is fine.  Deleting this edge does not break anything.
>>> +     Commenting this so that it is clear I am intentionally not doing this.*/
>>> +  /* make_edge (new_bb, EXIT_BLOCK_PTR, EDGE_FALLTHRU); */
>>> +
>>> +  seq = gimple_seq_alloc_with_stmt (constructor_body);
>>> +
>>> +  set_bb_seq (new_bb, seq);
>>> +  gimple_set_bb (constructor_body, new_bb);
>>> +
>>> +  /* Set the lexical block of the constructor body. Fails the inliner
>>> +     other wise. */
>>> +  gimple_set_block (constructor_body, DECL_INITIAL (decl));
>>> +
>>> +  /* This call is very important if this pass runs when the IR is in
>>> +     SSA form.  It breaks things in strange ways otherwise. */
>>> +  init_tree_ssa (DECL_STRUCT_FUNCTION (decl));
>>> +  /* add_referenced_var (version_selector_var); */
>>> +
>>> +  cgraph_add_new_function (decl, true);
>>> +  cgraph_call_function_insertion_hooks (cgraph_node (decl));
>>> +  cgraph_mark_needed_node (cgraph_node (decl));
>>> +
>>> +  pop_cfun ();
>>> +  current_function_decl = old_current_function_decl;
>>> +  return decl;
>>> +}
>>> +
>>> +/* FNDECL is a __builtin_target_* call that is folded into an integer defined
>>> +   in libgcc/config/i386/i386-cpuinfo.c */
>>> +
>>> +static tree
>>> +fold_builtin_target (tree fndecl)
>>> +{
>>> +  /* This is the order of bit-fields in __processor_features in
>>> +     i386-cpuinfo.c */
>>> +  enum processor_features
>>> +  {
>>> +    F_CMOV = 0,
>>> +    F_MMX,
>>> +    F_POPCNT,
>>> +    F_SSE,
>>> +    F_SSE2,
>>> +    F_SSE3,
>>> +    F_SSSE3,
>>> +    F_SSE4_1,
>>> +    F_SSE4_2,
>>> +    F_MAX
>>> +  };
>>> +
>>> +  /* This is the order of bit-fields in __processor_model in
>>> +     i386-cpuinfo.c */
>>> +  enum processor_model
>>> +  {
>>> +    M_AMD = 0,
>>> +    M_INTEL,
>>> +    M_COREI7_NEHALEM,
>>> +    M_COREI7_WESTMERE,
>>> +    M_COREI7_SANDYBRIDGE,
>>> +    M_AMDFAM10_BARCELONA,
>>> +    M_AMDFAM10_SHANGHAI,
>>> +    M_AMDFAM10_ISTANBUL,
>>> +    M_MAX
>>> +  };
>>> +
>>> +  static tree __processor_features_type = NULL_TREE;
>>> +  static tree __cpu_features_var = NULL_TREE;
>>> +  static tree __processor_model_type = NULL_TREE;
>>> +  static tree __cpu_model_var = NULL_TREE;
>>> +  static tree ctor_decl = NULL_TREE;
>>> +  static tree field;
>>> +  static tree which_struct;
>>> +
>>> +  /* Make a call to __cpu_indicatior_init in a constructor.
>>> +     Function __cpu_indicator_init is defined in i386-cpuinfo.c. */
>>> +  if (ctor_decl == NULL_TREE)
>>> +   ctor_decl = make_constructor_to_get_target_type
>>> +               ("__cpu_indicator_init_ctor");
>>> +
>>> +  if (__processor_features_type == NULL_TREE)
>>> +    __processor_features_type = build_struct_with_one_bit_fields (F_MAX,
>>> +                                 "__processor_features");
>>> +
>>> +  if (__processor_model_type == NULL_TREE)
>>> +    __processor_model_type = build_struct_with_one_bit_fields (M_MAX,
>>> +                                 "__processor_model");
>>> +
>>> +  if (__cpu_features_var == NULL_TREE)
>>> +    __cpu_features_var = make_var_decl (__processor_features_type,
>>> +                                       "__cpu_features");
>>> +
>>> +  if (__cpu_model_var == NULL_TREE)
>>> +    __cpu_model_var = make_var_decl (__processor_model_type,
>>> +                                    "__cpu_model");
>>> +
>>> +  /* Look at fndecl code to identify the field requested. */
>>> +  switch (DECL_FUNCTION_CODE (fndecl))
>>> +    {
>>> +    case BUILT_IN_TARGET_SUPPORTS_CMOV:
>>> +      field = get_field_from_struct (__processor_features_type, F_CMOV);
>>> +      which_struct = __cpu_features_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_SUPPORTS_MMX:
>>> +      field = get_field_from_struct (__processor_features_type, F_MMX);
>>> +      which_struct = __cpu_features_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_SUPPORTS_POPCOUNT:
>>> +      field = get_field_from_struct (__processor_features_type, F_POPCNT);
>>> +      which_struct = __cpu_features_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_SUPPORTS_SSE:
>>> +      field = get_field_from_struct (__processor_features_type, F_SSE);
>>> +      which_struct = __cpu_features_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_SUPPORTS_SSE2:
>>> +      field = get_field_from_struct (__processor_features_type, F_SSE2);
>>> +      which_struct = __cpu_features_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_SUPPORTS_SSE3:
>>> +      field = get_field_from_struct (__processor_features_type, F_SSE3);
>>> +      which_struct = __cpu_features_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_SUPPORTS_SSSE3:
>>> +      field = get_field_from_struct (__processor_features_type, F_SSE3);
>>> +      which_struct = __cpu_features_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_SUPPORTS_SSE4_1:
>>> +      field = get_field_from_struct (__processor_features_type, F_SSE4_1);
>>> +      which_struct = __cpu_features_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_SUPPORTS_SSE4_2:
>>> +      field = get_field_from_struct (__processor_features_type, F_SSE4_2);
>>> +      which_struct = __cpu_features_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_IS_AMD:
>>> +      field = get_field_from_struct (__processor_model_type, M_AMD);;
>>> +      which_struct = __cpu_model_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_IS_INTEL:
>>> +      field = get_field_from_struct (__processor_model_type, M_INTEL);;
>>> +      which_struct = __cpu_model_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_IS_COREI7_NEHALEM:
>>> +      field = get_field_from_struct (__processor_model_type, M_COREI7_NEHALEM);;
>>> +      which_struct = __cpu_model_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_IS_COREI7_WESTMERE:
>>> +      field = get_field_from_struct (__processor_model_type, M_COREI7_WESTMERE);;
>>> +      which_struct = __cpu_model_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE:
>>> +      field = get_field_from_struct (__processor_model_type, M_COREI7_SANDYBRIDGE);;
>>> +      which_struct = __cpu_model_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA:
>>> +      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_BARCELONA);;
>>> +      which_struct = __cpu_model_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI:
>>> +      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_SHANGHAI);;
>>> +      which_struct = __cpu_model_var;
>>> +      break;
>>> +    case BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL:
>>> +      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_ISTANBUL);;
>>> +      which_struct = __cpu_model_var;
>>> +      break;
>>> +    default:
>>> +      return NULL_TREE;
>>> +    }
>>> +
>>> +  return build3 (COMPONENT_REF, TREE_TYPE (field), which_struct, field, NULL_TREE);
>>> +}
>>> +
>>> +/* Folds __builtin_target_* builtins. */
>>> +
>>> +static tree
>>> +ix86_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED,
>>> +                   tree *args ATTRIBUTE_UNUSED, bool ignore ATTRIBUTE_UNUSED)
>>> +{
>>> +  const char *decl_name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
>>> +  if (DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL
>>> +      && strstr(decl_name, "__builtin_target") != NULL)
>>> +    return fold_builtin_target (fndecl);
>>> +
>>> +  return NULL_TREE;
>>> +}
>>> +
>>>  /* Worker function for TARGET_SETUP_INCOMING_VARARGS.  */
>>>
>>>  static void
>>> @@ -35097,6 +35431,9 @@ ix86_autovectorize_vector_sizes (void)
>>>  #undef TARGET_BUILD_BUILTIN_VA_LIST
>>>  #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
>>>
>>> +#undef TARGET_FOLD_BUILTIN
>>> +#define TARGET_FOLD_BUILTIN ix86_fold_builtin
>>> +
>>>  #undef TARGET_ENUM_VA_LIST_P
>>>  #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
>>>
>>>
>>> --
>>> This patch is available for review at http://codereview.appspot.com/4893046
>>>
>>
>
Michael Matz Aug. 18, 2011, 1:10 p.m. UTC | #11
Hi,

On Thu, 18 Aug 2011, Richard Guenther wrote:

> > CPUID to get target features and set global vars corresponding to the 
> > features. So, the builtin should be folded into the appropriate 
> > variable in libgcc.
> 
> Hm, but then the variable should reside in libgcc and you'd only need an 
> extern variant in the varpool.  I'm not sure separate constructors 
> (possibly in each module ...) would be better than a single one in 
> libgcc that would get run unconditionally.

Would be my preference too.

> > determining target features and setting the appropriate globals. If
> > the new builtins are called, gcc will call __cpu_indicator_init in a
> > constructor so that it is called exactly once. Then, gcc will fold the
> > builtin to the appropriate global variable.
> 
> I see, but this sounds like premature optimization to me, no?  Considering
> you'd do this in each module and our inability to merge those constructors
> at link time.  If we put __cpu_indicator, the constructor and the assorted
> support into a separate module inside libgcc.a could we arrange it in a way
> that if __cpu_indicator is not referenced from the program that piece isn't
> linked in?  (not sure if that is possible with constructors)

If you make an .o file only exporting __cpu_indicator, then it won't be 
included in a link where no object file refers to that symbol.  If you put 
the ctor for that variable in the same .o file you win.

I also take issue with the large number of builtins, I'd have expected one 
single builtin returning the CPU type, and an enum that can be tested.  
That potentially requires an installed gcc private header, but I think 
enabling access to this cpu detection facility in libgcc to our users is 
worthwhile.


Ciao,
Michael.
Xinliang David Li Aug. 18, 2011, 4:27 p.m. UTC | #12
On Thu, Aug 18, 2011 at 6:10 AM, Michael Matz <matz@suse.de> wrote:
> Hi,
>
> On Thu, 18 Aug 2011, Richard Guenther wrote:
>
>> > CPUID to get target features and set global vars corresponding to the
>> > features. So, the builtin should be folded into the appropriate
>> > variable in libgcc.
>>
>> Hm, but then the variable should reside in libgcc and you'd only need an
>> extern variant in the varpool.  I'm not sure separate constructors
>> (possibly in each module ...) would be better than a single one in
>> libgcc that would get run unconditionally.
>
> Would be my preference too.
>
>> > determining target features and setting the appropriate globals. If
>> > the new builtins are called, gcc will call __cpu_indicator_init in a
>> > constructor so that it is called exactly once. Then, gcc will fold the
>> > builtin to the appropriate global variable.
>>
>> I see, but this sounds like premature optimization to me, no?  Considering
>> you'd do this in each module and our inability to merge those constructors
>> at link time.  If we put __cpu_indicator, the constructor and the assorted
>> support into a separate module inside libgcc.a could we arrange it in a way
>> that if __cpu_indicator is not referenced from the program that piece isn't
>> linked in?  (not sure if that is possible with constructors)
>
> If you make an .o file only exporting __cpu_indicator, then it won't be
> included in a link where no object file refers to that symbol.  If you put
> the ctor for that variable in the same .o file you win.
>
> I also take issue with the large number of builtins, I'd have expected one
> single builtin returning the CPU type, and an enum that can be tested.
> That potentially requires an installed gcc private header, but I think
> enabling access to this cpu detection facility in libgcc to our users is
> worthwhile.

The CPU type builtins can probably be combined, but not the feature-testing ones.

David

>
>
> Ciao,
> Michael.
>
Sriraman Tallam Aug. 18, 2011, 5:25 p.m. UTC | #13
On Thu, Aug 18, 2011 at 1:03 AM, Richard Guenther
<richard.guenther@gmail.com> wrote:
> On Wed, Aug 17, 2011 at 7:54 PM, Sriraman Tallam <tmsriram@google.com> wrote:
>> On Wed, Aug 17, 2011 at 12:37 AM, Richard Guenther
>> <richard.guenther@gmail.com> wrote:
>>> On Tue, Aug 16, 2011 at 10:50 PM, Sriraman Tallam <tmsriram@google.com> wrote:
>>>> Support for getting CPU type and feature information at run-time.
>>>>
>>>> The following patch provides support for finding the platform type at run-time, like cpu type and features supported. The multi-versioning framework will use the builtins added to dispatch the right function version. Please refer to http://gcc.gnu.org/ml/gcc/2011-08/msg00298.html for details on function multi-versioning usability.
>>>
>>> Please provide an overview why you need the new builtins,
>>
>> For multi-versioning,  the compiler can call the appropriate builtin
>> to dispatch the right version. The builtin call will later get folded.
>>
>> For example,
>>
>> int  __attribute__ version ("sse4_1")
>> compute ()
>> {
>>   // Do sse4_1 specific implementation.
>> }
>>
>> int
>> compute ()
>> {
>>  // Generic implementation
>> }
>>
>> The compiler will check if the target supports the attribute and then
>> convert a call to compute ()  into  this:
>>
>> if (__builtin_target_supports_sse4_1 ())
>>  compute_sse4_1 (); // Call to the SSE4_1 implementation
>> else
>>  compute_generic (); // Call to the generic implementation
>>
>> Further, having it as builtin function allows it to be overridden by
>> the programmer. For instance, the programmer can override it to
>> identify newer CPU types not yet supported. Having these builtins
>> makes it convenient to identify platform type and features in general.
>>
>> why you need
>>> a separate pass to fold them (instead of just expanding them) and why
>>
>> I can move it into builtins.c along with where other builtins are
>> folded and remove the separate pass. My intention originally was to
>> fold them as early as possible, in this case after multi-versioning
>> but I guess this is not a requirement.
>
> Yes, they should be folded by targetm.fold_builtin instead.  The Frontend
> should simply fold the tests at the time it creates them, that's as early
> as possible (gimplification will also re-fold all builtin function calls).
>
>>> you are creating
>>> vars behind the back of GCC:
>>
>> The flow I had in mind was to have functions in libgcc which will use
>> CPUID to get target features and set global vars corresponding to the
>> features. So, the builtin should be folded into the appropriate
>> variable in libgcc.
>
> Hm, but then the variable should reside in libgcc and you'd only need
> an extern variant in the varpool.  I'm not sure separate constructors
> (possibly in each module ...) would be better than a single one in
> libgcc that would get run unconditionally.
>
>>>
>>> +  /* Set finalized to 1, otherwise it asserts in function "write_symbol" in
>>> +     lto-streamer-out.c. */
>>> +  vnode->finalized = 1;
>>>
>>> where I think you miss a varpool_finalize_node call somewhere.  Why
>>> isn't this all done at target init time
>>
>> I wanted to do this on demand. If none of the new builtins are called
>> in the program, I do not need to do this at all. In summary, libgcc
>> has a function called __cpu_indicator_init which does the work of
>> determining target features and setting the appropriate globals. If
>> the new builtins are called, gcc will call __cpu_indicator_init in a
>> constructor so that it is called exactly once. Then, gcc will fold the
>> builtin to the appropriate global variable.
>
> I see, but this sounds like premature optimization to me, no?  Considering
> you'd do this in each module and our inability to merge those constructors
> at link time.  If we put __cpu_indicator, the constructor and the assorted
> support into a separate module inside libgcc.a could we arrange it in a way
> that if __cpu_indicator is not referenced from the program that piece isn't
> linked in?  (not sure if that is possible with constructors)

Ok, so two things. I create the constructor as a comdat. So, it is
created by gcc in every module but at link time only one copy will be
kept. So, it is going to be called only once and that is not a
problem. The other thing is that I can eliminate all of this code gen
in gcc and mark this as a constructor in libgcc, which means it
will always be linked in and always be called once at run-time. There
is no easy way right now to garbage collect unreferenced ctors at
run-time. I do not have a strong opinion on this and I can do the
latter.

>
> Richard.
>
>>
>> ?  If you don't mark the
>>> variable as to be preserved
>>> like you do cgraph will optimize it all away if it isn't needed.
>>
>>>
>>> Richard.
>>>
>>>>        * tree-pass.h (pass_tree_fold_builtin_target): New pass.
>>>>        * builtins.def (BUILT_IN_TARGET_SUPPORTS_CMOV): New builtin.
>>>>        (BUILT_IN_TARGET_SUPPORTS_MMX): New builtin.
>>>>        (BUILT_IN_TARGET_SUPPORTS_POPCOUNT): New builtin.
>>>>        (BUILT_IN_TARGET_SUPPORTS_SSE): New builtin.
>>>>        (BUILT_IN_TARGET_SUPPORTS_SSE2): New builtin.
>>>>        (BUILT_IN_TARGET_SUPPORTS_SSE3): New builtin.
>>>>        (BUILT_IN_TARGET_SUPPORTS_SSSE3): New builtin.
>>>>        (BUILT_IN_TARGET_SUPPORTS_SSE4_1): New builtin.
>>>>        (BUILT_IN_TARGET_SUPPORTS_SSE4_2): New builtin.
>>>>        (BUILT_IN_TARGET_IS_AMD): New builtin.
>>>>        (BUILT_IN_TARGET_IS_INTEL): New builtin.
>>>>        (BUILT_IN_TARGET_IS_COREI7_NEHALEM): New builtin.
>>>>        (BUILT_IN_TARGET_IS_COREI7_WESTMERE): New builtin.
>>>>        (BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE): New builtin.
>>>>        (BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA): New builtin.
>>>>        (BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI): New builtin.
>>>>        (BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL): New builtin.
>>>>        * mversn-dispatch.c (do_fold_builtin_target): New function.
>>>>        (gate_fold_builtin_target): New function.
>>>>        (pass_tree_fold_builtin_target): New pass.
>>>>        * timevar.def (TV_FOLD_BUILTIN_TARGET): New var.
>>>>        * passes.c (init_optimization_passes): Add new pass to pass list.
>>>>        * config/i386/i386.c (build_struct_with_one_bit_fields): New function.
>>>>        (make_var_decl): New function.
>>>>        (get_field_from_struct): New function.
>>>>        (make_constructor_to_get_target_type): New function.
>>>>        (fold_builtin_target): New function.
>>>>        (ix86_fold_builtin): New function.
>>>>        (TARGET_FOLD_BUILTIN): New macro.
>>>>
>>>>        * gcc.dg/builtin_target.c: New test.
>>>>
>>>>        * config/i386/i386-cpuinfo.c: New file.
>>>>        * config/i386/t-cpuinfo: New file.
>>>>        * config.host: Add t-cpuinfo to link i386-cpuinfo.o with libgcc
>>>>
>>>> Index: libgcc/config.host
>>>> ===================================================================
>>>> --- libgcc/config.host  (revision 177767)
>>>> +++ libgcc/config.host  (working copy)
>>>> @@ -609,7 +609,7 @@ case ${host} in
>>>>  i[34567]86-*-linux* | x86_64-*-linux* | \
>>>>   i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | \
>>>>   i[34567]86-*-gnu*)
>>>> -       tmake_file="${tmake_file} t-tls"
>>>> +       tmake_file="${tmake_file} t-tls i386/t-cpuinfo"
>>>>        if test "$libgcc_cv_cfi" = "yes"; then
>>>>                tmake_file="${tmake_file} t-stack i386/t-stack-i386"
>>>>        fi
>>>> Index: libgcc/config/i386/t-cpuinfo
>>>> ===================================================================
>>>> --- libgcc/config/i386/t-cpuinfo        (revision 0)
>>>> +++ libgcc/config/i386/t-cpuinfo        (revision 0)
>>>> @@ -0,0 +1,2 @@
>>>> +# This is an endfile
>>>> +LIB2ADD += $(srcdir)/config/i386/i386-cpuinfo.c
>>>> Index: libgcc/config/i386/i386-cpuinfo.c
>>>> ===================================================================
>>>> --- libgcc/config/i386/i386-cpuinfo.c   (revision 0)
>>>> +++ libgcc/config/i386/i386-cpuinfo.c   (revision 0)
>>>> @@ -0,0 +1,275 @@
>>>> +/* Copyright (C) 2011 Free Software Foundation, Inc.
>>>> + * Contributed by Sriraman Tallam <tmsriram@google.com>.
>>>> + *
>>>> + * This file is free software; you can redistribute it and/or modify it
>>>> + * under the terms of the GNU General Public License as published by the
>>>> + * Free Software Foundation; either version 3, or (at your option) any
>>>> + * later version.
>>>> + *
>>>> + * This file is distributed in the hope that it will be useful, but
>>>> + * WITHOUT ANY WARRANTY; without even the implied warranty of
>>>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>>> + * General Public License for more details.
>>>> + *
>>>> + * Under Section 7 of GPL version 3, you are granted additional
>>>> + * permissions described in the GCC Runtime Library Exception, version
>>>> + * 3.1, as published by the Free Software Foundation.
>>>> + *
>>>> + * You should have received a copy of the GNU General Public License and
>>>> + * a copy of the GCC Runtime Library Exception along with this program;
>>>> + * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
>>>> + * <http://www.gnu.org/licenses/>.
>>>> + *
>>>> + *
>>>> + * This code is adapted from gcc/config/i386/driver-i386.c. The CPUID
>>>> + * instruction is used to figure out the cpu type and supported features.
>>>> + * GCC runs __cpu_indicator_init from a constructor which sets the members
>>>> + * of __cpu_model and __cpu_features.
>>>> + */
>>>> +
>>>> +#include <string.h>
>>>> +
>>>> +#ifdef __GNUC__
>>>> +#include "cpuid.h"
>>>> +
>>>> +enum processor_type
>>>> +{
>>>> +  PROCESSOR_PENTIUM = 0,
>>>> +  PROCESSOR_CORE2,
>>>> +  PROCESSOR_COREI7_NEHALEM,
>>>> +  PROCESSOR_COREI7_WESTMERE,
>>>> +  PROCESSOR_COREI7_SANDYBRIDGE,
>>>> +  PROCESSOR_INTEL_GENERIC,
>>>> +  PROCESSOR_AMDFAM10_BARCELONA,
>>>> +  PROCESSOR_AMDFAM10_SHANGHAI,
>>>> +  PROCESSOR_AMDFAM10_ISTANBUL,
>>>> +  PROCESSOR_AMDFAM10_GENERIC,
>>>> +  PROCESSOR_AMD_GENERIC,
>>>> +  PROCESSOR_GENERIC,
>>>> +  PROCESSOR_max
>>>> +};
>>>> +
>>>> +enum vendor_signatures
>>>> +{
>>>> +  SIG_INTEL =  0x756e6547 /* Genu */,
>>>> +  SIG_AMD =    0x68747541 /* Auth */
>>>> +};
>>>> +
>>>> +
>>>> +/* Features supported. */
>>>> +
>>>> +struct __processor_features
>>>> +{
>>>> +  unsigned int __cpu_cmov : 1;
>>>> +  unsigned int __cpu_mmx : 1;
>>>> +  unsigned int __cpu_popcnt : 1;
>>>> +  unsigned int __cpu_sse : 1;
>>>> +  unsigned int __cpu_sse2 : 1;
>>>> +  unsigned int __cpu_sse3 : 1;
>>>> +  unsigned int __cpu_ssse3 : 1;
>>>> +  unsigned int __cpu_sse4_1 : 1;
>>>> +  unsigned int __cpu_sse4_2 : 1;
>>>> +};
>>>> +
>>>> +/* Flags exported. */
>>>> +
>>>> +struct __processor_model
>>>> +{
>>>> +  unsigned int __cpu_is_amd : 1;
>>>> +  unsigned int __cpu_is_intel : 1;
>>>> +  unsigned int __cpu_is_corei7_nehalem : 1;
>>>> +  unsigned int __cpu_is_corei7_westmere : 1;
>>>> +  unsigned int __cpu_is_corei7_sandybridge : 1;
>>>> +  unsigned int __cpu_is_amdfam10_barcelona : 1;
>>>> +  unsigned int __cpu_is_amdfam10_shanghai : 1;
>>>> +  unsigned int __cpu_is_amdfam10_istanbul : 1;
>>>> +};
>>>> +
>>>> +enum processor_type __cpu_type = PROCESSOR_GENERIC;
>>>> +struct __processor_features __cpu_features;
>>>> +struct __processor_model __cpu_model;
>>>> +
>>>> +static void
>>>> +get_amd_cpu (unsigned int family, unsigned int model)
>>>> +{
>>>> +  switch (family)
>>>> +    {
>>>> +    case 0x10:
>>>> +      switch (model)
>>>> +       {
>>>> +       case 0x2:
>>>> +         __cpu_type = PROCESSOR_AMDFAM10_BARCELONA;
>>>> +         __cpu_model.__cpu_is_amdfam10_barcelona = 1;
>>>> +         break;
>>>> +       case 0x4:
>>>> +         __cpu_type = PROCESSOR_AMDFAM10_SHANGHAI;
>>>> +         __cpu_model.__cpu_is_amdfam10_shanghai = 1;
>>>> +         break;
>>>> +       case 0x8:
>>>> +         __cpu_type = PROCESSOR_AMDFAM10_ISTANBUL;
>>>> +         __cpu_model.__cpu_is_amdfam10_istanbul = 1;
>>>> +         break;
>>>> +       default:
>>>> +         __cpu_type = PROCESSOR_AMDFAM10_GENERIC;
>>>> +         break;
>>>> +       }
>>>> +      break;
>>>> +    default:
>>>> +      __cpu_type = PROCESSOR_AMD_GENERIC;
>>>> +    }
>>>> +}
>>>> +
>>>> +static void
>>>> +get_intel_cpu (unsigned int family, unsigned int model, unsigned int brand_id)
>>>> +{
>>>> +  /* Parse family and model only if brand ID is 0. */
>>>> +  if (brand_id == 0)
>>>> +    {
>>>> +      switch (family)
>>>> +       {
>>>> +       case 0x5:
>>>> +         __cpu_type = PROCESSOR_PENTIUM;
>>>> +         break;
>>>> +       case 0x6:
>>>> +         switch (model)
>>>> +           {
>>>> +           case 0x1a:
>>>> +           case 0x1e:
>>>> +           case 0x1f:
>>>> +           case 0x2e:
>>>> +             /* Nehalem.  */
>>>> +             __cpu_type = PROCESSOR_COREI7_NEHALEM;
>>>> +             __cpu_model.__cpu_is_corei7_nehalem = 1;
>>>> +             break;
>>>> +           case 0x25:
>>>> +           case 0x2c:
>>>> +           case 0x2f:
>>>> +             /* Westmere.  */
>>>> +             __cpu_type = PROCESSOR_COREI7_WESTMERE;
>>>> +             __cpu_model.__cpu_is_corei7_westmere = 1;
>>>> +             break;
>>>> +           case 0x2a:
>>>> +             /* Sandy Bridge.  */
>>>> +             __cpu_type = PROCESSOR_COREI7_SANDYBRIDGE;
>>>> +             __cpu_model.__cpu_is_corei7_sandybridge = 1;
>>>> +             break;
>>>> +           case 0x17:
>>>> +           case 0x1d:
>>>> +             /* Penryn.  */
>>>> +           case 0x0f:
>>>> +             /* Merom.  */
>>>> +             __cpu_type = PROCESSOR_CORE2;
>>>> +             break;
>>>> +           default:
>>>> +             __cpu_type = PROCESSOR_INTEL_GENERIC;
>>>> +             break;
>>>> +           }
>>>> +         break;
>>>> +       default:
>>>> +         /* We have no idea.  */
>>>> +         __cpu_type = PROCESSOR_INTEL_GENERIC;
>>>> +         break;
>>>> +       }
>>>> +    }
>>>> +}
>>>> +
>>>> +static void
>>>> +get_available_features (unsigned int ecx, unsigned int edx)
>>>> +{
>>>> +  __cpu_features.__cpu_cmov = (edx & bit_CMOV) ? 1 : 0;
>>>> +  __cpu_features.__cpu_mmx = (edx & bit_MMX) ? 1 : 0;
>>>> +  __cpu_features.__cpu_sse = (edx & bit_SSE) ? 1 : 0;
>>>> +  __cpu_features.__cpu_sse2 = (edx & bit_SSE2) ? 1 : 0;
>>>> +  __cpu_features.__cpu_popcnt = (ecx & bit_POPCNT) ? 1 : 0;
>>>> +  __cpu_features.__cpu_sse3 = (ecx & bit_SSE3) ? 1 : 0;
>>>> +  __cpu_features.__cpu_ssse3 = (ecx & bit_SSSE3) ? 1 : 0;
>>>> +  __cpu_features.__cpu_sse4_1 = (ecx & bit_SSE4_1) ? 1 : 0;
>>>> +  __cpu_features.__cpu_sse4_2 = (ecx & bit_SSE4_2) ? 1 : 0;
>>>> +}
>>>> +
>>>> +/* A noinline function calling __get_cpuid. Having many calls to
>>>> +   cpuid in one function in 32-bit mode causes GCC to complain:
>>>> +   "can’t find a register in class ‘CLOBBERED_REGS’".  This is
>>>> +   related to PR rtl-optimization 44174. */
>>>> +
>>>> +static int __attribute__ ((noinline))
>>>> +__get_cpuid_output (unsigned int __level,
>>>> +                   unsigned int *__eax, unsigned int *__ebx,
>>>> +                   unsigned int *__ecx, unsigned int *__edx)
>>>> +{
>>>> +  return __get_cpuid (__level, __eax, __ebx, __ecx, __edx);
>>>> +}
>>>> +
>>>> +/* This function will be linked in to binaries that need to look up
>>>> +   CPU information.  */
>>>> +
>>>> +void
>>>> +__cpu_indicator_init(void)
>>>> +{
>>>> +  unsigned int eax, ebx, ecx, edx;
>>>> +
>>>> +  int max_level = 5;
>>>> +  unsigned int vendor;
>>>> +  unsigned int model, family, brand_id;
>>>> +
>>>> +  memset (&__cpu_features, 0, sizeof (struct __processor_features));
>>>> +  memset (&__cpu_model, 0, sizeof (struct __processor_model));
>>>> +
>>>> +  /* Assume cpuid insn present. Run in level 0 to get vendor id. */
>>>> +  if (!__get_cpuid_output (0, &eax, &ebx, &ecx, &edx))
>>>> +    return;
>>>> +
>>>> +  vendor = ebx;
>>>> +  max_level = eax;
>>>> +
>>>> +  if (max_level < 1)
>>>> +    return;
>>>> +
>>>> +  if (!__get_cpuid_output (1, &eax, &ebx, &ecx, &edx))
>>>> +    return;
>>>> +
>>>> +  model = (eax >> 4) & 0x0f;
>>>> +  family = (eax >> 8) & 0x0f;
>>>> +  brand_id = ebx & 0xff;
>>>> +
>>>> +  /* Adjust model and family for Intel CPUS. */
>>>> +  if (vendor == SIG_INTEL)
>>>> +    {
>>>> +      unsigned int extended_model, extended_family;
>>>> +
>>>> +      extended_model = (eax >> 12) & 0xf0;
>>>> +      extended_family = (eax >> 20) & 0xff;
>>>> +      if (family == 0x0f)
>>>> +       {
>>>> +         family += extended_family;
>>>> +         model += extended_model;
>>>> +       }
>>>> +      else if (family == 0x06)
>>>> +       model += extended_model;
>>>> +    }
>>>> +
>>>> +  /* Find CPU model. */
>>>> +
>>>> +  if (vendor == SIG_AMD)
>>>> +    {
>>>> +      __cpu_model.__cpu_is_amd = 1;
>>>> +      get_amd_cpu (family, model);
>>>> +    }
>>>> +  else if (vendor == SIG_INTEL)
>>>> +    {
>>>> +      __cpu_model.__cpu_is_intel = 1;
>>>> +      get_intel_cpu (family, model, brand_id);
>>>> +    }
>>>> +
>>>> +  /* Find available features. */
>>>> +  get_available_features (ecx, edx);
>>>> +}
>>>> +
>>>> +#else
>>>> +
>>>> +void
>>>> +__cpu_indicator_init(void)
>>>> +{
>>>> +}
>>>> +
>>>> +#endif /* __GNUC__ */
>>>> Index: gcc/tree-pass.h
>>>> ===================================================================
>>>> --- gcc/tree-pass.h     (revision 177767)
>>>> +++ gcc/tree-pass.h     (working copy)
>>>> @@ -449,6 +449,7 @@ extern struct gimple_opt_pass pass_split_functions
>>>>  extern struct gimple_opt_pass pass_feedback_split_functions;
>>>>  extern struct gimple_opt_pass pass_threadsafe_analyze;
>>>>  extern struct gimple_opt_pass pass_tree_convert_builtin_dispatch;
>>>> +extern struct gimple_opt_pass pass_tree_fold_builtin_target;
>>>>
>>>>  /* IPA Passes */
>>>>  extern struct simple_ipa_opt_pass pass_ipa_lower_emutls;
>>>> Index: gcc/testsuite/gcc.dg/builtin_target.c
>>>> ===================================================================
>>>> --- gcc/testsuite/gcc.dg/builtin_target.c       (revision 0)
>>>> +++ gcc/testsuite/gcc.dg/builtin_target.c       (revision 0)
>>>> @@ -0,0 +1,49 @@
>>>> +/* This test checks if the __builtin_target_* calls are recognized. */
>>>> +
>>>> +/* { dg-do run } */
>>>> +
>>>> +int
>>>> +fn1 ()
>>>> +{
>>>> +  if (__builtin_target_supports_cmov () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_supports_mmx () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_supports_popcount () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_supports_sse () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_supports_sse2 () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_supports_sse3 () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_supports_ssse3 () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_supports_sse4_1 () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_supports_sse4_2 () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_is_amd () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_is_intel () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_is_corei7_nehalem () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_is_corei7_westmere () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_is_corei7_sandybridge () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_is_amdfam10_barcelona () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_is_amdfam10_shanghai () < 0)
>>>> +    return -1;
>>>> +  if (__builtin_target_is_amdfam10_istanbul () < 0)
>>>> +    return -1;
>>>> +
>>>> +  return 0;
>>>> +}
>>>> +
>>>> +int main ()
>>>> +{
>>>> +  return fn1 ();
>>>> +}
>>>> Index: gcc/builtins.def
>>>> ===================================================================
>>>> --- gcc/builtins.def    (revision 177767)
>>>> +++ gcc/builtins.def    (working copy)
>>>> @@ -763,6 +763,25 @@ DEF_BUILTIN (BUILT_IN_EMUTLS_REGISTER_COMMON,
>>>>  /* Multiversioning builtin dispatch hook. */
>>>>  DEF_GCC_BUILTIN (BUILT_IN_DISPATCH, "dispatch", BT_FN_INT_PTR_FN_INT_PTR_PTR_VAR, ATTR_NULL)
>>>>
>>>> +/* Builtins to determine target type and features at run-time. */
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_CMOV, "target_supports_cmov", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_MMX, "target_supports_mmx", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_POPCOUNT, "target_supports_popcount", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE, "target_supports_sse", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE2, "target_supports_sse2", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE3, "target_supports_sse3", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSSE3, "target_supports_ssse3", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE4_1, "target_supports_sse4_1", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE4_2, "target_supports_sse4_2", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMD, "target_is_amd", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_INTEL, "target_is_intel", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_NEHALEM, "target_is_corei7_nehalem", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_WESTMERE, "target_is_corei7_westmere", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE, "target_is_corei7_sandybridge", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA, "target_is_amdfam10_barcelona", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI, "target_is_amdfam10_shanghai", BT_FN_INT, ATTR_NULL)
>>>> +DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL, "target_is_amdfam10_istanbul", BT_FN_INT, ATTR_NULL)
>>>> +
>>>>  /* Exception support.  */
>>>>  DEF_BUILTIN_STUB (BUILT_IN_UNWIND_RESUME, "__builtin_unwind_resume")
>>>>  DEF_BUILTIN_STUB (BUILT_IN_CXA_END_CLEANUP, "__builtin_cxa_end_cleanup")
>>>> Index: gcc/mversn-dispatch.c
>>>> ===================================================================
>>>> --- gcc/mversn-dispatch.c       (revision 177767)
>>>> +++ gcc/mversn-dispatch.c       (working copy)
>>>> @@ -135,6 +135,7 @@ along with GCC; see the file COPYING3.  If not see
>>>>  #include "output.h"
>>>>  #include "vecprim.h"
>>>>  #include "gimple-pretty-print.h"
>>>> +#include "target.h"
>>>>
>>>>  typedef struct cgraph_node* NODEPTR;
>>>>  DEF_VEC_P (NODEPTR);
>>>> @@ -1764,3 +1765,103 @@ struct gimple_opt_pass pass_tree_convert_builtin_d
>>>>   TODO_update_ssa | TODO_verify_ssa
>>>>  }
>>>>  };
>>>> +
>>>> +/* Fold calls to __builtin_target_*.
>>>> +
>>>> +   Execute function of pass_tree_fold_builtin_target.  Walks every
>>>> +   statement of every basic block of the current function.  A call is
>>>> +   rewritten when its decl is BUILT_IN_NORMAL, its name contains
>>>> +   "__builtin_target", and its result is assigned to something.  The
>>>> +   replacement value comes from targetm.fold_builtin, or is zero when
>>>> +   the hook returns NULL_TREE.  Always returns 0.  */
>>>> +
>>>> +static unsigned int
>>>> +do_fold_builtin_target (void)
>>>> +{
>>>> +  basic_block bb;
>>>> +  gimple_stmt_iterator gsi;
>>>> +
>>>> +  /* Go through each stmt looking for __builtin_target_* calls */
>>>> +  FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (current_function_decl))
>>>> +    {
>>>> +      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>>>> +        {
>>>> +         gimple stmt = gsi_stmt (gsi);
>>>> +         gimple assign_stmt;
>>>> +          tree call_decl;
>>>> +         tree lhs_retval;
>>>> +         tree folded_val;
>>>> +
>>>> +         tree ssa_var, tmp_var;
>>>> +         gimple init_stmt;
>>>> +
>>>> +          if (!is_gimple_call (stmt))
>>>> +            continue;
>>>> +
>>>> +          call_decl = gimple_call_fndecl (stmt);
>>>> +
>>>> +         /* Check if it is a __builtin_target_* call.  The name test is a
>>>> +            substring match, not a prefix match. */
>>>> +
>>>> +         if (call_decl == NULL
>>>> +             || DECL_NAME (call_decl) == NULL_TREE
>>>> +             || DECL_BUILT_IN_CLASS (call_decl) != BUILT_IN_NORMAL
>>>> +             || strstr (IDENTIFIER_POINTER (DECL_NAME (call_decl)),
>>>> +                         "__builtin_target") == NULL)
>>>> +            continue;
>>>> +
>>>> +         /* If the lhs is NULL there is no need to fold the call.  The call
>>>> +            statement itself is left in place in that case. */
>>>> +         lhs_retval = gimple_call_lhs(stmt);
>>>> +         if (lhs_retval == NULL)
>>>> +           continue;
>>>> +
>>>> +         /* Call the target hook to fold the builtin.  No arguments are
>>>> +            passed: n_args is 0 and args is NULL. */
>>>> +          folded_val = targetm.fold_builtin(call_decl, 0, NULL, false);
>>>> +
>>>> +         /* If the target does not support the builtin then fold it to zero. */
>>>> +         if (folded_val == NULL_TREE)
>>>> +           folded_val = build_zero_cst (unsigned_type_node);
>>>> +
>>>> +         /* Type cast unsigned value to integer: the folded value is first
>>>> +            copied into a fresh unsigned SSA name (init_stmt), which is then
>>>> +            converted into the call's original lhs with a NOP_EXPR
>>>> +            (assign_stmt). */
>>>> +         tmp_var = create_tmp_var (unsigned_type_node, NULL);
>>>> +         init_stmt = gimple_build_assign (tmp_var, folded_val);
>>>> +         ssa_var = make_ssa_name (tmp_var, init_stmt);
>>>> +         gimple_assign_set_lhs (init_stmt, ssa_var);
>>>> +         mark_symbols_for_renaming (init_stmt);
>>>> +
>>>> +         assign_stmt = gimple_build_assign_with_ops (NOP_EXPR, lhs_retval, ssa_var, 0);
>>>> +         mark_symbols_for_renaming(assign_stmt);
>>>> +
>>>> +         gsi_insert_after_without_update (&gsi, assign_stmt, GSI_SAME_STMT);
>>>> +         gsi_insert_after_without_update (&gsi, init_stmt, GSI_SAME_STMT);
>>>> +         /* Delete the original call.
>>>> +            NOTE(review): both statements above are inserted directly after
>>>> +            the call with GSI_SAME_STMT, so init_stmt presumably ends up
>>>> +            ahead of assign_stmt, and the loop's gsi_next then presumably
>>>> +            steps over the inserted statements only -- confirm against the
>>>> +            gimple iterator API. */
>>>> +         gsi_remove(&gsi, true);
>>>> +       }
>>>> +    }
>>>> +
>>>> +  return 0;
>>>> +}
>>>> +
>>>> +/* Gate for pass_tree_fold_builtin_target.  The pass is unconditional,
>>>> +   so this always answers true. */
>>>> +
>>>> +static bool
>>>> +gate_fold_builtin_target (void)
>>>> +{
>>>> +  return true;
>>>> +}
>>>> +
>>>> +/* Pass to fold __builtin_target_* functions.  Runs
>>>> +   do_fold_builtin_target under gate_fold_builtin_target, requires and
>>>> +   provides PROP_cfg, and finishes with TODO_dump_func,
>>>> +   TODO_cleanup_cfg, TODO_update_ssa and TODO_verify_ssa. */
>>>> +
>>>> +struct gimple_opt_pass pass_tree_fold_builtin_target =
>>>> +{
>>>> + {
>>>> +  GIMPLE_PASS,
>>>> +  "fold_builtin_target",               /* name */
>>>> +  gate_fold_builtin_target,            /* gate */
>>>> +  do_fold_builtin_target,              /* execute */
>>>> +  NULL,                                        /* sub */
>>>> +  NULL,                                        /* next */
>>>> +  0,                                   /* static_pass_number */
>>>> +  TV_FOLD_BUILTIN_TARGET,              /* tv_id */
>>>> +  PROP_cfg,                            /* properties_required */
>>>> +  PROP_cfg,                            /* properties_provided */
>>>> +  0,                                   /* properties_destroyed */
>>>> +  0,                                   /* todo_flags_start */
>>>> +  TODO_dump_func |                     /* todo_flags_finish */
>>>> +  TODO_cleanup_cfg |
>>>> +  TODO_update_ssa |
>>>> +  TODO_verify_ssa
>>>> + }
>>>> +};
>>>> +
>>>> +
>>>> Index: gcc/timevar.def
>>>> ===================================================================
>>>> --- gcc/timevar.def     (revision 177767)
>>>> +++ gcc/timevar.def     (working copy)
>>>> @@ -124,6 +124,7 @@ DEFTIMEVAR (TV_PARSE_INMETH          , "parser inl
>>>>  DEFTIMEVAR (TV_TEMPLATE_INST         , "template instantiation")
>>>>  DEFTIMEVAR (TV_INLINE_HEURISTICS     , "inline heuristics")
>>>>  DEFTIMEVAR (TV_MVERSN_DISPATCH       , "multiversion dispatch")
>>>> +DEFTIMEVAR (TV_FOLD_BUILTIN_TARGET   , "fold __builtin_target calls")
>>>>  DEFTIMEVAR (TV_INTEGRATION           , "integration")
>>>>  DEFTIMEVAR (TV_TREE_GIMPLIFY        , "tree gimplify")
>>>>  DEFTIMEVAR (TV_TREE_EH              , "tree eh")
>>>> Index: gcc/passes.c
>>>> ===================================================================
>>>> --- gcc/passes.c        (revision 177767)
>>>> +++ gcc/passes.c        (working copy)
>>>> @@ -1249,6 +1249,8 @@ init_optimization_passes (void)
>>>>     {
>>>>       struct opt_pass **p = &pass_ipa_multiversion_dispatch.pass.sub;
>>>>       NEXT_PASS (pass_tree_convert_builtin_dispatch);
>>>> +      /* Fold calls to __builtin_target_*. */
>>>> +      NEXT_PASS (pass_tree_fold_builtin_target);
>>>>       /* Rebuilding cgraph edges is necessary as the above passes change
>>>>          the call graph.  Otherwise, future optimizations use the old
>>>>         call graph and make wrong decisions sometimes.*/
>>>> Index: gcc/config/i386/i386.c
>>>> ===================================================================
>>>> --- gcc/config/i386/i386.c      (revision 177767)
>>>> +++ gcc/config/i386/i386.c      (working copy)
>>>> @@ -58,6 +58,8 @@ along with GCC; see the file COPYING3.  If not see
>>>>  #include "sched-int.h"
>>>>  #include "sbitmap.h"
>>>>  #include "fibheap.h"
>>>> +#include "tree-flow.h"
>>>> +#include "tree-pass.h"
>>>>
>>>>  enum upper_128bits_state
>>>>  {
>>>> @@ -7867,6 +7869,338 @@ ix86_build_builtin_va_list (void)
>>>>   return ret;
>>>>  }
>>>>
>>>> +/* Returns a struct type with name NAME and number of fields equal to
>>>> +   NUM_FIELDS.  Each field is an unsigned int bit field of length 1 bit,
>>>> +   named "<index>_field" ("0_field", "1_field", ...).  The fields are
>>>> +   linked newest-first and handed to finish_builtin_struct in that
>>>> +   order. */
>>>> +
>>>> +static tree
>>>> +build_struct_with_one_bit_fields (int num_fields, const char *name)
>>>> +{
>>>> +  int i;
>>>> +  /* Large enough for any decimal int followed by "_field" and the NUL. */
>>>> +  char field_name [32];
>>>> +  tree field = NULL_TREE, field_chain = NULL_TREE;
>>>> +  tree type = make_node (RECORD_TYPE);
>>>> +
>>>> +  for (i = 0; i < num_fields; i++)
>>>> +    {
>>>> +      /* Name the fields, 0_field, 1_field, ...  The whole index is
>>>> +         formatted rather than patching a single character, because
>>>> +         '0' + i only yields a digit while i < 10. */
>>>> +      snprintf (field_name, sizeof (field_name), "%d_field", i);
>>>> +      field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
>>>> +                         get_identifier (field_name), unsigned_type_node);
>>>> +      DECL_BIT_FIELD (field) = 1;
>>>> +      DECL_SIZE (field) = bitsize_one_node;
>>>> +      if (field_chain != NULL_TREE)
>>>> +       DECL_CHAIN (field) = field_chain;
>>>> +      field_chain = field;
>>>> +    }
>>>> +  finish_builtin_struct (type, name, field_chain, NULL_TREE);
>>>> +  return type;
>>>> +}
>>>> +
>>>> +/* Returns a VAR_DECL of type TYPE and name NAME.  The decl is flagged
>>>> +   external, static, public and preserved, made one-only, passed to
>>>> +   assemble_variable, and its varpool node is marked finalized.
>>>> +   NOTE(review): DECL_EXTERNAL and TREE_STATIC are both set before
>>>> +   make_decl_one_only and assemble_variable run; confirm that this
>>>> +   combination is intended. */
>>>> +
>>>> +static tree
>>>> +make_var_decl (tree type, const char *name)
>>>> +{
>>>> +  tree new_decl;
>>>> +  struct varpool_node *vnode;
>>>> +
>>>> +  new_decl = build_decl (UNKNOWN_LOCATION,
>>>> +                        VAR_DECL,
>>>> +                        get_identifier(name),
>>>> +                        type);
>>>> +
>>>> +  DECL_EXTERNAL (new_decl) = 1;
>>>> +  TREE_STATIC (new_decl) = 1;
>>>> +  TREE_PUBLIC (new_decl) = 1;
>>>> +  DECL_INITIAL (new_decl) = 0;
>>>> +  DECL_ARTIFICIAL (new_decl) = 0;
>>>> +  DECL_PRESERVE_P (new_decl) = 1;
>>>> +
>>>> +  make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
>>>> +  assemble_variable (new_decl, 0, 0, 0);
>>>> +
>>>> +  vnode = varpool_node (new_decl);
>>>> +  gcc_assert (vnode != NULL);
>>>> +  /* Set finalized to 1, otherwise it asserts in function "write_symbol" in
>>>> +     lto-streamer-out.c. */
>>>> +  vnode->finalized = 1;
>>>> +
>>>> +  return new_decl;
>>>> +}
>>>> +
>>>> +/* Traverses the chain of fields in STRUCT_TYPE and returns the FIELD_NUM
>>>> +   numbered field, counting from 0.  Asserts that the chain holds at
>>>> +   least FIELD_NUM + 1 fields, so the result is never NULL_TREE. */
>>>> +
>>>> +static tree
>>>> +get_field_from_struct (tree struct_type, int field_num)
>>>> +{
>>>> +  int i;
>>>> +  tree field = TYPE_FIELDS (struct_type);
>>>> +
>>>> +  for (i = 0; i < field_num; i++)
>>>> +    {
>>>> +      gcc_assert (field != NULL_TREE);
>>>> +      field = DECL_CHAIN (field);
>>>> +    }
>>>> +
>>>> +  /* The loop only checks the fields it steps over.  Check the one being
>>>> +     returned too, so callers never take TREE_TYPE of a NULL field. */
>>>> +  gcc_assert (field != NULL_TREE);
>>>> +
>>>> +  return field;
>>>> +}
>>>> +
>>>> +/* Create a new static constructor that calls __cpu_indicator_init ()
>>>> +   function defined in libgcc/config/i386-cpuinfo.c which runs cpuid
>>>> +   to figure out the type of the target.  NAME is used as both the
>>>> +   DECL_NAME and the assembler name of the constructor.  The function
>>>> +   body is a single basic block holding one call statement.  The new
>>>> +   decl is registered with the call graph, marked needed, and returned.
>>>> +   cfun and current_function_decl are switched to the new function
>>>> +   while its CFG is built, and restored before returning. */
>>>> +
>>>> +static tree
>>>> +make_constructor_to_get_target_type (const char *name)
>>>> +{
>>>> +  tree decl, type, t;
>>>> +  gimple_seq seq;
>>>> +  basic_block new_bb;
>>>> +  tree old_current_function_decl;
>>>> +
>>>> +  tree __cpu_indicator_int_decl;
>>>> +  gimple constructor_body;
>>>> +
>>>> +
>>>> +  type = build_function_type_list (void_type_node, NULL_TREE);
>>>> +
>>>> +  /* Make a call stmt to __cpu_indicator_init.  The same void (void)
>>>> +     function type is reused for the constructor decl below. */
>>>> +  __cpu_indicator_int_decl = build_fn_decl ("__cpu_indicator_init", type);
>>>> +  constructor_body = gimple_build_call (__cpu_indicator_int_decl, 0);
>>>> +  DECL_EXTERNAL (__cpu_indicator_int_decl) = 1;
>>>> +
>>>> +  decl = build_fn_decl (name, type);
>>>> +
>>>> +  DECL_NAME (decl) = get_identifier (name);
>>>> +  SET_DECL_ASSEMBLER_NAME (decl, DECL_NAME (decl));
>>>> +  gcc_assert (cgraph_node (decl) != NULL);
>>>> +
>>>> +  TREE_USED (decl) = 1;
>>>> +  DECL_ARTIFICIAL (decl) = 1;
>>>> +  DECL_IGNORED_P (decl) = 0;
>>>> +  TREE_PUBLIC (decl) = 0;
>>>> +  DECL_UNINLINABLE (decl) = 1;
>>>> +  DECL_EXTERNAL (decl) = 0;
>>>> +  DECL_CONTEXT (decl) = NULL_TREE;
>>>> +  DECL_INITIAL (decl) = make_node (BLOCK);
>>>> +  DECL_STATIC_CONSTRUCTOR (decl) = 1;
>>>> +  TREE_READONLY (decl) = 0;
>>>> +  DECL_PURE_P (decl) = 0;
>>>> +
>>>> +  /* This is a comdat.
>>>> +     NOTE(review): per the review discussion of this patch, comdat
>>>> +     applies to the function body and not to the data in the .ctors
>>>> +     section, so a single copy of the constructor would still be
>>>> +     called once per module that emits it. */
>>>> +  make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
>>>> +
>>>> +  /* Build result decl and add to function_decl. */
>>>> +  t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, void_type_node);
>>>> +  DECL_ARTIFICIAL (t) = 1;
>>>> +  DECL_IGNORED_P (t) = 1;
>>>> +  DECL_RESULT (decl) = t;
>>>> +
>>>> +  gimplify_function_tree (decl);
>>>> +
>>>> +  /* Build CFG for this function.  The previous current_function_decl
>>>> +     is saved here and put back at the end of this function. */
>>>> +
>>>> +  old_current_function_decl = current_function_decl;
>>>> +  push_cfun (DECL_STRUCT_FUNCTION (decl));
>>>> +  current_function_decl = decl;
>>>> +  init_empty_tree_cfg_for_function (DECL_STRUCT_FUNCTION (decl));
>>>> +  cfun->curr_properties |=
>>>> +    (PROP_gimple_lcf | PROP_gimple_leh | PROP_cfg | PROP_referenced_vars |
>>>> +     PROP_ssa);
>>>> +  new_bb = create_empty_bb (ENTRY_BLOCK_PTR);
>>>> +  make_edge (ENTRY_BLOCK_PTR, new_bb, EDGE_FALLTHRU);
>>>> +
>>>> +  /* XXX: Not sure if the edge commented below is necessary.  If I add this
>>>> +     edge, it fails in gimple_verify_flow_info in tree-cfg.c in condition :
>>>> +     " if (e->flags & EDGE_FALLTHRU)"
>>>> +     during -fprofile-generate.
>>>> +     Otherwise, it is fine.  Deleting this edge does not break anything.
>>>> +     Commenting this so that it is clear I am intentionally not doing this.*/
>>>> +  /* make_edge (new_bb, EXIT_BLOCK_PTR, EDGE_FALLTHRU); */
>>>> +
>>>> +  seq = gimple_seq_alloc_with_stmt (constructor_body);
>>>> +
>>>> +  set_bb_seq (new_bb, seq);
>>>> +  gimple_set_bb (constructor_body, new_bb);
>>>> +
>>>> +  /* Set the lexical block of the constructor body. Fails the inliner
>>>> +     other wise. */
>>>> +  gimple_set_block (constructor_body, DECL_INITIAL (decl));
>>>> +
>>>> +  /* This call is very important if this pass runs when the IR is in
>>>> +     SSA form.  It breaks things in strange ways otherwise. */
>>>> +  init_tree_ssa (DECL_STRUCT_FUNCTION (decl));
>>>> +  /* add_referenced_var (version_selector_var); */
>>>> +
>>>> +  cgraph_add_new_function (decl, true);
>>>> +  cgraph_call_function_insertion_hooks (cgraph_node (decl));
>>>> +  cgraph_mark_needed_node (cgraph_node (decl));
>>>> +
>>>> +  pop_cfun ();
>>>> +  current_function_decl = old_current_function_decl;
>>>> +  return decl;
>>>> +}
>>>> +
>>>> +/* FNDECL is a __builtin_target_* call that is folded into an integer defined
>>>> +   in libgcc/config/i386/i386-cpuinfo.c.
>>>> +
>>>> +   Returns a COMPONENT_REF that reads the one-bit field matching FNDECL's
>>>> +   function code, from __cpu_features for the "supports" builtins and
>>>> +   from __cpu_model for the "is" builtins.  Returns NULL_TREE for any
>>>> +   other function code.
>>>> +
>>>> +   The constructor decl, the two struct types and the two variable decls
>>>> +   are created on the first call and cached in function-local statics.
>>>> +   They are created before the function code is examined, so this also
>>>> +   happens on a call that ends up returning NULL_TREE. */
>>>> +
>>>> +static tree
>>>> +fold_builtin_target (tree fndecl)
>>>> +{
>>>> +  /* This is the order of bit-fields in __processor_features in
>>>> +     i386-cpuinfo.c */
>>>> +  enum processor_features
>>>> +  {
>>>> +    F_CMOV = 0,
>>>> +    F_MMX,
>>>> +    F_POPCNT,
>>>> +    F_SSE,
>>>> +    F_SSE2,
>>>> +    F_SSE3,
>>>> +    F_SSSE3,
>>>> +    F_SSE4_1,
>>>> +    F_SSE4_2,
>>>> +    F_MAX
>>>> +  };
>>>> +
>>>> +  /* This is the order of bit-fields in __processor_model in
>>>> +     i386-cpuinfo.c */
>>>> +  enum processor_model
>>>> +  {
>>>> +    M_AMD = 0,
>>>> +    M_INTEL,
>>>> +    M_COREI7_NEHALEM,
>>>> +    M_COREI7_WESTMERE,
>>>> +    M_COREI7_SANDYBRIDGE,
>>>> +    M_AMDFAM10_BARCELONA,
>>>> +    M_AMDFAM10_SHANGHAI,
>>>> +    M_AMDFAM10_ISTANBUL,
>>>> +    M_MAX
>>>> +  };
>>>> +
>>>> +  static tree __processor_features_type = NULL_TREE;
>>>> +  static tree __cpu_features_var = NULL_TREE;
>>>> +  static tree __processor_model_type = NULL_TREE;
>>>> +  static tree __cpu_model_var = NULL_TREE;
>>>> +  static tree ctor_decl = NULL_TREE;
>>>> +  /* Per-call scratch values; nothing is carried over between calls. */
>>>> +  tree field;
>>>> +  tree which_struct;
>>>> +
>>>> +  /* Make a call to __cpu_indicator_init in a constructor.
>>>> +     Function __cpu_indicator_init is defined in i386-cpuinfo.c. */
>>>> +  if (ctor_decl == NULL_TREE)
>>>> +   ctor_decl = make_constructor_to_get_target_type
>>>> +               ("__cpu_indicator_init_ctor");
>>>> +
>>>> +  if (__processor_features_type == NULL_TREE)
>>>> +    __processor_features_type = build_struct_with_one_bit_fields (F_MAX,
>>>> +                                 "__processor_features");
>>>> +
>>>> +  if (__processor_model_type == NULL_TREE)
>>>> +    __processor_model_type = build_struct_with_one_bit_fields (M_MAX,
>>>> +                                 "__processor_model");
>>>> +
>>>> +  if (__cpu_features_var == NULL_TREE)
>>>> +    __cpu_features_var = make_var_decl (__processor_features_type,
>>>> +                                       "__cpu_features");
>>>> +
>>>> +  if (__cpu_model_var == NULL_TREE)
>>>> +    __cpu_model_var = make_var_decl (__processor_model_type,
>>>> +                                    "__cpu_model");
>>>> +
>>>> +  /* Look at fndecl code to identify the field requested. */
>>>> +  switch (DECL_FUNCTION_CODE (fndecl))
>>>> +    {
>>>> +    case BUILT_IN_TARGET_SUPPORTS_CMOV:
>>>> +      field = get_field_from_struct (__processor_features_type, F_CMOV);
>>>> +      which_struct = __cpu_features_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_SUPPORTS_MMX:
>>>> +      field = get_field_from_struct (__processor_features_type, F_MMX);
>>>> +      which_struct = __cpu_features_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_SUPPORTS_POPCOUNT:
>>>> +      field = get_field_from_struct (__processor_features_type, F_POPCNT);
>>>> +      which_struct = __cpu_features_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_SUPPORTS_SSE:
>>>> +      field = get_field_from_struct (__processor_features_type, F_SSE);
>>>> +      which_struct = __cpu_features_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_SUPPORTS_SSE2:
>>>> +      field = get_field_from_struct (__processor_features_type, F_SSE2);
>>>> +      which_struct = __cpu_features_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_SUPPORTS_SSE3:
>>>> +      field = get_field_from_struct (__processor_features_type, F_SSE3);
>>>> +      which_struct = __cpu_features_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_SUPPORTS_SSSE3:
>>>> +      /* SSSE3 has its own bit; it must not alias the SSE3 field. */
>>>> +      field = get_field_from_struct (__processor_features_type, F_SSSE3);
>>>> +      which_struct = __cpu_features_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_SUPPORTS_SSE4_1:
>>>> +      field = get_field_from_struct (__processor_features_type, F_SSE4_1);
>>>> +      which_struct = __cpu_features_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_SUPPORTS_SSE4_2:
>>>> +      field = get_field_from_struct (__processor_features_type, F_SSE4_2);
>>>> +      which_struct = __cpu_features_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_IS_AMD:
>>>> +      field = get_field_from_struct (__processor_model_type, M_AMD);
>>>> +      which_struct = __cpu_model_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_IS_INTEL:
>>>> +      field = get_field_from_struct (__processor_model_type, M_INTEL);
>>>> +      which_struct = __cpu_model_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_IS_COREI7_NEHALEM:
>>>> +      field = get_field_from_struct (__processor_model_type, M_COREI7_NEHALEM);
>>>> +      which_struct = __cpu_model_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_IS_COREI7_WESTMERE:
>>>> +      field = get_field_from_struct (__processor_model_type, M_COREI7_WESTMERE);
>>>> +      which_struct = __cpu_model_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE:
>>>> +      field = get_field_from_struct (__processor_model_type, M_COREI7_SANDYBRIDGE);
>>>> +      which_struct = __cpu_model_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA:
>>>> +      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_BARCELONA);
>>>> +      which_struct = __cpu_model_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI:
>>>> +      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_SHANGHAI);
>>>> +      which_struct = __cpu_model_var;
>>>> +      break;
>>>> +    case BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL:
>>>> +      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_ISTANBUL);
>>>> +      which_struct = __cpu_model_var;
>>>> +      break;
>>>> +    default:
>>>> +      return NULL_TREE;
>>>> +    }
>>>> +
>>>> +  return build3 (COMPONENT_REF, TREE_TYPE (field), which_struct, field, NULL_TREE);
>>>> +}
>>>> +
>>>> +/* Folds __builtin_target_* builtins.  Installed as TARGET_FOLD_BUILTIN.
>>>> +   FNDECL is handed to fold_builtin_target when it is a BUILT_IN_NORMAL
>>>> +   decl whose name contains "__builtin_target"; every other decl yields
>>>> +   NULL_TREE.  N_ARGS, ARGS and IGNORE are unused. */
>>>> +
>>>> +static tree
>>>> +ix86_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED,
>>>> +                   tree *args ATTRIBUTE_UNUSED, bool ignore ATTRIBUTE_UNUSED)
>>>> +{
>>>> +  /* Test DECL_NAME before reading the identifier, as the caller-side
>>>> +     check in do_fold_builtin_target does, so a nameless decl is
>>>> +     rejected instead of dereferenced. */
>>>> +  if (DECL_NAME (fndecl) == NULL_TREE
>>>> +      || DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL
>>>> +      || strstr (IDENTIFIER_POINTER (DECL_NAME (fndecl)),
>>>> +                 "__builtin_target") == NULL)
>>>> +    return NULL_TREE;
>>>> +
>>>> +  return fold_builtin_target (fndecl);
>>>> +}
>>>> +
>>>>  /* Worker function for TARGET_SETUP_INCOMING_VARARGS.  */
>>>>
>>>>  static void
>>>> @@ -35097,6 +35431,9 @@ ix86_autovectorize_vector_sizes (void)
>>>>  #undef TARGET_BUILD_BUILTIN_VA_LIST
>>>>  #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
>>>>
>>>> +#undef TARGET_FOLD_BUILTIN
>>>> +#define TARGET_FOLD_BUILTIN ix86_fold_builtin
>>>> +
>>>>  #undef TARGET_ENUM_VA_LIST_P
>>>>  #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
>>>>
>>>>
>>>> --
>>>> This patch is available for review at http://codereview.appspot.com/4893046
>>>>
>>>
>>
>
Richard Henderson Aug. 18, 2011, 9:15 p.m. UTC | #14
On 08/18/2011 10:25 AM, Sriraman Tallam wrote:
> Ok, so two things. I create the constructor as a comdat. So, it is
> created by gcc in every module but at link time only one copy will be
> kept. So, it is going to be called only once and that is not a
> problem.

Err, no.  You'll wind up with one copy of the constructor
which will be called N times.

The comdat applies to the function body, not the data in
the .ctors section.


r~
Sriraman Tallam Aug. 18, 2011, 9:51 p.m. UTC | #15
On Thu, Aug 18, 2011 at 2:15 PM, Richard Henderson <rth@redhat.com> wrote:
> On 08/18/2011 10:25 AM, Sriraman Tallam wrote:
>> Ok, so two things. I create the constructor as a comdat. So, it is
>> created by gcc in every module but at link time only one copy will be
>> kept. So, it is going to be called only once and that is not a
>> problem.
>
> Err, no.  You'll wind up with one copy of the constructor
> which will be called N times.
>
> The comdat applies to the function body, not the data in
> the .ctors section.

Oh!, right, sorry. So, the only available option now is to mark it as
a constructor in libgcc.

Thanks.
-Sri.

>
>
> r~
>
Richard Henderson Aug. 18, 2011, 10:08 p.m. UTC | #16
On 08/18/2011 02:51 PM, Sriraman Tallam wrote:
> Oh!, right, sorry. So, the only available option now is to mark it as
> a constructor in libgcc.

Or call it explicitly from the out-of-line tests.

The thing is, if you intend to use this from ifunc tests, I believe
that these can run *extremely* early.  E.g. LD_BIND_NOW=1 will run
these while relocating the entire application, and therefore before
any of DT_INIT (aka .ctors), DT_INIT_ARRAY, or DT_PREINIT_ARRAY.


r~
Richard Biener Aug. 19, 2011, 9:04 a.m. UTC | #17
On Fri, Aug 19, 2011 at 12:08 AM, Richard Henderson <rth@redhat.com> wrote:
> On 08/18/2011 02:51 PM, Sriraman Tallam wrote:
>> Oh!, right, sorry. So, the only available option now is to mark it as
>> a constructor in libgcc.
>
> Or call it explicitly from the out-of-line tests.
>
> The thing is, if you intend to use this from ifunc tests, I believe
> that these can run *extremely* early.  E.g. LD_BIND_NOW=1 will run
> these while relocating the entire application, and therefore before
> any of DT_INIT (aka .ctors), DT_INIT_ARRAY, or DT_PREINIT_ARRAY.

So make sure that __cpu_indicator initially has a conservative correct
value?  I'd still prefer the constructor-in-libgcc option - if only because
then the compiler-side is much simplified.

Richard.

>
> r~
>
Jakub Jelinek Aug. 19, 2011, 9:09 a.m. UTC | #18
On Fri, Aug 19, 2011 at 11:04:11AM +0200, Richard Guenther wrote:
> On Fri, Aug 19, 2011 at 12:08 AM, Richard Henderson <rth@redhat.com> wrote:
> > On 08/18/2011 02:51 PM, Sriraman Tallam wrote:
> >> Oh!, right, sorry. So, the only available option now is to mark it as
> >> a constructor in libgcc.
> >
> > Or call it explicitly from the out-of-line tests.
> >
> > The thing is, if you intend to use this from ifunc tests, I believe
> > that these can run *extremely* early.  E.g. LD_BIND_NOW=1 will run
> > these while relocating the entire application, and therefore before
> > any of DT_INIT (aka .ctors), DT_INIT_ARRAY, or DT_PREINIT_ARRAY.
> 
> So make sure that __cpu_indicator initially has a conservative correct
> value?  I'd still prefer the constructor-in-libgcc option - if only because
> then the compiler-side is much simplified.

Note that exporting data from shared libraries and using those in binaries
often leads to copy relocations (which are possibly still not applied when
calling IFUNC functions with LD_BIND_NOW=1).  Similarly calling a function
in a different shared library might be a problem from IFUNC handler.

	Jakub
Richard Henderson Aug. 20, 2011, 9:02 p.m. UTC | #19
On 08/19/2011 02:04 AM, Richard Guenther wrote:
> So make sure that __cpu_indicator initially has a conservative correct
> value?  I'd still prefer the constructor-in-libgcc option - if only because
> then the compiler-side is much simplified.
> 

Err, I thought __cpu_indicator was a function, not data.

I think we need to discuss this more...


r~
H.J. Lu Aug. 20, 2011, 9:16 p.m. UTC | #20
On Sat, Aug 20, 2011 at 2:02 PM, Richard Henderson <rth@redhat.com> wrote:
> On 08/19/2011 02:04 AM, Richard Guenther wrote:
>> So make sure that __cpu_indicator initially has a conservative correct
>> value?  I'd still prefer the constructor-in-libgcc option - if only because
>> then the compiler-side is much simplified.
>>
>
> Err, I thought __cpu_indicator was a function, not data.
>
> I think we need to discuss this more...
>

In glibc, we export function __get_cpu_features as a private interface
used for IFUNC.  We can do something similar with libgcc very carefully.
Richard Biener Aug. 21, 2011, 9:04 a.m. UTC | #21
On Sat, Aug 20, 2011 at 11:02 PM, Richard Henderson <rth@redhat.com> wrote:
> On 08/19/2011 02:04 AM, Richard Guenther wrote:
>> So make sure that __cpu_indicator initially has a conservative correct
>> value?  I'd still prefer the constructor-in-libgcc option - if only because
>> then the compiler-side is much simplified.
>>
>
> Err, I thought __cpu_indicator was a function, not data.
>
> I think we need to discuss this more...

Oh, I thought it was data initialized by the constructor ...

>
> r~
>
Michael Matz Aug. 22, 2011, 2:07 p.m. UTC | #22
Hi,

On Sun, 21 Aug 2011, Richard Guenther wrote:

> On Sat, Aug 20, 2011 at 11:02 PM, Richard Henderson <rth@redhat.com> wrote:
> > On 08/19/2011 02:04 AM, Richard Guenther wrote:
> >> So make sure that __cpu_indicator initially has a conservative correct
> >> value?  I'd still prefer the constructor-in-libgcc option - if only because
> >> then the compiler-side is much simplified.
> >>
> >
> > Err, I thought __cpu_indicator was a function, not data.
> >
> > I think we need to discuss this more...
> 
> Oh, I thought it was data initialized by the constructor ...

Sriraman's patch right now has a function __cpu_indicator_init which is 
called from (adhoc constructed) ctors and that initializes variables
__cpu_model and __cpu_features ;-)  There's no __cpu_indicator symbol :)

I think the whole initializer function and the associated data blobs have 
to sit in static libgcc and be hidden.  By that all shared modules 
will have their own copies of the model and features (and the initializer 
function) so there won't be issues with copy relocs, or cross shared lib 
calls while relocating the modules.  Dynamically they will contain the 
same data always, but it's not many bytes, and only modules making use of 
this facility will pay it.

The initializer function has to be callable from pre-.init contexts, e.g.  
ifunc dispatchers.  And to make life easier there should be one ctor 
function calling this initializer function too, so that normal code can 
rely on it being already called saving one check.


Ciao,
Michael.
H.J. Lu Aug. 22, 2011, 2:11 p.m. UTC | #23
On Mon, Aug 22, 2011 at 7:07 AM, Michael Matz <matz@suse.de> wrote:
> Hi,
>
> On Sun, 21 Aug 2011, Richard Guenther wrote:
>
>> On Sat, Aug 20, 2011 at 11:02 PM, Richard Henderson <rth@redhat.com> wrote:
>> > On 08/19/2011 02:04 AM, Richard Guenther wrote:
>> >> So make sure that __cpu_indicator initially has a conservative correct
>> >> value?  I'd still prefer the constructor-in-libgcc option - if only because
>> >> then the compiler-side is much simplified.
>> >>
>> >
>> > Err, I thought __cpu_indicator was a function, not data.
>> >
>> > I think we need to discuss this more...
>>
>> Oh, I thought it was data initialized by the constructor ...
>
> Sriramans patch right now has a function __cpu_indicator_init which is
> called from (adhoc constructed) ctors and that initializes variables
> __cpu_model and __cpu_features ;-)  There's no __cpu_indicator symbol :)
>
> I think the whole initializer function and the associated data blobs have
> to sit in static libgcc and be hidden.  By that all shared modules
> will have their own copies of the model and features (and the initializer
> function) so there won't be issues with copy relocs, or cross shared lib
> calls while relocating the modules.  Dynamically they will contain the
> same data always, but it's not many bytes, and only modules making use of
> this facility will pay it.
>
> The initializer function has to be callable from pre-.init contexts, e.g.
> ifunc dispatchers.  And to make life easier there should be one ctor
> function calling this initializer function too, so that normal code can
> rely on it being already called saving one check.
>

It sounds more complicated than necessary.  Why not just do it
on demand like glibc does?
Michael Matz Aug. 22, 2011, 3:56 p.m. UTC | #24
Hi,

On Mon, 22 Aug 2011, H.J. Lu wrote:

> >> Oh, I thought it was data initialized by the constructor ...
> >
> > Sriramans patch right now has a function __cpu_indicator_init which is 
> > called from (adhoc constructed) ctors and that initializes variables
> > __cpu_model and __cpu_features ;-)  There's no __cpu_indicator symbol :)
> >
> > I think the whole initializer function and the associated data blobs have
> > to sit in static libgcc and be hidden.  By that all shared modules
> > will have their own copies of the model and features (and the initializer
> > function) so there won't be issues with copy relocs, or cross shared lib
> > calls while relocating the modules.  Dynamically they will contain the
> > same data always, but it's not many bytes, and only modules making use of
> > this facility will pay it.
> >
> > The initializer function has to be callable from pre-.init contexts, e.g.
> > ifunc dispatchers.  And to make life easier there should be one ctor
> > function calling this initializer function too, so that normal code can
> > rely on it being already called saving one check.
> >
> 
> It sounds more complicated than necessary.  Why not just do it
> on demand like glibc does?

Ehm, the only difference would be to not have a ctor in libgcc that looks 
like so:

void __attribute__((constructor)) bla(void)
{
  __cpu_indicator_init ();
}

I don't see any complication?


Ciao,
Michael.
H.J. Lu Aug. 22, 2011, 4:02 p.m. UTC | #25
On Mon, Aug 22, 2011 at 8:56 AM, Michael Matz <matz@suse.de> wrote:
> Hi,
>
> On Mon, 22 Aug 2011, H.J. Lu wrote:
>
>> >> Oh, I thought it was data initialized by the constructor ...
>> >
>> > Sriramans patch right now has a function __cpu_indicator_init which is
>> > called from (adhoc constructed) ctors and that initializes variables
>> > __cpu_model and __cpu_features ;-)  There's no __cpu_indicator symbol :)
>> >
>> > I think the whole initializer function and the associated data blobs have
>> > to sit in static libgcc and be hidden.  By that all shared modules
>> > will have their own copies of the model and features (and the initializer
>> > function) so there won't be issues with copy relocs, or cross shared lib
>> > calls while relocating the modules.  Dynamically they will contain the
>> > same data always, but it's not many bytes, and only modules making use of
>> > this facility will pay it.
>> >
>> > The initializer function has to be callable from pre-.init contexts, e.g.
>> > ifunc dispatchers.  And to make life easier there should be one ctor
>> > function calling this initializer function too, so that normal code can
>> > rely on it being already called saving one check.
>> >
>>
>> It sounds more complicated than necessary.  Why not just do it
>> on demand like glibc does?
>
> Ehm, the only difference would be to not have a ctor in libgcc that looks
> like so:
>
> void __attribute__((constructor)) bla(void)
> {
>  __cpu_indicator_init ();
> }
>
> I don't see any complication.?
>

Order of constructors.  A constructor may call functions
which use __cpu_indicator.
Sriraman Tallam Aug. 22, 2011, 6:50 p.m. UTC | #26
On Mon, Aug 22, 2011 at 9:02 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, Aug 22, 2011 at 8:56 AM, Michael Matz <matz@suse.de> wrote:
>> Hi,
>>
>> On Mon, 22 Aug 2011, H.J. Lu wrote:
>>
>>> >> Oh, I thought it was data initialized by the constructor ...
>>> >
>>> > Sriramans patch right now has a function __cpu_indicator_init which is
>>> > called from (adhoc constructed) ctors and that initializes variables
>>> > __cpu_model and __cpu_features ;-)  There's no __cpu_indicator symbol :)
>>> >
>>> > I think the whole initializer function and the associated data blobs have
>>> > to sit in static libgcc and be hidden.  By that all shared modules
>>> > will have their own copies of the model and features (and the initializer
>>> > function) so there won't be issues with copy relocs, or cross shared lib
>>> > calls while relocating the modules.  Dynamically they will contain the
>>> > same data always, but it's not many bytes, and only modules making use of
>>> > this facility will pay it.
>>> >
>>> > The initializer function has to be callable from pre-.init contexts, e.g.
>>> > ifunc dispatchers.  And to make life easier there should be one ctor
>>> > function calling this initializer function too, so that normal code can
>>> > rely on it being already called saving one check.
>>> >
>>>
>>> It sounds more complicated than necessary.  Why not just do it
>>> on demand like glibc does?
>>
>> Ehm, the only difference would be to not have a ctor in libgcc that looks
>> like so:
>>
>> void __attribute__((constructor)) bla(void)
>> {
>>  __cpu_indicator_init ();
>> }
>>
>> I don't see any complication.?
>>
>
> Order of constructors.  A constructor may call functions
> which use __cpu_indicator.

I have a suggestion that is a hybrid of the proposed solutions here:

1) Make a constructor in every module that calls
"__cpu_indicator_init" and make it the first constructor to run.
 Will this solve the ordering problem?
2) Change __cpu_indicator_init to run only once by using a variable to
check if it has been run before.

So, each module's constructor will call __cpu_indicator_init but the
CPUID insns are only done once. I also avoid the extra overhead of
having to check if "__cpu_indicator_init" is called from within the
binary. Will this work?

Thanks,
-Sri.

>
> --
> H.J.
>
H.J. Lu Aug. 22, 2011, 6:58 p.m. UTC | #27
On Mon, Aug 22, 2011 at 11:50 AM, Sriraman Tallam <tmsriram@google.com> wrote:
> On Mon, Aug 22, 2011 at 9:02 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Mon, Aug 22, 2011 at 8:56 AM, Michael Matz <matz@suse.de> wrote:
>>> Hi,
>>>
>>> On Mon, 22 Aug 2011, H.J. Lu wrote:
>>>
>>>> >> Oh, I thought it was data initialized by the constructor ...
>>>> >
>>>> > Sriramans patch right now has a function __cpu_indicator_init which is
>>>> > called from (adhoc constructed) ctors and that initializes variables
>>>> > __cpu_model and __cpu_features ;-)  There's no __cpu_indicator symbol :)
>>>> >
>>>> > I think the whole initializer function and the associated data blobs have
>>>> > to sit in static libgcc and be hidden.  By that all shared modules
>>>> > will have their own copies of the model and features (and the initializer
>>>> > function) so there won't be issues with copy relocs, or cross shared lib
>>>> > calls while relocating the modules.  Dynamically they will contain the
>>>> > same data always, but it's not many bytes, and only modules making use of
>>>> > this facility will pay it.
>>>> >
>>>> > The initializer function has to be callable from pre-.init contexts, e.g.
>>>> > ifunc dispatchers.  And to make life easier there should be one ctor
>>>> > function calling this initializer function too, so that normal code can
>>>> > rely on it being already called saving one check.
>>>> >
>>>>
>>>> It sounds more complicated than necessary.  Why not just do it
>>>> on demand like glibc does?
>>>
>>> Ehm, the only difference would be to not have a ctor in libgcc that looks
>>> like so:
>>>
>>> void __attribute__((constructor)) bla(void)
>>> {
>>>  __cpu_indicator_init ();
>>> }
>>>
>>> I don't see any complication.?
>>>
>>
>> Order of constructors.  A constructor may call functions
>> which use __cpu_indicator.
>
> I have a suggestion that is a hybrid of the proposed solutions here:
>
> 1) Make a constructor in every module that calls
> "__cpu_indicator_init" and make it to be the first constructor to run.
>  Will this solve the ordering problem?
> 2) Change __cpu_indicator_init to run only once by using a variable to
> check if it has been run before.
>
> So, each module's constructor will call __cpu_indicator_init but the
> CPUID insns are only done once. I also avoid the extra overhead of
> having to check if "__cpu_indicator_init" is called from within the
> binary. Will this work?
>

Please make it simple like

if __cpu_indicator is not initialized then
    call __cpu_indicator_init
fi

use  __cpu_indicator
Sriraman Tallam Aug. 22, 2011, 7:02 p.m. UTC | #28
On Mon, Aug 22, 2011 at 11:58 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, Aug 22, 2011 at 11:50 AM, Sriraman Tallam <tmsriram@google.com> wrote:
>> On Mon, Aug 22, 2011 at 9:02 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>> On Mon, Aug 22, 2011 at 8:56 AM, Michael Matz <matz@suse.de> wrote:
>>>> Hi,
>>>>
>>>> On Mon, 22 Aug 2011, H.J. Lu wrote:
>>>>
>>>>> >> Oh, I thought it was data initialized by the constructor ...
>>>>> >
>>>>> > Sriramans patch right now has a function __cpu_indicator_init which is
>>>>> > called from (adhoc constructed) ctors and that initializes variables
>>>>> > __cpu_model and __cpu_features ;-)  There's no __cpu_indicator symbol :)
>>>>> >
>>>>> > I think the whole initializer function and the associated data blobs have
>>>>> > to sit in static libgcc and be hidden.  By that all shared modules
>>>>> > will have their own copies of the model and features (and the initializer
>>>>> > function) so there won't be issues with copy relocs, or cross shared lib
>>>>> > calls while relocating the modules.  Dynamically they will contain the
>>>>> > same data always, but it's not many bytes, and only modules making use of
>>>>> > this facility will pay it.
>>>>> >
>>>>> > The initializer function has to be callable from pre-.init contexts, e.g.
>>>>> > ifunc dispatchers.  And to make life easier there should be one ctor
>>>>> > function calling this initializer function too, so that normal code can
>>>>> > rely on it being already called saving one check.
>>>>> >
>>>>>
>>>>> It sounds more complicated than necessary.  Why not just do it
>>>>> on demand like glibc does?
>>>>
>>>> Ehm, the only difference would be to not have a ctor in libgcc that looks
>>>> like so:
>>>>
>>>> void __attribute__((constructor)) bla(void)
>>>> {
>>>>  __cpu_indicator_init ();
>>>> }
>>>>
>>>> I don't see any complication.?
>>>>
>>>
>>> Order of constructors.  A constructor may call functions
>>> which use __cpu_indicator.
>>
>> I have a suggestion that is a hybrid of the proposed solutions here:
>>
>> 1) Make a constructor in every module that calls
>> "__cpu_indicator_init" and make it to be the first constructor to run.
>>  Will this solve the ordering problem?
>> 2) Change __cpu_indicator_init to run only once by using a variable to
>> check if it has been run before.
>>
>> So, each module's constructor will call __cpu_indicator_init but the
>> CPUID insns are only done once. I also avoid the extra overhead of
>> having to check if "__cpu_indicator_init" is called from within the
>> binary. Will this work?
>>
>
> Please make it simple like
>
> if __cpu_indicator is not initialized then
>    call __cpu_indicator_init
> fi
>
> use  __cpu_indicator
>

Will do, thanks.

-Sri.

>
> --
> H.J.
>
Richard Biener Aug. 22, 2011, 8:34 p.m. UTC | #29
On Mon, Aug 22, 2011 at 6:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, Aug 22, 2011 at 8:56 AM, Michael Matz <matz@suse.de> wrote:
>> Hi,
>>
>> On Mon, 22 Aug 2011, H.J. Lu wrote:
>>
>>> >> Oh, I thought it was data initialized by the constructor ...
>>> >
>>> > Sriramans patch right now has a function __cpu_indicator_init which is
>>> > called from (adhoc constructed) ctors and that initializes variables
>>> > __cpu_model and __cpu_features ;-)  There's no __cpu_indicator symbol :)
>>> >
>>> > I think the whole initializer function and the associated data blobs have
>>> > to sit in static libgcc and be hidden.  By that all shared modules
>>> > will have their own copies of the model and features (and the initializer
>>> > function) so there won't be issues with copy relocs, or cross shared lib
>>> > calls while relocating the modules.  Dynamically they will contain the
>>> > same data always, but it's not many bytes, and only modules making use of
>>> > this facility will pay it.
>>> >
>>> > The initializer function has to be callable from pre-.init contexts, e.g.
>>> > ifunc dispatchers.  And to make life easier there should be one ctor
>>> > function calling this initializer function too, so that normal code can
>>> > rely on it being already called saving one check.
>>> >
>>>
>>> It sounds more complicated than necessary.  Why not just do it
>>> on demand like glibc does?
>>
>> Ehm, the only difference would be to not have a ctor in libgcc that looks
>> like so:
>>
>> void __attribute__((constructor)) bla(void)
>> {
>>  __cpu_indicator_init ();
>> }
>>
>> I don't see any complication.?
>>
>
> Order of constructors.  A constructor may call functions
> which use __cpu_indicator.

As I said - make __cpu_indicator have a conservative
default value (zero).  It is irrelevant if constructors that
run before initializing __cpu_indicator run with the
default CPU capabilities.

Richard.

> --
> H.J.
>
H.J. Lu Aug. 22, 2011, 8:39 p.m. UTC | #30
On Mon, Aug 22, 2011 at 1:34 PM, Richard Guenther
<richard.guenther@gmail.com> wrote:
> On Mon, Aug 22, 2011 at 6:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Mon, Aug 22, 2011 at 8:56 AM, Michael Matz <matz@suse.de> wrote:
>>> Hi,
>>>
>>> On Mon, 22 Aug 2011, H.J. Lu wrote:
>>>
>>>> >> Oh, I thought it was data initialized by the constructor ...
>>>> >
>>>> > Sriramans patch right now has a function __cpu_indicator_init which is
>>>> > called from (adhoc constructed) ctors and that initializes variables
>>>> > __cpu_model and __cpu_features ;-)  There's no __cpu_indicator symbol :)
>>>> >
>>>> > I think the whole initializer function and the associated data blobs have
>>>> > to sit in static libgcc and be hidden.  By that all shared modules
>>>> > will have their own copies of the model and features (and the initializer
>>>> > function) so there won't be issues with copy relocs, or cross shared lib
>>>> > calls while relocating the modules.  Dynamically they will contain the
>>>> > same data always, but it's not many bytes, and only modules making use of
>>>> > this facility will pay it.
>>>> >
>>>> > The initializer function has to be callable from pre-.init contexts, e.g.
>>>> > ifunc dispatchers.  And to make life easier there should be one ctor
>>>> > function calling this initializer function too, so that normal code can
>>>> > rely on it being already called saving one check.
>>>> >
>>>>
>>>> It sounds more complicated than necessary.  Why not just do it
>>>> on demand like glibc does?
>>>
>>> Ehm, the only difference would be to not have a ctor in libgcc that looks
>>> like so:
>>>
>>> void __attribute__((constructor)) bla(void)
>>> {
>>>  __cpu_indicator_init ();
>>> }
>>>
>>> I don't see any complication.?
>>>
>>
>> Order of constructors.  A constructor may call functions
>> which use __cpu_indicator.
>
> As I said - make __cpu_indicator have a conservative
> default value (zero).  It is irrelevant if constructors that
> run before initializing __cpu_indicator run with the
> default CPU capabilities.
>

If  IFUNC is used, this just disables IFUNC for those functions
called with the conservative default value since they are only
resolved once.
Richard Biener Aug. 22, 2011, 8:46 p.m. UTC | #31
On Mon, Aug 22, 2011 at 10:39 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, Aug 22, 2011 at 1:34 PM, Richard Guenther
> <richard.guenther@gmail.com> wrote:
>> On Mon, Aug 22, 2011 at 6:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>> On Mon, Aug 22, 2011 at 8:56 AM, Michael Matz <matz@suse.de> wrote:
>>>> Hi,
>>>>
>>>> On Mon, 22 Aug 2011, H.J. Lu wrote:
>>>>
>>>>> >> Oh, I thought it was data initialized by the constructor ...
>>>>> >
>>>>> > Sriramans patch right now has a function __cpu_indicator_init which is
>>>>> > called from (adhoc constructed) ctors and that initializes variables
>>>>> > __cpu_model and __cpu_features ;-)  There's no __cpu_indicator symbol :)
>>>>> >
>>>>> > I think the whole initializer function and the associated data blobs have
>>>>> > to sit in static libgcc and be hidden.  By that all shared modules
>>>>> > will have their own copies of the model and features (and the initializer
>>>>> > function) so there won't be issues with copy relocs, or cross shared lib
>>>>> > calls while relocating the modules.  Dynamically they will contain the
>>>>> > same data always, but it's not many bytes, and only modules making use of
>>>>> > this facility will pay it.
>>>>> >
>>>>> > The initializer function has to be callable from pre-.init contexts, e.g.
>>>>> > ifunc dispatchers.  And to make life easier there should be one ctor
>>>>> > function calling this initializer function too, so that normal code can
>>>>> > rely on it being already called saving one check.
>>>>> >
>>>>>
>>>>> It sounds more complicated than necessary.  Why not just do it
>>>>> on demand like glibc does?
>>>>
>>>> Ehm, the only difference would be to not have a ctor in libgcc that looks
>>>> like so:
>>>>
>>>> void __attribute__((constructor)) bla(void)
>>>> {
>>>>  __cpu_indicator_init ();
>>>> }
>>>>
>>>> I don't see any complication.?
>>>>
>>>
>>> Order of constructors.  A constructor may call functions
>>> which use __cpu_indicator.
>>
>> As I said - make __cpu_indicator have a conservative
>> default value (zero).  It is irrelevant if constructors that
>> run before initializing __cpu_indicator run with the
>> default CPU capabilities.
>>
>
> If  IFUNC is used, this just disables IFUNC for those functions
> called with the conservative default value since they are only
> resolved once.

Huh, well.  So what happens if you use __cpu_indicator from the
IFUNC selector function!?  Honestly, if we care about these
corner-cases why not make __cpu_indicator a hidden function
instead.

IMHO IFUNC selectors should simply do

if (!__cpu_indicator)
  __cpu_indicator_init ();

Richard.

>
> --
> H.J.
>
H.J. Lu Aug. 22, 2011, 8:48 p.m. UTC | #32
On Mon, Aug 22, 2011 at 1:46 PM, Richard Guenther
<richard.guenther@gmail.com> wrote:
> On Mon, Aug 22, 2011 at 10:39 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>> On Mon, Aug 22, 2011 at 1:34 PM, Richard Guenther
>> <richard.guenther@gmail.com> wrote:
>>> On Mon, Aug 22, 2011 at 6:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>> On Mon, Aug 22, 2011 at 8:56 AM, Michael Matz <matz@suse.de> wrote:
>>>>> Hi,
>>>>>
>>>>> On Mon, 22 Aug 2011, H.J. Lu wrote:
>>>>>
>>>>>> >> Oh, I thought it was data initialized by the constructor ...
>>>>>> >
>>>>>> > Sriramans patch right now has a function __cpu_indicator_init which is
>>>>>> > called from (adhoc constructed) ctors and that initializes variables
>>>>>> > __cpu_model and __cpu_features ;-)  There's no __cpu_indicator symbol :)
>>>>>> >
>>>>>> > I think the whole initializer function and the associated data blobs have
>>>>>> > to sit in static libgcc and be hidden.  By that all shared modules
>>>>>> > will have their own copies of the model and features (and the initializer
>>>>>> > function) so there won't be issues with copy relocs, or cross shared lib
>>>>>> > calls while relocating the modules.  Dynamically they will contain the
>>>>>> > same data always, but it's not many bytes, and only modules making use of
>>>>>> > this facility will pay it.
>>>>>> >
>>>>>> > The initializer function has to be callable from pre-.init contexts, e.g.
>>>>>> > ifunc dispatchers.  And to make life easier there should be one ctor
>>>>>> > function calling this initializer function too, so that normal code can
>>>>>> > rely on it being already called saving one check.
>>>>>> >
>>>>>>
>>>>>> It sounds more complicated than necessary.  Why not just do it
>>>>>> on demand like glibc does?
>>>>>
>>>>> Ehm, the only difference would be to not have a ctor in libgcc that looks
>>>>> like so:
>>>>>
>>>>> void __attribute__((constructor)) bla(void)
>>>>> {
>>>>>  __cpu_indicator_init ();
>>>>> }
>>>>>
>>>>> I don't see any complication.?
>>>>>
>>>>
>>>> Order of constructors.  A constructor may call functions
>>>> which use __cpu_indicator.
>>>
>>> As I said - make __cpu_indicator have a conservative
>>> default value (zero).  It is irrelevant if constructors that
>>> run before initializing __cpu_indicator run with the
>>> default CPU capabilities.
>>>
>>
>> If  IFUNC is used, this just disables IFUNC for those functions
>> called with the conservative default value since they are only
>> resolved once.
>
> Huh, well.  So what happens if you use __cpu_indicator from the
> IFUNC selector function!?  Honestly, if we care about these
> corner-cases why not make __cpu_indicator a hidden function
> instead.
>
> IMHO IFUNC selectors should simply do
>
> if (!__cpu_indicator)
>  __cpu_indicator_init ();
>

Isn't it what I said before?
Richard Biener Aug. 22, 2011, 8:54 p.m. UTC | #33
On Mon, Aug 22, 2011 at 10:48 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, Aug 22, 2011 at 1:46 PM, Richard Guenther
> <richard.guenther@gmail.com> wrote:
>> On Mon, Aug 22, 2011 at 10:39 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>> On Mon, Aug 22, 2011 at 1:34 PM, Richard Guenther
>>> <richard.guenther@gmail.com> wrote:
>>>> On Mon, Aug 22, 2011 at 6:02 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
>>>>> On Mon, Aug 22, 2011 at 8:56 AM, Michael Matz <matz@suse.de> wrote:
>>>>>> Hi,
>>>>>>
>>>>>> On Mon, 22 Aug 2011, H.J. Lu wrote:
>>>>>>
>>>>>>> >> Oh, I thought it was data initialized by the constructor ...
>>>>>>> >
>>>>>>> > Sriramans patch right now has a function __cpu_indicator_init which is
>>>>>>> > called from (adhoc constructed) ctors and that initializes variables
>>>>>>> > __cpu_model and __cpu_features ;-)  There's no __cpu_indicator symbol :)
>>>>>>> >
>>>>>>> > I think the whole initializer function and the associated data blobs have
>>>>>>> > to sit in static libgcc and be hidden.  By that all shared modules
>>>>>>> > will have their own copies of the model and features (and the initializer
>>>>>>> > function) so there won't be issues with copy relocs, or cross shared lib
>>>>>>> > calls while relocating the modules.  Dynamically they will contain the
>>>>>>> > same data always, but it's not many bytes, and only modules making use of
>>>>>>> > this facility will pay it.
>>>>>>> >
>>>>>>> > The initializer function has to be callable from pre-.init contexts, e.g.
>>>>>>> > ifunc dispatchers.  And to make life easier there should be one ctor
>>>>>>> > function calling this initializer function too, so that normal code can
>>>>>>> > rely on it being already called saving one check.
>>>>>>> >
>>>>>>>
>>>>>>> It sounds more complicated than necessary.  Why not just do it
>>>>>>> on demand like glibc does?
>>>>>>
>>>>>> Ehm, the only difference would be to not have a ctor in libgcc that looks
>>>>>> like so:
>>>>>>
>>>>>> void __attribute__((constructor)) bla(void)
>>>>>> {
>>>>>>  __cpu_indicator_init ();
>>>>>> }
>>>>>>
>>>>>> I don't see any complication.?
>>>>>>
>>>>>
>>>>> Order of constructors.  A constructor may call functions
>>>>> which use __cpu_indicator.
>>>>
>>>> As I said - make __cpu_indicator have a conservative
>>>> default value (zero).  It is irrelevant if constructors that
>>>> run before initializing __cpu_indicator run with the
>>>> default CPU capabilities.
>>>>
>>>
>>> If  IFUNC is used, this just disables IFUNC for those functions
>>> called with the conservative default value since they are only
>>> resolved once.
>>
>> Huh, well.  So what happens if you use __cpu_indicator from the
>> IFUNC selector function!?  Honestly, if we care about these
>> corner-cases why not make __cpu_indicator a hidden function
>> instead.
>>
>> IMHO IFUNC selectors should simply do
>>
>> if (!__cpu_indicator)
>>  __cpu_indicator_init ();
>>
>
> Isn't it what I said before?

Not in the quoted parts.  What I don't want is a constructor in each module.
Keep a single one in libgcc and document the __cpu_indicator usage
restrictions.

Richard.

> --
> H.J.
>
Michael Matz Aug. 23, 2011, 11:35 a.m. UTC | #34
Hi,

On Mon, 22 Aug 2011, H.J. Lu wrote:

> > void __attribute__((constructor)) bla(void)
> > {
> >  __cpu_indicator_init ();
> > }
> >
> > I don't see any complication.?
> >
> 
> Order of constructors.  A constructor may call functions
> which use __cpu_indicator.

That's why I wrote also:

> The initializer function has to be callable from pre-.init contexts, e.g.
> ifunc dispatchers.

It obviously has to be guarded against multiple calls.  The ctor in libgcc 
would be mere convenience because then non-ctor code can rely on the data 
being initialized, and only (potential) ctor code has to check and call 
the init function on demand.


Ciao,
Michael.
diff mbox

Patch

Index: libgcc/config.host
===================================================================
--- libgcc/config.host	(revision 177767)
+++ libgcc/config.host	(working copy)
@@ -609,7 +609,7 @@  case ${host} in
 i[34567]86-*-linux* | x86_64-*-linux* | \
   i[34567]86-*-kfreebsd*-gnu | i[34567]86-*-knetbsd*-gnu | \
   i[34567]86-*-gnu*)
-	tmake_file="${tmake_file} t-tls"
+	tmake_file="${tmake_file} t-tls i386/t-cpuinfo"
 	if test "$libgcc_cv_cfi" = "yes"; then
 		tmake_file="${tmake_file} t-stack i386/t-stack-i386"
 	fi
Index: libgcc/config/i386/t-cpuinfo
===================================================================
--- libgcc/config/i386/t-cpuinfo	(revision 0)
+++ libgcc/config/i386/t-cpuinfo	(revision 0)
@@ -0,0 +1,2 @@ 
+# Add the run-time CPU detection source to the libgcc build (LIB2ADD).
+LIB2ADD += $(srcdir)/config/i386/i386-cpuinfo.c
Index: libgcc/config/i386/i386-cpuinfo.c
===================================================================
--- libgcc/config/i386/i386-cpuinfo.c	(revision 0)
+++ libgcc/config/i386/i386-cpuinfo.c	(revision 0)
@@ -0,0 +1,275 @@ 
+/* Copyright (C) 2011 Free Software Foundation, Inc.
+ * Contributed by Sriraman Tallam <tmsriram@google.com>.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 3, or (at your option) any
+ * later version.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Under Section 7 of GPL version 3, you are granted additional
+ * permissions described in the GCC Runtime Library Exception, version
+ * 3.1, as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License and
+ * a copy of the GCC Runtime Library Exception along with this program;
+ * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ *
+ *
+ * This code is adapted from gcc/config/i386/driver-i386.c. The CPUID
+ * instruction is used to figure out the cpu type and supported features.
+ * GCC runs __cpu_indicator_init from a constructor which sets the members
+ * of __cpu_model and __cpu_features.
+ */
+
+#include <string.h>
+
+#ifdef __GNUC__
+#include "cpuid.h"
+
+enum processor_type /* Values stored in __cpu_type by the detection code.  */
+{
+  PROCESSOR_PENTIUM = 0,
+  PROCESSOR_CORE2,
+  PROCESSOR_COREI7_NEHALEM,
+  PROCESSOR_COREI7_WESTMERE,
+  PROCESSOR_COREI7_SANDYBRIDGE,
+  PROCESSOR_INTEL_GENERIC, /* Intel part not matched more specifically.  */
+  PROCESSOR_AMDFAM10_BARCELONA,
+  PROCESSOR_AMDFAM10_SHANGHAI,
+  PROCESSOR_AMDFAM10_ISTANBUL,
+  PROCESSOR_AMDFAM10_GENERIC, /* AMD family 0x10, model not listed.  */
+  PROCESSOR_AMD_GENERIC, /* AMD family other than 0x10.  */
+  PROCESSOR_GENERIC, /* Initial value of __cpu_type.  */
+  PROCESSOR_max /* NOTE(review): presumably a count sentinel; unused here.  */
+};
+
+enum vendor_signatures /* Compared against EBX returned by CPUID level 0.  */
+{
+  SIG_INTEL =	0x756e6547 /* Genu */,
+  SIG_AMD =	0x68747541 /* Auth */
+};
+
+
+/* Features supported.  Each member is a single-bit flag;
+   get_available_features sets it from the ECX/EDX values that
+   __cpu_indicator_init obtains from CPUID level 1.  */
+
+struct __processor_features
+{
+  unsigned int __cpu_cmov : 1;
+  unsigned int __cpu_mmx : 1;
+  unsigned int __cpu_popcnt : 1;
+  unsigned int __cpu_sse : 1;
+  unsigned int __cpu_sse2 : 1;
+  unsigned int __cpu_sse3 : 1;
+  unsigned int __cpu_ssse3 : 1;
+  unsigned int __cpu_sse4_1 : 1;
+  unsigned int __cpu_sse4_2 : 1;
+};
+
+/* Flags exported.  Each member is a single-bit flag.  The two vendor
+   bits are set by __cpu_indicator_init; the microarchitecture bits
+   are set by get_intel_cpu and get_amd_cpu.  */
+
+struct __processor_model
+{
+  unsigned int __cpu_is_amd : 1;
+  unsigned int __cpu_is_intel : 1;
+  unsigned int __cpu_is_corei7_nehalem : 1;
+  unsigned int __cpu_is_corei7_westmere : 1;
+  unsigned int __cpu_is_corei7_sandybridge : 1;
+  unsigned int __cpu_is_amdfam10_barcelona : 1;
+  unsigned int __cpu_is_amdfam10_shanghai : 1;
+  unsigned int __cpu_is_amdfam10_istanbul : 1;
+};
+
+enum processor_type __cpu_type = PROCESSOR_GENERIC; /* Set by get_amd_cpu / get_intel_cpu.  */
+struct __processor_features __cpu_features; /* Cleared, then filled, by __cpu_indicator_init.  */
+struct __processor_model __cpu_model; /* Cleared, then filled, by __cpu_indicator_init.  */
+
+/* Record the AMD processor identified by FAMILY and MODEL in
+   __cpu_type and, for the family 0x10 models known here, set the
+   matching bit in __cpu_model.  */
+
+static void
+get_amd_cpu (unsigned int family, unsigned int model)
+{
+  /* Only family 0x10 is broken down by model.  */
+  if (family != 0x10)
+    {
+      __cpu_type = PROCESSOR_AMD_GENERIC;
+      return;
+    }
+
+  if (model == 0x2)
+    {
+      __cpu_type = PROCESSOR_AMDFAM10_BARCELONA;
+      __cpu_model.__cpu_is_amdfam10_barcelona = 1;
+    }
+  else if (model == 0x4)
+    {
+      __cpu_type = PROCESSOR_AMDFAM10_SHANGHAI;
+      __cpu_model.__cpu_is_amdfam10_shanghai = 1;
+    }
+  else if (model == 0x8)
+    {
+      __cpu_type = PROCESSOR_AMDFAM10_ISTANBUL;
+      __cpu_model.__cpu_is_amdfam10_istanbul = 1;
+    }
+  else
+    __cpu_type = PROCESSOR_AMDFAM10_GENERIC;
+}
+
+/* Record the Intel processor identified by FAMILY and MODEL in
+   __cpu_type and, for the Core i7 models known here, set the matching
+   bit in __cpu_model.  Nothing is written when BRAND_ID is non-zero.  */
+
+static void
+get_intel_cpu (unsigned int family, unsigned int model, unsigned int brand_id)
+{
+  /* Parse family and model only if brand ID is 0. */
+  if (brand_id != 0)
+    return;
+
+  if (family == 0x5)
+    {
+      __cpu_type = PROCESSOR_PENTIUM;
+      return;
+    }
+
+  if (family != 0x6)
+    {
+      /* We have no idea.  */
+      __cpu_type = PROCESSOR_INTEL_GENERIC;
+      return;
+    }
+
+  /* Family 6: distinguish by model number.  */
+  switch (model)
+    {
+    case 0x0f:
+      /* Merom.  */
+    case 0x17:
+    case 0x1d:
+      /* Penryn.  */
+      __cpu_type = PROCESSOR_CORE2;
+      break;
+    case 0x1a:
+    case 0x1e:
+    case 0x1f:
+    case 0x2e:
+      /* Nehalem.  */
+      __cpu_type = PROCESSOR_COREI7_NEHALEM;
+      __cpu_model.__cpu_is_corei7_nehalem = 1;
+      break;
+    case 0x25:
+    case 0x2c:
+    case 0x2f:
+      /* Westmere.  */
+      __cpu_type = PROCESSOR_COREI7_WESTMERE;
+      __cpu_model.__cpu_is_corei7_westmere = 1;
+      break;
+    case 0x2a:
+      /* Sandy Bridge.  */
+      __cpu_type = PROCESSOR_COREI7_SANDYBRIDGE;
+      __cpu_model.__cpu_is_corei7_sandybridge = 1;
+      break;
+    default:
+      __cpu_type = PROCESSOR_INTEL_GENERIC;
+      break;
+    }
+}
+
+/* Fill every member of __cpu_features from the feature words ECX and
+   EDX; the caller passes the values returned by CPUID level 1.  */
+
+static void
+get_available_features (unsigned int ecx, unsigned int edx)
+{
+  /* Flags reported in EDX.  */
+  __cpu_features.__cpu_cmov = (edx & bit_CMOV) != 0;
+  __cpu_features.__cpu_mmx = (edx & bit_MMX) != 0;
+  __cpu_features.__cpu_sse = (edx & bit_SSE) != 0;
+  __cpu_features.__cpu_sse2 = (edx & bit_SSE2) != 0;
+
+  /* Flags reported in ECX.  */
+  __cpu_features.__cpu_sse3 = (ecx & bit_SSE3) != 0;
+  __cpu_features.__cpu_ssse3 = (ecx & bit_SSSE3) != 0;
+  __cpu_features.__cpu_sse4_1 = (ecx & bit_SSE4_1) != 0;
+  __cpu_features.__cpu_sse4_2 = (ecx & bit_SSE4_2) != 0;
+  __cpu_features.__cpu_popcnt = (ecx & bit_POPCNT) != 0;
+}
+
+/* A noinline function calling __get_cpuid. Having many calls to
+   cpuid in one function in 32-bit mode causes GCC to complain:
+   "can’t find a register in class ‘CLOBBERED_REGS’".  This is
+   related to PR rtl-optimization 44174.
+
+   All arguments are forwarded unchanged and the result of
+   __get_cpuid is returned as is; the caller in this file treats a
+   zero return as "level not available" and stops detection.  */
+
+static int __attribute__ ((noinline))
+__get_cpuid_output (unsigned int __level,
+		    unsigned int *__eax, unsigned int *__ebx,
+		    unsigned int *__ecx, unsigned int *__edx)
+{
+  return __get_cpuid (__level, __eax, __ebx, __ecx, __edx);
+}
+
+/* Detect the running CPU and publish the result in __cpu_type,
+   __cpu_model and __cpu_features.  This function will be linked in to
+   binaries that need to look up CPU information.
+
+   Both bit-field structures are cleared first, so if CPUID reports
+   failure or offers no level 1 they stay all zero and __cpu_type
+   keeps whatever value it had.  */
+
+void
+__cpu_indicator_init(void)
+{
+  unsigned int eax, ebx, ecx, edx;
+
+  int max_level;
+  unsigned int vendor;
+  unsigned int model, family, brand_id;
+  unsigned int extended_model, extended_family;
+
+  memset (&__cpu_features, 0, sizeof (struct __processor_features));
+  memset (&__cpu_model, 0, sizeof (struct __processor_model));
+
+  /* Assume cpuid insn present. Run in level 0 to get vendor id. */
+  if (!__get_cpuid_output (0, &eax, &ebx, &ecx, &edx))
+    return;
+
+  vendor = ebx;
+  max_level = eax;
+
+  /* Level 1 carries the family/model signature and the feature bits.  */
+  if (max_level < 1)
+    return;
+
+  if (!__get_cpuid_output (1, &eax, &ebx, &ecx, &edx))
+    return;
+
+  model = (eax >> 4) & 0x0f;
+  family = (eax >> 8) & 0x0f;
+  brand_id = ebx & 0xff;
+
+  /* Extended model is EAX[19:16], kept pre-shifted into the high nibble
+     so it can be added to the 4-bit base model; extended family is
+     EAX[27:20].  */
+  extended_model = (eax >> 12) & 0xf0;
+  extended_family = (eax >> 20) & 0xff;
+
+  /* A base family of 0xf is extended on both Intel and AMD parts.  AMD
+     family 0x10 is reported as base 0xf plus extended family 0x1, so
+     without this adjustment get_amd_cpu could never be handed 0x10.  */
+  if ((vendor == SIG_INTEL || vendor == SIG_AMD) && family == 0x0f)
+    {
+      family += extended_family;
+      model += extended_model;
+    }
+  /* Intel additionally extends the model number for family 6.  */
+  else if (vendor == SIG_INTEL && family == 0x06)
+    model += extended_model;
+
+  /* Find CPU model. */
+
+  if (vendor == SIG_AMD)
+    {
+      __cpu_model.__cpu_is_amd = 1;
+      get_amd_cpu (family, model);
+    }
+  else if (vendor == SIG_INTEL)
+    {
+      __cpu_model.__cpu_is_intel = 1;
+      get_intel_cpu (family, model, brand_id);
+    }
+
+  /* Find available features. */
+  get_available_features (ecx, edx);
+}
+
+#else
+
+void
+__cpu_indicator_init(void) /* Built without __GNUC__: detects nothing.  */
+{
+}
+
+#endif /* __GNUC__ */
Index: gcc/tree-pass.h
===================================================================
--- gcc/tree-pass.h	(revision 177767)
+++ gcc/tree-pass.h	(working copy)
@@ -449,6 +449,7 @@  extern struct gimple_opt_pass pass_split_functions
 extern struct gimple_opt_pass pass_feedback_split_functions;
 extern struct gimple_opt_pass pass_threadsafe_analyze;
 extern struct gimple_opt_pass pass_tree_convert_builtin_dispatch;
+extern struct gimple_opt_pass pass_tree_fold_builtin_target;
 
 /* IPA Passes */
 extern struct simple_ipa_opt_pass pass_ipa_lower_emutls;
Index: gcc/testsuite/gcc.dg/builtin_target.c
===================================================================
--- gcc/testsuite/gcc.dg/builtin_target.c	(revision 0)
+++ gcc/testsuite/gcc.dg/builtin_target.c	(revision 0)
@@ -0,0 +1,49 @@ 
+/* Check that every __builtin_target_* call is recognized; none may be < 0.  */
+
+/* { dg-do run } */
+
+int
+fn1 ()
+{
+  if (__builtin_target_supports_cmov () < 0)
+    return -1;
+  if (__builtin_target_supports_mmx () < 0)
+    return -1;
+  if (__builtin_target_supports_popcount () < 0)
+    return -1;
+  if (__builtin_target_supports_sse () < 0)
+    return -1;
+  if (__builtin_target_supports_sse2 () < 0)
+    return -1;
+  if (__builtin_target_supports_sse3 () < 0)
+    return -1;
+  if (__builtin_target_supports_ssse3 () < 0)
+    return -1;
+  if (__builtin_target_supports_sse4_1 () < 0)
+    return -1;
+  if (__builtin_target_supports_sse4_2 () < 0)
+    return -1;
+  if (__builtin_target_is_amd () < 0)
+    return -1;
+  if (__builtin_target_is_intel () < 0)
+    return -1;
+  if (__builtin_target_is_corei7_nehalem () < 0)
+    return -1;
+  if (__builtin_target_is_corei7_westmere () < 0)
+    return -1;
+  if (__builtin_target_is_corei7_sandybridge () < 0)
+    return -1;
+  if (__builtin_target_is_amdfam10_barcelona () < 0)
+    return -1;
+  if (__builtin_target_is_amdfam10_shanghai () < 0)
+    return -1;
+  if (__builtin_target_is_amdfam10_istanbul () < 0)
+    return -1;
+  /* Every builtin above compiled and returned a non-negative value.  */
+  return 0;
+}
+
+int main ()
+{
+  return fn1 ();
+}
Index: gcc/builtins.def
===================================================================
--- gcc/builtins.def	(revision 177767)
+++ gcc/builtins.def	(working copy)
@@ -763,6 +763,25 @@  DEF_BUILTIN (BUILT_IN_EMUTLS_REGISTER_COMMON,
 /* Multiversioning builtin dispatch hook. */
 DEF_GCC_BUILTIN (BUILT_IN_DISPATCH, "dispatch", BT_FN_INT_PTR_FN_INT_PTR_PTR_VAR, ATTR_NULL)
 
+/* Builtins to determine target type and features at run-time; all use BT_FN_INT and carry no attributes (ATTR_NULL). */
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_CMOV, "target_supports_cmov", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_MMX, "target_supports_mmx", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_POPCOUNT, "target_supports_popcount", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE, "target_supports_sse", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE2, "target_supports_sse2", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE3, "target_supports_sse3", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSSE3, "target_supports_ssse3", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE4_1, "target_supports_sse4_1", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_SUPPORTS_SSE4_2, "target_supports_sse4_2", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMD, "target_is_amd", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_INTEL, "target_is_intel", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_NEHALEM, "target_is_corei7_nehalem", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_WESTMERE, "target_is_corei7_westmere", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE, "target_is_corei7_sandybridge", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA, "target_is_amdfam10_barcelona", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI, "target_is_amdfam10_shanghai", BT_FN_INT, ATTR_NULL)
+DEF_GCC_BUILTIN (BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL, "target_is_amdfam10_istanbul", BT_FN_INT, ATTR_NULL)
+
 /* Exception support.  */
 DEF_BUILTIN_STUB (BUILT_IN_UNWIND_RESUME, "__builtin_unwind_resume")
 DEF_BUILTIN_STUB (BUILT_IN_CXA_END_CLEANUP, "__builtin_cxa_end_cleanup")
Index: gcc/mversn-dispatch.c
===================================================================
--- gcc/mversn-dispatch.c	(revision 177767)
+++ gcc/mversn-dispatch.c	(working copy)
@@ -135,6 +135,7 @@  along with GCC; see the file COPYING3.  If not see
 #include "output.h"
 #include "vecprim.h"
 #include "gimple-pretty-print.h"
+#include "target.h"
 
 typedef struct cgraph_node* NODEPTR;
 DEF_VEC_P (NODEPTR);
@@ -1764,3 +1765,103 @@  struct gimple_opt_pass pass_tree_convert_builtin_d
   TODO_update_ssa | TODO_verify_ssa
  }
 };
+
+/* Fold calls to __builtin_target_* using the target's fold_builtin hook.  */
+
+static unsigned int
+do_fold_builtin_target (void)
+{
+  basic_block bb;
+  gimple_stmt_iterator gsi;
+
+  /* Go through each stmt looking for __builtin_target_* calls.  */
+  FOR_EACH_BB_FN (bb, DECL_STRUCT_FUNCTION (current_function_decl))
+    {
+      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+        {
+	  gimple stmt = gsi_stmt (gsi);
+	  gimple assign_stmt;
+          tree call_decl;
+	  tree lhs_retval;
+	  tree folded_val;
+
+	  tree ssa_var, tmp_var;
+	  gimple init_stmt;
+
+          if (!is_gimple_call (stmt))
+            continue;
+
+          call_decl = gimple_call_fndecl (stmt);
+
+	  /* Check if it is a __builtin_target_* call.  The name test is a
+	     substring match (strstr), not a prefix match.  */
+	  if (call_decl == NULL
+	      || DECL_NAME (call_decl) == NULL_TREE
+	      || DECL_BUILT_IN_CLASS (call_decl) != BUILT_IN_NORMAL
+	      || strstr (IDENTIFIER_POINTER (DECL_NAME (call_decl)),
+                         "__builtin_target") == NULL)
+            continue;
+
+	  /* No lhs: the result is unused, so the call is left in place.  */
+	  lhs_retval = gimple_call_lhs(stmt);
+	  if (lhs_retval == NULL)
+	    continue;
+
+	  /* Call the target hook to fold the builtin; no arguments passed.  */
+          folded_val = targetm.fold_builtin(call_decl, 0, NULL, false);
+
+	  /* If the hook returns NULL_TREE (unsupported) fold it to zero.  */
+	  if (folded_val == NULL_TREE)
+	    folded_val = build_zero_cst (unsigned_type_node);
+
+	  /* Go through an unsigned SSA temp, then NOP_EXPR into the lhs.  */
+	  tmp_var = create_tmp_var (unsigned_type_node, NULL);
+	  init_stmt = gimple_build_assign (tmp_var, folded_val);
+	  ssa_var = make_ssa_name (tmp_var, init_stmt);
+	  gimple_assign_set_lhs (init_stmt, ssa_var);
+	  mark_symbols_for_renaming (init_stmt);
+
+	  assign_stmt = gimple_build_assign_with_ops (NOP_EXPR, lhs_retval, ssa_var, 0);
+	  mark_symbols_for_renaming(assign_stmt);
+	  /* Final order: call, init_stmt, assign_stmt.  */
+	  gsi_insert_after_without_update (&gsi, assign_stmt, GSI_SAME_STMT);
+	  gsi_insert_after_without_update (&gsi, init_stmt, GSI_SAME_STMT);
+	  /* Delete the original call. */
+	  gsi_remove(&gsi, true);
+	}
+    }
+
+  return 0;
+}
+
+static bool
+gate_fold_builtin_target (void)
+{
+  return true;  /* The pass is never gated off.  */
+}
+
+/* Pass to fold __builtin_target_* functions; runs do_fold_builtin_target.  */
+
+struct gimple_opt_pass pass_tree_fold_builtin_target =
+{
+ {
+  GIMPLE_PASS,
+  "fold_builtin_target",	        /* name */
+  gate_fold_builtin_target,		/* gate */
+  do_fold_builtin_target,		/* execute */
+  NULL,					/* sub */
+  NULL,					/* next */
+  0,					/* static_pass_number */
+  TV_FOLD_BUILTIN_TARGET,		/* tv_id */
+  PROP_cfg,				/* properties_required */
+  PROP_cfg,				/* properties_provided */
+  0,					/* properties_destroyed */
+  0,					/* todo_flags_start */
+  TODO_dump_func |			/* todo_flags_finish */
+  TODO_cleanup_cfg |
+  TODO_update_ssa |
+  TODO_verify_ssa
+ }
+};
+
+
Index: gcc/timevar.def
===================================================================
--- gcc/timevar.def	(revision 177767)
+++ gcc/timevar.def	(working copy)
@@ -124,6 +124,7 @@  DEFTIMEVAR (TV_PARSE_INMETH          , "parser inl
 DEFTIMEVAR (TV_TEMPLATE_INST         , "template instantiation")
 DEFTIMEVAR (TV_INLINE_HEURISTICS     , "inline heuristics")
 DEFTIMEVAR (TV_MVERSN_DISPATCH       , "multiversion dispatch")
+DEFTIMEVAR (TV_FOLD_BUILTIN_TARGET   , "fold __builtin_target calls")
 DEFTIMEVAR (TV_INTEGRATION           , "integration")
 DEFTIMEVAR (TV_TREE_GIMPLIFY	     , "tree gimplify")
 DEFTIMEVAR (TV_TREE_EH		     , "tree eh")
Index: gcc/passes.c
===================================================================
--- gcc/passes.c	(revision 177767)
+++ gcc/passes.c	(working copy)
@@ -1249,6 +1249,8 @@  init_optimization_passes (void)
     {
       struct opt_pass **p = &pass_ipa_multiversion_dispatch.pass.sub;
       NEXT_PASS (pass_tree_convert_builtin_dispatch);
+      /* Fold calls to __builtin_target_*. */
+      NEXT_PASS (pass_tree_fold_builtin_target);
       /* Rebuilding cgraph edges is necessary as the above passes change
          the call graph.  Otherwise, future optimizations use the old
 	 call graph and make wrong decisions sometimes.*/
Index: gcc/config/i386/i386.c
===================================================================
--- gcc/config/i386/i386.c	(revision 177767)
+++ gcc/config/i386/i386.c	(working copy)
@@ -58,6 +58,8 @@  along with GCC; see the file COPYING3.  If not see
 #include "sched-int.h"
 #include "sbitmap.h"
 #include "fibheap.h"
+#include "tree-flow.h"
+#include "tree-pass.h"
 
 enum upper_128bits_state
 {
@@ -7867,6 +7869,338 @@  ix86_build_builtin_va_list (void)
   return ret;
 }
 
+/* Returns a struct type with name NAME and NUM_FIELDS fields.  Each field
+   is an unsigned int bit-field of width 1, named "<index>_field".  */
+
+static tree
+build_struct_with_one_bit_fields (int num_fields, const char *name)
+{
+  int i;
+  /* Large enough for any decimal int followed by "_field".  */
+  char field_name [32];
+  tree field = NULL_TREE, field_chain = NULL_TREE;
+  tree type = make_node (RECORD_TYPE);
+
+  for (i = 0; i < num_fields; i++)
+    {
+      /* Name the fields 0_field, 1_field, ... using the full decimal index,
+	 so that indices above 9 still get a digits-only prefix.  */
+      sprintf (field_name, "%d_field", i);
+      field = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
+			  get_identifier (field_name), unsigned_type_node);
+      DECL_BIT_FIELD (field) = 1;
+      DECL_SIZE (field) = bitsize_one_node;
+      /* Fields are chained newest-first.  */
+      DECL_CHAIN (field) = field_chain;
+      field_chain = field;
+    }
+  finish_builtin_struct (type, name, field_chain, NULL_TREE);
+  return type;
+}
+
+/* Returns a VAR_DECL of type TYPE and name NAME, registered in the varpool.  */
+
+static tree
+make_var_decl (tree type, const char *name)
+{
+  tree new_decl;
+  struct varpool_node *vnode;
+
+  new_decl = build_decl (UNKNOWN_LOCATION,
+	                 VAR_DECL,
+	  	         get_identifier(name),
+		         type);
+  /* NOTE(review): DECL_EXTERNAL and TREE_STATIC are both set - confirm.  */
+  DECL_EXTERNAL (new_decl) = 1;
+  TREE_STATIC (new_decl) = 1;
+  TREE_PUBLIC (new_decl) = 1;
+  DECL_INITIAL (new_decl) = 0;
+  DECL_ARTIFICIAL (new_decl) = 0;
+  DECL_PRESERVE_P (new_decl) = 1;
+
+  make_decl_one_only (new_decl, DECL_ASSEMBLER_NAME (new_decl));
+  assemble_variable (new_decl, 0, 0, 0);
+
+  vnode = varpool_node (new_decl);
+  gcc_assert (vnode != NULL);
+  /* Set finalized to 1, otherwise it asserts in function "write_symbol" in
+     lto-streamer-out.c. */
+  vnode->finalized = 1;
+
+  return new_decl;
+}
+
+/* Traverses the chain of fields in STRUCT_TYPE and returns the FIELD_NUM
+   numbered field (counting from zero).  Asserts that the field exists.  */
+
+static tree
+get_field_from_struct (tree struct_type, int field_num)
+{
+  int i;
+  tree field = TYPE_FIELDS (struct_type);
+
+  for (i = 0; i < field_num && field != NULL_TREE; i++)
+    field = DECL_CHAIN (field);
+
+  /* Reject a FIELD_NUM that walks off the end of the chain.  */
+  gcc_assert (field != NULL_TREE);
+  return field;
+}
+
+/* Create a new static constructor named NAME that calls
+   __cpu_indicator_init (), defined in libgcc's i386-cpuinfo.c, which runs
+   cpuid to figure out the type of the target.  Returns its FUNCTION_DECL.  */
+
+static tree
+make_constructor_to_get_target_type (const char *name)
+{
+  tree decl, type, t;
+  gimple_seq seq;
+  basic_block new_bb;
+  tree old_current_function_decl;
+
+  tree __cpu_indicator_int_decl;
+  gimple constructor_body;
+
+  /* The same void (void) type serves both function decls below.  */
+  type = build_function_type_list (void_type_node, NULL_TREE);
+
+  /* Make a call stmt to __cpu_indicator_init.  */
+  __cpu_indicator_int_decl = build_fn_decl ("__cpu_indicator_init", type);
+  constructor_body = gimple_build_call (__cpu_indicator_int_decl, 0);
+  DECL_EXTERNAL (__cpu_indicator_int_decl) = 1;
+
+  decl = build_fn_decl (name, type);
+
+  DECL_NAME (decl) = get_identifier (name);
+  SET_DECL_ASSEMBLER_NAME (decl, DECL_NAME (decl));
+  gcc_assert (cgraph_node (decl) != NULL);
+
+  TREE_USED (decl) = 1;
+  DECL_ARTIFICIAL (decl) = 1;
+  DECL_IGNORED_P (decl) = 0;
+  TREE_PUBLIC (decl) = 0;
+  DECL_UNINLINABLE (decl) = 1;
+  DECL_EXTERNAL (decl) = 0;
+  DECL_CONTEXT (decl) = NULL_TREE;
+  DECL_INITIAL (decl) = make_node (BLOCK);
+  DECL_STATIC_CONSTRUCTOR (decl) = 1;
+  TREE_READONLY (decl) = 0;
+  DECL_PURE_P (decl) = 0;
+
+  /* This is a comdat. */
+  make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
+
+  /* Build result decl and add to function_decl. */
+  t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, void_type_node);
+  DECL_ARTIFICIAL (t) = 1;
+  DECL_IGNORED_P (t) = 1;
+  DECL_RESULT (decl) = t;
+
+  gimplify_function_tree (decl);
+
+  /* Build CFG for this function: entry block -> one new block.  */
+
+  old_current_function_decl = current_function_decl;
+  push_cfun (DECL_STRUCT_FUNCTION (decl));
+  current_function_decl = decl;
+  init_empty_tree_cfg_for_function (DECL_STRUCT_FUNCTION (decl));
+  cfun->curr_properties |=
+    (PROP_gimple_lcf | PROP_gimple_leh | PROP_cfg | PROP_referenced_vars |
+     PROP_ssa);
+  new_bb = create_empty_bb (ENTRY_BLOCK_PTR);
+  make_edge (ENTRY_BLOCK_PTR, new_bb, EDGE_FALLTHRU);
+
+  /* XXX: Not sure if the edge commented below is necessary.  If I add this
+     edge, it fails in gimple_verify_flow_info in tree-cfg.c in condition :
+     " if (e->flags & EDGE_FALLTHRU)"
+     during -fprofile-generate.
+     Otherwise, it is fine.  Deleting this edge does not break anything.
+     Commenting this so that it is clear I am intentionally not doing this.*/
+  /* make_edge (new_bb, EXIT_BLOCK_PTR, EDGE_FALLTHRU); */
+
+  seq = gimple_seq_alloc_with_stmt (constructor_body);
+
+  set_bb_seq (new_bb, seq);
+  gimple_set_bb (constructor_body, new_bb);
+
+  /* Set the lexical block of the constructor body.  Fails the inliner
+     otherwise. */
+  gimple_set_block (constructor_body, DECL_INITIAL (decl));
+
+  /* This call is very important if this pass runs when the IR is in
+     SSA form.  It breaks things in strange ways otherwise. */
+  init_tree_ssa (DECL_STRUCT_FUNCTION (decl));
+  /* add_referenced_var (version_selector_var); */
+
+  cgraph_add_new_function (decl, true);
+  cgraph_call_function_insertion_hooks (cgraph_node (decl));
+  cgraph_mark_needed_node (cgraph_node (decl));
+
+  pop_cfun ();
+  current_function_decl = old_current_function_decl;
+  return decl;
+}
+
+/* Fold FNDECL, a __builtin_target_* builtin, into a reference to the matching
+   one-bit field of __cpu_features or __cpu_model; NULL_TREE if unknown.  */
+
+static tree
+fold_builtin_target (tree fndecl)
+{
+  /* This is the order of bit-fields in __processor_features in
+     i386-cpuinfo.c */
+  enum processor_features
+  {
+    F_CMOV = 0,
+    F_MMX,
+    F_POPCNT,
+    F_SSE,
+    F_SSE2,
+    F_SSE3,
+    F_SSSE3,
+    F_SSE4_1,
+    F_SSE4_2,
+    F_MAX
+  };
+
+  /* This is the order of bit-fields in __processor_model in
+     i386-cpuinfo.c */
+  enum processor_model
+  {
+    M_AMD = 0,
+    M_INTEL,
+    M_COREI7_NEHALEM,
+    M_COREI7_WESTMERE,
+    M_COREI7_SANDYBRIDGE,
+    M_AMDFAM10_BARCELONA,
+    M_AMDFAM10_SHANGHAI,
+    M_AMDFAM10_ISTANBUL,
+    M_MAX
+  };
+
+  static tree __processor_features_type = NULL_TREE;
+  static tree __cpu_features_var = NULL_TREE;
+  static tree __processor_model_type = NULL_TREE;
+  static tree __cpu_model_var = NULL_TREE;
+  static tree ctor_decl = NULL_TREE;
+  tree field;
+  tree which_struct;
+
+  /* Make a call to __cpu_indicator_init in a constructor, once.
+     Function __cpu_indicator_init is defined in i386-cpuinfo.c. */
+  if (ctor_decl == NULL_TREE)
+    ctor_decl
+      = make_constructor_to_get_target_type ("__cpu_indicator_init_ctor");
+
+  if (__processor_features_type == NULL_TREE)
+    __processor_features_type
+      = build_struct_with_one_bit_fields (F_MAX, "__processor_features");
+
+  if (__processor_model_type == NULL_TREE)
+    __processor_model_type
+      = build_struct_with_one_bit_fields (M_MAX, "__processor_model");
+
+  if (__cpu_features_var == NULL_TREE)
+    __cpu_features_var = make_var_decl (__processor_features_type,
+					"__cpu_features");
+
+  if (__cpu_model_var == NULL_TREE)
+    __cpu_model_var = make_var_decl (__processor_model_type,
+				     "__cpu_model");
+
+  /* Look at fndecl code to identify the field requested. */
+  switch (DECL_FUNCTION_CODE (fndecl))
+    {
+    case BUILT_IN_TARGET_SUPPORTS_CMOV:
+      field = get_field_from_struct (__processor_features_type, F_CMOV);
+      which_struct = __cpu_features_var;
+      break;
+    case BUILT_IN_TARGET_SUPPORTS_MMX:
+      field = get_field_from_struct (__processor_features_type, F_MMX);
+      which_struct = __cpu_features_var;
+      break;
+    case BUILT_IN_TARGET_SUPPORTS_POPCOUNT:
+      field = get_field_from_struct (__processor_features_type, F_POPCNT);
+      which_struct = __cpu_features_var;
+      break;
+    case BUILT_IN_TARGET_SUPPORTS_SSE:
+      field = get_field_from_struct (__processor_features_type, F_SSE);
+      which_struct = __cpu_features_var;
+      break;
+    case BUILT_IN_TARGET_SUPPORTS_SSE2:
+      field = get_field_from_struct (__processor_features_type, F_SSE2);
+      which_struct = __cpu_features_var;
+      break;
+    case BUILT_IN_TARGET_SUPPORTS_SSE3:
+      field = get_field_from_struct (__processor_features_type, F_SSE3);
+      which_struct = __cpu_features_var;
+      break;
+    case BUILT_IN_TARGET_SUPPORTS_SSSE3:
+      field = get_field_from_struct (__processor_features_type, F_SSSE3);
+      which_struct = __cpu_features_var;
+      break;
+    case BUILT_IN_TARGET_SUPPORTS_SSE4_1:
+      field = get_field_from_struct (__processor_features_type, F_SSE4_1);
+      which_struct = __cpu_features_var;
+      break;
+    case BUILT_IN_TARGET_SUPPORTS_SSE4_2:
+      field = get_field_from_struct (__processor_features_type, F_SSE4_2);
+      which_struct = __cpu_features_var;
+      break;
+    case BUILT_IN_TARGET_IS_AMD:
+      field = get_field_from_struct (__processor_model_type, M_AMD);
+      which_struct = __cpu_model_var;
+      break;
+    case BUILT_IN_TARGET_IS_INTEL:
+      field = get_field_from_struct (__processor_model_type, M_INTEL);
+      which_struct = __cpu_model_var;
+      break;
+    case BUILT_IN_TARGET_IS_COREI7_NEHALEM:
+      field = get_field_from_struct (__processor_model_type, M_COREI7_NEHALEM);
+      which_struct = __cpu_model_var;
+      break;
+    case BUILT_IN_TARGET_IS_COREI7_WESTMERE:
+      field = get_field_from_struct (__processor_model_type, M_COREI7_WESTMERE);
+      which_struct = __cpu_model_var;
+      break;
+    case BUILT_IN_TARGET_IS_COREI7_SANDYBRIDGE:
+      field = get_field_from_struct (__processor_model_type, M_COREI7_SANDYBRIDGE);
+      which_struct = __cpu_model_var;
+      break;
+    case BUILT_IN_TARGET_IS_AMDFAM10_BARCELONA:
+      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_BARCELONA);
+      which_struct = __cpu_model_var;
+      break;
+    case BUILT_IN_TARGET_IS_AMDFAM10_SHANGHAI:
+      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_SHANGHAI);
+      which_struct = __cpu_model_var;
+      break;
+    case BUILT_IN_TARGET_IS_AMDFAM10_ISTANBUL:
+      field = get_field_from_struct (__processor_model_type, M_AMDFAM10_ISTANBUL);
+      which_struct = __cpu_model_var;
+      break;
+    default:
+      return NULL_TREE;
+    }
+
+  return build3 (COMPONENT_REF, TREE_TYPE (field), which_struct, field, NULL_TREE);
+}
+
+/* Implements TARGET_FOLD_BUILTIN.  Folds __builtin_target_* builtins. */
+
+static tree
+ix86_fold_builtin (tree fndecl, int n_args ATTRIBUTE_UNUSED,
+		    tree *args ATTRIBUTE_UNUSED, bool ignore ATTRIBUTE_UNUSED)
+{
+  const char *fn_name = IDENTIFIER_POINTER (DECL_NAME (fndecl));
+
+  if (DECL_BUILT_IN_CLASS (fndecl) != BUILT_IN_NORMAL
+      || strstr (fn_name, "__builtin_target") == NULL)
+    return NULL_TREE;
+  return fold_builtin_target (fndecl);
+}
+
 /* Worker function for TARGET_SETUP_INCOMING_VARARGS.  */
 
 static void
@@ -35097,6 +35431,9 @@  ix86_autovectorize_vector_sizes (void)
 #undef TARGET_BUILD_BUILTIN_VA_LIST
 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
 
+#undef TARGET_FOLD_BUILTIN
+#define TARGET_FOLD_BUILTIN ix86_fold_builtin
+
 #undef TARGET_ENUM_VA_LIST_P
 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list