diff mbox

Also fold bmi/bmi2/tbm bextr/bextri/bzhi/pext/pdep builtins

Message ID 20161021152641.GL7282@tucnak.redhat.com
State New
Headers show

Commit Message

Jakub Jelinek Oct. 21, 2016, 3:26 p.m. UTC
Hi!

This patch on top of the just posted patch adds folding for a couple more
builtins (though, hundreds or thousands of other md builtins remain unfolded
even though they actually could be folded for e.g. const arguments).

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2016-10-21  Jakub Jelinek  <jakub@redhat.com>

	* config/i386/i386.c (ix86_fold_builtin): Handle
	IX86_BUILTIN_BEXTR{,I}{32,64}, IX86_BUILTIN_BZHI{32,64},
	IX86_BUILTIN_PDEP{32,64} and IX86_BUILTIN_PEXT{32,64}.
	(ix86_gimple_fold_builtin): Handle IX86_BUILTIN_BZHI{32,64},
	IX86_BUILTIN_PDEP{32,64} and IX86_BUILTIN_PEXT{32,64}.

	* gcc.target/i386/bmi2-pext-1.c: New test.
	* gcc.target/i386/bmi2-pdep-1.c: New test.
	* gcc.target/i386/bmi2-bzhi-3.c: New test.
	* gcc.target/i386/tbm-bextri-1.c: New test.
	* gcc.target/i386/bmi-bextr-6.c: New test.


	Jakub

Comments

Uros Bizjak Oct. 21, 2016, 3:37 p.m. UTC | #1
On Fri, Oct 21, 2016 at 5:26 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> Hi!
>
> This patch on top of the just posted patch adds folding for a couple more
> builtins (though, hundreds or thousands of other md builtins remain unfolded
> even though they actually could be folded for e.g. const arguments).
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> 2016-10-21  Jakub Jelinek  <jakub@redhat.com>
>
>         * config/i386/i386.c (ix86_fold_builtin): Handle
>         IX86_BUILTIN_BEXTR{,I}{32,64}, IX86_BUILTIN_BZHI{32,64},
>         IX86_BUILTIN_PDEP{32,64} and IX86_BUILTIN_PEXT{32,64}.
>         (ix86_gimple_fold_builtin): Handle IX86_BUILTIN_BZHI{32,64},
>         IX86_BUILTIN_PDEP{32,64} and IX86_BUILTIN_PEXT{32,64}.
>
>         * gcc.target/i386/bmi2-pext-1.c: New test.
>         * gcc.target/i386/bmi2-pdep-1.c: New test.
>         * gcc.target/i386/bmi2-bzhi-3.c: New test.
>         * gcc.target/i386/tbm-bextri-1.c: New test.
>         * gcc.target/i386/bmi-bextr-6.c: New test.

I'm not versed in this area, let's ask Richi for a review...

OK if Richi says so...

Thanks,
Uros.

> --- gcc/config/i386/i386.c.jj   2016-10-21 14:31:21.770818850 +0200
> +++ gcc/config/i386/i386.c      2016-10-21 14:58:58.897893832 +0200
> @@ -33369,6 +33369,88 @@ ix86_fold_builtin (tree fndecl, int n_ar
>             }
>           break;
>
> +       case IX86_BUILTIN_BEXTR32:
> +       case IX86_BUILTIN_BEXTR64:
> +       case IX86_BUILTIN_BEXTRI32:
> +       case IX86_BUILTIN_BEXTRI64:
> +         gcc_assert (n_args == 2);
> +         if (tree_fits_uhwi_p (args[1]))
> +           {
> +             unsigned HOST_WIDE_INT res = 0;
> +             unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
> +             unsigned int start = tree_to_uhwi (args[1]);
> +             unsigned int len = (start & 0xff00) >> 8;
> +             start &= 0xff;
> +             if (start >= prec || len == 0)
> +               res = 0;
> +             else if (!tree_fits_uhwi_p (args[0]))
> +               break;
> +             else
> +               res = tree_to_uhwi (args[0]) >> start;
> +             if (len > prec)
> +               len = prec;
> +             if (len < HOST_BITS_PER_WIDE_INT)
> +               res &= (HOST_WIDE_INT_1U << len) - 1;
> +             return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
> +           }
> +         break;
> +
> +       case IX86_BUILTIN_BZHI32:
> +       case IX86_BUILTIN_BZHI64:
> +         gcc_assert (n_args == 2);
> +         if (tree_fits_uhwi_p (args[1]))
> +           {
> +             unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
> +             if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
> +               return args[0];
> +             if (!tree_fits_uhwi_p (args[0]))
> +               break;
> +             unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
> +             res &= ~(HOST_WIDE_INT_M1U << idx);
> +             return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
> +           }
> +         break;
> +
> +       case IX86_BUILTIN_PDEP32:
> +       case IX86_BUILTIN_PDEP64:
> +         gcc_assert (n_args == 2);
> +         if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
> +           {
> +             unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
> +             unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
> +             unsigned HOST_WIDE_INT res = 0;
> +             unsigned HOST_WIDE_INT m, k = 1;
> +             for (m = 1; m; m <<= 1)
> +               if ((mask & m) != 0)
> +                 {
> +                   if ((src & k) != 0)
> +                     res |= m;
> +                   k <<= 1;
> +                 }
> +             return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
> +           }
> +         break;
> +
> +       case IX86_BUILTIN_PEXT32:
> +       case IX86_BUILTIN_PEXT64:
> +         gcc_assert (n_args == 2);
> +         if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
> +           {
> +             unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
> +             unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
> +             unsigned HOST_WIDE_INT res = 0;
> +             unsigned HOST_WIDE_INT m, k = 1;
> +             for (m = 1; m; m <<= 1)
> +               if ((mask & m) != 0)
> +                 {
> +                   if ((src & m) != 0)
> +                     res |= k;
> +                   k <<= 1;
> +                 }
> +             return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
> +           }
> +         break;
> +
>         default:
>           break;
>         }
> @@ -33393,7 +33475,7 @@ ix86_gimple_fold_builtin (gimple_stmt_it
>    int n_args = gimple_call_num_args (stmt);
>    enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
>    tree decl = NULL_TREE;
> -  tree arg0;
> +  tree arg0, arg1;
>
>    switch (fn_code)
>      {
> @@ -33432,6 +33514,41 @@ ix86_gimple_fold_builtin (gimple_stmt_it
>           gimple_set_location (g, loc);
>           gsi_replace (gsi, g, true);
>           return true;
> +       }
> +      break;
> +
> +    case IX86_BUILTIN_BZHI32:
> +    case IX86_BUILTIN_BZHI64:
> +      gcc_assert (n_args == 2);
> +      arg1 = gimple_call_arg (stmt, 1);
> +      if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
> +       {
> +         unsigned int idx = tree_to_uhwi (arg1) & 0xff;
> +         arg0 = gimple_call_arg (stmt, 0);
> +         if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
> +           break;
> +         location_t loc = gimple_location (stmt);
> +         gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
> +         gimple_set_location (g, loc);
> +         gsi_replace (gsi, g, true);
> +         return true;
> +       }
> +      break;
> +
> +    case IX86_BUILTIN_PDEP32:
> +    case IX86_BUILTIN_PDEP64:
> +    case IX86_BUILTIN_PEXT32:
> +    case IX86_BUILTIN_PEXT64:
> +      gcc_assert (n_args == 2);
> +      arg1 = gimple_call_arg (stmt, 1);
> +      if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
> +       {
> +         location_t loc = gimple_location (stmt);
> +         arg0 = gimple_call_arg (stmt, 0);
> +         gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
> +         gimple_set_location (g, loc);
> +         gsi_replace (gsi, g, true);
> +         return true;
>         }
>        break;
>
> --- gcc/testsuite/gcc.target/i386/bmi2-pext-1.c.jj      2016-10-21 15:09:43.568733192 +0200
> +++ gcc/testsuite/gcc.target/i386/bmi2-pext-1.c 2016-10-21 15:09:33.000000000 +0200
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbmi2 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
> +
> +#include <x86intrin.h>
> +
> +extern void link_error (void);
> +
> +unsigned int a;
> +unsigned long long b;
> +
> +int
> +main ()
> +{
> +  asm volatile ("" : : "g" (&a), "g" (&b) : "memory");
> +  if (_pext_u32 (0xabcdef98, 0xffff0000) != 0xabcd
> +      || _pext_u32 (0xabcdef98, 0xffffff00) != 0xabcdef
> +      || _pext_u32 (0xabcdef98, 0x0f0f0f0f) != 0xbdf8
> +      || _pext_u32 (0xabcdef98, 0xff0fff0f) != 0xabdef8
> +      || _pext_u32 (0xabcdef98, 0x000fffff) != 0xdef98
> +      || _pext_u32 (a, 0xffffffff) != a)
> +    link_error ();
> +#ifdef __x86_64__
> +  if (_pext_u64 (0xabcdef9876543210UL, 0xffffffff00000000UL) != 0xabcdef98UL
> +      || _pext_u64 (0xabcdef9876543210UL, 0xffffffffffffff00UL) != 0xabcdef98765432UL
> +      || _pext_u64 (0xabcdef9876543210UL, 0x0f0f0f0f0f0f0f0fUL) != 0xbdf86420UL
> +      || _pext_u64 (0xabcdef9876543210UL, 0xff0fff0fff0fff0fUL) != 0xabdef8764320UL
> +      || _pext_u64 (0xabcdef9876543210UL, 0x00000000000fffffUL) != 0x43210UL
> +      || _pext_u64 (b, 0xffffffffffffffffUL) != b)
> +    link_error ();
> +#endif
> +  return 0;
> +}
> --- gcc/testsuite/gcc.target/i386/bmi2-pdep-1.c.jj      2016-10-21 15:18:07.611358728 +0200
> +++ gcc/testsuite/gcc.target/i386/bmi2-pdep-1.c 2016-10-21 15:18:00.000000000 +0200
> @@ -0,0 +1,33 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbmi2 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
> +
> +#include <x86intrin.h>
> +
> +extern void link_error (void);
> +
> +unsigned int a;
> +unsigned long long b;
> +
> +int
> +main ()
> +{
> +  asm volatile ("" : : "g" (&a), "g" (&b) : "memory");
> +  if (_pdep_u32 (0xabcdef98, 0xffff0000) != 0xef980000
> +      || _pdep_u32 (0xabcdef98, 0xffffff00) != 0xcdef9800
> +      || _pdep_u32 (0xabcdef98, 0x0f0f0f0f) != 0x0e0f0908
> +      || _pdep_u32 (0xabcdef98, 0xff0fff0f) != 0xcd0ef908
> +      || _pdep_u32 (0xabcdef98, 0x000fffff) != 0xdef98
> +      || _pdep_u32 (a, 0xffffffff) != a)
> +    link_error ();
> +#ifdef __x86_64__
> +  if (_pdep_u64 (0xabcdef9876543210UL, 0xffffffff00000000UL) != 0x7654321000000000UL
> +      || _pdep_u64 (0xabcdef9876543210UL, 0xffffffffffffff00UL) != 0xcdef987654321000UL
> +      || _pdep_u64 (0xabcdef9876543210UL, 0x0f0f0f0f0f0f0f0fUL) != 0x0706050403020100UL
> +      || _pdep_u64 (0xabcdef9876543210UL, 0xff0fff0fff0fff0fUL) != 0xef09870654032100UL
> +      || _pdep_u64 (0xabcdef9876543210UL, 0x00000000000fffffUL) != 0x43210UL
> +      || _pdep_u64 (b, 0xffffffffffffffffUL) != b)
> +    link_error ();
> +#endif
> +  return 0;
> +}
> --- gcc/testsuite/gcc.target/i386/bmi2-bzhi-3.c.jj      2016-10-21 13:18:06.844209990 +0200
> +++ gcc/testsuite/gcc.target/i386/bmi2-bzhi-3.c 2016-10-21 14:42:00.177759205 +0200
> @@ -0,0 +1,77 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbmi2 -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
> +
> +#include <x86intrin.h>
> +
> +extern void link_error (void);
> +unsigned int a;
> +unsigned long long b;
> +
> +static inline unsigned int f1 (void) { return _bzhi_u32 (a, 0); }
> +static inline unsigned int f2 (unsigned int x) { return _bzhi_u32 (x, 0); }
> +static inline unsigned int f3 (void) { return _bzhi_u32 (a, 5); }
> +static inline unsigned int f4 (unsigned int x) { return _bzhi_u32 (x, 5); }
> +static inline unsigned int f5 (void) { return _bzhi_u32 (a, 31); }
> +static inline unsigned int f6 (unsigned int x) { return _bzhi_u32 (x, 31); }
> +static inline unsigned int f7 (void) { return _bzhi_u32 (a, 32); }
> +static inline unsigned int f8 (unsigned int x) { return _bzhi_u32 (x, 32); }
> +static inline unsigned int f9 (void) { return _bzhi_u32 (a, 37); }
> +static inline unsigned int f10 (unsigned int x) { return _bzhi_u32 (x, 37); }
> +static inline unsigned int f11 (void) { return _bzhi_u32 (a, 257); }
> +static inline unsigned int f12 (unsigned int x) { return _bzhi_u32 (x, 257); }
> +static inline unsigned int f13 (void) { return _bzhi_u32 (a, 289); }
> +static inline unsigned int f14 (unsigned int x) { return _bzhi_u32 (x, 289); }
> +#ifdef __x86_64__
> +static inline unsigned long long f21 (void) { return _bzhi_u64 (b, 0); }
> +static inline unsigned long long f22 (unsigned long long x) { return _bzhi_u64 (x, 0); }
> +static inline unsigned long long f23 (void) { return _bzhi_u64 (b, 5); }
> +static inline unsigned long long f24 (unsigned long long x) { return _bzhi_u64 (x, 5); }
> +static inline unsigned long long f25 (void) { return _bzhi_u64 (b, 63); }
> +static inline unsigned long long f26 (unsigned long long x) { return _bzhi_u64 (x, 63); }
> +static inline unsigned long long f27 (void) { return _bzhi_u64 (b, 64); }
> +static inline unsigned long long f28 (unsigned long long x) { return _bzhi_u64 (x, 64); }
> +static inline unsigned long long f29 (void) { return _bzhi_u64 (b, 69); }
> +static inline unsigned long long f30 (unsigned long long x) { return _bzhi_u64 (x, 69); }
> +static inline unsigned long long f31 (void) { return _bzhi_u64 (b, 257); }
> +static inline unsigned long long f32 (unsigned long long x) { return _bzhi_u64 (x, 257); }
> +static inline unsigned long long f33 (void) { return _bzhi_u64 (b, 321); }
> +static inline unsigned long long f34 (unsigned long long x) { return _bzhi_u64 (x, 321); }
> +#endif
> +
> +unsigned int c;
> +unsigned long long d;
> +
> +int
> +main ()
> +{
> +  asm volatile ("" : : "g" (&c), "g" (&d) : "memory");
> +  a = -1U;
> +  b = -1ULL;
> +  if (f1 () != 0 || f2 (-1U) != 0
> +      || f3 () != 0x1f || f4 (-1U) != 0x1f
> +      || f5 () != 0x7fffffffU || f6 (-1U) != 0x7fffffffU
> +      || f7 () != -1U || f8 (-1U) != -1U
> +      || f9 () != -1U || f10 (-1U) != -1U
> +      || f11 () != 1 || f12 (-1U) != 1
> +      || f13 () != -1U || f14 (-1U) != -1U)
> +    link_error ();
> +  if (_bzhi_u32 (c, 32) != c
> +      || _bzhi_u32 (c, 64) != c
> +      || _bzhi_u32 (c, 255) != c)
> +    link_error ();
> +#ifdef __x86_64__
> +  if (f21 () != 0 || f22 (-1ULL) != 0
> +      || f23 () != 0x1f || f24 (-1ULL) != 0x1f
> +      || f25 () != 0x7fffffffffffffffULL || f26 (-1ULL) != 0x7fffffffffffffffULL
> +      || f27 () != -1ULL || f28 (-1ULL) != -1ULL
> +      || f29 () != -1ULL || f30 (-1ULL) != -1ULL
> +      || f31 () != 1 || f32 (-1ULL) != 1
> +      || f33 () != -1ULL || f34 (-1ULL) != -1ULL)
> +    link_error ();
> +  if (_bzhi_u64 (d, 64) != d
> +      || _bzhi_u64 (d, 255) != d)
> +    link_error ();
> +#endif
> +  return 0;
> +}
> --- gcc/testsuite/gcc.target/i386/tbm-bextri-1.c.jj     2016-10-21 14:46:06.632648404 +0200
> +++ gcc/testsuite/gcc.target/i386/tbm-bextri-1.c        2016-10-21 14:45:58.000000000 +0200
> @@ -0,0 +1,36 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mtbm -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
> +
> +#include <x86intrin.h>
> +
> +extern void link_error (void);
> +
> +volatile unsigned int a;
> +volatile unsigned long long b;
> +
> +int
> +main ()
> +{
> +  if (__bextri_u32 (0xffffffffU, 0 | (0 << 8)) != 0
> +      || __bextri_u32 (0xffffffffU, 64 | (16 << 8)) != 0
> +      || __bextri_u32 (0x12345678U, 4 | (10 << 8)) != 0x167
> +      || __bextri_u32 (0xffffffffU, 2 | (255 << 8)) != 0x3fffffff
> +      || __bextri_u32 (0xdeadbeefU, 2 | (64 << 8)) != 0x37ab6fbb
> +      || __bextri_u32 (0xdeadbeefU, 0 | (64 << 8)) != 0xdeadbeefU
> +      || __bextri_u32 (a, 0 | (0 << 8)) != 0
> +      || __bextri_u32 (a, 32 | (16 << 8)) != 0)
> +    link_error ();
> +#ifdef __x86_64__
> +  if (__bextri_u64 (0xffffffffffffffffUL, 0 | (0 << 8)) != 0
> +      || __bextri_u64 (0xffffffffffffffffUL, 128 | (16 << 8)) != 0
> +      || __bextri_u64 (0x123456789abcdef0UL, 5 | (37 << 8)) != 0x13c4d5e6f7UL
> +      || __bextri_u64 (0xffffffffffffffffUL, 2 | (255 << 8)) != 0x3fffffffffffffffUL
> +      || __bextri_u64 (0xdeadbeefbeefdeadU, 2 | (64 << 8)) != 0x37ab6fbbefbbf7abUL
> +      || __bextri_u64 (0xdeadbeefbeefdeadU, 0 | (64 << 8)) != 0xdeadbeefbeefdeadUL
> +      || __bextri_u64 (b, 0 | (0 << 8)) != 0
> +      || __bextri_u64 (b, 64 | (16 << 8)) != 0)
> +    link_error ();
> +#endif
> +  return 0;
> +}
> --- gcc/testsuite/gcc.target/i386/bmi-bextr-6.c.jj      2016-10-21 14:06:12.533953786 +0200
> +++ gcc/testsuite/gcc.target/i386/bmi-bextr-6.c 2016-10-21 14:44:53.358573284 +0200
> @@ -0,0 +1,54 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mbmi -fdump-tree-optimized" } */
> +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
> +
> +#include <x86intrin.h>
> +
> +extern void link_error (void);
> +
> +volatile unsigned int a;
> +volatile unsigned long long b;
> +
> +int
> +main ()
> +{
> +  if (__bextr_u32 (0xffffffffU, 0 | (0 << 8)) != 0
> +      || __bextr_u32 (0xffffffffU, 64 | (16 << 8)) != 0
> +      || __bextr_u32 (0x12345678U, 4 | (10 << 8)) != 0x167
> +      || __bextr_u32 (0xffffffffU, 2 | (255 << 8)) != 0x3fffffff
> +      || __bextr_u32 (0xdeadbeefU, 2 | (64 << 8)) != 0x37ab6fbb
> +      || __bextr_u32 (0xdeadbeefU, 0 | (64 << 8)) != 0xdeadbeefU
> +      || __bextr_u32 (a, 0 | (0 << 8)) != 0
> +      || __bextr_u32 (a, 32 | (16 << 8)) != 0)
> +    link_error ();
> +  if (_bextr_u32 (0xffffffffU, 0, 0) != 0
> +      || _bextr_u32 (0xffffffffU, 64, 16) != 0
> +      || _bextr_u32 (0x12345678U, 4, 10) != 0x167
> +      || _bextr_u32 (0xffffffffU, 2, 255) != 0x3fffffff
> +      || _bextr_u32 (0xdeadbeefU, 2, 64) != 0x37ab6fbb
> +      || _bextr_u32 (0xdeadbeefU, 0, 64) != 0xdeadbeefU
> +      || _bextr_u32 (a, 0, 0) != 0
> +      || _bextr_u32 (a, 32, 16) != 0)
> +    link_error ();
> +#ifdef __x86_64__
> +  if (__bextr_u64 (0xffffffffffffffffUL, 0 | (0 << 8)) != 0
> +      || __bextr_u64 (0xffffffffffffffffUL, 128 | (16 << 8)) != 0
> +      || __bextr_u64 (0x123456789abcdef0UL, 5 | (37 << 8)) != 0x13c4d5e6f7UL
> +      || __bextr_u64 (0xffffffffffffffffUL, 2 | (255 << 8)) != 0x3fffffffffffffffUL
> +      || __bextr_u64 (0xdeadbeefbeefdeadU, 2 | (64 << 8)) != 0x37ab6fbbefbbf7abUL
> +      || __bextr_u64 (0xdeadbeefbeefdeadU, 0 | (64 << 8)) != 0xdeadbeefbeefdeadUL
> +      || __bextr_u64 (b, 0 | (0 << 8)) != 0
> +      || __bextr_u64 (b, 64 | (16 << 8)) != 0)
> +    link_error ();
> +  if (_bextr_u64 (0xffffffffffffffffUL, 0, 0) != 0
> +      || _bextr_u64 (0xffffffffffffffffUL, 128, 16) != 0
> +      || _bextr_u64 (0x123456789abcdef0UL, 5, 37) != 0x13c4d5e6f7UL
> +      || _bextr_u64 (0xffffffffffffffffUL, 2, 255) != 0x3fffffffffffffffUL
> +      || _bextr_u64 (0xdeadbeefbeefdeadUL, 2, 64) != 0x37ab6fbbefbbf7abUL
> +      || _bextr_u64 (0xdeadbeefbeefdeadUL, 0, 64) != 0xdeadbeefbeefdeadUL
> +      || _bextr_u64 (b, 0, 0) != 0
> +      || _bextr_u64 (b, 64, 16) != 0)
> +    link_error ();
> +#endif
> +  return 0;
> +}
>
>         Jakub
Uros Bizjak Oct. 22, 2016, 11:46 a.m. UTC | #2
On Fri, Oct 21, 2016 at 5:37 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Fri, Oct 21, 2016 at 5:26 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>
>> This patch on top of the just posted patch adds folding for a couple more
>> builtins (though, hundreds or thousands of other md builtins remain unfolded
>> even though they actually could be folded for e.g. const arguments).

Just a few words regarding other unfolded builtins. x86 intrinsics
(and consequently builtins) are considered as a convenient way to emit
assembly instructions. So, the same rules as when writing assembly,
although slightly relaxed, should apply there. IMO, compiler
optimizations with intrinsics should be an exception, not the rule. As
an example, __builtin_ctz, __builtin_clz and functionally similar
target-builtins are rather messy w.r.t. "undefinedness", so I think
this fact warrants some help from the compiler. But there is no need
to handle every single builtin - only a competent person that knows
the background of these intrinsics should use them.

Uros.
Jakub Jelinek Oct. 22, 2016, 3:54 p.m. UTC | #3
On Sat, Oct 22, 2016 at 01:46:30PM +0200, Uros Bizjak wrote:
> On Fri, Oct 21, 2016 at 5:37 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
> > On Fri, Oct 21, 2016 at 5:26 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> >
> >> This patch on top of the just posted patch adds folding for a couple more
> >> builtins (though, hundreds or thousands of other md builtins remain unfolded
> >> even though they actually could be folded for e.g. const arguments).
> 
> Just a few words regarding other unfolded builtins. x86 intrinsics
> (and consequently builtins) are considered as a convenient way to emit
> assembly instructions. So, the same rules as when writing assembly,
> although slightly relaxed, should apply there. IMO, compiler
> optimizations with intrinsics should be an exception, not the rule. As
> an example, __builtin_ctz, __builtin_clz and functionally similar
> target-builtins are rather messy w.r.t. "undefinedness", so I think
> this fact warrants some help from the compiler. But there is no need
> to handle every single builtin - only a competent person that knows
> the background of these intrinsics should use them.

Generally constant folding what we can is a good thing, usually people will
not use the intrinsics when they are passing constants directly, but
constants could appear there through inlining and other optimizations.
If we do constant fold the x86 intrinsics, we allow further constant folding
and optimizations down the road.
For various x86 intrinsics we do some constant folding, but only late
(during RTL optimizations), and only if the insn patterns don't contain
UNSPECs.

Besides the BMI/BMI2/TBM/LZCNT intrinsics that are already folded or that I've
posted a patch for, intrinsics that IMHO would be nice to be folded are e.g.
__builtin_ia32_bsr*, __builtin_ia32_ro[rl]*, maybe
__builtin_ia32_{,r}sqrtps*, __builtin_ia32_rcpps, etc.
For __builtin_ia32_addps and the like the question is why we have those
builtins at all, it would be better to just use normal vector arithmetics.
__builtin_ia32_cmp*p[sd], __builtin_ia32_{min,max}[ps][sd] etc. are also
nicely constant foldable, etc.

	Jakub
Marc Glisse Oct. 22, 2016, 5:44 p.m. UTC | #4
On Sat, 22 Oct 2016, Jakub Jelinek wrote:

> On Sat, Oct 22, 2016 at 01:46:30PM +0200, Uros Bizjak wrote:
>> On Fri, Oct 21, 2016 at 5:37 PM, Uros Bizjak <ubizjak@gmail.com> wrote:
>>> On Fri, Oct 21, 2016 at 5:26 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>>>
>>>> This patch on top of the just posted patch adds folding for a couple more
>>>> builtins (though, hundreds or thousands of other md builtins remain unfolded
>>>> even though they actually could be folded for e.g. const arguments).
>>
>> Just a few words regarding other unfolded builtins. x86 intrinsics
>> (and consequently builtins) are considered as a convenient way to emit
>> assembly instructions. So, the same rules as when writing assembly,
>> although slightly relaxed, should apply there. IMO, compiler
>> optimizations with intrinsics should be an exception, not the rule. As
>> an example, __builtin_ctz, __builtin_clz and functionally similar
>> target-builtins are rather messy w.r.t. "undefinedness", so I think
>> this fact warrants some help from the compiler. But there is no need
>> to handle every single builtin - only a competent person that knows
>> the background of these intrinsics should use them.
>
> Generally constant folding what we can is a good thing, usually people will
> not use the intrinsics when they are passing constants directly, but
> constants could appear there through inlining and other optimizations.
> If we do constant fold the x86 intrinsics, we allow further constant folding
> and optimizations down the road.

+1

> For various x86 intrinsics we do some constant folding, but only late
> (during RTL optimizations), and only if the insn patterns don't contain
> UNSPECs.
>
> Besides the BMI/BMI2/TBM/LZCNT intrinsics that are already folded or that I've
> posted a patch for, intrinsics that IMHO would be nice to be folded are e.g.
> __builtin_ia32_bsr*, __builtin_ia32_ro[rl]*, maybe
> __builtin_ia32_{,r}sqrtps*, __builtin_ia32_rcpps, etc.
> For __builtin_ia32_addps and the like the question is why we have those
> builtins at all, it would be better to just use normal vector arithmetics.

Note that we do use operator+ directly in *intrin.h. We only keep the 
builtin __builtin_ia32_addps because ada maintainers asked us to. We could 
lower them to normal vector arithmetics early in gimple, but it doesn't 
seem worth touching them since they are legacy.

> __builtin_ia32_cmp*p[sd], __builtin_ia32_{min,max}[ps][sd] etc. are also
> nicely constant foldable, etc.

I think _mm_cmpeq_pd could use the vector extensions instead of 
__builtin_ia32_cmpeqpd if they were ported from C++ to C, same for a few 
more. Some others which don't have such a close match in the vector 
extensions could still be lowered (in gimple) to vector operations, which 
would allow constant folding as well as other optimizations.
Richard Biener Oct. 24, 2016, 7:34 a.m. UTC | #5
On Fri, 21 Oct 2016, Uros Bizjak wrote:

> On Fri, Oct 21, 2016 at 5:26 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> > Hi!
> >
> > This patch on top of the just posted patch adds folding for a couple more
> > builtins (though, hundreds or thousands of other md builtins remain unfolded
> > even though they actually could be folded for e.g. const arguments).
> >
> > Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
> >
> > 2016-10-21  Jakub Jelinek  <jakub@redhat.com>
> >
> >         * config/i386/i386.c (ix86_fold_builtin): Handle
> >         IX86_BUILTIN_BEXTR{,I}{32,64}, IX86_BUILTIN_BZHI{32,64},
> >         IX86_BUILTIN_PDEP{32,64} and IX86_BUILTIN_PEXT{32,64}.
> >         (ix86_gimple_fold_builtin): Handle IX86_BUILTIN_BZHI{32,64},
> >         IX86_BUILTIN_PDEP{32,64} and IX86_BUILTIN_PEXT{32,64}.
> >
> >         * gcc.target/i386/bmi2-pext-1.c: New test.
> >         * gcc.target/i386/bmi2-pdep-1.c: New test.
> >         * gcc.target/i386/bmi2-bzhi-3.c: New test.
> >         * gcc.target/i386/tbm-bextri-1.c: New test.
> >         * gcc.target/i386/bmi-bextr-6.c: New test.
> 
> I'm not versed in this area, let's ask Richi for a review...
> 
> OK if Richi says so...

Ok.

Thanks,
Richard.
 
> Thanks,
> Uros.
> 
> > --- gcc/config/i386/i386.c.jj   2016-10-21 14:31:21.770818850 +0200
> > +++ gcc/config/i386/i386.c      2016-10-21 14:58:58.897893832 +0200
> > @@ -33369,6 +33369,88 @@ ix86_fold_builtin (tree fndecl, int n_ar
> >             }
> >           break;
> >
> > +       case IX86_BUILTIN_BEXTR32:
> > +       case IX86_BUILTIN_BEXTR64:
> > +       case IX86_BUILTIN_BEXTRI32:
> > +       case IX86_BUILTIN_BEXTRI64:
> > +         gcc_assert (n_args == 2);
> > +         if (tree_fits_uhwi_p (args[1]))
> > +           {
> > +             unsigned HOST_WIDE_INT res = 0;
> > +             unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
> > +             unsigned int start = tree_to_uhwi (args[1]);
> > +             unsigned int len = (start & 0xff00) >> 8;
> > +             start &= 0xff;
> > +             if (start >= prec || len == 0)
> > +               res = 0;
> > +             else if (!tree_fits_uhwi_p (args[0]))
> > +               break;
> > +             else
> > +               res = tree_to_uhwi (args[0]) >> start;
> > +             if (len > prec)
> > +               len = prec;
> > +             if (len < HOST_BITS_PER_WIDE_INT)
> > +               res &= (HOST_WIDE_INT_1U << len) - 1;
> > +             return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
> > +           }
> > +         break;
> > +
> > +       case IX86_BUILTIN_BZHI32:
> > +       case IX86_BUILTIN_BZHI64:
> > +         gcc_assert (n_args == 2);
> > +         if (tree_fits_uhwi_p (args[1]))
> > +           {
> > +             unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
> > +             if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
> > +               return args[0];
> > +             if (!tree_fits_uhwi_p (args[0]))
> > +               break;
> > +             unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
> > +             res &= ~(HOST_WIDE_INT_M1U << idx);
> > +             return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
> > +           }
> > +         break;
> > +
> > +       case IX86_BUILTIN_PDEP32:
> > +       case IX86_BUILTIN_PDEP64:
> > +         gcc_assert (n_args == 2);
> > +         if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
> > +           {
> > +             unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
> > +             unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
> > +             unsigned HOST_WIDE_INT res = 0;
> > +             unsigned HOST_WIDE_INT m, k = 1;
> > +             for (m = 1; m; m <<= 1)
> > +               if ((mask & m) != 0)
> > +                 {
> > +                   if ((src & k) != 0)
> > +                     res |= m;
> > +                   k <<= 1;
> > +                 }
> > +             return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
> > +           }
> > +         break;
> > +
> > +       case IX86_BUILTIN_PEXT32:
> > +       case IX86_BUILTIN_PEXT64:
> > +         gcc_assert (n_args == 2);
> > +         if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
> > +           {
> > +             unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
> > +             unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
> > +             unsigned HOST_WIDE_INT res = 0;
> > +             unsigned HOST_WIDE_INT m, k = 1;
> > +             for (m = 1; m; m <<= 1)
> > +               if ((mask & m) != 0)
> > +                 {
> > +                   if ((src & m) != 0)
> > +                     res |= k;
> > +                   k <<= 1;
> > +                 }
> > +             return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
> > +           }
> > +         break;
> > +
> >         default:
> >           break;
> >         }
> > @@ -33393,7 +33475,7 @@ ix86_gimple_fold_builtin (gimple_stmt_it
> >    int n_args = gimple_call_num_args (stmt);
> >    enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
> >    tree decl = NULL_TREE;
> > -  tree arg0;
> > +  tree arg0, arg1;
> >
> >    switch (fn_code)
> >      {
> > @@ -33432,6 +33514,41 @@ ix86_gimple_fold_builtin (gimple_stmt_it
> >           gimple_set_location (g, loc);
> >           gsi_replace (gsi, g, true);
> >           return true;
> > +       }
> > +      break;
> > +
> > +    case IX86_BUILTIN_BZHI32:
> > +    case IX86_BUILTIN_BZHI64:
> > +      gcc_assert (n_args == 2);
> > +      arg1 = gimple_call_arg (stmt, 1);
> > +      if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
> > +       {
> > +         unsigned int idx = tree_to_uhwi (arg1) & 0xff;
> > +         arg0 = gimple_call_arg (stmt, 0);
> > +         if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
> > +           break;
> > +         location_t loc = gimple_location (stmt);
> > +         gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
> > +         gimple_set_location (g, loc);
> > +         gsi_replace (gsi, g, true);
> > +         return true;
> > +       }
> > +      break;
> > +
> > +    case IX86_BUILTIN_PDEP32:
> > +    case IX86_BUILTIN_PDEP64:
> > +    case IX86_BUILTIN_PEXT32:
> > +    case IX86_BUILTIN_PEXT64:
> > +      gcc_assert (n_args == 2);
> > +      arg1 = gimple_call_arg (stmt, 1);
> > +      if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
> > +       {
> > +         location_t loc = gimple_location (stmt);
> > +         arg0 = gimple_call_arg (stmt, 0);
> > +         gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
> > +         gimple_set_location (g, loc);
> > +         gsi_replace (gsi, g, true);
> > +         return true;
> >         }
> >        break;
> >
> > --- gcc/testsuite/gcc.target/i386/bmi2-pext-1.c.jj      2016-10-21 15:09:43.568733192 +0200
> > +++ gcc/testsuite/gcc.target/i386/bmi2-pext-1.c 2016-10-21 15:09:33.000000000 +0200
> > @@ -0,0 +1,33 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mbmi2 -fdump-tree-optimized" } */
> > +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +extern void link_error (void);
> > +
> > +unsigned int a;
> > +unsigned long long b;
> > +
> > +int
> > +main ()
> > +{
> > +  asm volatile ("" : : "g" (&a), "g" (&b) : "memory");
> > +  if (_pext_u32 (0xabcdef98, 0xffff0000) != 0xabcd
> > +      || _pext_u32 (0xabcdef98, 0xffffff00) != 0xabcdef
> > +      || _pext_u32 (0xabcdef98, 0x0f0f0f0f) != 0xbdf8
> > +      || _pext_u32 (0xabcdef98, 0xff0fff0f) != 0xabdef8
> > +      || _pext_u32 (0xabcdef98, 0x000fffff) != 0xdef98
> > +      || _pext_u32 (a, 0xffffffff) != a)
> > +    link_error ();
> > +#ifdef __x86_64__
> > +  if (_pext_u64 (0xabcdef9876543210UL, 0xffffffff00000000UL) != 0xabcdef98UL
> > +      || _pext_u64 (0xabcdef9876543210UL, 0xffffffffffffff00UL) != 0xabcdef98765432UL
> > +      || _pext_u64 (0xabcdef9876543210UL, 0x0f0f0f0f0f0f0f0fUL) != 0xbdf86420UL
> > +      || _pext_u64 (0xabcdef9876543210UL, 0xff0fff0fff0fff0fUL) != 0xabdef8764320UL
> > +      || _pext_u64 (0xabcdef9876543210UL, 0x00000000000fffffUL) != 0x43210UL
> > +      || _pext_u64 (b, 0xffffffffffffffffUL) != b)
> > +    link_error ();
> > +#endif
> > +  return 0;
> > +}
> > --- gcc/testsuite/gcc.target/i386/bmi2-pdep-1.c.jj      2016-10-21 15:18:07.611358728 +0200
> > +++ gcc/testsuite/gcc.target/i386/bmi2-pdep-1.c 2016-10-21 15:18:00.000000000 +0200
> > @@ -0,0 +1,33 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mbmi2 -fdump-tree-optimized" } */
> > +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +extern void link_error (void);
> > +
> > +unsigned int a;
> > +unsigned long long b;
> > +
> > +int
> > +main ()
> > +{
> > +  asm volatile ("" : : "g" (&a), "g" (&b) : "memory");
> > +  if (_pdep_u32 (0xabcdef98, 0xffff0000) != 0xef980000
> > +      || _pdep_u32 (0xabcdef98, 0xffffff00) != 0xcdef9800
> > +      || _pdep_u32 (0xabcdef98, 0x0f0f0f0f) != 0x0e0f0908
> > +      || _pdep_u32 (0xabcdef98, 0xff0fff0f) != 0xcd0ef908
> > +      || _pdep_u32 (0xabcdef98, 0x000fffff) != 0xdef98
> > +      || _pdep_u32 (a, 0xffffffff) != a)
> > +    link_error ();
> > +#ifdef __x86_64__
> > +  if (_pdep_u64 (0xabcdef9876543210UL, 0xffffffff00000000UL) != 0x7654321000000000UL
> > +      || _pdep_u64 (0xabcdef9876543210UL, 0xffffffffffffff00UL) != 0xcdef987654321000UL
> > +      || _pdep_u64 (0xabcdef9876543210UL, 0x0f0f0f0f0f0f0f0fUL) != 0x0706050403020100UL
> > +      || _pdep_u64 (0xabcdef9876543210UL, 0xff0fff0fff0fff0fUL) != 0xef09870654032100UL
> > +      || _pdep_u64 (0xabcdef9876543210UL, 0x00000000000fffffUL) != 0x43210UL
> > +      || _pdep_u64 (b, 0xffffffffffffffffUL) != b)
> > +    link_error ();
> > +#endif
> > +  return 0;
> > +}
> > --- gcc/testsuite/gcc.target/i386/bmi2-bzhi-3.c.jj      2016-10-21 13:18:06.844209990 +0200
> > +++ gcc/testsuite/gcc.target/i386/bmi2-bzhi-3.c 2016-10-21 14:42:00.177759205 +0200
> > @@ -0,0 +1,77 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mbmi2 -fdump-tree-optimized" } */
> > +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +extern void link_error (void);
> > +unsigned int a;
> > +unsigned long long b;
> > +
> > +static inline unsigned int f1 (void) { return _bzhi_u32 (a, 0); }
> > +static inline unsigned int f2 (unsigned int x) { return _bzhi_u32 (x, 0); }
> > +static inline unsigned int f3 (void) { return _bzhi_u32 (a, 5); }
> > +static inline unsigned int f4 (unsigned int x) { return _bzhi_u32 (x, 5); }
> > +static inline unsigned int f5 (void) { return _bzhi_u32 (a, 31); }
> > +static inline unsigned int f6 (unsigned int x) { return _bzhi_u32 (x, 31); }
> > +static inline unsigned int f7 (void) { return _bzhi_u32 (a, 32); }
> > +static inline unsigned int f8 (unsigned int x) { return _bzhi_u32 (x, 32); }
> > +static inline unsigned int f9 (void) { return _bzhi_u32 (a, 37); }
> > +static inline unsigned int f10 (unsigned int x) { return _bzhi_u32 (x, 37); }
> > +static inline unsigned int f11 (void) { return _bzhi_u32 (a, 257); }
> > +static inline unsigned int f12 (unsigned int x) { return _bzhi_u32 (x, 257); }
> > +static inline unsigned int f13 (void) { return _bzhi_u32 (a, 289); }
> > +static inline unsigned int f14 (unsigned int x) { return _bzhi_u32 (x, 289); }
> > +#ifdef __x86_64__
> > +static inline unsigned long long f21 (void) { return _bzhi_u64 (b, 0); }
> > +static inline unsigned long long f22 (unsigned long long x) { return _bzhi_u64 (x, 0); }
> > +static inline unsigned long long f23 (void) { return _bzhi_u64 (b, 5); }
> > +static inline unsigned long long f24 (unsigned long long x) { return _bzhi_u64 (x, 5); }
> > +static inline unsigned long long f25 (void) { return _bzhi_u64 (b, 63); }
> > +static inline unsigned long long f26 (unsigned long long x) { return _bzhi_u64 (x, 63); }
> > +static inline unsigned long long f27 (void) { return _bzhi_u64 (b, 64); }
> > +static inline unsigned long long f28 (unsigned long long x) { return _bzhi_u64 (x, 64); }
> > +static inline unsigned long long f29 (void) { return _bzhi_u64 (b, 69); }
> > +static inline unsigned long long f30 (unsigned long long x) { return _bzhi_u64 (x, 69); }
> > +static inline unsigned long long f31 (void) { return _bzhi_u64 (b, 257); }
> > +static inline unsigned long long f32 (unsigned long long x) { return _bzhi_u64 (x, 257); }
> > +static inline unsigned long long f33 (void) { return _bzhi_u64 (b, 321); }
> > +static inline unsigned long long f34 (unsigned long long x) { return _bzhi_u64 (x, 321); }
> > +#endif
> > +
> > +unsigned int c;
> > +unsigned long long d;
> > +
> > +int
> > +main ()
> > +{
> > +  asm volatile ("" : : "g" (&c), "g" (&d) : "memory");
> > +  a = -1U;
> > +  b = -1ULL;
> > +  if (f1 () != 0 || f2 (-1U) != 0
> > +      || f3 () != 0x1f || f4 (-1U) != 0x1f
> > +      || f5 () != 0x7fffffffU || f6 (-1U) != 0x7fffffffU
> > +      || f7 () != -1U || f8 (-1U) != -1U
> > +      || f9 () != -1U || f10 (-1U) != -1U
> > +      || f11 () != 1 || f12 (-1U) != 1
> > +      || f13 () != -1U || f14 (-1U) != -1U)
> > +    link_error ();
> > +  if (_bzhi_u32 (c, 32) != c
> > +      || _bzhi_u32 (c, 64) != c
> > +      || _bzhi_u32 (c, 255) != c)
> > +    link_error ();
> > +#ifdef __x86_64__
> > +  if (f21 () != 0 || f22 (-1ULL) != 0
> > +      || f23 () != 0x1f || f24 (-1ULL) != 0x1f
> > +      || f25 () != 0x7fffffffffffffffULL || f26 (-1ULL) != 0x7fffffffffffffffULL
> > +      || f27 () != -1ULL || f28 (-1ULL) != -1ULL
> > +      || f29 () != -1ULL || f30 (-1ULL) != -1ULL
> > +      || f31 () != 1 || f32 (-1ULL) != 1
> > +      || f33 () != -1ULL || f34 (-1ULL) != -1ULL)
> > +    link_error ();
> > +  if (_bzhi_u64 (d, 64) != d
> > +      || _bzhi_u64 (d, 255) != d)
> > +    link_error ();
> > +#endif
> > +  return 0;
> > +}
> > --- gcc/testsuite/gcc.target/i386/tbm-bextri-1.c.jj     2016-10-21 14:46:06.632648404 +0200
> > +++ gcc/testsuite/gcc.target/i386/tbm-bextri-1.c        2016-10-21 14:45:58.000000000 +0200
> > @@ -0,0 +1,36 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mtbm -fdump-tree-optimized" } */
> > +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +extern void link_error (void);
> > +
> > +volatile unsigned int a;
> > +volatile unsigned long long b;
> > +
> > +int
> > +main ()
> > +{
> > +  if (__bextri_u32 (0xffffffffU, 0 | (0 << 8)) != 0
> > +      || __bextri_u32 (0xffffffffU, 64 | (16 << 8)) != 0
> > +      || __bextri_u32 (0x12345678U, 4 | (10 << 8)) != 0x167
> > +      || __bextri_u32 (0xffffffffU, 2 | (255 << 8)) != 0x3fffffff
> > +      || __bextri_u32 (0xdeadbeefU, 2 | (64 << 8)) != 0x37ab6fbb
> > +      || __bextri_u32 (0xdeadbeefU, 0 | (64 << 8)) != 0xdeadbeefU
> > +      || __bextri_u32 (a, 0 | (0 << 8)) != 0
> > +      || __bextri_u32 (a, 32 | (16 << 8)) != 0)
> > +    link_error ();
> > +#ifdef __x86_64__
> > +  if (__bextri_u64 (0xffffffffffffffffUL, 0 | (0 << 8)) != 0
> > +      || __bextri_u64 (0xffffffffffffffffUL, 128 | (16 << 8)) != 0
> > +      || __bextri_u64 (0x123456789abcdef0UL, 5 | (37 << 8)) != 0x13c4d5e6f7UL
> > +      || __bextri_u64 (0xffffffffffffffffUL, 2 | (255 << 8)) != 0x3fffffffffffffffUL
> > +      || __bextri_u64 (0xdeadbeefbeefdeadU, 2 | (64 << 8)) != 0x37ab6fbbefbbf7abUL
> > +      || __bextri_u64 (0xdeadbeefbeefdeadU, 0 | (64 << 8)) != 0xdeadbeefbeefdeadUL
> > +      || __bextri_u64 (b, 0 | (0 << 8)) != 0
> > +      || __bextri_u64 (b, 64 | (16 << 8)) != 0)
> > +    link_error ();
> > +#endif
> > +  return 0;
> > +}
> > --- gcc/testsuite/gcc.target/i386/bmi-bextr-6.c.jj      2016-10-21 14:06:12.533953786 +0200
> > +++ gcc/testsuite/gcc.target/i386/bmi-bextr-6.c 2016-10-21 14:44:53.358573284 +0200
> > @@ -0,0 +1,54 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2 -mbmi -fdump-tree-optimized" } */
> > +/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
> > +
> > +#include <x86intrin.h>
> > +
> > +extern void link_error (void);
> > +
> > +volatile unsigned int a;
> > +volatile unsigned long long b;
> > +
> > +int
> > +main ()
> > +{
> > +  if (__bextr_u32 (0xffffffffU, 0 | (0 << 8)) != 0
> > +      || __bextr_u32 (0xffffffffU, 64 | (16 << 8)) != 0
> > +      || __bextr_u32 (0x12345678U, 4 | (10 << 8)) != 0x167
> > +      || __bextr_u32 (0xffffffffU, 2 | (255 << 8)) != 0x3fffffff
> > +      || __bextr_u32 (0xdeadbeefU, 2 | (64 << 8)) != 0x37ab6fbb
> > +      || __bextr_u32 (0xdeadbeefU, 0 | (64 << 8)) != 0xdeadbeefU
> > +      || __bextr_u32 (a, 0 | (0 << 8)) != 0
> > +      || __bextr_u32 (a, 32 | (16 << 8)) != 0)
> > +    link_error ();
> > +  if (_bextr_u32 (0xffffffffU, 0, 0) != 0
> > +      || _bextr_u32 (0xffffffffU, 64, 16) != 0
> > +      || _bextr_u32 (0x12345678U, 4, 10) != 0x167
> > +      || _bextr_u32 (0xffffffffU, 2, 255) != 0x3fffffff
> > +      || _bextr_u32 (0xdeadbeefU, 2, 64) != 0x37ab6fbb
> > +      || _bextr_u32 (0xdeadbeefU, 0, 64) != 0xdeadbeefU
> > +      || _bextr_u32 (a, 0, 0) != 0
> > +      || _bextr_u32 (a, 32, 16) != 0)
> > +    link_error ();
> > +#ifdef __x86_64__
> > +  if (__bextr_u64 (0xffffffffffffffffUL, 0 | (0 << 8)) != 0
> > +      || __bextr_u64 (0xffffffffffffffffUL, 128 | (16 << 8)) != 0
> > +      || __bextr_u64 (0x123456789abcdef0UL, 5 | (37 << 8)) != 0x13c4d5e6f7UL
> > +      || __bextr_u64 (0xffffffffffffffffUL, 2 | (255 << 8)) != 0x3fffffffffffffffUL
> > +      || __bextr_u64 (0xdeadbeefbeefdeadU, 2 | (64 << 8)) != 0x37ab6fbbefbbf7abUL
> > +      || __bextr_u64 (0xdeadbeefbeefdeadU, 0 | (64 << 8)) != 0xdeadbeefbeefdeadUL
> > +      || __bextr_u64 (b, 0 | (0 << 8)) != 0
> > +      || __bextr_u64 (b, 64 | (16 << 8)) != 0)
> > +    link_error ();
> > +  if (_bextr_u64 (0xffffffffffffffffUL, 0, 0) != 0
> > +      || _bextr_u64 (0xffffffffffffffffUL, 128, 16) != 0
> > +      || _bextr_u64 (0x123456789abcdef0UL, 5, 37) != 0x13c4d5e6f7UL
> > +      || _bextr_u64 (0xffffffffffffffffUL, 2, 255) != 0x3fffffffffffffffUL
> > +      || _bextr_u64 (0xdeadbeefbeefdeadUL, 2, 64) != 0x37ab6fbbefbbf7abUL
> > +      || _bextr_u64 (0xdeadbeefbeefdeadUL, 0, 64) != 0xdeadbeefbeefdeadUL
> > +      || _bextr_u64 (b, 0, 0) != 0
> > +      || _bextr_u64 (b, 64, 16) != 0)
> > +    link_error ();
> > +#endif
> > +  return 0;
> > +}
> >
> >         Jakub
> 
>
diff mbox

Patch

--- gcc/config/i386/i386.c.jj	2016-10-21 14:31:21.770818850 +0200
+++ gcc/config/i386/i386.c	2016-10-21 14:58:58.897893832 +0200
@@ -33369,6 +33369,88 @@  ix86_fold_builtin (tree fndecl, int n_ar
 	    }
 	  break;
 
+	case IX86_BUILTIN_BEXTR32:
+	case IX86_BUILTIN_BEXTR64:
+	case IX86_BUILTIN_BEXTRI32:
+	case IX86_BUILTIN_BEXTRI64:
+	  gcc_assert (n_args == 2);  /* args[0]: source, args[1]: control.  */
+	  if (tree_fits_uhwi_p (args[1]))  /* Fold for a constant control.  */
+	    {
+	      unsigned HOST_WIDE_INT res = 0;
+	      unsigned int prec = TYPE_PRECISION (TREE_TYPE (args[0]));
+	      unsigned int start = tree_to_uhwi (args[1]);
+	      unsigned int len = (start & 0xff00) >> 8;  /* Bits 8-15.  */
+	      start &= 0xff;  /* Bits 0-7.  */
+	      if (start >= prec || len == 0)
+		res = 0;  /* Empty field, whatever args[0] is.  */
+	      else if (!tree_fits_uhwi_p (args[0]))
+		break;  /* args[0] is not a usable constant.  */
+	      else
+		res = tree_to_uhwi (args[0]) >> start;
+	      if (len > prec)
+		len = prec;
+	      if (len < HOST_BITS_PER_WIDE_INT)
+		res &= (HOST_WIDE_INT_1U << len) - 1;  /* Keep LEN bits.  */
+	      return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
+	    }
+	  break;
+	/* BZHI: zero the bits of args[0] at positions >= (args[1] & 0xff).  */
+	case IX86_BUILTIN_BZHI32:
+	case IX86_BUILTIN_BZHI64:
+	  gcc_assert (n_args == 2);
+	  if (tree_fits_uhwi_p (args[1]))
+	    {
+	      unsigned int idx = tree_to_uhwi (args[1]) & 0xff;
+	      if (idx >= TYPE_PRECISION (TREE_TYPE (args[0])))
+		return args[0];  /* Nothing to clear, even if not constant.  */
+	      if (!tree_fits_uhwi_p (args[0]))
+		break;
+	      unsigned HOST_WIDE_INT res = tree_to_uhwi (args[0]);
+	      res &= ~(HOST_WIDE_INT_M1U << idx);  /* Keep bits below IDX.  */
+	      return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
+	    }
+	  break;
+	/* PDEP: scatter low bits of args[0] to the set bits of args[1].  */
+	case IX86_BUILTIN_PDEP32:
+	case IX86_BUILTIN_PDEP64:
+	  gcc_assert (n_args == 2);
+	  if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
+	    {
+	      unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
+	      unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
+	      unsigned HOST_WIDE_INT res = 0;
+	      unsigned HOST_WIDE_INT m, k = 1;
+	      for (m = 1; m; m <<= 1)  /* M: mask bit, K: next source bit.  */
+		if ((mask & m) != 0)
+		  {
+		    if ((src & k) != 0)
+		      res |= m;
+		    k <<= 1;  /* Consumed one source bit.  */
+		  }
+	      return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
+	    }
+	  break;
+	/* PEXT: pack the args[0] bits selected by args[1] into low bits.  */
+	case IX86_BUILTIN_PEXT32:
+	case IX86_BUILTIN_PEXT64:
+	  gcc_assert (n_args == 2);
+	  if (tree_fits_uhwi_p (args[0]) && tree_fits_uhwi_p (args[1]))
+	    {
+	      unsigned HOST_WIDE_INT src = tree_to_uhwi (args[0]);
+	      unsigned HOST_WIDE_INT mask = tree_to_uhwi (args[1]);
+	      unsigned HOST_WIDE_INT res = 0;
+	      unsigned HOST_WIDE_INT m, k = 1;
+	      for (m = 1; m; m <<= 1)  /* M: mask bit, K: next result bit.  */
+		if ((mask & m) != 0)
+		  {
+		    if ((src & m) != 0)
+		      res |= k;
+		    k <<= 1;  /* Produced one result bit.  */
+		  }
+	      return build_int_cstu (TREE_TYPE (TREE_TYPE (fndecl)), res);
+	    }
+	  break;
+
 	default:
 	  break;
 	}
@@ -33393,7 +33475,7 @@  ix86_gimple_fold_builtin (gimple_stmt_it
   int n_args = gimple_call_num_args (stmt);
   enum ix86_builtins fn_code = (enum ix86_builtins) DECL_FUNCTION_CODE (fndecl);
   tree decl = NULL_TREE;
-  tree arg0;
+  tree arg0, arg1;
 
   switch (fn_code)
     {
@@ -33432,6 +33514,41 @@  ix86_gimple_fold_builtin (gimple_stmt_it
 	  gimple_set_location (g, loc);
 	  gsi_replace (gsi, g, true);
 	  return true;
+	}
+      break;
+    /* BZHI with a constant index >= the precision just copies arg0.  */
+    case IX86_BUILTIN_BZHI32:
+    case IX86_BUILTIN_BZHI64:
+      gcc_assert (n_args == 2);
+      arg1 = gimple_call_arg (stmt, 1);
+      if (tree_fits_uhwi_p (arg1) && gimple_call_lhs (stmt))
+	{
+	  unsigned int idx = tree_to_uhwi (arg1) & 0xff;
+	  arg0 = gimple_call_arg (stmt, 0);
+	  if (idx < TYPE_PRECISION (TREE_TYPE (arg0)))
+	    break;  /* Some bits would be cleared.  */
+	  location_t loc = gimple_location (stmt);
+	  gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
+	  gimple_set_location (g, loc);
+	  gsi_replace (gsi, g, true);  /* Install LHS = ARG0.  */
+	  return true;
+	}
+      break;
+    /* PDEP/PEXT with an all-ones mask just copies arg0.  */
+    case IX86_BUILTIN_PDEP32:
+    case IX86_BUILTIN_PDEP64:
+    case IX86_BUILTIN_PEXT32:
+    case IX86_BUILTIN_PEXT64:
+      gcc_assert (n_args == 2);
+      arg1 = gimple_call_arg (stmt, 1);
+      if (integer_all_onesp (arg1) && gimple_call_lhs (stmt))
+	{
+	  location_t loc = gimple_location (stmt);
+	  arg0 = gimple_call_arg (stmt, 0);
+	  gimple *g = gimple_build_assign (gimple_call_lhs (stmt), arg0);
+	  gimple_set_location (g, loc);
+	  gsi_replace (gsi, g, true);  /* Install LHS = ARG0.  */
+	  return true;
 	}
       break;
 
--- gcc/testsuite/gcc.target/i386/bmi2-pext-1.c.jj	2016-10-21 15:09:43.568733192 +0200
+++ gcc/testsuite/gcc.target/i386/bmi2-pext-1.c	2016-10-21 15:09:33.000000000 +0200
@@ -0,0 +1,33 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
+
+#include <x86intrin.h>
+
+extern void link_error (void);
+
+unsigned int a;
+unsigned long long b;
+/* Each comparison must fold so that no link_error call is left.  */
+int
+main ()
+{
+  asm volatile ("" : : "g" (&a), "g" (&b) : "memory");
+  if (_pext_u32 (0xabcdef98, 0xffff0000) != 0xabcd
+      || _pext_u32 (0xabcdef98, 0xffffff00) != 0xabcdef
+      || _pext_u32 (0xabcdef98, 0x0f0f0f0f) != 0xbdf8
+      || _pext_u32 (0xabcdef98, 0xff0fff0f) != 0xabdef8
+      || _pext_u32 (0xabcdef98, 0x000fffff) != 0xdef98
+      || _pext_u32 (a, 0xffffffff) != a)  /* All-ones mask: identity.  */
+    link_error ();
+#ifdef __x86_64__  /* _pext_u64 checks.  */
+  if (_pext_u64 (0xabcdef9876543210UL, 0xffffffff00000000UL) != 0xabcdef98UL
+      || _pext_u64 (0xabcdef9876543210UL, 0xffffffffffffff00UL) != 0xabcdef98765432UL
+      || _pext_u64 (0xabcdef9876543210UL, 0x0f0f0f0f0f0f0f0fUL) != 0xbdf86420UL
+      || _pext_u64 (0xabcdef9876543210UL, 0xff0fff0fff0fff0fUL) != 0xabdef8764320UL
+      || _pext_u64 (0xabcdef9876543210UL, 0x00000000000fffffUL) != 0x43210UL
+      || _pext_u64 (b, 0xffffffffffffffffUL) != b)  /* Likewise.  */
+    link_error ();
+#endif
+  return 0;
+}
--- gcc/testsuite/gcc.target/i386/bmi2-pdep-1.c.jj	2016-10-21 15:18:07.611358728 +0200
+++ gcc/testsuite/gcc.target/i386/bmi2-pdep-1.c	2016-10-21 15:18:00.000000000 +0200
@@ -0,0 +1,33 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
+
+#include <x86intrin.h>
+
+extern void link_error (void);
+
+unsigned int a;
+unsigned long long b;
+/* Each comparison must fold so that no link_error call is left.  */
+int
+main ()
+{
+  asm volatile ("" : : "g" (&a), "g" (&b) : "memory");
+  if (_pdep_u32 (0xabcdef98, 0xffff0000) != 0xef980000
+      || _pdep_u32 (0xabcdef98, 0xffffff00) != 0xcdef9800
+      || _pdep_u32 (0xabcdef98, 0x0f0f0f0f) != 0x0e0f0908
+      || _pdep_u32 (0xabcdef98, 0xff0fff0f) != 0xcd0ef908
+      || _pdep_u32 (0xabcdef98, 0x000fffff) != 0xdef98
+      || _pdep_u32 (a, 0xffffffff) != a)  /* All-ones mask: identity.  */
+    link_error ();
+#ifdef __x86_64__  /* _pdep_u64 checks.  */
+  if (_pdep_u64 (0xabcdef9876543210UL, 0xffffffff00000000UL) != 0x7654321000000000UL
+      || _pdep_u64 (0xabcdef9876543210UL, 0xffffffffffffff00UL) != 0xcdef987654321000UL
+      || _pdep_u64 (0xabcdef9876543210UL, 0x0f0f0f0f0f0f0f0fUL) != 0x0706050403020100UL
+      || _pdep_u64 (0xabcdef9876543210UL, 0xff0fff0fff0fff0fUL) != 0xef09870654032100UL
+      || _pdep_u64 (0xabcdef9876543210UL, 0x00000000000fffffUL) != 0x43210UL
+      || _pdep_u64 (b, 0xffffffffffffffffUL) != b)  /* Likewise.  */
+    link_error ();
+#endif
+  return 0;
+}
--- gcc/testsuite/gcc.target/i386/bmi2-bzhi-3.c.jj	2016-10-21 13:18:06.844209990 +0200
+++ gcc/testsuite/gcc.target/i386/bmi2-bzhi-3.c	2016-10-21 14:42:00.177759205 +0200
@@ -0,0 +1,77 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
+
+#include <x86intrin.h>
+
+extern void link_error (void);
+unsigned int a;
+unsigned long long b;
+/* Odd helpers read a or b; even helpers take the value as argument X.  */
+static inline unsigned int f1 (void) { return _bzhi_u32 (a, 0); }
+static inline unsigned int f2 (unsigned int x) { return _bzhi_u32 (x, 0); }
+static inline unsigned int f3 (void) { return _bzhi_u32 (a, 5); }
+static inline unsigned int f4 (unsigned int x) { return _bzhi_u32 (x, 5); }
+static inline unsigned int f5 (void) { return _bzhi_u32 (a, 31); }
+static inline unsigned int f6 (unsigned int x) { return _bzhi_u32 (x, 31); }
+static inline unsigned int f7 (void) { return _bzhi_u32 (a, 32); }
+static inline unsigned int f8 (unsigned int x) { return _bzhi_u32 (x, 32); }
+static inline unsigned int f9 (void) { return _bzhi_u32 (a, 37); }
+static inline unsigned int f10 (unsigned int x) { return _bzhi_u32 (x, 37); }
+static inline unsigned int f11 (void) { return _bzhi_u32 (a, 257); }
+static inline unsigned int f12 (unsigned int x) { return _bzhi_u32 (x, 257); }
+static inline unsigned int f13 (void) { return _bzhi_u32 (a, 289); }
+static inline unsigned int f14 (unsigned int x) { return _bzhi_u32 (x, 289); }
+#ifdef __x86_64__  /* _bzhi_u64 counterparts of f1..f14.  */
+static inline unsigned long long f21 (void) { return _bzhi_u64 (b, 0); }
+static inline unsigned long long f22 (unsigned long long x) { return _bzhi_u64 (x, 0); }
+static inline unsigned long long f23 (void) { return _bzhi_u64 (b, 5); }
+static inline unsigned long long f24 (unsigned long long x) { return _bzhi_u64 (x, 5); }
+static inline unsigned long long f25 (void) { return _bzhi_u64 (b, 63); }
+static inline unsigned long long f26 (unsigned long long x) { return _bzhi_u64 (x, 63); }
+static inline unsigned long long f27 (void) { return _bzhi_u64 (b, 64); }
+static inline unsigned long long f28 (unsigned long long x) { return _bzhi_u64 (x, 64); }
+static inline unsigned long long f29 (void) { return _bzhi_u64 (b, 69); }
+static inline unsigned long long f30 (unsigned long long x) { return _bzhi_u64 (x, 69); }
+static inline unsigned long long f31 (void) { return _bzhi_u64 (b, 257); }
+static inline unsigned long long f32 (unsigned long long x) { return _bzhi_u64 (x, 257); }
+static inline unsigned long long f33 (void) { return _bzhi_u64 (b, 321); }
+static inline unsigned long long f34 (unsigned long long x) { return _bzhi_u64 (x, 321); }
+#endif
+
+unsigned int c;
+unsigned long long d;
+/* With all-ones inputs each result is known; no link_error may remain.  */
+int
+main ()
+{
+  asm volatile ("" : : "g" (&c), "g" (&d) : "memory");
+  a = -1U;
+  b = -1ULL;
+  if (f1 () != 0 || f2 (-1U) != 0
+      || f3 () != 0x1f || f4 (-1U) != 0x1f
+      || f5 () != 0x7fffffffU || f6 (-1U) != 0x7fffffffU
+      || f7 () != -1U || f8 (-1U) != -1U
+      || f9 () != -1U || f10 (-1U) != -1U
+      || f11 () != 1 || f12 (-1U) != 1  /* Index 257 acts as 1.  */
+      || f13 () != -1U || f14 (-1U) != -1U)  /* 289 acts as 33.  */
+    link_error ();
+  if (_bzhi_u32 (c, 32) != c  /* Index >= 32: identity for any c.  */
+      || _bzhi_u32 (c, 64) != c
+      || _bzhi_u32 (c, 255) != c)
+    link_error ();
+#ifdef __x86_64__
+  if (f21 () != 0 || f22 (-1ULL) != 0
+      || f23 () != 0x1f || f24 (-1ULL) != 0x1f
+      || f25 () != 0x7fffffffffffffffULL || f26 (-1ULL) != 0x7fffffffffffffffULL
+      || f27 () != -1ULL || f28 (-1ULL) != -1ULL
+      || f29 () != -1ULL || f30 (-1ULL) != -1ULL
+      || f31 () != 1 || f32 (-1ULL) != 1  /* Index 257 acts as 1.  */
+      || f33 () != -1ULL || f34 (-1ULL) != -1ULL)  /* 321 acts as 65.  */
+    link_error ();
+  if (_bzhi_u64 (d, 64) != d  /* Index >= 64: identity for any d.  */
+      || _bzhi_u64 (d, 255) != d)
+    link_error ();
+#endif
+  return 0;
+}
--- gcc/testsuite/gcc.target/i386/tbm-bextri-1.c.jj	2016-10-21 14:46:06.632648404 +0200
+++ gcc/testsuite/gcc.target/i386/tbm-bextri-1.c	2016-10-21 14:45:58.000000000 +0200
@@ -0,0 +1,36 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtbm -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
+
+#include <x86intrin.h>
+
+extern void link_error (void);
+
+volatile unsigned int a;
+volatile unsigned long long b;
+/* Control is START | (LENGTH << 8); no link_error call may remain.  */
+int
+main ()
+{
+  if (__bextri_u32 (0xffffffffU, 0 | (0 << 8)) != 0
+      || __bextri_u32 (0xffffffffU, 64 | (16 << 8)) != 0
+      || __bextri_u32 (0x12345678U, 4 | (10 << 8)) != 0x167
+      || __bextri_u32 (0xffffffffU, 2 | (255 << 8)) != 0x3fffffff
+      || __bextri_u32 (0xdeadbeefU, 2 | (64 << 8)) != 0x37ab6fbb
+      || __bextri_u32 (0xdeadbeefU, 0 | (64 << 8)) != 0xdeadbeefU
+      || __bextri_u32 (a, 0 | (0 << 8)) != 0  /* Length 0.  */
+      || __bextri_u32 (a, 32 | (16 << 8)) != 0)  /* Start >= 32.  */
+    link_error ();
+#ifdef __x86_64__
+  if (__bextri_u64 (0xffffffffffffffffUL, 0 | (0 << 8)) != 0
+      || __bextri_u64 (0xffffffffffffffffUL, 128 | (16 << 8)) != 0
+      || __bextri_u64 (0x123456789abcdef0UL, 5 | (37 << 8)) != 0x13c4d5e6f7UL
+      || __bextri_u64 (0xffffffffffffffffUL, 2 | (255 << 8)) != 0x3fffffffffffffffUL
+      || __bextri_u64 (0xdeadbeefbeefdeadU, 2 | (64 << 8)) != 0x37ab6fbbefbbf7abUL
+      || __bextri_u64 (0xdeadbeefbeefdeadU, 0 | (64 << 8)) != 0xdeadbeefbeefdeadUL
+      || __bextri_u64 (b, 0 | (0 << 8)) != 0  /* Length 0.  */
+      || __bextri_u64 (b, 64 | (16 << 8)) != 0)  /* Start >= 64.  */
+    link_error ();
+#endif
+  return 0;
+}
--- gcc/testsuite/gcc.target/i386/bmi-bextr-6.c.jj	2016-10-21 14:06:12.533953786 +0200
+++ gcc/testsuite/gcc.target/i386/bmi-bextr-6.c	2016-10-21 14:44:53.358573284 +0200
@@ -0,0 +1,54 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mbmi -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-not "link_error" "optimized" } } */
+
+#include <x86intrin.h>
+
+extern void link_error (void);
+
+volatile unsigned int a;
+volatile unsigned long long b;
+/* Checks __bextr_* (packed control) and _bextr_* (start, length) forms.  */
+int
+main ()
+{
+  if (__bextr_u32 (0xffffffffU, 0 | (0 << 8)) != 0
+      || __bextr_u32 (0xffffffffU, 64 | (16 << 8)) != 0
+      || __bextr_u32 (0x12345678U, 4 | (10 << 8)) != 0x167
+      || __bextr_u32 (0xffffffffU, 2 | (255 << 8)) != 0x3fffffff
+      || __bextr_u32 (0xdeadbeefU, 2 | (64 << 8)) != 0x37ab6fbb
+      || __bextr_u32 (0xdeadbeefU, 0 | (64 << 8)) != 0xdeadbeefU
+      || __bextr_u32 (a, 0 | (0 << 8)) != 0  /* Length 0.  */
+      || __bextr_u32 (a, 32 | (16 << 8)) != 0)  /* Start >= 32.  */
+    link_error ();
+  if (_bextr_u32 (0xffffffffU, 0, 0) != 0  /* Same cases, split args.  */
+      || _bextr_u32 (0xffffffffU, 64, 16) != 0
+      || _bextr_u32 (0x12345678U, 4, 10) != 0x167
+      || _bextr_u32 (0xffffffffU, 2, 255) != 0x3fffffff
+      || _bextr_u32 (0xdeadbeefU, 2, 64) != 0x37ab6fbb
+      || _bextr_u32 (0xdeadbeefU, 0, 64) != 0xdeadbeefU
+      || _bextr_u32 (a, 0, 0) != 0
+      || _bextr_u32 (a, 32, 16) != 0)
+    link_error ();
+#ifdef __x86_64__
+  if (__bextr_u64 (0xffffffffffffffffUL, 0 | (0 << 8)) != 0
+      || __bextr_u64 (0xffffffffffffffffUL, 128 | (16 << 8)) != 0
+      || __bextr_u64 (0x123456789abcdef0UL, 5 | (37 << 8)) != 0x13c4d5e6f7UL
+      || __bextr_u64 (0xffffffffffffffffUL, 2 | (255 << 8)) != 0x3fffffffffffffffUL
+      || __bextr_u64 (0xdeadbeefbeefdeadU, 2 | (64 << 8)) != 0x37ab6fbbefbbf7abUL
+      || __bextr_u64 (0xdeadbeefbeefdeadU, 0 | (64 << 8)) != 0xdeadbeefbeefdeadUL
+      || __bextr_u64 (b, 0 | (0 << 8)) != 0  /* Length 0.  */
+      || __bextr_u64 (b, 64 | (16 << 8)) != 0)  /* Start >= 64.  */
+    link_error ();
+  if (_bextr_u64 (0xffffffffffffffffUL, 0, 0) != 0  /* Split-arg form.  */
+      || _bextr_u64 (0xffffffffffffffffUL, 128, 16) != 0
+      || _bextr_u64 (0x123456789abcdef0UL, 5, 37) != 0x13c4d5e6f7UL
+      || _bextr_u64 (0xffffffffffffffffUL, 2, 255) != 0x3fffffffffffffffUL
+      || _bextr_u64 (0xdeadbeefbeefdeadUL, 2, 64) != 0x37ab6fbbefbbf7abUL
+      || _bextr_u64 (0xdeadbeefbeefdeadUL, 0, 64) != 0xdeadbeefbeefdeadUL
+      || _bextr_u64 (b, 0, 0) != 0
+      || _bextr_u64 (b, 64, 16) != 0)
+    link_error ();
+#endif
+  return 0;
+}