
Add _mm512_{,mask_}reduce_*_* intrinsics (PR target/80324)

Message ID 20170407145250.GB1914@tucnak
State New

Commit Message

Jakub Jelinek April 7, 2017, 2:52 p.m. UTC
Hi!

This patch is slightly larger, so I haven't included it in the patch I've
sent a few minutes ago.

I've looked at godbolt for what ICC generates for these and picked sequences
that generate approx. as good code as that.  For
min_epi64/max_epi64/min_epu64/max_epu64 there is a slight complication that
in AVX512F there is only _mm512_{min,max}_ep{i,u}64 but not the _mm256_ or
_mm_ ones, so we need to perform 512-bit operations all the time rather than
perform extractions, 256-bit operation, further extractions and then 128-bit
operations.
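
For reference, the 64-bit min/max reductions below therefore stay in 512-bit
registers the whole time; this is roughly what the last __MM512_REDUCE_OP
variant in the patch expands to for min_epi64, pulled out here as a
standalone sketch (the helper name is only for illustration):

#include <immintrin.h>

/* Compile with -mavx512f.  */
static inline long long
reduce_min_epi64_sketch (__m512i a)
{
  /* Swap the two 256-bit halves and take the elementwise minimum.  */
  __m512i t1 = _mm512_shuffle_i64x2 (a, a, 0x4e);
  __m512i t2 = _mm512_min_epi64 (a, t1);
  /* Swap the 128-bit lanes within each half and combine again.  */
  __m512i t3 = (__m512i) __builtin_shuffle ((__v8di) t2,
                                            (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 });
  __m512i t4 = _mm512_min_epi64 (t2, t3);
  /* Swap adjacent 64-bit elements; element 0 then holds the reduction.  */
  __m512i t5 = (__m512i) __builtin_shuffle ((__v8di) t4,
                                            (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 });
  __v8di t6 = (__v8di) _mm512_min_epi64 (t4, t5);
  return t6[0];
}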

Seems we need to teach our permutation code further instructions, e.g.
typedef long long V __attribute__((vector_size (64)));
typedef int W __attribute__((vector_size (64)));
W f0 (W x) {
  return __builtin_shuffle (x, (W) { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 });
}
V f1 (V x) {
  return __builtin_shuffle (x, (V) { 4, 5, 6, 7, 0, 1, 2, 3 });
}
generate unnecessarily bad code (could use the vshufi64x2 instruction),
guess that can be resolved for GCC8.
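
For comparison, a minimal sketch (not part of the patch) of the same 128-bit
lane swap written with __m512i and the existing AVX512F shuffle intrinsics,
which should map to a single vshufi32x4 / vshufi64x2 (the names g0/g1 are
only for the example):

#include <immintrin.h>

/* Same permutation as f0 above; compile with -mavx512f.  */
__m512i g0 (__m512i x)
{
  return _mm512_shuffle_i32x4 (x, x, 0x4e);   /* vshufi32x4 */
}

/* Same permutation as f1 above.  */
__m512i g1 (__m512i x)
{
  return _mm512_shuffle_i64x2 (x, x, 0x4e);   /* vshufi64x2 */
}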

Tested with
make -j272 -k check-gcc RUNTESTFLAGS='--target_board=unix\{-m32,-m64\} i386.exp'
on KNL, will bootstrap/regtest on my Haswell-E next, ok for trunk
if that passes?

It is not a regression; on the other hand, it really shouldn't affect any
code that is not using those intrinsics.

2017-04-07  Jakub Jelinek  <jakub@redhat.com>

	PR target/80324
	* config/i386/avx512fintrin.h (_mm512_reduce_add_epi32,
	_mm512_reduce_mul_epi32, _mm512_reduce_and_epi32,
	_mm512_reduce_or_epi32, _mm512_mask_reduce_add_epi32,
	_mm512_mask_reduce_mul_epi32, _mm512_mask_reduce_and_epi32,
	_mm512_mask_reduce_or_epi32, _mm512_reduce_min_epi32,
	_mm512_reduce_max_epi32, _mm512_reduce_min_epu32,
	_mm512_reduce_max_epu32, _mm512_mask_reduce_min_epi32,
	_mm512_mask_reduce_max_epi32, _mm512_mask_reduce_min_epu32,
	_mm512_mask_reduce_max_epu32, _mm512_reduce_add_ps,
	_mm512_reduce_mul_ps, _mm512_mask_reduce_add_ps,
	_mm512_mask_reduce_mul_ps, _mm512_reduce_min_ps, _mm512_reduce_max_ps,
	_mm512_mask_reduce_min_ps, _mm512_mask_reduce_max_ps,
	_mm512_reduce_add_epi64, _mm512_reduce_mul_epi64,
	_mm512_reduce_and_epi64, _mm512_reduce_or_epi64,
	_mm512_mask_reduce_add_epi64, _mm512_mask_reduce_mul_epi64,
	_mm512_mask_reduce_and_epi64, _mm512_mask_reduce_or_epi64,
	_mm512_reduce_min_epi64, _mm512_reduce_max_epi64,
	_mm512_mask_reduce_min_epi64, _mm512_mask_reduce_max_epi64,
	_mm512_reduce_min_epu64, _mm512_reduce_max_epu64,
	_mm512_mask_reduce_min_epu64, _mm512_mask_reduce_max_epu64,
	_mm512_reduce_add_pd, _mm512_reduce_mul_pd, _mm512_mask_reduce_add_pd,
	_mm512_mask_reduce_mul_pd, _mm512_reduce_min_pd, _mm512_reduce_max_pd,
	_mm512_mask_reduce_min_pd, _mm512_mask_reduce_max_pd): New intrinsics.

	* gcc.target/i386/avx512f-reduce-op-1.c: New test.


	Jakub

Comments

Uros Bizjak April 7, 2017, 3:09 p.m. UTC | #1
On Fri, Apr 7, 2017 at 4:52 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> Hi!
>
> This patch is slightly larger, so I haven't included it in the patch I've
> sent a few minutes ago.
>
> I've looked at godbolt for what ICC generates for these and picked sequences
> that generate approx. as good code as that.  For
> min_epi64/max_epi64/min_epu64/max_epu64 there is a slight complication that
> in AVX512F there is only _mm512_{min,max}_ep{i,u}64 but not the _mm256_ or
> _mm_ ones, so we need to perform 512-bit operations all the time rather than
> perform extractions, 256-bit operation, further extractions and then 128-bit
> operations.
>
> Seems we need to teach our permutation code further instructions, e.g.
> typedef long long V __attribute__((vector_size (64)));
> typedef int W __attribute__((vector_size (64)));
> W f0 (W x) {
>   return __builtin_shuffle (x, (W) { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 });
> }
> V f1 (V x) {
>   return __builtin_shuffle (x, (V) { 4, 5, 6, 7, 0, 1, 2, 3 });
> }
> generate unnecessarily bad code (could use the vshufi64x2 instruction),
> guess that can be resolved for GCC8.
>
> Tested with
> make -j272 -k check-gcc RUNTESTFLAGS='--target_board=unix\{-m32,-m64\} i386.exp'
> on KNL, will bootstrap/regtest on my Haswell-E next, ok for trunk
> if that passes?
>
> It is not a regression; on the other hand, it really shouldn't affect any
> code that is not using those intrinsics.
>
> 2017-04-07  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/80324
>         * config/i386/avx512fintrin.h (_mm512_reduce_add_epi32,
>         _mm512_reduce_mul_epi32, _mm512_reduce_and_epi32,
>         _mm512_reduce_or_epi32, _mm512_mask_reduce_add_epi32,
>         _mm512_mask_reduce_mul_epi32, _mm512_mask_reduce_and_epi32,
>         _mm512_mask_reduce_or_epi32, _mm512_reduce_min_epi32,
>         _mm512_reduce_max_epi32, _mm512_reduce_min_epu32,
>         _mm512_reduce_max_epu32, _mm512_mask_reduce_min_epi32,
>         _mm512_mask_reduce_max_epi32, _mm512_mask_reduce_min_epu32,
>         _mm512_mask_reduce_max_epu32, _mm512_reduce_add_ps,
>         _mm512_reduce_mul_ps, _mm512_mask_reduce_add_ps,
>         _mm512_mask_reduce_mul_ps, _mm512_reduce_min_ps, _mm512_reduce_max_ps,
>         _mm512_mask_reduce_min_ps, _mm512_mask_reduce_max_ps,
>         _mm512_reduce_add_epi64, _mm512_reduce_mul_epi64,
>         _mm512_reduce_and_epi64, _mm512_reduce_or_epi64,
>         _mm512_mask_reduce_add_epi64, _mm512_mask_reduce_mul_epi64,
>         _mm512_mask_reduce_and_epi64, _mm512_mask_reduce_or_epi64,
>         _mm512_reduce_min_epi64, _mm512_reduce_max_epi64,
>         _mm512_mask_reduce_min_epi64, _mm512_mask_reduce_max_epi64,
>         _mm512_reduce_min_epu64, _mm512_reduce_max_epu64,
>         _mm512_mask_reduce_min_epu64, _mm512_mask_reduce_max_epu64,
>         _mm512_reduce_add_pd, _mm512_reduce_mul_pd, _mm512_mask_reduce_add_pd,
>         _mm512_mask_reduce_mul_pd, _mm512_reduce_min_pd, _mm512_reduce_max_pd,
>         _mm512_mask_reduce_min_pd, _mm512_mask_reduce_max_pd): New intrinsics.
>
>         * gcc.target/i386/avx512f-reduce-op-1.c: New test.

LGTM, but please wait for Kirill's opinion on the implementation.

Thanks,
Uros.

> --- gcc/config/i386/avx512fintrin.h.jj  2017-04-07 12:25:13.065643755 +0200
> +++ gcc/config/i386/avx512fintrin.h     2017-04-07 16:34:37.976974227 +0200
> @@ -13282,6 +13282,470 @@ _mm512_cmpgt_epu64_mask (__m512i __A, __
>                                                     (__mmask8) -1);
>  }
>
> +#undef __MM512_REDUCE_OP
> +#define __MM512_REDUCE_OP(op) \
> +  __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1);           \
> +  __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0);           \
> +  __m256i __T3 = (__m256i) (__T1 op __T2);                             \
> +  __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1);           \
> +  __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0);           \
> +  __v4si __T6 = __T4 op __T5;                                          \
> +  __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 });     \
> +  __v4si __T8 = __T6 op __T7;                                          \
> +  return __T8[0] op __T8[1]
> +
> +extern __inline int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_add_epi32 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (+);
> +}
> +
> +extern __inline int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_mul_epi32 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (*);
> +}
> +
> +extern __inline int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_and_epi32 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (&);
> +}
> +
> +extern __inline int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_or_epi32 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (|);
> +}
> +
> +extern __inline int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_add_epi32 (__mmask16 __U, __m512i __A)
> +{
> +  __A = _mm512_maskz_mov_epi32 (__U, __A);
> +  __MM512_REDUCE_OP (+);
> +}
> +
> +extern __inline int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_mul_epi32 (__mmask16 __U, __m512i __A)
> +{
> +  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (1), __U, __A);
> +  __MM512_REDUCE_OP (*);
> +}
> +
> +extern __inline int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_and_epi32 (__mmask16 __U, __m512i __A)
> +{
> +  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A);
> +  __MM512_REDUCE_OP (&);
> +}
> +
> +extern __inline int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_or_epi32 (__mmask16 __U, __m512i __A)
> +{
> +  __A = _mm512_maskz_mov_epi32 (__U, __A);
> +  __MM512_REDUCE_OP (|);
> +}
> +
> +#undef __MM512_REDUCE_OP
> +#define __MM512_REDUCE_OP(op) \
> +  __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1);         \
> +  __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0);         \
> +  __m256i __T3 = _mm256_##op (__T1, __T2);                             \
> +  __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1);         \
> +  __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0);         \
> +  __m128i __T6 = _mm_##op (__T4, __T5);                                        \
> +  __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6,           \
> +                                             (__v4si) { 2, 3, 0, 1 }); \
> +  __m128i __T8 = _mm_##op (__T6, __T7);                                        \
> +  __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8,           \
> +                                             (__v4si) { 1, 0, 1, 0 }); \
> +  __v4si __T10 = (__v4si) _mm_##op (__T8, __T9);                       \
> +  return __T10[0]
> +
> +extern __inline int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_min_epi32 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (min_epi32);
> +}
> +
> +extern __inline int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_max_epi32 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (max_epi32);
> +}
> +
> +extern __inline unsigned int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_min_epu32 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (min_epu32);
> +}
> +
> +extern __inline unsigned int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_max_epu32 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (max_epu32);
> +}
> +
> +extern __inline int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_min_epi32 (__mmask16 __U, __m512i __A)
> +{
> +  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (__INT_MAX__), __U, __A);
> +  __MM512_REDUCE_OP (min_epi32);
> +}
> +
> +extern __inline int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_max_epi32 (__mmask16 __U, __m512i __A)
> +{
> +  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (-__INT_MAX__ - 1), __U, __A);
> +  __MM512_REDUCE_OP (max_epi32);
> +}
> +
> +extern __inline unsigned int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_min_epu32 (__mmask16 __U, __m512i __A)
> +{
> +  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A);
> +  __MM512_REDUCE_OP (min_epu32);
> +}
> +
> +extern __inline unsigned int
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_max_epu32 (__mmask16 __U, __m512i __A)
> +{
> +  __A = _mm512_maskz_mov_epi32 (__U, __A);
> +  __MM512_REDUCE_OP (max_epu32);
> +}
> +
> +#undef __MM512_REDUCE_OP
> +#define __MM512_REDUCE_OP(op) \
> +  __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1);    \
> +  __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0);    \
> +  __m256 __T3 = __T1 op __T2;                                          \
> +  __m128 __T4 = _mm256_extractf128_ps (__T3, 1);                       \
> +  __m128 __T5 = _mm256_extractf128_ps (__T3, 0);                       \
> +  __m128 __T6 = __T4 op __T5;                                          \
> +  __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 });     \
> +  __m128 __T8 = __T6 op __T7;                                          \
> +  return __T8[0] op __T8[1]
> +
> +extern __inline float
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_add_ps (__m512 __A)
> +{
> +  __MM512_REDUCE_OP (+);
> +}
> +
> +extern __inline float
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_mul_ps (__m512 __A)
> +{
> +  __MM512_REDUCE_OP (*);
> +}
> +
> +extern __inline float
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_add_ps (__mmask16 __U, __m512 __A)
> +{
> +  __A = _mm512_maskz_mov_ps (__U, __A);
> +  __MM512_REDUCE_OP (+);
> +}
> +
> +extern __inline float
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_mul_ps (__mmask16 __U, __m512 __A)
> +{
> +  __A = _mm512_mask_mov_ps (_mm512_set1_ps (1.0f), __U, __A);
> +  __MM512_REDUCE_OP (*);
> +}
> +
> +#undef __MM512_REDUCE_OP
> +#define __MM512_REDUCE_OP(op) \
> +  __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1);    \
> +  __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0);    \
> +  __m256 __T3 = _mm256_##op (__T1, __T2);                              \
> +  __m128 __T4 = _mm256_extractf128_ps (__T3, 1);                       \
> +  __m128 __T5 = _mm256_extractf128_ps (__T3, 0);                       \
> +  __m128 __T6 = _mm_##op (__T4, __T5);                                 \
> +  __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 });     \
> +  __m128 __T8 = _mm_##op (__T6, __T7);                                 \
> +  __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 });     \
> +  __m128 __T10 = _mm_##op (__T8, __T9);                                        \
> +  return __T10[0]
> +
> +extern __inline float
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_min_ps (__m512 __A)
> +{
> +  __MM512_REDUCE_OP (min_ps);
> +}
> +
> +extern __inline float
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_max_ps (__m512 __A)
> +{
> +  __MM512_REDUCE_OP (max_ps);
> +}
> +
> +extern __inline float
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_min_ps (__mmask16 __U, __m512 __A)
> +{
> +  __A = _mm512_mask_mov_ps (_mm512_set1_ps (__builtin_inff ()), __U, __A);
> +  __MM512_REDUCE_OP (min_ps);
> +}
> +
> +extern __inline float
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_max_ps (__mmask16 __U, __m512 __A)
> +{
> +  __A = _mm512_mask_mov_ps (_mm512_set1_ps (-__builtin_inff ()), __U, __A);
> +  __MM512_REDUCE_OP (max_ps);
> +}
> +
> +#undef __MM512_REDUCE_OP
> +#define __MM512_REDUCE_OP(op) \
> +  __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1);           \
> +  __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0);           \
> +  __m256i __T3 = (__m256i) (__T1 op __T2);                             \
> +  __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1);           \
> +  __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0);           \
> +  __v2di __T6 = __T4 op __T5;                                          \
> +  return __T6[0] op __T6[1]
> +
> +extern __inline long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_add_epi64 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (+);
> +}
> +
> +extern __inline long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_mul_epi64 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (*);
> +}
> +
> +extern __inline long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_and_epi64 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (&);
> +}
> +
> +extern __inline long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_or_epi64 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (|);
> +}
> +
> +extern __inline long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_add_epi64 (__mmask8 __U, __m512i __A)
> +{
> +  __A = _mm512_maskz_mov_epi64 (__U, __A);
> +  __MM512_REDUCE_OP (+);
> +}
> +
> +extern __inline long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_mul_epi64 (__mmask8 __U, __m512i __A)
> +{
> +  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (1LL), __U, __A);
> +  __MM512_REDUCE_OP (*);
> +}
> +
> +extern __inline long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_and_epi64 (__mmask8 __U, __m512i __A)
> +{
> +  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A);
> +  __MM512_REDUCE_OP (&);
> +}
> +
> +extern __inline long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_or_epi64 (__mmask8 __U, __m512i __A)
> +{
> +  __A = _mm512_maskz_mov_epi64 (__U, __A);
> +  __MM512_REDUCE_OP (|);
> +}
> +
> +#undef __MM512_REDUCE_OP
> +#define __MM512_REDUCE_OP(op) \
> +  __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e);                        \
> +  __m512i __T2 = _mm512_##op (__A, __T1);                              \
> +  __m512i __T3                                                         \
> +    = (__m512i) __builtin_shuffle ((__v8di) __T2,                      \
> +                                  (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 });\
> +  __m512i __T4 = _mm512_##op (__T2, __T3);                             \
> +  __m512i __T5                                                         \
> +    = (__m512i) __builtin_shuffle ((__v8di) __T4,                      \
> +                                  (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 });\
> +  __v8di __T6 = (__v8di) _mm512_##op (__T4, __T5);                     \
> +  return __T6[0]
> +
> +extern __inline long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_min_epi64 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (min_epi64);
> +}
> +
> +extern __inline long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_max_epi64 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (max_epi64);
> +}
> +
> +extern __inline long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_min_epi64 (__mmask8 __U, __m512i __A)
> +{
> +  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (__LONG_LONG_MAX__),
> +                              __U, __A);
> +  __MM512_REDUCE_OP (min_epi64);
> +}
> +
> +extern __inline long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_max_epi64 (__mmask8 __U, __m512i __A)
> +{
> +  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (-__LONG_LONG_MAX__ - 1),
> +                              __U, __A);
> +  __MM512_REDUCE_OP (max_epi64);
> +}
> +
> +extern __inline unsigned long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_min_epu64 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (min_epu64);
> +}
> +
> +extern __inline unsigned long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_max_epu64 (__m512i __A)
> +{
> +  __MM512_REDUCE_OP (max_epu64);
> +}
> +
> +extern __inline unsigned long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_min_epu64 (__mmask8 __U, __m512i __A)
> +{
> +  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A);
> +  __MM512_REDUCE_OP (min_epu64);
> +}
> +
> +extern __inline unsigned long long
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_max_epu64 (__mmask8 __U, __m512i __A)
> +{
> +  __A = _mm512_maskz_mov_epi64 (__U, __A);
> +  __MM512_REDUCE_OP (max_epu64);
> +}
> +
> +#undef __MM512_REDUCE_OP
> +#define __MM512_REDUCE_OP(op) \
> +  __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1);            \
> +  __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0);            \
> +  __m256d __T3 = __T1 op __T2;                                         \
> +  __m128d __T4 = _mm256_extractf128_pd (__T3, 1);                      \
> +  __m128d __T5 = _mm256_extractf128_pd (__T3, 0);                      \
> +  __m128d __T6 = __T4 op __T5;                                         \
> +  return __T6[0] op __T6[1]
> +
> +extern __inline double
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_add_pd (__m512d __A)
> +{
> +  __MM512_REDUCE_OP (+);
> +}
> +
> +extern __inline double
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_mul_pd (__m512d __A)
> +{
> +  __MM512_REDUCE_OP (*);
> +}
> +
> +extern __inline double
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_add_pd (__mmask8 __U, __m512d __A)
> +{
> +  __A = _mm512_maskz_mov_pd (__U, __A);
> +  __MM512_REDUCE_OP (+);
> +}
> +
> +extern __inline double
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_mul_pd (__mmask8 __U, __m512d __A)
> +{
> +  __A = _mm512_mask_mov_pd (_mm512_set1_pd (1.0), __U, __A);
> +  __MM512_REDUCE_OP (*);
> +}
> +
> +#undef __MM512_REDUCE_OP
> +#define __MM512_REDUCE_OP(op) \
> +  __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1);            \
> +  __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0);            \
> +  __m256d __T3 = _mm256_##op (__T1, __T2);                             \
> +  __m128d __T4 = _mm256_extractf128_pd (__T3, 1);                      \
> +  __m128d __T5 = _mm256_extractf128_pd (__T3, 0);                      \
> +  __m128d __T6 = _mm_##op (__T4, __T5);                                        \
> +  __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 });        \
> +  __m128d __T8 = _mm_##op (__T6, __T7);                                        \
> +  return __T8[0]
> +
> +extern __inline double
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_min_pd (__m512d __A)
> +{
> +  __MM512_REDUCE_OP (min_pd);
> +}
> +
> +extern __inline double
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_reduce_max_pd (__m512d __A)
> +{
> +  __MM512_REDUCE_OP (max_pd);
> +}
> +
> +extern __inline double
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_min_pd (__mmask8 __U, __m512d __A)
> +{
> +  __A = _mm512_mask_mov_pd (_mm512_set1_pd (__builtin_inf ()), __U, __A);
> +  __MM512_REDUCE_OP (min_pd);
> +}
> +
> +extern __inline double
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm512_mask_reduce_max_pd (__mmask8 __U, __m512d __A)
> +{
> +  __A = _mm512_mask_mov_pd (_mm512_set1_pd (-__builtin_inf ()), __U, __A);
> +  __MM512_REDUCE_OP (max_pd);
> +}
> +
> +#undef __MM512_REDUCE_OP
> +
>  #ifdef __DISABLE_AVX512F__
>  #undef __DISABLE_AVX512F__
>  #pragma GCC pop_options
> --- gcc/testsuite/gcc.target/i386/avx512f-reduce-op-1.c.jj      2017-04-07 12:25:19.578556015 +0200
> +++ gcc/testsuite/gcc.target/i386/avx512f-reduce-op-1.c 2017-04-07 16:34:37.972974281 +0200
> @@ -0,0 +1,410 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mavx512f" } */
> +/* { dg-require-effective-target avx512f } */
> +
> +#define AVX512F
> +
> +#include "avx512f-helper.h"
> +
> +__attribute__((noinline, noclone)) int
> +test_reduce_add_epi32 (__m512i a)
> +{
> +  return _mm512_reduce_add_epi32 (a);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +test_reduce_mul_epi32 (__m512i a)
> +{
> +  return _mm512_reduce_mul_epi32 (a);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +test_reduce_and_epi32 (__m512i a)
> +{
> +  return _mm512_reduce_and_epi32 (a);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +test_reduce_or_epi32 (__m512i a)
> +{
> +  return _mm512_reduce_or_epi32 (a);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +test_mask_reduce_add_epi32 (__mmask16 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_add_epi32 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +test_mask_reduce_mul_epi32 (__mmask16 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_mul_epi32 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +test_mask_reduce_and_epi32 (__mmask16 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_and_epi32 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +test_mask_reduce_or_epi32 (__mmask16 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_or_epi32 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +test_reduce_min_epi32 (__m512i a)
> +{
> +  return _mm512_reduce_min_epi32 (a);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +test_reduce_max_epi32 (__m512i a)
> +{
> +  return _mm512_reduce_max_epi32 (a);
> +}
> +
> +__attribute__((noinline, noclone)) unsigned int
> +test_reduce_min_epu32 (__m512i a)
> +{
> +  return _mm512_reduce_min_epu32 (a);
> +}
> +
> +__attribute__((noinline, noclone)) unsigned int
> +test_reduce_max_epu32 (__m512i a)
> +{
> +  return _mm512_reduce_max_epu32 (a);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +test_mask_reduce_min_epi32 (__mmask16 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_min_epi32 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) int
> +test_mask_reduce_max_epi32 (__mmask16 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_max_epi32 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) unsigned int
> +test_mask_reduce_min_epu32 (__mmask16 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_min_epu32 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) unsigned int
> +test_mask_reduce_max_epu32 (__mmask16 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_max_epu32 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) float
> +test_reduce_add_ps (__m512 a)
> +{
> +  return _mm512_reduce_add_ps (a);
> +}
> +
> +__attribute__((noinline, noclone)) float
> +test_reduce_mul_ps (__m512 a)
> +{
> +  return _mm512_reduce_mul_ps (a);
> +}
> +
> +__attribute__((noinline, noclone)) float
> +test_mask_reduce_add_ps (__mmask16 u, __m512 a)
> +{
> +  return _mm512_mask_reduce_add_ps (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) float
> +test_mask_reduce_mul_ps (__mmask16 u, __m512 a)
> +{
> +  return _mm512_mask_reduce_mul_ps (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) float
> +test_reduce_min_ps (__m512 a)
> +{
> +  return _mm512_reduce_min_ps (a);
> +}
> +
> +__attribute__((noinline, noclone)) float
> +test_reduce_max_ps (__m512 a)
> +{
> +  return _mm512_reduce_max_ps (a);
> +}
> +
> +__attribute__((noinline, noclone)) float
> +test_mask_reduce_min_ps (__mmask16 u, __m512 a)
> +{
> +  return _mm512_mask_reduce_min_ps (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) float
> +test_mask_reduce_max_ps (__mmask16 u, __m512 a)
> +{
> +  return _mm512_mask_reduce_max_ps (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) long long
> +test_reduce_add_epi64 (__m512i a)
> +{
> +  return _mm512_reduce_add_epi64 (a);
> +}
> +
> +__attribute__((noinline, noclone)) long long
> +test_reduce_mul_epi64 (__m512i a)
> +{
> +  return _mm512_reduce_mul_epi64 (a);
> +}
> +
> +__attribute__((noinline, noclone)) long long
> +test_reduce_and_epi64 (__m512i a)
> +{
> +  return _mm512_reduce_and_epi64 (a);
> +}
> +
> +__attribute__((noinline, noclone)) long long
> +test_reduce_or_epi64 (__m512i a)
> +{
> +  return _mm512_reduce_or_epi64 (a);
> +}
> +
> +__attribute__((noinline, noclone)) long long
> +test_mask_reduce_add_epi64 (__mmask8 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_add_epi64 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) long long
> +test_mask_reduce_mul_epi64 (__mmask8 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_mul_epi64 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) long long
> +test_mask_reduce_and_epi64 (__mmask8 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_and_epi64 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) long long
> +test_mask_reduce_or_epi64 (__mmask8 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_or_epi64 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) long long
> +test_reduce_min_epi64 (__m512i a)
> +{
> +  return _mm512_reduce_min_epi64 (a);
> +}
> +
> +__attribute__((noinline, noclone)) long long
> +test_reduce_max_epi64 (__m512i a)
> +{
> +  return _mm512_reduce_max_epi64 (a);
> +}
> +
> +__attribute__((noinline, noclone)) unsigned long long
> +test_reduce_min_epu64 (__m512i a)
> +{
> +  return _mm512_reduce_min_epu64 (a);
> +}
> +
> +__attribute__((noinline, noclone)) unsigned long long
> +test_reduce_max_epu64 (__m512i a)
> +{
> +  return _mm512_reduce_max_epu64 (a);
> +}
> +
> +__attribute__((noinline, noclone)) long long
> +test_mask_reduce_min_epi64 (__mmask8 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_min_epi64 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) long long
> +test_mask_reduce_max_epi64 (__mmask8 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_max_epi64 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) unsigned long long
> +test_mask_reduce_min_epu64 (__mmask8 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_min_epu64 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) unsigned long long
> +test_mask_reduce_max_epu64 (__mmask8 u, __m512i a)
> +{
> +  return _mm512_mask_reduce_max_epu64 (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) double
> +test_reduce_add_pd (__m512d a)
> +{
> +  return _mm512_reduce_add_pd (a);
> +}
> +
> +__attribute__((noinline, noclone)) double
> +test_reduce_mul_pd (__m512d a)
> +{
> +  return _mm512_reduce_mul_pd (a);
> +}
> +
> +__attribute__((noinline, noclone)) double
> +test_mask_reduce_add_pd (__mmask8 u, __m512d a)
> +{
> +  return _mm512_mask_reduce_add_pd (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) double
> +test_mask_reduce_mul_pd (__mmask8 u, __m512d a)
> +{
> +  return _mm512_mask_reduce_mul_pd (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) double
> +test_reduce_min_pd (__m512d a)
> +{
> +  return _mm512_reduce_min_pd (a);
> +}
> +
> +__attribute__((noinline, noclone)) double
> +test_reduce_max_pd (__m512d a)
> +{
> +  return _mm512_reduce_max_pd (a);
> +}
> +
> +__attribute__((noinline, noclone)) double
> +test_mask_reduce_min_pd (__mmask8 u, __m512d a)
> +{
> +  return _mm512_mask_reduce_min_pd (u, a);
> +}
> +
> +__attribute__((noinline, noclone)) double
> +test_mask_reduce_max_pd (__mmask8 u, __m512d a)
> +{
> +  return _mm512_mask_reduce_max_pd (u, a);
> +}
> +
> +#define TESTOP(opname, op, type, suffix, neutral) \
> +  do {                                                                 \
> +    type r1 = _mm512_reduce_##opname##_##suffix (v.x);                 \
> +    type r2 = test_reduce_##opname##_##suffix (v.x);                   \
> +    type r3 = neutral;                                                 \
> +    if (r1 != r2)                                                      \
> +      __builtin_abort ();                                              \
> +    for (int i = 0; i < SIZE; i++)                                     \
> +      r3 = r3 op v.a[i];                                               \
> +    if (r1 != r3)                                                      \
> +      __builtin_abort ();                                              \
> +    type r4 = _mm512_mask_reduce_##opname##_##suffix (MASK_VALUE, v.x);        \
> +    type r5 = test_mask_reduce_##opname##_##suffix (MASK_VALUE, v.x);  \
> +    if (r4 != r5)                                                      \
> +      __builtin_abort ();                                              \
> +    r3 = neutral;                                                      \
> +    for (int i = 0; i < SIZE; i++)                                     \
> +      if (MASK_VALUE & (1 << i))                                       \
> +       r3 = r3 op v.a[i];                                              \
> +    if (r4 != r3)                                                      \
> +      __builtin_abort ();                                              \
> +    type r6 = _mm512_mask_reduce_##opname##_##suffix (0, v.x);         \
> +    type r7 = test_mask_reduce_##opname##_##suffix (0, v.x);           \
> +    if (r6 != r7 || r6 != neutral)                                     \
> +      __builtin_abort ();                                              \
> +  } while (0)
> +
> +#define SIZE (AVX512F_LEN / 32)
> +#include "avx512f-mask-type.h"
> +
> +#define TEST_EPI32(c1, c2, c3, c4, c5, c6, c7, c8, \
> +                  c9, c10, c11, c12, c13, c14, c15, c16)               \
> +  do {                                                                 \
> +    UNION_TYPE (AVX512F_LEN, i_d) v;                                   \
> +    v.x = _mm512_set_epi32 (c1, c2, c3, c4, c5, c6, c7, c8,            \
> +                           c9, c10, c11, c12, c13, c14, c15, c16);     \
> +    TESTOP (add, +, int, epi32, 0);                                    \
> +    TESTOP (mul, *, int, epi32, 1);                                    \
> +    TESTOP (and, &, int, epi32, ~0);                                   \
> +    TESTOP (or, |, int, epi32, 0);                                     \
> +    TESTOP (min, < v.a[i] ? r3 :, int, epi32, __INT_MAX__);            \
> +    TESTOP (max, > v.a[i] ? r3 :, int, epi32, -__INT_MAX__ - 1);       \
> +    TESTOP (min, < (unsigned) v.a[i] ? r3 :, unsigned, epu32, ~0U);    \
> +    TESTOP (max, > (unsigned) v.a[i] ? r3 :, unsigned, epu32, 0);      \
> +  } while (0)
> +
> +#define TEST_PS(c1, c2, c3, c4, c5, c6, c7, c8, \
> +               c9, c10, c11, c12, c13, c14, c15, c16)                  \
> +  do {                                                                 \
> +    UNION_TYPE (AVX512F_LEN, ) v;                                      \
> +    v.x = _mm512_set_ps (c1, c2, c3, c4, c5, c6, c7, c8,               \
> +                        c9, c10, c11, c12, c13, c14, c15, c16);        \
> +    TESTOP (add, +, float, ps, 0.0f);                                  \
> +    TESTOP (mul, *, float, ps, 1.0f);                                  \
> +    TESTOP (min, < v.a[i] ? r3 :, float, ps, __builtin_inff ());       \
> +    TESTOP (max, > v.a[i] ? r3 :, float, ps, -__builtin_inff ());      \
> +  } while (0)
> +
> +static void
> +test_epi32_ps (void)
> +{
> +  TEST_EPI32 (1, 2, 3, 4, 5, 6, 6, 5, 4, 3, 2, 1, 7, 6, 5, 4);
> +  TEST_EPI32 (-1, 15, -1, 7, -1, 7, -1, -1, 6, 6, -1, -1, -1, -1, 7, 6);
> +  TEST_PS (1, 2, 3, 4, 5, 6, 6, 5, 4, 3, 2, 1, 7, 6, 5, 4);
> +  TEST_PS (1.25f, 2.25f, -0.25f, 4.0f, -2.0f, 4.0f, -3.0f, 2.0f,
> +           -0.5f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 2.0f, 4.0f);
> +}
> +
> +#undef SIZE
> +#define SIZE (AVX512F_LEN / 64)
> +#include "avx512f-mask-type.h"
> +
> +#define TEST_EPI64(c1, c2, c3, c4, c5, c6, c7, c8) \
> +  do {                                                                 \
> +    UNION_TYPE (AVX512F_LEN, i_q) v;                                   \
> +    v.x = _mm512_set_epi64 (c1, c2, c3, c4, c5, c6, c7, c8);           \
> +    TESTOP (add, +, long long, epi64, 0);                              \
> +    TESTOP (mul, *, long long, epi64, 1);                              \
> +    TESTOP (and, &, long long, epi64, ~0LL);                           \
> +    TESTOP (or, |, long long, epi64, 0);                               \
> +    TESTOP (min, < v.a[i] ? r3 :, long long, epi64, __LONG_LONG_MAX__);        \
> +    TESTOP (max, > v.a[i] ? r3 :, long long, epi64,                    \
> +           -__LONG_LONG_MAX__ - 1);                                    \
> +    TESTOP (min, < (unsigned long long) v.a[i] ? r3 :,                 \
> +           unsigned long long, epu64, ~0ULL);                          \
> +    TESTOP (max, > (unsigned long long) v.a[i] ? r3 :,                 \
> +           unsigned long long, epu64, 0);                              \
> +  } while (0)
> +
> +#define TEST_PD(c1, c2, c3, c4, c5, c6, c7, c8) \
> +  do {                                                                 \
> +    UNION_TYPE (AVX512F_LEN, d) v;                                     \
> +    v.x = _mm512_set_pd (c1, c2, c3, c4, c5, c6, c7, c8);              \
> +    TESTOP (add, +, double, pd, 0.0);                                  \
> +    TESTOP (mul, *, double, pd, 1.0);                                  \
> +    TESTOP (min, < v.a[i] ? r3 :, double, pd, __builtin_inf ());       \
> +    TESTOP (max, > v.a[i] ? r3 :, double, pd, -__builtin_inf ());      \
> +  } while (0)
> +
> +static void
> +test_epi64_pd (void)
> +{
> +  TEST_EPI64 (1, 2, 3, 4, 5, 6, 6, 5);
> +  TEST_EPI64 (-1, 15, -1, 7, -1, 7, -1, -1);
> +  TEST_PD (1, 2, 3, 4, 5, 6, 6, 5);
> +  TEST_PD (1.25f, 2.25f, -0.25f, 4.0f, -2.0f, 4.0f, -3.0f, 2.0f);
> +}
> +
> +void
> +test_512 (void)
> +{
> +  test_epi32_ps ();
> +  test_epi64_pd ();
> +}
>
>         Jakub
Kirill Yukhin April 10, 2017, 5:33 a.m. UTC | #2
Hi Jakub,
On 07 Apr 16:52, Jakub Jelinek wrote:
> Hi!
>
> This patch is slightly larger, so I haven't included it in the patch I've
> sent a few minutes ago.
>
> I've looked at godbolt for what ICC generates for these and picked sequences
> that generate approx. as good code as that.  For
> min_epi64/max_epi64/min_epu64/max_epu64 there is a slight complication that
> in AVX512F there is only _mm512_{min,max}_ep{i,u}64 but not the _mm256_ or
> _mm_ ones, so we need to perform 512-bit operations all the time rather than
> perform extractions, 256-bit operation, further extractions and then 128-bit
> operations.
>
> Seems we need to teach our permutation code further instructions, e.g.
> typedef long long V __attribute__((vector_size (64)));
> typedef int W __attribute__((vector_size (64)));
> W f0 (W x) {
>   return __builtin_shuffle (x, (W) { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 });
> }
> V f1 (V x) {
>   return __builtin_shuffle (x, (V) { 4, 5, 6, 7, 0, 1, 2, 3 });
> }
> generate unnecessarily bad code (could use the vshufi64x2 instruction),
> guess that can be resolved for GCC8.
>
> Tested with
> make -j272 -k check-gcc RUNTESTFLAGS='--target_board=unix\{-m32,-m64\} i386.exp'
> on KNL, will bootstrap/regtest on my Haswell-E next, ok for trunk
> if that passes?
Patch is OK for trunk, thanks for implementing those intrinsics!

--
K

Patch

--- gcc/config/i386/avx512fintrin.h.jj	2017-04-07 12:25:13.065643755 +0200
+++ gcc/config/i386/avx512fintrin.h	2017-04-07 16:34:37.976974227 +0200
@@ -13282,6 +13282,470 @@  _mm512_cmpgt_epu64_mask (__m512i __A, __
 						    (__mmask8) -1);
 }
 
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1);		\
+  __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0);		\
+  __m256i __T3 = (__m256i) (__T1 op __T2);				\
+  __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1);		\
+  __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0);		\
+  __v4si __T6 = __T4 op __T5;						\
+  __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 });	\
+  __v4si __T8 = __T6 op __T7;						\
+  return __T8[0] op __T8[1]
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_and_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (&);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_or_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (|);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi32 (__U, __A);
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (1), __U, __A);
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_and_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A);
+  __MM512_REDUCE_OP (&);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_or_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi32 (__U, __A);
+  __MM512_REDUCE_OP (|);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1);		\
+  __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0);		\
+  __m256i __T3 = _mm256_##op (__T1, __T2);				\
+  __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1);		\
+  __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0);		\
+  __m128i __T6 = _mm_##op (__T4, __T5);					\
+  __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6,		\
+					      (__v4si) { 2, 3, 0, 1 });	\
+  __m128i __T8 = _mm_##op (__T6, __T7);					\
+  __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8,		\
+					      (__v4si) { 1, 0, 1, 0 });	\
+  __v4si __T10 = (__v4si) _mm_##op (__T8, __T9);			\
+  return __T10[0]
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (min_epi32);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (max_epi32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epu32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (min_epu32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epu32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (max_epu32);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (__INT_MAX__), __U, __A);
+  __MM512_REDUCE_OP (min_epi32);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (-__INT_MAX__ - 1), __U, __A);
+  __MM512_REDUCE_OP (max_epi32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epu32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A);
+  __MM512_REDUCE_OP (min_epu32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epu32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi32 (__U, __A);
+  __MM512_REDUCE_OP (max_epu32);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1);	\
+  __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0);	\
+  __m256 __T3 = __T1 op __T2;						\
+  __m128 __T4 = _mm256_extractf128_ps (__T3, 1);			\
+  __m128 __T5 = _mm256_extractf128_ps (__T3, 0);			\
+  __m128 __T6 = __T4 op __T5;						\
+  __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 });	\
+  __m128 __T8 = __T6 op __T7;						\
+  return __T8[0] op __T8[1]
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_ps (__m512 __A)
+{
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_ps (__m512 __A)
+{
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_ps (__mmask16 __U, __m512 __A)
+{
+  __A = _mm512_maskz_mov_ps (__U, __A);
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_ps (__mmask16 __U, __m512 __A)
+{
+  __A = _mm512_mask_mov_ps (_mm512_set1_ps (1.0f), __U, __A);
+  __MM512_REDUCE_OP (*);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1);	\
+  __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0);	\
+  __m256 __T3 = _mm256_##op (__T1, __T2);				\
+  __m128 __T4 = _mm256_extractf128_ps (__T3, 1);			\
+  __m128 __T5 = _mm256_extractf128_ps (__T3, 0);			\
+  __m128 __T6 = _mm_##op (__T4, __T5);					\
+  __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 });	\
+  __m128 __T8 = _mm_##op (__T6, __T7);					\
+  __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 });	\
+  __m128 __T10 = _mm_##op (__T8, __T9);					\
+  return __T10[0]
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_ps (__m512 __A)
+{
+  __MM512_REDUCE_OP (min_ps);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_ps (__m512 __A)
+{
+  __MM512_REDUCE_OP (max_ps);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_ps (__mmask16 __U, __m512 __A)
+{
+  __A = _mm512_mask_mov_ps (_mm512_set1_ps (__builtin_inff ()), __U, __A);
+  __MM512_REDUCE_OP (min_ps);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_ps (__mmask16 __U, __m512 __A)
+{
+  __A = _mm512_mask_mov_ps (_mm512_set1_ps (-__builtin_inff ()), __U, __A);
+  __MM512_REDUCE_OP (max_ps);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1);		\
+  __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0);		\
+  __m256i __T3 = (__m256i) (__T1 op __T2);				\
+  __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1);		\
+  __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0);		\
+  __v2di __T6 = __T4 op __T5;						\
+  return __T6[0] op __T6[1]
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_and_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (&);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_or_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (|);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi64 (__U, __A);
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (1LL), __U, __A);
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_and_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A);
+  __MM512_REDUCE_OP (&);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_or_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi64 (__U, __A);
+  __MM512_REDUCE_OP (|);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e);			\
+  __m512i __T2 = _mm512_##op (__A, __T1);				\
+  __m512i __T3								\
+    = (__m512i) __builtin_shuffle ((__v8di) __T2,			\
+				   (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 });\
+  __m512i __T4 = _mm512_##op (__T2, __T3);				\
+  __m512i __T5								\
+    = (__m512i) __builtin_shuffle ((__v8di) __T4,			\
+				   (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 });\
+  __v8di __T6 = (__v8di) _mm512_##op (__T4, __T5);			\
+  return __T6[0]
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (min_epi64);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (max_epi64);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (__LONG_LONG_MAX__),
+			       __U, __A);
+  __MM512_REDUCE_OP (min_epi64);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (-__LONG_LONG_MAX__ - 1),
+			       __U, __A);
+  __MM512_REDUCE_OP (max_epi64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epu64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (min_epu64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epu64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (max_epu64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epu64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A);
+  __MM512_REDUCE_OP (min_epu64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epu64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi64 (__U, __A);
+  __MM512_REDUCE_OP (max_epu64);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1);		\
+  __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0);		\
+  __m256d __T3 = __T1 op __T2;						\
+  __m128d __T4 = _mm256_extractf128_pd (__T3, 1);			\
+  __m128d __T5 = _mm256_extractf128_pd (__T3, 0);			\
+  __m128d __T6 = __T4 op __T5;						\
+  return __T6[0] op __T6[1]
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_pd (__m512d __A)
+{
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_pd (__m512d __A)
+{
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_pd (__mmask8 __U, __m512d __A)
+{
+  __A = _mm512_maskz_mov_pd (__U, __A);
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_pd (__mmask8 __U, __m512d __A)
+{
+  __A = _mm512_mask_mov_pd (_mm512_set1_pd (1.0), __U, __A);
+  __MM512_REDUCE_OP (*);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1);		\
+  __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0);		\
+  __m256d __T3 = _mm256_##op (__T1, __T2);				\
+  __m128d __T4 = _mm256_extractf128_pd (__T3, 1);			\
+  __m128d __T5 = _mm256_extractf128_pd (__T3, 0);			\
+  __m128d __T6 = _mm_##op (__T4, __T5);					\
+  __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 });	\
+  __m128d __T8 = _mm_##op (__T6, __T7);					\
+  return __T8[0]
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_pd (__m512d __A)
+{
+  __MM512_REDUCE_OP (min_pd);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_pd (__m512d __A)
+{
+  __MM512_REDUCE_OP (max_pd);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_pd (__mmask8 __U, __m512d __A)
+{
+  __A = _mm512_mask_mov_pd (_mm512_set1_pd (__builtin_inf ()), __U, __A);
+  __MM512_REDUCE_OP (min_pd);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_pd (__mmask8 __U, __m512d __A)
+{
+  __A = _mm512_mask_mov_pd (_mm512_set1_pd (-__builtin_inf ()), __U, __A);
+  __MM512_REDUCE_OP (max_pd);
+}
+
+#undef __MM512_REDUCE_OP
+
 #ifdef __DISABLE_AVX512F__
 #undef __DISABLE_AVX512F__
 #pragma GCC pop_options
--- gcc/testsuite/gcc.target/i386/avx512f-reduce-op-1.c.jj	2017-04-07 12:25:19.578556015 +0200
+++ gcc/testsuite/gcc.target/i386/avx512f-reduce-op-1.c	2017-04-07 16:34:37.972974281 +0200
@@ -0,0 +1,410 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#define AVX512F
+
+#include "avx512f-helper.h"
+
+__attribute__((noinline, noclone)) int
+test_reduce_add_epi32 (__m512i a)
+{
+  return _mm512_reduce_add_epi32 (a);
+}
+
+__attribute__((noinline, noclone)) int
+test_reduce_mul_epi32 (__m512i a)
+{
+  return _mm512_reduce_mul_epi32 (a);
+}
+
+__attribute__((noinline, noclone)) int
+test_reduce_and_epi32 (__m512i a)
+{
+  return _mm512_reduce_and_epi32 (a);
+}
+
+__attribute__((noinline, noclone)) int
+test_reduce_or_epi32 (__m512i a)
+{
+  return _mm512_reduce_or_epi32 (a);
+}
+
+__attribute__((noinline, noclone)) int
+test_mask_reduce_add_epi32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_add_epi32 (u, a);
+}
+
+__attribute__((noinline, noclone)) int
+test_mask_reduce_mul_epi32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_mul_epi32 (u, a);
+}
+
+__attribute__((noinline, noclone)) int
+test_mask_reduce_and_epi32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_and_epi32 (u, a);
+}
+
+__attribute__((noinline, noclone)) int
+test_mask_reduce_or_epi32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_or_epi32 (u, a);
+}
+
+__attribute__((noinline, noclone)) int
+test_reduce_min_epi32 (__m512i a)
+{
+  return _mm512_reduce_min_epi32 (a);
+}
+
+__attribute__((noinline, noclone)) int
+test_reduce_max_epi32 (__m512i a)
+{
+  return _mm512_reduce_max_epi32 (a);
+}
+
+__attribute__((noinline, noclone)) unsigned int
+test_reduce_min_epu32 (__m512i a)
+{
+  return _mm512_reduce_min_epu32 (a);
+}
+
+__attribute__((noinline, noclone)) unsigned int
+test_reduce_max_epu32 (__m512i a)
+{
+  return _mm512_reduce_max_epu32 (a);
+}
+
+__attribute__((noinline, noclone)) int
+test_mask_reduce_min_epi32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_min_epi32 (u, a);
+}
+
+__attribute__((noinline, noclone)) int
+test_mask_reduce_max_epi32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_max_epi32 (u, a);
+}
+
+__attribute__((noinline, noclone)) unsigned int
+test_mask_reduce_min_epu32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_min_epu32 (u, a);
+}
+
+__attribute__((noinline, noclone)) unsigned int
+test_mask_reduce_max_epu32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_max_epu32 (u, a);
+}
+
+__attribute__((noinline, noclone)) float
+test_reduce_add_ps (__m512 a)
+{
+  return _mm512_reduce_add_ps (a);
+}
+
+__attribute__((noinline, noclone)) float
+test_reduce_mul_ps (__m512 a)
+{
+  return _mm512_reduce_mul_ps (a);
+}
+
+__attribute__((noinline, noclone)) float
+test_mask_reduce_add_ps (__mmask16 u, __m512 a)
+{
+  return _mm512_mask_reduce_add_ps (u, a);
+}
+
+__attribute__((noinline, noclone)) float
+test_mask_reduce_mul_ps (__mmask16 u, __m512 a)
+{
+  return _mm512_mask_reduce_mul_ps (u, a);
+}
+
+__attribute__((noinline, noclone)) float
+test_reduce_min_ps (__m512 a)
+{
+  return _mm512_reduce_min_ps (a);
+}
+
+__attribute__((noinline, noclone)) float
+test_reduce_max_ps (__m512 a)
+{
+  return _mm512_reduce_max_ps (a);
+}
+
+__attribute__((noinline, noclone)) float
+test_mask_reduce_min_ps (__mmask16 u, __m512 a)
+{
+  return _mm512_mask_reduce_min_ps (u, a);
+}
+
+__attribute__((noinline, noclone)) float
+test_mask_reduce_max_ps (__mmask16 u, __m512 a)
+{
+  return _mm512_mask_reduce_max_ps (u, a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_reduce_add_epi64 (__m512i a)
+{
+  return _mm512_reduce_add_epi64 (a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_reduce_mul_epi64 (__m512i a)
+{
+  return _mm512_reduce_mul_epi64 (a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_reduce_and_epi64 (__m512i a)
+{
+  return _mm512_reduce_and_epi64 (a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_reduce_or_epi64 (__m512i a)
+{
+  return _mm512_reduce_or_epi64 (a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_mask_reduce_add_epi64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_add_epi64 (u, a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_mask_reduce_mul_epi64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_mul_epi64 (u, a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_mask_reduce_and_epi64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_and_epi64 (u, a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_mask_reduce_or_epi64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_or_epi64 (u, a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_reduce_min_epi64 (__m512i a)
+{
+  return _mm512_reduce_min_epi64 (a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_reduce_max_epi64 (__m512i a)
+{
+  return _mm512_reduce_max_epi64 (a);
+}
+
+__attribute__((noinline, noclone)) unsigned long long
+test_reduce_min_epu64 (__m512i a)
+{
+  return _mm512_reduce_min_epu64 (a);
+}
+
+__attribute__((noinline, noclone)) unsigned long long
+test_reduce_max_epu64 (__m512i a)
+{
+  return _mm512_reduce_max_epu64 (a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_mask_reduce_min_epi64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_min_epi64 (u, a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_mask_reduce_max_epi64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_max_epi64 (u, a);
+}
+
+__attribute__((noinline, noclone)) unsigned long long
+test_mask_reduce_min_epu64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_min_epu64 (u, a);
+}
+
+__attribute__((noinline, noclone)) unsigned long long
+test_mask_reduce_max_epu64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_max_epu64 (u, a);
+}
+
+__attribute__((noinline, noclone)) double
+test_reduce_add_pd (__m512d a)
+{
+  return _mm512_reduce_add_pd (a);
+}
+
+__attribute__((noinline, noclone)) double
+test_reduce_mul_pd (__m512d a)
+{
+  return _mm512_reduce_mul_pd (a);
+}
+
+__attribute__((noinline, noclone)) double
+test_mask_reduce_add_pd (__mmask8 u, __m512d a)
+{
+  return _mm512_mask_reduce_add_pd (u, a);
+}
+
+__attribute__((noinline, noclone)) double
+test_mask_reduce_mul_pd (__mmask8 u, __m512d a)
+{
+  return _mm512_mask_reduce_mul_pd (u, a);
+}
+
+__attribute__((noinline, noclone)) double
+test_reduce_min_pd (__m512d a)
+{
+  return _mm512_reduce_min_pd (a);
+}
+
+__attribute__((noinline, noclone)) double
+test_reduce_max_pd (__m512d a)
+{
+  return _mm512_reduce_max_pd (a);
+}
+
+__attribute__((noinline, noclone)) double
+test_mask_reduce_min_pd (__mmask8 u, __m512d a)
+{
+  return _mm512_mask_reduce_min_pd (u, a);
+}
+
+__attribute__((noinline, noclone)) double
+test_mask_reduce_max_pd (__mmask8 u, __m512d a)
+{
+  return _mm512_mask_reduce_max_pd (u, a);
+}
+
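+/* Check one reduction: call the intrinsic both directly and through the
+   noinline wrapper above, compare both results against a scalar loop,
+   repeat the comparison with MASK_VALUE as the mask, and finally verify
+   that an all-zeros mask yields the neutral element.  */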
+#define TESTOP(opname, op, type, suffix, neutral) \
+  do {									\
+    type r1 = _mm512_reduce_##opname##_##suffix (v.x);			\
+    type r2 = test_reduce_##opname##_##suffix (v.x);			\
+    type r3 = neutral;							\
+    if (r1 != r2)							\
+      __builtin_abort ();						\
+    for (int i = 0; i < SIZE; i++)					\
+      r3 = r3 op v.a[i];						\
+    if (r1 != r3)							\
+      __builtin_abort ();						\
+    type r4 = _mm512_mask_reduce_##opname##_##suffix (MASK_VALUE, v.x);	\
+    type r5 = test_mask_reduce_##opname##_##suffix (MASK_VALUE, v.x);	\
+    if (r4 != r5)							\
+      __builtin_abort ();						\
+    r3 = neutral;							\
+    for (int i = 0; i < SIZE; i++)					\
+      if (MASK_VALUE & (1 << i))					\
+	r3 = r3 op v.a[i];						\
+    if (r4 != r3)							\
+      __builtin_abort ();						\
+    type r6 = _mm512_mask_reduce_##opname##_##suffix (0, v.x);		\
+    type r7 = test_mask_reduce_##opname##_##suffix (0, v.x);		\
+    if (r6 != r7 || r6 != neutral)					\
+      __builtin_abort ();						\
+  } while (0)
+
+#define SIZE (AVX512F_LEN / 32)
+#include "avx512f-mask-type.h"
+
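+/* For min/max the "op" argument relies on the textual substitution in
+   TESTOP: passing "< v.a[i] ? r3 :" makes the loop body expand to
+   r3 = r3 < v.a[i] ? r3 : v.a[i], i.e. a scalar minimum (and similarly
+   for maximum).  */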
+#define TEST_EPI32(c1, c2, c3, c4, c5, c6, c7, c8, \
+		   c9, c10, c11, c12, c13, c14, c15, c16)		\
+  do {									\
+    UNION_TYPE (AVX512F_LEN, i_d) v;					\
+    v.x = _mm512_set_epi32 (c1, c2, c3, c4, c5, c6, c7, c8,		\
+			    c9, c10, c11, c12, c13, c14, c15, c16);	\
+    TESTOP (add, +, int, epi32, 0);					\
+    TESTOP (mul, *, int, epi32, 1);					\
+    TESTOP (and, &, int, epi32, ~0);					\
+    TESTOP (or, |, int, epi32, 0);					\
+    TESTOP (min, < v.a[i] ? r3 :, int, epi32, __INT_MAX__);		\
+    TESTOP (max, > v.a[i] ? r3 :, int, epi32, -__INT_MAX__ - 1);	\
+    TESTOP (min, < (unsigned) v.a[i] ? r3 :, unsigned, epu32, ~0U);	\
+    TESTOP (max, > (unsigned) v.a[i] ? r3 :, unsigned, epu32, 0);	\
+  } while (0)
+
+#define TEST_PS(c1, c2, c3, c4, c5, c6, c7, c8, \
+		c9, c10, c11, c12, c13, c14, c15, c16)			\
+  do {									\
+    UNION_TYPE (AVX512F_LEN, ) v;					\
+    v.x = _mm512_set_ps (c1, c2, c3, c4, c5, c6, c7, c8,		\
+			 c9, c10, c11, c12, c13, c14, c15, c16);	\
+    TESTOP (add, +, float, ps, 0.0f);					\
+    TESTOP (mul, *, float, ps, 1.0f);					\
+    TESTOP (min, < v.a[i] ? r3 :, float, ps, __builtin_inff ());	\
+    TESTOP (max, > v.a[i] ? r3 :, float, ps, -__builtin_inff ());	\
+  } while (0)
+
+static void
+test_epi32_ps (void)
+{
+  TEST_EPI32 (1, 2, 3, 4, 5, 6, 6, 5, 4, 3, 2, 1, 7, 6, 5, 4);
+  TEST_EPI32 (-1, 15, -1, 7, -1, 7, -1, -1, 6, 6, -1, -1, -1, -1, 7, 6);
+  TEST_PS (1, 2, 3, 4, 5, 6, 6, 5, 4, 3, 2, 1, 7, 6, 5, 4);
+  TEST_PS (1.25f, 2.25f, -0.25f, 4.0f, -2.0f, 4.0f, -3.0f, 2.0f,
+           -0.5f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 2.0f, 4.0f);
+}
+
+#undef SIZE
+#define SIZE (AVX512F_LEN / 64)
+#include "avx512f-mask-type.h"
+
+#define TEST_EPI64(c1, c2, c3, c4, c5, c6, c7, c8) \
+  do {									\
+    UNION_TYPE (AVX512F_LEN, i_q) v;					\
+    v.x = _mm512_set_epi64 (c1, c2, c3, c4, c5, c6, c7, c8);		\
+    TESTOP (add, +, long long, epi64, 0);				\
+    TESTOP (mul, *, long long, epi64, 1);				\
+    TESTOP (and, &, long long, epi64, ~0LL);				\
+    TESTOP (or, |, long long, epi64, 0);				\
+    TESTOP (min, < v.a[i] ? r3 :, long long, epi64, __LONG_LONG_MAX__);	\
+    TESTOP (max, > v.a[i] ? r3 :, long long, epi64,			\
+	    -__LONG_LONG_MAX__ - 1);					\
+    TESTOP (min, < (unsigned long long) v.a[i] ? r3 :,			\
+	    unsigned long long, epu64, ~0ULL);				\
+    TESTOP (max, > (unsigned long long) v.a[i] ? r3 :,			\
+	    unsigned long long, epu64, 0);				\
+  } while (0)
+
+#define TEST_PD(c1, c2, c3, c4, c5, c6, c7, c8) \
+  do {									\
+    UNION_TYPE (AVX512F_LEN, d) v;					\
+    v.x = _mm512_set_pd (c1, c2, c3, c4, c5, c6, c7, c8);		\
+    TESTOP (add, +, double, pd, 0.0);					\
+    TESTOP (mul, *, double, pd, 1.0);					\
+    TESTOP (min, < v.a[i] ? r3 :, double, pd, __builtin_inf ());	\
+    TESTOP (max, > v.a[i] ? r3 :, double, pd, -__builtin_inf ());	\
+  } while (0)
+
+static void
+test_epi64_pd (void)
+{
+  TEST_EPI64 (1, 2, 3, 4, 5, 6, 6, 5);
+  TEST_EPI64 (-1, 15, -1, 7, -1, 7, -1, -1);
+  TEST_PD (1, 2, 3, 4, 5, 6, 6, 5);
+  TEST_PD (1.25f, 2.25f, -0.25f, 4.0f, -2.0f, 4.0f, -3.0f, 2.0f);
+}
+
+void
+test_512 (void)
+{
+  test_epi32_ps ();
+  test_epi64_pd ();
+}
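
E.g. a horizontal float reduction can then be written as below (a minimal
usage sketch, assuming -mavx512f and n being a multiple of 16; sum512 and
its arguments are just illustrative names):

#include <immintrin.h>

float
sum512 (const float *p, unsigned long n)
{
  float s = 0.0f;
  /* Illustrative only: each iteration horizontally adds 16 floats
     with the new _mm512_reduce_add_ps.  */
  for (unsigned long i = 0; i < n; i += 16)
    s += _mm512_reduce_add_ps (_mm512_loadu_ps (p + i));
  return s;
}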