Message ID: 20170407145250.GB1914@tucnak
State: New
On Fri, Apr 7, 2017 at 4:52 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> Hi!
>
> This patch is slightly larger, so I haven't included it in the patch I've
> sent a few minutes ago.
>
> I've looked at godbolt for what ICC generates for these and picked sequences
> that generate approx. as good code as that.  For
> min_epi64/max_epi64/min_epu64/max_epu64 there is a slight complication that
> in AVX512F there is only _mm512_{min,max}_ep{i,u}64 but not the _mm256_ or
> _mm_ ones, so we need to perform 512-bit operations all the time rather than
> perform extractions, a 256-bit operation, further extractions and then
> 128-bit operations.
>
> Seems we need to teach our permutation code further instructions, e.g.
> typedef long long V __attribute__((vector_size (64)));
> typedef int W __attribute__((vector_size (64)));
> W f0 (W x) {
>   return __builtin_shuffle (x, (W) { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 });
> }
> V f1 (V x) {
>   return __builtin_shuffle (x, (V) { 4, 5, 6, 7, 0, 1, 2, 3 });
> }
> generate unnecessarily bad code (they could use the vshufi64x2 instruction);
> guess that can be resolved for GCC 8.
>
> Tested with
> make -j272 -k check-gcc RUNTESTFLAGS='--target_board=unix\{-m32,-m64\} i386.exp'
> on KNL; will bootstrap/regtest on my Haswell-E next.  Ok for trunk
> if that passes?
>
> It is not a regression; on the other hand, it really shouldn't affect any
> code that is not using those intrinsics.
>
> 2017-04-07  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/80324
>         * config/i386/avx512fintrin.h (_mm512_reduce_add_epi32,
>         _mm512_reduce_mul_epi32, _mm512_reduce_and_epi32,
>         _mm512_reduce_or_epi32, _mm512_mask_reduce_add_epi32,
>         _mm512_mask_reduce_mul_epi32, _mm512_mask_reduce_and_epi32,
>         _mm512_mask_reduce_or_epi32, _mm512_reduce_min_epi32,
>         _mm512_reduce_max_epi32, _mm512_reduce_min_epu32,
>         _mm512_reduce_max_epu32, _mm512_mask_reduce_min_epi32,
>         _mm512_mask_reduce_max_epi32, _mm512_mask_reduce_min_epu32,
>         _mm512_mask_reduce_max_epu32, _mm512_reduce_add_ps,
>         _mm512_reduce_mul_ps, _mm512_mask_reduce_add_ps,
>         _mm512_mask_reduce_mul_ps, _mm512_reduce_min_ps, _mm512_reduce_max_ps,
>         _mm512_mask_reduce_min_ps, _mm512_mask_reduce_max_ps,
>         _mm512_reduce_add_epi64, _mm512_reduce_mul_epi64,
>         _mm512_reduce_and_epi64, _mm512_reduce_or_epi64,
>         _mm512_mask_reduce_add_epi64, _mm512_mask_reduce_mul_epi64,
>         _mm512_mask_reduce_and_epi64, _mm512_mask_reduce_or_epi64,
>         _mm512_reduce_min_epi64, _mm512_reduce_max_epi64,
>         _mm512_mask_reduce_min_epi64, _mm512_mask_reduce_max_epi64,
>         _mm512_reduce_min_epu64, _mm512_reduce_max_epu64,
>         _mm512_mask_reduce_min_epu64, _mm512_mask_reduce_max_epu64,
>         _mm512_reduce_add_pd, _mm512_reduce_mul_pd, _mm512_mask_reduce_add_pd,
>         _mm512_mask_reduce_mul_pd, _mm512_reduce_min_pd, _mm512_reduce_max_pd,
>         _mm512_mask_reduce_min_pd, _mm512_mask_reduce_max_pd): New intrinsics.
>
>         * gcc.target/i386/avx512f-reduce-op-1.c: New test.

LGTM, but please wait for Kirill's opinion on the implementation.

Thanks,
Uros.
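For reference, the 256-bit half swap that f1 above performs corresponds to a single
vshufi64x2, which is also what the patch uses in its 512-bit min/max reduction macro.
A minimal sketch with AVX512F intrinsics (the helper name is illustrative, not part
of the patch):

#include <immintrin.h>

/* Swap the two 256-bit halves of a 512-bit vector: in 64-bit element
   terms the permutation { 4, 5, 6, 7, 0, 1, 2, 3 }, i.e. f1 above.
   Immediate 0x4e picks 128-bit lanes 2, 3 from the first operand and
   lanes 0, 1 from the second, so with both operands equal to x this
   compiles to one vshufi64x2 instruction.  */
static inline __m512i
swap_256bit_halves (__m512i x)
{
  return _mm512_shuffle_i64x2 (x, x, 0x4e);
}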
Hi Jakub,

On 07 Apr 16:52, Jakub Jelinek wrote:
> Hi!
>
> This patch is slightly larger, so I haven't included it in the patch I've
> sent a few minutes ago.
>
> I've looked at godbolt for what ICC generates for these and picked sequences
> that generate approx. as good code as that.  For
> min_epi64/max_epi64/min_epu64/max_epu64 there is a slight complication that
> in AVX512F there is only _mm512_{min,max}_ep{i,u}64 but not the _mm256_ or
> _mm_ ones, so we need to perform 512-bit operations all the time rather than
> perform extractions, a 256-bit operation, further extractions and then
> 128-bit operations.
>
> Seems we need to teach our permutation code further instructions, e.g.
> typedef long long V __attribute__((vector_size (64)));
> typedef int W __attribute__((vector_size (64)));
> W f0 (W x) {
>   return __builtin_shuffle (x, (W) { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 });
> }
> V f1 (V x) {
>   return __builtin_shuffle (x, (V) { 4, 5, 6, 7, 0, 1, 2, 3 });
> }
> generate unnecessarily bad code (they could use the vshufi64x2 instruction);
> guess that can be resolved for GCC 8.
>
> Tested with
> make -j272 -k check-gcc RUNTESTFLAGS='--target_board=unix\{-m32,-m64\} i386.exp'
> on KNL; will bootstrap/regtest on my Haswell-E next.  Ok for trunk
> if that passes?

Patch is OK for trunk, thanks for implementing those intrinsics!

--
K
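A note on the masked forms in the patch: each one substitutes the operation's
identity element into the masked-off lanes before reducing (_mm512_maskz_mov_epi32
for add/or, _mm512_set1_epi32 (1) for mul, _mm512_set1_epi32 (~0) for and,
__INT_MAX__ for signed min, and so on), so a lane with a zero mask bit cannot
affect the result. A minimal scalar reference for the add case, mirroring the
checking loop in the testcase below (the function name is illustrative):

#include <immintrin.h>

/* Scalar model of _mm512_mask_reduce_add_epi32: lanes whose mask bit
   is zero contribute the additive identity 0.  */
static int
mask_reduce_add_epi32_ref (__mmask16 u, const int a[16])
{
  int r = 0;
  for (int i = 0; i < 16; i++)
    if (u & (1 << i))
      r += a[i];
  return r;
}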
--- gcc/config/i386/avx512fintrin.h.jj	2017-04-07 12:25:13.065643755 +0200
+++ gcc/config/i386/avx512fintrin.h	2017-04-07 16:34:37.976974227 +0200
@@ -13282,6 +13282,470 @@ _mm512_cmpgt_epu64_mask (__m512i __A, __
 						 (__mmask8) -1);
 }
 
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); \
+  __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); \
+  __m256i __T3 = (__m256i) (__T1 op __T2); \
+  __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); \
+  __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); \
+  __v4si __T6 = __T4 op __T5; \
+  __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); \
+  __v4si __T8 = __T6 op __T7; \
+  return __T8[0] op __T8[1]
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_and_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (&);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_or_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (|);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi32 (__U, __A);
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (1), __U, __A);
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_and_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A);
+  __MM512_REDUCE_OP (&);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_or_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi32 (__U, __A);
+  __MM512_REDUCE_OP (|);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1); \
+  __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0); \
+  __m256i __T3 = _mm256_##op (__T1, __T2); \
+  __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1); \
+  __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0); \
+  __m128i __T6 = _mm_##op (__T4, __T5); \
+  __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, \
+					      (__v4si) { 2, 3, 0, 1 }); \
+  __m128i __T8 = _mm_##op (__T6, __T7); \
+  __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, \
+					      (__v4si) { 1, 0, 1, 0 }); \
+  __v4si __T10 = (__v4si) _mm_##op (__T8, __T9); \
+  return __T10[0]
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (min_epi32);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (max_epi32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epu32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (min_epu32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epu32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (max_epu32);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (__INT_MAX__), __U, __A);
+  __MM512_REDUCE_OP (min_epi32);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (-__INT_MAX__ - 1), __U, __A);
+  __MM512_REDUCE_OP (max_epi32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epu32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A);
+  __MM512_REDUCE_OP (min_epu32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epu32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi32 (__U, __A);
+  __MM512_REDUCE_OP (max_epu32);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); \
+  __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); \
+  __m256 __T3 = __T1 op __T2; \
+  __m128 __T4 = _mm256_extractf128_ps (__T3, 1); \
+  __m128 __T5 = _mm256_extractf128_ps (__T3, 0); \
+  __m128 __T6 = __T4 op __T5; \
+  __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); \
+  __m128 __T8 = __T6 op __T7; \
+  return __T8[0] op __T8[1]
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_ps (__m512 __A)
+{
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_ps (__m512 __A)
+{
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_ps (__mmask16 __U, __m512 __A)
+{
+  __A = _mm512_maskz_mov_ps (__U, __A);
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_ps (__mmask16 __U, __m512 __A)
+{
+  __A = _mm512_mask_mov_ps (_mm512_set1_ps (1.0f), __U, __A);
+  __MM512_REDUCE_OP (*);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); \
+  __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); \
+  __m256 __T3 = _mm256_##op (__T1, __T2); \
+  __m128 __T4 = _mm256_extractf128_ps (__T3, 1); \
+  __m128 __T5 = _mm256_extractf128_ps (__T3, 0); \
+  __m128 __T6 = _mm_##op (__T4, __T5); \
+  __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); \
+  __m128 __T8 = _mm_##op (__T6, __T7); \
+  __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 }); \
+  __m128 __T10 = _mm_##op (__T8, __T9); \
+  return __T10[0]
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_ps (__m512 __A)
+{
+  __MM512_REDUCE_OP (min_ps);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_ps (__m512 __A)
+{
+  __MM512_REDUCE_OP (max_ps);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_ps (__mmask16 __U, __m512 __A)
+{
+  __A = _mm512_mask_mov_ps (_mm512_set1_ps (__builtin_inff ()), __U, __A);
+  __MM512_REDUCE_OP (min_ps);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_ps (__mmask16 __U, __m512 __A)
+{
+  __A = _mm512_mask_mov_ps (_mm512_set1_ps (-__builtin_inff ()), __U, __A);
+  __MM512_REDUCE_OP (max_ps);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1); \
+  __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0); \
+  __m256i __T3 = (__m256i) (__T1 op __T2); \
+  __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1); \
+  __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0); \
+  __v2di __T6 = __T4 op __T5; \
+  return __T6[0] op __T6[1]
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_and_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (&);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_or_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (|);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi64 (__U, __A);
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (1LL), __U, __A);
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_and_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A);
+  __MM512_REDUCE_OP (&);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_or_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi64 (__U, __A);
+  __MM512_REDUCE_OP (|);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e); \
+  __m512i __T2 = _mm512_##op (__A, __T1); \
+  __m512i __T3 \
+    = (__m512i) __builtin_shuffle ((__v8di) __T2, \
+				   (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 }); \
+  __m512i __T4 = _mm512_##op (__T2, __T3); \
+  __m512i __T5 \
+    = (__m512i) __builtin_shuffle ((__v8di) __T4, \
+				   (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 }); \
+  __v8di __T6 = (__v8di) _mm512_##op (__T4, __T5); \
+  return __T6[0]
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (min_epi64);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (max_epi64);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (__LONG_LONG_MAX__),
+			       __U, __A);
+  __MM512_REDUCE_OP (min_epi64);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (-__LONG_LONG_MAX__ - 1),
+			       __U, __A);
+  __MM512_REDUCE_OP (max_epi64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epu64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (min_epu64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epu64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (max_epu64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epu64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A);
+  __MM512_REDUCE_OP (min_epu64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epu64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi64 (__U, __A);
+  __MM512_REDUCE_OP (max_epu64);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); \
+  __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); \
+  __m256d __T3 = __T1 op __T2; \
+  __m128d __T4 = _mm256_extractf128_pd (__T3, 1); \
+  __m128d __T5 = _mm256_extractf128_pd (__T3, 0); \
+  __m128d __T6 = __T4 op __T5; \
+  return __T6[0] op __T6[1]
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_pd (__m512d __A)
+{
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_pd (__m512d __A)
+{
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_pd (__mmask8 __U, __m512d __A)
+{
+  __A = _mm512_maskz_mov_pd (__U, __A);
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_pd (__mmask8 __U, __m512d __A)
+{
+  __A = _mm512_mask_mov_pd (_mm512_set1_pd (1.0), __U, __A);
+  __MM512_REDUCE_OP (*);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); \
+  __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); \
+  __m256d __T3 = _mm256_##op (__T1, __T2); \
+  __m128d __T4 = _mm256_extractf128_pd (__T3, 1); \
+  __m128d __T5 = _mm256_extractf128_pd (__T3, 0); \
+  __m128d __T6 = _mm_##op (__T4, __T5); \
+  __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 }); \
+  __m128d __T8 = _mm_##op (__T6, __T7); \
+  return __T8[0]
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_pd (__m512d __A)
+{
+  __MM512_REDUCE_OP (min_pd);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_pd (__m512d __A)
+{
+  __MM512_REDUCE_OP (max_pd);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_pd (__mmask8 __U, __m512d __A)
+{
+  __A = _mm512_mask_mov_pd (_mm512_set1_pd (__builtin_inf ()), __U, __A);
+  __MM512_REDUCE_OP (min_pd);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_pd (__mmask8 __U, __m512d __A)
+{
+  __A = _mm512_mask_mov_pd (_mm512_set1_pd (-__builtin_inf ()), __U, __A);
+  __MM512_REDUCE_OP (max_pd);
+}
+
+#undef __MM512_REDUCE_OP
+
 #ifdef __DISABLE_AVX512F__
 #undef __DISABLE_AVX512F__
 #pragma GCC pop_options
--- gcc/testsuite/gcc.target/i386/avx512f-reduce-op-1.c.jj	2017-04-07 12:25:19.578556015 +0200
+++ gcc/testsuite/gcc.target/i386/avx512f-reduce-op-1.c	2017-04-07 16:34:37.972974281 +0200
@@ -0,0 +1,410 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mavx512f" } */
+/* { dg-require-effective-target avx512f } */
+
+#define AVX512F
+
+#include "avx512f-helper.h"
+
+__attribute__((noinline, noclone)) int
+test_reduce_add_epi32 (__m512i a)
+{
+  return _mm512_reduce_add_epi32 (a);
+}
+
+__attribute__((noinline, noclone)) int
+test_reduce_mul_epi32 (__m512i a)
+{
+  return _mm512_reduce_mul_epi32 (a);
+}
+
+__attribute__((noinline, noclone)) int
+test_reduce_and_epi32 (__m512i a)
+{
+  return _mm512_reduce_and_epi32 (a);
+}
+
+__attribute__((noinline, noclone)) int
+test_reduce_or_epi32 (__m512i a)
+{
+  return _mm512_reduce_or_epi32 (a);
+}
+
+__attribute__((noinline, noclone)) int
+test_mask_reduce_add_epi32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_add_epi32 (u, a);
+}
+
+__attribute__((noinline, noclone)) int
+test_mask_reduce_mul_epi32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_mul_epi32 (u, a);
+}
+
+__attribute__((noinline, noclone)) int
+test_mask_reduce_and_epi32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_and_epi32 (u, a);
+}
+
+__attribute__((noinline, noclone)) int
+test_mask_reduce_or_epi32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_or_epi32 (u, a);
+}
+
+__attribute__((noinline, noclone)) int
+test_reduce_min_epi32 (__m512i a)
+{
+  return _mm512_reduce_min_epi32 (a);
+}
+
+__attribute__((noinline, noclone)) int
+test_reduce_max_epi32 (__m512i a)
+{
+  return _mm512_reduce_max_epi32 (a);
+}
+
+__attribute__((noinline, noclone)) unsigned int
+test_reduce_min_epu32 (__m512i a)
+{
+  return _mm512_reduce_min_epu32 (a);
+}
+
+__attribute__((noinline, noclone)) unsigned int
+test_reduce_max_epu32 (__m512i a)
+{
+  return _mm512_reduce_max_epu32 (a);
+}
+
+__attribute__((noinline, noclone)) int
+test_mask_reduce_min_epi32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_min_epi32 (u, a);
+}
+
+__attribute__((noinline, noclone)) int
+test_mask_reduce_max_epi32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_max_epi32 (u, a);
+}
+
+__attribute__((noinline, noclone)) unsigned int
+test_mask_reduce_min_epu32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_min_epu32 (u, a);
+}
+
+__attribute__((noinline, noclone)) unsigned int
+test_mask_reduce_max_epu32 (__mmask16 u, __m512i a)
+{
+  return _mm512_mask_reduce_max_epu32 (u, a);
+}
+
+__attribute__((noinline, noclone)) float
+test_reduce_add_ps (__m512 a)
+{
+  return _mm512_reduce_add_ps (a);
+}
+
+__attribute__((noinline, noclone)) float
+test_reduce_mul_ps (__m512 a)
+{
+  return _mm512_reduce_mul_ps (a);
+}
+
+__attribute__((noinline, noclone)) float
+test_mask_reduce_add_ps (__mmask16 u, __m512 a)
+{
+  return _mm512_mask_reduce_add_ps (u, a);
+}
+
+__attribute__((noinline, noclone)) float
+test_mask_reduce_mul_ps (__mmask16 u, __m512 a)
+{
+  return _mm512_mask_reduce_mul_ps (u, a);
+}
+
+__attribute__((noinline, noclone)) float
+test_reduce_min_ps (__m512 a)
+{
+  return _mm512_reduce_min_ps (a);
+}
+
+__attribute__((noinline, noclone)) float
+test_reduce_max_ps (__m512 a)
+{
+  return _mm512_reduce_max_ps (a);
+}
+
+__attribute__((noinline, noclone)) float
+test_mask_reduce_min_ps (__mmask16 u, __m512 a)
+{
+  return _mm512_mask_reduce_min_ps (u, a);
+}
+
+__attribute__((noinline, noclone)) float
+test_mask_reduce_max_ps (__mmask16 u, __m512 a)
+{
+  return _mm512_mask_reduce_max_ps (u, a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_reduce_add_epi64 (__m512i a)
+{
+  return _mm512_reduce_add_epi64 (a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_reduce_mul_epi64 (__m512i a)
+{
+  return _mm512_reduce_mul_epi64 (a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_reduce_and_epi64 (__m512i a)
+{
+  return _mm512_reduce_and_epi64 (a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_reduce_or_epi64 (__m512i a)
+{
+  return _mm512_reduce_or_epi64 (a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_mask_reduce_add_epi64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_add_epi64 (u, a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_mask_reduce_mul_epi64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_mul_epi64 (u, a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_mask_reduce_and_epi64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_and_epi64 (u, a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_mask_reduce_or_epi64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_or_epi64 (u, a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_reduce_min_epi64 (__m512i a)
+{
+  return _mm512_reduce_min_epi64 (a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_reduce_max_epi64 (__m512i a)
+{
+  return _mm512_reduce_max_epi64 (a);
+}
+
+__attribute__((noinline, noclone)) unsigned long long
+test_reduce_min_epu64 (__m512i a)
+{
+  return _mm512_reduce_min_epu64 (a);
+}
+
+__attribute__((noinline, noclone)) unsigned long long
+test_reduce_max_epu64 (__m512i a)
+{
+  return _mm512_reduce_max_epu64 (a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_mask_reduce_min_epi64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_min_epi64 (u, a);
+}
+
+__attribute__((noinline, noclone)) long long
+test_mask_reduce_max_epi64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_max_epi64 (u, a);
+}
+
+__attribute__((noinline, noclone)) unsigned long long
+test_mask_reduce_min_epu64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_min_epu64 (u, a);
+}
+
+__attribute__((noinline, noclone)) unsigned long long
+test_mask_reduce_max_epu64 (__mmask8 u, __m512i a)
+{
+  return _mm512_mask_reduce_max_epu64 (u, a);
+}
+
+__attribute__((noinline, noclone)) double
+test_reduce_add_pd (__m512d a)
+{
+  return _mm512_reduce_add_pd (a);
+}
+
+__attribute__((noinline, noclone)) double
+test_reduce_mul_pd (__m512d a)
+{
+  return _mm512_reduce_mul_pd (a);
+}
+
+__attribute__((noinline, noclone)) double
+test_mask_reduce_add_pd (__mmask8 u, __m512d a)
+{
+  return _mm512_mask_reduce_add_pd (u, a);
+}
+
+__attribute__((noinline, noclone)) double
+test_mask_reduce_mul_pd (__mmask8 u, __m512d a)
+{
+  return _mm512_mask_reduce_mul_pd (u, a);
+}
+
+__attribute__((noinline, noclone)) double
+test_reduce_min_pd (__m512d a)
+{
+  return _mm512_reduce_min_pd (a);
+}
+
+__attribute__((noinline, noclone)) double
+test_reduce_max_pd (__m512d a)
+{
+  return _mm512_reduce_max_pd (a);
+}
+
+__attribute__((noinline, noclone)) double
+test_mask_reduce_min_pd (__mmask8 u, __m512d a)
+{
+  return _mm512_mask_reduce_min_pd (u, a);
+}
+
+__attribute__((noinline, noclone)) double
+test_mask_reduce_max_pd (__mmask8 u, __m512d a)
+{
+  return _mm512_mask_reduce_max_pd (u, a);
+}
+
+#define TESTOP(opname, op, type, suffix, neutral) \
+  do { \
+    type r1 = _mm512_reduce_##opname##_##suffix (v.x); \
+    type r2 = test_reduce_##opname##_##suffix (v.x); \
+    type r3 = neutral; \
+    if (r1 != r2) \
+      __builtin_abort (); \
+    for (int i = 0; i < SIZE; i++) \
+      r3 = r3 op v.a[i]; \
+    if (r1 != r3) \
+      __builtin_abort (); \
+    type r4 = _mm512_mask_reduce_##opname##_##suffix (MASK_VALUE, v.x); \
+    type r5 = test_mask_reduce_##opname##_##suffix (MASK_VALUE, v.x); \
+    if (r4 != r5) \
+      __builtin_abort (); \
+    r3 = neutral; \
+    for (int i = 0; i < SIZE; i++) \
+      if (MASK_VALUE & (1 << i)) \
+	r3 = r3 op v.a[i]; \
+    if (r4 != r3) \
+      __builtin_abort (); \
+    type r6 = _mm512_mask_reduce_##opname##_##suffix (0, v.x); \
+    type r7 = test_mask_reduce_##opname##_##suffix (0, v.x); \
+    if (r6 != r7 || r6 != neutral) \
+      __builtin_abort (); \
+  } while (0)
+
+#define SIZE (AVX512F_LEN / 32)
+#include "avx512f-mask-type.h"
+
+#define TEST_EPI32(c1, c2, c3, c4, c5, c6, c7, c8, \
+		   c9, c10, c11, c12, c13, c14, c15, c16) \
+  do { \
+    UNION_TYPE (AVX512F_LEN, i_d) v; \
+    v.x = _mm512_set_epi32 (c1, c2, c3, c4, c5, c6, c7, c8, \
+			    c9, c10, c11, c12, c13, c14, c15, c16); \
+    TESTOP (add, +, int, epi32, 0); \
+    TESTOP (mul, *, int, epi32, 1); \
+    TESTOP (and, &, int, epi32, ~0); \
+    TESTOP (or, |, int, epi32, 0); \
+    TESTOP (min, < v.a[i] ? r3 :, int, epi32, __INT_MAX__); \
+    TESTOP (max, > v.a[i] ? r3 :, int, epi32, -__INT_MAX__ - 1); \
+    TESTOP (min, < (unsigned) v.a[i] ? r3 :, unsigned, epu32, ~0U); \
+    TESTOP (max, > (unsigned) v.a[i] ? r3 :, unsigned, epu32, 0); \
+  } while (0)
+
+#define TEST_PS(c1, c2, c3, c4, c5, c6, c7, c8, \
+		c9, c10, c11, c12, c13, c14, c15, c16) \
+  do { \
+    UNION_TYPE (AVX512F_LEN, ) v; \
+    v.x = _mm512_set_ps (c1, c2, c3, c4, c5, c6, c7, c8, \
+			 c9, c10, c11, c12, c13, c14, c15, c16); \
+    TESTOP (add, +, float, ps, 0.0f); \
+    TESTOP (mul, *, float, ps, 1.0f); \
+    TESTOP (min, < v.a[i] ? r3 :, float, ps, __builtin_inff ()); \
+    TESTOP (max, > v.a[i] ? r3 :, float, ps, -__builtin_inff ()); \
+  } while (0)
+
+static void
+test_epi32_ps (void)
+{
+  TEST_EPI32 (1, 2, 3, 4, 5, 6, 6, 5, 4, 3, 2, 1, 7, 6, 5, 4);
+  TEST_EPI32 (-1, 15, -1, 7, -1, 7, -1, -1, 6, 6, -1, -1, -1, -1, 7, 6);
+  TEST_PS (1, 2, 3, 4, 5, 6, 6, 5, 4, 3, 2, 1, 7, 6, 5, 4);
+  TEST_PS (1.25f, 2.25f, -0.25f, 4.0f, -2.0f, 4.0f, -3.0f, 2.0f,
+	   -0.5f, -1.0f, 1.0f, -1.0f, 1.0f, 1.0f, 2.0f, 4.0f);
+}
+
+#undef SIZE
+#define SIZE (AVX512F_LEN / 64)
+#include "avx512f-mask-type.h"
+
+#define TEST_EPI64(c1, c2, c3, c4, c5, c6, c7, c8) \
+  do { \
+    UNION_TYPE (AVX512F_LEN, i_q) v; \
+    v.x = _mm512_set_epi64 (c1, c2, c3, c4, c5, c6, c7, c8); \
+    TESTOP (add, +, long long, epi64, 0); \
+    TESTOP (mul, *, long long, epi64, 1); \
+    TESTOP (and, &, long long, epi64, ~0LL); \
+    TESTOP (or, |, long long, epi64, 0); \
+    TESTOP (min, < v.a[i] ? r3 :, long long, epi64, __LONG_LONG_MAX__); \
+    TESTOP (max, > v.a[i] ? r3 :, long long, epi64, \
+	    -__LONG_LONG_MAX__ - 1); \
+    TESTOP (min, < (unsigned long long) v.a[i] ? r3 :, \
+	    unsigned long long, epu64, ~0ULL); \
+    TESTOP (max, > (unsigned long long) v.a[i] ? r3 :, \
+	    unsigned long long, epu64, 0); \
+  } while (0)
+
+#define TEST_PD(c1, c2, c3, c4, c5, c6, c7, c8) \
+  do { \
+    UNION_TYPE (AVX512F_LEN, d) v; \
+    v.x = _mm512_set_pd (c1, c2, c3, c4, c5, c6, c7, c8); \
+    TESTOP (add, +, double, pd, 0.0); \
+    TESTOP (mul, *, double, pd, 1.0); \
+    TESTOP (min, < v.a[i] ? r3 :, double, pd, __builtin_inf ()); \
+    TESTOP (max, > v.a[i] ? r3 :, double, pd, -__builtin_inf ()); \
+  } while (0)
+
+static void
+test_epi64_pd (void)
+{
+  TEST_EPI64 (1, 2, 3, 4, 5, 6, 6, 5);
+  TEST_EPI64 (-1, 15, -1, 7, -1, 7, -1, -1);
+  TEST_PD (1, 2, 3, 4, 5, 6, 6, 5);
+  TEST_PD (1.25f, 2.25f, -0.25f, 4.0f, -2.0f, 4.0f, -3.0f, 2.0f);
+}
+
+void
+test_512 (void)
+{
+  test_epi32_ps ();
+  test_epi64_pd ();
+}
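As a usage sketch, each of the new intrinsics reduces a whole 512-bit vector to a
scalar in one call. A minimal program, compiled with -O2 -mavx512f on an
AVX512F-capable target (main and the constants below are illustrative, not part of
the testsuite):

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  /* _mm512_set_epi32 lists elements from lane 15 down to lane 0.  */
  __m512i v = _mm512_set_epi32 (1, 2, 3, 4, 5, 6, 6, 5,
				4, 3, 2, 1, 7, 6, 5, 4);
  printf ("sum = %d\n", _mm512_reduce_add_epi32 (v));	/* 64 */
  printf ("max = %d\n", _mm512_reduce_max_epi32 (v));	/* 7 */
  /* Mask 0x00ff keeps only lanes 0..7; masked-off lanes contribute
     the additive identity, so this sums 4+5+6+7+1+2+3+4.  */
  printf ("masked sum = %d\n",
	  _mm512_mask_reduce_add_epi32 (0x00ff, v));	/* 32 */
  return 0;
}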