diff mbox series

[i386] Mask generation in avx2intrin.h

Message ID alpine.DEB.2.02.1711251032400.27363@stedding.saclay.inria.fr
State New
Headers show
Series [i386] Mask generation in avx2intrin.h | expand

Commit Message

Marc Glisse Nov. 25, 2017, 10 a.m. UTC
Hello,

the way full masks are generated currently in avx2intrin.h is 
questionable: opaque for the inline functions, weird/wrong for the macros.

It is possible we may want to add code so the constant mask with all ones 
may be generated with vxorpd+vcmpeqpd instead of loading it from memory, 
but that looks like something that should be decided globally, not in each 
instruction that uses it.

Bootstrap+regtest on x86_64-pc-linux-gnu (skylake).

2017-11-27  Marc Glisse  <marc.glisse@inria.fr>

 	PR target/80885
 	* config/i386/avx2intrin.h (_mm_i32gather_pd): Rewrite mask generation.
 	(_mm256_i32gather_pd): Likewise.
 	(_mm_i64gather_pd): Likewise.
 	(_mm256_i64gather_pd): Likewise.
 	(_mm_i32gather_ps): Likewise.
 	(_mm256_i32gather_ps): Likewise.
 	(_mm_i64gather_ps): Likewise.
 	(_mm256_i64gather_ps): Likewise.

Comments

Marc Glisse Dec. 5, 2017, 12:19 p.m. UTC | #1
Adding Cc: Uros and Kirill
https://gcc.gnu.org/ml/gcc-patches/2017-11/msg02233.html

On Sat, 25 Nov 2017, Marc Glisse wrote:

> Hello,
>
> the way full masks are generated currently in avx2intrin.h is questionable: 
> opaque for the inline functions, weird/wrong for the macros.
>
> It is possible we may want to add code so the constant mask with all ones may 
> be generated with vxorpd+vcmpeqpd instead of loading it from memory, but that 
> looks like something that should be decided globally, not in each instruction 
> that uses it.
>
> Bootstrap+regtest on x86_64-pc-linux-gnu (skylake).
>
> 2017-11-27  Marc Glisse  <marc.glisse@inria.fr>
>
> 	PR target/80885
> 	* config/i386/avx2intrin.h (_mm_i32gather_pd): Rewrite mask generation.
> 	(_mm256_i32gather_pd): Likewise.
> 	(_mm_i64gather_pd): Likewise.
> 	(_mm256_i64gather_pd): Likewise.
> 	(_mm_i32gather_ps): Likewise.
> 	(_mm256_i32gather_ps): Likewise.
> 	(_mm_i64gather_ps): Likewise.
> 	(_mm256_i64gather_ps): Likewise.
Marc Glisse April 30, 2018, 9:04 p.m. UTC | #2
Ping https://gcc.gnu.org/ml/gcc-patches/2017-11/msg02233.html

On Sat, 25 Nov 2017, Marc Glisse wrote:

> Hello,
>
> the way full masks are generated currently in avx2intrin.h is questionable: 
> opaque for the inline functions, weird/wrong for the macros.
>
> It is possible we may want to add code so the constant mask with all ones may 
> be generated with vxorpd+vcmpeqpd instead of loading it from memory, but that 
> looks like something that should be decided globally, not in each instruction 
> that uses it.
>
> Bootstrap+regtest on x86_64-pc-linux-gnu (skylake).
>
> 2017-11-27  Marc Glisse  <marc.glisse@inria.fr>
>
> 	PR target/80885
> 	* config/i386/avx2intrin.h (_mm_i32gather_pd): Rewrite mask generation.
> 	(_mm256_i32gather_pd): Likewise.
> 	(_mm_i64gather_pd): Likewise.
> 	(_mm256_i64gather_pd): Likewise.
> 	(_mm_i32gather_ps): Likewise.
> 	(_mm256_i32gather_ps): Likewise.
> 	(_mm_i64gather_ps): Likewise.
> 	(_mm256_i64gather_ps): Likewise.
diff mbox series

Patch

Index: gcc/config/i386/avx2intrin.h
===================================================================
--- gcc/config/i386/avx2intrin.h	(revision 255140)
+++ gcc/config/i386/avx2intrin.h	(working copy)
@@ -1241,22 +1241,21 @@  __attribute__ ((__gnu_inline__, __always
 _mm_srlv_epi64 (__m128i __X, __m128i __Y)
 {
   return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
 }
 
 #ifdef __OPTIMIZE__
 extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_i32gather_pd (double const *__base, __m128i __index, const int __scale)
 {
-  __v2df __zero = _mm_setzero_pd ();
-  __v2df __mask = _mm_cmpeq_pd (__zero, __zero);
+  __v2df __mask = (__v2df)_mm_set1_epi64x (-1);
 
   return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
 						__base,
 						(__v4si)__index,
 						__mask,
 						__scale);
 }
 
 extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1267,22 +1266,21 @@  _mm_mask_i32gather_pd (__m128d __src, do
 						__base,
 						(__v4si)__index,
 						(__v2df)__mask,
 						__scale);
 }
 
 extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale)
 {
-  __v4df __zero = _mm256_setzero_pd ();
-  __v4df __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ);
+  __v4df __mask = (__v4df)_mm256_set1_epi64x (-1);
 
   return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
 						__base,
 						(__v4si)__index,
 						__mask,
 						__scale);
 }
 
 extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1294,21 +1292,21 @@  _mm256_mask_i32gather_pd (__m256d __src,
 						(__v4si)__index,
 						(__v4df)__mask,
 						__scale);
 }
 
 extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_i64gather_pd (double const *__base, __m128i __index, const int __scale)
 {
   __v2df __src = _mm_setzero_pd ();
-  __v2df __mask = _mm_cmpeq_pd (__src, __src);
+  __v2df __mask = (__v2df)_mm_set1_epi64x (-1);
 
   return (__m128d) __builtin_ia32_gatherdiv2df (__src,
 						__base,
 						(__v2di)__index,
 						__mask,
 						__scale);
 }
 
 extern __inline __m128d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1320,21 +1318,21 @@  _mm_mask_i64gather_pd (__m128d __src, do
 						(__v2di)__index,
 						(__v2df)__mask,
 						__scale);
 }
 
 extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale)
 {
   __v4df __src = _mm256_setzero_pd ();
-  __v4df __mask = _mm256_cmp_pd (__src, __src, _CMP_EQ_OQ);
+  __v4df __mask = (__v4df)_mm256_set1_epi64x (-1);
 
   return (__m256d) __builtin_ia32_gatherdiv4df (__src,
 						__base,
 						(__v4di)__index,
 						__mask,
 						__scale);
 }
 
 extern __inline __m256d
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1346,21 +1344,21 @@  _mm256_mask_i64gather_pd (__m256d __src,
 						(__v4di)__index,
 						(__v4df)__mask,
 						__scale);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_i32gather_ps (float const *__base, __m128i __index, const int __scale)
 {
   __v4sf __src = _mm_setzero_ps ();
-  __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+  __v4sf __mask = (__v4sf)_mm_set1_epi64x (-1);
 
   return (__m128) __builtin_ia32_gathersiv4sf (__src,
 					       __base,
 					       (__v4si)__index,
 					       __mask,
 					       __scale);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1372,21 +1370,21 @@  _mm_mask_i32gather_ps (__m128 __src, flo
 					       (__v4si)__index,
 					       (__v4sf)__mask,
 					       __scale);
 }
 
 extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale)
 {
   __v8sf __src = _mm256_setzero_ps ();
-  __v8sf __mask = _mm256_cmp_ps (__src, __src, _CMP_EQ_OQ);
+  __v8sf __mask = (__v8sf)_mm256_set1_epi64x (-1);
 
   return (__m256) __builtin_ia32_gathersiv8sf (__src,
 					       __base,
 					       (__v8si)__index,
 					       __mask,
 					       __scale);
 }
 
 extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1398,21 +1396,21 @@  _mm256_mask_i32gather_ps (__m256 __src,
 					       (__v8si)__index,
 					       (__v8sf)__mask,
 					       __scale);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_i64gather_ps (float const *__base, __m128i __index, const int __scale)
 {
   __v4sf __src = _mm_setzero_ps ();
-  __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+  __v4sf __mask = (__v4sf)_mm_set1_epi64x (-1);
 
   return (__m128) __builtin_ia32_gatherdiv4sf (__src,
 					       __base,
 					       (__v2di)__index,
 					       __mask,
 					       __scale);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1424,21 +1422,21 @@  _mm_mask_i64gather_ps (__m128 __src, flo
 						(__v2di)__index,
 						(__v4sf)__mask,
 						__scale);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale)
 {
   __v4sf __src = _mm_setzero_ps ();
-  __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+  __v4sf __mask = (__v4sf)_mm_set1_epi64x (-1);
 
   return (__m128) __builtin_ia32_gatherdiv4sf256 (__src,
 						  __base,
 						  (__v4di)__index,
 						  __mask,
 						  __scale);
 }
 
 extern __inline __m128
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
@@ -1665,126 +1663,119 @@  _mm256_mask_i64gather_epi32 (__m128i __s
 						   __base,
 						   (__v4di)__index,
 						   (__v4si)__mask,
 						   __scale);
 }
 #else /* __OPTIMIZE__ */
 #define _mm_i32gather_pd(BASE, INDEX, SCALE)				\
   (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (),	\
 					 (double const *)BASE,		\
 					 (__v4si)(__m128i)INDEX,	\
-					 (__v2df)_mm_set1_pd(		\
-					   (double)(long long int) -1), \
+					 (__v2df)_mm_set1_epi64x (-1),	\
 					 (int)SCALE)
 
 #define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
   (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d)SRC,	 \
 					 (double const *)BASE,	 \
 					 (__v4si)(__m128i)INDEX, \
 					 (__v2df)(__m128d)MASK,	 \
 					 (int)SCALE)
 
 #define _mm256_i32gather_pd(BASE, INDEX, SCALE)				\
   (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (),	\
 					 (double const *)BASE,		\
 					 (__v4si)(__m128i)INDEX,	\
-					 (__v4df)_mm256_set1_pd(	\
-					   (double)(long long int) -1), \
+					 (__v4df)_mm256_set1_epi64x(-1),\
 					 (int)SCALE)
 
 #define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
   (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d)SRC,	 \
 					 (double const *)BASE,	 \
 					 (__v4si)(__m128i)INDEX, \
 					 (__v4df)(__m256d)MASK,	 \
 					 (int)SCALE)
 
 #define _mm_i64gather_pd(BASE, INDEX, SCALE)				\
   (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (),	\
 					 (double const *)BASE,		\
 					 (__v2di)(__m128i)INDEX,	\
-					 (__v2df)_mm_set1_pd(		\
-					   (double)(long long int) -1), \
+					 (__v2df)_mm_set1_epi64x (-1),	\
 					 (int)SCALE)
 
 #define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
   (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d)SRC,	 \
 					 (double const *)BASE,	 \
 					 (__v2di)(__m128i)INDEX, \
 					 (__v2df)(__m128d)MASK,	 \
 					 (int)SCALE)
 
 #define _mm256_i64gather_pd(BASE, INDEX, SCALE)				\
   (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (),	\
 					 (double const *)BASE,		\
 					 (__v4di)(__m256i)INDEX,	\
-					 (__v4df)_mm256_set1_pd(	\
-					   (double)(long long int) -1), \
+					 (__v4df)_mm256_set1_epi64x(-1),\
 					 (int)SCALE)
 
 #define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)	 \
   (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d)SRC,	 \
 					 (double const *)BASE,	 \
 					 (__v4di)(__m256i)INDEX, \
 					 (__v4df)(__m256d)MASK,	 \
 					 (int)SCALE)
 
 #define _mm_i32gather_ps(BASE, INDEX, SCALE)				\
   (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (),	\
 					(float const *)BASE,		\
 					(__v4si)(__m128i)INDEX,		\
-					_mm_set1_ps ((float)(int) -1),	\
+					(__v4sf)_mm_set1_epi64x (-1),	\
 					(int)SCALE)
 
 #define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
   (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC,	 \
 					(float const *)BASE,	 \
 					(__v4si)(__m128i)INDEX,	 \
 					(__v4sf)(__m128d)MASK,	 \
 					(int)SCALE)
 
 #define _mm256_i32gather_ps(BASE, INDEX, SCALE)			       \
   (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
 					(float const *)BASE,	       \
 					(__v8si)(__m256i)INDEX,	       \
-					(__v8sf)_mm256_set1_ps (       \
-					  (float)(int) -1),	       \
+					(__v8sf)_mm256_set1_epi64x(-1),\
 					(int)SCALE)
 
 #define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
   (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC,	\
 					(float const *)BASE,	\
 					(__v8si)(__m256i)INDEX, \
 					(__v8sf)(__m256d)MASK,	\
 					(int)SCALE)
 
 #define _mm_i64gather_ps(BASE, INDEX, SCALE)				\
   (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (),	\
 					(float const *)BASE,		\
 					(__v2di)(__m128i)INDEX,		\
-					(__v4sf)_mm_set1_ps (		\
-					  (float)(int) -1),		\
+					(__v4sf)_mm_set1_epi64x (-1),	\
 					(int)SCALE)
 
 #define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	 \
   (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC,	 \
 					(float const *)BASE,	 \
 					(__v2di)(__m128i)INDEX,	 \
 					(__v4sf)(__m128d)MASK,	 \
 					(int)SCALE)
 
 #define _mm256_i64gather_ps(BASE, INDEX, SCALE)				\
   (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (),	\
 					   (float const *)BASE,		\
 					   (__v4di)(__m256i)INDEX,	\
-					   (__v4sf)_mm_set1_ps(		\
-					     (float)(int) -1),		\
+					   (__v4sf)_mm_set1_epi64x (-1),\
 					   (int)SCALE)
 
 #define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)	   \
   (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128)SRC,	   \
 					   (float const *)BASE,	   \
 					   (__v4di)(__m256i)INDEX, \
 					   (__v4sf)(__m128)MASK,   \
 					   (int)SCALE)
 
 #define _mm_i32gather_epi64(BASE, INDEX, SCALE)				\