diff mbox

Fix MMX/SSE/AVX* shifts by non-immediate scalar (PR target/80286)

Message ID CAFULd4bKJ-igNrE_O6t-P1CLZR6nJrY2HcTZY4XNCJNBuEZhNg@mail.gmail.com
State New
Headers show

Commit Message

Uros Bizjak April 6, 2017, 8:40 a.m. UTC
On Thu, Apr 6, 2017 at 9:33 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Tue, Apr 4, 2017 at 5:09 PM, Jakub Jelinek <jakub@redhat.com> wrote:
>> On Tue, Apr 04, 2017 at 02:33:24PM +0200, Uros Bizjak wrote:
>>> > I assume split those before reload.  Because we want to give reload a chance
>>> > to do the zero extension on GPRs if it is more beneficial, and it might
>>> > choose to store it into memory and load into XMM from memory and that is
>>> > hard to do after reload.
>>>
>>> Yes, split before reload, and hope that alternative's decorations play
>>> well with RA.
>>
>> Haven't done these splitters yet, just playing now with:
>> typedef long long __m256i __attribute__ ((__vector_size__ (32), __may_alias__));
>> typedef int __v4si __attribute__ ((__vector_size__ (16)));
>> typedef short __v8hi __attribute__ ((__vector_size__ (16)));
>> typedef int __v8si __attribute__ ((__vector_size__ (32)));
>> typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
>> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>> _mm256_castsi256_si128 (__m256i __A) { return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A); }
>> extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>> _mm_cvtsi128_si32 (__m128i __A) { return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0); }
>> extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
>> _mm_srli_epi16 (__m128i __A, int __B) { return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B); }
>> __m256i m;
>> __m128i foo (__m128i minmax)
>> {
>>   int shift = _mm_cvtsi128_si32 (_mm256_castsi256_si128 (m));
>>   return _mm_srli_epi16 (minmax, shift);
>> }
>> to see what it emits (in that case we already have zero extension rather
>> than sign extension).
>>> > With ? in front of it or without?  I admit I've only tried so far:
>>>
>>> I'd leave ?* in this case. In my experience, RA allocates alternative
>>> with ?* only when really needed.
>>
>> So far I have following, which seems to work fine for the above testcase and
>> -O2 -m64 -mavx2, but doesn't work for -O2 -m32 -mavx2.
>> For 64-bit combiner matches the *vec_extractv4si_0_zext pattern and as that
>> doesn't have ? nor * in the constraint, it is used.
>> For 32-bit there is no such pattern and we end up with just zero_extendsidi2
>> pattern and apparently either the ? or * prevent IRA/LRA from using it.
>> If I remove both ?*, I get nice code even for 32-bit.
>
> Newly introduced alternatives (x/x) and (v/v) are valid also for
> 32-bit targets, so we have to adjust insn constraint of
> *vec_extractv4si_0_zext and enable alternatives accordingly. After the
> adjustment, the pattern will be split to a zero-extend.

Attached patch fixes your testcase above for 64 and 32-bit targets.
What do you think?

Uros.
diff mbox

Patch

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index 6ed2390..d1c3c16 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -3767,10 +3767,10 @@ 
 
 (define_insn "*zero_extendsidi2"
   [(set (match_operand:DI 0 "nonimmediate_operand"
-			"=r,?r,?o,r   ,o,?*Ym,?!*y,?r ,?r,?*Yi,?*x,*r")
+		"=r,?r,?o,r   ,o,?*Ym,?!*y,?r ,?r,?*Yi,?*x,?*x,?*v,*r")
 	(zero_extend:DI
 	 (match_operand:SI 1 "x86_64_zext_operand"
-	        	"0 ,rm,r ,rmWz,0,r   ,m   ,*Yj,*x,r   ,m  ,*k")))]
+	        "0 ,rm,r ,rmWz,0,r   ,m   ,*Yj,*x,r   ,m  , *x, *v,*k")))]
   ""
 {
   switch (get_attr_type (insn))
@@ -3791,6 +3791,15 @@ 
       return "%vpextrd\t{$0, %1, %k0|%k0, %1, 0}";
 
     case TYPE_SSEMOV:
+      if (SSE_REG_P (operands[0]) && SSE_REG_P (operands[1]))
+	{
+	  if (EXT_REX_SSE_REG_P (operands[0])
+	      || EXT_REX_SSE_REG_P (operands[1]))
+	    return "vpmovzxdq\t{%t1, %g0|%g0, %t1}";
+	  else
+	    return "%vpmovzxdq\t{%1, %0|%0, %1}";
+	}
+
       if (GENERAL_REG_P (operands[0]))
 	return "%vmovd\t{%1, %k0|%k0, %1}";
 
@@ -3813,6 +3822,10 @@ 
 	    (eq_attr "alternative" "10")
 	      (const_string "sse2")
 	    (eq_attr "alternative" "11")
+	      (const_string "sse4")
+	    (eq_attr "alternative" "12")
+	      (const_string "avx512f")
+	    (eq_attr "alternative" "13")
 	      (const_string "x64_avx512bw")
 	   ]
 	   (const_string "*")))
@@ -3821,16 +3834,16 @@ 
 	      (const_string "multi")
 	    (eq_attr "alternative" "5,6")
 	      (const_string "mmxmov")
-	    (eq_attr "alternative" "7,9,10")
+	    (eq_attr "alternative" "7,9,10,11,12")
 	      (const_string "ssemov")
 	    (eq_attr "alternative" "8")
 	      (const_string "sselog1")
-	    (eq_attr "alternative" "11")
+	    (eq_attr "alternative" "13")
 	      (const_string "mskmov")
 	   ]
 	   (const_string "imovx")))
    (set (attr "prefix_extra")
-     (if_then_else (eq_attr "alternative" "8")
+     (if_then_else (eq_attr "alternative" "8,11,12")
        (const_string "1")
        (const_string "*")))
    (set (attr "length_immediate")
@@ -3848,7 +3861,7 @@ 
    (set (attr "mode")
      (cond [(eq_attr "alternative" "5,6")
 	      (const_string "DI")
-	    (eq_attr "alternative" "7,8,9")
+	    (eq_attr "alternative" "7,8,9,11,12")
 	      (const_string "TI")
 	   ]
 	   (const_string "SI")))])
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 15ced88..094404b 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -13516,18 +13516,6 @@ 
   "#"
   [(set_attr "isa" "*,sse4,*,*")])
 
-(define_insn_and_split "*vec_extractv4si_0_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r")
-	(zero_extend:DI
-	  (vec_select:SI
-	    (match_operand:V4SI 1 "register_operand" "v")
-	    (parallel [(const_int 0)]))))]
-  "TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_FROM_VEC"
-  "#"
-  "&& reload_completed"
-  [(set (match_dup 0) (zero_extend:DI (match_dup 1)))]
-  "operands[1] = gen_lowpart (SImode, operands[1]);")
-
 (define_insn "*vec_extractv2di_0_sse"
   [(set (match_operand:DI 0 "nonimmediate_operand"     "=v,m")
 	(vec_select:DI
@@ -13546,6 +13534,35 @@ 
   [(set (match_dup 0) (match_dup 1))]
   "operands[1] = gen_lowpart (<MODE>mode, operands[1]);")
 
+(define_insn "*vec_extractv4si_0_zext_sse4"
+  [(set (match_operand:DI 0 "register_operand" "=r,x,v")
+	(zero_extend:DI
+	  (vec_select:SI
+	    (match_operand:V4SI 1 "register_operand" "Yj,x,v")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_SSE4_1"
+  "#"
+  [(set_attr "isa" "x64,*,avx512f")])
+
+(define_insn "*vec_extractv4si_0_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (vec_select:SI
+	    (match_operand:V4SI 1 "register_operand" "x")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_FROM_VEC"
+  "#")
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(zero_extend:DI
+	  (vec_select:SI
+	    (match_operand:V4SI 1 "register_operand")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_SSE2 && reload_completed"
+  [(set (match_dup 0) (zero_extend:DI (match_dup 1)))]
+  "operands[1] = gen_lowpart (SImode, operands[1]);")
+
 (define_insn "*vec_extractv4si"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,rm,Yr,*x,x,Yv")
 	(vec_select:SI