diff mbox

[i386] Introduce SSE 4.1 SI->DI zero-extension to moves between SSE registers

Message ID CAFULd4aaau2-V4EvbAX77DEh2q=ZxDi_JWrGyFyPR6X7ubwYNw@mail.gmail.com
State New
Headers show

Commit Message

Uros Bizjak April 6, 2017, 7:20 p.m. UTC
The attached patch considerably improves zero-extended SImode -> DImode
moves between SSE registers for SSE4.1 targets. The patch teaches the
compiler to generate:

        vmovdqa m(%rip), %ymm1
        vpmovzxdq       %xmm1, %xmm1
        vpsrlw  %xmm1, %xmm0, %xmm0

to zero-extend the value within the SSE register, instead of
round-tripping the value through a GPR:

        vmovdqa m(%rip), %ymm1
        vmovd   %xmm1, %eax
        vmovq   %rax, %xmm1
        vpsrlw  %xmm1, %xmm0, %xmm0

... or horrible code for targets that do not prefer inter-unit moves.

As mentioned by Jakub, there are other optimization opportunities in the
handling of the count argument.

2017-04-06  Uros Bizjak  <ubizjak@gmail.com>

    PR target/80286
    * config/i386/sse.md (*vec_extractv4si_0_zext_sse4): New pattern.
    * config/i386/i386.md (*zero_extendsidi2):
    Add (?*x,*x) and (?*v,*v) alternatives.

The patch was bootstrapped and regression tested on x86_64-linux-gnu {,-m32}.

Committed to mainline SVN.

Uros.
diff mbox

Patch

Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 246738)
+++ config/i386/i386.md	(working copy)
@@ -3767,10 +3767,10 @@ 
 
 (define_insn "*zero_extendsidi2"
   [(set (match_operand:DI 0 "nonimmediate_operand"
-			"=r,?r,?o,r   ,o,?*Ym,?!*y,?r ,?r,?*Yi,?*x,*r")
+		"=r,?r,?o,r   ,o,?*Ym,?!*y,?r ,?r,?*Yi,?*x,?*x,?*v,*r")
 	(zero_extend:DI
 	 (match_operand:SI 1 "x86_64_zext_operand"
-	        	"0 ,rm,r ,rmWz,0,r   ,m   ,*Yj,*x,r   ,m  ,*k")))]
+	        "0 ,rm,r ,rmWz,0,r   ,m   ,*Yj,*x,r   ,m  , *x, *v,*k")))]
   ""
 {
   switch (get_attr_type (insn))
@@ -3791,6 +3791,15 @@ 
       return "%vpextrd\t{$0, %1, %k0|%k0, %1, 0}";
 
     case TYPE_SSEMOV:
+      if (SSE_REG_P (operands[0]) && SSE_REG_P (operands[1]))
+	{
+	  if (EXT_REX_SSE_REG_P (operands[0])
+	      || EXT_REX_SSE_REG_P (operands[1]))
+	    return "vpmovzxdq\t{%t1, %g0|%g0, %t1}";
+	  else
+	    return "%vpmovzxdq\t{%1, %0|%0, %1}";
+	}
+
       if (GENERAL_REG_P (operands[0]))
 	return "%vmovd\t{%1, %k0|%k0, %1}";
 
@@ -3813,6 +3822,10 @@ 
 	    (eq_attr "alternative" "10")
 	      (const_string "sse2")
 	    (eq_attr "alternative" "11")
+	      (const_string "sse4")
+	    (eq_attr "alternative" "12")
+	      (const_string "avx512f")
+	    (eq_attr "alternative" "13")
 	      (const_string "x64_avx512bw")
 	   ]
 	   (const_string "*")))
@@ -3821,16 +3834,16 @@ 
 	      (const_string "multi")
 	    (eq_attr "alternative" "5,6")
 	      (const_string "mmxmov")
-	    (eq_attr "alternative" "7,9,10")
+	    (eq_attr "alternative" "7,9,10,11,12")
 	      (const_string "ssemov")
 	    (eq_attr "alternative" "8")
 	      (const_string "sselog1")
-	    (eq_attr "alternative" "11")
+	    (eq_attr "alternative" "13")
 	      (const_string "mskmov")
 	   ]
 	   (const_string "imovx")))
    (set (attr "prefix_extra")
-     (if_then_else (eq_attr "alternative" "8")
+     (if_then_else (eq_attr "alternative" "8,11,12")
        (const_string "1")
        (const_string "*")))
    (set (attr "length_immediate")
@@ -3848,7 +3861,7 @@ 
    (set (attr "mode")
      (cond [(eq_attr "alternative" "5,6")
 	      (const_string "DI")
-	    (eq_attr "alternative" "7,8,9")
+	    (eq_attr "alternative" "7,8,9,11,12")
 	      (const_string "TI")
 	   ]
 	   (const_string "SI")))])
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md	(revision 246738)
+++ config/i386/sse.md	(working copy)
@@ -13516,18 +13516,6 @@ 
   "#"
   [(set_attr "isa" "*,sse4,*,*")])
 
-(define_insn_and_split "*vec_extractv4si_0_zext"
-  [(set (match_operand:DI 0 "register_operand" "=r")
-	(zero_extend:DI
-	  (vec_select:SI
-	    (match_operand:V4SI 1 "register_operand" "v")
-	    (parallel [(const_int 0)]))))]
-  "TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_FROM_VEC"
-  "#"
-  "&& reload_completed"
-  [(set (match_dup 0) (zero_extend:DI (match_dup 1)))]
-  "operands[1] = gen_lowpart (SImode, operands[1]);")
-
 (define_insn "*vec_extractv2di_0_sse"
   [(set (match_operand:DI 0 "nonimmediate_operand"     "=v,m")
 	(vec_select:DI
@@ -13546,6 +13534,35 @@ 
   [(set (match_dup 0) (match_dup 1))]
   "operands[1] = gen_lowpart (<MODE>mode, operands[1]);")
 
+(define_insn "*vec_extractv4si_0_zext_sse4"
+  [(set (match_operand:DI 0 "register_operand" "=r,x,v")
+	(zero_extend:DI
+	  (vec_select:SI
+	    (match_operand:V4SI 1 "register_operand" "Yj,x,v")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_SSE4_1"
+  "#"
+  [(set_attr "isa" "x64,*,avx512f")])
+
+(define_insn "*vec_extractv4si_0_zext"
+  [(set (match_operand:DI 0 "register_operand" "=r")
+	(zero_extend:DI
+	  (vec_select:SI
+	    (match_operand:V4SI 1 "register_operand" "x")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_64BIT && TARGET_SSE2 && TARGET_INTER_UNIT_MOVES_FROM_VEC"
+  "#")
+
+(define_split
+  [(set (match_operand:DI 0 "register_operand")
+	(zero_extend:DI
+	  (vec_select:SI
+	    (match_operand:V4SI 1 "register_operand")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_SSE2 && reload_completed"
+  [(set (match_dup 0) (zero_extend:DI (match_dup 1)))]
+  "operands[1] = gen_lowpart (SImode, operands[1]);")
+
 (define_insn "*vec_extractv4si"
   [(set (match_operand:SI 0 "nonimmediate_operand" "=rm,rm,Yr,*x,x,Yv")
 	(vec_select:SI