Patchwork AVX2 permutation improvements

login
register
mail settings
Submitter Jakub Jelinek
Date March 20, 2012, 11:22 a.m.
Message ID <20120320112217.GR16117@tyan-ft48-01.lab.bos.redhat.com>
Download mbox | patch
Permalink /patch/147782/
State New
Headers show

Comments

Jakub Jelinek - March 20, 2012, 11:22 a.m.
Hi!

This patch improves register -> register broadcast AVX2 permutations
and also starts using vpermps where possible for V8SFmode
permutations.  Bootstrapped/regtested on x86_64-linux and i686-linux;
OK for trunk?

2012-03-20  Jakub Jelinek  <jakub@redhat.com>

	PR target/52607
	* config/i386/i386.md ("isa" attribute): Add avx2 and noavx2.
	("enabled" attribute): Handle avx2 and noavx2 isas.
	* config/i386/sse.md (avx2_vec_dupv8sf_1, avx2_pbroadcast<mode>_1):
	New insns.
	(vec_dup<mode>): Add avx2 =x,x alternative.
	(vec_dup<mode> splitter): Don't split if TARGET_AVX2.
	(*avx_vperm_broadcast_<mode>): Don't split V4DFmode if TARGET_AVX2.
	For TARGET_AVX2, V8SFmode and elt == 0 split into vbroadcastss.
	* config/i386/i386.c (expand_vec_perm_pshufb): Emit also vpermps
	for V8SFmode.
	(expand_vec_perm_1): For broadcasts, use avx2_pbroadcast<mode>_1
	if possible, handle also V8SFmode.


	Jakub
Richard Henderson - March 20, 2012, 3:55 p.m.
On 03/20/12 04:22, Jakub Jelinek wrote:
> 2012-03-20  Jakub Jelinek  <jakub@redhat.com>
> 
> 	PR target/52607
> 	* config/i386/i386.md ("isa" attribute): Add avx2 and noavx2.
> 	("enabled" attribute): Handle avx2 and noavx2 isas.
> 	* config/i386/sse.md (avx2_vec_dupv8sf_1, avx2_pbroadcast<mode>_1):
> 	New insns.
> 	(vec_dup<mode>): Add avx2 =x,x alternative.
> 	(vec_dup<mode> splitter): Don't split if TARGET_AVX2.
> 	(*avx_vperm_broadcast_<mode>): Don't split V4DFmode if TARGET_AVX2.
> 	For TARGET_AVX2, V8SFmode and elt == 0 split into vbroadcastss.
> 	* config/i386/i386.c (expand_vec_perm_pshufb): Emit also vpermps
> 	for V8SFmode.
> 	(expand_vec_perm_1): For broadcasts, use avx2_pbroadcast<mode>_1
> 	if possible, handle also V8SFmode.

Ok.


r~

Patch

--- gcc/config/i386/i386.md.jj	2012-03-20 08:51:30.937236938 +0100
+++ gcc/config/i386/i386.md	2012-03-20 08:54:50.742079909 +0100
@@ -639,7 +639,7 @@  (define_attr "use_carry" "0,1" (const_st
 (define_attr "movu" "0,1" (const_string "0"))
 
 ;; Used to control the "enabled" attribute on a per-instruction basis.
-(define_attr "isa" "base,sse2,sse2_noavx,sse3,sse4,sse4_noavx,noavx,avx,bmi2"
+(define_attr "isa" "base,sse2,sse2_noavx,sse3,sse4,sse4_noavx,noavx,avx,avx2,noavx2,bmi2"
   (const_string "base"))
 
 (define_attr "enabled" ""
@@ -652,6 +652,8 @@  (define_attr "enabled" ""
 	   (symbol_ref "TARGET_SSE4_1 && !TARGET_AVX")
 	 (eq_attr "isa" "avx") (symbol_ref "TARGET_AVX")
 	 (eq_attr "isa" "noavx") (symbol_ref "!TARGET_AVX")
+	 (eq_attr "isa" "avx2") (symbol_ref "TARGET_AVX2")
+	 (eq_attr "isa" "noavx2") (symbol_ref "!TARGET_AVX2")
 	 (eq_attr "isa" "bmi2") (symbol_ref "TARGET_BMI2")
 	]
 	(const_int 1)))
--- gcc/config/i386/sse.md.jj	2012-03-20 08:51:30.940236899 +0100
+++ gcc/config/i386/sse.md	2012-03-20 08:55:22.344898469 +0100
@@ -3808,6 +3808,18 @@  (define_insn "avx2_vec_dup<mode>"
     (set_attr "prefix" "vex")
     (set_attr "mode" "<MODE>")])
 
+(define_insn "avx2_vec_dupv8sf_1"
+  [(set (match_operand:V8SF 0 "register_operand" "=x")
+	(vec_duplicate:V8SF
+	  (vec_select:SF
+	    (match_operand:V8SF 1 "register_operand" "x")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX2"
+  "vbroadcastss\t{%x1, %0|%0, %x1}"
+  [(set_attr "type" "sselog1")
+    (set_attr "prefix" "vex")
+    (set_attr "mode" "V8SF")])
+
 (define_insn "vec_dupv4sf"
   [(set (match_operand:V4SF 0 "register_operand" "=x,x,x")
 	(vec_duplicate:V4SF
@@ -11876,6 +11888,19 @@  (define_insn "avx2_pbroadcast<mode>"
    (set_attr "prefix" "vex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_insn "avx2_pbroadcast<mode>_1"
+  [(set (match_operand:VI_256 0 "register_operand" "=x")
+	(vec_duplicate:VI_256
+	  (vec_select:<ssescalarmode>
+	    (match_operand:VI_256 1 "nonimmediate_operand" "xm")
+	    (parallel [(const_int 0)]))))]
+  "TARGET_AVX2"
+  "vpbroadcast<ssemodesuffix>\t{%x1, %0|%0, %x1}"
+  [(set_attr "type" "ssemov")
+   (set_attr "prefix_extra" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "<sseinsnmode>")])
+
 (define_insn "avx2_permvarv8si"
   [(set (match_operand:V8SI 0 "register_operand" "=x")
 	(unspec:V8SI
@@ -11967,16 +11992,18 @@  (define_mode_iterator AVX_VEC_DUP_MODE
   [V8SI V8SF V4DI V4DF])
 
 (define_insn "vec_dup<mode>"
-  [(set (match_operand:AVX_VEC_DUP_MODE 0 "register_operand" "=x,x")
+  [(set (match_operand:AVX_VEC_DUP_MODE 0 "register_operand" "=x,x,x")
 	(vec_duplicate:AVX_VEC_DUP_MODE
-	  (match_operand:<ssescalarmode> 1 "nonimmediate_operand" "m,?x")))]
+	  (match_operand:<ssescalarmode> 1 "nonimmediate_operand" "m,x,?x")))]
   "TARGET_AVX"
   "@
    vbroadcast<ssescalarmodesuffix>\t{%1, %0|%0, %1}
+   vbroadcast<ssescalarmodesuffix>\t{%x1, %0|%0, %x1}
    #"
   [(set_attr "type" "ssemov")
    (set_attr "prefix_extra" "1")
    (set_attr "prefix" "vex")
+   (set_attr "isa" "*,avx2,noavx2")
    (set_attr "mode" "V8SF")])
 
 (define_insn "avx2_vbroadcasti128_<mode>"
@@ -11995,7 +12022,7 @@  (define_split
   [(set (match_operand:AVX_VEC_DUP_MODE 0 "register_operand")
 	(vec_duplicate:AVX_VEC_DUP_MODE
 	  (match_operand:<ssescalarmode> 1 "register_operand")))]
-  "TARGET_AVX && reload_completed"
+  "TARGET_AVX && !TARGET_AVX2 && reload_completed"
   [(set (match_dup 2)
 	(vec_duplicate:<ssehalfvecmode> (match_dup 1)))
    (set (match_dup 0)
@@ -12057,7 +12084,7 @@  (define_insn_and_split "*avx_vperm_broad
 	    [(match_operand 3 "const_int_operand" "C,n,n")])))]
   "TARGET_AVX"
   "#"
-  "&& reload_completed"
+  "&& reload_completed && (<MODE>mode != V4DFmode || !TARGET_AVX2)"
   [(set (match_dup 0) (vec_duplicate:VF_256 (match_dup 1)))]
 {
   rtx op0 = operands[0], op1 = operands[1];
@@ -12067,6 +12094,13 @@  (define_insn_and_split "*avx_vperm_broad
     {
       int mask;
 
+      if (TARGET_AVX2 && elt == 0)
+	{
+	  emit_insn (gen_vec_dup<mode> (op0, gen_lowpart (<ssescalarmode>mode,
+							  op1)));
+	  DONE;
+	}
+
       /* Shuffle element we care about into all elements of the 128-bit lane.
 	 The other lane gets shuffled too, but we don't care.  */
       if (<MODE>mode == V4DFmode)
--- gcc/config/i386/i386.c.jj	2012-03-20 08:51:30.942236880 +0100
+++ gcc/config/i386/i386.c	2012-03-20 08:54:50.753080383 +0100
@@ -35834,7 +35834,7 @@  valid_perm_using_mode_p (enum machine_mo
 }
 
 /* A subroutine of ix86_expand_vec_perm_builtin_1.  Try to implement D
-   in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128.  */
+   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */
 
 static bool
 expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
@@ -35908,6 +35908,9 @@  expand_vec_perm_pshufb (struct expand_ve
 	      if (valid_perm_using_mode_p (V8SImode, d))
 		vmode = V8SImode;
 	    }
+	  /* Or if vpermps can be used.  */
+	  else if (d->vmode == V8SFmode)
+	    vmode = V8SImode;
 
 	  if (vmode == V32QImode)
 	    {
@@ -35950,6 +35953,12 @@  expand_vec_perm_pshufb (struct expand_ve
 				gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
   vperm = force_reg (vmode, vperm);
 
+  if (vmode == V8SImode && d->vmode == V8SFmode)
+    {
+      vmode = V8SFmode;
+      vperm = gen_lowpart (vmode, vperm);
+    }
+
   target = gen_lowpart (vmode, d->target);
   op0 = gen_lowpart (vmode, d->op0);
   if (d->op0 == d->op1)
@@ -35958,6 +35967,8 @@  expand_vec_perm_pshufb (struct expand_ve
 	emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
       else if (vmode == V32QImode)
 	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+      else if (vmode == V8SFmode)
+	emit_insn (gen_avx2_permvarv8sf (target, vperm, op0));
       else
 	emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
     }
@@ -36006,20 +36017,17 @@  expand_vec_perm_1 (struct expand_vec_per
       else if (broadcast_perm && TARGET_AVX2)
 	{
 	  /* Use vpbroadcast{b,w,d}.  */
-	  rtx op = d->op0, (*gen) (rtx, rtx) = NULL;
+	  rtx (*gen) (rtx, rtx) = NULL;
 	  switch (d->vmode)
 	    {
 	    case V32QImode:
-	      op = gen_lowpart (V16QImode, op);
-	      gen = gen_avx2_pbroadcastv32qi;
+	      gen = gen_avx2_pbroadcastv32qi_1;
 	      break;
 	    case V16HImode:
-	      op = gen_lowpart (V8HImode, op);
-	      gen = gen_avx2_pbroadcastv16hi;
+	      gen = gen_avx2_pbroadcastv16hi_1;
 	      break;
 	    case V8SImode:
-	      op = gen_lowpart (V4SImode, op);
-	      gen = gen_avx2_pbroadcastv8si;
+	      gen = gen_avx2_pbroadcastv8si_1;
 	      break;
 	    case V16QImode:
 	      gen = gen_avx2_pbroadcastv16qi;
@@ -36027,13 +36035,16 @@  expand_vec_perm_1 (struct expand_vec_per
 	    case V8HImode:
 	      gen = gen_avx2_pbroadcastv8hi;
 	      break;
+	    case V8SFmode:
+	      gen = gen_avx2_vec_dupv8sf_1;
+	      break;
 	    /* For other modes prefer other shuffles this function creates.  */
 	    default: break;
 	    }
 	  if (gen != NULL)
 	    {
 	      if (!d->testing_p)
-		emit_insn (gen (d->target, op));
+		emit_insn (gen (d->target, d->op0));
 	      return true;
 	    }
 	}
@@ -36101,7 +36112,7 @@  expand_vec_perm_1 (struct expand_vec_per
     return true;
 
   /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
-     vpshufb, vpermd or vpermq variable permutation.  */
+     vpshufb, vpermd, vpermps or vpermq variable permutation.  */
   if (expand_vec_perm_pshufb (d))
     return true;