diff mbox

[i386] : Fix PR 52932 - _mm256_permutevar8x32_ps and _mm256_permutevar8x32_ps

Message ID CAFULd4b-bvYLhL0eCx-LRQrbUj5+FQP+R4bHgpK1uyTwad33Gg@mail.gmail.com
State New
Headers show

Commit Message

Uros Bizjak April 12, 2012, 6:32 p.m. UTC
Hello!

Attached patch fixes issues around AVX2 vpermps and vpermd instructions.

1. Changes second argument of _mm256_permutevar8x32_ps to __m256i type
and consequently changes second argument of
__builtin_ia32_permvarsf256 argument to __v8si type.
2. Changes avx2_permvarv2sf pattern to accept v8si mask operand as its
2nd operand
3. Changes avx2_permvarv2si pattern in similar way, so it accepts mask
as its 2nd operand
4. Macroizes avx2_permvarv2sf and permvarv2si patterns
5. Mechanically updates all calls to these two expanders
6. Fixes testcases accordingly

2012-04-12  Uros Bizjak  <ubizjak@gmail.com>

	PR target/52932
	* config/i386/avx2intrin.h (_mm256_permutevar8x32_ps): Change second
	argument type to __m256i.  Update call to __builtin_ia32_permvarsf256.
	* config/i386/sse.md (UNSPEC_VPERMVAR): New.
	(UNSPEC_VPERMSI, UNSPEC_VPERMSF): Remove.
	(avx2_permvarv8sf, avx2_permvarv8si): Switch operands 1 and 2.
	(avx2_permvar<mode>): Macroize insn from avx2_permvarv8sf and
	avx2_permvarv8si using VI4F_256 mode iterator.
	* config/i386/i386.c (bdesc_args) <__builtin_ia32_permvarsf256>:
	Update builtin type to V8SF_FTYPE_V8SF_V8SI.
	(ix86_expand_vec_perm): Update calls to gen_avx2_permvarv8si and
	gen_avx2_permvarv8sf.
	(expand_vec_perm_pshufb): Ditto.

testsuite/ChangeLog:

2012-04-12  Uros Bizjak  <ubizjak@gmail.com>

	PR target/52932
	* gcc.target/i386/avx2-vpermps-1.c (avx2_test): Use __m256i type for
	second function argument.
	* gcc.target/i386/avx2-vpermps-2.c (init_permps): Update declaration.
	(calc_permps): Update declaration.  Calculate result correctly.
	(avx2_test): Change src2 type to union256i_d.
	* gcc.target/i386/avx2-vpermd-2.c (calc_permd): Calculate result
	correctly.

Patch was tested on x86_64-pc-linux-gnu {,-m32}. Earlier version of
the patch (without mechanical changes) was also tested on AVX2 target
by Kirill.

Patch was committed to mainline SVN, will be committed to 4.7.1 in a few days.

Uros.
diff mbox

Patch

Index: config/i386/avx2intrin.h
===================================================================
--- config/i386/avx2intrin.h	(revision 186383)
+++ config/i386/avx2intrin.h	(working copy)
@@ -1034,9 +1034,9 @@  _mm256_permute4x64_pd (__m256d __X, const int __M)
 
 extern __inline __m256
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_permutevar8x32_ps (__m256 __X, __m256 __Y)
+_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
 {
-  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X,(__v8sf)__Y);
+  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
 }
 
 #ifdef __OPTIMIZE__
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md	(revision 186383)
+++ config/i386/sse.md	(working copy)
@@ -79,8 +79,7 @@ 
   UNSPEC_VCVTPS2PH
 
   ;; For AVX2 support
-  UNSPEC_VPERMSI
-  UNSPEC_VPERMSF
+  UNSPEC_VPERMVAR
   UNSPEC_VPERMTI
   UNSPEC_GATHER
   UNSPEC_VSIBADDR
@@ -11901,30 +11900,18 @@ 
    (set_attr "prefix" "vex")
    (set_attr "mode" "<sseinsnmode>")])
 
-(define_insn "avx2_permvarv8si"
-  [(set (match_operand:V8SI 0 "register_operand" "=x")
-	(unspec:V8SI
-	  [(match_operand:V8SI 1 "register_operand" "x")
-	   (match_operand:V8SI 2 "nonimmediate_operand" "xm")]
-	  UNSPEC_VPERMSI))]
+(define_insn "avx2_permvar<mode>"
+  [(set (match_operand:VI4F_256 0 "register_operand" "=x")
+	(unspec:VI4F_256
+	  [(match_operand:VI4F_256 1 "nonimmediate_operand" "xm")
+	   (match_operand:V8SI 2 "register_operand" "x")]
+	  UNSPEC_VPERMVAR))]
   "TARGET_AVX2"
-  "vpermd\t{%2, %1, %0|%0, %1, %2}"
+  "vperm<ssemodesuffix>\t{%1, %2, %0|%0, %2, %1}"
   [(set_attr "type" "sselog")
    (set_attr "prefix" "vex")
    (set_attr "mode" "OI")])
 
-(define_insn "avx2_permvarv8sf"
-  [(set (match_operand:V8SF 0 "register_operand" "=x")
-	(unspec:V8SF
-	  [(match_operand:V8SF 1 "register_operand" "x")
-	   (match_operand:V8SF 2 "nonimmediate_operand" "xm")]
-	  UNSPEC_VPERMSF))]
-  "TARGET_AVX2"
-  "vpermps\t{%2, %1, %0|%0, %1, %2}"
-  [(set_attr "type" "sselog")
-   (set_attr "prefix" "vex")
-   (set_attr "mode" "OI")])
-
 (define_expand "avx2_perm<mode>"
   [(match_operand:VI8F_256 0 "register_operand")
    (match_operand:VI8F_256 1 "nonimmediate_operand")
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 186383)
+++ config/i386/i386.c	(working copy)
@@ -19937,7 +19937,7 @@  ix86_expand_vec_perm (rtx operands[])
 	  vt = force_reg (maskmode, vt);
 	  mask = gen_lowpart (maskmode, mask);
 	  if (maskmode == V8SImode)
-	    emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
+	    emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
 	  else
 	    emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
 
@@ -19971,13 +19971,13 @@  ix86_expand_vec_perm (rtx operands[])
 	     the high bits of the shuffle elements.  No need for us to
 	     perform an AND ourselves.  */
 	  if (one_operand_shuffle)
-	    emit_insn (gen_avx2_permvarv8si (target, mask, op0));
+	    emit_insn (gen_avx2_permvarv8si (target, op0, mask));
 	  else
 	    {
 	      t1 = gen_reg_rtx (V8SImode);
 	      t2 = gen_reg_rtx (V8SImode);
-	      emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
-	      emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
+	      emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
+	      emit_insn (gen_avx2_permvarv8si (t2, op0, mask));
 	      goto merge_two;
 	    }
 	  return;
@@ -19985,13 +19985,13 @@  ix86_expand_vec_perm (rtx operands[])
 	case V8SFmode:
 	  mask = gen_lowpart (V8SFmode, mask);
 	  if (one_operand_shuffle)
-	    emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
+	    emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
 	  else
 	    {
 	      t1 = gen_reg_rtx (V8SFmode);
 	      t2 = gen_reg_rtx (V8SFmode);
-	      emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
-	      emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
+	      emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
+	      emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
 	      goto merge_two;
 	    }
 	  return;
@@ -20004,7 +20004,7 @@  ix86_expand_vec_perm (rtx operands[])
 	  t2 = gen_reg_rtx (V8SImode);
 	  emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
 	  emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
-	  emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
+	  emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
 	  emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
 	  return;
 
@@ -20014,7 +20014,7 @@  ix86_expand_vec_perm (rtx operands[])
 	  mask = gen_lowpart (V4SFmode, mask);
 	  emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
 	  emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
-	  emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
+	  emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
 	  emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
 	  return;
 
@@ -26948,8 +26948,8 @@  static const struct builtin_description bdesc_args
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
-  { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
   { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
@@ -36126,9 +36126,9 @@  expand_vec_perm_pshufb (struct expand_vec_perm_d *
       else if (vmode == V32QImode)
 	emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
       else if (vmode == V8SFmode)
-	emit_insn (gen_avx2_permvarv8sf (target, vperm, op0));
+	emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
       else
-	emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
+	emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
     }
   else
     {
Index: testsuite/gcc.target/i386/avx2-vpermd-2.c
===================================================================
--- testsuite/gcc.target/i386/avx2-vpermd-2.c	(revision 186383)
+++ testsuite/gcc.target/i386/avx2-vpermd-2.c	(working copy)
@@ -29,8 +29,8 @@  calc_permd (int *src1, int *src2, int *dst)
   memcpy (dst, src1, 32);
   for (i = 0; i < 8; i++)
     {
-      temp = src1[i];
-      dst[i] = src2[temp & 7];
+      temp = src2[i];
+      dst[i] = src1[temp & 7];
     }
 }
 
Index: testsuite/gcc.target/i386/avx2-vpermps-1.c
===================================================================
--- testsuite/gcc.target/i386/avx2-vpermps-1.c	(revision 186383)
+++ testsuite/gcc.target/i386/avx2-vpermps-1.c	(working copy)
@@ -5,9 +5,10 @@ 
 #include <immintrin.h>
 
 __m256 x;
+__m256i y;
 
 void extern
 avx2_test (void)
 {
-  x = _mm256_permutevar8x32_ps (x, x);
+  x = _mm256_permutevar8x32_ps (x, y);
 }
Index: testsuite/gcc.target/i386/avx2-vpermps-2.c
===================================================================
--- testsuite/gcc.target/i386/avx2-vpermps-2.c	(revision 186383)
+++ testsuite/gcc.target/i386/avx2-vpermps-2.c	(working copy)
@@ -8,7 +8,7 @@ 
 #define NUM 10
 
 static void
-init_permps (float *src1, float *src2, int seed)
+init_permps (float *src1, int *src2, int seed)
 {
   int i, sign = 1;
 
@@ -21,24 +21,24 @@  static void
 }
 
 static void
-calc_permps (float *src1, float *src2, float *dst)
+calc_permps (float *src1, int *src2, float *dst)
 {
   int i;
   unsigned temp;
-  unsigned *idx = (int *) src1;
 
   memcpy (dst, src1, 32);
   for (i = 0; i < 8; i++)
     {
-      temp = idx[i];
-      dst[i] = src2[temp & 7];
+      temp = src2[i];
+      dst[i] = src1[temp & 7];
     }
 }
 
 static void
 avx2_test (void)
 {
-  union256 src1, src2, dst;
+  union256 src1, dst;
+  union256i_d src2;
   float dst_ref[8];
   int i;