===================================================================
@@ -1034,9 +1034,9 @@ _mm256_permute4x64_pd (__m256d __X, const int __M)
extern __inline __m256
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_permutevar8x32_ps (__m256 __X, __m256 __Y)
+_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
{
- return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X,(__v8sf)__Y);
+ return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
}
#ifdef __OPTIMIZE__
===================================================================
@@ -79,8 +79,7 @@
UNSPEC_VCVTPS2PH
;; For AVX2 support
- UNSPEC_VPERMSI
- UNSPEC_VPERMSF
+ UNSPEC_VPERMVAR
UNSPEC_VPERMTI
UNSPEC_GATHER
UNSPEC_VSIBADDR
@@ -11901,30 +11900,18 @@
(set_attr "prefix" "vex")
(set_attr "mode" "<sseinsnmode>")])
-(define_insn "avx2_permvarv8si"
- [(set (match_operand:V8SI 0 "register_operand" "=x")
- (unspec:V8SI
- [(match_operand:V8SI 1 "register_operand" "x")
- (match_operand:V8SI 2 "nonimmediate_operand" "xm")]
- UNSPEC_VPERMSI))]
+(define_insn "avx2_permvar<mode>"
+ [(set (match_operand:VI4F_256 0 "register_operand" "=x")
+ (unspec:VI4F_256
+ [(match_operand:VI4F_256 1 "nonimmediate_operand" "xm")
+ (match_operand:V8SI 2 "register_operand" "x")]
+ UNSPEC_VPERMVAR))]
"TARGET_AVX2"
- "vpermd\t{%2, %1, %0|%0, %1, %2}"
+ "vperm<ssemodesuffix>\t{%1, %2, %0|%0, %2, %1}"
[(set_attr "type" "sselog")
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
-(define_insn "avx2_permvarv8sf"
- [(set (match_operand:V8SF 0 "register_operand" "=x")
- (unspec:V8SF
- [(match_operand:V8SF 1 "register_operand" "x")
- (match_operand:V8SF 2 "nonimmediate_operand" "xm")]
- UNSPEC_VPERMSF))]
- "TARGET_AVX2"
- "vpermps\t{%2, %1, %0|%0, %1, %2}"
- [(set_attr "type" "sselog")
- (set_attr "prefix" "vex")
- (set_attr "mode" "OI")])
-
(define_expand "avx2_perm<mode>"
[(match_operand:VI8F_256 0 "register_operand")
(match_operand:VI8F_256 1 "nonimmediate_operand")
===================================================================
@@ -19937,7 +19937,7 @@ ix86_expand_vec_perm (rtx operands[])
vt = force_reg (maskmode, vt);
mask = gen_lowpart (maskmode, mask);
if (maskmode == V8SImode)
- emit_insn (gen_avx2_permvarv8si (t1, vt, mask));
+ emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
else
emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
@@ -19971,13 +19971,13 @@ ix86_expand_vec_perm (rtx operands[])
the high bits of the shuffle elements. No need for us to
perform an AND ourselves. */
if (one_operand_shuffle)
- emit_insn (gen_avx2_permvarv8si (target, mask, op0));
+ emit_insn (gen_avx2_permvarv8si (target, op0, mask));
else
{
t1 = gen_reg_rtx (V8SImode);
t2 = gen_reg_rtx (V8SImode);
- emit_insn (gen_avx2_permvarv8si (t1, mask, op0));
- emit_insn (gen_avx2_permvarv8si (t2, mask, op1));
+ emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
+ emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
goto merge_two;
}
return;
@@ -19985,13 +19985,13 @@ ix86_expand_vec_perm (rtx operands[])
case V8SFmode:
mask = gen_lowpart (V8SFmode, mask);
if (one_operand_shuffle)
- emit_insn (gen_avx2_permvarv8sf (target, mask, op0));
+ emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
else
{
t1 = gen_reg_rtx (V8SFmode);
t2 = gen_reg_rtx (V8SFmode);
- emit_insn (gen_avx2_permvarv8sf (t1, mask, op0));
- emit_insn (gen_avx2_permvarv8sf (t2, mask, op1));
+ emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
+ emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
goto merge_two;
}
return;
@@ -20004,7 +20004,7 @@ ix86_expand_vec_perm (rtx operands[])
t2 = gen_reg_rtx (V8SImode);
emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
- emit_insn (gen_avx2_permvarv8si (t1, t2, t1));
+ emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
return;
@@ -20014,7 +20014,7 @@ ix86_expand_vec_perm (rtx operands[])
mask = gen_lowpart (V4SFmode, mask);
emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
emit_insn (gen_avx_vec_concatv8sf (t2, mask, mask));
- emit_insn (gen_avx2_permvarv8sf (t1, t2, t1));
+ emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
return;
@@ -26948,8 +26948,8 @@ static const struct builtin_description bdesc_args
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv4si, "__builtin_ia32_pbroadcastd128", IX86_BUILTIN_PBROADCASTD128, UNKNOWN, (int) V4SI_FTYPE_V4SI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_pbroadcastv2di, "__builtin_ia32_pbroadcastq128", IX86_BUILTIN_PBROADCASTQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8si, "__builtin_ia32_permvarsi256", IX86_BUILTIN_VPERMVARSI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI },
+ { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4df, "__builtin_ia32_permdf256", IX86_BUILTIN_VPERMDF256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT },
- { OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permvarv8sf, "__builtin_ia32_permvarsf256", IX86_BUILTIN_VPERMVARSF256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv4di, "__builtin_ia32_permdi256", IX86_BUILTIN_VPERMDI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_INT },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_permv2ti, "__builtin_ia32_permti256", IX86_BUILTIN_VPERMTI256, UNKNOWN, (int) V4DI_FTYPE_V4DI_V4DI_INT },
{ OPTION_MASK_ISA_AVX2, CODE_FOR_avx2_extracti128, "__builtin_ia32_extract128i256", IX86_BUILTIN_VEXTRACT128I256, UNKNOWN, (int) V2DI_FTYPE_V4DI_INT },
@@ -36126,9 +36126,9 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *
else if (vmode == V32QImode)
emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
else if (vmode == V8SFmode)
- emit_insn (gen_avx2_permvarv8sf (target, vperm, op0));
+ emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
else
- emit_insn (gen_avx2_permvarv8si (target, vperm, op0));
+ emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
}
else
{
===================================================================
@@ -29,8 +29,8 @@ calc_permd (int *src1, int *src2, int *dst)
memcpy (dst, src1, 32);
for (i = 0; i < 8; i++)
{
- temp = src1[i];
- dst[i] = src2[temp & 7];
+ temp = src2[i];
+ dst[i] = src1[temp & 7];
}
}
===================================================================
@@ -5,9 +5,10 @@
#include <immintrin.h>
__m256 x;
+__m256i y;
void extern
avx2_test (void)
{
- x = _mm256_permutevar8x32_ps (x, x);
+ x = _mm256_permutevar8x32_ps (x, y);
}
===================================================================
@@ -8,7 +8,7 @@
#define NUM 10
static void
-init_permps (float *src1, float *src2, int seed)
+init_permps (float *src1, int *src2, int seed)
{
int i, sign = 1;
@@ -21,24 +21,24 @@ static void
}
static void
-calc_permps (float *src1, float *src2, float *dst)
+calc_permps (float *src1, int *src2, float *dst)
{
int i;
unsigned temp;
- unsigned *idx = (int *) src1;
memcpy (dst, src1, 32);
for (i = 0; i < 8; i++)
{
- temp = idx[i];
- dst[i] = src2[temp & 7];
+ temp = src2[i];
+ dst[i] = src1[temp & 7];
}
}
static void
avx2_test (void)
{
- union256 src1, src2, dst;
+ union256 src1, dst;
+ union256i_d src2;
float dst_ref[8];
int i;