===================================================================
@@ -3038,14 +3038,25 @@
(match_operand:V2DF 2 "nonimmediate_operand" "")]
"TARGET_SSE2"
{
- rtx r1, r2;
+ rtx tmp0, tmp1;
- r1 = gen_reg_rtx (V4SFmode);
- r2 = gen_reg_rtx (V4SFmode);
+ if (TARGET_AVX && !TARGET_PREFER_AVX128)
+ {
+ tmp0 = gen_reg_rtx (V4DFmode);
+ tmp1 = force_reg (V2DFmode, operands[1]);
- emit_insn (gen_sse2_cvtpd2ps (r1, operands[1]));
- emit_insn (gen_sse2_cvtpd2ps (r2, operands[2]));
- emit_insn (gen_sse_movlhps (operands[0], r1, r2));
+ emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+ emit_insn (gen_avx_cvtpd2ps256 (operands[0], tmp0));
+ }
+ else
+ {
+ tmp0 = gen_reg_rtx (V4SFmode);
+ tmp1 = gen_reg_rtx (V4SFmode);
+
+ emit_insn (gen_sse2_cvtpd2ps (tmp0, operands[1]));
+ emit_insn (gen_sse2_cvtpd2ps (tmp1, operands[2]));
+ emit_insn (gen_sse_movlhps (operands[0], tmp0, tmp1));
+ }
DONE;
})
@@ -3057,12 +3068,12 @@
{
rtx r1, r2;
- r1 = gen_reg_rtx (V8SImode);
- r2 = gen_reg_rtx (V8SImode);
+ r1 = gen_reg_rtx (V4SImode);
+ r2 = gen_reg_rtx (V4SImode);
- emit_insn (gen_avx_cvttpd2dq256_2 (r1, operands[1]));
- emit_insn (gen_avx_cvttpd2dq256_2 (r2, operands[2]));
- emit_insn (gen_avx_vperm2f128v8si3 (operands[0], r1, r2, GEN_INT (0x20)));
+ emit_insn (gen_fix_truncv4dfv4si2 (r1, operands[1]));
+ emit_insn (gen_fix_truncv4dfv4si2 (r2, operands[2]));
+ emit_insn (gen_avx_vec_concatv8si (operands[0], r1, r2));
DONE;
})
@@ -3072,16 +3083,28 @@
(match_operand:V2DF 2 "nonimmediate_operand" "")]
"TARGET_SSE2"
{
- rtx r1, r2;
+ rtx tmp0, tmp1;
- r1 = gen_reg_rtx (V4SImode);
- r2 = gen_reg_rtx (V4SImode);
+ if (TARGET_AVX && !TARGET_PREFER_AVX128)
+ {
+ tmp0 = gen_reg_rtx (V4DFmode);
+ tmp1 = force_reg (V2DFmode, operands[1]);
- emit_insn (gen_sse2_cvttpd2dq (r1, operands[1]));
- emit_insn (gen_sse2_cvttpd2dq (r2, operands[2]));
- emit_insn (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
- gen_lowpart (V2DImode, r1),
- gen_lowpart (V2DImode, r2)));
+ emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+ emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp0));
+ }
+ else
+ {
+ tmp0 = gen_reg_rtx (V4SImode);
+ tmp1 = gen_reg_rtx (V4SImode);
+
+ emit_insn (gen_sse2_cvttpd2dq (tmp0, operands[1]));
+ emit_insn (gen_sse2_cvttpd2dq (tmp1, operands[2]));
+ emit_insn
+ (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
+ gen_lowpart (V2DImode, tmp0),
+ gen_lowpart (V2DImode, tmp1)));
+ }
DONE;
})
@@ -3126,12 +3149,12 @@
{
rtx r1, r2;
- r1 = gen_reg_rtx (V8SImode);
- r2 = gen_reg_rtx (V8SImode);
+ r1 = gen_reg_rtx (V4SImode);
+ r2 = gen_reg_rtx (V4SImode);
- emit_insn (gen_avx_cvtpd2dq256_2 (r1, operands[1]));
- emit_insn (gen_avx_cvtpd2dq256_2 (r2, operands[2]));
- emit_insn (gen_avx_vperm2f128v8si3 (operands[0], r1, r2, GEN_INT (0x20)));
+ emit_insn (gen_avx_cvtpd2dq256 (r1, operands[1]));
+ emit_insn (gen_avx_cvtpd2dq256 (r2, operands[2]));
+ emit_insn (gen_avx_vec_concatv8si (operands[0], r1, r2));
DONE;
})
@@ -3141,16 +3164,28 @@
(match_operand:V2DF 2 "nonimmediate_operand" "")]
"TARGET_SSE2"
{
- rtx r1, r2;
+ rtx tmp0, tmp1;
- r1 = gen_reg_rtx (V4SImode);
- r2 = gen_reg_rtx (V4SImode);
+ if (TARGET_AVX && !TARGET_PREFER_AVX128)
+ {
+ tmp0 = gen_reg_rtx (V4DFmode);
+ tmp1 = force_reg (V2DFmode, operands[1]);
- emit_insn (gen_sse2_cvtpd2dq (r1, operands[1]));
- emit_insn (gen_sse2_cvtpd2dq (r2, operands[2]));
- emit_insn (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
- gen_lowpart (V2DImode, r1),
- gen_lowpart (V2DImode, r2)));
+ emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+ emit_insn (gen_avx_cvtpd2dq256 (operands[0], tmp0));
+ }
+ else
+ {
+ tmp0 = gen_reg_rtx (V4SImode);
+ tmp1 = gen_reg_rtx (V4SImode);
+
+ emit_insn (gen_sse2_cvtpd2dq (tmp0, operands[1]));
+ emit_insn (gen_sse2_cvtpd2dq (tmp1, operands[2]));
+ emit_insn
+ (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
+ gen_lowpart (V2DImode, tmp0),
+ gen_lowpart (V2DImode, tmp1)));
+ }
DONE;
})
Hello!

The attached patch optimizes v2df (x2) -> v4sf,v4si conversion sequences for AVX from:

	vcvtpd2psx 48(%rsp), %xmm1
	vcvtpd2psx 64(%rsp), %xmm0
	vmovlhps %xmm0, %xmm1, %xmm0
	vmovaps %xmm0, 32(%rsp)

to

	vmovapd 64(%rsp), %xmm0
	vinsertf128 $0x1, 80(%rsp), %ymm0, %ymm0
	vcvtpd2psy %ymm0, %xmm0
	vmovaps %xmm0, 32(%rsp)

Please note that only one conversion instruction is now emitted.

In a similar way, the patch optimizes floor/ceil/round from:

	vroundpd $1, 32(%rsp), %xmm1
	vroundpd $1, 48(%rsp), %xmm0
	vcvttpd2dqx %xmm1, %xmm1
	vcvttpd2dqx %xmm0, %xmm0
	vpunpcklqdq %xmm0, %xmm1, %xmm0
	vmovdqa %xmm0, 16(%rsp)

to

	vroundpd $1, 64(%rsp), %xmm1
	vroundpd $1, 80(%rsp), %xmm0
	vinsertf128 $0x1, %xmm0, %ymm1, %ymm0
	vcvttpd2dqy %ymm0, %xmm0
	vmovdqa %xmm0, 32(%rsp)

Ideally, this would be just "vcvtpd2psy 64(%rsp), %xmm0" or "vroundpd $1, 64(%rsp), %ymm1", but the vectorizer does not (yet) support mixed vectorization factors.

The patch also changes a couple of patterns to use simpler SSE patterns together with a vec-concat pattern to generate equivalent code.

2011-11-15  Uros Bizjak  <ubizjak@gmail.com>

	* config/i386/sse.md (vec_pack_trunc_v2df): Optimize sequence for AVX.
	(vec_pack_sfix_trunc_v2df): Ditto.
	(vec_pack_sfix_v2df): Ditto.
	(vec_pack_sfix_trunc_v4df): Generate fix_truncv4dfv4si2 and
	avx_vec_concatv8si patterns.
	(vec_pack_sfix_v4df): Generate avx_cvtpd2dq256 and
	avx_vec_concatv8si patterns.

testsuite/ChangeLog:

2011-11-15  Uros Bizjak  <ubizjak@gmail.com>

	* gcc.target/i386/avx-cvt-2-vec.c: New test.
	* gcc.target/i386/avx-floor-sfix-2-vec.c: Ditto.
	* gcc.target/i386/avx-ceil-sfix-2-vec.c: Ditto.
	* gcc.target/i386/avx-rint-sfix-2-vec.c: Ditto.
	* gcc.target/i386/avx-round-sfix-2-vec.c: Ditto.

Tested on x86_64-pc-linux-gnu {,-m32} AVX target, committed to mainline SVN.

Uros.
Index: gcc.target/i386/avx-ceil-sfix-2-vec.c =================================================================== --- gcc.target/i386/avx-ceil-sfix-2-vec.c (revision 0) +++ gcc.target/i386/avx-ceil-sfix-2-vec.c (revision 0) @@ -0,0 +1,62 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */ +/* { dg-require-effective-target avx } */ +/* { dg-skip-if "no M_PI" { vxworks_kernel } } */ + +#ifndef CHECK_H +#define CHECK_H "avx-check.h" +#endif + +#ifndef TEST +#define TEST avx_test +#endif + +#include CHECK_H + +#include <math.h> + +extern double ceil (double); + +#define NUM 4 + +static void +__attribute__((__target__("fpmath=sse"))) +init_src (double *src) +{ + int i, sign = 1; + double f = rand (); + + for (i = 0; i < NUM; i++) + { + src[i] = (i + 1) * f * M_PI * sign; + if (i < (NUM / 2)) + { + if ((i % 6) == 0) + f = f * src[i]; + } + else if (i == (NUM / 2)) + f = rand (); + else if ((i % 6) == 0) + f = 1 / (f * (i + 1) * src[i] * M_PI * sign); + sign = -sign; + } +} + +static void +__attribute__((__target__("fpmath=387"))) +TEST (void) +{ + double a[NUM]; + int r[NUM]; + int i; + + init_src (a); + + for (i = 0; i < NUM; i++) + r[i] = (int) ceil (a[i]); + + /* check results: */ + for (i = 0; i < NUM; i++) + if (r[i] != (int) ceil (a[i])) + abort(); +} Index: gcc.target/i386/avx-rint-sfix-2-vec.c =================================================================== --- gcc.target/i386/avx-rint-sfix-2-vec.c (revision 0) +++ gcc.target/i386/avx-rint-sfix-2-vec.c (revision 0) @@ -0,0 +1,62 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */ +/* { dg-require-effective-target avx } */ +/* { dg-skip-if "no M_PI" { vxworks_kernel } } */ + +#ifndef CHECK_H +#define CHECK_H "avx-check.h" +#endif + +#ifndef TEST +#define TEST avx_test +#endif + +#include CHECK_H + +#include <math.h> + +extern double rint (double); + +#define NUM 4 + +static void +__attribute__((__target__("fpmath=sse"))) +init_src (double 
*src) +{ + int i, sign = 1; + double f = rand (); + + for (i = 0; i < NUM; i++) + { + src[i] = (i + 1) * f * M_PI * sign; + if (i < (NUM / 2)) + { + if ((i % 6) == 0) + f = f * src[i]; + } + else if (i == (NUM / 2)) + f = rand (); + else if ((i % 6) == 0) + f = 1 / (f * (i + 1) * src[i] * M_PI * sign); + sign = -sign; + } +} + +static void +__attribute__((__target__("fpmath=387"))) +TEST (void) +{ + double a[NUM]; + int r[NUM]; + int i; + + init_src (a); + + for (i = 0; i < NUM; i++) + r[i] = (int) rint (a[i]); + + /* check results: */ + for (i = 0; i < NUM; i++) + if (r[i] != (int) rint (a[i])) + abort(); +} Index: gcc.target/i386/avx-round-sfix-2-vec.c =================================================================== --- gcc.target/i386/avx-round-sfix-2-vec.c (revision 0) +++ gcc.target/i386/avx-round-sfix-2-vec.c (revision 0) @@ -0,0 +1,62 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */ +/* { dg-require-effective-target avx } */ +/* { dg-skip-if "no M_PI" { vxworks_kernel } } */ + +#ifndef CHECK_H +#define CHECK_H "avx-check.h" +#endif + +#ifndef TEST +#define TEST avx_test +#endif + +#include CHECK_H + +#include <math.h> + +extern double round (double); + +#define NUM 4 + +static void +__attribute__((__target__("fpmath=sse"))) +init_src (double *src) +{ + int i, sign = 1; + double f = rand (); + + for (i = 0; i < NUM; i++) + { + src[i] = (i + 1) * f * M_PI * sign; + if (i < (NUM / 2)) + { + if ((i % 6) == 0) + f = f * src[i]; + } + else if (i == (NUM / 2)) + f = rand (); + else if ((i % 6) == 0) + f = 1 / (f * (i + 1) * src[i] * M_PI * sign); + sign = -sign; + } +} + +static void +__attribute__((__target__("fpmath=387"))) +TEST (void) +{ + double a[NUM]; + int r[NUM]; + int i; + + init_src (a); + + for (i = 0; i < NUM; i++) + r[i] = (int) round (a[i]); + + /* check results: */ + for (i = 0; i < NUM; i++) + if (r[i] != (int) round (a[i])) + abort(); +} Index: gcc.target/i386/avx-floor-sfix-2-vec.c 
=================================================================== --- gcc.target/i386/avx-floor-sfix-2-vec.c (revision 0) +++ gcc.target/i386/avx-floor-sfix-2-vec.c (revision 0) @@ -0,0 +1,62 @@ +/* { dg-do run } */ +/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */ +/* { dg-require-effective-target avx } */ +/* { dg-skip-if "no M_PI" { vxworks_kernel } } */ + +#ifndef CHECK_H +#define CHECK_H "avx-check.h" +#endif + +#ifndef TEST +#define TEST avx_test +#endif + +#include CHECK_H + +#include <math.h> + +extern double floor (double); + +#define NUM 4 + +static void +__attribute__((__target__("fpmath=sse"))) +init_src (double *src) +{ + int i, sign = 1; + double f = rand (); + + for (i = 0; i < NUM; i++) + { + src[i] = (i + 1) * f * M_PI * sign; + if (i < (NUM / 2)) + { + if ((i % 6) == 0) + f = f * src[i]; + } + else if (i == (NUM / 2)) + f = rand (); + else if ((i % 6) == 0) + f = 1 / (f * (i + 1) * src[i] * M_PI * sign); + sign = -sign; + } +} + +static void +__attribute__((__target__("fpmath=387"))) +TEST (void) +{ + double a[NUM]; + int r[NUM]; + int i; + + init_src (a); + + for (i = 0; i < NUM; i++) + r[i] = (int) floor (a[i]); + + /* check results: */ + for (i = 0; i < NUM; i++) + if (r[i] != (int) floor (a[i])) + abort(); +}