[i386]: Optimize v2df (x2) -> v4sf,v4si conversion sequences for AVX.

Message ID CAFULd4aJrpB8SqK=+OfR2=2iCKnBji8YBtuHmD5AGPSbFihzUw@mail.gmail.com
State New

Commit Message

Uros Bizjak Nov. 16, 2011, 6:35 p.m. UTC
On Tue, Nov 15, 2011 at 8:23 PM, Uros Bizjak <ubizjak@gmail.com> wrote:

> The attached patch optimizes v2df (x2) -> v4sf,v4si conversion sequences
> for AVX from:

>
>        vroundpd        $1, 32(%rsp), %xmm1
>        vroundpd        $1, 48(%rsp), %xmm0
>        vcvttpd2dqx     %xmm1, %xmm1
>        vcvttpd2dqx     %xmm0, %xmm0
>        vpunpcklqdq     %xmm0, %xmm1, %xmm0
>        vmovdqa %xmm0, 16(%rsp)
>
> to
>
>        vroundpd        $1, 64(%rsp), %xmm1
>        vroundpd        $1, 80(%rsp), %xmm0
>        vinsertf128     $0x1, %xmm0, %ymm1, %ymm0
>        vcvttpd2dqy     %ymm0, %xmm0
>        vmovdqa %xmm0, 32(%rsp)
>
> Ideally, this would be just "vcvtpd2psy 64(%rsp), %xmm0" or "vroundpd
> $1, 64(%rsp), %ymm1", but the vectorizer does not (yet) support mixed
> vectorization factors.
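
For context, a reduced loop of the kind that produces such sequences
(an illustration only; the function name and flags are guesses, not the
actual test case from this thread) would be, compiled with something
like -O3 -mavx -ffast-math:

void
foo (int *restrict dst, const double *restrict src, int n)
{
  int i;

  /* Floor of a double, truncated to a 32-bit int: the vectorizer
     pairs two V2DF vectors to produce one V4SI result, which is
     what the sequences quoted above implement.  */
  for (i = 0; i < n; i++)
    dst[i] = (int) __builtin_floor (src[i]);
}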

The attached patch optimizes the code above a step further, generating:

	vmovapd	64(%rsp), %xmm0
	vinsertf128	$0x1, 80(%rsp), %ymm0, %ymm0
	vroundpd	$1, %ymm0, %ymm0
	vcvttpd2dqy	%ymm0, %xmm0
	vmovdqa	%xmm0, 32(%rsp)
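
At the intrinsics level the new expansion corresponds roughly to the
following (an illustration of the emitted sequence, not code from the
patch; the helper name is made up):

#include <immintrin.h>

/* Concatenate the two V2DF halves first, round once at 256 bits
   (immediate 1 = round toward negative infinity, i.e. floor), then
   do a single truncating conversion to V4SI.  */
static __m128i
floor_pack (__m128d lo, __m128d hi)
{
  __m256d v = _mm256_insertf128_pd (_mm256_castpd128_pd256 (lo), hi, 1);
  v = _mm256_round_pd (v, _MM_FROUND_TO_NEG_INF);
  return _mm256_cvttpd_epi32 (v);
}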

2011-11-16  Uros Bizjak  <ubizjak@gmail.com>

	* config/i386/sse.md (round<mode>2_vec_pack_sfix): Optimize V2DFmode
	sequence for AVX.
	(<sse4_1>_round<ssemodesuffix>_vec_pack_sfix<avxsizesuffix>): Ditto.

Tested on x86_64-pc-linux-gnu {,-m32} AVX target, committed to mainline SVN.

Uros.

Patch

Index: sse.md
===================================================================
--- sse.md	(revision 181402)
+++ sse.md	(working copy)
@@ -9962,17 +9962,32 @@ 
 {
   rtx tmp0, tmp1;
 
-  tmp0 = gen_reg_rtx (<MODE>mode);
-  tmp1 = gen_reg_rtx (<MODE>mode);
+  if (<MODE>mode == V2DFmode
+      && TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      rtx tmp2 = gen_reg_rtx (V4DFmode);
 
-  emit_insn
-    (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp0, operands[1],
-						       operands[3]));
-  emit_insn
-    (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp1, operands[2],
-						       operands[3]));
-  emit_insn
-    (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
+
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_avx_roundpd256 (tmp2, tmp0, operands[3]));
+      emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp2));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (<MODE>mode);
+      tmp1 = gen_reg_rtx (<MODE>mode);
+
+      emit_insn
+       (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp0, operands[1],
+							  operands[3]));
+      emit_insn
+       (gen_<sse4_1>_round<ssemodesuffix><avxsizesuffix> (tmp1, operands[2],
+							  operands[3]));
+      emit_insn
+       (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+    }
   DONE;
 })
 
@@ -10053,14 +10068,29 @@ 
 {
   rtx tmp0, tmp1;
 
-  tmp0 = gen_reg_rtx (<MODE>mode);
-  tmp1 = gen_reg_rtx (<MODE>mode);
+  if (<MODE>mode == V2DFmode
+      && TARGET_AVX && !TARGET_PREFER_AVX128)
+    {
+      rtx tmp2 = gen_reg_rtx (V4DFmode);
 
-  emit_insn (gen_round<mode>2 (tmp0, operands[1]));
-  emit_insn (gen_round<mode>2 (tmp1, operands[2]));
+      tmp0 = gen_reg_rtx (V4DFmode);
+      tmp1 = force_reg (V2DFmode, operands[1]);
 
-  emit_insn
-    (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+      emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+      emit_insn (gen_roundv4df2 (tmp2, tmp0));
+      emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp2));
+    }
+  else
+    {
+      tmp0 = gen_reg_rtx (<MODE>mode);
+      tmp1 = gen_reg_rtx (<MODE>mode);
+
+      emit_insn (gen_round<mode>2 (tmp0, operands[1]));
+      emit_insn (gen_round<mode>2 (tmp1, operands[2]));
+
+      emit_insn
+       (gen_vec_pack_sfix_trunc_<mode> (operands[0], tmp0, tmp1));
+    }
   DONE;
 })