diff mbox series

x86_64: Improve (interunit) moves from TImode to V1TImode.

Message ID 000c01d80327$3c552310$b4ff6930$@nextmovesoftware.com
State New
Headers show
Series x86_64: Improve (interunit) moves from TImode to V1TImode. | expand

Commit Message

Roger Sayle Jan. 6, 2022, 6 p.m. UTC
This patch improves the code generated when moving a 128-bit value

in TImode, represented by two 64-bit registers, to V1TImode, which

is a single SSE register.

 

Currently, the simple move:

typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));

uv1ti foo(__int128 x) { return (uv1ti)x; }

 

is always transferred via memory, as:

foo:    movq    %rdi, -24(%rsp)

        movq    %rsi, -16(%rsp)

        movdqa  -24(%rsp), %xmm0

        ret

 

with this patch, we now generate (with -msse2):

foo:    movq    %rdi, %xmm1

        movq    %rsi, %xmm2

        punpcklqdq      %xmm2, %xmm1

        movdqa  %xmm1, %xmm0

        ret

 

and with -mavx2:

foo:    vmovq   %rdi, %xmm1

        vpinsrq $1, %rsi, %xmm1, %xmm0

        ret

 

Even more dramatic is the improvement of zero extended transfers.

 

uv1ti bar(unsigned char c) { return (uv1ti)(__int128)c; }

 

Previously generated:

bar:    movq    $0, -16(%rsp)

        movzbl  %dil, %eax

        movq    %rax, -24(%rsp)

        vmovdqa -24(%rsp), %xmm0

        ret

 

Now generates:

bar:    movzbl  %dil, %edi

        movq    %rdi, %xmm0

        ret

 

 

My first attempt at this functionality attempted to use a

simple define_split:

 

+;; Move TImode to V1TImode via V2DImode instead of memory.

+(define_split

+  [(set (match_operand:V1TI 0 "register_operand")

+        (subreg:V1TI (match_operand:TI 1 "register_operand") 0))]

+  "TARGET_64BIT && TARGET_SSE2 && can_create_pseudo_p ()"

+  [(set (match_dup 2) (vec_concat:V2DI (match_dup 3) (match_dup 4)))

+   (set (match_dup 0) (subreg:V1TI (match_dup 2) 0))]

+{

+  operands[2] = gen_reg_rtx (V2DImode);

+  operands[3] = gen_lowpart (DImode, operands[1]);

+  operands[4] = gen_highpart (DImode, operands[1]);

+})

+

 

Unfortunately, this triggers very late during the compilation

preventing some of the simplification's we'd like (in combine).

For example the foo case above becomes:

 

foo:    movq    %rsi, -16(%rsp)

        movq    %rdi, %xmm0

        movhps  -16(%rsp), %xmm0

 

transferring half directly, and the other half via memory.

And for the bar case above, GCC fails to appreciate that

movq/vmovq clears the high bits, resulting in:

 

bar:    movzbl  %dil, %eax

        xorl    %edx, %edx

        vmovq   %rax, %xmm1

        vpinsrq $1, %rdx, %xmm1, %xmm0

        ret

 

 

Hence the solution (i.e. this patch) is to add a special case

to ix86_expand_vector_move for TImode to V1TImode transfers.

 

This patch has been tested on x86_64-pc-linux-gnu with make bootstrap

and make -k check with no new failures.  Ok for mainline?

 

 

2022-01-06  Roger Sayle  <roger@nextmovesoftware.com>

 

gcc/ChangeLog

* config/i386/i386-expand.c (ix86_expand_vector_move): Add

special case for TImode to V1TImode moves, going via V2DImode.

 

gcc/testsuite/ChangeLog

* gcc.target/i386/sse2-v1ti-mov-1.c: New test case.

* gcc.target/i386/sse2-v1ti-zext.c: New test case.

 

 

Many thanks in advance,

Roger

--

Comments

Uros Bizjak Jan. 8, 2022, 10:12 a.m. UTC | #1
On Thu, Jan 6, 2022 at 7:00 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
>
>
>
> This patch improves the code generated when moving a 128-bit value
>
> in TImode, represented by two 64-bit registers, to V1TImode, which
>
> is a single SSE register.
>
>
>
> Currently, the simple move:
>
> typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
>
> uv1ti foo(__int128 x) { return (uv1ti)x; }
>
>
>
> is always transferred via memory, as:
>
> foo:    movq    %rdi, -24(%rsp)
>
>         movq    %rsi, -16(%rsp)
>
>         movdqa  -24(%rsp), %xmm0
>
>         ret
>
>
>
> with this patch, we now generate (with -msse2):
>
> foo:    movq    %rdi, %xmm1
>
>         movq    %rsi, %xmm2
>
>         punpcklqdq      %xmm2, %xmm1
>
>         movdqa  %xmm1, %xmm0
>
>         ret
>
>
>
> and with -mavx2:
>
> foo:    vmovq   %rdi, %xmm1
>
>         vpinsrq $1, %rsi, %xmm1, %xmm0
>
>         ret
>
>
>
> Even more dramatic is the improvement of zero extended transfers.
>
>
>
> uv1ti bar(unsigned char c) { return (uv1ti)(__int128)c; }
>
>
>
> Previously generated:
>
> bar:    movq    $0, -16(%rsp)
>
>         movzbl  %dil, %eax
>
>         movq    %rax, -24(%rsp)
>
>         vmovdqa -24(%rsp), %xmm0
>
>         ret
>
>
>
> Now generates:
>
> bar:    movzbl  %dil, %edi
>
>         movq    %rdi, %xmm0
>
>         ret
>
>
>
>
>
> My first attempt at this functionality attempted to use a
>
> simple define_split:
>
>
>
> +;; Move TImode to V1TImode via V2DImode instead of memory.
>
> +(define_split
>
> +  [(set (match_operand:V1TI 0 "register_operand")
>
> +        (subreg:V1TI (match_operand:TI 1 "register_operand") 0))]
>
> +  "TARGET_64BIT && TARGET_SSE2 && can_create_pseudo_p ()"
>
> +  [(set (match_dup 2) (vec_concat:V2DI (match_dup 3) (match_dup 4)))
>
> +   (set (match_dup 0) (subreg:V1TI (match_dup 2) 0))]
>
> +{
>
> +  operands[2] = gen_reg_rtx (V2DImode);
>
> +  operands[3] = gen_lowpart (DImode, operands[1]);
>
> +  operands[4] = gen_highpart (DImode, operands[1]);
>
> +})
>
> +
>
>
>
> Unfortunately, this triggers very late during the compilation
>
> preventing some of the simplification's we'd like (in combine).
>
> For example the foo case above becomes:
>
>
>
> foo:    movq    %rsi, -16(%rsp)
>
>         movq    %rdi, %xmm0
>
>         movhps  -16(%rsp), %xmm0
>
>
>
> transferring half directly, and the other half via memory.
>
> And for the bar case above, GCC fails to appreciate that
>
> movq/vmovq clears the high bits, resulting in:
>
>
>
> bar:    movzbl  %dil, %eax
>
>         xorl    %edx, %edx
>
>         vmovq   %rax, %xmm1
>
>         vpinsrq $1, %rdx, %xmm1, %xmm0
>
>         ret
>
>
>
>
>
> Hence the solution (i.e. this patch) is to add a special case
>
> to ix86_expand_vector_move for TImode to V1TImode transfers.
>
>
>
> This patch has been tested on x86_64-pc-linux-gnu with make bootstrap
>
> and make -k check with no new failures.  Ok for mainline?
>
>
>
>
>
> 2022-01-06  Roger Sayle  <roger@nextmovesoftware.com>
>
>
>
> gcc/ChangeLog
>
> * config/i386/i386-expand.c (ix86_expand_vector_move): Add
>
> special case for TImode to V1TImode moves, going via V2DImode.
>
>
>
> gcc/testsuite/ChangeLog
>
> * gcc.target/i386/sse2-v1ti-mov-1.c: New test case.
>
> * gcc.target/i386/sse2-v1ti-zext.c: New test case.

OK.

Thanks,
Uros.
diff mbox series

Patch

diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 7013c20..47c4870 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -615,6 +615,23 @@  ix86_expand_vector_move (machine_mode mode, rtx operands[])
       return;
     }
 
+  /* Special case TImode to V1TImode conversions, via V2DI.  */
+  if (mode == V1TImode
+      && SUBREG_P (op1)
+      && GET_MODE (SUBREG_REG (op1)) == TImode
+      && TARGET_64BIT && TARGET_SSE
+      && can_create_pseudo_p ())
+    {
+      rtx tmp = gen_reg_rtx (V2DImode);
+      rtx lo = gen_reg_rtx (DImode);
+      rtx hi = gen_reg_rtx (DImode);
+      emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
+      emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
+      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
+      emit_move_insn (op0, gen_lowpart (V1TImode, tmp));
+      return;
+    }
+
   /* If operand0 is a hard register, make operand1 a pseudo.  */
   if (can_create_pseudo_p ()
       && !ix86_hardreg_mov_ok (op0, op1))
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-mov-1.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-mov-1.c
new file mode 100644
index 0000000..a1ef7b7
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-mov-1.c
@@ -0,0 +1,11 @@ 
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
+
+uv1ti foo(__int128 x)
+{
+  return (uv1ti)x;
+}
+
+/* { dg-final { scan-assembler-not "%\[er\]sp" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-zext.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-zext.c
new file mode 100644
index 0000000..4dfc655
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-zext.c
@@ -0,0 +1,16 @@ 
+/* { dg-do compile { target int128 } } */
+/* { dg-options "-O2 -msse2" } */
+
+typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16)));
+
+uv1ti extqi(unsigned char c) { return (uv1ti)(__int128)c; }
+uv1ti exthi(unsigned short s) { return (uv1ti)(__int128)s; }
+uv1ti extsi(unsigned int i) { return (uv1ti)(__int128)i; }
+uv1ti extdi(unsigned long long l) { return (uv1ti)(__int128)l; }
+
+uv1ti pextqi(unsigned char *pc) { return (uv1ti)(__int128)(*pc); }
+uv1ti pexthi(unsigned short *ps) { return (uv1ti)(__int128)(*ps); }
+uv1ti pextsi(unsigned int *pi) { return (uv1ti)(__int128)(*pi); }
+uv1ti pextdi(unsigned long long *pl) { return (uv1ti)(__int128)(*pl); }
+
+/* { dg-final { scan-assembler-not "%\[er\]sp" } } */