x86_64: Improved implementation of TImode rotations.

Message ID 01b001d7cf3f$dfd47110$9f7d5330$@nextmovesoftware.com
State New
Series x86_64: Improved implementation of TImode rotations.

Commit Message

Roger Sayle Nov. 1, 2021, 4:45 p.m. UTC
This simple patch improves the implementation of 128-bit (TImode)
rotations on x86_64 (a missed optimization opportunity spotted
during the recent V1TImode improvements).

Currently, the function:

unsigned __int128 rotrti3(unsigned __int128 x, unsigned int i) {
  return (x >> i) | (x << (128-i));
}

produces:

rotrti3:
        movq    %rsi, %r8
        movq    %rdi, %r9
        movl    %edx, %ecx
        movq    %rdi, %rsi
        movq    %r9, %rax
        movq    %r8, %rdx
        movq    %r8, %rdi
        shrdq   %r8, %rax
        shrq    %cl, %rdx
        xorl    %r8d, %r8d
        testb   $64, %cl
        cmovne  %rdx, %rax
        cmovne  %r8, %rdx
        negl    %ecx
        andl    $127, %ecx
        shldq   %r9, %rdi
        salq    %cl, %rsi
        xorl    %r9d, %r9d
        testb   $64, %cl
        cmovne  %rsi, %rdi
        cmovne  %r9, %rsi
        orq     %rdi, %rdx
        orq     %rsi, %rax
        ret

With this patch, GCC now generates the much nicer:

rotrti3:
        movl    %edx, %ecx
        movq    %rdi, %rdx
        shrdq   %rsi, %rdx
        shrdq   %rdi, %rsi
        andl    $64, %ecx
        movq    %rdx, %rax
        cmove   %rsi, %rdx
        cmovne  %rsi, %rax
        ret

Even I wasn't expecting the optimizer's choice of the final three
instructions; a thing of beauty.  For rotations by 64 or more bits,
the lowpart and the highpart (%rax and %rdx) are transposed, and
it would be nice to have a conditional swap/exchange.  The inspired
solution the compiler comes up with is to store/duplicate the same
value in both %rax/%rdx, and then use complementary conditional moves
to either update the lowpart or highpart, which cleverly avoids the
potential decode-stage pipeline stall (on some microarchitectures)
from having multiple instructions conditional on the same condition.
See X86_TUNE_ONE_IF_CONV_INSN, and notice there are two such stalls
in the original expansion of rot[rl]ti3.
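
As an aside, here is a rough C sketch (not part of the patch; shrd64 is
just an illustrative helper name) of what the new rotrti3 sequence above
computes, with the corresponding instruction noted beside each statement:

/* shrd64 models the x86 shrd instruction: a 64-bit funnel shift whose
   count is truncated modulo 64.  */
static unsigned long long
shrd64 (unsigned long long lo, unsigned long long hi, unsigned int c)
{
  c &= 63;
  return c ? (lo >> c) | (hi << (64 - c)) : lo;
}

unsigned __int128
rotr_sketch (unsigned __int128 x, unsigned int i)
{
  unsigned long long x_lo = (unsigned long long) x;
  unsigned long long x_hi = (unsigned long long) (x >> 64);
  unsigned long long t0 = shrd64 (x_lo, x_hi, i);  /* shrdq  %rsi, %rdx */
  unsigned long long t1 = shrd64 (x_hi, x_lo, i);  /* shrdq  %rdi, %rsi */
  unsigned long long lo = t0, hi = t0;             /* movq   %rdx, %rax */
  if (i & 64)                                      /* andl   $64, %ecx  */
    lo = t1;                                       /* cmovne %rsi, %rax */
  else
    hi = t1;                                       /* cmove  %rsi, %rdx */
  return ((unsigned __int128) hi << 64) | lo;
}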

One quick question: does TARGET_64BIT (always) imply TARGET_CMOVE?

This patch has been tested on x86_64-pc-linux-gnu with a make bootstrap
and make -k check with no new failures.  Interestingly, the correct
behaviour is already tested by (amongst other tests) sse2-v1ti-shift-3.c,
which confirms that V1TImode rotates by constants match rotlti3/rotrti3.

Ok for mainline?


2021-11-01  Roger Sayle  <roger@nextmovesoftware.com>

	* config/i386/i386.md (<any_rotate>ti3): Provide expansion for
	rotations by non-constant amounts on TARGET_CMOVE architectures.


Thanks in advance,
Roger
--

Comments

Uros Bizjak Nov. 1, 2021, 7:06 p.m. UTC | #1
On Mon, Nov 1, 2021 at 5:45 PM Roger Sayle <roger@nextmovesoftware.com> wrote:
> [...]
>
> One quick question: does TARGET_64BIT (always) imply TARGET_CMOVE?

Yes, from i386-options.c:

  /* X86_ARCH_CMOV: Conditional move was added for pentiumpro.  */
  ~(m_386 | m_486 | m_PENT | m_LAKEMONT | m_K6),

> [...]
>
> Ok for mainline?
>
>
> 2021-11-01  Roger Sayle  <roger@nextmovesoftware.com>
>
>         * config/i386/i386.md (<any_rotate>ti3): Provide expansion for
>         rotations by non-constant amounts on TARGET_CMOVE architectures.

OK with a nit below.

Thanks,
Uros.

+      if (<CODE> == ROTATE)
+        {
+          emit_insn (gen_x86_64_shld (tmp_lo, src_hi, amount));
+          emit_insn (gen_x86_64_shld (tmp_hi, src_lo, amount));
+        }
+      else
+        {
+          emit_insn (gen_x86_64_shrd (tmp_lo, src_hi, amount));
+          emit_insn (gen_x86_64_shrd (tmp_hi, src_lo, amount));
+        }

      rtx (*shift) (rtx, rtx, rtx)
            = (<CODE> == ROTATE) ? gen_x86_64_shld : gen_x86_64_shrd;
      emit_insn (shift (tmp_lo, src_hi, amount));
      emit_insn (shift (tmp_hi, src_lo, amount));



Patch

diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
index e733a40..2285c6c 100644
--- a/gcc/config/i386/i386.md
+++ b/gcc/config/i386/i386.md
@@ -12572,6 +12572,31 @@ 
   if (const_1_to_63_operand (operands[2], VOIDmode))
     emit_insn (gen_ix86_<insn>ti3_doubleword
 		(operands[0], operands[1], operands[2]));
+  else if (TARGET_CMOVE)
+    {
+      rtx amount = force_reg (QImode, operands[2]);
+      rtx src_lo = gen_lowpart (DImode, operands[1]);
+      rtx src_hi = gen_highpart (DImode, operands[1]);
+      rtx tmp_lo = gen_reg_rtx (DImode);
+      rtx tmp_hi = gen_reg_rtx (DImode);
+      emit_move_insn (tmp_lo, src_lo);
+      emit_move_insn (tmp_hi, src_hi);
+      if (<CODE> == ROTATE)
+	{
+	  emit_insn (gen_x86_64_shld (tmp_lo, src_hi, amount));
+	  emit_insn (gen_x86_64_shld (tmp_hi, src_lo, amount));
+	}
+      else
+	{
+	  emit_insn (gen_x86_64_shrd (tmp_lo, src_hi, amount));
+	  emit_insn (gen_x86_64_shrd (tmp_hi, src_lo, amount));
+	}
+      rtx dst_lo = gen_lowpart (DImode, operands[0]);
+      rtx dst_hi = gen_highpart (DImode, operands[0]);
+      emit_move_insn (dst_lo, tmp_lo);
+      emit_move_insn (dst_hi, tmp_hi);
+      emit_insn (gen_x86_shiftdi_adj_1 (dst_lo, dst_hi, amount, tmp_lo));
+    }
   else
     FAIL;