diff mbox

Vectorizing abs(char/short/int) on x86.

Message ID CAFULd4YKqqrZ8nvid2Vy+6nBjDZixF59DdMmcyw0Rx9bORNh-Q@mail.gmail.com
State New
Headers show

Commit Message

Uros Bizjak Oct. 30, 2013, 9:31 a.m. UTC
On Tue, Oct 29, 2013 at 6:18 PM, Cong Hou <congh@google.com> wrote:

>>> For the define_expand I added as below, the else body is there to
>>> avoid fall-through transformations to ABS operation in optabs.c.
>>> Otherwise ABS will be converted to other operations even though we have
>>> corresponding instructions from SSSE3.
>>
>> No, it won't be.
>>
>> Fallthrough will generate the pattern that will be matched by the insn
>> pattern above, just like you are doing by hand below.
>
>
> I think the case is special for abs(). In optabs.c, there is a
> function expand_abs() in which the function expand_abs_nojump() is
> called. This function first tries the expand function defined for the
> target and if it fails it will try max(v, -v) then shift-xor-sub
> method. If I don't generate any instruction for SSSE3, the
> fall-through will be max(v, -v). I have tested it on my machine.

I have tested the usual approach in i386.md, shown exactly by the
patch below (and using your other changes to i386.c):

-- cut here --
-- cut here --

using the following testcase:

--cut here--
#define N 32

int ca[N];
int cb[N];

/* Vectorization testcase: store abs() of each of the N elements of ca
   into the corresponding element of cb.
   NOTE(review): no declaration of abs() (e.g. <stdlib.h>) is visible in
   this snippet -- presumably the compiler's builtin is relied on; confirm.  */
void test1 (void)
{
  int i;
  for (i = 0; i < N; ++i)
    cb[i] = abs (ca[i]);
}
-- cut here --

Compiling on x86_64-pc-linux-gnu target (inherently -msse2):

~/gcc-build-fast/gcc/cc1 -O2 -ftree-vectorize -dp t.c

.L2:
        movdqa  ca(%rax), %xmm0 # 25    *movv4si_internal/2     [length = 8]
        addq    $16, %rax       # 30    *adddi_1/1      [length = 4]
        movdqa  ca-16(%rax), %xmm1      # 46    *movv4si_internal/2     [length = 8]
        psrad   $31, %xmm0      # 26    ashrv4si3/1     [length = 5]
        pxor    %xmm0, %xmm1    # 47    *xorv4si3/1     [length = 4]
        psubd   %xmm0, %xmm1    # 28    *subv4si3/1     [length = 4]
        movaps  %xmm1, cb-16(%rax)      # 29    *movv4si_internal/3     [length = 7]
        cmpq    $128, %rax      # 32    *cmpdi_1/1      [length = 6]
        jne     .L2     # 33    *jcc_1  [length = 2]

~/gcc-build-fast/gcc/cc1 -O2 -ftree-vectorize -mssse3 -dp t.c

.L2:
        pabsd   ca(%rax), %xmm0 # 25    *absv4si2       [length = 9]
        addq    $16, %rax       # 27    *adddi_1/1      [length = 4]
        movaps  %xmm0, cb-16(%rax)      # 26    *movv4si_internal/3     [length = 7]
        cmpq    $128, %rax      # 29    *cmpdi_1/1      [length = 6]
        jne     .L2     # 30    *jcc_1  [length = 2]

As shown above, it worked OK both with -mssse3 (generating the pabsd
insn) and without (generating your V4SI sequence).

Uros.
diff mbox

Patch

Index: sse.md
===================================================================
--- sse.md      (revision 204149)
+++ sse.md      (working copy)
@@ -10270,7 +10270,7 @@ 
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])

-(define_insn "abs<mode>2"
+(define_insn "*abs<mode>2"
   [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v")
        (abs:VI124_AVX2_48_AVX512F
          (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))]
@@ -10282,6 +10282,19 @@ 
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "<sseinsnmode>")])

+(define_expand "abs<mode>2"
+  [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand")
+       (abs:VI124_AVX2_48_AVX512F
+         (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand")))]
+  "TARGET_SSE2"
+{
+  if (!TARGET_SSSE3)
+    {
+      ix86_expand_sse2_abs (operands[0], operands[1]);
+      DONE;
+    }
+})
+
 (define_insn "abs<mode>2"
   [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
        (abs:MMXMODEI