diff mbox

Vectorizing abs(char/short/int) on x86.

Message ID CAK=A3=3YJnZTVnxQPiMv10n=8-CWQe9H+Bh+H_ChKOedZhoFpA@mail.gmail.com
State New
Headers show

Commit Message

Cong Hou Oct. 30, 2013, 5:01 p.m. UTC
I found my problem: I put DONE outside of if not inside. You are
right. I have updated my patch.

I appreciate your comment and test on it!


thanks,
Cong








On Wed, Oct 30, 2013 at 2:31 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Tue, Oct 29, 2013 at 6:18 PM, Cong Hou <congh@google.com> wrote:
>
>>>> For the define_expand I added as below, the else body is there to
>>>> avoid fall-through transformations to ABS operation in optabs.c.
>>>> Otherwise ABS will be converted to other operations even that we have
>>>> corresponding instructions from SSSE3.
>>>
>>> No, it wont be.
>>>
>>> Fallthrough will generate the pattern that will be matched by the insn
>>> pattern above, just like you are doing by hand below.
>>
>>
>> I think the case is special for abs(). In optabs.c, there is a
>> function expand_abs() in which the function expand_abs_nojump() is
>> called. This function first tries the expand function defined for the
>> target and if it fails it will try max(v, -v) then shift-xor-sub
>> method. If I don't generate any instruction for SSSE3, the
>> fall-through will be max(v, -v). I have tested it on my machine.
>
> I have tested the usual approach in i386.md, shown exactly by the
> patch below (and using your other changes to i386.c):
>
> -- cut here --
> Index: sse.md
> ===================================================================
> --- sse.md      (revision 204149)
> +++ sse.md      (working copy)
> @@ -10270,7 +10270,7 @@
>     (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
>     (set_attr "mode" "DI")])
>
> -(define_insn "abs<mode>2"
> +(define_insn "*abs<mode>2"
>    [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v")
>         (abs:VI124_AVX2_48_AVX512F
>           (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))]
> @@ -10282,6 +10282,19 @@
>     (set_attr "prefix" "maybe_vex")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +(define_expand "abs<mode>2"
> +  [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand")
> +       (abs:VI124_AVX2_48_AVX512F
> +         (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand")))]
> +  "TARGET_SSE2"
> +{
> +  if (!TARGET_SSSE3)
> +    {
> +      ix86_expand_sse2_abs (operands[0], operands[1]);
> +      DONE;
> +    }
> +})
> +
>  (define_insn "abs<mode>2"
>    [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
>         (abs:MMXMODEI
> -- cut here --
>
> using following testcase:
>
> --cut here--
> #define N 32
>
> int ca[N];
> int cb[N];
>
> void test1 (void)
> {
>   int i;
>   for (i = 0; i < N; ++i)
>     cb[i] = abs (ca[i]);
> }
> -- cut here --
>
> Compiling on x86_64-pc-linux-gnu target (inherently -msse2):
>
> ~/gcc-build-fast/gcc/cc1 -O2 -ftree-vectorize -dp t.c
>
> .L2:
>         movdqa  ca(%rax), %xmm0 # 25    *movv4si_internal/2     [length = 8]
>         addq    $16, %rax       # 30    *adddi_1/1      [length = 4]
>         movdqa  ca-16(%rax), %xmm1      # 46    *movv4si_internal/2
>  [length = 8]
>         psrad   $31, %xmm0      # 26    ashrv4si3/1     [length = 5]
>         pxor    %xmm0, %xmm1    # 47    *xorv4si3/1     [length = 4]
>         psubd   %xmm0, %xmm1    # 28    *subv4si3/1     [length = 4]
>         movaps  %xmm1, cb-16(%rax)      # 29    *movv4si_internal/3
>  [length = 7]
>         cmpq    $128, %rax      # 32    *cmpdi_1/1      [length = 6]
>         jne     .L2     # 33    *jcc_1  [length = 2]
>
> ~/gcc-build-fast/gcc/cc1 -O2 -ftree-vectorize -mssse3 -dp t.c
>
> .L2:
>         pabsd   ca(%rax), %xmm0 # 25    *absv4si2       [length = 9]
>         addq    $16, %rax       # 27    *adddi_1/1      [length = 4]
>         movaps  %xmm0, cb-16(%rax)      # 26    *movv4si_internal/3
>  [length = 7]
>         cmpq    $128, %rax      # 29    *cmpdi_1/1      [length = 6]
>         jne     .L2     # 30    *jcc_1  [length = 2]
>
> As shown above, it worked OK for both with -mssse3 (generating pabsd
> insn) and without (generating your V4SI sequence).
>
> Uros.

Comments

Uros Bizjak Oct. 30, 2013, 5:22 p.m. UTC | #1
On Wed, Oct 30, 2013 at 6:01 PM, Cong Hou <congh@google.com> wrote:
> I found my problem: I put DONE outside of if not inside. You are
> right. I have updated my patch.

OK, great that we put things in order ;)

Does this patch need some extra middle-end functionality? I was not
able to vectorize char and short part of your patch.

Regarding the testcase - please put it to gcc.target/i386/ directory.
There is nothing generic in the test, as confirmed by target-dependent
scan test. You will find plenty of examples in the mentioned
directory. I'd suggest to split the testcase in three files, and to
simplify it to something like the testcase with global variables I
used earlier.

Modulo testcase, the patch is OK otherwise, but middle-end parts
should be committed first.

Thanks,
Uros.
Cong Hou Oct. 30, 2013, 6:05 p.m. UTC | #2
Also, as the current expand for abs() on 8/16bit integer is not used
at all, should I comment them temporarily now? Later I can uncomment
them once I finished the pattern recognizer.



thanks,
Cong


On Wed, Oct 30, 2013 at 10:22 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Wed, Oct 30, 2013 at 6:01 PM, Cong Hou <congh@google.com> wrote:
>> I found my problem: I put DONE outside of if not inside. You are
>> right. I have updated my patch.
>
> OK, great that we put things in order ;)
>
> Does this patch need some extra middle-end functionality? I was not
> able to vectorize char and short part of your patch.
>
> Regarding the testcase - please put it to gcc.target/i386/ directory.
> There is nothing generic in the test, as confirmed by target-dependent
> scan test. You will find plenty of examples in the mentioned
> directory. I'd suggest to split the testcase in three files, and to
> simplify it to something like the testcase with global variables I
> used earlier.
>
> Modulo testcase, the patch is OK otherwise, but middle-end parts
> should be committed first.
>
> Thanks,
> Uros.
diff mbox

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 8a38316..84c7ab5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@ 
+2013-10-22  Cong Hou  <congh@google.com>
+
+ PR target/58762
+ * config/i386/i386-protos.h (ix86_expand_sse2_abs): New function.
+ * config/i386/i386.c (ix86_expand_sse2_abs): New function.
+ * config/i386/sse.md: Add SSE2 support to abs (8/16/32-bit-int).
+
 2013-10-14  David Malcolm  <dmalcolm@redhat.com>

  * dumpfile.h (gcc::dump_manager): New class, to hold state
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 3ab2f3a..ca31224 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -238,6 +238,7 @@  extern void ix86_expand_mul_widen_evenodd (rtx,
rtx, rtx, bool, bool);
 extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
 extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
+extern void ix86_expand_sse2_abs (rtx, rtx);

 /* In i386-c.c  */
 extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 02cbbbd..71905fc 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -41696,6 +41696,53 @@  ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
        gen_rtx_MULT (mode, op1, op2));
 }

+void
+ix86_expand_sse2_abs (rtx op0, rtx op1)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx tmp0, tmp1;
+
+  switch (mode)
+    {
+      /* For 32-bit signed integer X, the best way to calculate the absolute
+ value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
+      case V4SImode:
+ tmp0 = expand_simple_binop (mode, ASHIFTRT, op1,
+    GEN_INT (GET_MODE_BITSIZE
+ (GET_MODE_INNER (mode)) - 1),
+    NULL, 0, OPTAB_DIRECT);
+ if (tmp0)
+  tmp1 = expand_simple_binop (mode, XOR, op1, tmp0,
+      NULL, 0, OPTAB_DIRECT);
+ if (tmp0 && tmp1)
+  expand_simple_binop (mode, MINUS, tmp1, tmp0,
+       op0, 0, OPTAB_DIRECT);
+ break;
+
+      /* For 16-bit signed integer X, the best way to calculate the absolute
+ value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
+      case V8HImode:
+ tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0);
+ if (tmp0)
+  expand_simple_binop (mode, SMAX, op1, tmp0, op0, 0,
+       OPTAB_DIRECT);
+ break;
+
+      /* For 8-bit signed integer X, the best way to calculate the absolute
+ value of X is min ((unsigned char) X, (unsigned char) (-X)),
+ as SSE2 provides the PMINUB insn.  */
+      case V16QImode:
+ tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0);
+ if (tmp0)
+  expand_simple_binop (V16QImode, UMIN, op1, tmp0, op0, 0,
+       OPTAB_DIRECT);
+ break;
+
+      default:
+ break;
+    }
+}
+
 /* Expand an insert into a vector register through pinsr insn.
    Return true if successful.  */

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index c3f6c94..46e1df4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -8721,7 +8721,7 @@ 
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])

-(define_insn "abs<mode>2"
+(define_insn "*abs<mode>2"
   [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v")
  (abs:VI124_AVX2_48_AVX512F
   (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))]
@@ -8733,6 +8733,19 @@ 
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "<sseinsnmode>")])

+(define_expand "abs<mode>2"
+  [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand")
+ (abs:VI124_AVX2_48_AVX512F
+  (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand")))]
+  "TARGET_SSE2"
+{
+  if (!TARGET_SSSE3)
+    {
+      ix86_expand_sse2_abs (operands[0], operands[1]);
+      DONE;
+    }
+})
+
 (define_insn "abs<mode>2"
   [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
  (abs:MMXMODEI
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 075d071..cf5b942 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@ 
+2013-10-22  Cong Hou  <congh@google.com>
+
+ PR target/58762
+ * gcc.dg/vect/pr58762.c: New test.
+
 2013-10-14  Tobias Burnus  <burnus@net-b.de>

  PR fortran/58658
diff --git a/gcc/testsuite/gcc.dg/vect/pr58762.c
b/gcc/testsuite/gcc.dg/vect/pr58762.c
new file mode 100644
index 0000000..6468d0a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr58762.c
@@ -0,0 +1,28 @@ 
+/* { dg-require-effective-target vect_int } */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+void test1 (char* a, char* b)
+{
+  int i;
+  for (i = 0; i < 10000; ++i)
+    a[i] = abs (b[i]);
+}
+
+void test2 (short* a, short* b)
+{
+  int i;
+  for (i = 0; i < 10000; ++i)
+    a[i] = abs (b[i]);
+}
+
+void test3 (int* a, int* b)
+{
+  int i;
+  for (i = 0; i < 10000; ++i)
+    a[i] = abs (b[i]);
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect"
+       { target i?86-*-* x86_64-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */