diff mbox

Vectorizing abs(char/short/int) on x86.

Message ID CAK=A3=2Tj7SiOozESR59G-f9T0wV7HjKroxxmE=ASuDKXDFQGw@mail.gmail.com
State New
Headers show

Commit Message

Cong Hou Oct. 30, 2013, 5:02 p.m. UTC
Forget to attach the patch file.



thanks,
Cong


On Wed, Oct 30, 2013 at 10:01 AM, Cong Hou <congh@google.com> wrote:
> I found my problem: I put DONE outside of if not inside. You are
> right. I have updated my patch.
>
> I appreciate your comment and test on it!
>
>
> thanks,
> Cong
>
>
>
> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index 8a38316..84c7ab5 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,10 @@
> +2013-10-22  Cong Hou  <congh@google.com>
> +
> + PR target/58762
> + * config/i386/i386-protos.h (ix86_expand_sse2_abs): New function.
> + * config/i386/i386.c (ix86_expand_sse2_abs): New function.
> + * config/i386/sse.md: Add SSE2 support to abs (8/16/32-bit-int).
> +
>  2013-10-14  David Malcolm  <dmalcolm@redhat.com>
>
>   * dumpfile.h (gcc::dump_manager): New class, to hold state
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index 3ab2f3a..ca31224 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -238,6 +238,7 @@ extern void ix86_expand_mul_widen_evenodd (rtx,
> rtx, rtx, bool, bool);
>  extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
>  extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
>  extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
> +extern void ix86_expand_sse2_abs (rtx, rtx);
>
>  /* In i386-c.c  */
>  extern void ix86_target_macros (void);
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 02cbbbd..71905fc 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -41696,6 +41696,53 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
>         gen_rtx_MULT (mode, op1, op2));
>  }
>
> +void
> +ix86_expand_sse2_abs (rtx op0, rtx op1)
> +{
> +  enum machine_mode mode = GET_MODE (op0);
> +  rtx tmp0, tmp1;
> +
> +  switch (mode)
> +    {
> +      /* For 32-bit signed integer X, the best way to calculate the absolute
> + value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
> +      case V4SImode:
> + tmp0 = expand_simple_binop (mode, ASHIFTRT, op1,
> +    GEN_INT (GET_MODE_BITSIZE
> + (GET_MODE_INNER (mode)) - 1),
> +    NULL, 0, OPTAB_DIRECT);
> + if (tmp0)
> +  tmp1 = expand_simple_binop (mode, XOR, op1, tmp0,
> +      NULL, 0, OPTAB_DIRECT);
> + if (tmp0 && tmp1)
> +  expand_simple_binop (mode, MINUS, tmp1, tmp0,
> +       op0, 0, OPTAB_DIRECT);
> + break;
> +
> +      /* For 16-bit signed integer X, the best way to calculate the absolute
> + value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
> +      case V8HImode:
> + tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0);
> + if (tmp0)
> +  expand_simple_binop (mode, SMAX, op1, tmp0, op0, 0,
> +       OPTAB_DIRECT);
> + break;
> +
> +      /* For 8-bit signed integer X, the best way to calculate the absolute
> + value of X is min ((unsigned char) X, (unsigned char) (-X)),
> + as SSE2 provides the PMINUB insn.  */
> +      case V16QImode:
> + tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0);
> + if (tmp0)
> +  expand_simple_binop (V16QImode, UMIN, op1, tmp0, op0, 0,
> +       OPTAB_DIRECT);
> + break;
> +
> +      default:
> + break;
> +    }
> +}
> +
>  /* Expand an insert into a vector register through pinsr insn.
>     Return true if successful.  */
>
> diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
> index c3f6c94..46e1df4 100644
> --- a/gcc/config/i386/sse.md
> +++ b/gcc/config/i386/sse.md
> @@ -8721,7 +8721,7 @@
>     (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
>     (set_attr "mode" "DI")])
>
> -(define_insn "abs<mode>2"
> +(define_insn "*abs<mode>2"
>    [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v")
>   (abs:VI124_AVX2_48_AVX512F
>    (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))]
> @@ -8733,6 +8733,19 @@
>     (set_attr "prefix" "maybe_vex")
>     (set_attr "mode" "<sseinsnmode>")])
>
> +(define_expand "abs<mode>2"
> +  [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand")
> + (abs:VI124_AVX2_48_AVX512F
> +  (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand")))]
> +  "TARGET_SSE2"
> +{
> +  if (!TARGET_SSSE3)
> +    {
> +      ix86_expand_sse2_abs (operands[0], operands[1]);
> +      DONE;
> +    }
> +})
> +
>  (define_insn "abs<mode>2"
>    [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
>   (abs:MMXMODEI
> diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
> index 075d071..cf5b942 100644
> --- a/gcc/testsuite/ChangeLog
> +++ b/gcc/testsuite/ChangeLog
> @@ -1,3 +1,8 @@
> +2013-10-22  Cong Hou  <congh@google.com>
> +
> + PR target/58762
> + * gcc.dg/vect/pr58762.c: New test.
> +
>  2013-10-14  Tobias Burnus  <burnus@net-b.de>
>
>   PR fortran/58658
> diff --git a/gcc/testsuite/gcc.dg/vect/pr58762.c
> b/gcc/testsuite/gcc.dg/vect/pr58762.c
> new file mode 100644
> index 0000000..6468d0a
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/pr58762.c
> @@ -0,0 +1,28 @@
> +/* { dg-require-effective-target vect_int } */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -ftree-vectorize" } */
> +
> +void test1 (char* a, char* b)
> +{
> +  int i;
> +  for (i = 0; i < 10000; ++i)
> +    a[i] = abs (b[i]);
> +}
> +
> +void test2 (short* a, short* b)
> +{
> +  int i;
> +  for (i = 0; i < 10000; ++i)
> +    a[i] = abs (b[i]);
> +}
> +
> +void test3 (int* a, int* b)
> +{
> +  int i;
> +  for (i = 0; i < 10000; ++i)
> +    a[i] = abs (b[i]);
> +}
> +
> +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect"
> +       { target i?86-*-* x86_64-*-* } } } */
> +/* { dg-final { cleanup-tree-dump "vect" } } */
>
>
>
>
>
> On Wed, Oct 30, 2013 at 2:31 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
>> On Tue, Oct 29, 2013 at 6:18 PM, Cong Hou <congh@google.com> wrote:
>>
>>>>> For the define_expand I added as below, the else body is there to
>>>>> avoid fall-through transformations to ABS operation in optabs.c.
>>>>> Otherwise ABS will be converted to other operations even that we have
>>>>> corresponding instructions from SSSE3.
>>>>
>>>> No, it wont be.
>>>>
>>>> Fallthrough will generate the pattern that will be matched by the insn
>>>> pattern above, just like you are doing by hand below.
>>>
>>>
>>> I think the case is special for abs(). In optabs.c, there is a
>>> function expand_abs() in which the function expand_abs_nojump() is
>>> called. This function first tries the expand function defined for the
>>> target and if it fails it will try max(v, -v) then shift-xor-sub
>>> method. If I don't generate any instruction for SSSE3, the
>>> fall-through will be max(v, -v). I have tested it on my machine.
>>
>> I have tested the usual approach in i386.md, shown exactly by the
>> patch below (and using your other changes to i386.c):
>>
>> -- cut here --
>> Index: sse.md
>> ===================================================================
>> --- sse.md      (revision 204149)
>> +++ sse.md      (working copy)
>> @@ -10270,7 +10270,7 @@
>>     (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
>>     (set_attr "mode" "DI")])
>>
>> -(define_insn "abs<mode>2"
>> +(define_insn "*abs<mode>2"
>>    [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v")
>>         (abs:VI124_AVX2_48_AVX512F
>>           (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))]
>> @@ -10282,6 +10282,19 @@
>>     (set_attr "prefix" "maybe_vex")
>>     (set_attr "mode" "<sseinsnmode>")])
>>
>> +(define_expand "abs<mode>2"
>> +  [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand")
>> +       (abs:VI124_AVX2_48_AVX512F
>> +         (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand")))]
>> +  "TARGET_SSE2"
>> +{
>> +  if (!TARGET_SSSE3)
>> +    {
>> +      ix86_expand_sse2_abs (operands[0], operands[1]);
>> +      DONE;
>> +    }
>> +})
>> +
>>  (define_insn "abs<mode>2"
>>    [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
>>         (abs:MMXMODEI
>> -- cut here --
>>
>> using following testcase:
>>
>> --cut here--
>> #define N 32
>>
>> int ca[N];
>> int cb[N];
>>
>> void test1 (void)
>> {
>>   int i;
>>   for (i = 0; i < N; ++i)
>>     cb[i] = abs (ca[i]);
>> }
>> -- cut here --
>>
>> Compiling on x86_64-pc-linux-gnu target (inherently -msse2):
>>
>> ~/gcc-build-fast/gcc/cc1 -O2 -ftree-vectorize -dp t.c
>>
>> .L2:
>>         movdqa  ca(%rax), %xmm0 # 25    *movv4si_internal/2     [length = 8]
>>         addq    $16, %rax       # 30    *adddi_1/1      [length = 4]
>>         movdqa  ca-16(%rax), %xmm1      # 46    *movv4si_internal/2
>>  [length = 8]
>>         psrad   $31, %xmm0      # 26    ashrv4si3/1     [length = 5]
>>         pxor    %xmm0, %xmm1    # 47    *xorv4si3/1     [length = 4]
>>         psubd   %xmm0, %xmm1    # 28    *subv4si3/1     [length = 4]
>>         movaps  %xmm1, cb-16(%rax)      # 29    *movv4si_internal/3
>>  [length = 7]
>>         cmpq    $128, %rax      # 32    *cmpdi_1/1      [length = 6]
>>         jne     .L2     # 33    *jcc_1  [length = 2]
>>
>> ~/gcc-build-fast/gcc/cc1 -O2 -ftree-vectorize -mssse3 -dp t.c
>>
>> .L2:
>>         pabsd   ca(%rax), %xmm0 # 25    *absv4si2       [length = 9]
>>         addq    $16, %rax       # 27    *adddi_1/1      [length = 4]
>>         movaps  %xmm0, cb-16(%rax)      # 26    *movv4si_internal/3
>>  [length = 7]
>>         cmpq    $128, %rax      # 29    *cmpdi_1/1      [length = 6]
>>         jne     .L2     # 30    *jcc_1  [length = 2]
>>
>> As shown above, it worked OK for both with -mssse3 (generating pabsd
>> insn) and without (generating your V4SI sequence).
>>
>> Uros.
diff mbox

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 8a38316..84c7ab5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@ 
+2013-10-22  Cong Hou  <congh@google.com>
+
+	PR target/58762
+	* config/i386/i386-protos.h (ix86_expand_sse2_abs): New function.
+	* config/i386/i386.c (ix86_expand_sse2_abs): New function.
+	* config/i386/sse.md: Add SSE2 support to abs (8/16/32-bit-int).
+
 2013-10-14  David Malcolm  <dmalcolm@redhat.com>
 
 	* dumpfile.h (gcc::dump_manager): New class, to hold state
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 3ab2f3a..ca31224 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -238,6 +238,7 @@  extern void ix86_expand_mul_widen_evenodd (rtx, rtx, rtx, bool, bool);
 extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
 extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
+extern void ix86_expand_sse2_abs (rtx, rtx);
 
 /* In i386-c.c  */
 extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 02cbbbd..71905fc 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -41696,6 +41696,53 @@  ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
 		       gen_rtx_MULT (mode, op1, op2));
 }
 
+void
+ix86_expand_sse2_abs (rtx op0, rtx op1)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx tmp0, tmp1;
+
+  switch (mode)
+    {
+      /* For 32-bit signed integer X, the best way to calculate the absolute
+	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
+      case V4SImode:
+	tmp0 = expand_simple_binop (mode, ASHIFTRT, op1,
+				    GEN_INT (GET_MODE_BITSIZE
+						 (GET_MODE_INNER (mode)) - 1),
+				    NULL, 0, OPTAB_DIRECT);
+	if (tmp0)
+	  tmp1 = expand_simple_binop (mode, XOR, op1, tmp0,
+				      NULL, 0, OPTAB_DIRECT);
+	if (tmp0 && tmp1)
+	  expand_simple_binop (mode, MINUS, tmp1, tmp0,
+			       op0, 0, OPTAB_DIRECT);
+	break;
+
+      /* For 16-bit signed integer X, the best way to calculate the absolute
+	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
+      case V8HImode:
+	tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0);
+	if (tmp0)
+	  expand_simple_binop (mode, SMAX, op1, tmp0, op0, 0,
+			       OPTAB_DIRECT);
+	break;
+
+      /* For 8-bit signed integer X, the best way to calculate the absolute
+	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
+	 as SSE2 provides the PMINUB insn.  */
+      case V16QImode:
+	tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0);
+	if (tmp0)
+	  expand_simple_binop (V16QImode, UMIN, op1, tmp0, op0, 0,
+			       OPTAB_DIRECT);
+	break;
+
+      default:
+	break;
+    }
+}
+
 /* Expand an insert into a vector register through pinsr insn.
    Return true if successful.  */
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index c3f6c94..46e1df4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -8721,7 +8721,7 @@ 
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])
 
-(define_insn "abs<mode>2"
+(define_insn "*abs<mode>2"
   [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v")
 	(abs:VI124_AVX2_48_AVX512F
 	  (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))]
@@ -8733,6 +8733,19 @@ 
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_expand "abs<mode>2"
+  [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand")
+	(abs:VI124_AVX2_48_AVX512F
+	  (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand")))]
+  "TARGET_SSE2"
+{
+  if (!TARGET_SSSE3)
+    {
+      ix86_expand_sse2_abs (operands[0], operands[1]);
+      DONE;
+    }
+})
+
 (define_insn "abs<mode>2"
   [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
 	(abs:MMXMODEI
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 075d071..cf5b942 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,8 @@ 
+2013-10-22  Cong Hou  <congh@google.com>
+
+	PR target/58762
+	* gcc.dg/vect/pr58762.c: New test.
+
 2013-10-14  Tobias Burnus  <burnus@net-b.de>
 
 	PR fortran/58658
diff --git a/gcc/testsuite/gcc.dg/vect/pr58762.c b/gcc/testsuite/gcc.dg/vect/pr58762.c
new file mode 100644
index 0000000..6468d0a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/pr58762.c
@@ -0,0 +1,28 @@ 
+/* { dg-require-effective-target vect_int } */
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+
+void test1 (char* a, char* b)
+{
+  int i;
+  for (i = 0; i < 10000; ++i)
+    a[i] = abs (b[i]);
+}
+
+void test2 (short* a, short* b)
+{
+  int i;
+  for (i = 0; i < 10000; ++i)
+    a[i] = abs (b[i]);
+}
+
+void test3 (int* a, int* b)
+{
+  int i;
+  for (i = 0; i < 10000; ++i)
+    a[i] = abs (b[i]);
+}
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 3 "vect"
+       { target i?86-*-* x86_64-*-* } } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */