diff mbox

Vectorizing abs(char/short/int) on x86.

Message ID CAK=A3=3-qKTNUaYuvh4iUyufHvQ=kC24CaGEyQBkzJkvgbO_-w@mail.gmail.com
State New
Headers show

Commit Message

Cong Hou Oct. 30, 2013, 6:01 p.m. UTC
On Wed, Oct 30, 2013 at 10:22 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Wed, Oct 30, 2013 at 6:01 PM, Cong Hou <congh@google.com> wrote:
>> I found my problem: I put DONE outside of if not inside. You are
>> right. I have updated my patch.
>
> OK, great that we put things in order ;)
>
> Does this patch need some extra middle-end functionality? I was not
> able to vectorize char and short part of your patch.


In the original patch, I converted abs() on short and char values to
their own types by removing type casts. That is, originally char_val1
= abs(char_val2) will be converted to char_val1 = (char) abs((int)
char_val2) in the frontend, and I would like to convert it back to
char_val1 = abs(char_val2). But after several discussions, it seems
this conversion has some problems such as overflow converns, and I
thereby removed that part.

Now you should still be able to vectorize abs(char) and abs(short) but
with packing and unpacking. Later I will consider to write pattern
recognizer for abs(char) and abs(short) and then the expand on
abs(char)/abs(short) in this patch will be used during vectorization.


>
> Regarding the testcase - please put it to gcc.target/i386/ directory.
> There is nothing generic in the test, as confirmed by target-dependent
> scan test. You will find plenty of examples in the mentioned
> directory. I'd suggest to split the testcase in three files, and to
> simplify it to something like the testcase with global variables I
> used earlier.


I have done it. The test case is split into three for s8/s16/s32 in
gcc.target/i386.


Thank you!

Cong








>
> Modulo testcase, the patch is OK otherwise, but middle-end parts
> should be committed first.
>
> Thanks,
> Uros.
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 8a38316..84c7ab5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@
+2013-10-22  Cong Hou  <congh@google.com>
+
+	PR target/58762
+	* config/i386/i386-protos.h (ix86_expand_sse2_abs): New function.
+	* config/i386/i386.c (ix86_expand_sse2_abs): New function.
+	* config/i386/sse.md: Add SSE2 support to abs (8/16/32-bit-int).
+
 2013-10-14  David Malcolm  <dmalcolm@redhat.com>
 
 	* dumpfile.h (gcc::dump_manager): New class, to hold state
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 3ab2f3a..ca31224 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -238,6 +238,7 @@ extern void ix86_expand_mul_widen_evenodd (rtx, rtx, rtx, bool, bool);
 extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
 extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
+extern void ix86_expand_sse2_abs (rtx, rtx);
 
 /* In i386-c.c  */
 extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 02cbbbd..71905fc 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -41696,6 +41696,53 @@ ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
 		       gen_rtx_MULT (mode, op1, op2));
 }
 
+void
+ix86_expand_sse2_abs (rtx op0, rtx op1)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx tmp0, tmp1;
+
+  switch (mode)
+    {
+      /* For 32-bit signed integer X, the best way to calculate the absolute
+	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
+      case V4SImode:
+	tmp0 = expand_simple_binop (mode, ASHIFTRT, op1,
+				    GEN_INT (GET_MODE_BITSIZE
+						 (GET_MODE_INNER (mode)) - 1),
+				    NULL, 0, OPTAB_DIRECT);
+	if (tmp0)
+	  tmp1 = expand_simple_binop (mode, XOR, op1, tmp0,
+				      NULL, 0, OPTAB_DIRECT);
+	if (tmp0 && tmp1)
+	  expand_simple_binop (mode, MINUS, tmp1, tmp0,
+			       op0, 0, OPTAB_DIRECT);
+	break;
+
+      /* For 16-bit signed integer X, the best way to calculate the absolute
+	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
+      case V8HImode:
+	tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0);
+	if (tmp0)
+	  expand_simple_binop (mode, SMAX, op1, tmp0, op0, 0,
+			       OPTAB_DIRECT);
+	break;
+
+      /* For 8-bit signed integer X, the best way to calculate the absolute
+	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
+	 as SSE2 provides the PMINUB insn.  */
+      case V16QImode:
+	tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0);
+	if (tmp0)
+	  expand_simple_binop (V16QImode, UMIN, op1, tmp0, op0, 0,
+			       OPTAB_DIRECT);
+	break;
+
+      default:
+	break;
+    }
+}
+
 /* Expand an insert into a vector register through pinsr insn.
    Return true if successful.  */
 
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index c3f6c94..46e1df4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -8721,7 +8721,7 @@
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])
 
-(define_insn "abs<mode>2"
+(define_insn "*abs<mode>2"
   [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v")
 	(abs:VI124_AVX2_48_AVX512F
 	  (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))]
@@ -8733,6 +8733,19 @@
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "<sseinsnmode>")])
 
+(define_expand "abs<mode>2"
+  [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand")
+	(abs:VI124_AVX2_48_AVX512F
+	  (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand")))]
+  "TARGET_SSE2"
+{
+  if (!TARGET_SSSE3)
+    {
+      ix86_expand_sse2_abs (operands[0], operands[1]);
+      DONE;
+    }
+})
+
 (define_insn "abs<mode>2"
   [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
 	(abs:MMXMODEI
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 075d071..c6a20c7 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2013-10-22  Cong Hou  <congh@google.com>
+
+	* gcc.target/i386/vect-abs-s8.c: New test.
+	* gcc.target/i386/vect-abs-s16.c: New test.
+	* gcc.target/i386/vect-abs-s32.c: New test.
+
 2013-10-14  Tobias Burnus  <burnus@net-b.de>
 
 	PR fortran/58658
diff --git a/gcc/testsuite/gcc.target/i386/vect-abs-s16.c b/gcc/testsuite/gcc.target/i386/vect-abs-s16.c
new file mode 100644
index 0000000..191ae34
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-abs-s16.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse2 -mno-sse3 -fdump-tree-vect-details" } */
+
+
+void test (short* a, short* b)
+{
+  int i;
+  for (i = 0; i < 10000; ++i)
+    a[i] = abs (b[i]);
+}
+
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-abs-s32.c b/gcc/testsuite/gcc.target/i386/vect-abs-s32.c
new file mode 100644
index 0000000..575e8ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-abs-s32.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse2 -mno-sse3 -fdump-tree-vect-details" } */
+
+
+void test (int* a, int* b)
+{
+  int i;
+  for (i = 0; i < 10000; ++i)
+    a[i] = abs (b[i]);
+}
+
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-abs-s8.c b/gcc/testsuite/gcc.target/i386/vect-abs-s8.c
new file mode 100644
index 0000000..3f3f3fa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-abs-s8.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse2 -mno-sse3 -fdump-tree-vect-details" } */
+
+
+void test (char* a, char* b)
+{
+  int i;
+  for (i = 0; i < 10000; ++i)
+    a[i] = abs (b[i]);
+}
+
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */

Comments

Uros Bizjak Oct. 30, 2013, 6:13 p.m. UTC | #1
On Wed, Oct 30, 2013 at 7:01 PM, Cong Hou <congh@google.com> wrote:

>>> I found my problem: I put DONE outside of if not inside. You are
>>> right. I have updated my patch.
>>
>> OK, great that we put things in order ;)
>>
>> Does this patch need some extra middle-end functionality? I was not
>> able to vectorize char and short part of your patch.
>
>
> In the original patch, I converted abs() on short and char values to
> their own types by removing type casts. That is, originally char_val1
> = abs(char_val2) will be converted to char_val1 = (char) abs((int)
> char_val2) in the frontend, and I would like to convert it back to
> char_val1 = abs(char_val2). But after several discussions, it seems
> this conversion has some problems such as overflow converns, and I
> thereby removed that part.
>
> Now you should still be able to vectorize abs(char) and abs(short) but
> with packing and unpacking. Later I will consider to write pattern
> recognizer for abs(char) and abs(short) and then the expand on
> abs(char)/abs(short) in this patch will be used during vectorization.

OK, this seems reasonable. We already have "unused" SSSE3 8/16 bit abs
pattern, so I think we can commit SSE2 expanders, even if they will be
unused for now. The proposed recognizer will benefit SSE2 as well as
existing SSSE3 patterns.

>> Regarding the testcase - please put it to gcc.target/i386/ directory.
>> There is nothing generic in the test, as confirmed by target-dependent
>> scan test. You will find plenty of examples in the mentioned
>> directory. I'd suggest to split the testcase in three files, and to
>> simplify it to something like the testcase with global variables I
>> used earlier.
>
>
> I have done it. The test case is split into three for s8/s16/s32 in
> gcc.target/i386.

OK.

The patch is OK for mainline, but please check formatting and
whitespace before the patch is committed.

Thanks,
Uros.
Cong Hou Oct. 30, 2013, 8:02 p.m. UTC | #2
I have run check_GNU_style.sh on my patch.

The patch is submitted. Thank you for your comments and help on this patch!



thanks,
Cong


On Wed, Oct 30, 2013 at 11:13 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> On Wed, Oct 30, 2013 at 7:01 PM, Cong Hou <congh@google.com> wrote:
>
>>>> I found my problem: I put DONE outside of if not inside. You are
>>>> right. I have updated my patch.
>>>
>>> OK, great that we put things in order ;)
>>>
>>> Does this patch need some extra middle-end functionality? I was not
>>> able to vectorize char and short part of your patch.
>>
>>
>> In the original patch, I converted abs() on short and char values to
>> their own types by removing type casts. That is, originally char_val1
>> = abs(char_val2) will be converted to char_val1 = (char) abs((int)
>> char_val2) in the frontend, and I would like to convert it back to
>> char_val1 = abs(char_val2). But after several discussions, it seems
>> this conversion has some problems such as overflow converns, and I
>> thereby removed that part.
>>
>> Now you should still be able to vectorize abs(char) and abs(short) but
>> with packing and unpacking. Later I will consider to write pattern
>> recognizer for abs(char) and abs(short) and then the expand on
>> abs(char)/abs(short) in this patch will be used during vectorization.
>
> OK, this seems reasonable. We already have "unused" SSSE3 8/16 bit abs
> pattern, so I think we can commit SSE2 expanders, even if they will be
> unused for now. The proposed recognizer will benefit SSE2 as well as
> existing SSSE3 patterns.
>
>>> Regarding the testcase - please put it to gcc.target/i386/ directory.
>>> There is nothing generic in the test, as confirmed by target-dependent
>>> scan test. You will find plenty of examples in the mentioned
>>> directory. I'd suggest to split the testcase in three files, and to
>>> simplify it to something like the testcase with global variables I
>>> used earlier.
>>
>>
>> I have done it. The test case is split into three for s8/s16/s32 in
>> gcc.target/i386.
>
> OK.
>
> The patch is OK for mainline, but please check formatting and
> whitespace before the patch is committed.
>
> Thanks,
> Uros.
diff mbox

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 8a38316..84c7ab5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@ 
+2013-10-22  Cong Hou  <congh@google.com>
+
+ PR target/58762
+ * config/i386/i386-protos.h (ix86_expand_sse2_abs): New function.
+ * config/i386/i386.c (ix86_expand_sse2_abs): New function.
+ * config/i386/sse.md: Add SSE2 support to abs (8/16/32-bit-int).
+
 2013-10-14  David Malcolm  <dmalcolm@redhat.com>

  * dumpfile.h (gcc::dump_manager): New class, to hold state
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 3ab2f3a..ca31224 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -238,6 +238,7 @@  extern void ix86_expand_mul_widen_evenodd (rtx,
rtx, rtx, bool, bool);
 extern void ix86_expand_mul_widen_hilo (rtx, rtx, rtx, bool, bool);
 extern void ix86_expand_sse2_mulv4si3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
+extern void ix86_expand_sse2_abs (rtx, rtx);

 /* In i386-c.c  */
 extern void ix86_target_macros (void);
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 02cbbbd..71905fc 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -41696,6 +41696,53 @@  ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
        gen_rtx_MULT (mode, op1, op2));
 }

+void
+ix86_expand_sse2_abs (rtx op0, rtx op1)
+{
+  enum machine_mode mode = GET_MODE (op0);
+  rtx tmp0, tmp1;
+
+  switch (mode)
+    {
+      /* For 32-bit signed integer X, the best way to calculate the absolute
+ value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
+      case V4SImode:
+ tmp0 = expand_simple_binop (mode, ASHIFTRT, op1,
+    GEN_INT (GET_MODE_BITSIZE
+ (GET_MODE_INNER (mode)) - 1),
+    NULL, 0, OPTAB_DIRECT);
+ if (tmp0)
+  tmp1 = expand_simple_binop (mode, XOR, op1, tmp0,
+      NULL, 0, OPTAB_DIRECT);
+ if (tmp0 && tmp1)
+  expand_simple_binop (mode, MINUS, tmp1, tmp0,
+       op0, 0, OPTAB_DIRECT);
+ break;
+
+      /* For 16-bit signed integer X, the best way to calculate the absolute
+ value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
+      case V8HImode:
+ tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0);
+ if (tmp0)
+  expand_simple_binop (mode, SMAX, op1, tmp0, op0, 0,
+       OPTAB_DIRECT);
+ break;
+
+      /* For 8-bit signed integer X, the best way to calculate the absolute
+ value of X is min ((unsigned char) X, (unsigned char) (-X)),
+ as SSE2 provides the PMINUB insn.  */
+      case V16QImode:
+ tmp0 = expand_unop (mode, neg_optab, op1, NULL_RTX, 0);
+ if (tmp0)
+  expand_simple_binop (V16QImode, UMIN, op1, tmp0, op0, 0,
+       OPTAB_DIRECT);
+ break;
+
+      default:
+ break;
+    }
+}
+
 /* Expand an insert into a vector register through pinsr insn.
    Return true if successful.  */

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index c3f6c94..46e1df4 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -8721,7 +8721,7 @@ 
    (set (attr "prefix_rex") (symbol_ref "x86_extended_reg_mentioned_p (insn)"))
    (set_attr "mode" "DI")])

-(define_insn "abs<mode>2"
+(define_insn "*abs<mode>2"
   [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand" "=v")
  (abs:VI124_AVX2_48_AVX512F
   (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand" "vm")))]
@@ -8733,6 +8733,19 @@ 
    (set_attr "prefix" "maybe_vex")
    (set_attr "mode" "<sseinsnmode>")])

+(define_expand "abs<mode>2"
+  [(set (match_operand:VI124_AVX2_48_AVX512F 0 "register_operand")
+ (abs:VI124_AVX2_48_AVX512F
+  (match_operand:VI124_AVX2_48_AVX512F 1 "nonimmediate_operand")))]
+  "TARGET_SSE2"
+{
+  if (!TARGET_SSSE3)
+    {
+      ix86_expand_sse2_abs (operands[0], operands[1]);
+      DONE;
+    }
+})
+
 (define_insn "abs<mode>2"
   [(set (match_operand:MMXMODEI 0 "register_operand" "=y")
  (abs:MMXMODEI
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 075d071..c6a20c7 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@ 
+2013-10-22  Cong Hou  <congh@google.com>
+
+ * gcc.target/i386/vect-abs-s8.c: New test.
+ * gcc.target/i386/vect-abs-s16.c: New test.
+ * gcc.target/i386/vect-abs-s32.c: New test.
+
 2013-10-14  Tobias Burnus  <burnus@net-b.de>

  PR fortran/58658
diff --git a/gcc/testsuite/gcc.target/i386/vect-abs-s16.c
b/gcc/testsuite/gcc.target/i386/vect-abs-s16.c
new file mode 100644
index 0000000..191ae34
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-abs-s16.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse2 -mno-sse3
-fdump-tree-vect-details" } */
+
+
+void test (short* a, short* b)
+{
+  int i;
+  for (i = 0; i < 10000; ++i)
+    a[i] = abs (b[i]);
+}
+
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-abs-s32.c
b/gcc/testsuite/gcc.target/i386/vect-abs-s32.c
new file mode 100644
index 0000000..575e8ef
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-abs-s32.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse2 -mno-sse3
-fdump-tree-vect-details" } */
+
+
+void test (int* a, int* b)
+{
+  int i;
+  for (i = 0; i < 10000; ++i)
+    a[i] = abs (b[i]);
+}
+
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-abs-s8.c
b/gcc/testsuite/gcc.target/i386/vect-abs-s8.c
new file mode 100644
index 0000000..3f3f3fa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-abs-s8.c
@@ -0,0 +1,14 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msse2 -mno-sse3
-fdump-tree-vect-details" } */
+
+
+void test (char* a, char* b)
+{
+  int i;
+  for (i = 0; i < 10000; ++i)
+    a[i] = abs (b[i]);
+}
+
+
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
+/* { dg-final { cleanup-tree-dump "vect" } } */