diff mbox series

[rs6000] Add support for usadv16qi and usadv8hi standard patterns

Message ID 2a3e7921-c2ee-ff57-677a-f84becc0f002@linux.vnet.ibm.com
State New
Headers show
Series [rs6000] Add support for usadv16qi and usadv8hi standard patterns | expand

Commit Message

Bill Schmidt Nov. 6, 2017, 12:25 a.m. UTC
Hi,

This patch adds support for vectorization of unsigned SAD expressions.  SAD
vectorization uses the usad<mode> pattern to represent a widening accumulation
of SADs performed on a narrower type.  The two cases in this patch operate
on V16QImode and V8HImode, respectively, accumulating into V4SImode.  A
vectorized loop on SAD operations will use these patterns in the main loop
body and perform a final reduction to sum the 4 accumulated results in the
V4SImode accumulator during the loop epilogue.

POWER's sum-across ops (vsum4ubs and vsum4shs) unfortunately have saturating
semantics, so they can only be used for the sum-across; the accumulation
with previous iteration results requires a separate add.

Bootstrapped and tested on powerpc64le-linux-gnu for POWER8 and POWER9
subtargets with no regressions.  Is this ok for trunk?

Thanks,
Bill


[gcc]

2017-11-05  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* config/rs6000/altivec.md (*p9_vadu<mode>3): Rename to
	p9_vadu<mode>3.
	(usadv16qi): New define_expand.
	(usadv8hi): New define_expand.

[gcc/testsuite]

2017-11-05  Bill Schmidt  <wschmidt@linux.vnet.ibm.com>

	* gcc.target/powerpc/sad-vectorize-1.c: New file.
	* gcc.target/powerpc/sad-vectorize-2.c: New file.
	* gcc.target/powerpc/sad-vectorize-3.c: New file.
	* gcc.target/powerpc/sad-vectorize-4.c: New file.

Comments

Segher Boessenkool Nov. 6, 2017, 10:17 a.m. UTC | #1
Hi Bill,

On Sun, Nov 05, 2017 at 06:25:11PM -0600, Bill Schmidt wrote:
> This patch adds support for vectorization of unsigned SAD expressions.  SAD
> vectorization uses the usad<mode> pattern to represent a widening accumulation
> of SADs performed on a narrower type.  The two cases in this patch operate
> on V16QImode and V8HImode, respectively, accumulating into V4SImode.  A
> vectorized loop on SAD operations will use these patterns in the main loop
> body and perform a final reduction to sum the 4 accumulated results in the
> V4SImode accumulator during the loop epilogue.
> 
> POWER's sum-across ops (vsum4ubs and vsum4shs) unfortunately have saturating
> semantics, so they can only be used for the sum-across; the accumulation
> with previous iteration results requires a separate add.

> @@ -4184,6 +4184,51 @@
>    "vbpermd %0,%1,%2"
>    [(set_attr "type" "vecsimple")])
>  
> +;; Support for SAD (sum of absolute differences).
> +
> +;; Due to saturating semantics, we can't combine the sum-across
> +;; with the vector accumulate in vsum4ubs.  A vadduwm is needed.
> +(define_expand "usadv16qi"
> +  [(use (match_operand:V4SI 0 "register_operand"))
> +   (use (match_operand:V16QI 1 "register_operand"))
> +   (use (match_operand:V16QI 2 "register_operand"))
> +   (use (match_operand:V4SI 3 "register_operand"))]
> +  "TARGET_P9_VECTOR"
> +  "
> +{
> +  rtx absd = gen_reg_rtx (V16QImode);
> +  rtx zero = gen_reg_rtx (V4SImode);
> +  rtx psum = gen_reg_rtx (V4SImode);
> +
> +  emit_insn (gen_p9_vaduv16qi3 (absd, operands[1], operands[2]));
> +  emit_insn (gen_altivec_vspltisw (zero, const0_rtx));
> +  emit_insn (gen_altivec_vsum4ubs (psum, absd, zero));
> +  emit_insn (gen_addv4si3 (operands[0], psum, operands[3]));
> +  DONE;
> +}")

No quotes around the {} block please (twice).

Other than that, looks fine to me, please commit.  Thanks,


Segher
Bill Schmidt Nov. 6, 2017, 1:48 p.m. UTC | #2
On Nov 6, 2017, at 4:17 AM, Segher Boessenkool <segher@kernel.crashing.org> wrote:
> 
> Hi Bill,
> 
> No quotes around the {} block please (twice).

Whoops.  I know better; copied from a bad example and missed it.
> 
> Other than that, looks fine to me, please commit.  Thanks,
> 
Thanks for the review!  r254453.

Bill
> 
> Segher
>
diff mbox series

Patch

Index: gcc/config/rs6000/altivec.md
===================================================================
--- gcc/config/rs6000/altivec.md	(revision 254428)
+++ gcc/config/rs6000/altivec.md	(working copy)
@@ -4020,7 +4020,7 @@ 
   "TARGET_P9_VECTOR")
 
 ;; Vector absolute difference unsigned
-(define_insn "*p9_vadu<mode>3"
+(define_insn "p9_vadu<mode>3"
   [(set (match_operand:VI 0 "register_operand" "=v")
         (unspec:VI [(match_operand:VI 1 "register_operand" "v")
 		    (match_operand:VI 2 "register_operand" "v")]
@@ -4184,6 +4184,55 @@ 
   "vbpermd %0,%1,%2"
   [(set_attr "type" "vecsimple")])
 
+;; Support for SAD (sum of absolute differences).
+
+;; usadv16qi: set operand 0 to operand 3 plus the sum-across (vsum4ubs)
+;; of the unsigned absolute differences of operands 1 and 2.
+;; Due to saturating semantics, we can't combine the sum-across
+;; with the vector accumulate in vsum4ubs.  A vadduwm is needed.
+(define_expand "usadv16qi"
+  [(use (match_operand:V4SI 0 "register_operand"))
+   (use (match_operand:V16QI 1 "register_operand"))
+   (use (match_operand:V16QI 2 "register_operand"))
+   (use (match_operand:V4SI 3 "register_operand"))]
+  "TARGET_P9_VECTOR"
+{
+  rtx absd = gen_reg_rtx (V16QImode);
+  rtx zero = gen_reg_rtx (V4SImode);
+  rtx psum = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_p9_vaduv16qi3 (absd, operands[1], operands[2]));
+  emit_insn (gen_altivec_vspltisw (zero, const0_rtx));
+  emit_insn (gen_altivec_vsum4ubs (psum, absd, zero));
+  emit_insn (gen_addv4si3 (operands[0], psum, operands[3]));
+  DONE;
+})
+
+;; usadv8hi: set operand 0 to operand 3 plus the sum-across (vsum4shs)
+;; of the unsigned absolute differences of operands 1 and 2.
+;; Since vsum4shs is saturating and further performs signed
+;; arithmetic, we can't combine the sum-across with the vector
+;; accumulate in vsum4shs.  A vadduwm is needed.
+;; NOTE(review): vsum4shs sums its halfwords as signed values; confirm
+;; that absolute differences of 0x8000 or more are handled correctly.
+(define_expand "usadv8hi"
+  [(use (match_operand:V4SI 0 "register_operand"))
+   (use (match_operand:V8HI 1 "register_operand"))
+   (use (match_operand:V8HI 2 "register_operand"))
+   (use (match_operand:V4SI 3 "register_operand"))]
+  "TARGET_P9_VECTOR"
+{
+  rtx absd = gen_reg_rtx (V8HImode);
+  rtx zero = gen_reg_rtx (V4SImode);
+  rtx psum = gen_reg_rtx (V4SImode);
+
+  emit_insn (gen_p9_vaduv8hi3 (absd, operands[1], operands[2]));
+  emit_insn (gen_altivec_vspltisw (zero, const0_rtx));
+  emit_insn (gen_altivec_vsum4shs (psum, absd, zero));
+  emit_insn (gen_addv4si3 (operands[0], psum, operands[3]));
+  DONE;
+})
+
 ;; Decimal Integer operations
 (define_int_iterator UNSPEC_BCD_ADD_SUB [UNSPEC_BCDADD UNSPEC_BCDSUB])
 
Index: gcc/testsuite/gcc.target/powerpc/sad-vectorize-1.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/sad-vectorize-1.c	(nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/sad-vectorize-1.c	(working copy)
@@ -0,0 +1,36 @@ 
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mcpu=power9" } */
+
+/* Verify that we vectorize this SAD loop using vabsdub. */
+
+extern int abs (int __x) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__));
+
+static int  /* Sum of abs (w[b] - x[b]) over 16 rows of 16 bytes.  */
+foo (unsigned char *w, int i, unsigned char *x, int j)
+{
+  int tot = 0;
+  for (int a = 0; a < 16; a++)  /* 16 rows.  */
+    {
+      for (int b = 0; b < 16; b++)  /* 16 elements per row.  */
+	tot += abs (w[b] - x[b]);
+      w += i;  /* Each pointer advances by its own row stride.  */
+      x += j;
+    }
+  return tot;
+}
+
+void  /* Non-static entry point; stores foo's result through RESULT.  */
+bar (unsigned char *w, unsigned char *x, int i, int *result)
+{
+  *result = foo (w, 16, x, i);  /* W's stride is fixed at 16; X's is I.  */
+}
+
+/* { dg-final { scan-assembler-times "vabsdub" 16 } } */
+/* { dg-final { scan-assembler-times "vsum4ubs" 16 } } */
+/* { dg-final { scan-assembler-times "vadduwm" 17 } } */
+
+/* Note: One of the 16 adds is optimized out (add with zero),
+   leaving 15.  The extra two adds are for the final reduction.  */
Index: gcc/testsuite/gcc.target/powerpc/sad-vectorize-2.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/sad-vectorize-2.c	(nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/sad-vectorize-2.c	(working copy)
@@ -0,0 +1,36 @@ 
+/* { dg-do compile { target { powerpc*-*-* } } } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-skip-if "" { powerpc*-*-aix* } } */
+/* { dg-options "-O3 -mcpu=power9" } */
+
+/* Verify that we vectorize this SAD loop using vabsduh. */
+
+extern int abs (int __x) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__));
+
+static int  /* Sum of abs (w[b] - x[b]) over 16 rows of 8 halfwords.  */
+foo (unsigned short *w, int i, unsigned short *x, int j)
+{
+  int tot = 0;
+  for (int a = 0; a < 16; a++)  /* 16 rows.  */
+    {
+      for (int b = 0; b < 8; b++)  /* 8 elements per row.  */
+	tot += abs (w[b] - x[b]);
+      w += i;  /* Each pointer advances by its own row stride.  */
+      x += j;
+    }
+  return tot;
+}
+
+void  /* Non-static entry point; stores foo's result through RESULT.  */
+bar (unsigned short *w, unsigned short *x, int i, int *result)
+{
+  *result = foo (w, 8, x, i);  /* W's stride is fixed at 8; X's is I.  */
+}
+
+/* { dg-final { scan-assembler-times "vabsduh" 16 } } */
+/* { dg-final { scan-assembler-times "vsum4shs" 16 } } */
+/* { dg-final { scan-assembler-times "vadduwm" 17 } } */
+
+/* Note: One of the 16 adds is optimized out (add with zero),
+   leaving 15.  The extra two adds are for the final reduction.  */
Index: gcc/testsuite/gcc.target/powerpc/sad-vectorize-3.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/sad-vectorize-3.c	(nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/sad-vectorize-3.c	(working copy)
@@ -0,0 +1,57 @@ 
+/* { dg-do run { target { powerpc*-*-linux* && { lp64 && p9vector_hw } } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O3 -mcpu=power9" } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
+
+/* Verify that we get correct code when we vectorize this SAD loop using
+   vabsdub. */
+
+extern void abort ();
+extern int abs (int __x) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__));
+
+static int  /* Sum of abs (w[b] - x[b]) over 16 rows of 16 bytes.  */
+foo (unsigned char *w, int i, unsigned char *x, int j)
+{
+  int tot = 0;
+  for (int a = 0; a < 16; a++)  /* 16 rows.  */
+    {
+      for (int b = 0; b < 16; b++)  /* 16 elements per row.  */
+	tot += abs (w[b] - x[b]);
+      w += i;  /* Each pointer advances by its own row stride.  */
+      x += j;
+    }
+  return tot;
+}
+
+void  /* Non-static entry point; stores foo's result through RESULT.  */
+bar (unsigned char *w, unsigned char *x, int i, int *result)
+{
+  *result = foo (w, 16, x, i);  /* W's stride is fixed at 16; X's is I.  */
+}
+
+int  /* Fill two 256-byte arrays, run the SAD, and check the total.  */
+main ()
+{
+  unsigned char m[256];
+  unsigned char n[256];
+  int sum, i;
+
+  for (i = 0; i < 256; ++i)  /* The fill pattern repeats every 8 elements.  */
+    if (i % 2 == 0)
+      {
+	m[i] = (i % 8) * 2 + 1;
+	n[i] = -(i % 8);  /* Negative values wrap on conversion.  */
+      }
+    else
+      {
+	m[i] = -((i % 8) * 2 + 2);
+	n[i] = -((i % 8) >> 1);
+      }
+  
+  bar (m, n, 16, &sum);
+
+  if (sum != 32384)  /* 32 repeats of a pattern whose SAD is 1012.  */
+    abort ();
+
+  return 0;
+}
Index: gcc/testsuite/gcc.target/powerpc/sad-vectorize-4.c
===================================================================
--- gcc/testsuite/gcc.target/powerpc/sad-vectorize-4.c	(nonexistent)
+++ gcc/testsuite/gcc.target/powerpc/sad-vectorize-4.c	(working copy)
@@ -0,0 +1,57 @@ 
+/* { dg-do run { target { powerpc*-*-linux* && { lp64 && p9vector_hw } } } } */
+/* { dg-require-effective-target powerpc_p9vector_ok } */
+/* { dg-options "-O3 -mcpu=power9" } */
+/* { dg-skip-if "do not override -mcpu" { powerpc*-*-* } { "-mcpu=*" } { "-mcpu=power9" } } */
+
+/* Verify that we get correct code when we vectorize this SAD loop using
+   vabsduh. */
+
+extern void abort ();
+extern int abs (int __x) __attribute__ ((__nothrow__, __leaf__)) __attribute__ ((__const__));
+
+static int  /* Sum of abs (w[b] - x[b]) over 16 rows of 8 halfwords.  */
+foo (unsigned short *w, int i, unsigned short *x, int j)
+{
+  int tot = 0;
+  for (int a = 0; a < 16; a++)  /* 16 rows.  */
+    {
+      for (int b = 0; b < 8; b++)  /* 8 elements per row.  */
+	tot += abs (w[b] - x[b]);
+      w += i;  /* Each pointer advances by its own row stride.  */
+      x += j;
+    }
+  return tot;
+}
+
+void  /* Non-static entry point; stores foo's result through RESULT.  */
+bar (unsigned short *w, unsigned short *x, int i, int *result)
+{
+  *result = foo (w, 8, x, i);  /* W's stride is fixed at 8; X's is I.  */
+}
+
+int  /* Fill two 128-halfword arrays, run the SAD, and check the total.  */
+main ()
+{
+  unsigned short m[128];
+  unsigned short n[128];
+  int sum, i;
+
+  for (i = 0; i < 128; ++i)  /* The fill pattern repeats every 8 elements.  */
+    if (i % 2 == 0)
+      {
+	m[i] = (i % 8) * 2 + 1;
+	n[i] = i % 8;
+      }
+    else
+      {
+	m[i] = (i % 8) * 4 - 3;
+	n[i] = (i % 8) >> 1;
+      }
+  
+  bar (m, n, 8, &sum);
+
+  if (sum != 992)  /* 16 repeats of a pattern whose SAD is 62.  */
+    abort ();
+
+  return 0;
+}