[rs6000] Add _mm_blend_epi16 and _mm_blendv_epi8
diff mbox series

Message ID e16a24bd-1f33-de4c-6e2d-32c6f0c75a3d@us.ibm.com
State New
Headers show
Series
  • [rs6000] Add _mm_blend_epi16 and _mm_blendv_epi8
Related show

Commit Message

Paul A. Clarke July 20, 2019, 3:18 a.m. UTC
Add compatibility implementations of _mm_blend_epi16 and _mm_blendv_epi8
intrinsics.

Respective test cases are copied almost verbatim (minor changes to
the dejagnu head lines) from i386.

2019-07-19  Paul A. Clarke  <pc@us.ibm.com>

[gcc]

	* config/rs6000/smmintrin.h (_mm_blend_epi16): New.
	(_mm_blendv_epi8): New.

[gcc/testsuite]

	* gcc.target/powerpc/sse4_1-check.h: New.
	* gcc.target/powerpc/sse4_1-pblendvb.c: New.
	* gcc.target/powerpc/sse4_1-pblendw.c: New.
	* gcc.target/powerpc/sse4_1-pblendw-2.c: New.

Tested on 64bit LE, 64bit and 32bit BE.

OK for trunk?


PC

Comments

Segher Boessenkool July 21, 2019, 5:39 p.m. UTC | #1
Hi Paul,

All looks fine, okay for trunk.  Thanks!

Just some possible improvements:

On Fri, Jul 19, 2019 at 10:18:47PM -0500, Paul Clarke wrote:
> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))

Maybe all these terribly long lines would be better if they used a
macro?  Something defined in xmmintrin.h I guess, and just for the
attribute part?

> +_mm_blend_epi16 (__m128i __A, __m128i __B, const int __imm8)
> +{
> +  __v8hu __bitmask = vec_splats ((unsigned short) __imm8);
> +  const __v8hu __shifty = { 0, 1, 2, 3, 4, 5, 6, 7 };
> +  __bitmask = vec_sr (__bitmask, __shifty);
> +  const __v8hu __ones = vec_splats ((unsigned short) 0x0001);
> +  __bitmask = vec_and (__bitmask, __ones);
> +  const __v8hu __zero = {0};
> +  __bitmask = vec_sub (__zero, __bitmask);
> +  return (__m128i) vec_sel ((__v8hu) __A, (__v8hu) __B, __bitmask);
> +}

You can do a lot better than this, using vgbbd (that's vec_gb in
intrinsics).  It's probably nicest if you splat the __imm8 to all
bytes in a vector, then do the vgbbd, and then you can immediately
vec_sel with the result of that.

> +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask)
> +{
> +  const __v16qu __hibits = vec_splats ((unsigned char) 0x80);
> +  __v16qu __lmask = vec_and ((__v16qu) __mask, __hibits);
> +  const __v16qu __zero = {0};
> +  __lmask = (vector unsigned char) vec_cmpgt (__lmask, __zero);
> +  return (__m128i) vec_sel ((__v16qu) __A, (__v16qu) __B, __lmask);
> +}

Can you do this with just a vsrab / vec_sra?  Splat imm 7 to a vec,
sra by that?


Segher

Patch
diff mbox series

Index: gcc/config/rs6000/smmintrin.h
===================================================================
diff --git a/trunk/gcc/config/rs6000/smmintrin.h b/trunk/gcc/config/rs6000/smmintrin.h
--- a/trunk/gcc/config/rs6000/smmintrin.h	(revision 273615)
+++ b/trunk/gcc/config/rs6000/smmintrin.h	(working copy)
@@ -66,4 +66,27 @@  _mm_extract_ps (__m128 __X, const int __N)
   return ((__v4si)__X)[__N & 3];
 }
 
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_epi16 (__m128i __A, __m128i __B, const int __imm8)
+{ /* Per 16-bit lane i: take the lane from __B when bit i of __imm8 is set, else from __A.  */
+  __v8hu __bitmask = vec_splats ((unsigned short) __imm8); /* Copy __imm8 into every lane.  */
+  const __v8hu __shifty = { 0, 1, 2, 3, 4, 5, 6, 7 };
+  __bitmask = vec_sr (__bitmask, __shifty); /* Lane i >>= i, moving bit i of __imm8 to bit 0.  */
+  const __v8hu __ones = vec_splats ((unsigned short) 0x0001);
+  __bitmask = vec_and (__bitmask, __ones); /* Keep only bit 0 of each lane.  */
+  const __v8hu __zero = {0};
+  __bitmask = vec_sub (__zero, __bitmask); /* 0 - bit: 0 stays 0, 1 wraps to all ones.  */
+  return (__m128i) vec_sel ((__v8hu) __A, (__v8hu) __B, __bitmask); /* __B where mask bits are 1.  */
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask)
+{ /* Per byte: take the byte from __B when the top bit of the __mask byte is set, else from __A.  */
+  const __v16qu __hibits = vec_splats ((unsigned char) 0x80);
+  __v16qu __lmask = vec_and ((__v16qu) __mask, __hibits); /* Isolate the 0x80 bit of each byte.  */
+  const __v16qu __zero = {0};
+  __lmask = (vector unsigned char) vec_cmpgt (__lmask, __zero); /* All ones where 0x80 was set.  */
+  return (__m128i) vec_sel ((__v16qu) __A, (__v16qu) __B, __lmask); /* __B where mask bits are 1.  */
+}
+
 #endif
Index: gcc/testsuite/gcc.target/powerpc/sse4_1-check.h
===================================================================
diff --git a/trunk/gcc/testsuite/gcc.target/powerpc/sse4_1-check.h b/trunk/gcc/testsuite/gcc.target/powerpc/sse4_1-check.h
new file mode 10644
--- /dev/null	(revision 0)
+++ b/trunk/gcc/testsuite/gcc.target/powerpc/sse4_1-check.h	(working copy)
@@ -0,0 +1,27 @@ 
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "m128-check.h"
+
+//#define DEBUG 1
+
+#define TEST sse4_1_test
+
+static void sse4_1_test (void);
+
+static void  /* Non-inlined wrapper that runs the sse4_1_test body supplied by the including test.  */
+__attribute__ ((noinline))
+do_test (void)
+{
+  sse4_1_test ();
+}
+
+int  /* Test entry point: run the test once and return 0.  */
+main ()
+{
+  do_test ();
+#ifdef DEBUG
+  printf ("PASSED\n"); /* Only compiled in when DEBUG is defined.  */
+#endif
+  return 0;
+}
Index: gcc/testsuite/gcc.target/powerpc/sse4_1-pblendvb.c
===================================================================
diff --git a/trunk/gcc/testsuite/gcc.target/powerpc/sse4_1-pblendvb.c b/trunk/gcc/testsuite/gcc.target/powerpc/sse4_1-pblendvb.c
new file mode 10644
--- /dev/null	(revision 0)
+++ b/trunk/gcc/testsuite/gcc.target/powerpc/sse4_1-pblendvb.c	(working copy)
@@ -0,0 +1,71 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mpower8-vector -Wno-psabi" } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+static void  /* Fill NUM * 16 bytes of src1, src2 and mask with deterministic test patterns.  */
+init_pblendvb (unsigned char *src1, unsigned char *src2,
+	       unsigned char *mask)
+{
+  int i, sign = 1; 
+
+  for (i = 0; i < NUM * 16; i++)
+    {
+      src1[i] = i* i * sign; /* The int result is converted to unsigned char on store.  */
+      src2[i] = (i + 20) * sign;
+      mask[i] = (i % 3) + ((i * (14 + sign))
+			   ^ (src1[i] | src2[i] | (i*3)));
+      sign = -sign; /* Alternate between +1 and -1 on each element.  */
+    }
+}
+
+static int  /* Return 0 when the 16 bytes at dst equal the expected blend, nonzero otherwise.  */
+check_pblendvb (__m128i *dst, unsigned char *src1,
+		unsigned char *src2, unsigned char *mask)
+{
+  unsigned char tmp[16];
+  int j;
+
+  memcpy (&tmp[0], src1, sizeof (tmp)); /* Expected result starts as a copy of src1.  */
+  for (j = 0; j < 16; j++)
+    if (mask [j] & 0x80) /* A set top bit in the mask byte selects the src2 byte.  */
+      tmp[j] = src2[j];
+
+  return memcmp (dst, &tmp[0], sizeof (tmp));
+}
+
+static void  /* Run _mm_blendv_epi8 on NUM vectors and abort on the first mismatch.  */
+TEST (void)
+{
+  union /* Each buffer can be viewed as NUM vectors or as NUM * 16 bytes.  */
+    {
+      __m128i x[NUM];
+      unsigned char c[NUM * 16];
+    } dst, src1, src2, mask;
+  int i;
+
+  init_pblendvb (src1.c, src2.c, mask.c);
+
+  for (i = 0; i < NUM; i++)
+    {
+      dst.x[i] = _mm_blendv_epi8 (src1.x[i], src2.x[i], mask.x[i]);
+      if (check_pblendvb (&dst.x[i], &src1.c[i * 16], &src2.c[i * 16], /* Nonzero means mismatch.  */
+			  &mask.c[i * 16]))
+	abort ();
+    }
+}
Index: gcc/testsuite/gcc.target/powerpc/sse4_1-pblendw-2.c
===================================================================
diff --git a/trunk/gcc/testsuite/gcc.target/powerpc/sse4_1-pblendw-2.c b/trunk/gcc/testsuite/gcc.target/powerpc/sse4_1-pblendw-2.c
new file mode 10644
--- /dev/null	(revision 0)
+++ b/trunk/gcc/testsuite/gcc.target/powerpc/sse4_1-pblendw-2.c	(working copy)
@@ -0,0 +1,80 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mpower8-vector -Wno-psabi" } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include "sse4_1-check.h"
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+#undef MASK
+#define MASK 0xfe
+
+static void  /* Fill NUM * 8 shorts of src1 and src2 with deterministic, sign-alternating values.  */
+init_pblendw (short *src1, short *src2)
+{
+  int i, sign = 1;
+
+  for (i = 0; i < NUM * 8; i++)
+    {
+      src1[i] = i * i * sign;
+      src2[i] = (i + 20) * sign;
+      sign = -sign; /* Alternate between +1 and -1 on each element.  */
+    }
+}
+
+static int  /* Return 0 when the 8 shorts at dst equal the expected blend, nonzero otherwise.  */
+check_pblendw (__m128i *dst, short *src1, short *src2)
+{
+  short tmp[8];
+  int j;
+
+  memcpy (&tmp[0], src1, sizeof (tmp)); /* Expected result starts as a copy of src1.  */
+  for (j = 0; j < 8; j++)
+    if ((MASK & (1 << j))) /* Bit j of MASK selects the src2 value for element j.  */
+      tmp[j] = src2[j];
+
+  return memcmp (dst, &tmp[0], sizeof (tmp));
+}
+
+static void  /* Exercise _mm_blend_epi16 with MASK and abort on any mismatch.  */
+sse4_1_test (void)
+{
+  __m128i x, y;
+  union /* Each buffer can be viewed as NUM vectors or as NUM * 8 shorts.  */
+    {
+      __m128i x[NUM];
+      short s[NUM * 8];
+    } dst, src1, src2;
+  union
+    {
+      __m128i x;
+      short s[8];
+    } src3;
+  int i;
+
+  init_pblendw (src1.s, src2.s);
+
+  /* Check pblendw imm8, m128, xmm */
+  for (i = 0; i < NUM; i++)
+    {
+      dst.x[i] = _mm_blend_epi16 (src1.x[i], src2.x[i], MASK); 
+      if (check_pblendw (&dst.x[i], &src1.s[i * 8], &src2.s[i * 8]))
+	abort ();
+    }
+    
+   /* Check pblendw imm8, xmm, xmm */
+  src3.x = _mm_setzero_si128 ();
+
+  x = _mm_blend_epi16 (dst.x[2], src3.x, MASK); /* dst.x[2] as the first operand.  */
+  y = _mm_blend_epi16 (src3.x, dst.x[2], MASK); /* Same inputs with the operands swapped.  */
+
+  if (check_pblendw (&x, &dst.s[16], &src3.s[0])) /* dst.s[16] overlays dst.x[2] at 8 shorts per vector.  */
+    abort ();
+
+  if (check_pblendw (&y, &src3.s[0], &dst.s[16]))
+    abort ();
+}
Index: gcc/testsuite/gcc.target/powerpc/sse4_1-pblendw.c
===================================================================
diff --git a/trunk/gcc/testsuite/gcc.target/powerpc/sse4_1-pblendw.c b/trunk/gcc/testsuite/gcc.target/powerpc/sse4_1-pblendw.c
new file mode 10644
--- /dev/null	(revision 0)
+++ b/trunk/gcc/testsuite/gcc.target/powerpc/sse4_1-pblendw.c	(working copy)
@@ -0,0 +1,89 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mpower8-vector -Wno-psabi" } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+#ifndef MASK
+#define MASK 0x0f
+#endif
+
+static void  /* Fill NUM * 8 shorts of src1 and src2 with deterministic, sign-alternating values.  */
+init_pblendw (short *src1, short *src2)
+{
+  int i, sign = 1;
+
+  for (i = 0; i < NUM * 8; i++)
+    {
+      src1[i] = i * i * sign;
+      src2[i] = (i + 20) * sign;
+      sign = -sign; /* Alternate between +1 and -1 on each element.  */
+    }
+}
+
+static int  /* Return 0 when the 8 shorts at dst equal the expected blend, nonzero otherwise.  */
+check_pblendw (__m128i *dst, short *src1, short *src2)
+{
+  short tmp[8];
+  int j;
+
+  memcpy (&tmp[0], src1, sizeof (tmp)); /* Expected result starts as a copy of src1.  */
+  for (j = 0; j < 8; j++)
+    if ((MASK & (1 << j))) /* Bit j of MASK selects the src2 value for element j.  */
+      tmp[j] = src2[j];
+
+  return memcmp (dst, &tmp[0], sizeof (tmp));
+}
+
+static void  /* Exercise _mm_blend_epi16 with MASK and abort on any mismatch.  */
+TEST (void)
+{
+  __m128i x, y;
+  union /* Each buffer can be viewed as NUM vectors or as NUM * 8 shorts.  */
+    {
+      __m128i x[NUM];
+      short s[NUM * 8];
+    } dst, src1, src2;
+  union
+    {
+      __m128i x;
+      short s[8];
+    } src3;
+  int i;
+
+  init_pblendw (src1.s, src2.s);
+
+  /* Check pblendw imm8, m128, xmm */
+  for (i = 0; i < NUM; i++)
+    {
+      dst.x[i] = _mm_blend_epi16 (src1.x[i], src2.x[i], MASK); 
+      if (check_pblendw (&dst.x[i], &src1.s[i * 8], &src2.s[i * 8]))
+	abort ();
+    }
+    
+   /* Check pblendw imm8, xmm, xmm */
+  src3.x = _mm_setzero_si128 ();
+
+  x = _mm_blend_epi16 (dst.x[2], src3.x, MASK); /* dst.x[2] as the first operand.  */
+  y = _mm_blend_epi16 (src3.x, dst.x[2], MASK); /* Same inputs with the operands swapped.  */
+
+  if (check_pblendw (&x, &dst.s[16], &src3.s[0])) /* dst.s[16] overlays dst.x[2] at 8 shorts per vector.  */
+    abort ();
+
+  if (check_pblendw (&y, &src3.s[0], &dst.s[16]))
+    abort ();
+}