diff mbox series

[v3,5/6] rs6000: Support more SSE4 "cmp", "mul", "pack" intrinsics

Message ID 20210823190310.1679905-6-pc@us.ibm.com
State New
Headers show
Series rs6000: Support more SSE4 intrinsics | expand

Commit Message

Paul A. Clarke Aug. 23, 2021, 7:03 p.m. UTC
Function signatures and decorations match gcc/config/i386/smmintrin.h.

Also, copy tests for:
- _mm_cmpeq_epi64
- _mm_mullo_epi32, _mm_mul_epi32
- _mm_packus_epi32
- _mm_cmpgt_epi64 (SSE4.2)

from gcc/testsuite/gcc.target/i386.

2021-08-23  Paul A. Clarke  <pc@us.ibm.com>

gcc
	* config/rs6000/smmintrin.h (_mm_cmpeq_epi64, _mm_cmpgt_epi64,
	_mm_mullo_epi32, _mm_mul_epi32, _mm_packus_epi32): New.
	* config/rs6000/nmmintrin.h: Copy from i386, tweak to suit.

gcc/testsuite
	* gcc.target/powerpc/pr78102.c: Copy from gcc.target/i386,
	adjust dg directives to suit.
	* gcc.target/powerpc/sse4_1-packusdw.c: Same.
	* gcc.target/powerpc/sse4_1-pcmpeqq.c: Same.
	* gcc.target/powerpc/sse4_1-pmuldq.c: Same.
	* gcc.target/powerpc/sse4_1-pmulld.c: Same.
	* gcc.target/powerpc/sse4_2-pcmpgtq.c: Same.
	* gcc.target/powerpc/sse4_2-check.h: Copy from gcc.target/i386,
	tweak to suit.
---
v3:
- Add nmmintrin.h. _mm_cmpgt_epi64 is part of SSE4.2, which is
  ostensibly defined in nmmintrin.h. Following the i386 implementation,
  however, nmmintrin.h only includes smmintrin.h, and the actual
  implementations appear there.
- Add sse4_2-check.h, required by sse4_2-pcmpgtq.c. My testing was
  obviously inadequate.
v2:
- Added "extern" to functions to maintain compatible decorations with
  like implementations in gcc/config/i386.
- Removed "-Wno-psabi" from tests as unnecessary, per v1 review.
- Noted testing in patch series cover letter.

 gcc/config/rs6000/nmmintrin.h                 | 40 ++++++++++
 gcc/config/rs6000/smmintrin.h                 | 41 +++++++++++
 gcc/testsuite/gcc.target/powerpc/pr78102.c    | 23 ++++++
 .../gcc.target/powerpc/sse4_1-packusdw.c      | 73 +++++++++++++++++++
 .../gcc.target/powerpc/sse4_1-pcmpeqq.c       | 46 ++++++++++++
 .../gcc.target/powerpc/sse4_1-pmuldq.c        | 51 +++++++++++++
 .../gcc.target/powerpc/sse4_1-pmulld.c        | 46 ++++++++++++
 .../gcc.target/powerpc/sse4_2-check.h         | 18 +++++
 .../gcc.target/powerpc/sse4_2-pcmpgtq.c       | 46 ++++++++++++
 9 files changed, 384 insertions(+)
 create mode 100644 gcc/config/rs6000/nmmintrin.h
 create mode 100644 gcc/testsuite/gcc.target/powerpc/pr78102.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c
 create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_2-check.h
 create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c

Comments

Bill Schmidt via Gcc-patches Aug. 27, 2021, 3:21 p.m. UTC | #1
Hi Paul,

On 8/23/21 2:03 PM, Paul A. Clarke wrote:
> Function signatures and decorations match gcc/config/i386/smmintrin.h.
>
> Also, copy tests for:
> - _mm_cmpeq_epi64
> - _mm_mullo_epi32, _mm_mul_epi32
> - _mm_packus_epi32
> - _mm_cmpgt_epi64 (SSE4.2)
>
> from gcc/testsuite/gcc.target/i386.
>
> 2021-08-23  Paul A. Clarke  <pc@us.ibm.com>
>
> gcc
> 	* config/rs6000/smmintrin.h (_mm_cmpeq_epi64, _mm_cmpgt_epi64,
> 	_mm_mullo_epi32, _mm_mul_epi32, _mm_packus_epi32): New.
> 	* config/rs6000/nmmintrin.h: Copy from i386, tweak to suit.
>
> gcc/testsuite
> 	* gcc.target/powerpc/pr78102.c: Copy from gcc.target/i386,
> 	adjust dg directives to suit.
> 	* gcc.target/powerpc/sse4_1-packusdw.c: Same.
> 	* gcc.target/powerpc/sse4_1-pcmpeqq.c: Same.
> 	* gcc.target/powerpc/sse4_1-pmuldq.c: Same.
> 	* gcc.target/powerpc/sse4_1-pmulld.c: Same.
> 	* gcc.target/powerpc/sse4_2-pcmpgtq.c: Same.
> 	* gcc.target/powerpc/sse4_2-check.h: Copy from gcc.target/i386,
> 	tweak to suit.
> ---
> v3:
> - Add nmmintrin.h. _mm_cmpgt_epi64 is part of SSE4.2, which is
>    ostensibly defined in nmmintrin.h. Following the i386 implementation,
>    however, nmmintrin.h only includes smmintrin.h, and the actual
>    implementations appear there.
> - Add sse4_2-check.h, required by sse4_2-pcmpgtq.c. My testing was
>    obviously inadequate.
> v2:
> - Added "extern" to functions to maintain compatible decorations with
>    like implementations in gcc/config/i386.
> - Removed "-Wno-psabi" from tests as unnecessary, per v1 review.
> - Noted testing in patch series cover letter.
>
>   gcc/config/rs6000/nmmintrin.h                 | 40 ++++++++++
>   gcc/config/rs6000/smmintrin.h                 | 41 +++++++++++
>   gcc/testsuite/gcc.target/powerpc/pr78102.c    | 23 ++++++
>   .../gcc.target/powerpc/sse4_1-packusdw.c      | 73 +++++++++++++++++++
>   .../gcc.target/powerpc/sse4_1-pcmpeqq.c       | 46 ++++++++++++
>   .../gcc.target/powerpc/sse4_1-pmuldq.c        | 51 +++++++++++++
>   .../gcc.target/powerpc/sse4_1-pmulld.c        | 46 ++++++++++++
>   .../gcc.target/powerpc/sse4_2-check.h         | 18 +++++
>   .../gcc.target/powerpc/sse4_2-pcmpgtq.c       | 46 ++++++++++++
>   9 files changed, 384 insertions(+)
>   create mode 100644 gcc/config/rs6000/nmmintrin.h
>   create mode 100644 gcc/testsuite/gcc.target/powerpc/pr78102.c
>   create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c
>   create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
>   create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
>   create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c
>   create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_2-check.h
>   create mode 100644 gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c
>
> diff --git a/gcc/config/rs6000/nmmintrin.h b/gcc/config/rs6000/nmmintrin.h
> new file mode 100644
> index 000000000000..20a70bee3776
> --- /dev/null
> +++ b/gcc/config/rs6000/nmmintrin.h
> @@ -0,0 +1,40 @@
> +/* Copyright (C) 2021 Free Software Foundation, Inc.
> +
> +   This file is part of GCC.
> +
> +   GCC is free software; you can redistribute it and/or modify
> +   it under the terms of the GNU General Public License as published by
> +   the Free Software Foundation; either version 3, or (at your option)
> +   any later version.
> +
> +   GCC is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> +   GNU General Public License for more details.
> +
> +   Under Section 7 of GPL version 3, you are granted additional
> +   permissions described in the GCC Runtime Library Exception, version
> +   3.1, as published by the Free Software Foundation.
> +
> +   You should have received a copy of the GNU General Public License and
> +   a copy of the GCC Runtime Library Exception along with this program;
> +   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef NO_WARN_X86_INTRINSICS
> +/* This header is distributed to simplify porting x86_64 code that
> +   makes explicit use of Intel intrinsics to powerpc64le.
> +   It is the user's responsibility to determine if the results are
> +   acceptable and make additional changes as necessary.
> +   Note that much code that uses Intel intrinsics can be rewritten in
> +   standard C or GNU C extensions, which are more portable and better
> +   optimized across multiple targets.  */
> +#endif
> +
> +#ifndef _NMMINTRIN_H_INCLUDED
> +#define _NMMINTRIN_H_INCLUDED
> +
> +/* We just include SSE4.1 header file.  */
> +#include <smmintrin.h>
> +
> +#endif /* _NMMINTRIN_H_INCLUDED */

Should there be something in here indicating that nmmintrin.h is for SSE 
4.2?  Otherwise it's a bit of a head-scratcher to a new person wondering 
why this file exists.  No big deal either way.

This looks fine to me with or without that.  Recommend approval.

Thanks!
Bill

> diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h
> index fdef6674d16c..c04d2bb5b6d3 100644
> --- a/gcc/config/rs6000/smmintrin.h
> +++ b/gcc/config/rs6000/smmintrin.h
> @@ -386,6 +386,15 @@ _mm_testnzc_si128 (__m128i __A, __m128i __B)
>   
>   #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
>   
> +#ifdef _ARCH_PWR8
> +extern __inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
> +{
> +  return (__m128i) vec_cmpeq ((__v2di)__X, (__v2di)__Y);
> +}
> +#endif
> +
>   extern __inline __m128i
>   __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>   _mm_min_epi8 (__m128i __X, __m128i __Y)
> @@ -444,6 +453,22 @@ _mm_max_epu32 (__m128i __X, __m128i __Y)
>   
>   extern __inline __m128i
>   __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mullo_epi32 (__m128i __X, __m128i __Y)
> +{
> +  return (__m128i) vec_mul ((__v4su)__X, (__v4su)__Y);
> +}
> +
> +#ifdef _ARCH_PWR8
> +__inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_mul_epi32 (__m128i __X, __m128i __Y)
> +{
> +  return (__m128i) vec_mule ((__v4si)__X, (__v4si)__Y);
> +}
> +#endif
> +
> +__inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>   _mm_cvtepi8_epi16 (__m128i __A)
>   {
>     return (__m128i) vec_unpackh ((__v16qi)__A);
> @@ -607,4 +632,20 @@ _mm_minpos_epu16 (__m128i __A)
>     return __r.__m;
>   }
>   
> +__inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_packus_epi32 (__m128i __X, __m128i __Y)
> +{
> +  return (__m128i) vec_packsu ((__v4si)__X, (__v4si)__Y);
> +}
> +
> +#ifdef _ARCH_PWR8
> +__inline __m128i
> +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
> +{
> +  return (__m128i) vec_cmpgt ((__v2di)__X, (__v2di)__Y);
> +}
> +#endif
> +
>   #endif
> diff --git a/gcc/testsuite/gcc.target/powerpc/pr78102.c b/gcc/testsuite/gcc.target/powerpc/pr78102.c
> new file mode 100644
> index 000000000000..56a2d497bbff
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/pr78102.c
> @@ -0,0 +1,23 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mvsx" } */
> +/* { dg-require-effective-target powerpc_vsx_ok } */
> +
> +#include <x86intrin.h>
> +
> +__m128i
> +foo (const __m128i x, const __m128i y)
> +{
> +  return _mm_cmpeq_epi64 (x, y);
> +}
> +
> +__v2di
> +bar (const __v2di x, const __v2di y)
> +{
> +  return x == y;
> +}
> +
> +__v2di
> +baz (const __v2di x, const __v2di y)
> +{
> +  return x != y;
> +}
> diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c
> new file mode 100644
> index 000000000000..15b8ca418f54
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c
> @@ -0,0 +1,73 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mvsx" } */
> +/* { dg-require-effective-target powerpc_vsx_ok } */
> +
> +#ifndef CHECK_H
> +#define CHECK_H "sse4_1-check.h"
> +#endif
> +
> +#ifndef TEST
> +#define TEST sse4_1_test
> +#endif
> +
> +#include CHECK_H
> +
> +#include <smmintrin.h>
> +
> +#define NUM 64
> +
> +static unsigned short
> +int_to_ushort (int iVal)
> +{
> +  unsigned short sVal;
> +
> +  if (iVal < 0)
> +    sVal = 0;
> +  else if (iVal > 0xffff)
> +    sVal = 0xffff;
> +  else sVal = iVal;
> +
> +  return sVal;
> +}
> +
> +static void
> +TEST (void)
> +{
> +  union
> +    {
> +      __m128i x[NUM / 4];
> +      int i[NUM];
> +    } src1, src2;
> +  union
> +    {
> +      __m128i x[NUM / 4];
> +      unsigned short s[NUM * 2];
> +    } dst;
> +  int i, sign = 1;
> +
> +  for (i = 0; i < NUM; i++)
> +    {
> +      src1.i[i] = i * i * sign;
> +      src2.i[i] = (i + 20) * sign;
> +      sign = -sign;
> +    }
> +
> +  for (i = 0; i < NUM; i += 4)
> +    dst.x[i / 4] = _mm_packus_epi32 (src1.x [i / 4], src2.x [i / 4]);
> +
> +  for (i = 0; i < NUM; i ++)
> +    {
> +      int dstIndex;
> +      unsigned short sVal;
> +
> +      sVal = int_to_ushort (src1.i[i]);
> +      dstIndex = (i % 4) + (i / 4) * 8;
> +      if (sVal != dst.s[dstIndex])
> +	abort ();
> +
> +      sVal = int_to_ushort (src2.i[i]);
> +      dstIndex += 4;
> +      if (sVal != dst.s[dstIndex])
> +	abort ();
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
> new file mode 100644
> index 000000000000..39b9f01d64a4
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
> @@ -0,0 +1,46 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mpower8-vector" } */
> +/* { dg-require-effective-target p8vector_hw } */
> +
> +#ifndef CHECK_H
> +#define CHECK_H "sse4_1-check.h"
> +#endif
> +
> +#ifndef TEST
> +#define TEST sse4_1_test
> +#endif
> +
> +#include CHECK_H
> +
> +#include <smmintrin.h>
> +
> +#define NUM 64
> +
> +static void
> +TEST (void)
> +{
> +  union
> +    {
> +      __m128i x[NUM / 2];
> +      long long ll[NUM];
> +    } dst, src1, src2;
> +  int i, sign=1;
> +  long long is_eq;
> +
> +  for (i = 0; i < NUM; i++)
> +    {
> +      src1.ll[i] = i * i * sign;
> +      src2.ll[i] = (i + 20) * sign;
> +      sign = -sign;
> +    }
> +
> +  for (i = 0; i < NUM; i += 2)
> +    dst.x [i / 2] = _mm_cmpeq_epi64(src1.x [i / 2], src2.x [i / 2]);
> +
> +  for (i = 0; i < NUM; i++)
> +    {
> +      is_eq = src1.ll[i] == src2.ll[i] ? 0xffffffffffffffffLL : 0LL;
> +      if (is_eq != dst.ll[i])
> +	abort ();
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
> new file mode 100644
> index 000000000000..6a884f46235f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
> @@ -0,0 +1,51 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mpower8-vector" } */
> +/* { dg-require-effective-target p8vector_hw } */
> +
> +#ifndef CHECK_H
> +#define CHECK_H "sse4_1-check.h"
> +#endif
> +
> +#ifndef TEST
> +#define TEST sse4_1_test
> +#endif
> +
> +#include CHECK_H
> +
> +#include <smmintrin.h>
> +
> +#define NUM 64
> +
> +static void
> +TEST (void)
> +{
> +  union
> +    {
> +      __m128i x[NUM / 2];
> +      long long ll[NUM];
> +    } dst;
> +  union
> +    {
> +      __m128i x[NUM / 2];
> +      int i[NUM * 2];
> +    } src1, src2;
> +  int i, sign = 1;
> +  long long value;
> +
> +  for (i = 0; i < NUM * 2; i += 2)
> +    {
> +      src1.i[i] = i * i * sign;
> +      src2.i[i] = (i + 20) * sign;
> +      sign = -sign;
> +    }
> +
> +  for (i = 0; i < NUM; i += 2)
> +    dst.x[i / 2] = _mm_mul_epi32 (src1.x[i / 2], src2.x[i / 2]);
> +
> +  for (i = 0; i < NUM; i++)
> +    {
> +      value = (long long) src1.i[i * 2] * (long long) src2.i[i * 2];
> +      if (value != dst.ll[i])
> +	abort ();
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c
> new file mode 100644
> index 000000000000..150832915911
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c
> @@ -0,0 +1,46 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mvsx" } */
> +/* { dg-require-effective-target powerpc_vsx_ok } */
> +
> +#ifndef CHECK_H
> +#define CHECK_H "sse4_1-check.h"
> +#endif
> +
> +#ifndef TEST
> +#define TEST sse4_1_test
> +#endif
> +
> +#include CHECK_H
> +
> +#include <smmintrin.h>
> +
> +#define NUM 64
> +
> +static void
> +TEST (void)
> +{
> +  union
> +    {
> +      __m128i x[NUM / 4];
> +      int i[NUM];
> +    } dst, src1, src2;
> +  int i, sign = 1;
> +  int value;
> +
> +  for (i = 0; i < NUM; i++)
> +    {
> +      src1.i[i] = i * i * sign;
> +      src2.i[i] = (i + 20) * sign;
> +      sign = -sign;
> +    }
> +
> +  for (i = 0; i < NUM; i += 4)
> +    dst.x[i / 4] = _mm_mullo_epi32 (src1.x[i / 4], src2.x[i / 4]);
> +
> +  for (i = 0; i < NUM; i++)
> +    {
> +      value = src1.i[i] * src2.i[i];
> +      if (value != dst.i[i])
> +	abort ();
> +    }
> +}
> diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h b/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h
> new file mode 100644
> index 000000000000..f6264e5a1083
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h
> @@ -0,0 +1,18 @@
> +#define NO_WARN_X86_INTRINSICS 1
> +
> +static void sse4_2_test (void);
> +
> +static void
> +__attribute__ ((noinline))
> +do_test (void)
> +{
> +  sse4_2_test ();
> +}
> +
> +int
> +main ()
> +{
> +  do_test ();
> +
> +  return 0;
> +}
> diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c b/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c
> new file mode 100644
> index 000000000000..4bfbad885b30
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c
> @@ -0,0 +1,46 @@
> +/* { dg-do run } */
> +/* { dg-options "-O2 -mvsx" } */
> +/* { dg-require-effective-target powerpc_vsx_ok } */
> +
> +#ifndef CHECK_H
> +#define CHECK_H "sse4_2-check.h"
> +#endif
> +
> +#ifndef TEST
> +#define TEST sse4_2_test
> +#endif
> +
> +#include CHECK_H
> +
> +#include <nmmintrin.h>
> +
> +#define NUM 64
> +
> +static void
> +TEST (void)
> +{
> +  union
> +    {
> +      __m128i x[NUM / 2];
> +      long long ll[NUM];
> +    } dst, src1, src2;
> +  int i, sign = 1;
> +  long long is_eq;
> +
> +  for (i = 0; i < NUM; i++)
> +    {
> +      src1.ll[i] = i * i * sign;
> +      src2.ll[i] = (i + 20) * sign;
> +      sign = -sign;
> +    }
> +
> +  for (i = 0; i < NUM; i += 2)
> +    dst.x[i / 2] = _mm_cmpgt_epi64 (src1.x[i / 2], src2.x[i / 2]);
> +
> +  for (i = 0; i < NUM; i++)
> +    {
> +      is_eq = src1.ll[i] > src2.ll[i] ? 0xFFFFFFFFFFFFFFFFLL : 0LL;
> +      if (is_eq != dst.ll[i])
> +	abort ();
> +    }
> +}
Paul A. Clarke Aug. 27, 2021, 6:52 p.m. UTC | #2
On Fri, Aug 27, 2021 at 10:21:35AM -0500, Bill Schmidt via Gcc-patches wrote:
> On 8/23/21 2:03 PM, Paul A. Clarke wrote:
> > Function signatures and decorations match gcc/config/i386/smmintrin.h.

> > gcc

> > 	* config/rs6000/nmmintrin.h: Copy from i386, tweak to suit.

> > ---
> > v3:
> > - Add nmmintrin.h. _mm_cmpgt_epi64 is part of SSE4.2, which is
> >    ostensibly defined in nmmintrin.h. Following the i386 implementation,
> >    however, nmmintrin.h only includes smmintrin.h, and the actual
> >    implementations appear there.

> > v2:
> > - Added "extern" to functions to maintain compatible decorations with
> >    like implementations in gcc/config/i386.

> > diff --git a/gcc/config/rs6000/nmmintrin.h b/gcc/config/rs6000/nmmintrin.h
> > new file mode 100644
> > index 000000000000..20a70bee3776
> > --- /dev/null
> > +++ b/gcc/config/rs6000/nmmintrin.h
> > @@ -0,0 +1,40 @@
> > +/* Copyright (C) 2021 Free Software Foundation, Inc.
> > +
> > +   This file is part of GCC.
> > +
> > +   GCC is free software; you can redistribute it and/or modify
> > +   it under the terms of the GNU General Public License as published by
> > +   the Free Software Foundation; either version 3, or (at your option)
> > +   any later version.
> > +
> > +   GCC is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> > +   GNU General Public License for more details.
> > +
> > +   Under Section 7 of GPL version 3, you are granted additional
> > +   permissions described in the GCC Runtime Library Exception, version
> > +   3.1, as published by the Free Software Foundation.
> > +
> > +   You should have received a copy of the GNU General Public License and
> > +   a copy of the GCC Runtime Library Exception along with this program;
> > +   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
> > +   <http://www.gnu.org/licenses/>.  */
> > +
> > +#ifndef NO_WARN_X86_INTRINSICS
> > +/* This header is distributed to simplify porting x86_64 code that
> > +   makes explicit use of Intel intrinsics to powerpc64le.
> > +   It is the user's responsibility to determine if the results are
> > +   acceptable and make additional changes as necessary.
> > +   Note that much code that uses Intel intrinsics can be rewritten in
> > +   standard C or GNU C extensions, which are more portable and better
> > +   optimized across multiple targets.  */
> > +#endif
> > +
> > +#ifndef _NMMINTRIN_H_INCLUDED
> > +#define _NMMINTRIN_H_INCLUDED
> > +
> > +/* We just include SSE4.1 header file.  */
> > +#include <smmintrin.h>
> > +
> > +#endif /* _NMMINTRIN_H_INCLUDED */
> 
> Should there be something in here indicating that nmmintrin.h is for SSE
> 4.2?  Otherwise it's a bit of a head-scratcher to a new person wondering why
> this file exists.  No big deal either way.

For good or bad, I have been trying to minimize differences with the
analogous i386 files.  With the exception of the copyright and our annoying
little warning, the only difference was this comment:

--
/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 10.0.  */
--

I didn't find that (1) accurate, since there are no implementations therein,
or (2) particularly informative, as I imagine that document has a much
bigger scope than SSE4.2.  And keeping it would be a bit misleading, I think.
So, I intentionally removed the comment.

> This looks fine to me with or without that.  Recommend approval.

Thanks for the review!

PC
Segher Boessenkool Oct. 11, 2021, 11:07 p.m. UTC | #3
Hi!

On Mon, Aug 23, 2021 at 02:03:09PM -0500, Paul A. Clarke wrote:
> gcc
> 	* config/rs6000/smmintrin.h (_mm_cmpeq_epi64, _mm_cmpgt_epi64,
> 	_mm_mullo_epi32, _mm_mul_epi32, _mm_packus_epi32): New.
> 	* config/rs6000/nmmintrin.h: Copy from i386, tweak to suit.
> 
> gcc/testsuite
> 	* gcc.target/powerpc/pr78102.c: Copy from gcc.target/i386,
> 	adjust dg directives to suit.
> 	* gcc.target/powerpc/sse4_1-packusdw.c: Same.
> 	* gcc.target/powerpc/sse4_1-pcmpeqq.c: Same.
> 	* gcc.target/powerpc/sse4_1-pmuldq.c: Same.
> 	* gcc.target/powerpc/sse4_1-pmulld.c: Same.
> 	* gcc.target/powerpc/sse4_2-pcmpgtq.c: Same.
> 	* gcc.target/powerpc/sse4_2-check.h: Copy from gcc.target/i386,
> 	tweak to suit.

Okay for trunk (with the vsx_hw thing).  Thanks!


Segher
Paul A. Clarke Oct. 12, 2021, 1:55 a.m. UTC | #4
On Mon, Oct 11, 2021 at 06:07:35PM -0500, Segher Boessenkool wrote:
> On Mon, Aug 23, 2021 at 02:03:09PM -0500, Paul A. Clarke wrote:
> > gcc
> > 	* config/rs6000/smmintrin.h (_mm_cmpeq_epi64, _mm_cmpgt_epi64,
> > 	_mm_mullo_epi32, _mm_mul_epi32, _mm_packus_epi32): New.
> > 	* config/rs6000/nmmintrin.h: Copy from i386, tweak to suit.
> > 
> > gcc/testsuite
> > 	* gcc.target/powerpc/pr78102.c: Copy from gcc.target/i386,
> > 	adjust dg directives to suit.
> > 	* gcc.target/powerpc/sse4_1-packusdw.c: Same.
> > 	* gcc.target/powerpc/sse4_1-pcmpeqq.c: Same.
> > 	* gcc.target/powerpc/sse4_1-pmuldq.c: Same.
> > 	* gcc.target/powerpc/sse4_1-pmulld.c: Same.
> > 	* gcc.target/powerpc/sse4_2-pcmpgtq.c: Same.
> > 	* gcc.target/powerpc/sse4_2-check.h: Copy from gcc.target/i386,
> > 	tweak to suit.
> 
> Okay for trunk (with the vsx_hw thing).  Thanks!

This was committed:

rs6000: Support more SSE4 "cmp", "mul", "pack" intrinsics

Function signatures and decorations match gcc/config/i386/smmintrin.h.

Also, copy tests for:
- _mm_cmpeq_epi64
- _mm_mullo_epi32, _mm_mul_epi32
- _mm_packus_epi32
- _mm_cmpgt_epi64 (SSE4.2)

from gcc/testsuite/gcc.target/i386.

2021-10-11  Paul A. Clarke  <pc@us.ibm.com>

gcc
        * config/rs6000/smmintrin.h (_mm_cmpeq_epi64, _mm_cmpgt_epi64,
        _mm_mullo_epi32, _mm_mul_epi32, _mm_packus_epi32): New.
        * config/rs6000/nmmintrin.h: Copy from i386, tweak to suit.

gcc/testsuite
        * gcc.target/powerpc/pr78102.c: Copy from gcc.target/i386,
        adjust dg directives to suit.
        * gcc.target/powerpc/sse4_1-packusdw.c: Same.
        * gcc.target/powerpc/sse4_1-pcmpeqq.c: Same.
        * gcc.target/powerpc/sse4_1-pmuldq.c: Same.
        * gcc.target/powerpc/sse4_1-pmulld.c: Same.
        * gcc.target/powerpc/sse4_2-pcmpgtq.c: Same.
        * gcc.target/powerpc/sse4_2-check.h: Copy from gcc.target/i386,
        tweak to suit.
---
v4: Fix "space after cast" and "vsx_hw" issues, per Segher review.

diff --git a/gcc/config/rs6000/nmmintrin.h b/gcc/config/rs6000/nmmintrin.h
new file mode 100644
index 000000000000..20a70bee3776
--- /dev/null
+++ b/gcc/config/rs6000/nmmintrin.h
@@ -0,0 +1,40 @@
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to powerpc64le.
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.  */
+#endif
+
+#ifndef _NMMINTRIN_H_INCLUDED
+#define _NMMINTRIN_H_INCLUDED
+
+/* We just include SSE4.1 header file.  */
+#include <smmintrin.h>
+
+#endif /* _NMMINTRIN_H_INCLUDED */
diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h
index ad6b68e13cce..90ce03d22709 100644
--- a/gcc/config/rs6000/smmintrin.h
+++ b/gcc/config/rs6000/smmintrin.h
@@ -274,6 +274,15 @@ _mm_floor_ss (__m128 __A, __m128 __B)
   return __r;
 }
 
+#ifdef _ARCH_PWR8
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) vec_cmpeq ((__v2di) __X, (__v2di) __Y);
+}
+#endif
+
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_min_epi8 (__m128i __X, __m128i __Y)
@@ -332,6 +341,22 @@ _mm_max_epu32 (__m128i __X, __m128i __Y)
 
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) vec_mul ((__v4su) __X, (__v4su) __Y);
+}
+
+#ifdef _ARCH_PWR8
+__inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) vec_mule ((__v4si) __X, (__v4si) __Y);
+}
+#endif
+
+__inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtepi8_epi16 (__m128i __A)
 {
   return (__m128i) vec_unpackh ((__v16qi) __A);
@@ -495,4 +520,20 @@ _mm_minpos_epu16 (__m128i __A)
   return __r.__m;
 }
 
+__inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packus_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) vec_packsu ((__v4si) __X, (__v4si) __Y);
+}
+
+#ifdef _ARCH_PWR8
+__inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) vec_cmpgt ((__v2di) __X, (__v2di) __Y);
+}
+#endif
+
 #endif
diff --git a/gcc/testsuite/gcc.target/powerpc/pr78102.c b/gcc/testsuite/gcc.target/powerpc/pr78102.c
new file mode 100644
index 000000000000..68898c7f9428
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr78102.c
@@ -0,0 +1,23 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mvsx" } */
+/* { dg-require-effective-target powerpc_vsx_hw } */
+
+#include <x86intrin.h>
+
+__m128i
+foo (const __m128i x, const __m128i y)
+{
+  return _mm_cmpeq_epi64 (x, y);
+}
+
+__v2di
+bar (const __v2di x, const __v2di y)
+{
+  return x == y;
+}
+
+__v2di
+baz (const __v2di x, const __v2di y)
+{
+  return x != y;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c
new file mode 100644
index 000000000000..8b757a267468
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c
@@ -0,0 +1,73 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mvsx" } */
+/* { dg-require-effective-target powerpc_vsx_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static unsigned short
+int_to_ushort (int iVal)
+{
+  unsigned short sVal;
+
+  if (iVal < 0)
+    sVal = 0;
+  else if (iVal > 0xffff)
+    sVal = 0xffff;
+  else sVal = iVal;
+
+  return sVal;
+}
+
+static void
+TEST (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+    } src1, src2;
+  union
+    {
+      __m128i x[NUM / 4];
+      unsigned short s[NUM * 2];
+    } dst;
+  int i, sign = 1;
+
+  for (i = 0; i < NUM; i++)
+    {
+      src1.i[i] = i * i * sign;
+      src2.i[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 4)
+    dst.x[i / 4] = _mm_packus_epi32 (src1.x [i / 4], src2.x [i / 4]);
+
+  for (i = 0; i < NUM; i ++)
+    {
+      int dstIndex;
+      unsigned short sVal;
+
+      sVal = int_to_ushort (src1.i[i]);
+      dstIndex = (i % 4) + (i / 4) * 8;
+      if (sVal != dst.s[dstIndex])
+	abort ();
+
+      sVal = int_to_ushort (src2.i[i]);
+      dstIndex += 4;
+      if (sVal != dst.s[dstIndex])
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
new file mode 100644
index 000000000000..39b9f01d64a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mpower8-vector" } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static void
+TEST (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+    } dst, src1, src2;
+  int i, sign=1;
+  long long is_eq;
+
+  for (i = 0; i < NUM; i++)
+    {
+      src1.ll[i] = i * i * sign;
+      src2.ll[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 2)
+    dst.x [i / 2] = _mm_cmpeq_epi64(src1.x [i / 2], src2.x [i / 2]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      is_eq = src1.ll[i] == src2.ll[i] ? 0xffffffffffffffffLL : 0LL;
+      if (is_eq != dst.ll[i])
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
new file mode 100644
index 000000000000..6a884f46235f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mpower8-vector" } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static void
+TEST (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+    } dst;
+  union
+    {
+      __m128i x[NUM / 2];
+      int i[NUM * 2];
+    } src1, src2;
+  int i, sign = 1;
+  long long value;
+
+  for (i = 0; i < NUM * 2; i += 2)
+    {
+      src1.i[i] = i * i * sign;
+      src2.i[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 2)
+    dst.x[i / 2] = _mm_mul_epi32 (src1.x[i / 2], src2.x[i / 2]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      value = (long long) src1.i[i * 2] * (long long) src2.i[i * 2];
+      if (value != dst.ll[i])
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c
new file mode 100644
index 000000000000..730334366426
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mvsx" } */
+/* { dg-require-effective-target vsx_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static void  /* Check _mm_mullo_epi32 against scalar int multiplies.  */
+TEST (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+    } dst, src1, src2;  /* Four ints per __m128i.  */
+  int i, sign = 1;
+  int value;
+
+  for (i = 0; i < NUM; i++)
+    {
+      src1.i[i] = i * i * sign;
+      src2.i[i] = (i + 20) * sign;
+      sign = -sign;  /* Alternate the sign of successive elements.  */
+    }
+
+  for (i = 0; i < NUM; i += 4)
+    dst.x[i / 4] = _mm_mullo_epi32 (src1.x[i / 4], src2.x[i / 4]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      value = src1.i[i] * src2.i[i];  /* Expected element-wise product.  */
+      if (value != dst.i[i])
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h b/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h
new file mode 100644
index 000000000000..f6264e5a1083
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h
@@ -0,0 +1,18 @@
+#define NO_WARN_X86_INTRINSICS 1
+
+static void sse4_2_test (void);
+
+static void
+__attribute__ ((noinline))  /* Do not inline this wrapper into main.  */
+do_test (void)
+{
+  sse4_2_test ();  /* Defined by the test file that includes this header.  */
+}
+
+int
+main ()
+{
+  do_test ();  /* A failing test calls abort and never returns here.  */
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c b/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c
new file mode 100644
index 000000000000..a8a6a2010f45
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -mpower8-vector" } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_2_test
+#endif
+
+#include CHECK_H
+
+#include <nmmintrin.h>
+
+#define NUM 64
+
+static void  /* Check _mm_cmpgt_epi64 against a scalar signed comparison.  */
+TEST (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+    } dst, src1, src2;
+  int i, sign = 1;
+  long long is_eq;  /* Expected greater-than mask, despite the name.  */
+
+  for (i = 0; i < NUM; i++)
+    {
+      src1.ll[i] = i * i * sign;
+      src2.ll[i] = (i + 20) * sign;
+      sign = -sign;  /* Negative lanes exercise the signed compare.  */
+    }
+
+  for (i = 0; i < NUM; i += 2)  /* Two long long lanes per __m128i.  */
+    dst.x[i / 2] = _mm_cmpgt_epi64 (src1.x[i / 2], src2.x[i / 2]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      is_eq = src1.ll[i] > src2.ll[i] ? 0xFFFFFFFFFFFFFFFFLL : 0LL;
+      if (is_eq != dst.ll[i])  /* Any lane mismatch fails the test.  */
+	abort ();
+    }
+}
diff mbox series

Patch

diff --git a/gcc/config/rs6000/nmmintrin.h b/gcc/config/rs6000/nmmintrin.h
new file mode 100644
index 000000000000..20a70bee3776
--- /dev/null
+++ b/gcc/config/rs6000/nmmintrin.h
@@ -0,0 +1,40 @@ 
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to powerpc64le.
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.  */
+#endif
+
+#ifndef _NMMINTRIN_H_INCLUDED
+#define _NMMINTRIN_H_INCLUDED
+
+/* SSE4.2 intrinsics live in the SSE4.1 header; just include it.  */
+#include <smmintrin.h>
+
+#endif /* _NMMINTRIN_H_INCLUDED */
diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h
index fdef6674d16c..c04d2bb5b6d3 100644
--- a/gcc/config/rs6000/smmintrin.h
+++ b/gcc/config/rs6000/smmintrin.h
@@ -386,6 +386,15 @@  _mm_testnzc_si128 (__m128i __A, __m128i __B)
 
 #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
 
+#ifdef _ARCH_PWR8  /* Provided only when _ARCH_PWR8 is defined.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)  /* Equality on __v2di lanes.  */
+{
+  return (__m128i) vec_cmpeq ((__v2di)__X, (__v2di)__Y);
+}
+#endif
+
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_min_epi8 (__m128i __X, __m128i __Y)
@@ -444,6 +453,22 @@  _mm_max_epu32 (__m128i __X, __m128i __Y)
 
 extern __inline __m128i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi32 (__m128i __X, __m128i __Y)  /* Element-wise __v4su multiply.  */
+{
+  return (__m128i) vec_mul ((__v4su)__X, (__v4su)__Y);
+}
+
+#ifdef _ARCH_PWR8  /* Provided only when _ARCH_PWR8 is defined.  */
+extern __inline __m128i  /* extern + gnu_inline: no out-of-line copy.  */
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_epi32 (__m128i __X, __m128i __Y)  /* Widening multiply via vec_mule.  */
+{
+  return (__m128i) vec_mule ((__v4si)__X, (__v4si)__Y);
+}
+#endif
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm_cvtepi8_epi16 (__m128i __A)
 {
   return (__m128i) vec_unpackh ((__v16qi)__A);
@@ -607,4 +632,20 @@  _mm_minpos_epu16 (__m128i __A)
   return __r.__m;
 }
 
+extern __inline __m128i  /* extern + gnu_inline: no out-of-line copy.  */
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packus_epi32 (__m128i __X, __m128i __Y)  /* Pack __v4si pairs via vec_packsu.  */
+{
+  return (__m128i) vec_packsu ((__v4si)__X, (__v4si)__Y);
+}
+
+#ifdef _ARCH_PWR8  /* Provided only when _ARCH_PWR8 is defined.  */
+extern __inline __m128i  /* extern + gnu_inline: no out-of-line copy.  */
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)  /* Greater-than on __v2di lanes.  */
+{
+  return (__m128i) vec_cmpgt ((__v2di)__X, (__v2di)__Y);
+}
+#endif
+
 #endif
diff --git a/gcc/testsuite/gcc.target/powerpc/pr78102.c b/gcc/testsuite/gcc.target/powerpc/pr78102.c
new file mode 100644
index 000000000000..56a2d497bbff
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/pr78102.c
@@ -0,0 +1,23 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mpower8-vector" } */
+/* { dg-require-effective-target powerpc_p8vector_ok } */
+
+#include <x86intrin.h>
+
+__m128i
+foo (const __m128i x, const __m128i y)  /* Equality via the intrinsic.  */
+{
+  return _mm_cmpeq_epi64 (x, y);
+}
+
+__v2di
+bar (const __v2di x, const __v2di y)  /* Equality via the vector operator.  */
+{
+  return x == y;
+}
+
+__v2di
+baz (const __v2di x, const __v2di y)  /* Inequality via the vector operator.  */
+{
+  return x != y;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c
new file mode 100644
index 000000000000..15b8ca418f54
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-packusdw.c
@@ -0,0 +1,73 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mvsx" } */
+/* { dg-require-effective-target vsx_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static unsigned short  /* Scalar reference: clamp an int to [0, 0xffff].  */
+int_to_ushort (int iVal)
+{
+  unsigned short sVal;
+
+  if (iVal < 0)
+    sVal = 0;  /* Negative inputs clamp to zero.  */
+  else if (iVal > 0xffff)
+    sVal = 0xffff;  /* Too-large inputs clamp to the maximum.  */
+  else sVal = iVal;
+
+  return sVal;
+}
+
+static void  /* Check _mm_packus_epi32 against int_to_ushort.  */
+TEST (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+    } src1, src2;
+  union
+    {
+      __m128i x[NUM / 4];
+      unsigned short s[NUM * 2];
+    } dst;  /* Eight shorts per result vector.  */
+  int i, sign = 1;
+
+  for (i = 0; i < NUM; i++)
+    {
+      src1.i[i] = i * i * sign;
+      src2.i[i] = (i + 20) * sign;
+      sign = -sign;  /* Negative values exercise the clamp to zero.  */
+    }
+
+  for (i = 0; i < NUM; i += 4)
+    dst.x[i / 4] = _mm_packus_epi32 (src1.x [i / 4], src2.x [i / 4]);
+
+  for (i = 0; i < NUM; i ++)
+    {
+      int dstIndex;
+      unsigned short sVal;
+
+      sVal = int_to_ushort (src1.i[i]);
+      dstIndex = (i % 4) + (i / 4) * 8;  /* src1 fills shorts 0-3 of a vector.  */
+      if (sVal != dst.s[dstIndex])
+	abort ();
+
+      sVal = int_to_ushort (src2.i[i]);
+      dstIndex += 4;  /* src2 fills shorts 4-7.  */
+      if (sVal != dst.s[dstIndex])
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
new file mode 100644
index 000000000000..39b9f01d64a4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pcmpeqq.c
@@ -0,0 +1,46 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mpower8-vector" } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static void  /* Check _mm_cmpeq_epi64 against a scalar comparison.  */
+TEST (void)
+{
+  union  /* Same storage viewed as vectors or as long long lanes.  */
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+    } dst, src1, src2;
+  int i, sign=1;
+  long long is_eq;  /* Expected lane mask: all ones or zero.  */
+
+  for (i = 0; i < NUM; i++)
+    {
+      src1.ll[i] = i * i * sign;
+      src2.ll[i] = (i + 20) * sign;  /* Equals src1 only where i*i == i+20.  */
+      sign = -sign;  /* Alternate the sign of successive lanes.  */
+    }
+
+  for (i = 0; i < NUM; i += 2)  /* Two long long lanes per __m128i.  */
+    dst.x [i / 2] = _mm_cmpeq_epi64(src1.x [i / 2], src2.x [i / 2]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      is_eq = src1.ll[i] == src2.ll[i] ? 0xffffffffffffffffLL : 0LL;
+      if (is_eq != dst.ll[i])  /* Any lane mismatch fails the test.  */
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
new file mode 100644
index 000000000000..6a884f46235f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmuldq.c
@@ -0,0 +1,51 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mpower8-vector" } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static void  /* Check _mm_mul_epi32 against widened scalar multiplies.  */
+TEST (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+    } dst;  /* One long long product per pair of source ints.  */
+  union
+    {
+      __m128i x[NUM / 2];
+      int i[NUM * 2];
+    } src1, src2;
+  int i, sign = 1;
+  long long value;
+
+  for (i = 0; i < NUM * 2; i += 2)  /* Only even-indexed ints are set.  */
+    {
+      src1.i[i] = i * i * sign;
+      src2.i[i] = (i + 20) * sign;
+      sign = -sign;
+    }
+
+  for (i = 0; i < NUM; i += 2)
+    dst.x[i / 2] = _mm_mul_epi32 (src1.x[i / 2], src2.x[i / 2]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      value = (long long) src1.i[i * 2] * (long long) src2.i[i * 2];  /* Even pair.  */
+      if (value != dst.ll[i])
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c
new file mode 100644
index 000000000000..150832915911
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_1-pmulld.c
@@ -0,0 +1,46 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mvsx" } */
+/* { dg-require-effective-target vsx_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_1-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_1_test
+#endif
+
+#include CHECK_H
+
+#include <smmintrin.h>
+
+#define NUM 64
+
+static void  /* Check _mm_mullo_epi32 against scalar int multiplies.  */
+TEST (void)
+{
+  union
+    {
+      __m128i x[NUM / 4];
+      int i[NUM];
+    } dst, src1, src2;  /* Four ints per __m128i.  */
+  int i, sign = 1;
+  int value;
+
+  for (i = 0; i < NUM; i++)
+    {
+      src1.i[i] = i * i * sign;
+      src2.i[i] = (i + 20) * sign;
+      sign = -sign;  /* Alternate the sign of successive elements.  */
+    }
+
+  for (i = 0; i < NUM; i += 4)
+    dst.x[i / 4] = _mm_mullo_epi32 (src1.x[i / 4], src2.x[i / 4]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      value = src1.i[i] * src2.i[i];  /* Expected element-wise product.  */
+      if (value != dst.i[i])
+	abort ();
+    }
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h b/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h
new file mode 100644
index 000000000000..f6264e5a1083
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_2-check.h
@@ -0,0 +1,18 @@ 
+#define NO_WARN_X86_INTRINSICS 1
+
+static void sse4_2_test (void);
+
+static void
+__attribute__ ((noinline))  /* Do not inline this wrapper into main.  */
+do_test (void)
+{
+  sse4_2_test ();  /* Defined by the test file that includes this header.  */
+}
+
+int
+main ()
+{
+  do_test ();  /* A failing test calls abort and never returns here.  */
+
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c b/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c
new file mode 100644
index 000000000000..4bfbad885b30
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/sse4_2-pcmpgtq.c
@@ -0,0 +1,46 @@ 
+/* { dg-do run } */
+/* { dg-options "-O2 -mpower8-vector" } */
+/* { dg-require-effective-target p8vector_hw } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse4_2-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse4_2_test
+#endif
+
+#include CHECK_H
+
+#include <nmmintrin.h>
+
+#define NUM 64
+
+static void  /* Check _mm_cmpgt_epi64 against a scalar signed comparison.  */
+TEST (void)
+{
+  union
+    {
+      __m128i x[NUM / 2];
+      long long ll[NUM];
+    } dst, src1, src2;
+  int i, sign = 1;
+  long long is_eq;  /* Expected greater-than mask, despite the name.  */
+
+  for (i = 0; i < NUM; i++)
+    {
+      src1.ll[i] = i * i * sign;
+      src2.ll[i] = (i + 20) * sign;
+      sign = -sign;  /* Negative lanes exercise the signed compare.  */
+    }
+
+  for (i = 0; i < NUM; i += 2)  /* Two long long lanes per __m128i.  */
+    dst.x[i / 2] = _mm_cmpgt_epi64 (src1.x[i / 2], src2.x[i / 2]);
+
+  for (i = 0; i < NUM; i++)
+    {
+      is_eq = src1.ll[i] > src2.ll[i] ? 0xFFFFFFFFFFFFFFFFLL : 0LL;
+      if (is_eq != dst.ll[i])  /* Any lane mismatch fails the test.  */
+	abort ();
+    }
+}