diff mbox series

[rs6000,1/2] Add compatible implementations of x86 SSSE3 intrinsics <tmmintrin.h>

Message ID e10c0cbf-d461-c3b0-aa6a-5af476652c9a@us.ibm.com
State New
Headers show
Series [rs6000,1/2] Add compatible implementations of x86 SSSE3 intrinsics <tmmintrin.h> | expand

Commit Message

Paul A. Clarke Oct. 22, 2018, 6:25 p.m. UTC
This is a follow-on to earlier commits for adding compatibility
implementations of x86 intrinsics for PPC64LE.  This patch adds
the 32 x86 intrinsics from <tmmintrin.h> ("SSSE3").

(Patch 2/2 adds tests for these intrinsics, and briefly describes
the tests performed.)

./gcc/ChangeLog:

2018-10-22  Paul A. Clarke  <pc@us.ibm.com>

	* config/rs6000/tmmintrin.h: New file.
	* config.gcc (powerpc*-*-*): Add tmmintrin.h to extra_headers.

Comments

Segher Boessenkool Oct. 22, 2018, 11:28 p.m. UTC | #1
Hi Paul,

On Mon, Oct 22, 2018 at 01:25:57PM -0500, Paul Clarke wrote:
> This is a follow-on to earlier commits for adding compatibility
> implementations of x86 intrinsics for PPC64LE.  This patch adds
> the 32 x86 intrinsics from <tmmintrin.h> ("SSSE3").

> +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))

You could make this fit on a line by wrapping before the __attribute__.
This is true for previous patches as well of course, but it kind of
annoys me in this one.

> +_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
> +{
> +  if (__count < 16)
> +  {

Indentation is broken here and elsewhere in this file.  Please fix.

  if (...)
    {
      blabla;
      blabla;
    }
  else
    {
      blabla;
      blabla;
    }

> +    __v2du __C = {__B, __A};
> +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
> +    const __v4su __shift = {__count << 3, 0, 0, 0};
> +    __C = (__v2du) vec_sro ((__v16qu)__C,(__v16qu) __shift);

Space after cast.  Space after comma.

> +#else
> +    const __v4su __shift = { 0, 0, 0, __count << 3 };

Sometimes you put spaces inside {} and sometimes not.  Please choose one?
Common style is with.


This is okay for trunk if made a bit more readable (by following the usual
coding style rules, say ;-) )


Segher
diff mbox series

Patch

Index: gcc/config/rs6000/tmmintrin.h
===================================================================
diff --git a/trunk/gcc/config/rs6000/tmmintrin.h b/trunk/gcc/config/rs6000/tmmintrin.h
new file mode 100644
--- /dev/null	(revision 0)
+++ b/trunk/gcc/config/rs6000/tmmintrin.h	(working copy)
@@ -0,0 +1,460 @@ 
+/* Copyright (C) 2003-2018 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to powerpc64le.
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.
+
+   In the specific case of X86 SSE2 (__m128i, __m128d) intrinsics,
+   the PowerPC VMX/VSX ISA is a good match for vector double SIMD
+   operations.  However scalar double operations in vector (XMM)
+   registers require the POWER8 VSX ISA (2.07) level. Also there are
+   important differences for data format and placement of double
+   scalars in the vector register.
+
+   For PowerISA Scalar double is in FPRs (left most 64-bits of the
+   low 32 VSRs), while X86_64 SSE2 uses the right most 64-bits of
+   the XMM. These differences require extra steps on POWER to match
+   the SSE2 scalar double semantics.
+
+   Most SSE2 scalar double intrinsic operations can be performed more
+   efficiently as C language double scalar operations or optimized to
+   use vector SIMD operations.  We recommend this for new applications.
+
+   Another difference is the format and details of the X86_64 MXCSR vs
+   the PowerISA FPSCR / VSCR registers.  We recommend applications
+   replace direct access to the MXCSR with the more portable <fenv.h>
+   POSIX APIs.  */
+#endif
+
+#ifndef TMMINTRIN_H_
+#define TMMINTRIN_H_
+
+#include <altivec.h>
+#include <assert.h>
+
+/* We need definitions from the SSE header files.  */
+#include <pmmintrin.h>
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi16 (__m128i __A)
+{
+  return (__m128i) vec_abs ((__v8hi) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi32 (__m128i __A)
+{
+  return (__m128i) vec_abs ((__v4si) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi8 (__m128i __A)
+{
+  return (__m128i) vec_abs ((__v16qi) __A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi16 (__m64 __A)
+{
+  __v8hi __dup = (__v8hi) __builtin_pack_vector_int128 (__A, __A);
+  return (__m64) ((__v2du) vec_abs (__dup))[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi32 (__m64 __A)
+{
+  __v4si __dup = (__v4si) __builtin_pack_vector_int128 (__A, __A);
+  return (__m64) ((__v2du) vec_abs (__dup))[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi8 (__m64 __A)
+{
+  __v16qi __dup = (__v16qi) __builtin_pack_vector_int128 (__A, __A);
+  return (__m64) ((__v2du) vec_abs (__dup))[0];
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
+{
+  /* Shift the 32-byte value __A:__B right by __count bytes; keep 16 bytes.  */
+  if (__builtin_constant_p (__count) && __count < 16)
+    {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+      __A = (__m128i) vec_reve ((__v16qu) __A);
+      __B = (__m128i) vec_reve ((__v16qu) __B);
+#endif
+      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+      __A = (__m128i) vec_reve ((__v16qu) __A);
+#endif
+      return __A;
+    }
+
+  if (__count == 0)
+    return __B;
+
+  if (__count >= 32)
+    {
+      const __v16qu __zero = { 0 };
+      return (__m128i) __zero;
+    }
+
+  if (__count >= 16)
+    {
+      /* Nothing of __B is left; only __A contributes.  */
+      const __v16qu __shift = vec_splats ((unsigned char) ((__count - 16) * 8));
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+      return (__m128i) vec_sro ((__v16qu) __A, __shift);
+#else
+      return (__m128i) vec_slo ((__v16qu) __A, __shift);
+#endif
+    }
+
+  const __v16qu __shiftA = vec_splats ((unsigned char) ((16 - __count) * 8));
+  const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+  __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
+  __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
+#else
+  __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
+  __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
+#endif
+  return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
+{
+  if (__count < 16)
+    {
+      __v2du __C = { __B, __A };
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+      const __v4su __shift = { __count << 3, 0, 0, 0 };
+      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
+#else
+      const __v4su __shift = { 0, 0, 0, __count << 3 };
+      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
+#endif
+      return (__m64) __C[0];
+    }
+  else
+    {
+      const __m64 __zero = { 0 };
+      return __zero;
+    }
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_epi16 (__m128i __A, __m128i __B)
+{
+  const __v16qu __P = {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+  const __v16qu __Q = {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+  __v8hi __even = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
+  __v8hi __odd = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
+  return (__m128i) vec_add (__even, __odd);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_epi32 (__m128i __A, __m128i __B)
+{
+  const __v16qu __P = {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
+  const __v16qu __Q = {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
+  __v4si __even = vec_perm ((__v4si) __A, (__v4si) __B, __P);
+  __v4si __odd = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
+  return (__m128i) vec_add (__even, __odd);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pi16 (__m64 __A, __m64 __B)
+{
+  const __v16qu __P = {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
+  const __v16qu __Q = {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
+  __v8hi __C = (__v8hi) __builtin_pack_vector_int128 (__A, __B);
+  __v8hi __even = vec_perm (__C, __C, __P);
+  __v8hi __odd = vec_perm (__C, __C, __Q);
+  __C = vec_add (__even, __odd);
+  return (__m64) ((__v2du) __C)[1];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pi32 (__m64 __A, __m64 __B)
+{
+  const __v16qu __P = {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
+  const __v16qu __Q = {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
+  __v4si __C = (__v4si) __builtin_pack_vector_int128 (__A, __B);
+  __v4si __even = vec_perm (__C, __C, __P);
+  __v4si __odd = vec_perm (__C, __C, __Q);
+  __C = vec_add (__even, __odd);
+  return (__m64) ((__v2du) __C)[1];
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadds_epi16 (__m128i __A, __m128i __B)
+{
+  __v4si __C = { 0 }, __D = { 0 };
+  __C = vec_sum4s ((__v8hi) __A, __C);
+  __D = vec_sum4s ((__v8hi) __B, __D);
+  /* Elements 0-3 hold the pair sums of __A, elements 4-7 those of __B.  */
+  return (__m128i) vec_packs (__C, __D);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadds_pi16 (__m64 __A, __m64 __B)
+{
+  __v8hi __C = (__v8hi) __builtin_pack_vector_int128 (__A, __B);
+  const __v4si __zero = { 0 };
+  __v4si __sums = vec_sum4s (__C, __zero);
+  __C = vec_packs (__sums, __sums);
+  return (__m64) ((__v2du) __C)[1];
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_epi16 (__m128i __A, __m128i __B)
+{
+  const __v16qu __P = {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+  const __v16qu __Q = {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+  __v8hi __even = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
+  __v8hi __odd = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
+  return (__m128i) vec_sub (__even, __odd);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_epi32 (__m128i __A, __m128i __B)
+{
+  const __v16qu __P = {  0,  1,  2,  3,  8,  9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
+  const __v16qu __Q = {  4,  5,  6,  7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
+  /* Elements 0-1: pair differences of __A; elements 2-3: those of __B.  */
+  return (__m128i) vec_sub (vec_perm ((__v4si) __A, (__v4si) __B, __P),
+                            vec_perm ((__v4si) __A, (__v4si) __B, __Q));
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pi16 (__m64 __A, __m64 __B)
+{
+  __v8hi __C = (__v8hi) __builtin_pack_vector_int128 (__A, __B);
+  const __v16qu __P = {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
+  const __v16qu __Q = {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
+  __v8hi __even = vec_perm (__C, __C, __P);
+  __v8hi __odd = vec_perm (__C, __C, __Q);
+  __C = vec_sub (__even, __odd);
+  return (__m64) ((__v2du) __C)[1];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pi32 (__m64 __A, __m64 __B)
+{
+  __v4si __C = (__v4si) __builtin_pack_vector_int128 (__A, __B);
+  const __v16qu __P = {  0,  1,  2,  3,  8,  9, 10, 11,  0,  1,  2,  3,  8,  9, 10, 11 };
+  const __v16qu __Q = {  4,  5,  6,  7, 12, 13, 14, 15,  4,  5,  6,  7, 12, 13, 14, 15 };
+  __v4si __even = vec_perm (__C, __C, __P);
+  __v4si __odd = vec_perm (__C, __C, __Q);
+  __C = vec_sub (__even, __odd);
+  return (__m64) ((__v2du) __C)[1];
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubs_epi16 (__m128i __A, __m128i __B)
+{
+  const __v16qu __P = {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+  const __v16qu __Q = {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+  __v8hi __even = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
+  __v8hi __odd = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
+  return (__m128i) vec_subs (__even, __odd);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubs_pi16 (__m64 __A, __m64 __B)
+{
+  __v8hi __C = (__v8hi) __builtin_pack_vector_int128 (__A, __B);
+  const __v16qu __P = {  0,  1,  4,  5,  8,  9, 12, 13,  0,  1,  4,  5,  8,  9, 12, 13 };
+  const __v16qu __Q = {  2,  3,  6,  7, 10, 11, 14, 15,  2,  3,  6,  7, 10, 11, 14, 15 };
+  __v8hi __even = vec_perm (__C, __C, __P);
+  __v8hi __odd = vec_perm (__C, __C, __Q);
+  __C = vec_subs (__even, __odd);
+  return (__m64) ((__v2du) __C)[1];
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_epi8 (__m128i __A, __m128i __B)
+{
+  /* A selector byte in __B with bit 7 set yields zero in that position.  */
+  const __v16qi __zero = { 0 };
+  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
+  return (__m128i) vec_sel (__C, __zero, vec_cmplt ((__v16qi) __B, __zero));
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi8 (__m64 __A, __m64 __B)
+{
+  /* A selector byte in __B with bit 7 set yields zero in that position.  */
+  const __v16qi __zero = { 0 };
+  __v16qi __C = (__v16qi) __builtin_pack_vector_int128 (__A, __A);
+  __v16qi __D = (__v16qi) __builtin_pack_vector_int128 (__B, __B);
+  __C = vec_perm (__C, __C, (__v16qu) __D);
+  __C = vec_sel (__C, __zero, vec_cmplt (__D, __zero));
+  return (__m64) ((__v2du) (__C))[0];
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi8 (__m128i __A, __m128i __B)
+{
+  const __v16qi __zero = { 0 };
+  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
+  __v16qi __selectpos =
+    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
+  return (__m128i) vec_mul ((__v16qi) __A, vec_add (__selectneg, __selectpos));
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi16 (__m128i __A, __m128i __B)
+{
+  const __v8hi __zero = { 0 };
+  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
+  __v8hi __selectpos =
+    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
+  return (__m128i) vec_mul ((__v8hi) __A, vec_add (__selectneg, __selectpos));
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi32 (__m128i __A, __m128i __B)
+{
+  const __v4si __zero = { 0 };
+  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
+  __v4si __selectpos =
+    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
+  return (__m128i) vec_mul ((__v4si) __A, vec_add (__selectneg, __selectpos));
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi8 (__m64 __A, __m64 __B)
+{
+  /* Duplicate to 128 bits, apply _mm_sign_epi8, return doubleword 0.  */
+  __v16qi __C = (__v16qi) __builtin_pack_vector_int128 (__A, __A);
+  __v16qi __D = (__v16qi) __builtin_pack_vector_int128 (__B, __B);
+  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
+  return (__m64) ((__v2du) (__C))[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi16 (__m64 __A, __m64 __B)
+{
+  /* Duplicate to 128 bits, apply _mm_sign_epi16, return doubleword 0.  */
+  __v8hi __C = (__v8hi) __builtin_pack_vector_int128 (__A, __A);
+  __v8hi __D = (__v8hi) __builtin_pack_vector_int128 (__B, __B);
+  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
+  return (__m64) ((__v2du) (__C))[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi32 (__m64 __A, __m64 __B)
+{
+  /* Duplicate to 128 bits, apply _mm_sign_epi32, return doubleword 0.  */
+  __v4si __C = (__v4si) __builtin_pack_vector_int128 (__A, __A);
+  __v4si __D = (__v4si) __builtin_pack_vector_int128 (__B, __B);
+  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
+  return (__m64) ((__v2du) (__C))[0];
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddubs_epi16 (__m128i __A, __m128i __B)
+{
+  /* Bytes of __A are unsigned (zero-extended); bytes of __B are signed.  */
+  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
+  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
+  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
+  __v8hi __E = vec_unpackh ((__v16qi) __B);
+  __v8hi __F = vec_unpackl ((__v16qi) __B);
+  __C = vec_mul (__C, __E);
+  __D = vec_mul (__D, __F);
+  const __v16qu __odds  = {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+  const __v16qu __evens = {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+  return (__m128i) vec_adds (vec_perm (__C, __D, __odds),
+                             vec_perm (__C, __D, __evens));
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddubs_pi16 (__m64 __A, __m64 __B)
+{
+  /* Bytes of __A are unsigned (zero-extended); bytes of __B are signed.  */
+  __v8hi __C = (__v8hi) __builtin_pack_vector_int128 (__A, __A);
+  __C = vec_unpackl ((__v16qi) __C);
+  __v8hi __D = (__v8hi) __builtin_pack_vector_int128 (__B, __B);
+  __D = vec_unpackl ((__v16qi) __D);
+  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
+  __D = vec_mul (vec_and (__C, __unsigned), __D);
+  const __v16qu __odds  = {  0,  1,  4,  5,  8,  9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
+  const __v16qu __evens = {  2,  3,  6,  7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
+  __C = vec_perm (__D, __D, __odds);
+  __D = vec_perm (__D, __D, __evens);
+  __C = vec_adds (__C, __D);
+  return (__m64) ((__v2du) (__C))[0];
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
+{
+  const __v4su __shift = vec_splats ((unsigned int) 14);
+  const __v4si __ones = vec_splats ((signed int) 1);
+  __v4si __C = vec_unpackh ((__v8hi) __A);
+  __v4si __D = vec_unpackh ((__v8hi) __B);
+  __C = vec_mul (__C, __D);
+  __D = vec_unpackl ((__v8hi) __A);
+  __v4si __E = vec_unpackl ((__v8hi) __B);
+  __D = vec_mul (__D, __E);
+  __C = vec_sr (__C, __shift);
+  __C = vec_add (__C, __ones);
+  __C = vec_sr (__C, (__v4su) __ones);
+  __D = vec_sr (__D, __shift);
+  __D = vec_add (__D, __ones);
+  __D = vec_sr (__D, (__v4su) __ones);
+  return (__m128i) vec_pack (__C, __D);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
+{
+  const __v4su __shift = vec_splats ((unsigned int) 14);
+  const __v4si __ones = vec_splats ((signed int) 1);
+  __v4si __C = (__v4si) __builtin_pack_vector_int128 (__A, __A);
+  __C = vec_unpackh ((__v8hi) __C);
+  __v4si __D = (__v4si) __builtin_pack_vector_int128 (__B, __B);
+  __D = vec_unpackh ((__v8hi) __D);
+  __C = vec_mul (__C, __D);
+  __C = vec_sr (__C, __shift);
+  __C = vec_add (__C, __ones);
+  __C = vec_sr (__C, (__v4su) __ones);
+  __v8hi __E = vec_pack (__C, __D);
+  return (__m64) ((__v2du) (__E))[0];
+}
+
+#endif
Index: gcc/config.gcc
===================================================================
diff --git a/trunk/gcc/config.gcc b/trunk/gcc/config.gcc
--- a/trunk/gcc/config.gcc	(revision 265238)
+++ b/trunk/gcc/config.gcc	(working copy)
@@ -485,7 +485,7 @@ 
 	extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
 	extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
 	extra_headers="${extra_headers} mmintrin.h x86intrin.h"
-	extra_headers="${extra_headers} pmmintrin.h"
+	extra_headers="${extra_headers} pmmintrin.h tmmintrin.h"
 	extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h"
 	extra_headers="${extra_headers} amo.h"
 	case x$with_cpu in