
[v2,rs6000,1/4] Fixes for x86 intrinsics on POWER 32bit

Message ID 8f0889c7-06ac-ec10-23a1-a036f79860f6@us.ibm.com
State New

Commit Message

Paul A. Clarke Oct. 25, 2018, 7:07 p.m. UTC
Various clean-ups for 32bit support.

Implement various corrections in the compatibility implementations of the
x86 vector intrinsics found after enabling 32bit mode for the associated
test cases.  (Actual enablement coming in a subsequent patch.)

Bootstrapped and tested on Linux POWER8 LE, POWER8 BE (64 & 32), and POWER7.

OK for trunk?

v2: This patch is new in v2.

2018-10-25  Paul A. Clarke  <pc@us.ibm.com>

gcc/ChangeLog:

	* config/rs6000/mmintrin.h: Enable 32bit compilation.
	* config/rs6000/xmmintrin.h: Likewise.

Comments

Segher Boessenkool Oct. 25, 2018, 10:17 p.m. UTC | #1
On Thu, Oct 25, 2018 at 02:07:33PM -0500, Paul Clarke wrote:
> Various clean-ups for 32bit support.
> 
> Implement various corrections in the compatibility implementations of the
> x86 vector intrinsics found after enabling 32bit mode for the associated
> test cases.  (Actual enablement coming in a subsequent patch.)

So what happened on 32-bit before?  (After you get rid of the #ifdef of
course).  It isn't clear to me.


Segher


> 2018-10-25  Paul A. Clarke  <pc@us.ibm.com>
> 
> gcc/ChangeLog:
> 
> 	* config/rs6000/mmintrin.h: Enable 32bit compilation.
> 	* config/rs6000/xmmintrin.h: Likewise.
Paul A. Clarke Oct. 26, 2018, 1:06 p.m. UTC | #2
On 10/25/2018 05:17 PM, Segher Boessenkool wrote:
> On Thu, Oct 25, 2018 at 02:07:33PM -0500, Paul Clarke wrote:
>> Various clean-ups for 32bit support.
>>
>> Implement various corrections in the compatibility implementations of the
>> x86 vector intrinsics found after enabling 32bit mode for the associated
>> test cases.  (Actual enablement coming in a subsequent patch.)
> 
> So what happened on 32-bit before?  (After you get rid of the #ifdef of
> course).  It isn't clear to me.

Most of the changes remove the dependency on __int128 support, because with '-m32' errors like the following were reported:
/opt/at12.0/lib/gcc/powerpc64-linux-gnu/8.2.1/include/xmmintrin.h:992:61: error: ‘__int128’ is not supported on this target
   return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));

That error prompted the many changes like:
> -  vm1 = (__vector signed short)__builtin_pack_vector_int128 (__m2, __m1);
> +  vm1 = (__vector signed short) (__vector unsigned long long) { __m2, __m1 };
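
For reference, a minimal standalone sketch of that substitution (function and variable names here are illustrative, not from the patch): a vector literal builds the 128-bit value from two 64-bit halves, and a subscript extracts one half, so no __int128 type is needed (compile with -maltivec):

#include <altivec.h>

/* Illustrative only; the real code operates on __m64 from mmintrin.h.  */
static inline unsigned long long
pack_then_unpack (unsigned long long a, unsigned long long b)
{
  /* Was: __builtin_pack_vector_int128 (a, b).  */
  __vector unsigned long long v = { a, b };
  /* Was: __builtin_unpack_vector_int128 ((__vector __int128) v, 0).  */
  return v[0];
}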

Similarly, this warning was reported:
./xmmintrin.h:1620:21: warning: conversion from ‘long long unsigned int’ to ‘long unsigned int’ changes value from ‘2269495618449464’ to ‘539504696’ [-Woverflow]
   unsigned long p = 0x0008101820283038UL; // permute control for sign bits
                    ^~~~~~~~~~~~~~~~~~~~
prompting:
> -  unsigned long p = 0x0008101820283038UL; // permute control for sign bits
> +  unsigned long long p = 0x0008101820283038UL; // permute control for sign bits
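
For reference, a minimal standalone sketch (not from the patch) of why the constant truncates: on powerpc with -m32, 'unsigned long' is 32 bits, while 'unsigned long long' is 64 bits under both -m32 and -m64.

#include <stdio.h>

int
main (void)
{
  unsigned long      p32 = 0x0008101820283038UL;  /* truncated to 32 bits with -m32 (-Woverflow warns) */
  unsigned long long p64 = 0x0008101820283038ULL; /* 64 bits under both -m32 and -m64 */

  printf ("sizeof p32 = %zu, sizeof p64 = %zu\n", sizeof p32, sizeof p64);
  return 0;
}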

But if you are asking what happened with the GCC testsuite: since all of the tests were marked "lp64", they were simply skipped as UNSUPPORTED with -m32.

PC
Segher Boessenkool Oct. 26, 2018, 5:01 p.m. UTC | #3
On Fri, Oct 26, 2018 at 08:06:28AM -0500, Paul Clarke wrote:
> On 10/25/2018 05:17 PM, Segher Boessenkool wrote:
> > On Thu, Oct 25, 2018 at 02:07:33PM -0500, Paul Clarke wrote:
> >> Various clean-ups for 32bit support.
> >>
> >> Implement various corrections in the compatibility implementations of the
> >> x86 vector intrinsics found after enabling 32bit mode for the associated
> >> test cases.  (Actual enablement coming in a subsequent patch.)
> > 
> > So what happened on 32-bit before?  (After you get rid of the #ifdef of
> > course).  It isn't clear to me.
> 
> Most of the changes are to remove dependency on int128 support, because with '-m32', errors were reported:
> /opt/at12.0/lib/gcc/powerpc64-linux-gnu/8.2.1/include/xmmintrin.h:992:61: error: ‘__int128’ is not supported on this target
>    return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
> 
> Prompted the many changes like:
> > -  vm1 = (__vector signed short)__builtin_pack_vector_int128 (__m2, __m1);
> > +  vm1 = (__vector signed short) (__vector unsigned long long) { __m2, __m1 };

Ah, okay.  And you have tested this works correctly both on BE and LE, right?
Okay for trunk then.  Thanks!


Segher

Patch

Index: gcc/config/rs6000/mmintrin.h
===================================================================
diff --git a/trunk/gcc/config/rs6000/mmintrin.h b/trunk/gcc/config/rs6000/mmintrin.h
--- a/trunk/gcc/config/rs6000/mmintrin.h	(revision 265495)
+++ b/trunk/gcc/config/rs6000/mmintrin.h	(working copy)
@@ -112,7 +112,6 @@ 
   return _mm_cvtsi64_si32 (__i);
 }
 
-#ifdef __powerpc64__
 /* Convert I to a __m64 object.  */
 
 /* Intel intrinsic.  */
@@ -173,9 +172,9 @@ 
   __vector signed short vm1;
   __vector signed char vresult;
 
-  vm1 = (__vector signed short)__builtin_pack_vector_int128 (__m2, __m1);
+  vm1 = (__vector signed short) (__vector unsigned long long) { __m2, __m1 };
   vresult = vec_vpkshss (vm1, vm1);
-  return (__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0);
+  return (__m64) ((vector long long) vresult)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -193,9 +192,9 @@ 
   __vector signed int vm1;
   __vector signed short vresult;
 
-  vm1 = (__vector signed int)__builtin_pack_vector_int128 (__m2, __m1);
+  vm1 = (__vector signed int) (__vector unsigned long long) { __m2, __m1 };
   vresult = vec_vpkswss (vm1, vm1);
-  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0));
+  return (__m64) ((vector long long) vresult)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -213,9 +212,9 @@ 
   __vector signed short vm1;
   __vector unsigned char vresult;
 
-  vm1 = (__vector signed short)__builtin_pack_vector_int128 (__m2, __m1);
+  vm1 = (__vector signed short) (__vector unsigned long long) { __m2, __m1 };
   vresult = vec_vpkshus (vm1, vm1);
-  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)vresult, 0));
+  return (__m64) ((vector long long) vresult)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -236,7 +235,7 @@ 
   a = (__vector unsigned char)vec_splats (__m1);
   b = (__vector unsigned char)vec_splats (__m2);
   c = vec_mergel (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -317,7 +316,7 @@ 
   a = (__vector unsigned char)vec_splats (__m1);
   b = (__vector unsigned char)vec_splats (__m2);
   c = vec_mergel (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 1));
+  return (__m64) ((vector long long) c)[1];
 #else
   __m64_union m1, m2, res;
 
@@ -398,7 +397,7 @@ 
   a = (__vector signed char)vec_splats (__m1);
   b = (__vector signed char)vec_splats (__m2);
   c = vec_add (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -434,7 +433,7 @@ 
   a = (__vector signed short)vec_splats (__m1);
   b = (__vector signed short)vec_splats (__m2);
   c = vec_add (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -466,7 +465,7 @@ 
   a = (__vector signed int)vec_splats (__m1);
   b = (__vector signed int)vec_splats (__m2);
   c = vec_add (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -496,7 +495,7 @@ 
   a = (__vector signed char)vec_splats (__m1);
   b = (__vector signed char)vec_splats (__m2);
   c = vec_sub (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -532,7 +531,7 @@ 
   a = (__vector signed short)vec_splats (__m1);
   b = (__vector signed short)vec_splats (__m2);
   c = vec_sub (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -564,7 +563,7 @@ 
   a = (__vector signed int)vec_splats (__m1);
   b = (__vector signed int)vec_splats (__m2);
   c = vec_sub (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -754,7 +753,7 @@ 
   a = (__vector signed char)vec_splats (__m1);
   b = (__vector signed char)vec_splats (__m2);
   c = (__vector signed char)vec_cmpgt (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -791,7 +790,7 @@ 
   a = (__vector signed short)vec_splats (__m1);
   b = (__vector signed short)vec_splats (__m2);
   c = (__vector signed short)vec_cmpeq (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -822,7 +821,7 @@ 
   a = (__vector signed short)vec_splats (__m1);
   b = (__vector signed short)vec_splats (__m2);
   c = (__vector signed short)vec_cmpgt (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -855,7 +854,7 @@ 
   a = (__vector signed int)vec_splats (__m1);
   b = (__vector signed int)vec_splats (__m2);
   c = (__vector signed int)vec_cmpeq (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -884,7 +883,7 @@ 
   a = (__vector signed int)vec_splats (__m1);
   b = (__vector signed int)vec_splats (__m2);
   c = (__vector signed int)vec_cmpgt (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -915,7 +914,7 @@ 
   a = (__vector signed char)vec_splats (__m1);
   b = (__vector signed char)vec_splats (__m2);
   c = vec_adds (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -933,7 +932,7 @@ 
   a = (__vector signed short)vec_splats (__m1);
   b = (__vector signed short)vec_splats (__m2);
   c = vec_adds (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -951,7 +950,7 @@ 
   a = (__vector unsigned char)vec_splats (__m1);
   b = (__vector unsigned char)vec_splats (__m2);
   c = vec_adds (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -970,7 +969,7 @@ 
   a = (__vector unsigned short)vec_splats (__m1);
   b = (__vector unsigned short)vec_splats (__m2);
   c = vec_adds (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -989,7 +988,7 @@ 
   a = (__vector signed char)vec_splats (__m1);
   b = (__vector signed char)vec_splats (__m2);
   c = vec_subs (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1008,7 +1007,7 @@ 
   a = (__vector signed short)vec_splats (__m1);
   b = (__vector signed short)vec_splats (__m2);
   c = vec_subs (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1027,7 +1026,7 @@ 
   a = (__vector unsigned char)vec_splats (__m1);
   b = (__vector unsigned char)vec_splats (__m2);
   c = vec_subs (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1046,7 +1045,7 @@ 
   a = (__vector unsigned short)vec_splats (__m1);
   b = (__vector unsigned short)vec_splats (__m2);
   c = vec_subs (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1068,7 +1067,7 @@ 
   a = (__vector signed short)vec_splats (__m1);
   b = (__vector signed short)vec_splats (__m2);
   c = vec_vmsumshm (a, b, zero);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1096,7 +1095,7 @@ 
   w1 = vec_vmulosh (a, b);
   c = (__vector signed short)vec_perm (w0, w1, xform1);
 
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1115,7 +1114,7 @@ 
   a = (__vector signed short)vec_splats (__m1);
   b = (__vector signed short)vec_splats (__m2);
   c = a * b;
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1136,7 +1135,7 @@ 
       m = (__vector signed short)vec_splats (__m);
       c = (__vector unsigned short)vec_splats ((unsigned short)__count);
       r = vec_sl (m, (__vector unsigned short)c);
-      return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
+      return (__m64) ((vector long long) r)[0];
     }
   else
   return (0);
@@ -1205,7 +1204,7 @@ 
 	m = (__vector signed short)vec_splats (__m);
 	c = (__vector unsigned short)vec_splats ((unsigned short)__count);
 	r = vec_sra (m, (__vector unsigned short)c);
-	return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
+        return (__m64) ((vector long long) r)[0];
     }
   else
   return (0);
@@ -1274,7 +1273,7 @@ 
 	m = (__vector unsigned short)vec_splats (__m);
 	c = (__vector unsigned short)vec_splats ((unsigned short)__count);
 	r = vec_sr (m, (__vector unsigned short)c);
-	return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
+        return (__m64) ((vector long long) r)[0];
     }
   else
     return (0);
@@ -1417,7 +1416,7 @@ 
   __vector signed short w;
 
   w = (__vector signed short)vec_splats (__w);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)w, 0));
+  return (__m64) ((vector long long) w)[0];
 #else
   __m64_union res;
 
@@ -1437,7 +1436,7 @@ 
   __vector signed char b;
 
   b = (__vector signed char)vec_splats (__b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)b, 0));
+  return (__m64) ((vector long long) b)[0];
 #else
   __m64_union res;
 
@@ -1452,5 +1451,4 @@ 
   return (res.as_m64);
 #endif
 }
-#endif /* __powerpc64__ */
 #endif /* _MMINTRIN_H_INCLUDED */
Index: gcc/config/rs6000/xmmintrin.h
===================================================================
diff --git a/trunk/gcc/config/rs6000/xmmintrin.h b/trunk/gcc/config/rs6000/xmmintrin.h
--- a/trunk/gcc/config/rs6000/xmmintrin.h	(revision 265495)
+++ b/trunk/gcc/config/rs6000/xmmintrin.h	(working copy)
@@ -996,7 +996,7 @@ 
   rounded = vec_rint(temp);
   result = (__vector unsigned long long) vec_cts (rounded, 0);
 
-  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
+  return (__m64) ((vector long long) result)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1053,7 +1053,7 @@ 
   temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
   result = (__vector unsigned long long) vec_cts (temp, 0);
 
-  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
+  return (__m64) ((vector long long) result)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1104,7 +1104,7 @@ 
   __vector signed int vm1;
   __vector float vf1;
 
-  vm1 = (__vector signed int) __builtin_pack_vector_int128 (__B, __B);
+  vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
   vf1 = (__vector float) vec_ctf (vm1, 0);
 
   return ((__m128) (__vector unsigned long long)
@@ -1126,7 +1126,7 @@ 
   __vector signed int vi4;
   __vector float vf1;
 
-  vs8 = (__vector signed short) __builtin_pack_vector_int128 (__A, __A);
+  vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
   vi4 = vec_vupklsh (vs8);
   vf1 = (__vector float) vec_ctf (vi4, 0);
 
@@ -1143,7 +1143,7 @@ 
   __vector unsigned int vi4;
   __vector float vf1;
 
-  vs8 = (__vector unsigned short) __builtin_pack_vector_int128 (__A, __A);
+  vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
   vi4 = (__vector unsigned int) vec_vmrglh (vs8, zero);
   vf1 = (__vector float) vec_ctf (vi4, 0);
 
@@ -1159,7 +1159,7 @@ 
   __vector signed int vi4;
   __vector float vf1;
 
-  vc16 = (__vector signed char) __builtin_pack_vector_int128 (__A, __A);
+  vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
   vs8 = vec_vupkhsb (vc16);
   vi4 = vec_vupkhsh (vs8);
   vf1 = (__vector float) vec_ctf (vi4, 0);
@@ -1179,7 +1179,7 @@ 
   __vector unsigned int vi4;
   __vector float vf1;
 
-  vc16 = (__vector unsigned char) __builtin_pack_vector_int128 (__A, __A);
+  vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
   vs8 = (__vector unsigned short) vec_vmrglb (vc16, zero);
   vi4 = (__vector unsigned int) vec_vmrghh (vs8,
 					    (__vector unsigned short) zero);
@@ -1195,7 +1195,7 @@ 
   __vector signed int vi4;
   __vector float vf4;
 
-  vi4 = (__vector signed int) __builtin_pack_vector_int128 (__B, __A);
+  vi4 = (__vector signed int) (__vector unsigned long long) { __B, __A };
   vf4 = (__vector float) vec_ctf (vi4, 0);
   return (__m128) vf4;
 }
@@ -1212,7 +1212,7 @@ 
   temp = vec_cts (rounded, 0);
   result = (__vector unsigned long long) vec_pack (temp, temp);
 
-  return ((__m64) __builtin_unpack_vector_int128 ((__vector __int128)result, 0));
+  return (__m64) ((vector long long) result)[0];
 }
 
 /* Convert the four SPFP values in A to four signed 8-bit integers.  */
@@ -1224,15 +1224,12 @@ 
   static const __vector signed int zero = {0, 0, 0, 0};
   __vector signed short tmp_s;
   __vector signed char res_v;
-  __m64 result;
 
   rounded = vec_rint(__A);
   tmp_i = vec_cts (rounded, 0);
   tmp_s = vec_pack (tmp_i, zero);
   res_v = vec_pack (tmp_s, tmp_s);
-  result = (__m64) __builtin_unpack_vector_int128 ((__vector __int128)res_v, 0);
-
-  return (result);
+  return (__m64) ((vector long long) res_v)[0];
 }
 
 /* Selects four specific SPFP values from A and B based on MASK.  */
@@ -1386,9 +1383,12 @@ 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_extract_pi16 (__m64 const __A, int const __N)
 {
-  const int shiftr = (__N & 3) * 16;
+  unsigned int shiftr = __N & 3;
+#ifdef __BIG_ENDIAN__
+  shiftr = 3 - shiftr;
+#endif
 
-  return ((__A >> shiftr) & 0xffff);
+  return ((__A >> (shiftr * 16)) & 0xffff);
 }
 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1429,7 +1429,7 @@ 
   b = (__vector signed short)vec_splats (__B);
   c = (__vector __bool short)vec_cmpgt (a, b);
   r = vec_sel (b, a, c);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
+  return (__m64) ((vector long long) r)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -1467,7 +1467,7 @@ 
   b = (__vector unsigned char)vec_splats (__B);
   c = (__vector __bool char)vec_cmpgt (a, b);
   r = vec_sel (b, a, c);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
+  return (__m64) ((vector long long) r)[0];
 #else
   __m64_union m1, m2, res;
   long i;
@@ -1503,7 +1503,7 @@ 
   b = (__vector signed short)vec_splats (__B);
   c = (__vector __bool short)vec_cmplt (a, b);
   r = vec_sel (b, a, c);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
+  return (__m64) ((vector long long) r)[0];
 #else
   __m64_union m1, m2, res;
 
@@ -1541,7 +1541,7 @@ 
   b = (__vector unsigned char)vec_splats (__B);
   c = (__vector __bool char)vec_cmplt (a, b);
   r = vec_sel (b, a, c);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
+  return (__m64) ((vector long long) r)[0];
 #else
   __m64_union m1, m2, res;
   long i;
@@ -1569,7 +1569,7 @@ 
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_movemask_pi8 (__m64 __A)
 {
-  unsigned long p = 0x0008101820283038UL; // permute control for sign bits
+  unsigned long long p = 0x0008101820283038UL; // permute control for sign bits
 
   return __builtin_bpermd (p, __A);
 }
@@ -1600,7 +1600,7 @@ 
   w1 = vec_vmulouh (a, b);
   c = (__vector unsigned short)vec_perm (w0, w1, xform1);
 
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1643,7 +1643,7 @@ 
   p = vec_splats (t.as_m64);
   a = vec_splats (__A);
   r = vec_perm (a, a, (__vector unsigned char)p);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)r, 0));
+  return (__m64) ((vector long long) r)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1683,7 +1683,7 @@ 
   a = (__vector unsigned char)vec_splats (__A);
   b = (__vector unsigned char)vec_splats (__B);
   c = vec_avg (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1701,7 +1701,7 @@ 
   a = (__vector unsigned short)vec_splats (__A);
   b = (__vector unsigned short)vec_splats (__B);
   c = vec_avg (a, b);
-  return (__builtin_unpack_vector_int128 ((__vector __int128)c, 0));
+  return (__m64) ((vector long long) c)[0];
 }
 
 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1723,8 +1723,8 @@ 
     { 0, 0, 0, 0 };
   unsigned short result;
 
-  a = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __A);
-  b = (__vector unsigned char) __builtin_pack_vector_int128 (0UL, __B);
+  a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
+  b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
   vmin = vec_min (a, b);
   vmax = vec_max (a, b);
   vabsdiff = vec_sub (vmax, vmin);