
[x86] Avoid builtins for SSE/AVX2 immediate logical shifts

Message ID 201704241515.11173.linux@carewolf.com
State New

Commit Message

Allan Sandfeld Jensen April 24, 2017, 1:15 p.m. UTC
On Monday 24 April 2017, Jakub Jelinek wrote:
> On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote:
> > --- a/gcc/config/i386/avx2intrin.h
> > +++ b/gcc/config/i386/avx2intrin.h
> > @@ -667,7 +667,7 @@ extern __inline __m256i
> > 
> >  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> >  _mm256_slli_epi16 (__m256i __A, int __B)
> >  {
> > 
> > -  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> > +  return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) : _mm256_setzero_si256();
> > 
> >  }
> 
> What is the advantage of doing that when you replace one operation with
> several (&, <, ?:, <<)?
> I'd say instead we should fold the builtins if in the gimple fold target
> hook we see the shift count constant and can decide based on that.
> Or we could use __builtin_constant_p (__B) to decide whether to use
> the generic vector shifts or builtin, but that means larger IL.
> 
Okay, I have tried that, and I also made it more obvious how the intrinsics 
can become non-immediate shifts.
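
As a user-level sketch (not part of the patch), the two paths in the reworked header definitions select as follows: a compile-time-constant count takes the __builtin_constant_p branch and expands to a generic vector shift (or to a zero vector for out-of-range counts), while a runtime count falls back to the shift-by-register intrinsic.

/* Illustrative sketch only, not part of the patch.  */
#include <emmintrin.h>

__m128i shift_const (__m128i a)
{
  /* Constant count: __builtin_constant_p (3) is true, so this becomes
     (__m128i)((__v8hi)a << 3), which the middle end can fold further.  */
  return _mm_slli_epi16 (a, 3);
}

__m128i shift_var (__m128i a, int n)
{
  /* Variable count: falls back to
     _mm_sll_epi16 (a, _mm_cvtsi32_si128 (n)), i.e. a psllw with the
     count taken from a register.  */
  return _mm_slli_epi16 (a, n);
}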

Comments

Jakub Jelinek May 2, 2017, 10:11 a.m. UTC | #1
On Mon, Apr 24, 2017 at 03:15:11PM +0200, Allan Sandfeld Jensen wrote:
> Okay, I have tried that, and I also made it more obvious how the intrinsics 
> can become non-immediate shifts.
> 

> diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> index b58f5050db0..b9406550fc5 100644
> --- a/gcc/ChangeLog
> +++ b/gcc/ChangeLog
> @@ -1,3 +1,10 @@
> +2017-04-22  Allan Sandfeld Jensen  <sandfeld@kde.org>
> +
> +	* config/i386/emmintrin.h (_mm_slli_*, _mm_srli_*):
> +	Use vector intrinsics instead of builtins.
> +	* config/i386/avx2intrin.h (_mm256_slli_*, _mm256_srli_*):
> +	Use vector intrinsics instead of builtins.
> +
>  2017-04-21  Uros Bizjak  <ubizjak@gmail.com>
>  
>  	* config/i386/i386.md (*extzvqi_mem_rex64): Move above *extzv<mode>.
> diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h
> index 82f170a3d61..64ba52b244e 100644
> --- a/gcc/config/i386/avx2intrin.h
> +++ b/gcc/config/i386/avx2intrin.h
> @@ -665,13 +665,6 @@ _mm256_slli_si256 (__m256i __A, const int __N)
>  
>  extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_slli_epi16 (__m256i __A, int __B)
> -{
> -  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> -}
> -
> -extern __inline __m256i
> -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_sll_epi16 (__m256i __A, __m128i __B)
>  {
>    return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
> @@ -679,9 +672,11 @@ _mm256_sll_epi16 (__m256i __A, __m128i __B)
>  
>  extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> -_mm256_slli_epi32 (__m256i __A, int __B)
> +_mm256_slli_epi16 (__m256i __A, int __B)
>  {
> -  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
> +  if (__builtin_constant_p(__B))
> +    return ((unsigned int)__B < 16) ? (__m256i)((__v16hi)__A << __B) : _mm256_setzero_si256();
> +  return _mm256_sll_epi16(__A, _mm_cvtsi32_si128(__B));
>  }

The formatting is wrong, missing spaces before function names and opening (,
too long lines.  Also, you've removed some builtin uses like
__builtin_ia32_psllwi256 above, but haven't removed those builtins from the
compiler (unlike the intrinsics, the builtins are not supported and can be
removed).  But I guess the primary question is on Uros, do we
want to handle this in the *intrin.h headers and thus increase the size
of those (and their parsing time etc.), or do we want to handle this
in the target folders (tree as well as gimple one), where we'd convert
e.g. __builtin_ia32_psllwi256 to the shift if the shift count is constant.

	Jakub
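
As a rough illustration of the second option, here is a hypothetical sketch (not the actual GCC implementation; the helper name and structure are assumptions) of how a target gimple fold could turn __builtin_ia32_psllwi256 into a generic shift when the count is constant:

/* Hypothetical sketch: fold __builtin_ia32_psllwi256 from the
   TARGET_GIMPLE_FOLD_BUILTIN hook when the shift count is a known
   constant.  Out-of-range counts become a zero vector; in-range counts
   become a generic vector shift the middle end can optimize further.  */
static bool
fold_psllwi256_sketch (gimple_stmt_iterator *gsi)
{
  gimple *stmt = gsi_stmt (*gsi);
  tree arg0 = gimple_call_arg (stmt, 0);   /* the __v16hi operand */
  tree arg1 = gimple_call_arg (stmt, 1);   /* the shift count */
  tree lhs = gimple_call_lhs (stmt);

  if (!lhs || !tree_fits_uhwi_p (arg1))
    return false;   /* count not a usable constant: keep the builtin call */

  gimple *g;
  if (tree_to_uhwi (arg1) >= 16)
    g = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
  else
    g = gimple_build_assign (lhs, LSHIFT_EXPR, arg0, arg1);
  gsi_replace (gsi, g, false);
  return true;
}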
Allan Sandfeld Jensen May 2, 2017, 11:19 a.m. UTC | #2
On Tuesday 02 May 2017, Jakub Jelinek wrote:
> On Mon, Apr 24, 2017 at 03:15:11PM +0200, Allan Sandfeld Jensen wrote:
> > Okay, I have tried that, and I also made it more obvious how the
> > intrinsics can become non-immediate shifts.
> > 
> > 
> > diff --git a/gcc/ChangeLog b/gcc/ChangeLog
> > index b58f5050db0..b9406550fc5 100644
> > --- a/gcc/ChangeLog
> > +++ b/gcc/ChangeLog
> > @@ -1,3 +1,10 @@
> > +2017-04-22  Allan Sandfeld Jensen  <sandfeld@kde.org>
> > +
> > +	* config/i386/emmintrin.h (_mm_slli_*, _mm_srli_*):
> > +	Use vector intrinsics instead of builtins.
> > +	* config/i386/avx2intrin.h (_mm256_slli_*, _mm256_srli_*):
> > +	Use vector intrinsics instead of builtins.
> > +
> > 
> >  2017-04-21  Uros Bizjak  <ubizjak@gmail.com>
> >  
> >  	* config/i386/i386.md (*extzvqi_mem_rex64): Move above *extzv<mode>.
> > 
> > diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h
> > index 82f170a3d61..64ba52b244e 100644
> > --- a/gcc/config/i386/avx2intrin.h
> > +++ b/gcc/config/i386/avx2intrin.h
> > @@ -665,13 +665,6 @@ _mm256_slli_si256 (__m256i __A, const int __N)
> > 
> >  extern __inline __m256i
> >  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > 
> > -_mm256_slli_epi16 (__m256i __A, int __B)
> > -{
> > -  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> > -}
> > -
> > -extern __inline __m256i
> > -__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > 
> >  _mm256_sll_epi16 (__m256i __A, __m128i __B)
> >  {
> >  
> >    return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
> > 
> > @@ -679,9 +672,11 @@ _mm256_sll_epi16 (__m256i __A, __m128i __B)
> > 
> >  extern __inline __m256i
> >  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > 
> > -_mm256_slli_epi32 (__m256i __A, int __B)
> > +_mm256_slli_epi16 (__m256i __A, int __B)
> > 
> >  {
> > 
> > -  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
> > +  if (__builtin_constant_p(__B))
> > +    return ((unsigned int)__B < 16) ? (__m256i)((__v16hi)__A << __B) : _mm256_setzero_si256();
> > +  return _mm256_sll_epi16(__A, _mm_cvtsi32_si128(__B));
> > 
> >  }
> 
> The formatting is wrong, missing spaces before function names and opening
> (, too long lines.  Also, you've removed some builtin uses like
> __builtin_ia32_psllwi256 above, but haven't removed those builtins from the
> compiler (unlike the intrinsics, the builtins are not supported and can be
> removed).  But I guess the primary question is on Uros, do we
> want to handle this in the *intrin.h headers and thus increase the size
> of those (and their parsing time etc.), or do we want to handle this
> in the target folders (tree as well as gimple one), where we'd convert
> e.g. __builtin_ia32_psllwi256 to the shift if the shift count is constant.
> 
Ok. I will await what you decide.

Btw, I thought of an alternative idea: make a new set of built-ins, called for 
instance __builtin_lshift and __builtin_rshift, that translate simply to 
GIMPLE shifts, just like the C/C++ shifts currently do, the only difference being 
that the new shifts (unlike C/C++ shifts) are defined for all shift sizes and for 
negative values.  With this, the variable-shift intrinsics could also be written 
without builtins.  Doing this would mean making a whole set of them for all 
integer types, though, and they would need to be implemented in the C parser like 
__builtin_shuffle, not with the other generic builtins.

Would that make sense?

Best regards
Allan
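
To make the proposed semantics concrete, here is a hypothetical model of a single 16-bit lane of such a __builtin_lshift (the builtin does not exist; the out-of-range behaviour shown, zero, matches the SSE/AVX2 shifts and the header code in this patch, but is just one possible definition):

/* Hypothetical scalar model of one lane of the proposed __builtin_lshift:
   defined for every count, with counts that are negative or >= the
   element width yielding zero, like the hardware shift-by-register
   instructions.  */
static inline unsigned short
lane_lshift16 (unsigned short x, int count)
{
  if ((unsigned int) count >= 16)
    return 0;                          /* negative or too-large count */
  return (unsigned short) (x << count);
}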
Marc Glisse May 2, 2017, 3:48 p.m. UTC | #3
On Tue, 2 May 2017, Jakub Jelinek wrote:

> Also, you've removed some builtin uses like __builtin_ia32_psllwi256 
> above, but haven't removed those builtins from the compiler (unlike the 
> intrinsics, the builtins are not supported and can be removed).

When we changed previous intrinsics, the same issue came up, and Ada folks 
asked us to keep the builtins...

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b58f5050db0..b9406550fc5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@ 
+2017-04-22  Allan Sandfeld Jensen  <sandfeld@kde.org>
+
+	* config/i386/emmintrin.h (_mm_slli_*, _mm_srli_*):
+	Use vector intrinsics instead of builtins.
+	* config/i386/avx2intrin.h (_mm256_slli_*, _mm256_srli_*):
+	Use vector intrinsics instead of builtins.
+
 2017-04-21  Uros Bizjak  <ubizjak@gmail.com>
 
 	* config/i386/i386.md (*extzvqi_mem_rex64): Move above *extzv<mode>.
diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h
index 82f170a3d61..64ba52b244e 100644
--- a/gcc/config/i386/avx2intrin.h
+++ b/gcc/config/i386/avx2intrin.h
@@ -665,13 +665,6 @@  _mm256_slli_si256 (__m256i __A, const int __N)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_slli_epi16 (__m256i __A, int __B)
-{
-  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
-}
-
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_sll_epi16 (__m256i __A, __m128i __B)
 {
   return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
@@ -679,9 +672,11 @@  _mm256_sll_epi16 (__m256i __A, __m128i __B)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_slli_epi32 (__m256i __A, int __B)
+_mm256_slli_epi16 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 16) ? (__m256i)((__v16hi)__A << __B) : _mm256_setzero_si256();
+  return _mm256_sll_epi16(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m256i
@@ -693,9 +688,11 @@  _mm256_sll_epi32 (__m256i __A, __m128i __B)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_slli_epi64 (__m256i __A, int __B)
+_mm256_slli_epi32 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 32) ? (__m256i)((__v8si)__A << __B) : _mm256_setzero_si256();
+  return _mm256_sll_epi32(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m256i
@@ -707,6 +704,15 @@  _mm256_sll_epi64 (__m256i __A, __m128i __B)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi64 (__m256i __A, int __B)
+{
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 64) ? (__m256i)((__v4di)__A << __B) : _mm256_setzero_si256();
+  return _mm256_sll_epi64(__A, _mm_cvtsi32_si128(__B));
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_srai_epi16 (__m256i __A, int __B)
 {
   return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
@@ -756,13 +762,6 @@  _mm256_srli_si256 (__m256i __A, const int __N)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_srli_epi16 (__m256i __A, int __B)
-{
-  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
-}
-
-extern __inline __m256i
-__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_srl_epi16 (__m256i __A, __m128i __B)
 {
   return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
@@ -770,9 +769,11 @@  _mm256_srl_epi16 (__m256i __A, __m128i __B)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_srli_epi32 (__m256i __A, int __B)
+_mm256_srli_epi16 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 16) ? (__m256i)((__v16hu)__A >> __B) : _mm256_setzero_si256();
+  return _mm256_srl_epi16(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m256i
@@ -784,9 +785,11 @@  _mm256_srl_epi32 (__m256i __A, __m128i __B)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
-_mm256_srli_epi64 (__m256i __A, int __B)
+_mm256_srli_epi32 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 32) ? (__m256i)((__v8su)__A >> __B) : _mm256_setzero_si256();
+  return _mm256_srl_epi32(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m256i
@@ -798,6 +801,15 @@  _mm256_srl_epi64 (__m256i __A, __m128i __B)
 
 extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi64 (__m256i __A, int __B)
+{
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 64) ? (__m256i)((__v4du)__A >> __B) : _mm256_setzero_si256();
+  return _mm256_srl_epi64(__A, _mm_cvtsi32_si128(__B));
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_sub_epi8 (__m256i __A, __m256i __B)
 {
   return (__m256i) ((__v32qu)__A - (__v32qu)__B);
diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index 828f4a07a9b..419041e2acb 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -903,6 +903,28 @@  _mm_cvtss_sd (__m128d __A, __m128 __B)
   return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
 }
 
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_si128 (int __A)
+{
+  return _mm_set_epi32 (0, 0, 0, __A);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_si128 (long long __A)
+{
+  return _mm_set_epi64x (0, __A);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_si128 (long long __A)
+{
+  return _mm_set_epi64x (0, __A);
+}
+#endif
+
 #ifdef __OPTIMIZE__
 extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
@@ -1138,21 +1160,75 @@  _mm_mul_epu32 (__m128i __A, __m128i __B)
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 16) ? (__m128i)((__v8hi)__A << __B) : _mm_setzero_si128();
+  return _mm_sll_epi16(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 32) ? (__m128i)((__v4si)__A << __B) : _mm_setzero_si128();
+  return _mm_sll_epi32(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi64 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 64) ? (__m128i)((__v2di)__A << __B) : _mm_setzero_si128();
+  return _mm_sll_epi64(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1205,67 +1281,25 @@  _mm_slli_si128 (__m128i __A, const int __N)
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 16) ? (__m128i)((__v8hu)__A >> __B) : _mm_setzero_si128();
+  return _mm_srl_epi16(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 32) ? (__m128i)((__v4su)__A >> __B) : _mm_setzero_si128();
+  return _mm_srl_epi32(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi64 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sll_epi16 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sll_epi32 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sll_epi64 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sra_epi16 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_sra_epi32 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srl_epi16 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srl_epi32 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
-}
-
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_srl_epi64 (__m128i __A, __m128i __B)
-{
-  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
+  if (__builtin_constant_p(__B))
+    return ((unsigned int)__B < 64) ? (__m128i)((__v2du)__A >> __B) : _mm_setzero_si128();
+  return _mm_srl_epi64(__A, _mm_cvtsi32_si128(__B));
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1497,28 +1531,6 @@  _mm_mfence (void)
   __builtin_ia32_mfence ();
 }
 
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi32_si128 (int __A)
-{
-  return _mm_set_epi32 (0, 0, 0, __A);
-}
-
-#ifdef __x86_64__
-/* Intel intrinsic.  */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi64_si128 (long long __A)
-{
-  return _mm_set_epi64x (0, __A);
-}
-
-/* Microsoft intrinsic.  */
-extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_cvtsi64x_si128 (long long __A)
-{
-  return _mm_set_epi64x (0, __A);
-}
-#endif
-
 /* Casts between various SP, DP, INT vector types.  Note that these do no
    conversion of values, they just change the type.  */
 extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 6f4dc8d5095..a4470730ac6 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,10 @@ 
+2017-04-22  Allan Sandfeld Jensen  <sandfeld@kde.org>
+
+	* gcc.target/i386/sse2-shifts-1.c: New testcases of shift intrinsics
+	producing intended instructions.
+	* gcc.target/i386/sse2-shifts-2.c: New testcase of shift intrinsics
+	being folded.
+
 2017-04-21  Janus Weil  <janus@gcc.gnu.org>
 
 	PR fortran/80392
diff --git a/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c b/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c
new file mode 100644
index 00000000000..a2305cf042a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c
@@ -0,0 +1,54 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-require-effective-target sse2 } */
+
+#include <emmintrin.h>
+
+__m128i test1(__m128i a)
+{
+    return _mm_slli_epi16(a, 9);
+}
+
+__m128i test2(__m128i a)
+{
+    return _mm_slli_epi32(a, 13);
+}
+
+__m128i test3(__m128i a)
+{
+    return _mm_slli_epi64(a, 17);
+}
+
+__m128i test4(__m128i a)
+{
+    return _mm_srli_epi16(a, 9);
+}
+
+__m128i test5(__m128i a)
+{
+    return _mm_srli_epi32(a, 13);
+}
+
+__m128i test6(__m128i a)
+{
+    return _mm_srli_epi64(a, 7);
+}
+
+__m128i test7(__m128i a)
+{
+    return _mm_srai_epi16(a, 3);
+}
+
+__m128i test8(__m128i a)
+{
+    return _mm_srai_epi32(a, 6);
+}
+
+/* { dg-final { scan-assembler "psllw" } } */
+/* { dg-final { scan-assembler "pslld" } } */
+/* { dg-final { scan-assembler "psllq" } } */
+/* { dg-final { scan-assembler "psrlw" } } */
+/* { dg-final { scan-assembler "psrld" } } */
+/* { dg-final { scan-assembler "psrlq" } } */
+/* { dg-final { scan-assembler "psraw" } } */
+/* { dg-final { scan-assembler "psrad" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c b/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c
new file mode 100644
index 00000000000..ce05a7dc44e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c
@@ -0,0 +1,27 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+#include <emmintrin.h>
+
+__m128i test1(__m128i a)
+{
+    a = _mm_slli_epi16(a, 2);
+    return _mm_slli_epi16(a, 3);
+}
+/* { dg-final { scan-assembler "psllw.*5"} } */
+
+__m128i test3(__m128i a)
+{
+    a = _mm_srli_epi16(a, 4);
+    return _mm_srli_epi16(a, 9);
+}
+/* { dg-final { scan-assembler-times "psrlw" 1} } */
+
+__m128i test4(__m128i a)
+{
+    a = _mm_setr_epi32(128, 255, 86, 23);
+    return _mm_srli_epi32(a, 8);
+}
+/* { dg-final { scan-assembler-not "psrld"} } */
+