diff mbox

[x86] Avoid builtins for SSE/AVX2 immediate logical shifts

Message ID 201704240933.09704.linux@carewolf.com
State New
Headers show

Commit Message

Allan Sandfeld Jensen April 24, 2017, 7:33 a.m. UTC
On Saturday 22 April 2017, Allan Sandfeld Jensen wrote:
> Replaces definitions of immediate logical shift intrinsics with GCC
> extension syntax. Tests are added to ensure the intrinsics still produce
> the right instructions and that a few basic optimizations now work.
> 
> Compared to the earlier version of the patch, all potentially undefined
> shifts are now avoided, which also means no variable shifts or arithmetic
> right shifts.

Fixed 2 errors in the tests.

Comments

Jakub Jelinek April 24, 2017, 7:43 a.m. UTC | #1
On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote:
> --- a/gcc/config/i386/avx2intrin.h
> +++ b/gcc/config/i386/avx2intrin.h
> @@ -667,7 +667,7 @@ extern __inline __m256i
>  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
>  _mm256_slli_epi16 (__m256i __A, int __B)
>  {
> -  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> +  return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) : _mm256_setzero_si256();
>  }

What is the advantage of doing that when you replace one operation with
several (&, <, ?:, <<)?
I'd say instead we should fold the builtins if in the gimple fold target
hook we see the shift count constant and can decide based on that.
Or we could use __builtin_constant_p (__B) to decide whether to use
the generic vector shifts or builtin, but that means larger IL.

	Jakub
Allan Sandfeld Jensen April 24, 2017, 7:51 a.m. UTC | #2
On Monday 24 April 2017, Jakub Jelinek wrote:
> On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote:
> > --- a/gcc/config/i386/avx2intrin.h
> > +++ b/gcc/config/i386/avx2intrin.h
> > @@ -667,7 +667,7 @@ extern __inline __m256i
> > 
> >  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> >  _mm256_slli_epi16 (__m256i __A, int __B)
> >  {
> > 
> > -  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> > +  return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) :
> > _mm256_setzero_si256();
> > 
> >  }
> 
> What is the advantage of doing that when you replace one operation with
> several (&, <, ?:, <<)?
> I'd say instead we should fold the builtins if in the gimple fold target
> hook we see the shift count constant and can decide based on that.
> Or we could use __builtin_constant_p (__B) to decide whether to use
> the generic vector shifts or builtin, but that means larger IL.

The advantage is that in this builtin, the __B is always a literal (or 
constexpr), so the if statement is resolved at compile time.

`Allan
Jakub Jelinek April 24, 2017, 7:56 a.m. UTC | #3
On Mon, Apr 24, 2017 at 09:51:29AM +0200, Allan Sandfeld Jensen wrote:
> On Monday 24 April 2017, Jakub Jelinek wrote:
> > On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote:
> > > --- a/gcc/config/i386/avx2intrin.h
> > > +++ b/gcc/config/i386/avx2intrin.h
> > > @@ -667,7 +667,7 @@ extern __inline __m256i
> > > 
> > >  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > >  _mm256_slli_epi16 (__m256i __A, int __B)
> > >  {
> > > 
> > > -  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> > > +  return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) :
> > > _mm256_setzero_si256();
> > > 
> > >  }
> > 
> > What is the advantage of doing that when you replace one operation with
> > several (&, <, ?:, <<)?
> > I'd say instead we should fold the builtins if in the gimple fold target
> > hook we see the shift count constant and can decide based on that.
> > Or we could use __builtin_constant_p (__B) to decide whether to use
> > the generic vector shifts or builtin, but that means larger IL.
> 
> The advantage is that in this builtin, the __B is always a literal (or 
> constexpr), so the if statement is resolved at compile time.

Do we really want to support all the thousands _mm* intrinsics in constexpr
contexts?  People can just use generic vectors instead.

That said, both the options I've mentioned above provide the same advantages
and don't have the disadvantages of pessimizing normal code.

	Jakub
Allan Sandfeld Jensen April 24, 2017, 8:02 a.m. UTC | #4
On Monday 24 April 2017, Jakub Jelinek wrote:
> On Mon, Apr 24, 2017 at 09:51:29AM +0200, Allan Sandfeld Jensen wrote:
> > On Monday 24 April 2017, Jakub Jelinek wrote:
> > > On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote:
> > > > --- a/gcc/config/i386/avx2intrin.h
> > > > +++ b/gcc/config/i386/avx2intrin.h
> > > > @@ -667,7 +667,7 @@ extern __inline __m256i
> > > > 
> > > >  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
> > > >  _mm256_slli_epi16 (__m256i __A, int __B)
> > > >  {
> > > > 
> > > > -  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
> > > > +  return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B &
> > > > 0xff)) : _mm256_setzero_si256();
> > > > 
> > > >  }
> > > 
> > > What is the advantage of doing that when you replace one operation with
> > > several (&, <, ?:, <<)?
> > > I'd say instead we should fold the builtins if in the gimple fold
> > > target hook we see the shift count constant and can decide based on
> > > that. Or we could use __builtin_constant_p (__B) to decide whether to
> > > use the generic vector shifts or builtin, but that means larger IL.
> > 
> > The advantage is that in this builtin, the __B is always a literal (or
> > constexpr), so the if statement is resolved at compile time.
> 
> Do we really want to support all the thousands _mm* intrinsics in constexpr
> contexts?  People can just use generic vectors instead.
> 
I would love to support it, but first we need a C extension attribute matching 
constexpr, and I consider it a separate issue.

> That said, both the options I've mentioned above provide the same
> advantages and don't have the disadvantages of pessimizing normal code.
> 
What pessimizing? This produces the same or better code for all legal 
arguments. The only difference besides better generated code is that it allows 
the intrinsics to be used incorrectly with non-literal arguments because we 
lack the C-extension for constexpr to prevent that.

`Allan
Jakub Jelinek April 24, 2017, 8:25 a.m. UTC | #5
On Mon, Apr 24, 2017 at 10:02:40AM +0200, Allan Sandfeld Jensen wrote:
> > That said, both the options I've mentioned above provide the same
> > advantages and don't have the disadvantages of pessimizing normal code.
> > 
> What pessimizing? This produce the same or better code for all legal 
> arguments. The only difference besides better generated code is that it allows 

No.  Have you really tried that?

> the intrinsics to be used incorrectly with non-literal arguments because we 
> lack the C-extension for constexp to prevent that.

Consider e.g. -O2 -mavx2 -mtune=intel:
#include <x86intrin.h>

__m256i
foo (__m256i x, int s)
{
  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)x, s);
}

__m256i
bar (__m256i x, int s)
{
  return ((s & 0xff) < 16) ? (__m256i)((__v16hi)x << (s & 0xff)) : _mm256_setzero_si256 ();
}

The first one generates
        movl    %edi, %edi
        vmovq   %rdi, %xmm1
        vpsllw  %xmm1, %ymm0, %ymm0
        ret
(because that is actually what the instruction does), the second one
        movzbl  %dil, %edi
        cmpl    $15, %edi
        jg      .L5
        vmovq   %rdi, %xmm1
        vpsllw  %xmm1, %ymm0, %ymm0
        ret
        .p2align 4,,7
        .p2align 3
.L5:
        vpxor   %xmm0, %xmm0, %xmm0
        ret

	Jakub
Allan Sandfeld Jensen April 24, 2017, 8:34 a.m. UTC | #6
On Monday 24 April 2017, Jakub Jelinek wrote:
> On Mon, Apr 24, 2017 at 10:02:40AM +0200, Allan Sandfeld Jensen wrote:
> > > That said, both the options I've mentioned above provide the same
> > > advantages and don't have the disadvantages of pessimizing normal code.
> > 
> > What pessimizing? This produce the same or better code for all legal
> > arguments. The only difference besides better generated code is that it
> > allows
> 
> No.  Have you really tried that?
> 
> > the intrinsics to be used incorrectly with non-literal arguments because
> > we lack the C-extension for constexp to prevent that.
> 
> Consider e.g. -O2 -mavx2 -mtune=intel:
> #include <x86intrin.h>
> 
> __m256i
> foo (__m256i x, int s)
> {
>   return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)x, s);
> }
> 
> __m256i
> bar (__m256i x, int s)
> {
>   return ((s & 0xff) < 16) ? (__m256i)((__v16hi)x << (s & 0xff)) :
> _mm256_setzero_si256 (); }
> 
> The first one generates
>         movl    %edi, %edi
>         vmovq   %rdi, %xmm1
>         vpsllw  %xmm1, %ymm0, %ymm0
>         ret
> (because that is actually what the instruction does), the second one
That is a different instruction. That is the vpsllw not vpsllwi

The intrinsics I changed is the immediate version, I didn't change the non-
immediate version. It is probably a bug if you can give non-immediate values 
to the immediate only intrinsic. At least both versions handle it, if in 
different ways, but these are illegal arguments.

`Allan
Allan Sandfeld Jensen April 24, 2017, 8:38 a.m. UTC | #7
On Monday 24 April 2017, Allan Sandfeld Jensen wrote:
> On Monday 24 April 2017, Jakub Jelinek wrote:
> > On Mon, Apr 24, 2017 at 10:02:40AM +0200, Allan Sandfeld Jensen wrote:
> > > > That said, both the options I've mentioned above provide the same
> > > > advantages and don't have the disadvantages of pessimizing normal
> > > > code.
> > > 
> > > What pessimizing? This produce the same or better code for all legal
> > > arguments. The only difference besides better generated code is that it
> > > allows
> > 
> > No.  Have you really tried that?
> > 
> > > the intrinsics to be used incorrectly with non-literal arguments
> > > because we lack the C-extension for constexp to prevent that.
> > 
> > Consider e.g. -O2 -mavx2 -mtune=intel:
> > #include <x86intrin.h>
> > 
> > __m256i
> > foo (__m256i x, int s)
> > {
> > 
> >   return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)x, s);
> > 
> > }
> > 
> > __m256i
> > bar (__m256i x, int s)
> > {
> > 
> >   return ((s & 0xff) < 16) ? (__m256i)((__v16hi)x << (s & 0xff)) :
> > _mm256_setzero_si256 (); }
> > 
> > The first one generates
> > 
> >         movl    %edi, %edi
> >         vmovq   %rdi, %xmm1
> >         vpsllw  %xmm1, %ymm0, %ymm0
> >         ret
> > 
> > (because that is actually what the instruction does), the second one
> 
> That is a different instruction. That is the vpsllw not vpsllwi
> 
> The intrinsics I changed is the immediate version, I didn't change the non-
> immediate version. It is probably a bug if you can give non-immediate
> values to the immediate only intrinsic. At least both versions handles it,
> if in different ways, but is is illegal arguments.
> 
Though now that I think about it, this means my change to the existing 
sse2-psllw-1.c test and friends is wrong, because it uses variable input.

`Allan
Jakub Jelinek April 24, 2017, 8:40 a.m. UTC | #8
On Mon, Apr 24, 2017 at 10:34:58AM +0200, Allan Sandfeld Jensen wrote:
> That is a different instruction. That is the vpsllw not vpsllwi
> 
> The intrinsics I changed is the immediate version, I didn't change the non-
> immediate version. It is probably a bug if you can give non-immediate values 
> to the immediate only intrinsic. At least both versions handles it, if in 
> different ways, but is is illegal arguments.

The documentation is unclear on that and I've only recently fixed up some
cases where these intrinsics weren't able to handle non-constant arguments
in some cases, while both ICC and clang coped with that fine.
So it is clearly allowed and handled by all the compilers and needs to be
supported, people use that in real-world code.

	Jakub
Jakub Jelinek April 24, 2017, 9:17 a.m. UTC | #9
On Mon, Apr 24, 2017 at 11:01:29AM +0200, Allan Sandfeld Jensen wrote:
> On Monday 24 April 2017, Jakub Jelinek wrote:
> > On Mon, Apr 24, 2017 at 10:34:58AM +0200, Allan Sandfeld Jensen wrote:
> > > That is a different instruction. That is the vpsllw not vpsllwi
> > > 
> > > The intrinsics I changed is the immediate version, I didn't change the
> > > non- immediate version. It is probably a bug if you can give
> > > non-immediate values to the immediate only intrinsic. At least both
> > > versions handles it, if in different ways, but is is illegal arguments.
> > 
> > The documentation is unclear on that and I've only recently fixed up some
> > cases where these intrinsics weren't able to handle non-constant arguments
> > in some cases, while both ICC and clang coped with that fine.
> > So it is clearly allowed and handled by all the compilers and needs to be
> > supported, people use that in real-world code.
> > 
Undoubtedly it happens. I just made a mistake myself that created that case. 
But it is rather unfortunate, and means we generate wrong code currently for 
corner case values.

The intrinsic documentation is poor, usually you have a good documentation
on what the instructions do, and then you just have to guess what the
intrinsics do.  You can of course ask Intel for clarification.

If you try:
#include <x86intrin.h>

__m128i
foo (__m128i a, int b)
{
  return _mm_slli_epi16 (a, b);
}
and call it with 257 from somewhere else, you can see that all the compilers
will give you zero vector.  And similarly if you use 257 literally instead
of b.  So what the intrinsic (unlike the instruction)
actually does is that it compares all bits of the imm8 argument (supposedly
using unsigned comparison) and if it is bigger than 15 (or 7 or 31 or 63
depending on the bitsize of element) it yields 0 vector.

	Jakub
Allan Sandfeld Jensen April 24, 2017, 9:36 a.m. UTC | #10
On Monday 24 April 2017, Jakub Jelinek wrote:
> On Mon, Apr 24, 2017 at 11:01:29AM +0200, Allan Sandfeld Jensen wrote:
> > On Monday 24 April 2017, Jakub Jelinek wrote:
> > > On Mon, Apr 24, 2017 at 10:34:58AM +0200, Allan Sandfeld Jensen wrote:
> > > > That is a different instruction. That is the vpsllw not vpsllwi
> > > > 
> > > > The intrinsics I changed is the immediate version, I didn't change
> > > > the non- immediate version. It is probably a bug if you can give
> > > > non-immediate values to the immediate only intrinsic. At least both
> > > > versions handles it, if in different ways, but is is illegal
> > > > arguments.
> > > 
> > > The documentation is unclear on that and I've only recently fixed up
> > > some cases where these intrinsics weren't able to handle non-constant
> > > arguments in some cases, while both ICC and clang coped with that
> > > fine.
> > > So it is clearly allowed and handled by all the compilers and needs to
> > > be supported, people use that in real-world code.
> > 
> > Undoubtedly it happens. I just make a mistake myself that created that
> > case. But it is rather unfortunate, and means we make wrong code
> > currently for corner case values.
> 
> The intrinsic documentation is poor, usually you have a good documentation
> on what the instructions do, and then you just have to guess what the
> intrinsics do.  You can of course ask Intel for clarification.
> 
> If you try:
> #include <x86intrin.h>
> 
> __m128i
> foo (__m128i a, int b)
> {
>   return _mm_slli_epi16 (a, b);
> }
> and call it with 257 from somewhere else, you can see that all the
> compilers will give you zero vector.  And similarly if you use 257
> literally instead of b.  So what the intrinsic (unlike the instruction)
> actually does is that it compares all bits of the imm8 argument (supposedly
> using unsigned comparison) and if it is bigger than 15 (or 7 or 31 or 63
> depending on the bitsize of element) it yields 0 vector.
> 
Good point. I was using Intel's documentation at 
https://software.intel.com/sites/landingpage/IntrinsicsGuide/, but if all 
compilers including us do something else, practicality wins.

It did make me curious and test out what _mm_slli_epi16(v, -250); compiles to. 
For some reason that becomes an undefined shift using the non-immediate sll in 
gcc, but returns the zero-vector in clang. With my patch it was a 6 bit shift, 
but that is apparently not de-facto standard.


`Allan
diff mbox

Patch

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b58f5050db0..b9406550fc5 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,10 @@ 
+2017-04-22  Allan Sandfeld Jensen  <sandfeld@kde.org>
+
+	* config/i386/emmintrin.h (_mm_slli_*, _mm_srli_*):
+	Use vector intrinsics instead of builtins.
+	* config/i386/avx2intrin.h (_mm256_slli_*, _mm256_srli_*):
+	Use vector intrinsics instead of builtins.
+
 2017-04-21  Uros Bizjak  <ubizjak@gmail.com>
 
 	* config/i386/i386.md (*extzvqi_mem_rex64): Move above *extzv<mode>.
diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h
index 82f170a3d61..acb49734131 100644
--- a/gcc/config/i386/avx2intrin.h
+++ b/gcc/config/i386/avx2intrin.h
@@ -667,7 +667,7 @@  extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_slli_epi16 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
+  return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) : _mm256_setzero_si256();
 }
 
 extern __inline __m256i
@@ -681,7 +681,7 @@  extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_slli_epi32 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
+  return ((__B & 0xff) < 32) ? (__m256i)((__v8si)__A << (__B & 0xff)) : _mm256_setzero_si256();
 }
 
 extern __inline __m256i
@@ -695,7 +695,7 @@  extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_slli_epi64 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
+  return ((__B & 0xff) < 64) ? (__m256i)((__v4di)__A << (__B & 0xff)) : _mm256_setzero_si256();
 }
 
 extern __inline __m256i
@@ -758,7 +758,7 @@  extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_srli_epi16 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
+  return ((__B & 0xff) < 16) ? (__m256i) ((__v16hu)__A >> (__B & 0xff)) : _mm256_setzero_si256();
 }
 
 extern __inline __m256i
@@ -772,7 +772,7 @@  extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_srli_epi32 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
+  return ((__B & 0xff) < 32) ? (__m256i) ((__v8su)__A >> (__B & 0xff)) : _mm256_setzero_si256();
 }
 
 extern __inline __m256i
@@ -786,7 +786,7 @@  extern __inline __m256i
 __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
 _mm256_srli_epi64 (__m256i __A, int __B)
 {
-  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
+  return ((__B & 0xff) < 64) ? (__m256i) ((__v4du)__A >> (__B & 0xff)) : _mm256_setzero_si256();
 }
 
 extern __inline __m256i
diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index 828f4a07a9b..5c048d9fd0d 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -1140,19 +1140,19 @@  _mm_mul_epu32 (__m128i __A, __m128i __B)
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
+  return ((__B & 0xff) < 16) ? (__m128i)((__v8hi)__A << (__B & 0xff)) : _mm_setzero_si128();
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
+  return ((__B & 0xff) < 32) ? (__m128i)((__v4si)__A << (__B & 0xff)) : _mm_setzero_si128();
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_slli_epi64 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
+  return ((__B & 0xff) < 64) ? (__m128i)((__v2di)__A << (__B & 0xff)) : _mm_setzero_si128();
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1205,19 +1205,19 @@  _mm_slli_si128 (__m128i __A, const int __N)
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi16 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
+  return ((__B & 0xff) < 16) ? (__m128i)((__v8hu)__A >> (__B & 0xff)) : _mm_setzero_si128();
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi32 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
+  return ((__B & 0xff) < 32) ? (__m128i)((__v4su)__A >> (__B & 0xff)) : _mm_setzero_si128();
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_srli_epi64 (__m128i __A, int __B)
 {
-  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
+  return ((__B & 0xff) < 64) ? (__m128i)((__v2du)__A >> (__B & 0xff)) : _mm_setzero_si128();
 }
 
 extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 6f4dc8d5095..ffface0f6b3 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,13 @@ 
+2017-04-22  Allan Sandfeld Jensen  <sandfeld@kde.org>
+
+	* gcc.target/i386/sse2-pslld-1.c: Expand test with more corner cases.
+	* gcc.target/i386/sse2-psllw-1.c: Expand test with more corner cases.
+	* gcc.target/i386/sse2-psrld-1.c: Expand test with more corner cases.
+	* gcc.target/i386/sse2-shifts-1.c: New testcases of shift intrinsics
+	producing intended instructions.
+	* gcc.target/i386/sse2-shifts-2.c: New testcase of shift intrinsics
+	being folded.
+
 2017-04-21  Janus Weil  <janus@gcc.gnu.org>
 
 	PR fortran/80392
diff --git a/gcc/testsuite/gcc.target/i386/sse2-pslld-1.c b/gcc/testsuite/gcc.target/i386/sse2-pslld-1.c
index 31474e3234f..b2eb938fc06 100644
--- a/gcc/testsuite/gcc.target/i386/sse2-pslld-1.c
+++ b/gcc/testsuite/gcc.target/i386/sse2-pslld-1.c
@@ -10,17 +10,15 @@ 
 #define TEST sse2_test
 #endif
 
-#define N 0xf
-
 #include CHECK_H
 
 #include <emmintrin.h>
 
 static __m128i
 __attribute__((noinline, unused))
-test (__m128i s1)
+test (__m128i s1, int n)
 {
-  return _mm_slli_epi32 (s1, N); 
+  return _mm_slli_epi32 (s1, n); 
 }
 
 static void
@@ -28,16 +26,25 @@  TEST (void)
 {
   union128i_d u, s;
   int e[4] = {0};
-  int i;
+  int ns[4] = {15, 65, 260, -250};
  
   s.x = _mm_set_epi32 (1, -2, 3, 4);
 
-  u.x = test (s.x);
-
-  if (N < 32)
-    for (i = 0; i < 4; i++)
-      e[i] = s.a[i] << N; 
-
-  if (check_union128i_d (u, e))
-    abort (); 
+  for (int j = 0; j < 4; j++) {
+    int n = ns[j];
+    u.x = test (s.x, n);
+    
+    n = n & 0xff;
+    if (n < 32) {
+      for (int i = 0; i < 4; i++)
+        e[i] = s.a[i] << n;
+    } else {
+      for (int i = 0; i < 4; i++)
+        e[i] = 0;
+    }
+
+
+    if (check_union128i_d (u, e))
+      abort ();
+  }
 }
diff --git a/gcc/testsuite/gcc.target/i386/sse2-psllw-1.c b/gcc/testsuite/gcc.target/i386/sse2-psllw-1.c
index 3153ec45529..6a740fce050 100644
--- a/gcc/testsuite/gcc.target/i386/sse2-psllw-1.c
+++ b/gcc/testsuite/gcc.target/i386/sse2-psllw-1.c
@@ -10,17 +10,15 @@ 
 #define TEST sse2_test
 #endif
 
-#define N 0xb
-
 #include CHECK_H
 
 #include <emmintrin.h>
 
 static __m128i
 __attribute__((noinline, unused))
-test (__m128i s1)
+test (__m128i s1, int n)
 {
-  return _mm_slli_epi16 (s1, N); 
+  return _mm_slli_epi16 (s1, n); 
 }
 
 static void
@@ -28,16 +26,25 @@  TEST (void)
 {
   union128i_w u, s;
   short e[8] = {0};
+  int ns[4] = {11, 16, 63, -250};
   int i;
  
   s.x = _mm_set_epi16 (1, 2, 3, 4, 5, 6, 0x7000, 0x9000);
 
-  u.x = test (s.x);
-
-  if (N < 16)
-    for (i = 0; i < 8; i++)
-      e[i] = s.a[i] << N; 
-
-  if (check_union128i_w (u, e))
-    abort (); 
+  for (int j = 0; j < 4; j++) {
+    int n = ns[j];
+    u.x = test (s.x, n);
+
+    n = n & 0xff;
+    if (n < 16) {
+      for (i = 0; i < 8; i++)
+        e[i] = s.a[i] << n;
+    } else {
+      for (int i = 0; i < 8; i++)
+        e[i] = 0;
+    }
+
+    if (check_union128i_w (u, e))
+      abort (); 
+  }
 }
diff --git a/gcc/testsuite/gcc.target/i386/sse2-psrld-1.c b/gcc/testsuite/gcc.target/i386/sse2-psrld-1.c
index d310fc45204..ec5f1c2d391 100644
--- a/gcc/testsuite/gcc.target/i386/sse2-psrld-1.c
+++ b/gcc/testsuite/gcc.target/i386/sse2-psrld-1.c
@@ -10,17 +10,15 @@ 
 #define TEST sse2_test
 #endif
 
-#define N 0xf
-
 #include CHECK_H
 
 #include <emmintrin.h>
 
 static __m128i
 __attribute__((noinline, unused))
-test (__m128i s1)
+test (__m128i s1, int n)
 {
-  return _mm_srli_epi32 (s1, N); 
+  return _mm_srli_epi32 (s1, n);
 }
 
 static void
@@ -28,19 +26,28 @@  TEST (void)
 {
   union128i_d u, s;
   int e[4] = {0};
+  int ns[4] = {15, 65, 260, -250};
   unsigned int tmp;
   int i;
  
   s.x = _mm_set_epi32 (1, -2, 3, 4);
 
-  u.x = test (s.x);
-
-  if (N < 32)
-    for (i = 0; i < 4; i++) {
-      tmp  = s.a[i];
-      e[i] = tmp >> N; 
+  for (int j = 0; j < 4; j++) {
+    int n = ns[j];
+    u.x = test (s.x, n);
+
+    n = n & 0xff;
+    if (n < 32) {
+      for (i = 0; i < 4; i++) {
+        tmp  = s.a[i];
+        e[i] = tmp >> n; 
+      }
+    } else {
+      for (int i = 0; i < 4; i++)
+        e[i] = 0;
     }
 
-  if (check_union128i_d (u, e))
-    abort (); 
+    if (check_union128i_d (u, e))
+      abort ();
+  }
 }
diff --git a/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c b/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c
new file mode 100644
index 00000000000..a2305cf042a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c
@@ -0,0 +1,54 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2 -mno-avx" } */
+/* { dg-require-effective-target sse2 } */
+
+#include <emmintrin.h>
+
+__m128i test1(__m128i a)
+{
+    return _mm_slli_epi16(a, 9);
+}
+
+__m128i test2(__m128i a)
+{
+    return _mm_slli_epi32(a, 13);
+}
+
+__m128i test3(__m128i a)
+{
+    return _mm_slli_epi64(a, 17);
+}
+
+__m128i test4(__m128i a)
+{
+    return _mm_srli_epi16(a, 9);
+}
+
+__m128i test5(__m128i a)
+{
+    return _mm_srli_epi32(a, 13);
+}
+
+__m128i test6(__m128i a)
+{
+    return _mm_srli_epi64(a, 7);
+}
+
+__m128i test7(__m128i a)
+{
+    return _mm_srai_epi16(a, 3);
+}
+
+__m128i test8(__m128i a)
+{
+    return _mm_srai_epi32(a, 6);
+}
+
+/* { dg-final { scan-assembler "psllw" } } */
+/* { dg-final { scan-assembler "pslld" } } */
+/* { dg-final { scan-assembler "psllq" } } */
+/* { dg-final { scan-assembler "psrlw" } } */
+/* { dg-final { scan-assembler "psrld" } } */
+/* { dg-final { scan-assembler "psrlq" } } */
+/* { dg-final { scan-assembler "psraw" } } */
+/* { dg-final { scan-assembler "psrad" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c b/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c
new file mode 100644
index 00000000000..ce05a7dc44e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c
@@ -0,0 +1,27 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse2" } */
+/* { dg-require-effective-target sse2 } */
+
+#include <emmintrin.h>
+
+__m128i test1(__m128i a)
+{
+    a = _mm_slli_epi16(a, 2);
+    return _mm_slli_epi16(a, 3);
+}
+/* { dg-final { scan-assembler "psllw.*5"} } */
+
+__m128i test3(__m128i a)
+{
+    a = _mm_srli_epi16(a, 4);
+    return _mm_srli_epi16(a, 9);
+}
+/* { dg-final { scan-assembler-times "psrlw" 1} } */
+
+__m128i test4(__m128i a)
+{
+    a = _mm_setr_epi32(128, 255, 86, 23);
+    return _mm_srli_epi32(a, 8);
+}
+/* { dg-final { scan-assembler-not "psrld"} } */
+