Message ID | 201704240933.09704.linux@carewolf.com |
---|---|
State | New |
Headers | show |
On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote: > --- a/gcc/config/i386/avx2intrin.h > +++ b/gcc/config/i386/avx2intrin.h > @@ -667,7 +667,7 @@ extern __inline __m256i > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > _mm256_slli_epi16 (__m256i __A, int __B) > { > - return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B); > + return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) : _mm256_setzero_si256(); > } What is the advantage of doing that when you replace one operation with several (&, <, ?:, <<)? I'd say instead we should fold the builtins if in the gimple fold target hook we see the shift count constant and can decide based on that. Or we could use __builtin_constant_p (__B) to decide whether to use the generic vector shifts or builtin, but that means larger IL. Jakub
On Monday 24 April 2017, Jakub Jelinek wrote: > On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote: > > --- a/gcc/config/i386/avx2intrin.h > > +++ b/gcc/config/i386/avx2intrin.h > > @@ -667,7 +667,7 @@ extern __inline __m256i > > > > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > > _mm256_slli_epi16 (__m256i __A, int __B) > > { > > > > - return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B); > > + return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) : > > _mm256_setzero_si256(); > > > > } > > What is the advantage of doing that when you replace one operation with > several (&, <, ?:, <<)? > I'd say instead we should fold the builtins if in the gimple fold target > hook we see the shift count constant and can decide based on that. > Or we could use __builtin_constant_p (__B) to decide whether to use > the generic vector shifts or builtin, but that means larger IL. The advantage is that in this builtin, the __B is always a literal (or constexpr), so the if statement is resolved at compile time. `Allan
On Mon, Apr 24, 2017 at 09:51:29AM +0200, Allan Sandfeld Jensen wrote: > On Monday 24 April 2017, Jakub Jelinek wrote: > > On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote: > > > --- a/gcc/config/i386/avx2intrin.h > > > +++ b/gcc/config/i386/avx2intrin.h > > > @@ -667,7 +667,7 @@ extern __inline __m256i > > > > > > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > > > _mm256_slli_epi16 (__m256i __A, int __B) > > > { > > > > > > - return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B); > > > + return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) : > > > _mm256_setzero_si256(); > > > > > > } > > > > What is the advantage of doing that when you replace one operation with > > several (&, <, ?:, <<)? > > I'd say instead we should fold the builtins if in the gimple fold target > > hook we see the shift count constant and can decide based on that. > > Or we could use __builtin_constant_p (__B) to decide whether to use > > the generic vector shifts or builtin, but that means larger IL. > > The advantage is that in this builtin, the __B is always a literal (or > constexpr), so the if statement is resolved at compile time. Do we really want to support all the thousands _mm* intrinsics in constexpr contexts? People can just use generic vectors instead. That said, both the options I've mentioned above provide the same advantages and don't have the disadvantages of pessimizing normal code. Jakub
On Monday 24 April 2017, Jakub Jelinek wrote: > On Mon, Apr 24, 2017 at 09:51:29AM +0200, Allan Sandfeld Jensen wrote: > > On Monday 24 April 2017, Jakub Jelinek wrote: > > > On Mon, Apr 24, 2017 at 09:33:09AM +0200, Allan Sandfeld Jensen wrote: > > > > --- a/gcc/config/i386/avx2intrin.h > > > > +++ b/gcc/config/i386/avx2intrin.h > > > > @@ -667,7 +667,7 @@ extern __inline __m256i > > > > > > > > __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) > > > > _mm256_slli_epi16 (__m256i __A, int __B) > > > > { > > > > > > > > - return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B); > > > > + return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & > > > > 0xff)) : _mm256_setzero_si256(); > > > > > > > > } > > > > > > What is the advantage of doing that when you replace one operation with > > > several (&, <, ?:, <<)? > > > I'd say instead we should fold the builtins if in the gimple fold > > > target hook we see the shift count constant and can decide based on > > > that. Or we could use __builtin_constant_p (__B) to decide whether to > > > use the generic vector shifts or builtin, but that means larger IL. > > > > The advantage is that in this builtin, the __B is always a literal (or > > constexpr), so the if statement is resolved at compile time. > > Do we really want to support all the thousands _mm* intrinsics in constexpr > contexts? People can just use generic vectors instead. > I would love to support it, but first we need a C extension attribute matching constexpr, and I consider it a separate issue. > That said, both the options I've mentioned above provide the same > advantages and don't have the disadvantages of pessimizing normal code. > What pessimizing? This produce the same or better code for all legal arguments. The only difference besides better generated code is that it allows the intrinsics to be used incorrectly with non-literal arguments because we lack the C-extension for constexp to prevent that. `Allan
On Mon, Apr 24, 2017 at 10:02:40AM +0200, Allan Sandfeld Jensen wrote: > > That said, both the options I've mentioned above provide the same > > advantages and don't have the disadvantages of pessimizing normal code. > > > What pessimizing? This produce the same or better code for all legal > arguments. The only difference besides better generated code is that it allows No. Have you really tried that? > the intrinsics to be used incorrectly with non-literal arguments because we > lack the C-extension for constexp to prevent that. Consider e.g. -O2 -mavx2 -mtune=intel: #include <x86intrin.h> __m256i foo (__m256i x, int s) { return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)x, s); } __m256i bar (__m256i x, int s) { return ((s & 0xff) < 16) ? (__m256i)((__v16hi)x << (s & 0xff)) : _mm256_setzero_si256 (); } The first one generates movl %edi, %edi vmovq %rdi, %xmm1 vpsllw %xmm1, %ymm0, %ymm0 ret (because that is actually what the instruction does), the second one movzbl %dil, %edi cmpl $15, %edi jg .L5 vmovq %rdi, %xmm1 vpsllw %xmm1, %ymm0, %ymm0 ret .p2align 4,,7 .p2align 3 .L5: vpxor %xmm0, %xmm0, %xmm0 ret Jakub
On Monday 24 April 2017, Jakub Jelinek wrote: > On Mon, Apr 24, 2017 at 10:02:40AM +0200, Allan Sandfeld Jensen wrote: > > > That said, both the options I've mentioned above provide the same > > > advantages and don't have the disadvantages of pessimizing normal code. > > > > What pessimizing? This produce the same or better code for all legal > > arguments. The only difference besides better generated code is that it > > allows > > No. Have you really tried that? > > > the intrinsics to be used incorrectly with non-literal arguments because > > we lack the C-extension for constexp to prevent that. > > Consider e.g. -O2 -mavx2 -mtune=intel: > #include <x86intrin.h> > > __m256i > foo (__m256i x, int s) > { > return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)x, s); > } > > __m256i > bar (__m256i x, int s) > { > return ((s & 0xff) < 16) ? (__m256i)((__v16hi)x << (s & 0xff)) : > _mm256_setzero_si256 (); } > > The first one generates > movl %edi, %edi > vmovq %rdi, %xmm1 > vpsllw %xmm1, %ymm0, %ymm0 > ret > (because that is actually what the instruction does), the second one That is a different instruction. That is the vpsllw not vpsllwi The intrinsics I changed is the immediate version, I didn't change the non- immediate version. It is probably a bug if you can give non-immediate values to the immediate only intrinsic. At least both versions handles it, if in different ways, but is is illegal arguments. `Allan
On Monday 24 April 2017, Allan Sandfeld Jensen wrote: > On Monday 24 April 2017, Jakub Jelinek wrote: > > On Mon, Apr 24, 2017 at 10:02:40AM +0200, Allan Sandfeld Jensen wrote: > > > > That said, both the options I've mentioned above provide the same > > > > advantages and don't have the disadvantages of pessimizing normal > > > > code. > > > > > > What pessimizing? This produce the same or better code for all legal > > > arguments. The only difference besides better generated code is that it > > > allows > > > > No. Have you really tried that? > > > > > the intrinsics to be used incorrectly with non-literal arguments > > > because we lack the C-extension for constexp to prevent that. > > > > Consider e.g. -O2 -mavx2 -mtune=intel: > > #include <x86intrin.h> > > > > __m256i > > foo (__m256i x, int s) > > { > > > > return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)x, s); > > > > } > > > > __m256i > > bar (__m256i x, int s) > > { > > > > return ((s & 0xff) < 16) ? (__m256i)((__v16hi)x << (s & 0xff)) : > > _mm256_setzero_si256 (); } > > > > The first one generates > > > > movl %edi, %edi > > vmovq %rdi, %xmm1 > > vpsllw %xmm1, %ymm0, %ymm0 > > ret > > > > (because that is actually what the instruction does), the second one > > That is a different instruction. That is the vpsllw not vpsllwi > > The intrinsics I changed is the immediate version, I didn't change the non- > immediate version. It is probably a bug if you can give non-immediate > values to the immediate only intrinsic. At least both versions handles it, > if in different ways, but is is illegal arguments. > Though I now that I think about it, this means my change of to the existing sse-psslw-1.c test and friends is wrong, because it uses variable input. `Allan
On Mon, Apr 24, 2017 at 10:34:58AM +0200, Allan Sandfeld Jensen wrote: > That is a different instruction. That is the vpsllw not vpsllwi > > The intrinsics I changed is the immediate version, I didn't change the non- > immediate version. It is probably a bug if you can give non-immediate values > to the immediate only intrinsic. At least both versions handles it, if in > different ways, but is is illegal arguments. The documentation is unclear on that and I've only recently fixed up some cases where these intrinsics weren't able to handle non-constant arguments in some cases, while both ICC and clang coped with that fine. So it is clearly allowed and handled by all the compilers and needs to be supported, people use that in real-world code. Jakub
On Mon, Apr 24, 2017 at 11:01:29AM +0200, Allan Sandfeld Jensen wrote: > On Monday 24 April 2017, Jakub Jelinek wrote: > > On Mon, Apr 24, 2017 at 10:34:58AM +0200, Allan Sandfeld Jensen wrote: > > > That is a different instruction. That is the vpsllw not vpsllwi > > > > > > The intrinsics I changed is the immediate version, I didn't change the > > > non- immediate version. It is probably a bug if you can give > > > non-immediate values to the immediate only intrinsic. At least both > > > versions handles it, if in different ways, but is is illegal arguments. > > > > The documentation is unclear on that and I've only recently fixed up some > > cases where these intrinsics weren't able to handle non-constant arguments > > in some cases, while both ICC and clang coped with that fine. > > So it is clearly allowed and handled by all the compilers and needs to be > > supported, people use that in real-world code. > > > Undoubtedly it happens. I just make a mistake myself that created that case. > But it is rather unfortunate, and means we make wrong code currently for > corner case values. The intrinsic documentation is poor, usually you have a good documentation on what the instructions do, and then you just have to guess what the intrinsics do. You can of course ask Intel for clarification. If you try: #include <x86intrin.h> __m128i foo (__m128i a, int b) { return _mm_slli_epi16 (a, b); } and call it with 257 from somewhere else, you can see that all the compilers will give you zero vector. And similarly if you use 257 literally instead of b. So what the intrinsic (unlike the instruction) actually does is that it compares all bits of the imm8 argument (supposedly using unsigned comparison) and if it is bigger than 15 (or 7 or 31 or 63 depending on the bitsize of element) it yields 0 vector. Jakub
On Monday 24 April 2017, Jakub Jelinek wrote: > On Mon, Apr 24, 2017 at 11:01:29AM +0200, Allan Sandfeld Jensen wrote: > > On Monday 24 April 2017, Jakub Jelinek wrote: > > > On Mon, Apr 24, 2017 at 10:34:58AM +0200, Allan Sandfeld Jensen wrote: > > > > That is a different instruction. That is the vpsllw not vpsllwi > > > > > > > > The intrinsics I changed is the immediate version, I didn't change > > > > the non- immediate version. It is probably a bug if you can give > > > > non-immediate values to the immediate only intrinsic. At least both > > > > versions handles it, if in different ways, but is is illegal > > > > arguments. > > > > > > The documentation is unclear on that and I've only recently fixed up > > > some cases where these intrinsics weren't able to handle non-constant > > > arguments in some cases, while both ICC and clang coped with that > > > fine. > > > So it is clearly allowed and handled by all the compilers and needs to > > > be supported, people use that in real-world code. > > > > Undoubtedly it happens. I just make a mistake myself that created that > > case. But it is rather unfortunate, and means we make wrong code > > currently for corner case values. > > The intrinsic documentation is poor, usually you have a good documentation > on what the instructions do, and then you just have to guess what the > intrinsics do. You can of course ask Intel for clarification. > > If you try: > #include <x86intrin.h> > > __m128i > foo (__m128i a, int b) > { > return _mm_slli_epi16 (a, b); > } > and call it with 257 from somewhere else, you can see that all the > compilers will give you zero vector. And similarly if you use 257 > literally instead of b. So what the intrinsic (unlike the instruction) > actually does is that it compares all bits of the imm8 argument (supposedly > using unsigned comparison) and if it is bigger than 15 (or 7 or 31 or 63 > depending on the bitsize of element) it yields 0 vector. > Good point. 
I was using Intel's documentation at https://software.intel.com/sites/landingpage/IntrinsicsGuide/, but if all compilers, including us, do something else, practicality wins. It did make me curious, though, so I tested what _mm_slli_epi16(v, -250); compiles to. For some reason that becomes an undefined shift using the non-immediate sll in gcc, but returns the zero-vector in clang. With my patch it was a 6-bit shift, but that is apparently not the de-facto standard. `Allan
diff --git a/gcc/ChangeLog b/gcc/ChangeLog index b58f5050db0..b9406550fc5 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,10 @@ +2017-04-22 Allan Sandfeld Jensen <sandfeld@kde.org> + + * config/i386/emmintrin.h (_mm_slli_*, _mm_srli_*): + Use vector intrinstics instead of builtins. + * config/i386/avx2intrin.h (_mm256_slli_*, _mm256_srli_*): + Use vector intrinstics instead of builtins. + 2017-04-21 Uros Bizjak <ubizjak@gmail.com> * config/i386/i386.md (*extzvqi_mem_rex64): Move above *extzv<mode>. diff --git a/gcc/config/i386/avx2intrin.h b/gcc/config/i386/avx2intrin.h index 82f170a3d61..acb49734131 100644 --- a/gcc/config/i386/avx2intrin.h +++ b/gcc/config/i386/avx2intrin.h @@ -667,7 +667,7 @@ extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_slli_epi16 (__m256i __A, int __B) { - return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B); + return ((__B & 0xff) < 16) ? (__m256i)((__v16hi)__A << (__B & 0xff)) : _mm256_setzero_si256(); } extern __inline __m256i @@ -681,7 +681,7 @@ extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_slli_epi32 (__m256i __A, int __B) { - return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B); + return ((__B & 0xff) < 32) ? (__m256i)((__v8si)__A << (__B & 0xff)) : _mm256_setzero_si256(); } extern __inline __m256i @@ -695,7 +695,7 @@ extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_slli_epi64 (__m256i __A, int __B) { - return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B); + return ((__B & 0xff) < 64) ? (__m256i)((__v4di)__A << (__B & 0xff)) : _mm256_setzero_si256(); } extern __inline __m256i @@ -758,7 +758,7 @@ extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_srli_epi16 (__m256i __A, int __B) { - return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B); + return ((__B & 0xff) < 16) ? 
(__m256i) ((__v16hu)__A >> (__B & 0xff)) : _mm256_setzero_si256(); } extern __inline __m256i @@ -772,7 +772,7 @@ extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_srli_epi32 (__m256i __A, int __B) { - return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B); + return ((__B & 0xff) < 32) ? (__m256i) ((__v8su)__A >> (__B & 0xff)) : _mm256_setzero_si256(); } extern __inline __m256i @@ -786,7 +786,7 @@ extern __inline __m256i __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) _mm256_srli_epi64 (__m256i __A, int __B) { - return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B); + return ((__B & 0xff) < 64) ? (__m256i) ((__v4du)__A >> (__B & 0xff)) : _mm256_setzero_si256(); } extern __inline __m256i diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h index 828f4a07a9b..5c048d9fd0d 100644 --- a/gcc/config/i386/emmintrin.h +++ b/gcc/config/i386/emmintrin.h @@ -1140,19 +1140,19 @@ _mm_mul_epu32 (__m128i __A, __m128i __B) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi16 (__m128i __A, int __B) { - return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B); + return ((__B & 0xff) < 16) ? (__m128i)((__v8hi)__A << (__B & 0xff)) : _mm_setzero_si128(); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi32 (__m128i __A, int __B) { - return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B); + return ((__B & 0xff) < 32) ? (__m128i)((__v4si)__A << (__B & 0xff)) : _mm_setzero_si128(); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_slli_epi64 (__m128i __A, int __B) { - return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B); + return ((__B & 0xff) < 64) ? 
(__m128i)((__v2di)__A << (__B & 0xff)) : _mm_setzero_si128(); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) @@ -1205,19 +1205,19 @@ _mm_slli_si128 (__m128i __A, const int __N) extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srli_epi16 (__m128i __A, int __B) { - return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B); + return ((__B & 0xff) < 16) ? (__m128i)((__v8hu)__A >> (__B & 0xff)) : _mm_setzero_si128(); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srli_epi32 (__m128i __A, int __B) { - return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B); + return ((__B & 0xff) < 32) ? (__m128i)((__v4su)__A >> (__B & 0xff)) : _mm_setzero_si128(); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_srli_epi64 (__m128i __A, int __B) { - return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B); + return ((__B & 0xff) < 64) ? (__m128i)((__v2du)__A >> (__B & 0xff)) : _mm_setzero_si128(); } extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 6f4dc8d5095..ffface0f6b3 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,13 @@ +2017-04-22 Allan Sandfeld Jensen <sandfeld@kde.org> + + * gcc.target/i386/sse2-pslld-1.c: Expand test with more corner cases. + * gcc.target/i386/sse2-psllw-1.c: Expand test with more corner cases. + * gcc.target/i386/sse2-pslrd-1.c: Expand test with more corner cases. + * gcc.target/i386/sse2-shifts-1.c: New testcases of shift intrinsics + producing intended instructions. + * gcc.target/i386/sse2-shifts-2.c: New testcasse of shift intrinsics + being folded. 
+ 2017-04-21 Janus Weil <janus@gcc.gnu.org> PR fortran/80392 diff --git a/gcc/testsuite/gcc.target/i386/sse2-pslld-1.c b/gcc/testsuite/gcc.target/i386/sse2-pslld-1.c index 31474e3234f..b2eb938fc06 100644 --- a/gcc/testsuite/gcc.target/i386/sse2-pslld-1.c +++ b/gcc/testsuite/gcc.target/i386/sse2-pslld-1.c @@ -10,17 +10,15 @@ #define TEST sse2_test #endif -#define N 0xf - #include CHECK_H #include <emmintrin.h> static __m128i __attribute__((noinline, unused)) -test (__m128i s1) +test (__m128i s1, int n) { - return _mm_slli_epi32 (s1, N); + return _mm_slli_epi32 (s1, n); } static void @@ -28,16 +26,25 @@ TEST (void) { union128i_d u, s; int e[4] = {0}; - int i; + int ns[4] = {15, 65, 260, -250}; s.x = _mm_set_epi32 (1, -2, 3, 4); - u.x = test (s.x); - - if (N < 32) - for (i = 0; i < 4; i++) - e[i] = s.a[i] << N; - - if (check_union128i_d (u, e)) - abort (); + for (int j = 0; j < 4; j++) { + int n = ns[j]; + u.x = test (s.x, n); + + n = n & 0xff; + if (n < 32) { + for (int i = 0; i < 4; i++) + e[i] = s.a[i] << n; + } else { + for (int i = 0; i < 4; i++) + e[i] = 0; + } + + + if (check_union128i_d (u, e)) + abort (); + } } diff --git a/gcc/testsuite/gcc.target/i386/sse2-psllw-1.c b/gcc/testsuite/gcc.target/i386/sse2-psllw-1.c index 3153ec45529..6a740fce050 100644 --- a/gcc/testsuite/gcc.target/i386/sse2-psllw-1.c +++ b/gcc/testsuite/gcc.target/i386/sse2-psllw-1.c @@ -10,17 +10,15 @@ #define TEST sse2_test #endif -#define N 0xb - #include CHECK_H #include <emmintrin.h> static __m128i __attribute__((noinline, unused)) -test (__m128i s1) +test (__m128i s1, int n) { - return _mm_slli_epi16 (s1, N); + return _mm_slli_epi16 (s1, n); } static void @@ -28,16 +26,25 @@ TEST (void) { union128i_w u, s; short e[8] = {0}; + int ns[4] = {11, 16, 63, -250}; int i; s.x = _mm_set_epi16 (1, 2, 3, 4, 5, 6, 0x7000, 0x9000); - u.x = test (s.x); - - if (N < 16) - for (i = 0; i < 8; i++) - e[i] = s.a[i] << N; - - if (check_union128i_w (u, e)) - abort (); + for (int j = 0; j < 4; j++) { + int n 
= ns[j]; + u.x = test (s.x, n); + + n = n & 0xff; + if (n < 16) { + for (i = 0; i < 8; i++) + e[i] = s.a[i] << n; + } else { + for (int i = 0; i < 8; i++) + e[i] = 0; + } + + if (check_union128i_w (u, e)) + abort (); + } } diff --git a/gcc/testsuite/gcc.target/i386/sse2-psrld-1.c b/gcc/testsuite/gcc.target/i386/sse2-psrld-1.c index d310fc45204..ec5f1c2d391 100644 --- a/gcc/testsuite/gcc.target/i386/sse2-psrld-1.c +++ b/gcc/testsuite/gcc.target/i386/sse2-psrld-1.c @@ -10,17 +10,15 @@ #define TEST sse2_test #endif -#define N 0xf - #include CHECK_H #include <emmintrin.h> static __m128i __attribute__((noinline, unused)) -test (__m128i s1) +test (__m128i s1, int n) { - return _mm_srli_epi32 (s1, N); + return _mm_srli_epi32 (s1, n); } static void @@ -28,19 +26,28 @@ TEST (void) { union128i_d u, s; int e[4] = {0}; + int ns[4] = {15, 65, 260, -250}; unsigned int tmp; int i; s.x = _mm_set_epi32 (1, -2, 3, 4); - u.x = test (s.x); - - if (N < 32) - for (i = 0; i < 4; i++) { - tmp = s.a[i]; - e[i] = tmp >> N; + for (int j = 0; j < 4; j++) { + int n = ns[j]; + u.x = test (s.x, n); + + n = n & 0xff; + if (n < 32) { + for (i = 0; i < 4; i++) { + tmp = s.a[i]; + e[i] = tmp >> n; + } + } else { + for (int i = 0; i < 4; i++) + e[i] = 0; } - if (check_union128i_d (u, e)) - abort (); + if (check_union128i_d (u, e)) + abort (); + } } diff --git a/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c b/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c new file mode 100644 index 00000000000..a2305cf042a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-shifts-1.c @@ -0,0 +1,54 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2 -mno-avx" } */ +/* { dg-require-effective-target sse2 } */ + +#include <emmintrin.h> + +__m128i test1(__m128i a) +{ + return _mm_slli_epi16(a, 9); +} + +__m128i test2(__m128i a) +{ + return _mm_slli_epi32(a, 13); +} + +__m128i test3(__m128i a) +{ + return _mm_slli_epi64(a, 17); +} + +__m128i test4(__m128i a) +{ + return _mm_srli_epi16(a, 9); +} + +__m128i 
test5(__m128i a) +{ + return _mm_srli_epi32(a, 13); +} + +__m128i test6(__m128i a) +{ + return _mm_srli_epi64(a, 7); +} + +__m128i test7(__m128i a) +{ + return _mm_srai_epi16(a, 3); +} + +__m128i test8(__m128i a) +{ + return _mm_srai_epi32(a, 6); +} + +/* { dg-final { scan-assembler "psllw" } } */ +/* { dg-final { scan-assembler "pslld" } } */ +/* { dg-final { scan-assembler "psllq" } } */ +/* { dg-final { scan-assembler "psrlw" } } */ +/* { dg-final { scan-assembler "psrld" } } */ +/* { dg-final { scan-assembler "psrlq" } } */ +/* { dg-final { scan-assembler "psraw" } } */ +/* { dg-final { scan-assembler "psrad" } } */ diff --git a/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c b/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c new file mode 100644 index 00000000000..ce05a7dc44e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-shifts-2.c @@ -0,0 +1,27 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-require-effective-target sse2 } */ + +#include <emmintrin.h> + +__m128i test1(__m128i a) +{ + a = _mm_slli_epi16(a, 2); + return _mm_slli_epi16(a, 3); +} +/* { dg-final { scan-assembler "psllw.*5"} } */ + +__m128i test3(__m128i a) +{ + a = _mm_srli_epi16(a, 4); + return _mm_srli_epi16(a, 9); +} +/* { dg-final { scan-assembler-times "psrlw" 1} } */ + +__m128i test4(__m128i a) +{ + a = _mm_setr_epi32(128, 255, 86, 23); + return _mm_srli_epi32(a, 8); +} +/* { dg-final { scan-assembler-not "psrld"} } */ +