diff mbox series

Add _mm256_{load,store}u2_m128{,d,i} intrinsics (PR target/91341)

Message ID 20190805073050.GP2726@tucnak
State New
Headers show
Series Add _mm256_{load,store}u2_m128{,d,i} intrinsics (PR target/91341) | expand

Commit Message

Jakub Jelinek Aug. 5, 2019, 7:30 a.m. UTC
Hi!

The following patch adds a couple of intrinsics that both ICC and clang
have, but GCC doesn't.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

They emit optimal code except for the last one, _mm256_storeu2_m128i,
where we emit
	vmovups	%xmm0, (%rsi)
	vextractf128	$0x1, %ymm0, %xmm0
	vmovups	%xmm0, (%rdi)
instead of
	vmovups	%xmm0, (%rsi)
	vextractf128	$0x1, %ymm0, (%rdi)
That is because for _mm256_extractf128_si256 is implemented as V8SImode
pattern, but _m128i is V2DImode, and we don't have a pattern like:
        (set (match_operand:V2DI 0 ("nonimmediate_operand") ("=xm, vm"))
	  (subreg:V2DI
            (vec_select:V4SI (match_operand:V8SI 1 ("register_operand") ("x, v"))
                (parallel [
                        (const_int 4 [0x4])
                        (const_int 5 [0x5])
                        (const_int 6 [0x6])
                        (const_int 7 [0x7])
                    ])) 0))
Shall we add that (and just for this mode combination, or using iterators
for others)?  Unfortunately the builtin that would use V2DI in the
vec_select instead of V4SI is AVX2 and so can't be used in this case.

2019-08-05  Jakub Jelinek  <jakub@redhat.com>

	PR target/91341
	* config/i386/avxintrin.h (_mm256_loadu2_m128, _mm256_storeu2_m128,
	_mm256_loadu2_m128d, _mm256_storeu2_m128d, _mm256_loadu2_m128i,
	_mm256_storeu2_m128i): New function.

	* gcc.target/i386/avx-loadu2-m128-1.c: New test.
	* gcc.target/i386/avx-loadu2-m128-2.c: New test.
	* gcc.target/i386/avx-loadu2-m128d-1.c: New test.
	* gcc.target/i386/avx-loadu2-m128d-2.c: New test.
	* gcc.target/i386/avx-loadu2-m128i-1.c: New test.
	* gcc.target/i386/avx-loadu2-m128i-2.c: New test.
	* gcc.target/i386/avx-storeu2-m128-1.c: New test.
	* gcc.target/i386/avx-storeu2-m128-2.c: New test.
	* gcc.target/i386/avx-storeu2-m128d-1.c: New test.
	* gcc.target/i386/avx-storeu2-m128d-2.c: New test.
	* gcc.target/i386/avx-storeu2-m128i-1.c: New test.
	* gcc.target/i386/avx-storeu2-m128i-2.c: New test.


	Jakub

Comments

Uros Bizjak Aug. 5, 2019, 7:40 a.m. UTC | #1
On Mon, Aug 5, 2019 at 9:30 AM Jakub Jelinek <jakub@redhat.com> wrote:
>
> Hi!
>
> The following patch adds a couple of intrinsics that both ICC and clang
> have, but GCC doesn't.
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
>
> They emit optimal code except for the last one, _mm256_storeu2_m128i,
> where we emit
>         vmovups %xmm0, (%rsi)
>         vextractf128    $0x1, %ymm0, %xmm0
>         vmovups %xmm0, (%rdi)
> instead of
>         vmovups %xmm0, (%rsi)
>         vextractf128    $0x1, %ymm0, (%rdi)
> That is because for _mm256_extractf128_si256 is implemented as V8SImode
> pattern, but _m128i is V2DImode, and we don't have a pattern like:
>         (set (match_operand:V2DI 0 ("nonimmediate_operand") ("=xm, vm"))
>           (subreg:V2DI
>             (vec_select:V4SI (match_operand:V8SI 1 ("register_operand") ("x, v"))
>                 (parallel [
>                         (const_int 4 [0x4])
>                         (const_int 5 [0x5])
>                         (const_int 6 [0x6])
>                         (const_int 7 [0x7])
>                     ])) 0))
> Shall we add that (and just for this mode combination, or using iterators
> for others)?  Unfortunately the builtin that would use V2DI in the
> vec_select instead of V4SI is AVX2 and so can't be used in this case.

Let's leave this for now. We already have similar cases of subreg
mismatches (not only with xmm regs) that result in unmerged memory
operands, and they are fairly benign.

> 2019-08-05  Jakub Jelinek  <jakub@redhat.com>
>
>         PR target/91341
>         * config/i386/avxintrin.h (_mm256_loadu2_m128, _mm256_storeu2_m128,
>         _mm256_loadu2_m128d, _mm256_storeu2_m128d, _mm256_loadu2_m128i,
>         _mm256_storeu2_m128i): New function.
>
>         * gcc.target/i386/avx-loadu2-m128-1.c: New test.
>         * gcc.target/i386/avx-loadu2-m128-2.c: New test.
>         * gcc.target/i386/avx-loadu2-m128d-1.c: New test.
>         * gcc.target/i386/avx-loadu2-m128d-2.c: New test.
>         * gcc.target/i386/avx-loadu2-m128i-1.c: New test.
>         * gcc.target/i386/avx-loadu2-m128i-2.c: New test.
>         * gcc.target/i386/avx-storeu2-m128-1.c: New test.
>         * gcc.target/i386/avx-storeu2-m128-2.c: New test.
>         * gcc.target/i386/avx-storeu2-m128d-1.c: New test.
>         * gcc.target/i386/avx-storeu2-m128d-2.c: New test.
>         * gcc.target/i386/avx-storeu2-m128i-1.c: New test.
>         * gcc.target/i386/avx-storeu2-m128i-2.c: New test.

OK.

Thanks,
Uros.

> --- gcc/config/i386/avxintrin.h.jj      2019-01-01 12:37:32.417724576 +0100
> +++ gcc/config/i386/avxintrin.h 2019-08-04 16:39:10.091659072 +0200
> @@ -1520,6 +1520,48 @@ _mm256_setr_m128i (__m128i __L, __m128i
>    return _mm256_set_m128i (__H, __L);
>  }
>
> +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_loadu2_m128 (float const *__PH, float const *__PL)
> +{
> +  return _mm256_insertf128_ps (_mm256_castps128_ps256 (_mm_loadu_ps (__PL)),
> +                              _mm_loadu_ps (__PH), 1);
> +}
> +
> +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_storeu2_m128 (float *__PH, float *__PL, __m256 __A)
> +{
> +  _mm_storeu_ps (__PL, _mm256_castps256_ps128 (__A));
> +  _mm_storeu_ps (__PH, _mm256_extractf128_ps (__A, 1));
> +}
> +
> +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_loadu2_m128d (double const *__PH, double const *__PL)
> +{
> +  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (_mm_loadu_pd (__PL)),
> +                              _mm_loadu_pd (__PH), 1);
> +}
> +
> +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_storeu2_m128d (double *__PH, double *__PL, __m256d __A)
> +{
> +  _mm_storeu_pd (__PL, _mm256_castpd256_pd128 (__A));
> +  _mm_storeu_pd (__PH, _mm256_extractf128_pd (__A, 1));
> +}
> +
> +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_loadu2_m128i (__m128i_u const *__PH, __m128i_u const *__PL)
> +{
> +  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (_mm_loadu_si128 (__PL)),
> +                                 _mm_loadu_si128 (__PH), 1);
> +}
> +
> +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm256_storeu2_m128i (__m128i_u *__PH, __m128i_u *__PL, __m256i __A)
> +{
> +  _mm_storeu_si128 (__PL, _mm256_castsi256_si128 (__A));
> +  _mm_storeu_si128 (__PH, _mm256_extractf128_si256 (__A, 1));
> +}
> +
>  #ifdef __DISABLE_AVX__
>  #undef __DISABLE_AVX__
>  #pragma GCC pop_options
> --- gcc/testsuite/gcc.target/i386/avx-loadu2-m128-1.c.jj        2019-08-04 16:52:17.205753124 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128-1.c   2019-08-04 16:50:01.315810000 +0200
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler "\tvmovups\t" } } */
> +/* { dg-final { scan-assembler "\tvinsertf128\t" } } */
> +
> +#include <immintrin.h>
> +
> +__m256
> +foo (float const *hi, float const *lo)
> +{
> +  return _mm256_loadu2_m128 (hi, lo);
> +}
> --- gcc/testsuite/gcc.target/i386/avx-loadu2-m128-2.c.jj        2019-08-04 16:52:20.358705400 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128-2.c   2019-08-04 16:59:50.002899417 +0200
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +static void
> +avx_test (void)
> +{
> +  union256 u;
> +  float e[8] = { 1.5f, -9.5f, 13.25f, -24.75f, -18.75f, 12.0f, 0.0f, 9.0f };
> +  float f[8] = { -24.75f, -18.75f, 12.0f, 0.0f, -9.5f, 13.25f, -24.75f, -18.75f };
> +
> +  u.x = _mm256_loadu2_m128 (e + 1, e + 3);
> +  if (check_union256 (u, f))
> +    abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-1.c.jj       2019-08-04 16:52:17.205753124 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-1.c  2019-08-04 17:03:13.548818465 +0200
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler "\tvmovupd\t" } } */
> +/* { dg-final { scan-assembler "\tvinsertf128\t" } } */
> +
> +#include <immintrin.h>
> +
> +__m256d
> +foo (double const *hi, double const *lo)
> +{
> +  return _mm256_loadu2_m128d (hi, lo);
> +}
> --- gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-2.c.jj       2019-08-04 16:52:20.358705400 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-2.c  2019-08-04 17:05:00.342201999 +0200
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +static void
> +avx_test (void)
> +{
> +  union256d u;
> +  double e[8] = { 1.5, -9.5, 13.25, -24.75, -18.75, 12.0, 0.0, 9.0 };
> +  double f[4] = { 12.0, 0.0, -9.5, 13.25 };
> +
> +  u.x = _mm256_loadu2_m128d (e + 1, e + 5);
> +  if (check_union256d (u, f))
> +    abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-1.c.jj       2019-08-04 16:52:17.205753124 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-1.c  2019-08-04 17:06:44.386628690 +0200
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler "\tvmovdqu\t" } } */
> +/* { dg-final { scan-assembler "\tvinsert\[fi]128\t" } } */
> +
> +#include <immintrin.h>
> +
> +__m256i
> +foo (__m128i_u const *hi, __m128i_u const *lo)
> +{
> +  return _mm256_loadu2_m128i (hi, lo);
> +}
> --- gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-2.c.jj       2019-08-04 16:52:20.358705400 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-2.c  2019-08-04 17:11:04.864691481 +0200
> @@ -0,0 +1,17 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +static void
> +avx_test (void)
> +{
> +  union256i_d u;
> +  int e[8] = { 1, -9, 13, -24, -18, 12, 0, 9 };
> +  int f[8] = { -24, -18, 12, 0, -9, 13, -24, -18 };
> +
> +  u.x = _mm256_loadu2_m128i ((__m128i_u *) (e + 1), (__m128i_u *) (e + 3));
> +  if (check_union256i_d (u, f))
> +    abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx-storeu2-m128-1.c.jj       2019-08-04 17:13:27.124541181 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128-1.c  2019-08-04 17:15:14.546917455 +0200
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler "\tvmovups\t" } } */
> +/* { dg-final { scan-assembler "\tvextractf128\t" } } */
> +
> +#include <immintrin.h>
> +
> +void
> +foo (float *hi, float *lo, __m256 a)
> +{
> +  _mm256_storeu2_m128 (hi, lo, a);
> +}
> --- gcc/testsuite/gcc.target/i386/avx-storeu2-m128-2.c.jj       2019-08-04 17:13:30.135495667 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128-2.c  2019-08-04 17:19:36.590956577 +0200
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +static void
> +avx_test (void)
> +{
> +  float e[12] = { -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f };
> +  float f[12] = { -1.0f, -18.75f, 12.0f, 0.0f, 9.0f, -1.0f, 1.5f, -9.5f, 13.25f, -24.75f, -1.0f, -1.0f };
> +  int i;
> +  __m256 x = _mm256_set_ps (1.5f, -9.5f, 13.25f, -24.75f, -18.75f, 12.0f, 0.0f, 9.0f);
> +  _mm256_storeu2_m128 (e + 1, e + 6, x);
> +  for (i = 0; i < 12; i++)
> +    if (e[i] != f[i])
> +      abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-1.c.jj      2019-08-04 17:13:27.124541181 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-1.c 2019-08-04 17:34:55.951056592 +0200
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler "\tvmovup\[sd]\t" } } */
> +/* { dg-final { scan-assembler "\tvextractf128\t" } } */
> +
> +#include <immintrin.h>
> +
> +void
> +foo (double *hi, double *lo, __m256d a)
> +{
> +  _mm256_storeu2_m128d (hi, lo, a);
> +}
> --- gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-2.c.jj      2019-08-04 17:13:30.135495667 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-2.c 2019-08-04 17:35:17.505730678 +0200
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +static void
> +avx_test (void)
> +{
> +  double e[8] = { -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0 };
> +  double f[8] = { -1.0, 13.25, -24.75, -1.0, 1.5, -9.5, -1.0, -1.0 };
> +  int i;
> +  __m256d x = _mm256_set_pd (1.5, -9.5, 13.25, -24.75);
> +  _mm256_storeu2_m128d (e + 1, e + 4, x);
> +  for (i = 0; i < 8; i++)
> +    if (e[i] != f[i])
> +      abort ();
> +}
> --- gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-1.c.jj      2019-08-04 17:13:27.124541181 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-1.c 2019-08-04 17:42:55.207811439 +0200
> @@ -0,0 +1,12 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-final { scan-assembler "\tvmov(dqu|ups)\t" } } */
> +/* { dg-final { scan-assembler "\tvextract\[if]128\t" } } */
> +
> +#include <immintrin.h>
> +
> +void
> +foo (__m128i_u *hi, __m128i_u *lo, __m256i a)
> +{
> +  _mm256_storeu2_m128i (hi, lo, a);
> +}
> --- gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-2.c.jj      2019-08-04 17:13:30.135495667 +0200
> +++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-2.c 2019-08-04 17:43:30.488278278 +0200
> @@ -0,0 +1,18 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mavx" } */
> +/* { dg-require-effective-target avx } */
> +
> +#include "avx-check.h"
> +
> +static void
> +avx_test (void)
> +{
> +  int e[12] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
> +  int f[12] = { -1, -18, 12, 0, 9, -1, 1, -9, 13, -24, -1, -1 };
> +  int i;
> +  __m256i x = _mm256_set_epi32 (1, -9, 13, -24, -18, 12, 0, 9);
> +  _mm256_storeu2_m128i ((__m128i_u *) (e + 1), (__m128i_u *) (e + 6), x);
> +  for (i = 0; i < 12; i++)
> +    if (e[i] != f[i])
> +      abort ();
> +}
>
>         Jakub
diff mbox series

Patch

--- gcc/config/i386/avxintrin.h.jj	2019-01-01 12:37:32.417724576 +0100
+++ gcc/config/i386/avxintrin.h	2019-08-04 16:39:10.091659072 +0200
@@ -1520,6 +1520,48 @@  _mm256_setr_m128i (__m128i __L, __m128i
   return _mm256_set_m128i (__H, __L);
 }
 
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu2_m128 (float const *__PH, float const *__PL)
+{
+  return _mm256_insertf128_ps (_mm256_castps128_ps256 (_mm_loadu_ps (__PL)),
+			       _mm_loadu_ps (__PH), 1);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu2_m128 (float *__PH, float *__PL, __m256 __A)
+{
+  _mm_storeu_ps (__PL, _mm256_castps256_ps128 (__A));
+  _mm_storeu_ps (__PH, _mm256_extractf128_ps (__A, 1));
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu2_m128d (double const *__PH, double const *__PL)
+{
+  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (_mm_loadu_pd (__PL)),
+			       _mm_loadu_pd (__PH), 1);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu2_m128d (double *__PH, double *__PL, __m256d __A)
+{
+  _mm_storeu_pd (__PL, _mm256_castpd256_pd128 (__A));
+  _mm_storeu_pd (__PH, _mm256_extractf128_pd (__A, 1));
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu2_m128i (__m128i_u const *__PH, __m128i_u const *__PL)
+{
+  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (_mm_loadu_si128 (__PL)),
+				  _mm_loadu_si128 (__PH), 1);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu2_m128i (__m128i_u *__PH, __m128i_u *__PL, __m256i __A)
+{
+  _mm_storeu_si128 (__PL, _mm256_castsi256_si128 (__A));
+  _mm_storeu_si128 (__PH, _mm256_extractf128_si256 (__A, 1));
+}
+
 #ifdef __DISABLE_AVX__
 #undef __DISABLE_AVX__
 #pragma GCC pop_options
--- gcc/testsuite/gcc.target/i386/avx-loadu2-m128-1.c.jj	2019-08-04 16:52:17.205753124 +0200
+++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128-1.c	2019-08-04 16:50:01.315810000 +0200
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler "\tvmovups\t" } } */
+/* { dg-final { scan-assembler "\tvinsertf128\t" } } */
+
+#include <immintrin.h>
+
+__m256
+foo (float const *hi, float const *lo)
+{
+  return _mm256_loadu2_m128 (hi, lo);
+}
--- gcc/testsuite/gcc.target/i386/avx-loadu2-m128-2.c.jj	2019-08-04 16:52:20.358705400 +0200
+++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128-2.c	2019-08-04 16:59:50.002899417 +0200
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+static void
+avx_test (void)
+{
+  union256 u;
+  float e[8] = { 1.5f, -9.5f, 13.25f, -24.75f, -18.75f, 12.0f, 0.0f, 9.0f };
+  float f[8] = { -24.75f, -18.75f, 12.0f, 0.0f, -9.5f, 13.25f, -24.75f, -18.75f };
+
+  u.x = _mm256_loadu2_m128 (e + 1, e + 3);
+  if (check_union256 (u, f))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-1.c.jj	2019-08-04 16:52:17.205753124 +0200
+++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-1.c	2019-08-04 17:03:13.548818465 +0200
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler "\tvmovupd\t" } } */
+/* { dg-final { scan-assembler "\tvinsertf128\t" } } */
+
+#include <immintrin.h>
+
+__m256d
+foo (double const *hi, double const *lo)
+{
+  return _mm256_loadu2_m128d (hi, lo);
+}
--- gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-2.c.jj	2019-08-04 16:52:20.358705400 +0200
+++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128d-2.c	2019-08-04 17:05:00.342201999 +0200
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+static void
+avx_test (void)
+{
+  union256d u;
+  double e[8] = { 1.5, -9.5, 13.25, -24.75, -18.75, 12.0, 0.0, 9.0 };
+  double f[4] = { 12.0, 0.0, -9.5, 13.25 };
+
+  u.x = _mm256_loadu2_m128d (e + 1, e + 5);
+  if (check_union256d (u, f))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-1.c.jj	2019-08-04 16:52:17.205753124 +0200
+++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-1.c	2019-08-04 17:06:44.386628690 +0200
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler "\tvmovdqu\t" } } */
+/* { dg-final { scan-assembler "\tvinsert\[fi]128\t" } } */
+
+#include <immintrin.h>
+
+__m256i
+foo (__m128i_u const *hi, __m128i_u const *lo)
+{
+  return _mm256_loadu2_m128i (hi, lo);
+}
--- gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-2.c.jj	2019-08-04 16:52:20.358705400 +0200
+++ gcc/testsuite/gcc.target/i386/avx-loadu2-m128i-2.c	2019-08-04 17:11:04.864691481 +0200
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+static void
+avx_test (void)
+{
+  union256i_d u;
+  int e[8] = { 1, -9, 13, -24, -18, 12, 0, 9 };
+  int f[8] = { -24, -18, 12, 0, -9, 13, -24, -18 };
+
+  u.x = _mm256_loadu2_m128i ((__m128i_u *) (e + 1), (__m128i_u *) (e + 3));
+  if (check_union256i_d (u, f))
+    abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx-storeu2-m128-1.c.jj	2019-08-04 17:13:27.124541181 +0200
+++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128-1.c	2019-08-04 17:15:14.546917455 +0200
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler "\tvmovups\t" } } */
+/* { dg-final { scan-assembler "\tvextractf128\t" } } */
+
+#include <immintrin.h>
+
+void
+foo (float *hi, float *lo, __m256 a)
+{
+  _mm256_storeu2_m128 (hi, lo, a);
+}
--- gcc/testsuite/gcc.target/i386/avx-storeu2-m128-2.c.jj	2019-08-04 17:13:30.135495667 +0200
+++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128-2.c	2019-08-04 17:19:36.590956577 +0200
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+static void
+avx_test (void)
+{
+  float e[12] = { -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f };
+  float f[12] = { -1.0f, -18.75f, 12.0f, 0.0f, 9.0f, -1.0f, 1.5f, -9.5f, 13.25f, -24.75f, -1.0f, -1.0f };
+  int i;
+  __m256 x = _mm256_set_ps (1.5f, -9.5f, 13.25f, -24.75f, -18.75f, 12.0f, 0.0f, 9.0f);
+  _mm256_storeu2_m128 (e + 1, e + 6, x);
+  for (i = 0; i < 12; i++)
+    if (e[i] != f[i])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-1.c.jj	2019-08-04 17:13:27.124541181 +0200
+++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-1.c	2019-08-04 17:34:55.951056592 +0200
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler "\tvmovup\[sd]\t" } } */
+/* { dg-final { scan-assembler "\tvextractf128\t" } } */
+
+#include <immintrin.h>
+
+void
+foo (double *hi, double *lo, __m256d a)
+{
+  _mm256_storeu2_m128d (hi, lo, a);
+}
--- gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-2.c.jj	2019-08-04 17:13:30.135495667 +0200
+++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128d-2.c	2019-08-04 17:35:17.505730678 +0200
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+static void
+avx_test (void)
+{
+  double e[8] = { -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0 };
+  double f[8] = { -1.0, 13.25, -24.75, -1.0, 1.5, -9.5, -1.0, -1.0 };
+  int i;
+  __m256d x = _mm256_set_pd (1.5, -9.5, 13.25, -24.75);
+  _mm256_storeu2_m128d (e + 1, e + 4, x);
+  for (i = 0; i < 8; i++)
+    if (e[i] != f[i])
+      abort ();
+}
--- gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-1.c.jj	2019-08-04 17:13:27.124541181 +0200
+++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-1.c	2019-08-04 17:42:55.207811439 +0200
@@ -0,0 +1,12 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-final { scan-assembler "\tvmov(dqu|ups)\t" } } */
+/* { dg-final { scan-assembler "\tvextract\[if]128\t" } } */
+
+#include <immintrin.h>
+
+void
+foo (__m128i_u *hi, __m128i_u *lo, __m256i a)
+{
+  _mm256_storeu2_m128i (hi, lo, a);
+}
--- gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-2.c.jj	2019-08-04 17:13:30.135495667 +0200
+++ gcc/testsuite/gcc.target/i386/avx-storeu2-m128i-2.c	2019-08-04 17:43:30.488278278 +0200
@@ -0,0 +1,18 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx" } */
+/* { dg-require-effective-target avx } */
+
+#include "avx-check.h"
+
+static void
+avx_test (void)
+{
+  int e[12] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
+  int f[12] = { -1, -18, 12, 0, 9, -1, 1, -9, 13, -24, -1, -1 };
+  int i;
+  __m256i x = _mm256_set_epi32 (1, -9, 13, -24, -18, 12, 0, 9);
+  _mm256_storeu2_m128i ((__m128i_u *) (e + 1), (__m128i_u *) (e + 6), x);
+  for (i = 0; i < 12; i++)
+    if (e[i] != f[i])
+      abort ();
+}