diff mbox series

[v2,ARM,4/x] : MVE ACLE vector interleaving store intrinsics.

Message ID DBBPR08MB477526473432F76B0B71490F9BFF0@DBBPR08MB4775.eurprd08.prod.outlook.com
State New
Headers show
Series [v2,ARM,4/x] : MVE ACLE vector interleaving store intrinsics. | expand

Commit Message

Srinath Parvathaneni March 10, 2020, 6:20 p.m. UTC
Hello Kyrill,

The following patch is the rebased version of v1.
(version v1) https://gcc.gnu.org/pipermail/gcc-patches/2019-November/534328.html

####

Hello,

This patch supports MVE ACLE intrinsics vst4q_s8, vst4q_s16, vst4q_s32, vst4q_u8,
vst4q_u16, vst4q_u32, vst4q_f16 and vst4q_f32.

This patch adds the file arm_mve_builtins.def, in which the builtins for the MVE
ACLE intrinsics are defined using builtin qualifiers.

Please refer to the M-profile Vector Extension (MVE) intrinsics [1] for more details.
[1] https://developer.arm.com/architectures/instruction-sets/simd-isas/helium/mve-intrinsics

Regression tested on targets arm-none-eabi and armeb-none-eabi; no regressions were found.

Ok for trunk?

Thanks,
Srinath.

gcc/ChangeLog:

2020-03-06  Andre Vieira  <andre.simoesdiasvieira@arm.com>
	    Mihail Ionescu  <mihail.ionescu@arm.com>
	    Srinath Parvathaneni  <srinath.parvathaneni@arm.com>

	* config/arm/arm-builtins.c (CF): Define mve_builtin_data.
	(VAR1): Define.
	(ARM_BUILTIN_MVE_PATTERN_START): Define.
	(arm_init_mve_builtins): Define function.
	(arm_init_builtins): Add TARGET_HAVE_MVE check.
	(arm_expand_builtin_1): Check the range of fcode.
	(arm_expand_mve_builtin): Define function to expand MVE builtins.
	(arm_expand_builtin): Check the range of fcode.
	* config/arm/arm_mve.h (__ARM_FEATURE_MVE): Define MVE floating point
	types.
	(__ARM_MVE_PRESERVE_USER_NAMESPACE): Define to protect user namespace.
	(vst4q_s8): Define macro.
	(vst4q_s16): Likewise.
	(vst4q_s32): Likewise.
	(vst4q_u8): Likewise.
	(vst4q_u16): Likewise.
	(vst4q_u32): Likewise.
	(vst4q_f16): Likewise.
	(vst4q_f32): Likewise.
	(__arm_vst4q_s8): Define inline builtin.
	(__arm_vst4q_s16): Likewise.
	(__arm_vst4q_s32): Likewise.
	(__arm_vst4q_u8): Likewise.
	(__arm_vst4q_u16): Likewise.
	(__arm_vst4q_u32): Likewise.
	(__arm_vst4q_f16): Likewise.
	(__arm_vst4q_f32): Likewise.
	(__ARM_mve_typeid): Define macro with MVE types.
	(__ARM_mve_coerce): Define macro with _Generic feature.
	(vst4q): Define polymorphic variant for different vst4q builtins.
	* config/arm/arm_mve_builtins.def: New file.
	* config/arm/iterators.md (VSTRUCT): Modify to allow XI and OI
	modes in MVE.
	* config/arm/mve.md (MVE_VLD_ST): Define iterator.
	(unspec): Define unspec.
	(mve_vst4q<mode>): Define RTL pattern.
	* config/arm/neon.md (mov<mode>): Modify expand to allow XI and OI
	modes in MVE.
	(neon_mov<mode>): Modify RTL define_insn to allow XI and OI modes
	in MVE.
	(define_split): Allow OI mode split for MVE after reload.
	(define_split): Allow XI mode split for MVE after reload.
	* config/arm/t-arm (arm.o): Add entry for arm_mve_builtins.def.
	(arm-builtins.o): Likewise.

gcc/testsuite/ChangeLog:

2020-03-06  Andre Vieira  <andre.simoesdiasvieira@arm.com>
	    Mihail Ionescu  <mihail.ionescu@arm.com>
	    Srinath Parvathaneni  <srinath.parvathaneni@arm.com>

	* gcc.target/arm/mve/intrinsics/vst4q_f16.c: New test.
	* gcc.target/arm/mve/intrinsics/vst4q_f32.c: Likewise.
	* gcc.target/arm/mve/intrinsics/vst4q_s16.c: Likewise.
	* gcc.target/arm/mve/intrinsics/vst4q_s32.c: Likewise.
	* gcc.target/arm/mve/intrinsics/vst4q_s8.c: Likewise.
	* gcc.target/arm/mve/intrinsics/vst4q_u16.c: Likewise.
	* gcc.target/arm/mve/intrinsics/vst4q_u32.c: Likewise.
	* gcc.target/arm/mve/intrinsics/vst4q_u8.c: Likewise.


###############     Attachment also inlined for ease of reply    ###############
diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 28917363eeae51b7cc39576f3c3e77a0350b8877..b9ee45d5950ac9c1e12d88cd7d3ece1953dc51d0 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -432,6 +432,13 @@ static arm_builtin_datum neon_builtin_data[] =
 };
 
 #undef CF
+#define CF(N,X) CODE_FOR_mve_##N##X
+static arm_builtin_datum mve_builtin_data[] =
+{
+#include "arm_mve_builtins.def"
+};
+
+#undef CF
 #undef VAR1
 #define VAR1(T, N, A) \
   {#N, UP (A), CODE_FOR_arm_##N, 0, T##_QUALIFIERS},
@@ -736,6 +743,13 @@ enum arm_builtins
 
 #include "arm_acle_builtins.def"
 
+  ARM_BUILTIN_MVE_BASE,
+
+#undef VAR1
+#define VAR1(T, N, X) \
+  ARM_BUILTIN_MVE_##N##X,
+#include "arm_mve_builtins.def"
+
   ARM_BUILTIN_MAX
 };
 
@@ -745,6 +759,9 @@ enum arm_builtins
 #define ARM_BUILTIN_NEON_PATTERN_START \
   (ARM_BUILTIN_NEON_BASE + 1)
 
+#define ARM_BUILTIN_MVE_PATTERN_START \
+  (ARM_BUILTIN_MVE_BASE + 1)
+
 #define ARM_BUILTIN_ACLE_PATTERN_START \
   (ARM_BUILTIN_ACLE_BASE + 1)
 
@@ -1276,6 +1293,22 @@ arm_init_acle_builtins (void)
     }
 }
 
+/* Register every arm_mve_builtins.def entry as a "__builtin_mve" builtin.  */
+static void
+arm_init_mve_builtins (void)
+{
+  unsigned int i, fcode = ARM_BUILTIN_MVE_PATTERN_START;
+
+  arm_init_simd_builtin_scalar_types ();
+  arm_init_simd_builtin_types ();
+
+  for (i = 0; i < ARRAY_SIZE (mve_builtin_data); i++, fcode++)
+    {
+      arm_builtin_datum *d = &mve_builtin_data[i];
+      arm_init_builtin (fcode, d, "__builtin_mve");
+    }
+}
+
 /* Set up all the NEON builtins, even builtins for instructions that are not
    in the current target ISA to allow the user to compile particular modules
    with different target specific options that differ from the command line
@@ -2020,8 +2053,10 @@ arm_init_builtins (void)
       = add_builtin_function ("__builtin_arm_lane_check", lane_check_fpr,
 			      ARM_BUILTIN_SIMD_LANE_CHECK, BUILT_IN_MD,
 			      NULL, NULL_TREE);
-
-      arm_init_neon_builtins ();
+      if (TARGET_HAVE_MVE)
+	arm_init_mve_builtins ();
+      else
+	arm_init_neon_builtins ();
       arm_init_vfp_builtins ();
       arm_init_crypto_builtins ();
     }
@@ -2565,10 +2600,14 @@ arm_expand_builtin_1 (int fcode, tree exp, rtx target,
   int is_void = 0;
   int k;
   bool neon = false;
+  bool mve = false;
 
   if (IN_RANGE (fcode, ARM_BUILTIN_VFP_BASE, ARM_BUILTIN_ACLE_BASE - 1))
     neon = true;
 
+  if (IN_RANGE (fcode, ARM_BUILTIN_MVE_BASE, ARM_BUILTIN_MAX - 1))
+    mve = true;
+
   is_void = !!(d->qualifiers[0] & qualifier_void);
 
   num_args += is_void;
@@ -2610,7 +2649,7 @@ arm_expand_builtin_1 (int fcode, tree exp, rtx target,
 	}
       else if (d->qualifiers[qualifiers_k] & qualifier_pointer)
 	{
-	  if (neon)
+	  if (neon || mve)
 	    args[k] = ARG_BUILTIN_NEON_MEMORY;
 	  else
 	    args[k] = ARG_BUILTIN_MEMORY;
@@ -2660,6 +2699,26 @@ arm_expand_acle_builtin (int fcode, tree exp, rtx target)
   return arm_expand_builtin_1 (fcode, exp, target, d);
 }
 
+/* Expand an MVE builtin, i.e. those registered only if their respective target
+   constraints are met.  This check happens within arm_expand_builtin.  */
+
+static rtx
+arm_expand_mve_builtin (int fcode, tree exp, rtx target)
+{
+  if (fcode >= ARM_BUILTIN_MVE_BASE && !TARGET_HAVE_MVE)
+  {
+    fatal_error (input_location,
+		"You must enable MVE instructions"
+		" to use these intrinsics");
+    return const0_rtx;
+  }
+
+  arm_builtin_datum *d
+    = &mve_builtin_data[fcode - ARM_BUILTIN_MVE_PATTERN_START];
+
+  return arm_expand_builtin_1 (fcode, exp, target, d);
+}
+
 /* Expand a Neon builtin, i.e. those registered only if TARGET_NEON holds.
    Most of these are "special" because they don't have symbolic
    constants defined per-instruction or per instruction-variant.  Instead, the
@@ -2753,6 +2812,8 @@ arm_expand_builtin (tree exp,
       /* Don't generate any RTL.  */
       return const0_rtx;
     }
+  if (fcode >= ARM_BUILTIN_MVE_BASE)
+    return arm_expand_mve_builtin (fcode, exp, target);
 
   if (fcode >= ARM_BUILTIN_ACLE_BASE)
     return arm_expand_acle_builtin (fcode, exp, target);
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 5ffb466596b5d8fc330616a6fcc7ee37d3e28def..39c6a1551a72700292dde8ef6cea44ba0907af8d 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -42,6 +42,13 @@ typedef __simd128_float16_t float16x8_t;
 typedef __simd128_float32_t float32x4_t;
 #endif
 
+#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
+typedef struct { float16x8_t val[2]; } float16x8x2_t;
+typedef struct { float16x8_t val[4]; } float16x8x4_t;
+typedef struct { float32x4_t val[2]; } float32x4x2_t;
+typedef struct { float32x4_t val[4]; } float32x4x4_t;
+#endif
+
 typedef uint16_t mve_pred16_t;
 typedef __simd128_uint8_t uint8x16_t;
 typedef __simd128_uint16_t uint16x8_t;
@@ -52,6 +59,330 @@ typedef __simd128_int16_t int16x8_t;
 typedef __simd128_int32_t int32x4_t;
 typedef __simd128_int64_t int64x2_t;
 
+typedef struct { int16x8_t val[2]; } int16x8x2_t;
+typedef struct { int16x8_t val[4]; } int16x8x4_t;
+typedef struct { int32x4_t val[2]; } int32x4x2_t;
+typedef struct { int32x4_t val[4]; } int32x4x4_t;
+typedef struct { int8x16_t val[2]; } int8x16x2_t;
+typedef struct { int8x16_t val[4]; } int8x16x4_t;
+typedef struct { uint16x8_t val[2]; } uint16x8x2_t;
+typedef struct { uint16x8_t val[4]; } uint16x8x4_t;
+typedef struct { uint32x4_t val[2]; } uint32x4x2_t;
+typedef struct { uint32x4_t val[4]; } uint32x4x4_t;
+typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
+typedef struct { uint8x16_t val[4]; } uint8x16x4_t;
+
+#ifndef __ARM_MVE_PRESERVE_USER_NAMESPACE
+#define vst4q_s8( __addr, __value) __arm_vst4q_s8( __addr, __value)
+#define vst4q_s16( __addr, __value) __arm_vst4q_s16( __addr, __value)
+#define vst4q_s32( __addr, __value) __arm_vst4q_s32( __addr, __value)
+#define vst4q_u8( __addr, __value) __arm_vst4q_u8( __addr, __value)
+#define vst4q_u16( __addr, __value) __arm_vst4q_u16( __addr, __value)
+#define vst4q_u32( __addr, __value) __arm_vst4q_u32( __addr, __value)
+#define vst4q_f16( __addr, __value) __arm_vst4q_f16( __addr, __value)
+#define vst4q_f32( __addr, __value) __arm_vst4q_f32( __addr, __value)
+#endif
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_s8 (int8_t * __addr, int8x16x4_t __value)
+{
+  union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv16qi ((__builtin_neon_qi *) __addr, __rv.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_s16 (int16_t * __addr, int16x8x4_t __value)
+{
+  union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv8hi ((__builtin_neon_hi *) __addr, __rv.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_s32 (int32_t * __addr, int32x4x4_t __value)
+{
+  union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv4si ((__builtin_neon_si *) __addr, __rv.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_u8 (uint8_t * __addr, uint8x16x4_t __value)
+{
+  union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv16qi ((__builtin_neon_qi *) __addr, __rv.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_u16 (uint16_t * __addr, uint16x8x4_t __value)
+{
+  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv8hi ((__builtin_neon_hi *) __addr, __rv.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_u32 (uint32_t * __addr, uint32x4x4_t __value)
+{
+  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv4si ((__builtin_neon_si *) __addr, __rv.__o);
+}
+
+#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_f16 (float16_t * __addr, float16x8x4_t __value)
+{
+  union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv8hf (__addr, __rv.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_f32 (float32_t * __addr, float32x4x4_t __value)
+{
+  union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv4sf (__addr, __rv.__o);
+}
+
+#endif
+
+enum {
+    __ARM_mve_type_float16_t = 1,
+    __ARM_mve_type_float16_t_ptr,
+    __ARM_mve_type_float16_t_const_ptr,
+    __ARM_mve_type_float16x8_t,
+    __ARM_mve_type_float16x8x2_t,
+    __ARM_mve_type_float16x8x4_t,
+    __ARM_mve_type_float32_t,
+    __ARM_mve_type_float32_t_ptr,
+    __ARM_mve_type_float32_t_const_ptr,
+    __ARM_mve_type_float32x4_t,
+    __ARM_mve_type_float32x4x2_t,
+    __ARM_mve_type_float32x4x4_t,
+    __ARM_mve_type_int16_t,
+    __ARM_mve_type_int16_t_ptr,
+    __ARM_mve_type_int16_t_const_ptr,
+    __ARM_mve_type_int16x8_t,
+    __ARM_mve_type_int16x8x2_t,
+    __ARM_mve_type_int16x8x4_t,
+    __ARM_mve_type_int32_t,
+    __ARM_mve_type_int32_t_ptr,
+    __ARM_mve_type_int32_t_const_ptr,
+    __ARM_mve_type_int32x4_t,
+    __ARM_mve_type_int32x4x2_t,
+    __ARM_mve_type_int32x4x4_t,
+    __ARM_mve_type_int64_t,
+    __ARM_mve_type_int64_t_ptr,
+    __ARM_mve_type_int64_t_const_ptr,
+    __ARM_mve_type_int64x2_t,
+    __ARM_mve_type_int8_t,
+    __ARM_mve_type_int8_t_ptr,
+    __ARM_mve_type_int8_t_const_ptr,
+    __ARM_mve_type_int8x16_t,
+    __ARM_mve_type_int8x16x2_t,
+    __ARM_mve_type_int8x16x4_t,
+    __ARM_mve_type_uint16_t,
+    __ARM_mve_type_uint16_t_ptr,
+    __ARM_mve_type_uint16_t_const_ptr,
+    __ARM_mve_type_uint16x8_t,
+    __ARM_mve_type_uint16x8x2_t,
+    __ARM_mve_type_uint16x8x4_t,
+    __ARM_mve_type_uint32_t,
+    __ARM_mve_type_uint32_t_ptr,
+    __ARM_mve_type_uint32_t_const_ptr,
+    __ARM_mve_type_uint32x4_t,
+    __ARM_mve_type_uint32x4x2_t,
+    __ARM_mve_type_uint32x4x4_t,
+    __ARM_mve_type_uint64_t,
+    __ARM_mve_type_uint64_t_ptr,
+    __ARM_mve_type_uint64_t_const_ptr,
+    __ARM_mve_type_uint64x2_t,
+    __ARM_mve_type_uint8_t,
+    __ARM_mve_type_uint8_t_ptr,
+    __ARM_mve_type_uint8_t_const_ptr,
+    __ARM_mve_type_uint8x16_t,
+    __ARM_mve_type_uint8x16x2_t,
+    __ARM_mve_type_uint8x16x4_t,
+    __ARM_mve_unsupported_type
+};
+
+#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
+#define __ARM_mve_typeid(x) _Generic(x, \
+    float16_t: __ARM_mve_type_float16_t, \
+    float16_t *: __ARM_mve_type_float16_t_ptr, \
+    float16_t const *: __ARM_mve_type_float16_t_const_ptr, \
+    float16x8_t: __ARM_mve_type_float16x8_t, \
+    float16x8x2_t: __ARM_mve_type_float16x8x2_t, \
+    float16x8x4_t: __ARM_mve_type_float16x8x4_t, \
+    float32_t: __ARM_mve_type_float32_t, \
+    float32_t *: __ARM_mve_type_float32_t_ptr, \
+    float32_t const *: __ARM_mve_type_float32_t_const_ptr, \
+    float32x4_t: __ARM_mve_type_float32x4_t, \
+    float32x4x2_t: __ARM_mve_type_float32x4x2_t, \
+    float32x4x4_t: __ARM_mve_type_float32x4x4_t, \
+    int16_t: __ARM_mve_type_int16_t, \
+    int16_t *: __ARM_mve_type_int16_t_ptr, \
+    int16_t const *: __ARM_mve_type_int16_t_const_ptr, \
+    int16x8_t: __ARM_mve_type_int16x8_t, \
+    int16x8x2_t: __ARM_mve_type_int16x8x2_t, \
+    int16x8x4_t: __ARM_mve_type_int16x8x4_t, \
+    int32_t: __ARM_mve_type_int32_t, \
+    int32_t *: __ARM_mve_type_int32_t_ptr, \
+    int32_t const *: __ARM_mve_type_int32_t_const_ptr, \
+    int32x4_t: __ARM_mve_type_int32x4_t, \
+    int32x4x2_t: __ARM_mve_type_int32x4x2_t, \
+    int32x4x4_t: __ARM_mve_type_int32x4x4_t, \
+    int64_t: __ARM_mve_type_int64_t, \
+    int64_t *: __ARM_mve_type_int64_t_ptr, \
+    int64_t const *: __ARM_mve_type_int64_t_const_ptr, \
+    int64x2_t: __ARM_mve_type_int64x2_t, \
+    int8_t: __ARM_mve_type_int8_t, \
+    int8_t *: __ARM_mve_type_int8_t_ptr, \
+    int8_t const *: __ARM_mve_type_int8_t_const_ptr, \
+    int8x16_t: __ARM_mve_type_int8x16_t, \
+    int8x16x2_t: __ARM_mve_type_int8x16x2_t, \
+    int8x16x4_t: __ARM_mve_type_int8x16x4_t, \
+    uint16_t: __ARM_mve_type_uint16_t, \
+    uint16_t *: __ARM_mve_type_uint16_t_ptr, \
+    uint16_t const *: __ARM_mve_type_uint16_t_const_ptr, \
+    uint16x8_t: __ARM_mve_type_uint16x8_t, \
+    uint16x8x2_t: __ARM_mve_type_uint16x8x2_t, \
+    uint16x8x4_t: __ARM_mve_type_uint16x8x4_t, \
+    uint32_t: __ARM_mve_type_uint32_t, \
+    uint32_t *: __ARM_mve_type_uint32_t_ptr, \
+    uint32_t const *: __ARM_mve_type_uint32_t_const_ptr, \
+    uint32x4_t: __ARM_mve_type_uint32x4_t, \
+    uint32x4x2_t: __ARM_mve_type_uint32x4x2_t, \
+    uint32x4x4_t: __ARM_mve_type_uint32x4x4_t, \
+    uint64_t: __ARM_mve_type_uint64_t, \
+    uint64_t *: __ARM_mve_type_uint64_t_ptr, \
+    uint64_t const *: __ARM_mve_type_uint64_t_const_ptr, \
+    uint64x2_t: __ARM_mve_type_uint64x2_t, \
+    uint8_t: __ARM_mve_type_uint8_t, \
+    uint8_t *: __ARM_mve_type_uint8_t_ptr, \
+    uint8_t const *: __ARM_mve_type_uint8_t_const_ptr, \
+    uint8x16_t: __ARM_mve_type_uint8x16_t, \
+    uint8x16x2_t: __ARM_mve_type_uint8x16x2_t, \
+    uint8x16x4_t: __ARM_mve_type_uint8x16x4_t, \
+    default: _Generic(x, \
+	signed char: __ARM_mve_type_int8_t, \
+	short: __ARM_mve_type_int16_t, \
+	int: __ARM_mve_type_int32_t, \
+	long: __ARM_mve_type_int32_t, \
+	long long: __ARM_mve_type_int64_t, \
+	unsigned char: __ARM_mve_type_uint8_t, \
+	unsigned short: __ARM_mve_type_uint16_t, \
+	unsigned int: __ARM_mve_type_uint32_t, \
+	unsigned long: __ARM_mve_type_uint32_t, \
+	unsigned long long: __ARM_mve_type_uint64_t, \
+	default: __ARM_mve_unsupported_type))
+#else
+#define __ARM_mve_typeid(x) _Generic(x, \
+    int16_t: __ARM_mve_type_int16_t, \
+    int16_t *: __ARM_mve_type_int16_t_ptr, \
+    int16_t const *: __ARM_mve_type_int16_t_const_ptr, \
+    int16x8_t: __ARM_mve_type_int16x8_t, \
+    int16x8x2_t: __ARM_mve_type_int16x8x2_t, \
+    int16x8x4_t: __ARM_mve_type_int16x8x4_t, \
+    int32_t: __ARM_mve_type_int32_t, \
+    int32_t *: __ARM_mve_type_int32_t_ptr, \
+    int32_t const *: __ARM_mve_type_int32_t_const_ptr, \
+    int32x4_t: __ARM_mve_type_int32x4_t, \
+    int32x4x2_t: __ARM_mve_type_int32x4x2_t, \
+    int32x4x4_t: __ARM_mve_type_int32x4x4_t, \
+    int64_t: __ARM_mve_type_int64_t, \
+    int64_t *: __ARM_mve_type_int64_t_ptr, \
+    int64_t const *: __ARM_mve_type_int64_t_const_ptr, \
+    int64x2_t: __ARM_mve_type_int64x2_t, \
+    int8_t: __ARM_mve_type_int8_t, \
+    int8_t *: __ARM_mve_type_int8_t_ptr, \
+    int8_t const *: __ARM_mve_type_int8_t_const_ptr, \
+    int8x16_t: __ARM_mve_type_int8x16_t, \
+    int8x16x2_t: __ARM_mve_type_int8x16x2_t, \
+    int8x16x4_t: __ARM_mve_type_int8x16x4_t, \
+    uint16_t: __ARM_mve_type_uint16_t, \
+    uint16_t *: __ARM_mve_type_uint16_t_ptr, \
+    uint16_t const *: __ARM_mve_type_uint16_t_const_ptr, \
+    uint16x8_t: __ARM_mve_type_uint16x8_t, \
+    uint16x8x2_t: __ARM_mve_type_uint16x8x2_t, \
+    uint16x8x4_t: __ARM_mve_type_uint16x8x4_t, \
+    uint32_t: __ARM_mve_type_uint32_t, \
+    uint32_t *: __ARM_mve_type_uint32_t_ptr, \
+    uint32_t const *: __ARM_mve_type_uint32_t_const_ptr, \
+    uint32x4_t: __ARM_mve_type_uint32x4_t, \
+    uint32x4x2_t: __ARM_mve_type_uint32x4x2_t, \
+    uint32x4x4_t: __ARM_mve_type_uint32x4x4_t, \
+    uint64_t: __ARM_mve_type_uint64_t, \
+    uint64_t *: __ARM_mve_type_uint64_t_ptr, \
+    uint64_t const *: __ARM_mve_type_uint64_t_const_ptr, \
+    uint64x2_t: __ARM_mve_type_uint64x2_t, \
+    uint8_t: __ARM_mve_type_uint8_t, \
+    uint8_t *: __ARM_mve_type_uint8_t_ptr, \
+    uint8_t const *: __ARM_mve_type_uint8_t_const_ptr, \
+    uint8x16_t: __ARM_mve_type_uint8x16_t, \
+    uint8x16x2_t: __ARM_mve_type_uint8x16x2_t, \
+    uint8x16x4_t: __ARM_mve_type_uint8x16x4_t, \
+    default: _Generic(x, \
+	signed char: __ARM_mve_type_int8_t, \
+	short: __ARM_mve_type_int16_t, \
+	int: __ARM_mve_type_int32_t, \
+	long: __ARM_mve_type_int32_t, \
+	long long: __ARM_mve_type_int64_t, \
+	unsigned char: __ARM_mve_type_uint8_t, \
+	unsigned short: __ARM_mve_type_uint16_t, \
+	unsigned int: __ARM_mve_type_uint32_t, \
+	unsigned long: __ARM_mve_type_uint32_t, \
+	unsigned long long: __ARM_mve_type_uint64_t, \
+	default: __ARM_mve_unsupported_type))
+#endif /* MVE Floating point.  */
+
+extern void *__ARM_undef;
+#define __ARM_mve_coerce(param, type) \
+    _Generic(param, type: param, default: *(type *)__ARM_undef)
+
+#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
+
+#define vst4q(p0,p1) __arm_vst4q(p0,p1)
+#define __arm_vst4q(p0,p1) ({ __typeof(p0) __p0 = (p0); \
+  __typeof(p1) __p1 = (p1); \
+  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x4_t]: __arm_vst4q_s8 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x4_t)), \
+  int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x4_t]: __arm_vst4q_s16 (__ARM_mve_coerce(__p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x4_t)), \
+  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x4_t]: __arm_vst4q_s32 (__ARM_mve_coerce(__p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x4_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x4_t]: __arm_vst4q_u8 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x4_t)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x4_t]: __arm_vst4q_u16 (__ARM_mve_coerce(__p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x4_t)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x4_t]: __arm_vst4q_u32 (__ARM_mve_coerce(__p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x4_t)), \
+  int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8x4_t]: __arm_vst4q_f16 (__ARM_mve_coerce(__p0, float16_t *), __ARM_mve_coerce(__p1, float16x8x4_t)), \
+  int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4x4_t]: __arm_vst4q_f32 (__ARM_mve_coerce(__p0, float32_t *), __ARM_mve_coerce(__p1, float32x4x4_t)));})
+
+#else /* MVE Integer.  */
+
+#define vst4q(p0,p1) __arm_vst4q(p0,p1)
+#define __arm_vst4q(p0,p1) ({ __typeof(p0) __p0 = (p0); \
+  __typeof(p1) __p1 = (p1); \
+  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x4_t]: __arm_vst4q_s8 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x4_t)), \
+  int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x4_t]: __arm_vst4q_s16 (__ARM_mve_coerce(__p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x4_t)), \
+  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x4_t]: __arm_vst4q_s32 (__ARM_mve_coerce(__p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x4_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x4_t]: __arm_vst4q_u8 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x4_t)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x4_t]: __arm_vst4q_u16 (__ARM_mve_coerce(__p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x4_t)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x4_t]: __arm_vst4q_u32 (__ARM_mve_coerce(__p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x4_t)));})
+
+#endif /* MVE Floating point.  */
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def
new file mode 100644
index 0000000000000000000000000000000000000000..4a7e4d06f0af1e639788848041133a800b183e83
--- /dev/null
+++ b/gcc/config/arm/arm_mve_builtins.def
@@ -0,0 +1,21 @@
+/*  MVE builtin definitions for Arm.
+    Copyright  (C) 2019-2020 Free Software Foundation, Inc.
+    Contributed by Arm.
+
+    This file is part of GCC.
+
+    GCC is free software; you can redistribute it and/or modify it
+    under the terms of the GNU General Public License as published
+    by the Free Software Foundation; either version 3, or (at your
+    option) any later version.
+
+    GCC is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+    License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with GCC; see the file COPYING3.  If not see
+    <http://www.gnu.org/licenses/>.  */
+
+VAR5 (STORE1, vst4q, v16qi, v8hi, v4si, v8hf, v4sf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 6af76580c0c5d0a6efd47eb1409061c74dca6378..5c1a11bf7dee7590d668e7ec5e3b068789b3b3db 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -131,7 +131,8 @@
 (define_mode_iterator VQXMOV [V16QI V8HI V8HF V8BF V4SI V4SF V2DI TI])
 
 ;; Opaque structure types wider than TImode.
-(define_mode_iterator VSTRUCT [EI OI CI XI])
+(define_mode_iterator VSTRUCT [(EI "!TARGET_HAVE_MVE") OI
+			       (CI "!TARGET_HAVE_MVE") XI])
 
 ;; Opaque structure types used in table lookups (except vtbl1/vtbx1).
 (define_mode_iterator VTAB [TI EI OI])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 92176a57bd75695a63b1bc958c9b2791f79ea6a4..9b3bb6129a99c4f3c952e7a72d0874dcba12f30d 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -17,9 +17,12 @@
 ;; along with GCC; see the file COPYING3.  If not see
 ;; <http://www.gnu.org/licenses/>.
 
-(define_mode_iterator MVE_types [V16QI V8HI V4SI V2DI TI V8HF V4SF V2DF])
 (define_mode_attr V_sz_elem2 [(V16QI "s8") (V8HI "u16") (V4SI "u32")
 			      (V2DI "u64")])
+(define_mode_iterator MVE_types [V16QI V8HI V4SI V2DI TI V8HF V4SF V2DF])
+(define_mode_iterator MVE_VLD_ST [V16QI V8HI V4SI V8HF V4SF])
+
+(define_c_enum "unspec" [VST4Q])
 
 (define_insn "*mve_mov<mode>"
   [(set (match_operand:MVE_types 0 "nonimmediate_operand" "=w,w,r,w,w,r,w,Us")
@@ -83,3 +86,37 @@
 }
   [(set_attr "length" "4,4")
    (set_attr "type" "mve_move,mve_move")])
+
+;;
+;; [vst4q])
+;;
+(define_insn "mve_vst4q<mode>"
+  [(set (match_operand:XI 0 "neon_struct_operand" "=Um")
+	(unspec:XI [(match_operand:XI 1 "s_register_operand" "w")
+		    (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+	 VST4Q))
+  ]
+  "TARGET_HAVE_MVE"
+{
+   rtx ops[6];
+   int regno = REGNO (operands[1]);
+   ops[0] = gen_rtx_REG (TImode, regno);
+   ops[1] = gen_rtx_REG (TImode, regno+4);
+   ops[2] = gen_rtx_REG (TImode, regno+8);
+   ops[3] = gen_rtx_REG (TImode, regno+12);
+   rtx reg  = operands[0];
+   while (reg && !REG_P (reg))
+    reg = XEXP (reg, 0);
+   gcc_assert (REG_P (reg));
+   ops[4] = reg;
+   ops[5] = operands[0];
+   /* The first three instructions address memory through the base register
+      in ops[4]; only the fourth uses the full operands[0] address, so that
+      a writeback form of the address is emitted once, on the last store.  */
+   output_asm_insn ("vst40.<V_sz_elem>\t{%q0, %q1, %q2, %q3}, [%4]\n\t"
+		    "vst41.<V_sz_elem>\t{%q0, %q1, %q2, %q3}, [%4]\n\t"
+		    "vst42.<V_sz_elem>\t{%q0, %q1, %q2, %q3}, [%4]\n\t"
+		    "vst43.<V_sz_elem>\t{%q0, %q1, %q2, %q3}, %5", ops);
+   return "";
+}
+  [(set_attr "length" "16")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index b6a8eb62c8ff05fc32805029fea8f682402d7399..fbfeef233f38831a5cb256622625879d15209431 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -149,7 +149,7 @@
 (define_expand "mov<mode>"
   [(set (match_operand:VSTRUCT 0 "nonimmediate_operand")
 	(match_operand:VSTRUCT 1 "general_operand"))]
-  "TARGET_NEON"
+  "TARGET_NEON || TARGET_HAVE_MVE"
 {
   gcc_checking_assert (aligned_operand (operands[0], <MODE>mode));
   gcc_checking_assert (aligned_operand (operands[1], <MODE>mode));
@@ -181,7 +181,7 @@
 (define_insn "*neon_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "nonimmediate_operand"	"=w,Ut,w")
 	(match_operand:VSTRUCT 1 "general_operand"	" w,w, Ut"))]
-  "TARGET_NEON
+  "(TARGET_NEON || TARGET_HAVE_MVE)
    && (register_operand (operands[0], <MODE>mode)
        || register_operand (operands[1], <MODE>mode))"
 {
@@ -217,7 +217,7 @@
 (define_split
   [(set (match_operand:OI 0 "s_register_operand" "")
 	(match_operand:OI 1 "s_register_operand" ""))]
-  "TARGET_NEON && reload_completed"
+  "(TARGET_NEON || TARGET_HAVE_MVE)&& reload_completed"
   [(set (match_dup 0) (match_dup 1))
    (set (match_dup 2) (match_dup 3))]
 {
@@ -258,7 +258,7 @@
 (define_split
   [(set (match_operand:XI 0 "s_register_operand" "")
 	(match_operand:XI 1 "s_register_operand" ""))]
-  "TARGET_NEON && reload_completed"
+  "(TARGET_NEON || TARGET_HAVE_MVE) && reload_completed"
   [(set (match_dup 0) (match_dup 1))
    (set (match_dup 2) (match_dup 3))
    (set (match_dup 4) (match_dup 5))
diff --git a/gcc/config/arm/t-arm b/gcc/config/arm/t-arm
index 2d980830e7669e60d6f2cb61014389d121bfbcb1..1f7f169731301edc1f275342ad7a0eb46407f6e4 100644
--- a/gcc/config/arm/t-arm
+++ b/gcc/config/arm/t-arm
@@ -137,7 +137,8 @@ arm.o: $(srcdir)/config/arm/arm.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
   arm-cpu-data.h \
   $(srcdir)/config/arm/arm-protos.h \
   $(srcdir)/config/arm/arm_neon_builtins.def \
-  $(srcdir)/config/arm/arm_vfp_builtins.def
+  $(srcdir)/config/arm/arm_vfp_builtins.def \
+  $(srcdir)/config/arm/arm_mve_builtins.def
 
 arm-builtins.o: $(srcdir)/config/arm/arm-builtins.c $(CONFIG_H) \
   $(SYSTEM_H) coretypes.h $(TM_H) \
@@ -147,6 +148,7 @@ arm-builtins.o: $(srcdir)/config/arm/arm-builtins.c $(CONFIG_H) \
   $(srcdir)/config/arm/arm_acle_builtins.def \
   $(srcdir)/config/arm/arm_neon_builtins.def \
   $(srcdir)/config/arm/arm_vfp_builtins.def \
+  $(srcdir)/config/arm/arm_mve_builtins.def \
   $(srcdir)/config/arm/arm-simd-builtin-types.def
 	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
 		$(srcdir)/config/arm/arm-builtins.c
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f16.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..8516cfa832b03b4e8713aef8fc1cbabdf62c13f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f16.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (float16_t * addr, float16x8x4_t value)
+{
+  vst4q_f16 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+
+void
+foo1 (float16_t * addr, float16x8x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+
+void
+foo2 (float16_t * addr, float16x8x4_t value)
+{
+  vst4q_f16 (addr, value);
+  addr += 32;
+  vst4q_f16 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.16\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f32.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..c8b327259ff44d3d56377e0f15a03719792e534c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f32.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+/* Interleaving store through the type-suffixed vst4q_f32 intrinsic.  */
+void
+foo (float32_t * addr, float32x4x4_t value)
+{
+  vst4q_f32 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+/* Polymorphic vst4q: the variant is selected from the argument types.  */
+void
+foo1 (float32_t * addr, float32x4x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+/* Consecutive stores: the scan below wants vst43 in '!' writeback form.  */
+void
+foo2 (float32_t * addr, float32x4x4_t value)
+{
+  vst4q_f32 (addr, value);
+  addr += 16;  /* 4 vectors x 4 lanes: the elements one vst4q writes.  */
+  vst4q_f32 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.32\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s16.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s16.c
new file mode 100644
index 0000000000000000000000000000000000000000..d06947d04d828f0bdf831dbef13b3a8ebb70c75a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s16.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+/* Interleaving store through the type-suffixed vst4q_s16 intrinsic.  */
+void
+foo (int16_t * addr, int16x8x4_t value)
+{
+  vst4q_s16 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+/* Polymorphic vst4q: the variant is selected from the argument types.  */
+void
+foo1 (int16_t * addr, int16x8x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+/* Consecutive stores: the scan below wants vst43 in '!' writeback form.  */
+void
+foo2 (int16_t * addr, int16x8x4_t value)
+{
+  vst4q_s16 (addr, value);
+  addr += 32;  /* 4 vectors x 8 lanes: the elements one vst4q writes.  */
+  vst4q_s16 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.16\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s32.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s32.c
new file mode 100644
index 0000000000000000000000000000000000000000..5dc6835ef847032ce3a7d2c538f53eaec38dd0ca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s32.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+/* Interleaving store through the type-suffixed vst4q_s32 intrinsic.  */
+void
+foo (int32_t * addr, int32x4x4_t value)
+{
+  vst4q_s32 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+/* Polymorphic vst4q: the variant is selected from the argument types.  */
+void
+foo1 (int32_t * addr, int32x4x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+/* Consecutive stores: the scan below wants vst43 in '!' writeback form.  */
+void
+foo2 (int32_t * addr, int32x4x4_t value)
+{
+  vst4q_s32 (addr, value);
+  addr += 16;  /* 4 vectors x 4 lanes: the elements one vst4q writes.  */
+  vst4q_s32 (addr, value); 
+}
+
+/* { dg-final { scan-assembler {vst43.32\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s8.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s8.c
new file mode 100644
index 0000000000000000000000000000000000000000..a3cb53e229236dc2f3355ef8622a6c99a9b9f3da
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s8.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+/* Interleaving store through the type-suffixed vst4q_s8 intrinsic.  */
+void
+foo (int8_t * addr, int8x16x4_t value)
+{
+  vst4q_s8 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.8"  }  } */
+/* { dg-final { scan-assembler "vst41.8"  }  } */
+/* { dg-final { scan-assembler "vst42.8"  }  } */
+/* { dg-final { scan-assembler "vst43.8"  }  } */
+/* Polymorphic vst4q: the variant is selected from the argument types.  */
+void
+foo1 (int8_t * addr, int8x16x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.8"  }  } */
+/* { dg-final { scan-assembler "vst41.8"  }  } */
+/* { dg-final { scan-assembler "vst42.8"  }  } */
+/* { dg-final { scan-assembler "vst43.8"  }  } */
+/* Consecutive stores: the scan below wants vst43 in '!' writeback form.  */
+void
+foo2 (int8_t * addr, int8x16x4_t value)
+{
+  vst4q_s8 (addr, value);
+  addr += 16*4;  /* 4 vectors x 16 lanes: the elements one vst4q writes.  */
+  vst4q_s8 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.8\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u16.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u16.c
new file mode 100644
index 0000000000000000000000000000000000000000..87dd4bfc0624c8340f522dde3ca6818266d7cde0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u16.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+/* Interleaving store through the type-suffixed vst4q_u16 intrinsic.  */
+void
+foo (uint16_t * addr, uint16x8x4_t value)
+{
+  vst4q_u16 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+/* Polymorphic vst4q: the variant is selected from the argument types.  */
+void
+foo1 (uint16_t * addr, uint16x8x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+/* Consecutive stores: the scan below wants vst43 in '!' writeback form.  */
+void
+foo2 (uint16_t * addr, uint16x8x4_t value)
+{
+  vst4q_u16 (addr, value);
+  addr += 32;  /* 4 vectors x 8 lanes: the elements one vst4q writes.  */
+  vst4q_u16 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.16\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u32.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u32.c
new file mode 100644
index 0000000000000000000000000000000000000000..943aa0266a7da619295e3e30fe99c011c1b7184e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u32.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+/* Interleaving store through the type-suffixed vst4q_u32 intrinsic.  */
+void
+foo (uint32_t * addr, uint32x4x4_t value)
+{
+  vst4q_u32 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+/* Polymorphic vst4q: the variant is selected from the argument types.  */
+void
+foo1 (uint32_t * addr, uint32x4x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+/* Consecutive stores: the scan below wants vst43 in '!' writeback form.  */
+void
+foo2 (uint32_t * addr, uint32x4x4_t value)
+{
+  vst4q_u32 (addr, value);
+  addr += 16;  /* 4 vectors x 4 lanes: the elements one vst4q writes.  */
+  vst4q_u32 (addr, value); 
+}
+
+/* { dg-final { scan-assembler {vst43.32\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u8.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u8.c
new file mode 100644
index 0000000000000000000000000000000000000000..c8dcc97087c6f3b795fa26337b97a802e5e13c44
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u8.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+/* Interleaving store through the type-suffixed vst4q_u8 intrinsic.  */
+void
+foo (uint8_t * addr, uint8x16x4_t value)
+{
+  vst4q_u8 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.8"  }  } */
+/* { dg-final { scan-assembler "vst41.8"  }  } */
+/* { dg-final { scan-assembler "vst42.8"  }  } */
+/* { dg-final { scan-assembler "vst43.8"  }  } */
+/* Polymorphic vst4q: the variant is selected from the argument types.  */
+void
+foo1 (uint8_t * addr, uint8x16x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.8"  }  } */
+/* { dg-final { scan-assembler "vst41.8"  }  } */
+/* { dg-final { scan-assembler "vst42.8"  }  } */
+/* { dg-final { scan-assembler "vst43.8"  }  } */
+/* Consecutive stores: the scan below wants vst43 in '!' writeback form.  */
+void
+foo2 (uint8_t * addr, uint8x16x4_t value)
+{
+  vst4q_u8 (addr, value);
+  addr += 16*4;  /* 4 vectors x 16 lanes: the elements one vst4q writes.  */
+  vst4q_u8 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.8\s\{.*\}, \[.*\]!}  }  } */

Comments

Kyrylo Tkachov March 17, 2020, 11:19 a.m. UTC | #1
Hi Srinath,

Thanks, I've pushed this to master.
Kyrill

-----Original Message-----
From: Srinath Parvathaneni <Srinath.Parvathaneni@arm.com> 
Sent: 10 March 2020 18:20
To: gcc-patches@gcc.gnu.org
Cc: Kyrylo Tkachov <Kyrylo.Tkachov@arm.com>
Subject: [PATCH v2][ARM][GCC][4/x]: MVE ACLE vector interleaving store intrinsics.

Hello Kyrill,

The following patch is the rebased version of v1.
(version v1) https://gcc.gnu.org/pipermail/gcc-patches/2019-November/534328.html

####

Hello,

This patch supports MVE ACLE intrinsics vst4q_s8, vst4q_s16, vst4q_s32, vst4q_u8, vst4q_u16, vst4q_u32, vst4q_f16 and vst4q_f32.

This patch adds the file arm_mve_builtins.def, in which the builtins for the MVE ACLE intrinsics are defined using builtin qualifiers.

Please refer to M-profile Vector Extension (MVE) intrinsics [1]  for more details.
[1] https://developer.arm.com/architectures/instruction-sets/simd-isas/helium/mve-intrinsics

Regression tested on target arm-none-eabi and armeb-none-eabi and found no regressions.

Ok for trunk?

Thanks,
Srinath.

gcc/ChangeLog:

2020-03-06  Andre Vieira  <andre.simoesdiasvieira@arm.com>
	    Mihail Ionescu  <mihail.ionescu@arm.com>
	    Srinath Parvathaneni  <srinath.parvathaneni@arm.com>

	* config/arm/arm-builtins.c (CF): Define mve_builtin_data.
	(VAR1): Define.
	(ARM_BUILTIN_MVE_PATTERN_START): Define.
	(arm_init_mve_builtins): Define function.
	(arm_init_builtins): Add TARGET_HAVE_MVE check.
	(arm_expand_builtin_1): Check the range of fcode.
	(arm_expand_mve_builtin): Define function to expand MVE builtins.
	(arm_expand_builtin): Check the range of fcode.
	* config/arm/arm_mve.h (__ARM_FEATURE_MVE): Define MVE floating point
        types.
	(__ARM_MVE_PRESERVE_USER_NAMESPACE): Define to protect user namespace.
	(vst4q_s8): Define macro.
	(vst4q_s16): Likewise.
	(vst4q_s32): Likewise.
	(vst4q_u8): Likewise.
	(vst4q_u16): Likewise.
	(vst4q_u32): Likewise.
	(vst4q_f16): Likewise.
	(vst4q_f32): Likewise.
	(__arm_vst4q_s8): Define inline builtin.
	(__arm_vst4q_s16): Likewise.
	(__arm_vst4q_s32): Likewise.
	(__arm_vst4q_u8): Likewise.
	(__arm_vst4q_u16): Likewise.
	(__arm_vst4q_u32): Likewise.
	(__arm_vst4q_f16): Likewise.
	(__arm_vst4q_f32): Likewise.
	(__ARM_mve_typeid): Define macro with MVE types.
	(__ARM_mve_coerce): Define macro with _Generic feature.
	(vst4q): Define polymorphic variant for different vst4q builtins.
	* config/arm/arm_mve_builtins.def: New file.
	* config/arm/iterators.md (VSTRUCT): Modify to allow XI and OI
	modes in MVE.
	* config/arm/mve.md (MVE_VLD_ST): Define iterator.
	(unspec): Define unspec.
	(mve_vst4q<mode>): Define RTL pattern.
	* config/arm/neon.md (mov<mode>): Modify expand to allow XI and OI
	modes in MVE.
	(neon_mov<mode>): Modify RTL define_insn to allow XI and OI modes
	in MVE.
	(define_split): Allow OI mode split for MVE after reload.
	(define_split): Allow XI mode split for MVE after reload.
	* config/arm/t-arm (arm.o): Add entry for arm_mve_builtins.def.
	(arm-builtins.o): Likewise.

gcc/testsuite/ChangeLog:

2020-03-06  Andre Vieira  <andre.simoesdiasvieira@arm.com>
	    Mihail Ionescu  <mihail.ionescu@arm.com>
	    Srinath Parvathaneni  <srinath.parvathaneni@arm.com>

	* gcc.target/arm/mve/intrinsics/vst4q_f16.c: New test.
	* gcc.target/arm/mve/intrinsics/vst4q_f32.c: Likewise.
	* gcc.target/arm/mve/intrinsics/vst4q_s16.c: Likewise.
	* gcc.target/arm/mve/intrinsics/vst4q_s32.c: Likewise.
	* gcc.target/arm/mve/intrinsics/vst4q_s8.c: Likewise.
	* gcc.target/arm/mve/intrinsics/vst4q_u16.c: Likewise.
	* gcc.target/arm/mve/intrinsics/vst4q_u32.c: Likewise.
	* gcc.target/arm/mve/intrinsics/vst4q_u8.c: Likewise.


###############     Attachment also inlined for ease of reply    ###############


diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c index 28917363eeae51b7cc39576f3c3e77a0350b8877..b9ee45d5950ac9c1e12d88cd7d3ece1953dc51d0 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -432,6 +432,13 @@ static arm_builtin_datum neon_builtin_data[] =  };
 
 #undef CF
+#define CF(N,X) CODE_FOR_mve_##N##X
+static arm_builtin_datum mve_builtin_data[] = { #include 
+"arm_mve_builtins.def"
+};
+
+#undef CF
 #undef VAR1
 #define VAR1(T, N, A) \
   {#N, UP (A), CODE_FOR_arm_##N, 0, T##_QUALIFIERS}, @@ -736,6 +743,13 @@ enum arm_builtins
 
 #include "arm_acle_builtins.def"
 
+  ARM_BUILTIN_MVE_BASE,
+
+#undef VAR1
+#define VAR1(T, N, X) \
+  ARM_BUILTIN_MVE_##N##X,
+#include "arm_mve_builtins.def"
+
   ARM_BUILTIN_MAX
 };
 
@@ -745,6 +759,9 @@ enum arm_builtins
 #define ARM_BUILTIN_NEON_PATTERN_START \
   (ARM_BUILTIN_NEON_BASE + 1)
 
+#define ARM_BUILTIN_MVE_PATTERN_START \
+  (ARM_BUILTIN_MVE_BASE + 1)  /* Function code given to entry 0 of mve_builtin_data.  */
+
 #define ARM_BUILTIN_ACLE_PATTERN_START \
   (ARM_BUILTIN_ACLE_BASE + 1)
 
@@ -1276,6 +1293,22 @@ arm_init_acle_builtins (void)
     }
 }
 
+/* Set up all the MVE builtins mentioned in arm_mve_builtins.def file: each entry of mve_builtin_data is passed to arm_init_builtin with the "__builtin_mve" prefix string.  
+*/ static void arm_init_mve_builtins (void) {
+  volatile unsigned int i, fcode = ARM_BUILTIN_MVE_PATTERN_START;
+
+  arm_init_simd_builtin_scalar_types ();  arm_init_simd_builtin_types 
+ ();
+  /* Entry i of mve_builtin_data gets function code PATTERN_START + i.  */
+  for (i = 0; i < ARRAY_SIZE (mve_builtin_data); i++, fcode++)
+    {
+      arm_builtin_datum *d = &mve_builtin_data[i];
+      arm_init_builtin (fcode, d, "__builtin_mve");
+    }
+}
+
 /* Set up all the NEON builtins, even builtins for instructions that are not
    in the current target ISA to allow the user to compile particular modules
    with different target specific options that differ from the command line @@ -2020,8 +2053,10 @@ arm_init_builtins (void)
       = add_builtin_function ("__builtin_arm_lane_check", lane_check_fpr,
 			      ARM_BUILTIN_SIMD_LANE_CHECK, BUILT_IN_MD,
 			      NULL, NULL_TREE);
-
-      arm_init_neon_builtins ();
+      if (TARGET_HAVE_MVE)
+	arm_init_mve_builtins ();
+      else
+	arm_init_neon_builtins ();
       arm_init_vfp_builtins ();
       arm_init_crypto_builtins ();
     }
@@ -2565,10 +2600,14 @@ arm_expand_builtin_1 (int fcode, tree exp, rtx target,
   int is_void = 0;
   int k;
   bool neon = false;
+  bool mve = false;
 
   if (IN_RANGE (fcode, ARM_BUILTIN_VFP_BASE, ARM_BUILTIN_ACLE_BASE - 1))
     neon = true;
 
+  if (IN_RANGE (fcode, ARM_BUILTIN_MVE_BASE, ARM_BUILTIN_MAX - 1))
+    mve = true;
+
   is_void = !!(d->qualifiers[0] & qualifier_void);
 
   num_args += is_void;
@@ -2610,7 +2649,7 @@ arm_expand_builtin_1 (int fcode, tree exp, rtx target,
 	}
       else if (d->qualifiers[qualifiers_k] & qualifier_pointer)
 	{
-	  if (neon)
+	  if (neon || mve)
 	    args[k] = ARG_BUILTIN_NEON_MEMORY;
 	  else
 	    args[k] = ARG_BUILTIN_MEMORY;
@@ -2660,6 +2699,26 @@ arm_expand_acle_builtin (int fcode, tree exp, rtx target)
   return arm_expand_builtin_1 (fcode, exp, target, d);  }
 
+/* Expand the call EXP to the MVE builtin with function code FCODE by handing its
+   table entry to arm_expand_builtin_1; TARGET is passed through.  Called from arm_expand_builtin.  
+*/
+
+static rtx
+arm_expand_mve_builtin (int fcode, tree exp, rtx target) {
+  if (fcode >= ARM_BUILTIN_MVE_BASE && !TARGET_HAVE_MVE)
+  {
+    fatal_error (input_location,
+		"You must enable MVE instructions"
+		" to use these intrinsics");
+    return const0_rtx;  /* NOTE(review): likely unreachable if fatal_error never returns -- confirm.  */
+  }
+  /* Table index is the offset of FCODE from the first MVE function code.  */
+  arm_builtin_datum *d
+    = &mve_builtin_data[fcode - ARM_BUILTIN_MVE_PATTERN_START];
+
+  return arm_expand_builtin_1 (fcode, exp, target, d); }
+
 /* Expand a Neon builtin, i.e. those registered only if TARGET_NEON holds.
    Most of these are "special" because they don't have symbolic
    constants defined per-instruction or per instruction-variant.  Instead, the @@ -2753,6 +2812,8 @@ arm_expand_builtin (tree exp,
       /* Don't generate any RTL.  */
       return const0_rtx;
     }
+  if (fcode >= ARM_BUILTIN_MVE_BASE)
+    return arm_expand_mve_builtin (fcode, exp, target);
 
   if (fcode >= ARM_BUILTIN_ACLE_BASE)
     return arm_expand_acle_builtin (fcode, exp, target); diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h index 5ffb466596b5d8fc330616a6fcc7ee37d3e28def..39c6a1551a72700292dde8ef6cea44ba0907af8d 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -42,6 +42,13 @@ typedef __simd128_float16_t float16x8_t;  typedef __simd128_float32_t float32x4_t;  #endif
 
+#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point: tuple types holding 2 or 4 vectors in val[].  */ typedef struct { 
+float16x8_t val[2]; } float16x8x2_t; typedef struct { float16x8_t 
+val[4]; } float16x8x4_t; typedef struct { float32x4_t val[2]; } 
+float32x4x2_t; typedef struct { float32x4_t val[4]; } float32x4x4_t; 
+#endif
+
 typedef uint16_t mve_pred16_t;
 typedef __simd128_uint8_t uint8x16_t;
 typedef __simd128_uint16_t uint16x8_t;
@@ -52,6 +59,330 @@ typedef __simd128_int16_t int16x8_t;  typedef __simd128_int32_t int32x4_t;  typedef __simd128_int64_t int64x2_t;
 
+typedef struct { int16x8_t val[2]; } int16x8x2_t; typedef struct { 
+int16x8_t val[4]; } int16x8x4_t; typedef struct { int32x4_t val[2]; } 
+int32x4x2_t; typedef struct { int32x4_t val[4]; } int32x4x4_t; typedef 
+struct { int8x16_t val[2]; } int8x16x2_t; typedef struct { int8x16_t 
+val[4]; } int8x16x4_t; typedef struct { uint16x8_t val[2]; } 
+uint16x8x2_t; typedef struct { uint16x8_t val[4]; } uint16x8x4_t; 
+typedef struct { uint32x4_t val[2]; } uint32x4x2_t; typedef struct { 
+uint32x4_t val[4]; } uint32x4x4_t; typedef struct { uint8x16_t val[2]; 
+} uint8x16x2_t; typedef struct { uint8x16_t val[4]; } uint8x16x4_t; /* Integer tuple types holding 2 or 4 vectors in val[].  */
+/* Unprefixed vst4q_<type> names forward to the __arm_ versions unless __ARM_MVE_PRESERVE_USER_NAMESPACE is defined.  */
+#ifndef __ARM_MVE_PRESERVE_USER_NAMESPACE #define vst4q_s8( __addr, 
+__value) __arm_vst4q_s8( __addr, __value) #define vst4q_s16( __addr, 
+__value) __arm_vst4q_s16( __addr, __value) #define vst4q_s32( __addr, 
+__value) __arm_vst4q_s32( __addr, __value) #define vst4q_u8( __addr, 
+__value) __arm_vst4q_u8( __addr, __value) #define vst4q_u16( __addr, 
+__value) __arm_vst4q_u16( __addr, __value) #define vst4q_u32( __addr, 
+__value) __arm_vst4q_u32( __addr, __value) #define vst4q_f16( __addr, 
+__value) __arm_vst4q_f16( __addr, __value) #define vst4q_f32( __addr, 
+__value) __arm_vst4q_f32( __addr, __value) #endif
+/* vst4q for int8: __value, viewed as __builtin_neon_xi through a union, goes to the v16qi builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_s8 (int8_t * __addr, int8x16x4_t __value) {
+  union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv16qi ((__builtin_neon_qi *) __addr, __rv.__o); }
+/* vst4q for int16: __value, viewed as __builtin_neon_xi through a union, goes to the v8hi builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_s16 (int16_t * __addr, int16x8x4_t __value) {
+  union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv8hi ((__builtin_neon_hi *) __addr, __rv.__o); }
+/* vst4q for int32: __value, viewed as __builtin_neon_xi through a union, goes to the v4si builtin.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_s32 (int32_t * __addr, int32x4x4_t __value) {
+  union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv4si ((__builtin_neon_si *) __addr, __rv.__o); }
+/* vst4q for uint8: calls the same v16qi builtin as the int8 variant, with __addr cast to __builtin_neon_qi *.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_u8 (uint8_t * __addr, uint8x16x4_t __value) {
+  union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv16qi ((__builtin_neon_qi *) __addr, __rv.__o); }
+/* vst4q for uint16: calls the same v8hi builtin as the int16 variant, with __addr cast to __builtin_neon_hi *.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_u16 (uint16_t * __addr, uint16x8x4_t __value) {
+  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv8hi ((__builtin_neon_hi *) __addr, __rv.__o); }
+/* vst4q for uint32: calls the same v4si builtin as the int32 variant, with __addr cast to __builtin_neon_si *.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_u32 (uint32_t * __addr, uint32x4x4_t __value) {
+  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv4si ((__builtin_neon_si *) __addr, __rv.__o); }
+
+#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
+/* vst4q for float16: __value goes to the v8hf builtin as __builtin_neon_xi; __addr is passed without a cast.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_f16 (float16_t * __addr, float16x8x4_t __value) {
+  union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv8hf (__addr, __rv.__o); }
+/* vst4q for float32: __value goes to the v4sf builtin as __builtin_neon_xi; __addr is passed without a cast.  */
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_f32 (float32_t * __addr, float32x4x4_t __value) {
+  union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv4sf (__addr, __rv.__o); }
+
+#endif
+/* Type tags produced by __ARM_mve_typeid, numbered from 1; the last entry marks an unsupported type.  */
+enum {
+    __ARM_mve_type_float16_t = 1,
+    __ARM_mve_type_float16_t_ptr,
+    __ARM_mve_type_float16_t_const_ptr,
+    __ARM_mve_type_float16x8_t,
+    __ARM_mve_type_float16x8x2_t,
+    __ARM_mve_type_float16x8x4_t,
+    __ARM_mve_type_float32_t,
+    __ARM_mve_type_float32_t_ptr,
+    __ARM_mve_type_float32_t_const_ptr,
+    __ARM_mve_type_float32x4_t,
+    __ARM_mve_type_float32x4x2_t,
+    __ARM_mve_type_float32x4x4_t,
+    __ARM_mve_type_int16_t,
+    __ARM_mve_type_int16_t_ptr,
+    __ARM_mve_type_int16_t_const_ptr,
+    __ARM_mve_type_int16x8_t,
+    __ARM_mve_type_int16x8x2_t,
+    __ARM_mve_type_int16x8x4_t,
+    __ARM_mve_type_int32_t,
+    __ARM_mve_type_int32_t_ptr,
+    __ARM_mve_type_int32_t_const_ptr,
+    __ARM_mve_type_int32x4_t,
+    __ARM_mve_type_int32x4x2_t,
+    __ARM_mve_type_int32x4x4_t,
+    __ARM_mve_type_int64_t,
+    __ARM_mve_type_int64_t_ptr,
+    __ARM_mve_type_int64_t_const_ptr,
+    __ARM_mve_type_int64x2_t,
+    __ARM_mve_type_int8_t,
+    __ARM_mve_type_int8_t_ptr,
+    __ARM_mve_type_int8_t_const_ptr,
+    __ARM_mve_type_int8x16_t,
+    __ARM_mve_type_int8x16x2_t,
+    __ARM_mve_type_int8x16x4_t,
+    __ARM_mve_type_uint16_t,
+    __ARM_mve_type_uint16_t_ptr,
+    __ARM_mve_type_uint16_t_const_ptr,
+    __ARM_mve_type_uint16x8_t,
+    __ARM_mve_type_uint16x8x2_t,
+    __ARM_mve_type_uint16x8x4_t,
+    __ARM_mve_type_uint32_t,
+    __ARM_mve_type_uint32_t_ptr,
+    __ARM_mve_type_uint32_t_const_ptr,
+    __ARM_mve_type_uint32x4_t,
+    __ARM_mve_type_uint32x4x2_t,
+    __ARM_mve_type_uint32x4x4_t,
+    __ARM_mve_type_uint64_t,
+    __ARM_mve_type_uint64_t_ptr,
+    __ARM_mve_type_uint64_t_const_ptr,
+    __ARM_mve_type_uint64x2_t,
+    __ARM_mve_type_uint8_t,
+    __ARM_mve_type_uint8_t_ptr,
+    __ARM_mve_type_uint8_t_const_ptr,
+    __ARM_mve_type_uint8x16_t,
+    __ARM_mve_type_uint8x16x2_t,
+    __ARM_mve_type_uint8x16x4_t,
+    __ARM_mve_unsupported_type
+};
+/* Map the type of X to its __ARM_mve_type_* tag; types not listed fall through to a nested _Generic over the plain C integer types.  */
+#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */ #define 
+__ARM_mve_typeid(x) _Generic(x, \
+    float16_t: __ARM_mve_type_float16_t, \
+    float16_t *: __ARM_mve_type_float16_t_ptr, \
+    float16_t const *: __ARM_mve_type_float16_t_const_ptr, \
+    float16x8_t: __ARM_mve_type_float16x8_t, \
+    float16x8x2_t: __ARM_mve_type_float16x8x2_t, \
+    float16x8x4_t: __ARM_mve_type_float16x8x4_t, \
+    float32_t: __ARM_mve_type_float32_t, \
+    float32_t *: __ARM_mve_type_float32_t_ptr, \
+    float32_t const *: __ARM_mve_type_float32_t_const_ptr, \
+    float32x4_t: __ARM_mve_type_float32x4_t, \
+    float32x4x2_t: __ARM_mve_type_float32x4x2_t, \
+    float32x4x4_t: __ARM_mve_type_float32x4x4_t, \
+    int16_t: __ARM_mve_type_int16_t, \
+    int16_t *: __ARM_mve_type_int16_t_ptr, \
+    int16_t const *: __ARM_mve_type_int16_t_const_ptr, \
+    int16x8_t: __ARM_mve_type_int16x8_t, \
+    int16x8x2_t: __ARM_mve_type_int16x8x2_t, \
+    int16x8x4_t: __ARM_mve_type_int16x8x4_t, \
+    int32_t: __ARM_mve_type_int32_t, \
+    int32_t *: __ARM_mve_type_int32_t_ptr, \
+    int32_t const *: __ARM_mve_type_int32_t_const_ptr, \
+    int32x4_t: __ARM_mve_type_int32x4_t, \
+    int32x4x2_t: __ARM_mve_type_int32x4x2_t, \
+    int32x4x4_t: __ARM_mve_type_int32x4x4_t, \
+    int64_t: __ARM_mve_type_int64_t, \
+    int64_t *: __ARM_mve_type_int64_t_ptr, \
+    int64_t const *: __ARM_mve_type_int64_t_const_ptr, \
+    int64x2_t: __ARM_mve_type_int64x2_t, \
+    int8_t: __ARM_mve_type_int8_t, \
+    int8_t *: __ARM_mve_type_int8_t_ptr, \
+    int8_t const *: __ARM_mve_type_int8_t_const_ptr, \
+    int8x16_t: __ARM_mve_type_int8x16_t, \
+    int8x16x2_t: __ARM_mve_type_int8x16x2_t, \
+    int8x16x4_t: __ARM_mve_type_int8x16x4_t, \
+    uint16_t: __ARM_mve_type_uint16_t, \
+    uint16_t *: __ARM_mve_type_uint16_t_ptr, \
+    uint16_t const *: __ARM_mve_type_uint16_t_const_ptr, \
+    uint16x8_t: __ARM_mve_type_uint16x8_t, \
+    uint16x8x2_t: __ARM_mve_type_uint16x8x2_t, \
+    uint16x8x4_t: __ARM_mve_type_uint16x8x4_t, \
+    uint32_t: __ARM_mve_type_uint32_t, \
+    uint32_t *: __ARM_mve_type_uint32_t_ptr, \
+    uint32_t const *: __ARM_mve_type_uint32_t_const_ptr, \
+    uint32x4_t: __ARM_mve_type_uint32x4_t, \
+    uint32x4x2_t: __ARM_mve_type_uint32x4x2_t, \
+    uint32x4x4_t: __ARM_mve_type_uint32x4x4_t, \
+    uint64_t: __ARM_mve_type_uint64_t, \
+    uint64_t *: __ARM_mve_type_uint64_t_ptr, \
+    uint64_t const *: __ARM_mve_type_uint64_t_const_ptr, \
+    uint64x2_t: __ARM_mve_type_uint64x2_t, \
+    uint8_t: __ARM_mve_type_uint8_t, \
+    uint8_t *: __ARM_mve_type_uint8_t_ptr, \
+    uint8_t const *: __ARM_mve_type_uint8_t_const_ptr, \
+    uint8x16_t: __ARM_mve_type_uint8x16_t, \
+    uint8x16x2_t: __ARM_mve_type_uint8x16x2_t, \
+    uint8x16x4_t: __ARM_mve_type_uint8x16x4_t, \
+    default: _Generic(x, \
+	signed char: __ARM_mve_type_int8_t, \
+	short: __ARM_mve_type_int16_t, \
+	int: __ARM_mve_type_int32_t, \
+	long: __ARM_mve_type_int32_t, \
+	long long: __ARM_mve_type_int64_t, \
+	unsigned char: __ARM_mve_type_uint8_t, \
+	unsigned short: __ARM_mve_type_uint16_t, \
+	unsigned int: __ARM_mve_type_uint32_t, \
+	unsigned long: __ARM_mve_type_uint32_t, \
+	unsigned long long: __ARM_mve_type_uint64_t, \
+	default: __ARM_mve_unsupported_type))
+#else
+#define __ARM_mve_typeid(x) _Generic(x, \
+    int16_t: __ARM_mve_type_int16_t, \
+    int16_t *: __ARM_mve_type_int16_t_ptr, \
+    int16_t const *: __ARM_mve_type_int16_t_const_ptr, \
+    int16x8_t: __ARM_mve_type_int16x8_t, \
+    int16x8x2_t: __ARM_mve_type_int16x8x2_t, \
+    int16x8x4_t: __ARM_mve_type_int16x8x4_t, \
+    int32_t: __ARM_mve_type_int32_t, \
+    int32_t *: __ARM_mve_type_int32_t_ptr, \
+    int32_t const *: __ARM_mve_type_int32_t_const_ptr, \
+    int32x4_t: __ARM_mve_type_int32x4_t, \
+    int32x4x2_t: __ARM_mve_type_int32x4x2_t, \
+    int32x4x4_t: __ARM_mve_type_int32x4x4_t, \
+    int64_t: __ARM_mve_type_int64_t, \
+    int64_t *: __ARM_mve_type_int64_t_ptr, \
+    int64_t const *: __ARM_mve_type_int64_t_const_ptr, \
+    int64x2_t: __ARM_mve_type_int64x2_t, \
+    int8_t: __ARM_mve_type_int8_t, \
+    int8_t *: __ARM_mve_type_int8_t_ptr, \
+    int8_t const *: __ARM_mve_type_int8_t_const_ptr, \
+    int8x16_t: __ARM_mve_type_int8x16_t, \
+    int8x16x2_t: __ARM_mve_type_int8x16x2_t, \
+    int8x16x4_t: __ARM_mve_type_int8x16x4_t, \
+    uint16_t: __ARM_mve_type_uint16_t, \
+    uint16_t *: __ARM_mve_type_uint16_t_ptr, \
+    uint16_t const *: __ARM_mve_type_uint16_t_const_ptr, \
+    uint16x8_t: __ARM_mve_type_uint16x8_t, \
+    uint16x8x2_t: __ARM_mve_type_uint16x8x2_t, \
+    uint16x8x4_t: __ARM_mve_type_uint16x8x4_t, \
+    uint32_t: __ARM_mve_type_uint32_t, \
+    uint32_t *: __ARM_mve_type_uint32_t_ptr, \
+    uint32_t const *: __ARM_mve_type_uint32_t_const_ptr, \
+    uint32x4_t: __ARM_mve_type_uint32x4_t, \
+    uint32x4x2_t: __ARM_mve_type_uint32x4x2_t, \
+    uint32x4x4_t: __ARM_mve_type_uint32x4x4_t, \
+    uint64_t: __ARM_mve_type_uint64_t, \
+    uint64_t *: __ARM_mve_type_uint64_t_ptr, \
+    uint64_t const *: __ARM_mve_type_uint64_t_const_ptr, \
+    uint64x2_t: __ARM_mve_type_uint64x2_t, \
+    uint8_t: __ARM_mve_type_uint8_t, \
+    uint8_t *: __ARM_mve_type_uint8_t_ptr, \
+    uint8_t const *: __ARM_mve_type_uint8_t_const_ptr, \
+    uint8x16_t: __ARM_mve_type_uint8x16_t, \
+    uint8x16x2_t: __ARM_mve_type_uint8x16x2_t, \
+    uint8x16x4_t: __ARM_mve_type_uint8x16x4_t, \
+    default: _Generic(x, \
+	signed char: __ARM_mve_type_int8_t, \
+	short: __ARM_mve_type_int16_t, \
+	int: __ARM_mve_type_int32_t, \
+	long: __ARM_mve_type_int32_t, \
+	long long: __ARM_mve_type_int64_t, \
+	unsigned char: __ARM_mve_type_uint8_t, \
+	unsigned short: __ARM_mve_type_uint16_t, \
+	unsigned int: __ARM_mve_type_uint32_t, \
+	unsigned long: __ARM_mve_type_uint32_t, \
+	unsigned long long: __ARM_mve_type_uint64_t, \
+	default: __ARM_mve_unsupported_type))
+#endif /* MVE Floating point.  */
+/* Yield PARAM when its type is TYPE; otherwise an lvalue of TYPE read through __ARM_undef, which is only declared here.  */
+extern void *__ARM_undef;
+#define __ARM_mve_coerce(param, type) \
+    _Generic(param, type: param, default: *(type *)__ARM_undef)
+/* Polymorphic vst4q: a _Generic over an array-pointer type built from the two argument tags selects the typed __arm_vst4q_* function.  */
+#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
+
+#define vst4q(p0,p1) __arm_vst4q(p0,p1) #define __arm_vst4q(p0,p1) ({ 
+__typeof(p0) __p0 = (p0); \
+  __typeof(p1) __p1 = (p1); \
+  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, 
+\
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x4_t]: 
+__arm_vst4q_s8 (__ARM_mve_coerce(__p0, int8_t *), 
+__ARM_mve_coerce(__p1, int8x16x4_t)), \
+  int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x4_t]: 
+__arm_vst4q_s16 (__ARM_mve_coerce(__p0, int16_t *), 
+__ARM_mve_coerce(__p1, int16x8x4_t)), \
+  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x4_t]: 
+__arm_vst4q_s32 (__ARM_mve_coerce(__p0, int32_t *), 
+__ARM_mve_coerce(__p1, int32x4x4_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x4_t]: 
+__arm_vst4q_u8 (__ARM_mve_coerce(__p0, uint8_t *), 
+__ARM_mve_coerce(__p1, uint8x16x4_t)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x4_t]: 
+__arm_vst4q_u16 (__ARM_mve_coerce(__p0, uint16_t *), 
+__ARM_mve_coerce(__p1, uint16x8x4_t)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x4_t]: 
+__arm_vst4q_u32 (__ARM_mve_coerce(__p0, uint32_t *), 
+__ARM_mve_coerce(__p1, uint32x4x4_t)), \
+  int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8x4_t]: 
+__arm_vst4q_f16 (__ARM_mve_coerce(__p0, float16_t *), 
+__ARM_mve_coerce(__p1, float16x8x4_t)), \
+  int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4x4_t]: 
+__arm_vst4q_f32 (__ARM_mve_coerce(__p0, float32_t *), 
+__ARM_mve_coerce(__p1, float32x4x4_t)));})
+
+#else /* MVE Integer.  */
+
+#define vst4q(p0,p1) __arm_vst4q(p0,p1) #define __arm_vst4q(p0,p1) ({ 
+__typeof(p0) __p0 = (p0); \
+  __typeof(p1) __p1 = (p1); \
+  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, 
+\
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x4_t]: 
+__arm_vst4q_s8 (__ARM_mve_coerce(__p0, int8_t *), 
+__ARM_mve_coerce(__p1, int8x16x4_t)), \
+  int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x4_t]: 
+__arm_vst4q_s16 (__ARM_mve_coerce(__p0, int16_t *), 
+__ARM_mve_coerce(__p1, int16x8x4_t)), \
+  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x4_t]: 
+__arm_vst4q_s32 (__ARM_mve_coerce(__p0, int32_t *), 
+__ARM_mve_coerce(__p1, int32x4x4_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x4_t]: 
+__arm_vst4q_u8 (__ARM_mve_coerce(__p0, uint8_t *), 
+__ARM_mve_coerce(__p1, uint8x16x4_t)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x4_t]: 
+__arm_vst4q_u16 (__ARM_mve_coerce(__p0, uint16_t *), 
+__ARM_mve_coerce(__p1, uint16x8x4_t)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x4_t]: 
+__arm_vst4q_u32 (__ARM_mve_coerce(__p0, uint32_t *), 
+__ARM_mve_coerce(__p1, uint32x4x4_t)));})
+
+#endif /* MVE Floating point.  */
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def
new file mode 100644
index 0000000000000000000000000000000000000000..4a7e4d06f0af1e639788848041133a800b183e83
--- /dev/null
+++ b/gcc/config/arm/arm_mve_builtins.def
@@ -0,0 +1,21 @@
+/*  MVE builtin definitions for Arm.
+    Copyright  (C) 2019-2020 Free Software Foundation, Inc.
+    Contributed by Arm.
+
+    This file is part of GCC.
+
+    GCC is free software; you can redistribute it and/or modify it
+    under the terms of the GNU General Public License as published
+    by the Free Software Foundation; either version 3, or (at your
+    option) any later version.
+
+    GCC is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+    License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with GCC; see the file COPYING3.  If not see
+    <http://www.gnu.org/licenses/>.  */
+
+VAR5 (STORE1, vst4q, v16qi, v8hi, v4si, v8hf, v4sf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md index 6af76580c0c5d0a6efd47eb1409061c74dca6378..5c1a11bf7dee7590d668e7ec5e3b068789b3b3db 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -131,7 +131,8 @@
 (define_mode_iterator VQXMOV [V16QI V8HI V8HF V8BF V4SI V4SF V2DI TI])
 
 ;; Opaque structure types wider than TImode.
-(define_mode_iterator VSTRUCT [EI OI CI XI])
+(define_mode_iterator VSTRUCT [(EI "!TARGET_HAVE_MVE") OI
+			       (CI "!TARGET_HAVE_MVE") XI])
 
 ;; Opaque structure types used in table lookups (except vtbl1/vtbx1).
 (define_mode_iterator VTAB [TI EI OI])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md index 92176a57bd75695a63b1bc958c9b2791f79ea6a4..9b3bb6129a99c4f3c952e7a72d0874dcba12f30d 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -17,9 +17,12 @@
 ;; along with GCC; see the file COPYING3.  If not see
 ;; <http://www.gnu.org/licenses/>.
 
-(define_mode_iterator MVE_types [V16QI V8HI V4SI V2DI TI V8HF V4SF V2DF])
 (define_mode_attr V_sz_elem2 [(V16QI "s8") (V8HI "u16") (V4SI "u32")
 			      (V2DI "u64")])
+(define_mode_iterator MVE_types [V16QI V8HI V4SI V2DI TI V8HF V4SF V2DF])
+(define_mode_iterator MVE_VLD_ST [V16QI V8HI V4SI V8HF V4SF])
+
+(define_c_enum "unspec" [VST4Q])
+
+(define_c_enum "unspec" [VST4Q])
 
 (define_insn "*mve_mov<mode>"
   [(set (match_operand:MVE_types 0 "nonimmediate_operand" "=w,w,r,w,w,r,w,Us") @@ -83,3 +86,37 @@  }
   [(set_attr "length" "4,4")
    (set_attr "type" "mve_move,mve_move")])
+
+;;
+;; [vst4q]
+;;
+;; Store of four vectors, emitted as a vst40/vst41/vst42/vst43 sequence.
+;; Operand 0 is the XImode memory destination, operand 1 the XImode register
+;; source.  The inner MVE_VLD_ST unspec wraps only (const_int 0); presumably
+;; it exists to give the pattern its <mode> -- confirm against iterators.md.
+(define_insn "mve_vst4q<mode>"
+  [(set (match_operand:XI 0 "neon_struct_operand" "=Um")
+	(unspec:XI [(match_operand:XI 1 "s_register_operand" "w")
+		    (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+	 VST4Q))
+  ]
+  "TARGET_HAVE_MVE"
+{
+   /* ops[0..3]: TImode registers of the source, taken at register numbers
+      regno, regno+4, regno+8 and regno+12.  ops[4]: the base address
+      register.  ops[5]: the memory operand exactly as given.  */
+   rtx ops[6];
+   int regno = REGNO (operands[1]);
+   ops[0] = gen_rtx_REG (TImode, regno);
+   ops[1] = gen_rtx_REG (TImode, regno+4);
+   ops[2] = gen_rtx_REG (TImode, regno+8);
+   ops[3] = gen_rtx_REG (TImode, regno+12);
+   rtx reg  = operands[0];
+   /* Follow the first sub-expression of operand 0 until a register is
+      reached; that register serves as the plain base address.  */
+   while (reg && !REG_P (reg))
+    reg = XEXP (reg, 0);
+   gcc_assert (REG_P (reg));
+   ops[4] = reg;
+   ops[5] = operands[0];
+   /* The first three instructions address memory through the bare base
+      register in ops[4]; only the fourth prints the full memory operand
+      (ops[5]), which is how a writeback form of the address is supported.  */
+   output_asm_insn ("vst40.<V_sz_elem>\t{%q0, %q1, %q2, %q3}, [%4]\n\t"
+		    "vst41.<V_sz_elem>\t{%q0, %q1, %q2, %q3}, [%4]\n\t"
+		    "vst42.<V_sz_elem>\t{%q0, %q1, %q2, %q3}, [%4]\n\t"
+		    "vst43.<V_sz_elem>\t{%q0, %q1, %q2, %q3}, %5", ops);
+   /* Everything was already emitted by output_asm_insn above.  */
+   return "";
+}
+  [(set_attr "length" "16")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md index b6a8eb62c8ff05fc32805029fea8f682402d7399..fbfeef233f38831a5cb256622625879d15209431 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -149,7 +149,7 @@
 (define_expand "mov<mode>"
   [(set (match_operand:VSTRUCT 0 "nonimmediate_operand")
 	(match_operand:VSTRUCT 1 "general_operand"))]
-  "TARGET_NEON"
+  "TARGET_NEON || TARGET_HAVE_MVE"
 {
   gcc_checking_assert (aligned_operand (operands[0], <MODE>mode));
   gcc_checking_assert (aligned_operand (operands[1], <MODE>mode)); @@ -181,7 +181,7 @@  (define_insn "*neon_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "nonimmediate_operand"	"=w,Ut,w")
 	(match_operand:VSTRUCT 1 "general_operand"	" w,w, Ut"))]
-  "TARGET_NEON
+  "(TARGET_NEON || TARGET_HAVE_MVE)
    && (register_operand (operands[0], <MODE>mode)
        || register_operand (operands[1], <MODE>mode))"
 {
@@ -217,7 +217,7 @@
 (define_split
   [(set (match_operand:OI 0 "s_register_operand" "")
 	(match_operand:OI 1 "s_register_operand" ""))]
-  "TARGET_NEON && reload_completed"
+  "(TARGET_NEON || TARGET_HAVE_MVE)&& reload_completed"
   [(set (match_dup 0) (match_dup 1))
    (set (match_dup 2) (match_dup 3))]
 {
@@ -258,7 +258,7 @@
 (define_split
   [(set (match_operand:XI 0 "s_register_operand" "")
 	(match_operand:XI 1 "s_register_operand" ""))]
-  "TARGET_NEON && reload_completed"
+  "(TARGET_NEON || TARGET_HAVE_MVE) && reload_completed"
   [(set (match_dup 0) (match_dup 1))
    (set (match_dup 2) (match_dup 3))
    (set (match_dup 4) (match_dup 5))
diff --git a/gcc/config/arm/t-arm b/gcc/config/arm/t-arm index 2d980830e7669e60d6f2cb61014389d121bfbcb1..1f7f169731301edc1f275342ad7a0eb46407f6e4 100644
--- a/gcc/config/arm/t-arm
+++ b/gcc/config/arm/t-arm
@@ -137,7 +137,8 @@ arm.o: $(srcdir)/config/arm/arm.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
   arm-cpu-data.h \
   $(srcdir)/config/arm/arm-protos.h \
   $(srcdir)/config/arm/arm_neon_builtins.def \
-  $(srcdir)/config/arm/arm_vfp_builtins.def
+  $(srcdir)/config/arm/arm_vfp_builtins.def \
+  $(srcdir)/config/arm/arm_mve_builtins.def
 
 arm-builtins.o: $(srcdir)/config/arm/arm-builtins.c $(CONFIG_H) \
   $(SYSTEM_H) coretypes.h $(TM_H) \
@@ -147,6 +148,7 @@ arm-builtins.o: $(srcdir)/config/arm/arm-builtins.c $(CONFIG_H) \
   $(srcdir)/config/arm/arm_acle_builtins.def \
   $(srcdir)/config/arm/arm_neon_builtins.def \
   $(srcdir)/config/arm/arm_vfp_builtins.def \
+  $(srcdir)/config/arm/arm_mve_builtins.def \
   $(srcdir)/config/arm/arm-simd-builtin-types.def
 	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
 		$(srcdir)/config/arm/arm-builtins.c
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f16.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..8516cfa832b03b4e8713aef8fc1cbabdf62c13f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f16.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (float16_t * addr, float16x8x4_t value) {
+  vst4q_f16 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+
+void
+foo1 (float16_t * addr, float16x8x4_t value) {
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+
+void
+foo2 (float16_t * addr, float16x8x4_t value) {
+  vst4q_f16 (addr, value);
+  addr += 32;
+  vst4q_f16 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.16\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f32.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..c8b327259ff44d3d56377e0f15a03719792e534c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f32.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (float32_t * addr, float32x4x4_t value) {
+  vst4q_f32 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+
+void
+foo1 (float32_t * addr, float32x4x4_t value) {
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+
+void
+foo2 (float32_t * addr, float32x4x4_t value) {
+  vst4q_f32 (addr, value);
+  addr += 16;
+  vst4q_f32 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.32\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s16.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s16.c
new file mode 100644
index 0000000000000000000000000000000000000000..d06947d04d828f0bdf831dbef13b3a8ebb70c75a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s16.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (int16_t * addr, int16x8x4_t value) {
+  vst4q_s16 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+
+void
+foo1 (int16_t * addr, int16x8x4_t value) {
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+
+void
+foo2 (int16_t * addr, int16x8x4_t value) {
+  vst4q_s16 (addr, value);
+  addr += 32;
+  vst4q_s16 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.16\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s32.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s32.c
new file mode 100644
index 0000000000000000000000000000000000000000..5dc6835ef847032ce3a7d2c538f53eaec38dd0ca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s32.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (int32_t * addr, int32x4x4_t value) {
+  vst4q_s32 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+
+void
+foo1 (int32_t * addr, int32x4x4_t value) {
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+
+void
+foo2 (int32_t * addr, int32x4x4_t value) {
+  vst4q_s32 (addr, value);
+  addr += 16;
+  vst4q_s32 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.32\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s8.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s8.c
new file mode 100644
index 0000000000000000000000000000000000000000..a3cb53e229236dc2f3355ef8622a6c99a9b9f3da
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s8.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (int8_t * addr, int8x16x4_t value)
+{
+  vst4q_s8 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.8"  }  } */
+/* { dg-final { scan-assembler "vst41.8"  }  } */
+/* { dg-final { scan-assembler "vst42.8"  }  } */
+/* { dg-final { scan-assembler "vst43.8"  }  } */
+
+void
+foo1 (int8_t * addr, int8x16x4_t value) {
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.8"  }  } */
+/* { dg-final { scan-assembler "vst41.8"  }  } */
+/* { dg-final { scan-assembler "vst42.8"  }  } */
+/* { dg-final { scan-assembler "vst43.8"  }  } */
+
+void
+foo2 (int8_t * addr, int8x16x4_t value) {
+  vst4q_s8 (addr, value);
+  addr += 16*4;
+  vst4q_s8 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.8\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u16.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u16.c
new file mode 100644
index 0000000000000000000000000000000000000000..87dd4bfc0624c8340f522dde3ca6818266d7cde0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u16.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (uint16_t * addr, uint16x8x4_t value) {
+  vst4q_u16 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+
+void
+foo1 (uint16_t * addr, uint16x8x4_t value) {
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+
+void
+foo2 (uint16_t * addr, uint16x8x4_t value) {
+  vst4q_u16 (addr, value);
+  addr += 32;
+  vst4q_u16 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.16\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u32.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u32.c
new file mode 100644
index 0000000000000000000000000000000000000000..943aa0266a7da619295e3e30fe99c011c1b7184e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u32.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (uint32_t * addr, uint32x4x4_t value) {
+  vst4q_u32 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+
+void
+foo1 (uint32_t * addr, uint32x4x4_t value) {
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+
+void
+foo2 (uint32_t * addr, uint32x4x4_t value) {
+  vst4q_u32 (addr, value);
+  addr += 16;
+  vst4q_u32 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.32\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u8.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u8.c
new file mode 100644
index 0000000000000000000000000000000000000000..c8dcc97087c6f3b795fa26337b97a802e5e13c44
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u8.c
@@ -0,0 +1,38 @@
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (uint8_t * addr, uint8x16x4_t value) {
+  vst4q_u8 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.8"  }  } */
+/* { dg-final { scan-assembler "vst41.8"  }  } */
+/* { dg-final { scan-assembler "vst42.8"  }  } */
+/* { dg-final { scan-assembler "vst43.8"  }  } */
+
+void
+foo1 (uint8_t * addr, uint8x16x4_t value) {
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.8"  }  } */
+/* { dg-final { scan-assembler "vst41.8"  }  } */
+/* { dg-final { scan-assembler "vst42.8"  }  } */
+/* { dg-final { scan-assembler "vst43.8"  }  } */
+
+void
+foo2 (uint8_t * addr, uint8x16x4_t value) {
+  vst4q_u8 (addr, value);
+  addr += 16*4;
+  vst4q_u8 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.8\s\{.*\}, \[.*\]!}  }  } */
diff mbox series

Patch

diff --git a/gcc/config/arm/arm-builtins.c b/gcc/config/arm/arm-builtins.c
index 28917363eeae51b7cc39576f3c3e77a0350b8877..b9ee45d5950ac9c1e12d88cd7d3ece1953dc51d0 100644
--- a/gcc/config/arm/arm-builtins.c
+++ b/gcc/config/arm/arm-builtins.c
@@ -432,6 +432,13 @@  static arm_builtin_datum neon_builtin_data[] =
 };
 
 #undef CF
+#define CF(N,X) CODE_FOR_mve_##N##X
+static arm_builtin_datum mve_builtin_data[] =
+{
+#include "arm_mve_builtins.def"
+};
+
+#undef CF
 #undef VAR1
 #define VAR1(T, N, A) \
   {#N, UP (A), CODE_FOR_arm_##N, 0, T##_QUALIFIERS},
@@ -736,6 +743,13 @@  enum arm_builtins
 
 #include "arm_acle_builtins.def"
 
+  ARM_BUILTIN_MVE_BASE,
+
+#undef VAR1
+#define VAR1(T, N, X) \
+  ARM_BUILTIN_MVE_##N##X,
+#include "arm_mve_builtins.def"
+
   ARM_BUILTIN_MAX
 };
 
@@ -745,6 +759,9 @@  enum arm_builtins
 #define ARM_BUILTIN_NEON_PATTERN_START \
   (ARM_BUILTIN_NEON_BASE + 1)
 
+#define ARM_BUILTIN_MVE_PATTERN_START \
+  (ARM_BUILTIN_MVE_BASE + 1)
+
 #define ARM_BUILTIN_ACLE_PATTERN_START \
   (ARM_BUILTIN_ACLE_BASE + 1)
 
@@ -1276,6 +1293,22 @@  arm_init_acle_builtins (void)
     }
 }
 
+/* Register each entry of mve_builtin_data as a "__builtin_mve" builtin.
+   Function codes run consecutively from ARM_BUILTIN_MVE_PATTERN_START.  */
+static void
+arm_init_mve_builtins (void)
+{
+  unsigned int i, fcode = ARM_BUILTIN_MVE_PATTERN_START;
+
+  arm_init_simd_builtin_scalar_types ();
+  arm_init_simd_builtin_types ();
+  for (i = 0; i < ARRAY_SIZE (mve_builtin_data); i++, fcode++)
+    {
+      arm_builtin_datum *d = &mve_builtin_data[i];
+      arm_init_builtin (fcode, d, "__builtin_mve");
+    }
+}
+
 /* Set up all the NEON builtins, even builtins for instructions that are not
    in the current target ISA to allow the user to compile particular modules
    with different target specific options that differ from the command line
@@ -2020,8 +2053,10 @@  arm_init_builtins (void)
       = add_builtin_function ("__builtin_arm_lane_check", lane_check_fpr,
 			      ARM_BUILTIN_SIMD_LANE_CHECK, BUILT_IN_MD,
 			      NULL, NULL_TREE);
-
-      arm_init_neon_builtins ();
+      if (TARGET_HAVE_MVE)
+	arm_init_mve_builtins ();
+      else
+	arm_init_neon_builtins ();
       arm_init_vfp_builtins ();
       arm_init_crypto_builtins ();
     }
@@ -2565,10 +2600,14 @@  arm_expand_builtin_1 (int fcode, tree exp, rtx target,
   int is_void = 0;
   int k;
   bool neon = false;
+  bool mve = false;
 
   if (IN_RANGE (fcode, ARM_BUILTIN_VFP_BASE, ARM_BUILTIN_ACLE_BASE - 1))
     neon = true;
 
+  if (IN_RANGE (fcode, ARM_BUILTIN_MVE_BASE, ARM_BUILTIN_MAX - 1))
+    mve = true;
+
   is_void = !!(d->qualifiers[0] & qualifier_void);
 
   num_args += is_void;
@@ -2610,7 +2649,7 @@  arm_expand_builtin_1 (int fcode, tree exp, rtx target,
 	}
       else if (d->qualifiers[qualifiers_k] & qualifier_pointer)
 	{
-	  if (neon)
+	  if (neon || mve)
 	    args[k] = ARG_BUILTIN_NEON_MEMORY;
 	  else
 	    args[k] = ARG_BUILTIN_MEMORY;
@@ -2660,6 +2699,26 @@  arm_expand_acle_builtin (int fcode, tree exp, rtx target)
   return arm_expand_builtin_1 (fcode, exp, target, d);
 }
 
+/* Expand the MVE builtin FCODE by looking up its descriptor in
+   mve_builtin_data and handing FCODE, EXP and TARGET on to
+   arm_expand_builtin_1.  MVE builtins need TARGET_HAVE_MVE; that is checked
+   here, and using one without it is a fatal error (fatal_error does not
+   return, so no fallback value follows the call).  */
+
+static rtx
+arm_expand_mve_builtin (int fcode, tree exp, rtx target)
+{
+  if (fcode >= ARM_BUILTIN_MVE_BASE && !TARGET_HAVE_MVE)
+    fatal_error (input_location,
+		 "You must enable MVE instructions"
+		 " to use these intrinsics");
+
+  arm_builtin_datum *d
+    = &mve_builtin_data[fcode - ARM_BUILTIN_MVE_PATTERN_START];
+
+  return arm_expand_builtin_1 (fcode, exp, target, d);
+}
+
 /* Expand a Neon builtin, i.e. those registered only if TARGET_NEON holds.
    Most of these are "special" because they don't have symbolic
    constants defined per-instruction or per instruction-variant.  Instead, the
@@ -2753,6 +2812,8 @@  arm_expand_builtin (tree exp,
       /* Don't generate any RTL.  */
       return const0_rtx;
     }
+  if (fcode >= ARM_BUILTIN_MVE_BASE)
+    return arm_expand_mve_builtin (fcode, exp, target);
 
   if (fcode >= ARM_BUILTIN_ACLE_BASE)
     return arm_expand_acle_builtin (fcode, exp, target);
diff --git a/gcc/config/arm/arm_mve.h b/gcc/config/arm/arm_mve.h
index 5ffb466596b5d8fc330616a6fcc7ee37d3e28def..39c6a1551a72700292dde8ef6cea44ba0907af8d 100644
--- a/gcc/config/arm/arm_mve.h
+++ b/gcc/config/arm/arm_mve.h
@@ -42,6 +42,13 @@  typedef __simd128_float16_t float16x8_t;
 typedef __simd128_float32_t float32x4_t;
 #endif
 
+#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
+typedef struct { float16x8_t val[2]; } float16x8x2_t;
+typedef struct { float16x8_t val[4]; } float16x8x4_t;
+typedef struct { float32x4_t val[2]; } float32x4x2_t;
+typedef struct { float32x4_t val[4]; } float32x4x4_t;
+#endif
+
 typedef uint16_t mve_pred16_t;
 typedef __simd128_uint8_t uint8x16_t;
 typedef __simd128_uint16_t uint16x8_t;
@@ -52,6 +59,330 @@  typedef __simd128_int16_t int16x8_t;
 typedef __simd128_int32_t int32x4_t;
 typedef __simd128_int64_t int64x2_t;
 
+typedef struct { int16x8_t val[2]; } int16x8x2_t;
+typedef struct { int16x8_t val[4]; } int16x8x4_t;
+typedef struct { int32x4_t val[2]; } int32x4x2_t;
+typedef struct { int32x4_t val[4]; } int32x4x4_t;
+typedef struct { int8x16_t val[2]; } int8x16x2_t;
+typedef struct { int8x16_t val[4]; } int8x16x4_t;
+typedef struct { uint16x8_t val[2]; } uint16x8x2_t;
+typedef struct { uint16x8_t val[4]; } uint16x8x4_t;
+typedef struct { uint32x4_t val[2]; } uint32x4x2_t;
+typedef struct { uint32x4_t val[4]; } uint32x4x4_t;
+typedef struct { uint8x16_t val[2]; } uint8x16x2_t;
+typedef struct { uint8x16_t val[4]; } uint8x16x4_t;
+
+#ifndef __ARM_MVE_PRESERVE_USER_NAMESPACE
+#define vst4q_s8( __addr, __value) __arm_vst4q_s8( __addr, __value)
+#define vst4q_s16( __addr, __value) __arm_vst4q_s16( __addr, __value)
+#define vst4q_s32( __addr, __value) __arm_vst4q_s32( __addr, __value)
+#define vst4q_u8( __addr, __value) __arm_vst4q_u8( __addr, __value)
+#define vst4q_u16( __addr, __value) __arm_vst4q_u16( __addr, __value)
+#define vst4q_u32( __addr, __value) __arm_vst4q_u32( __addr, __value)
+#define vst4q_f16( __addr, __value) __arm_vst4q_f16( __addr, __value)
+#define vst4q_f32( __addr, __value) __arm_vst4q_f32( __addr, __value)
+#endif
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_s8 (int8_t * __addr, int8x16x4_t __value)
+{
+  union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv16qi ((__builtin_neon_qi *) __addr, __rv.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_s16 (int16_t * __addr, int16x8x4_t __value)
+{
+  union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv8hi ((__builtin_neon_hi *) __addr, __rv.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_s32 (int32_t * __addr, int32x4x4_t __value)
+{
+  union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv4si ((__builtin_neon_si *) __addr, __rv.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_u8 (uint8_t * __addr, uint8x16x4_t __value)
+{
+  union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv16qi ((__builtin_neon_qi *) __addr, __rv.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_u16 (uint16_t * __addr, uint16x8x4_t __value)
+{
+  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv8hi ((__builtin_neon_hi *) __addr, __rv.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_u32 (uint32_t * __addr, uint32x4x4_t __value)
+{
+  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv4si ((__builtin_neon_si *) __addr, __rv.__o);
+}
+
+#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_f16 (float16_t * __addr, float16x8x4_t __value)
+{
+  union { float16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv8hf (__addr, __rv.__o);
+}
+
+__extension__ extern __inline void
+__attribute__ ((__always_inline__, __gnu_inline__, __artificial__))
+__arm_vst4q_f32 (float32_t * __addr, float32x4x4_t __value)
+{
+  union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+  __rv.__i = __value;
+  __builtin_mve_vst4qv4sf (__addr, __rv.__o);
+}
+
+#endif
+
+enum {
+    __ARM_mve_type_float16_t = 1,
+    __ARM_mve_type_float16_t_ptr,
+    __ARM_mve_type_float16_t_const_ptr,
+    __ARM_mve_type_float16x8_t,
+    __ARM_mve_type_float16x8x2_t,
+    __ARM_mve_type_float16x8x4_t,
+    __ARM_mve_type_float32_t,
+    __ARM_mve_type_float32_t_ptr,
+    __ARM_mve_type_float32_t_const_ptr,
+    __ARM_mve_type_float32x4_t,
+    __ARM_mve_type_float32x4x2_t,
+    __ARM_mve_type_float32x4x4_t,
+    __ARM_mve_type_int16_t,
+    __ARM_mve_type_int16_t_ptr,
+    __ARM_mve_type_int16_t_const_ptr,
+    __ARM_mve_type_int16x8_t,
+    __ARM_mve_type_int16x8x2_t,
+    __ARM_mve_type_int16x8x4_t,
+    __ARM_mve_type_int32_t,
+    __ARM_mve_type_int32_t_ptr,
+    __ARM_mve_type_int32_t_const_ptr,
+    __ARM_mve_type_int32x4_t,
+    __ARM_mve_type_int32x4x2_t,
+    __ARM_mve_type_int32x4x4_t,
+    __ARM_mve_type_int64_t,
+    __ARM_mve_type_int64_t_ptr,
+    __ARM_mve_type_int64_t_const_ptr,
+    __ARM_mve_type_int64x2_t,
+    __ARM_mve_type_int8_t,
+    __ARM_mve_type_int8_t_ptr,
+    __ARM_mve_type_int8_t_const_ptr,
+    __ARM_mve_type_int8x16_t,
+    __ARM_mve_type_int8x16x2_t,
+    __ARM_mve_type_int8x16x4_t,
+    __ARM_mve_type_uint16_t,
+    __ARM_mve_type_uint16_t_ptr,
+    __ARM_mve_type_uint16_t_const_ptr,
+    __ARM_mve_type_uint16x8_t,
+    __ARM_mve_type_uint16x8x2_t,
+    __ARM_mve_type_uint16x8x4_t,
+    __ARM_mve_type_uint32_t,
+    __ARM_mve_type_uint32_t_ptr,
+    __ARM_mve_type_uint32_t_const_ptr,
+    __ARM_mve_type_uint32x4_t,
+    __ARM_mve_type_uint32x4x2_t,
+    __ARM_mve_type_uint32x4x4_t,
+    __ARM_mve_type_uint64_t,
+    __ARM_mve_type_uint64_t_ptr,
+    __ARM_mve_type_uint64_t_const_ptr,
+    __ARM_mve_type_uint64x2_t,
+    __ARM_mve_type_uint8_t,
+    __ARM_mve_type_uint8_t_ptr,
+    __ARM_mve_type_uint8_t_const_ptr,
+    __ARM_mve_type_uint8x16_t,
+    __ARM_mve_type_uint8x16x2_t,
+    __ARM_mve_type_uint8x16x4_t,
+    __ARM_mve_unsupported_type
+};
+
+#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
+#define __ARM_mve_typeid(x) _Generic(x, \
+    float16_t: __ARM_mve_type_float16_t, \
+    float16_t *: __ARM_mve_type_float16_t_ptr, \
+    float16_t const *: __ARM_mve_type_float16_t_const_ptr, \
+    float16x8_t: __ARM_mve_type_float16x8_t, \
+    float16x8x2_t: __ARM_mve_type_float16x8x2_t, \
+    float16x8x4_t: __ARM_mve_type_float16x8x4_t, \
+    float32_t: __ARM_mve_type_float32_t, \
+    float32_t *: __ARM_mve_type_float32_t_ptr, \
+    float32_t const *: __ARM_mve_type_float32_t_const_ptr, \
+    float32x4_t: __ARM_mve_type_float32x4_t, \
+    float32x4x2_t: __ARM_mve_type_float32x4x2_t, \
+    float32x4x4_t: __ARM_mve_type_float32x4x4_t, \
+    int16_t: __ARM_mve_type_int16_t, \
+    int16_t *: __ARM_mve_type_int16_t_ptr, \
+    int16_t const *: __ARM_mve_type_int16_t_const_ptr, \
+    int16x8_t: __ARM_mve_type_int16x8_t, \
+    int16x8x2_t: __ARM_mve_type_int16x8x2_t, \
+    int16x8x4_t: __ARM_mve_type_int16x8x4_t, \
+    int32_t: __ARM_mve_type_int32_t, \
+    int32_t *: __ARM_mve_type_int32_t_ptr, \
+    int32_t const *: __ARM_mve_type_int32_t_const_ptr, \
+    int32x4_t: __ARM_mve_type_int32x4_t, \
+    int32x4x2_t: __ARM_mve_type_int32x4x2_t, \
+    int32x4x4_t: __ARM_mve_type_int32x4x4_t, \
+    int64_t: __ARM_mve_type_int64_t, \
+    int64_t *: __ARM_mve_type_int64_t_ptr, \
+    int64_t const *: __ARM_mve_type_int64_t_const_ptr, \
+    int64x2_t: __ARM_mve_type_int64x2_t, \
+    int8_t: __ARM_mve_type_int8_t, \
+    int8_t *: __ARM_mve_type_int8_t_ptr, \
+    int8_t const *: __ARM_mve_type_int8_t_const_ptr, \
+    int8x16_t: __ARM_mve_type_int8x16_t, \
+    int8x16x2_t: __ARM_mve_type_int8x16x2_t, \
+    int8x16x4_t: __ARM_mve_type_int8x16x4_t, \
+    uint16_t: __ARM_mve_type_uint16_t, \
+    uint16_t *: __ARM_mve_type_uint16_t_ptr, \
+    uint16_t const *: __ARM_mve_type_uint16_t_const_ptr, \
+    uint16x8_t: __ARM_mve_type_uint16x8_t, \
+    uint16x8x2_t: __ARM_mve_type_uint16x8x2_t, \
+    uint16x8x4_t: __ARM_mve_type_uint16x8x4_t, \
+    uint32_t: __ARM_mve_type_uint32_t, \
+    uint32_t *: __ARM_mve_type_uint32_t_ptr, \
+    uint32_t const *: __ARM_mve_type_uint32_t_const_ptr, \
+    uint32x4_t: __ARM_mve_type_uint32x4_t, \
+    uint32x4x2_t: __ARM_mve_type_uint32x4x2_t, \
+    uint32x4x4_t: __ARM_mve_type_uint32x4x4_t, \
+    uint64_t: __ARM_mve_type_uint64_t, \
+    uint64_t *: __ARM_mve_type_uint64_t_ptr, \
+    uint64_t const *: __ARM_mve_type_uint64_t_const_ptr, \
+    uint64x2_t: __ARM_mve_type_uint64x2_t, \
+    uint8_t: __ARM_mve_type_uint8_t, \
+    uint8_t *: __ARM_mve_type_uint8_t_ptr, \
+    uint8_t const *: __ARM_mve_type_uint8_t_const_ptr, \
+    uint8x16_t: __ARM_mve_type_uint8x16_t, \
+    uint8x16x2_t: __ARM_mve_type_uint8x16x2_t, \
+    uint8x16x4_t: __ARM_mve_type_uint8x16x4_t, \
+    default: _Generic(x, \
+	signed char: __ARM_mve_type_int8_t, \
+	short: __ARM_mve_type_int16_t, \
+	int: __ARM_mve_type_int32_t, \
+	long: __ARM_mve_type_int32_t, \
+	long long: __ARM_mve_type_int64_t, \
+	unsigned char: __ARM_mve_type_uint8_t, \
+	unsigned short: __ARM_mve_type_uint16_t, \
+	unsigned int: __ARM_mve_type_uint32_t, \
+	unsigned long: __ARM_mve_type_uint32_t, \
+	unsigned long long: __ARM_mve_type_uint64_t, \
+	default: __ARM_mve_unsupported_type))
+#else
+#define __ARM_mve_typeid(x) _Generic(x, \
+    int16_t: __ARM_mve_type_int16_t, \
+    int16_t *: __ARM_mve_type_int16_t_ptr, \
+    int16_t const *: __ARM_mve_type_int16_t_const_ptr, \
+    int16x8_t: __ARM_mve_type_int16x8_t, \
+    int16x8x2_t: __ARM_mve_type_int16x8x2_t, \
+    int16x8x4_t: __ARM_mve_type_int16x8x4_t, \
+    int32_t: __ARM_mve_type_int32_t, \
+    int32_t *: __ARM_mve_type_int32_t_ptr, \
+    int32_t const *: __ARM_mve_type_int32_t_const_ptr, \
+    int32x4_t: __ARM_mve_type_int32x4_t, \
+    int32x4x2_t: __ARM_mve_type_int32x4x2_t, \
+    int32x4x4_t: __ARM_mve_type_int32x4x4_t, \
+    int64_t: __ARM_mve_type_int64_t, \
+    int64_t *: __ARM_mve_type_int64_t_ptr, \
+    int64_t const *: __ARM_mve_type_int64_t_const_ptr, \
+    int64x2_t: __ARM_mve_type_int64x2_t, \
+    int8_t: __ARM_mve_type_int8_t, \
+    int8_t *: __ARM_mve_type_int8_t_ptr, \
+    int8_t const *: __ARM_mve_type_int8_t_const_ptr, \
+    int8x16_t: __ARM_mve_type_int8x16_t, \
+    int8x16x2_t: __ARM_mve_type_int8x16x2_t, \
+    int8x16x4_t: __ARM_mve_type_int8x16x4_t, \
+    uint16_t: __ARM_mve_type_uint16_t, \
+    uint16_t *: __ARM_mve_type_uint16_t_ptr, \
+    uint16_t const *: __ARM_mve_type_uint16_t_const_ptr, \
+    uint16x8_t: __ARM_mve_type_uint16x8_t, \
+    uint16x8x2_t: __ARM_mve_type_uint16x8x2_t, \
+    uint16x8x4_t: __ARM_mve_type_uint16x8x4_t, \
+    uint32_t: __ARM_mve_type_uint32_t, \
+    uint32_t *: __ARM_mve_type_uint32_t_ptr, \
+    uint32_t const *: __ARM_mve_type_uint32_t_const_ptr, \
+    uint32x4_t: __ARM_mve_type_uint32x4_t, \
+    uint32x4x2_t: __ARM_mve_type_uint32x4x2_t, \
+    uint32x4x4_t: __ARM_mve_type_uint32x4x4_t, \
+    uint64_t: __ARM_mve_type_uint64_t, \
+    uint64_t *: __ARM_mve_type_uint64_t_ptr, \
+    uint64_t const *: __ARM_mve_type_uint64_t_const_ptr, \
+    uint64x2_t: __ARM_mve_type_uint64x2_t, \
+    uint8_t: __ARM_mve_type_uint8_t, \
+    uint8_t *: __ARM_mve_type_uint8_t_ptr, \
+    uint8_t const *: __ARM_mve_type_uint8_t_const_ptr, \
+    uint8x16_t: __ARM_mve_type_uint8x16_t, \
+    uint8x16x2_t: __ARM_mve_type_uint8x16x2_t, \
+    uint8x16x4_t: __ARM_mve_type_uint8x16x4_t, \
+    default: _Generic(x, \
+	signed char: __ARM_mve_type_int8_t, \
+	short: __ARM_mve_type_int16_t, \
+	int: __ARM_mve_type_int32_t, \
+	long: __ARM_mve_type_int32_t, \
+	long long: __ARM_mve_type_int64_t, \
+	unsigned char: __ARM_mve_type_uint8_t, \
+	unsigned short: __ARM_mve_type_uint16_t, \
+	unsigned int: __ARM_mve_type_uint32_t, \
+	unsigned long: __ARM_mve_type_uint32_t, \
+	unsigned long long: __ARM_mve_type_uint64_t, \
+	default: __ARM_mve_unsupported_type))
+#endif /* MVE Floating point.  */
+
+extern void *__ARM_undef;
+#define __ARM_mve_coerce(param, type) \
+    _Generic(param, type: param, default: *(type *)__ARM_undef)
+
+#if (__ARM_FEATURE_MVE & 2) /* MVE Floating point.  */
+
+#define vst4q(p0,p1) __arm_vst4q(p0,p1)
+#define __arm_vst4q(p0,p1) ({ __typeof(p0) __p0 = (p0); \
+  __typeof(p1) __p1 = (p1); \
+  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x4_t]: __arm_vst4q_s8 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x4_t)), \
+  int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x4_t]: __arm_vst4q_s16 (__ARM_mve_coerce(__p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x4_t)), \
+  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x4_t]: __arm_vst4q_s32 (__ARM_mve_coerce(__p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x4_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x4_t]: __arm_vst4q_u8 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x4_t)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x4_t]: __arm_vst4q_u16 (__ARM_mve_coerce(__p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x4_t)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x4_t]: __arm_vst4q_u32 (__ARM_mve_coerce(__p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x4_t)), \
+  int (*)[__ARM_mve_type_float16_t_ptr][__ARM_mve_type_float16x8x4_t]: __arm_vst4q_f16 (__ARM_mve_coerce(__p0, float16_t *), __ARM_mve_coerce(__p1, float16x8x4_t)), \
+  int (*)[__ARM_mve_type_float32_t_ptr][__ARM_mve_type_float32x4x4_t]: __arm_vst4q_f32 (__ARM_mve_coerce(__p0, float32_t *), __ARM_mve_coerce(__p1, float32x4x4_t)));})
+
+#else /* MVE Integer.  */
+
+#define vst4q(p0,p1) __arm_vst4q(p0,p1)
+#define __arm_vst4q(p0,p1) ({ __typeof(p0) __p0 = (p0); \
+  __typeof(p1) __p1 = (p1); \
+  _Generic( (int (*)[__ARM_mve_typeid(__p0)][__ARM_mve_typeid(__p1)])0, \
+  int (*)[__ARM_mve_type_int8_t_ptr][__ARM_mve_type_int8x16x4_t]: __arm_vst4q_s8 (__ARM_mve_coerce(__p0, int8_t *), __ARM_mve_coerce(__p1, int8x16x4_t)), \
+  int (*)[__ARM_mve_type_int16_t_ptr][__ARM_mve_type_int16x8x4_t]: __arm_vst4q_s16 (__ARM_mve_coerce(__p0, int16_t *), __ARM_mve_coerce(__p1, int16x8x4_t)), \
+  int (*)[__ARM_mve_type_int32_t_ptr][__ARM_mve_type_int32x4x4_t]: __arm_vst4q_s32 (__ARM_mve_coerce(__p0, int32_t *), __ARM_mve_coerce(__p1, int32x4x4_t)), \
+  int (*)[__ARM_mve_type_uint8_t_ptr][__ARM_mve_type_uint8x16x4_t]: __arm_vst4q_u8 (__ARM_mve_coerce(__p0, uint8_t *), __ARM_mve_coerce(__p1, uint8x16x4_t)), \
+  int (*)[__ARM_mve_type_uint16_t_ptr][__ARM_mve_type_uint16x8x4_t]: __arm_vst4q_u16 (__ARM_mve_coerce(__p0, uint16_t *), __ARM_mve_coerce(__p1, uint16x8x4_t)), \
+  int (*)[__ARM_mve_type_uint32_t_ptr][__ARM_mve_type_uint32x4x4_t]: __arm_vst4q_u32 (__ARM_mve_coerce(__p0, uint32_t *), __ARM_mve_coerce(__p1, uint32x4x4_t)));})
+
+#endif /* MVE Floating point.  */
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/gcc/config/arm/arm_mve_builtins.def b/gcc/config/arm/arm_mve_builtins.def
new file mode 100644
index 0000000000000000000000000000000000000000..4a7e4d06f0af1e639788848041133a800b183e83
--- /dev/null
+++ b/gcc/config/arm/arm_mve_builtins.def
@@ -0,0 +1,21 @@ 
+/*  MVE builtin definitions for Arm.
+    Copyright  (C) 2019-2020 Free Software Foundation, Inc.
+    Contributed by Arm.
+
+    This file is part of GCC.
+
+    GCC is free software; you can redistribute it and/or modify it
+    under the terms of the GNU General Public License as published
+    by the Free Software Foundation; either version 3, or (at your
+    option) any later version.
+
+    GCC is distributed in the hope that it will be useful, but WITHOUT
+    ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+    or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+    License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with GCC; see the file COPYING3.  If not see
+    <http://www.gnu.org/licenses/>.  */
+
+VAR5 (STORE1, vst4q, v16qi, v8hi, v4si, v8hf, v4sf)
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 6af76580c0c5d0a6efd47eb1409061c74dca6378..5c1a11bf7dee7590d668e7ec5e3b068789b3b3db 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -131,7 +131,8 @@ 
 (define_mode_iterator VQXMOV [V16QI V8HI V8HF V8BF V4SI V4SF V2DI TI])
 
 ;; Opaque structure types wider than TImode.
-(define_mode_iterator VSTRUCT [EI OI CI XI])
+(define_mode_iterator VSTRUCT [(EI "!TARGET_HAVE_MVE") OI
+			       (CI "!TARGET_HAVE_MVE") XI])
 
 ;; Opaque structure types used in table lookups (except vtbl1/vtbx1).
 (define_mode_iterator VTAB [TI EI OI])
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 92176a57bd75695a63b1bc958c9b2791f79ea6a4..9b3bb6129a99c4f3c952e7a72d0874dcba12f30d 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -17,9 +17,12 @@ 
 ;; along with GCC; see the file COPYING3.  If not see
 ;; <http://www.gnu.org/licenses/>.
 
-(define_mode_iterator MVE_types [V16QI V8HI V4SI V2DI TI V8HF V4SF V2DF])
 (define_mode_attr V_sz_elem2 [(V16QI "s8") (V8HI "u16") (V4SI "u32")
 			      (V2DI "u64")])
+(define_mode_iterator MVE_types [V16QI V8HI V4SI V2DI TI V8HF V4SF V2DF])
+(define_mode_iterator MVE_VLD_ST [V16QI V8HI V4SI V8HF V4SF])
+
+(define_c_enum "unspec" [VST4Q])
 
 (define_insn "*mve_mov<mode>"
   [(set (match_operand:MVE_types 0 "nonimmediate_operand" "=w,w,r,w,w,r,w,Us")
@@ -83,3 +86,37 @@ 
 }
   [(set_attr "length" "4,4")
    (set_attr "type" "mve_move,mve_move")])
+
+;;
+;; [vst4q]
+;;
+(define_insn "mve_vst4q<mode>"
+  [(set (match_operand:XI 0 "neon_struct_operand" "=Um")
+	(unspec:XI [(match_operand:XI 1 "s_register_operand" "w")
+		    (unspec:MVE_VLD_ST [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+	 VST4Q))
+  ]
+  "TARGET_HAVE_MVE"
+{
+   rtx ops[6];
+   int regno = REGNO (operands[1]);
+   ops[0] = gen_rtx_REG (TImode, regno);
+   ops[1] = gen_rtx_REG (TImode, regno+4);
+   ops[2] = gen_rtx_REG (TImode, regno+8);
+   ops[3] = gen_rtx_REG (TImode, regno+12);
+   rtx reg  = operands[0];
+   while (reg && !REG_P (reg))  /* Peel operands[0] down to its base reg.  */
+    reg = XEXP (reg, 0);
+   gcc_assert (REG_P (reg));
+   ops[4] = reg;
+   ops[5] = operands[0];
+   /* The first three stores address memory through the bare base register
+      in ops[4]; only the last one uses the full operands[0] address (ops[5]),
+      so any writeback in that address is emitted exactly once.  */
+   output_asm_insn ("vst40.<V_sz_elem>\t{%q0, %q1, %q2, %q3}, [%4]\n\t"
+		    "vst41.<V_sz_elem>\t{%q0, %q1, %q2, %q3}, [%4]\n\t"
+		    "vst42.<V_sz_elem>\t{%q0, %q1, %q2, %q3}, [%4]\n\t"
+		    "vst43.<V_sz_elem>\t{%q0, %q1, %q2, %q3}, %5", ops);
+   return "";
+}
+  [(set_attr "length" "16")])
diff --git a/gcc/config/arm/neon.md b/gcc/config/arm/neon.md
index b6a8eb62c8ff05fc32805029fea8f682402d7399..fbfeef233f38831a5cb256622625879d15209431 100644
--- a/gcc/config/arm/neon.md
+++ b/gcc/config/arm/neon.md
@@ -149,7 +149,7 @@ 
 (define_expand "mov<mode>"
   [(set (match_operand:VSTRUCT 0 "nonimmediate_operand")
 	(match_operand:VSTRUCT 1 "general_operand"))]
-  "TARGET_NEON"
+  "TARGET_NEON || TARGET_HAVE_MVE"
 {
   gcc_checking_assert (aligned_operand (operands[0], <MODE>mode));
   gcc_checking_assert (aligned_operand (operands[1], <MODE>mode));
@@ -181,7 +181,7 @@ 
 (define_insn "*neon_mov<mode>"
   [(set (match_operand:VSTRUCT 0 "nonimmediate_operand"	"=w,Ut,w")
 	(match_operand:VSTRUCT 1 "general_operand"	" w,w, Ut"))]
-  "TARGET_NEON
+  "(TARGET_NEON || TARGET_HAVE_MVE)
    && (register_operand (operands[0], <MODE>mode)
        || register_operand (operands[1], <MODE>mode))"
 {
@@ -217,7 +217,7 @@ 
 (define_split
   [(set (match_operand:OI 0 "s_register_operand" "")
 	(match_operand:OI 1 "s_register_operand" ""))]
-  "TARGET_NEON && reload_completed"
+  "(TARGET_NEON || TARGET_HAVE_MVE)&& reload_completed"
   [(set (match_dup 0) (match_dup 1))
    (set (match_dup 2) (match_dup 3))]
 {
@@ -258,7 +258,7 @@ 
 (define_split
   [(set (match_operand:XI 0 "s_register_operand" "")
 	(match_operand:XI 1 "s_register_operand" ""))]
-  "TARGET_NEON && reload_completed"
+  "(TARGET_NEON || TARGET_HAVE_MVE) && reload_completed"
   [(set (match_dup 0) (match_dup 1))
    (set (match_dup 2) (match_dup 3))
    (set (match_dup 4) (match_dup 5))
diff --git a/gcc/config/arm/t-arm b/gcc/config/arm/t-arm
index 2d980830e7669e60d6f2cb61014389d121bfbcb1..1f7f169731301edc1f275342ad7a0eb46407f6e4 100644
--- a/gcc/config/arm/t-arm
+++ b/gcc/config/arm/t-arm
@@ -137,7 +137,8 @@  arm.o: $(srcdir)/config/arm/arm.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) \
   arm-cpu-data.h \
   $(srcdir)/config/arm/arm-protos.h \
   $(srcdir)/config/arm/arm_neon_builtins.def \
-  $(srcdir)/config/arm/arm_vfp_builtins.def
+  $(srcdir)/config/arm/arm_vfp_builtins.def \
+  $(srcdir)/config/arm/arm_mve_builtins.def
 
 arm-builtins.o: $(srcdir)/config/arm/arm-builtins.c $(CONFIG_H) \
   $(SYSTEM_H) coretypes.h $(TM_H) \
@@ -147,6 +148,7 @@  arm-builtins.o: $(srcdir)/config/arm/arm-builtins.c $(CONFIG_H) \
   $(srcdir)/config/arm/arm_acle_builtins.def \
   $(srcdir)/config/arm/arm_neon_builtins.def \
   $(srcdir)/config/arm/arm_vfp_builtins.def \
+  $(srcdir)/config/arm/arm_mve_builtins.def \
   $(srcdir)/config/arm/arm-simd-builtin-types.def
 	$(COMPILER) -c $(ALL_COMPILERFLAGS) $(ALL_CPPFLAGS) $(INCLUDES) \
 		$(srcdir)/config/arm/arm-builtins.c
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f16.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f16.c
new file mode 100644
index 0000000000000000000000000000000000000000..8516cfa832b03b4e8713aef8fc1cbabdf62c13f5
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f16.c
@@ -0,0 +1,38 @@ 
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (float16_t * addr, float16x8x4_t value)
+{
+  vst4q_f16 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+
+void
+foo1 (float16_t * addr, float16x8x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+
+void
+foo2 (float16_t * addr, float16x8x4_t value)
+{
+  vst4q_f16 (addr, value);
+  addr += 32;
+  vst4q_f16 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.16\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f32.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f32.c
new file mode 100644
index 0000000000000000000000000000000000000000..c8b327259ff44d3d56377e0f15a03719792e534c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_f32.c
@@ -0,0 +1,38 @@ 
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_fp_ok } */
+/* { dg-add-options arm_v8_1m_mve_fp } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (float32_t * addr, float32x4x4_t value)
+{
+  vst4q_f32 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+
+void
+foo1 (float32_t * addr, float32x4x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+
+void
+foo2 (float32_t * addr, float32x4x4_t value)
+{
+  vst4q_f32 (addr, value);
+  addr += 16;
+  vst4q_f32 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.32\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s16.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s16.c
new file mode 100644
index 0000000000000000000000000000000000000000..d06947d04d828f0bdf831dbef13b3a8ebb70c75a
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s16.c
@@ -0,0 +1,38 @@ 
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (int16_t * addr, int16x8x4_t value)
+{
+  vst4q_s16 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+
+void
+foo1 (int16_t * addr, int16x8x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+
+void
+foo2 (int16_t * addr, int16x8x4_t value)
+{
+  vst4q_s16 (addr, value);
+  addr += 32;
+  vst4q_s16 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.16\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s32.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s32.c
new file mode 100644
index 0000000000000000000000000000000000000000..5dc6835ef847032ce3a7d2c538f53eaec38dd0ca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s32.c
@@ -0,0 +1,38 @@ 
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (int32_t * addr, int32x4x4_t value)
+{
+  vst4q_s32 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+
+void
+foo1 (int32_t * addr, int32x4x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+
+void
+foo2 (int32_t * addr, int32x4x4_t value)
+{
+  vst4q_s32 (addr, value);
+  addr += 16;
+  vst4q_s32 (addr, value); 
+}
+
+/* { dg-final { scan-assembler {vst43.32\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s8.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s8.c
new file mode 100644
index 0000000000000000000000000000000000000000..a3cb53e229236dc2f3355ef8622a6c99a9b9f3da
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_s8.c
@@ -0,0 +1,38 @@ 
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (int8_t * addr, int8x16x4_t value)
+{
+  vst4q_s8 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.8"  }  } */
+/* { dg-final { scan-assembler "vst41.8"  }  } */
+/* { dg-final { scan-assembler "vst42.8"  }  } */
+/* { dg-final { scan-assembler "vst43.8"  }  } */
+
+void
+foo1 (int8_t * addr, int8x16x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.8"  }  } */
+/* { dg-final { scan-assembler "vst41.8"  }  } */
+/* { dg-final { scan-assembler "vst42.8"  }  } */
+/* { dg-final { scan-assembler "vst43.8"  }  } */
+
+void
+foo2 (int8_t * addr, int8x16x4_t value)
+{
+  vst4q_s8 (addr, value);
+  addr += 16*4;
+  vst4q_s8 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.8\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u16.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u16.c
new file mode 100644
index 0000000000000000000000000000000000000000..87dd4bfc0624c8340f522dde3ca6818266d7cde0
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u16.c
@@ -0,0 +1,38 @@ 
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (uint16_t * addr, uint16x8x4_t value)
+{
+  vst4q_u16 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+
+void
+foo1 (uint16_t * addr, uint16x8x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.16"  }  } */
+/* { dg-final { scan-assembler "vst41.16"  }  } */
+/* { dg-final { scan-assembler "vst42.16"  }  } */
+/* { dg-final { scan-assembler "vst43.16"  }  } */
+
+void
+foo2 (uint16_t * addr, uint16x8x4_t value)
+{
+  vst4q_u16 (addr, value);
+  addr += 32;
+  vst4q_u16 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.16\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u32.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u32.c
new file mode 100644
index 0000000000000000000000000000000000000000..943aa0266a7da619295e3e30fe99c011c1b7184e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u32.c
@@ -0,0 +1,38 @@ 
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (uint32_t * addr, uint32x4x4_t value)
+{
+  vst4q_u32 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+
+void
+foo1 (uint32_t * addr, uint32x4x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.32"  }  } */
+/* { dg-final { scan-assembler "vst41.32"  }  } */
+/* { dg-final { scan-assembler "vst42.32"  }  } */
+/* { dg-final { scan-assembler "vst43.32"  }  } */
+
+void
+foo2 (uint32_t * addr, uint32x4x4_t value)
+{
+  vst4q_u32 (addr, value);
+  addr += 16;
+  vst4q_u32 (addr, value); 
+}
+
+/* { dg-final { scan-assembler {vst43.32\s\{.*\}, \[.*\]!}  }  } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u8.c b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u8.c
new file mode 100644
index 0000000000000000000000000000000000000000..c8dcc97087c6f3b795fa26337b97a802e5e13c44
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/intrinsics/vst4q_u8.c
@@ -0,0 +1,38 @@ 
+/* { dg-do compile  } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-O2" } */
+
+#include "arm_mve.h"
+
+void
+foo (uint8_t * addr, uint8x16x4_t value)
+{
+  vst4q_u8 (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.8"  }  } */
+/* { dg-final { scan-assembler "vst41.8"  }  } */
+/* { dg-final { scan-assembler "vst42.8"  }  } */
+/* { dg-final { scan-assembler "vst43.8"  }  } */
+
+void
+foo1 (uint8_t * addr, uint8x16x4_t value)
+{
+  vst4q (addr, value);
+}
+
+/* { dg-final { scan-assembler "vst40.8"  }  } */
+/* { dg-final { scan-assembler "vst41.8"  }  } */
+/* { dg-final { scan-assembler "vst42.8"  }  } */
+/* { dg-final { scan-assembler "vst43.8"  }  } */
+
+void
+foo2 (uint8_t * addr, uint8x16x4_t value)
+{
+  vst4q_u8 (addr, value);
+  addr += 16*4;
+  vst4q_u8 (addr, value);
+}
+
+/* { dg-final { scan-assembler {vst43.8\s\{.*\}, \[.*\]!}  }  } */