[committed,AArch64] Improve SVE constant moves
diff mbox series

Message ID mptimr1s5yj.fsf@arm.com
State New
Headers show
Series
  • [committed,AArch64] Improve SVE constant moves
Related show

Commit Message

Richard Sandiford Aug. 13, 2019, 10:41 a.m. UTC
If there's no SVE instruction to load a given constant directly, this
patch instead tries to use an Advanced SIMD constant move and then
duplicates the constant to fill an SVE vector.  The main use of this
is to support constants in which each byte is in { 0, 0xff }.

Also, the patch prefers a simple integer move followed by a duplicate
over a load from memory, like we already do for Advanced SIMD.  This is
a useful option to have and would be easy to turn off via a tuning
parameter if necessary.

The patch also extends the handling of wide LD1Rs to big endian,
whereas previously we punted to a full LD1RQ.

Tested on aarch64-linux-gnu (with and without SVE) and aarch64_be-elf.
Applied as r274375 (treating the machmode.h bit as obvious).

Richard


2019-08-13  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* machmode.h (opt_mode::else_mode): New function.
	(opt_mode::else_blk): Use it.
	* config/aarch64/aarch64-protos.h (aarch64_vq_mode): Declare.
	(aarch64_full_sve_mode, aarch64_sve_ld1rq_operand_p): Likewise.
	(aarch64_gen_stepped_int_parallel): Likewise.
	(aarch64_stepped_int_parallel_p): Likewise.
	(aarch64_expand_mov_immediate): Remove the optional gen_vec_duplicate
	argument.
	* config/aarch64/aarch64.c
	(aarch64_expand_sve_widened_duplicate): Delete.
	(aarch64_expand_sve_dupq, aarch64_expand_sve_ld1rq): New functions.
	(aarch64_expand_sve_const_vector): Rewrite to handle more cases.
	(aarch64_expand_mov_immediate): Remove the optional gen_vec_duplicate
	argument.  Use early returns in the !CONST_INT_P handling.
	Pass all SVE data vectors to aarch64_expand_sve_const_vector rather
	than handling some inline.
	(aarch64_full_sve_mode, aarch64_vq_mode): New functions, split out
	from...
	(aarch64_simd_container_mode): ...here.
	(aarch64_gen_stepped_int_parallel, aarch64_stepped_int_parallel_p)
	(aarch64_sve_ld1rq_operand_p): New functions.
	* config/aarch64/predicates.md (descending_int_parallel)
	(aarch64_sve_ld1rq_operand): New predicates.
	* config/aarch64/constraints.md (UtQ): New constraint.
	* config/aarch64/aarch64.md (UNSPEC_REINTERPRET): New unspec.
	* config/aarch64/aarch64-sve.md (mov<SVE_ALL:mode>): Remove the
	gen_vec_duplicate from call to aarch64_expand_mov_immediate.
	(@aarch64_sve_reinterpret<mode>): New expander.
	(*aarch64_sve_reinterpret<mode>): New pattern.
	(@aarch64_vec_duplicate_vq<mode>_le): New pattern.
	(@aarch64_vec_duplicate_vq<mode>_be): Likewise.
	(*sve_ld1rq<Vesize>): Replace with...
	(@aarch64_sve_ld1rq<mode>): ...this new pattern.

gcc/testsuite/
	* gcc.target/aarch64/sve/init_2.c: Expect ld1rd to be used
	instead of a full vector load.
	* gcc.target/aarch64/sve/init_4.c: Likewise.
	* gcc.target/aarch64/sve/ld1r_2.c: Remove constants that no longer
	need to be loaded from memory.
	* gcc.target/aarch64/sve/slp_2.c: Expect the same output for
	big and little endian.
	* gcc.target/aarch64/sve/slp_3.c: Likewise.  Expect 3 of the
	doubles to be moved via integer registers rather than loaded
	from memory.
	* gcc.target/aarch64/sve/slp_4.c: Likewise but for 4 doubles.
	* gcc.target/aarch64/sve/spill_4.c: Expect 16-bit constants to be
	loaded via an integer register rather than from memory.
	* gcc.target/aarch64/sve/const_1.c: New test.
	* gcc.target/aarch64/sve/const_2.c: Likewise.
	* gcc.target/aarch64/sve/const_3.c: Likewise.

Patch
diff mbox series

Index: gcc/machmode.h
===================================================================
--- gcc/machmode.h	2019-03-08 18:14:25.849009683 +0000
+++ gcc/machmode.h	2019-08-13 11:28:59.890111027 +0100
@@ -251,7 +251,8 @@  #define CLASS_HAS_WIDER_MODES_P(CLASS)
   ALWAYS_INLINE opt_mode (from_int m) : m_mode (machine_mode (m)) {}
 
   machine_mode else_void () const;
-  machine_mode else_blk () const;
+  machine_mode else_blk () const { return else_mode (BLKmode); }
+  machine_mode else_mode (machine_mode) const;
   T require () const;
 
   bool exists () const;
@@ -271,13 +272,13 @@  opt_mode<T>::else_void () const
   return m_mode;
 }
 
-/* If the T exists, return its enum value, otherwise return E_BLKmode.  */
+/* If the T exists, return its enum value, otherwise return FALLBACK.  */
 
 template<typename T>
 inline machine_mode
-opt_mode<T>::else_blk () const
+opt_mode<T>::else_mode (machine_mode fallback) const
 {
-  return m_mode == E_VOIDmode ? E_BLKmode : m_mode;
+  return m_mode == E_VOIDmode ? fallback : m_mode;
 }
 
 /* Assert that the object contains a T and return it.  */
Index: gcc/config/aarch64/aarch64-protos.h
===================================================================
--- gcc/config/aarch64/aarch64-protos.h	2019-08-13 11:21:04.501550177 +0100
+++ gcc/config/aarch64/aarch64-protos.h	2019-08-13 11:28:59.882111087 +0100
@@ -416,6 +416,8 @@  unsigned HOST_WIDE_INT aarch64_and_split
 bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode);
 int aarch64_branch_cost (bool, bool);
 enum aarch64_symbol_type aarch64_classify_symbolic_expression (rtx);
+opt_machine_mode aarch64_vq_mode (scalar_mode);
+opt_machine_mode aarch64_full_sve_mode (scalar_mode);
 bool aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode);
 bool aarch64_const_vec_all_same_int_p (rtx, HOST_WIDE_INT);
 bool aarch64_const_vec_all_same_in_range_p (rtx, HOST_WIDE_INT,
@@ -504,9 +506,12 @@  rtx aarch64_return_addr (int, rtx);
 rtx aarch64_simd_gen_const_vector_dup (machine_mode, HOST_WIDE_INT);
 bool aarch64_simd_mem_operand_p (rtx);
 bool aarch64_sve_ld1r_operand_p (rtx);
+bool aarch64_sve_ld1rq_operand_p (rtx);
 bool aarch64_sve_ldr_operand_p (rtx);
 bool aarch64_sve_struct_memory_operand_p (rtx);
 rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool);
+rtx aarch64_gen_stepped_int_parallel (unsigned int, int, int);
+bool aarch64_stepped_int_parallel_p (rtx, int);
 rtx aarch64_tls_get_addr (void);
 tree aarch64_fold_builtin (tree, int, tree *, bool);
 unsigned aarch64_dbx_register_number (unsigned);
@@ -518,7 +523,7 @@  const char * aarch64_output_probe_stack_
 const char * aarch64_output_probe_sve_stack_clash (rtx, rtx, rtx, rtx);
 void aarch64_err_no_fpadvsimd (machine_mode);
 void aarch64_expand_epilogue (bool);
-void aarch64_expand_mov_immediate (rtx, rtx, rtx (*) (rtx, rtx) = 0);
+void aarch64_expand_mov_immediate (rtx, rtx);
 rtx aarch64_ptrue_reg (machine_mode);
 rtx aarch64_pfalse_reg (machine_mode);
 void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
Index: gcc/config/aarch64/aarch64.c
===================================================================
--- gcc/config/aarch64/aarch64.c	2019-08-13 11:21:04.505550150 +0100
+++ gcc/config/aarch64/aarch64.c	2019-08-13 11:28:59.886111056 +0100
@@ -3242,32 +3242,55 @@  aarch64_expand_vec_series (rtx dest, rtx
   emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
 }
 
-/* Try to duplicate SRC into SVE register DEST, given that SRC is an
-   integer of mode INT_MODE.  Return true on success.  */
+/* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
+   register of mode MODE.  Use TARGET for the result if it's nonnull
+   and convenient.
+
+   The two vector modes must have the same element mode.  The behavior
+   is to duplicate architectural lane N of SRC into architectural lanes
+   N + I * STEP of the result.  On big-endian targets, architectural
+   lane 0 of an Advanced SIMD vector is the last element of the vector
+   in memory layout, so for big-endian targets this operation has the
+   effect of reversing SRC before duplicating it.  Callers need to
+   account for this.  */
 
-static bool
-aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
-				      rtx src)
+rtx
+aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
 {
-  /* If the constant is smaller than 128 bits, we can do the move
-     using a vector of SRC_MODEs.  */
-  if (src_mode != TImode)
-    {
-      poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
-				     GET_MODE_SIZE (src_mode));
-      machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
-      emit_move_insn (gen_lowpart (dup_mode, dest),
-		      gen_const_vec_duplicate (dup_mode, src));
-      return true;
+  machine_mode src_mode = GET_MODE (src);
+  gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
+  insn_code icode = (BYTES_BIG_ENDIAN
+		     ? code_for_aarch64_vec_duplicate_vq_be (mode)
+		     : code_for_aarch64_vec_duplicate_vq_le (mode));
+
+  unsigned int i = 0;
+  expand_operand ops[3];
+  create_output_operand (&ops[i++], target, mode);
+  create_output_operand (&ops[i++], src, src_mode);
+  if (BYTES_BIG_ENDIAN)
+    {
+      /* Create a PARALLEL describing the reversal of SRC.  */
+      unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
+      rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
+						  nelts_per_vq - 1, -1);
+      create_fixed_operand (&ops[i++], sel);
     }
+  expand_insn (icode, i, ops);
+  return ops[0].value;
+}
+
+/* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
+   the memory image into DEST.  Return true on success.  */
 
-  /* Use LD1RQ[BHWD] to load the 128 bits from memory.  */
-  src = force_const_mem (src_mode, src);
+static bool
+aarch64_expand_sve_ld1rq (rtx dest, rtx src)
+{
+  src = force_const_mem (GET_MODE (src), src);
   if (!src)
     return false;
 
   /* Make sure that the address is legitimate.  */
-  if (!aarch64_sve_ld1r_operand_p (src))
+  if (!aarch64_sve_ld1rq_operand_p (src))
     {
       rtx addr = force_reg (Pmode, XEXP (src, 0));
       src = replace_equiv_address (src, addr);
@@ -3277,46 +3300,127 @@  aarch64_expand_sve_widened_duplicate (rt
   unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
   machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
   rtx ptrue = aarch64_ptrue_reg (pred_mode);
-  src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
-  emit_insn (gen_rtx_SET (dest, src));
+  emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
   return true;
 }
 
-/* Expand a move of general CONST_VECTOR SRC into DEST, given that it
-   isn't a simple duplicate or series.  */
+/* Return a register containing CONST_VECTOR SRC, given that SRC has an
+   SVE data mode and isn't a legitimate constant.  Use TARGET for the
+   result if convenient.
 
-static void
-aarch64_expand_sve_const_vector (rtx dest, rtx src)
+   The returned register can have whatever mode seems most natural
+   given the contents of SRC.  */
+
+static rtx
+aarch64_expand_sve_const_vector (rtx target, rtx src)
 {
   machine_mode mode = GET_MODE (src);
   unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
-  gcc_assert (npatterns > 1);
+  scalar_mode elt_mode = GET_MODE_INNER (mode);
+  unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
+  unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
 
-  if (nelts_per_pattern == 1)
+  if (nelts_per_pattern == 1 && encoded_bits == 128)
     {
-      /* The constant is a repeating seqeuence of at least two elements,
-	 where the repeating elements occupy no more than 128 bits.
-	 Get an integer representation of the replicated value.  */
-      scalar_int_mode int_mode;
-      if (BYTES_BIG_ENDIAN)
-	/* For now, always use LD1RQ to load the value on big-endian
-	   targets, since the handling of smaller integers includes a
-	   subreg that is semantically an element reverse.  */
-	int_mode = TImode;
-      else
+      /* The constant is a duplicated quadword but can't be narrowed
+	 beyond a quadword.  Get the memory image of the first quadword
+	 as a 128-bit vector and try using LD1RQ to load it from memory.
+
+	 The effect for both endiannesses is to load memory lane N into
+	 architectural lanes N + I * STEP of the result.  On big-endian
+	 targets, the layout of the 128-bit vector in an Advanced SIMD
+	 register would be different from its layout in an SVE register,
+	 but this 128-bit vector is a memory value only.  */
+      machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
+      rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
+      if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
+	return target;
+    }
+
+  if (nelts_per_pattern == 1 && encoded_bits < 128)
+    {
+      /* The vector is a repeating sequence of 64 bits or fewer.
+	 See if we can load them using an Advanced SIMD move and then
+	 duplicate it to fill a vector.  This is better than using a GPR
+	 move because it keeps everything in the same register file.  */
+      machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
+      rtx_vector_builder builder (vq_mode, npatterns, 1);
+      for (unsigned int i = 0; i < npatterns; ++i)
 	{
-	  unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
-	  gcc_assert (int_bits <= 128);
-	  int_mode = int_mode_for_size (int_bits, 0).require ();
-	}
-      rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
-      if (int_value
-	  && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
-	return;
+	  /* We want memory lane N to go into architectural lane N,
+	     so reverse for big-endian targets.  The DUP .Q pattern
+	     has a compensating reverse built-in.  */
+	  unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
+	  builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
+	}
+      rtx vq_src = builder.build ();
+      if (aarch64_simd_valid_immediate (vq_src, NULL))
+	{
+	  vq_src = force_reg (vq_mode, vq_src);
+	  return aarch64_expand_sve_dupq (target, mode, vq_src);
+	}
+
+      /* Get an integer representation of the repeating part of Advanced
+	 SIMD vector VQ_SRC.  This preserves the endianness of VQ_SRC,
+	 which for big-endian targets is lane-swapped wrt a normal
+	 Advanced SIMD vector.  This means that for both endiannesses,
+	 memory lane N of SVE vector SRC corresponds to architectural
+	 lane N of a register holding VQ_SRC.  This in turn means that
+	 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
+	 as a single 128-bit value) and thus that memory lane 0 of SRC is
+	 in the lsb of the integer.  Duplicating the integer therefore
+	 ensures that memory lane N of SRC goes into architectural lanes
+	 N + I * STEP of the SVE register.  */
+      scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
+      rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
+      if (elt_value)
+	{
+	  /* Pretend that we had a vector of INT_MODE to start with.  */
+	  elt_mode = int_mode;
+	  mode = aarch64_full_sve_mode (int_mode).require ();
+
+	  /* If the integer can be moved into a general register by a
+	     single instruction, do that and duplicate the result.  */
+	  if (CONST_INT_P (elt_value)
+	      && aarch64_move_imm (INTVAL (elt_value), elt_mode))
+	    {
+	      elt_value = force_reg (elt_mode, elt_value);
+	      return expand_vector_broadcast (mode, elt_value);
+	    }
+	}
+      else if (npatterns == 1)
+	/* We're duplicating a single value, but can't do better than
+	   force it to memory and load from there.  This handles things
+	   like symbolic constants.  */
+	elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
+
+      if (elt_value)
+	{
+	  /* Load the element from memory if we can, otherwise move it into
+	     a register and use a DUP.  */
+	  rtx op = force_const_mem (elt_mode, elt_value);
+	  if (!op)
+	    op = force_reg (elt_mode, elt_value);
+	  return expand_vector_broadcast (mode, op);
+	}
+    }
+
+  /* Try using INDEX.  */
+  rtx base, step;
+  if (const_vec_series_p (src, &base, &step))
+    {
+      aarch64_expand_vec_series (target, base, step);
+      return target;
     }
 
+  /* From here on, it's better to force the whole constant to memory
+     if we can.  */
+  if (GET_MODE_NUNITS (mode).is_constant ())
+    return NULL_RTX;
+
   /* Expand each pattern individually.  */
+  gcc_assert (npatterns > 1);
   rtx_vector_builder builder;
   auto_vec<rtx, 16> vectors (npatterns);
   for (unsigned int i = 0; i < npatterns; ++i)
@@ -3333,22 +3437,20 @@  aarch64_expand_sve_const_vector (rtx des
       npatterns /= 2;
       for (unsigned int i = 0; i < npatterns; ++i)
 	{
-	  rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
+	  rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
 	  rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
 	  emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
 	  vectors[i] = tmp;
 	}
     }
-  gcc_assert (vectors[0] == dest);
+  gcc_assert (vectors[0] == target);
+  return target;
 }
 
-/* Set DEST to immediate IMM.  For SVE vector modes, GEN_VEC_DUPLICATE
-   is a pattern that can be used to set DEST to a replicated scalar
-   element.  */
+/* Set DEST to immediate IMM.  */
 
 void
-aarch64_expand_mov_immediate (rtx dest, rtx imm,
-			      rtx (*gen_vec_duplicate) (rtx, rtx))
+aarch64_expand_mov_immediate (rtx dest, rtx imm)
 {
   machine_mode mode = GET_MODE (dest);
 
@@ -3471,38 +3573,24 @@  aarch64_expand_mov_immediate (rtx dest,
 
   if (!CONST_INT_P (imm))
     {
-      rtx base, step, value;
       if (GET_CODE (imm) == HIGH
 	  || aarch64_simd_valid_immediate (imm, NULL))
-	emit_insn (gen_rtx_SET (dest, imm));
-      else if (const_vec_series_p (imm, &base, &step))
-	aarch64_expand_vec_series (dest, base, step);
-      else if (const_vec_duplicate_p (imm, &value))
-	{
-	  /* If the constant is out of range of an SVE vector move,
-	     load it from memory if we can, otherwise move it into
-	     a register and use a DUP.  */
-	  scalar_mode inner_mode = GET_MODE_INNER (mode);
-	  rtx op = force_const_mem (inner_mode, value);
-	  if (!op)
-	    op = force_reg (inner_mode, value);
-	  else if (!aarch64_sve_ld1r_operand_p (op))
-	    {
-	      rtx addr = force_reg (Pmode, XEXP (op, 0));
-	      op = replace_equiv_address (op, addr);
-	    }
-	  emit_insn (gen_vec_duplicate (dest, op));
-	}
-      else if (GET_CODE (imm) == CONST_VECTOR
-	       && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
-	aarch64_expand_sve_const_vector (dest, imm);
-      else
 	{
-	  rtx mem = force_const_mem (mode, imm);
-	  gcc_assert (mem);
-	  emit_move_insn (dest, mem);
+	  emit_insn (gen_rtx_SET (dest, imm));
+	  return;
 	}
 
+      if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
+	if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
+	  {
+	    if (dest != res)
+	      emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
+	    return;
+	  }
+
+      rtx mem = force_const_mem (mode, imm);
+      gcc_assert (mem);
+      emit_move_insn (dest, mem);
       return;
     }
 
@@ -14172,55 +14260,71 @@  aarch64_vector_mode_supported_p (machine
   return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
 }
 
+/* Return the full-width SVE vector mode for element mode MODE, if one
+   exists.  */
+opt_machine_mode
+aarch64_full_sve_mode (scalar_mode mode)
+{
+  switch (mode)
+    {
+    case E_DFmode:
+      return VNx2DFmode;
+    case E_SFmode:
+      return VNx4SFmode;
+    case E_HFmode:
+      return VNx8HFmode;
+    case E_DImode:
+	return VNx2DImode;
+    case E_SImode:
+      return VNx4SImode;
+    case E_HImode:
+      return VNx8HImode;
+    case E_QImode:
+      return VNx16QImode;
+    default:
+      return opt_machine_mode ();
+    }
+}
+
+/* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
+   if it exists.  */
+opt_machine_mode
+aarch64_vq_mode (scalar_mode mode)
+{
+  switch (mode)
+    {
+    case E_DFmode:
+      return V2DFmode;
+    case E_SFmode:
+      return V4SFmode;
+    case E_HFmode:
+      return V8HFmode;
+    case E_SImode:
+      return V4SImode;
+    case E_HImode:
+      return V8HImode;
+    case E_QImode:
+      return V16QImode;
+    case E_DImode:
+      return V2DImode;
+    default:
+      return opt_machine_mode ();
+    }
+}
+
 /* Return appropriate SIMD container
    for MODE within a vector of WIDTH bits.  */
 static machine_mode
 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
 {
   if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
-    switch (mode)
-      {
-      case E_DFmode:
-	return VNx2DFmode;
-      case E_SFmode:
-	return VNx4SFmode;
-      case E_HFmode:
-	return VNx8HFmode;
-      case E_DImode:
-	return VNx2DImode;
-      case E_SImode:
-	return VNx4SImode;
-      case E_HImode:
-	return VNx8HImode;
-      case E_QImode:
-	return VNx16QImode;
-      default:
-	return word_mode;
-      }
+    return aarch64_full_sve_mode (mode).else_mode (word_mode);
 
   gcc_assert (known_eq (width, 64) || known_eq (width, 128));
   if (TARGET_SIMD)
     {
       if (known_eq (width, 128))
-	switch (mode)
-	  {
-	  case E_DFmode:
-	    return V2DFmode;
-	  case E_SFmode:
-	    return V4SFmode;
-	  case E_HFmode:
-	    return V8HFmode;
-	  case E_SImode:
-	    return V4SImode;
-	  case E_HImode:
-	    return V8HImode;
-	  case E_QImode:
-	    return V16QImode;
-	  case E_DImode:
-	    return V2DImode;
-	  default:
-	    break;
-	  }
+	return aarch64_vq_mode (mode).else_mode (word_mode);
       else
 	switch (mode)
 	  {
@@ -14946,6 +15050,36 @@  aarch64_simd_check_vect_par_cnst_half (r
   return true;
 }
 
+/* Return a PARALLEL containing NELTS elements, with element I equal
+   to BASE + I * STEP.  */
+
+rtx
+aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
+{
+  rtvec vec = rtvec_alloc (nelts);
+  for (unsigned int i = 0; i < nelts; ++i)
+    RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
+  return gen_rtx_PARALLEL (VOIDmode, vec);
+}
+
+/* Return true if OP is a PARALLEL of CONST_INTs that form a linear
+   series with step STEP.  */
+
+bool
+aarch64_stepped_int_parallel_p (rtx op, int step)
+{
+  if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
+    return false;
+
+  unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
+  for (int i = 1; i < XVECLEN (op, 0); ++i)
+    if (!CONST_INT_P (XVECEXP (op, 0, i))
+	|| UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
+      return false;
+
+  return true;
+}
+
 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
    HIGH (exclusive).  */
 void
@@ -14998,6 +15132,25 @@  aarch64_sve_ld1r_operand_p (rtx op)
 	  && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
 }
 
+/* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction.  */
+bool
+aarch64_sve_ld1rq_operand_p (rtx op)
+{
+  struct aarch64_address_info addr;
+  scalar_mode elem_mode = GET_MODE_INNER (GET_MODE (op));
+  if (!MEM_P (op)
+      || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
+    return false;
+
+  if (addr.type == ADDRESS_REG_IMM)
+    return offset_4bit_signed_scaled_p (TImode, addr.const_offset);
+
+  if (addr.type == ADDRESS_REG_REG)
+    return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
+
+  return false;
+}
+
 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
    The conditions for STR are the same.  */
 bool
Index: gcc/config/aarch64/predicates.md
===================================================================
--- gcc/config/aarch64/predicates.md	2019-08-13 10:49:16.403311568 +0100
+++ gcc/config/aarch64/predicates.md	2019-08-13 11:28:59.890111027 +0100
@@ -431,6 +431,12 @@  (define_special_predicate "vect_par_cnst
   return aarch64_simd_check_vect_par_cnst_half (op, mode, false);
 })
 
+(define_predicate "descending_int_parallel"
+  (match_code "parallel")
+{
+  return aarch64_stepped_int_parallel_p (op, -1);
+})
+
 (define_special_predicate "aarch64_simd_lshift_imm"
   (match_code "const,const_vector")
 {
@@ -543,6 +549,10 @@  (define_predicate "aarch64_sve_ld1r_oper
   (and (match_operand 0 "memory_operand")
        (match_test "aarch64_sve_ld1r_operand_p (op)")))
 
+(define_predicate "aarch64_sve_ld1rq_operand"
+  (and (match_code "mem")
+       (match_test "aarch64_sve_ld1rq_operand_p (op)")))
+
 ;; Like memory_operand, but restricted to addresses that are valid for
 ;; SVE LDR and STR instructions.
 (define_predicate "aarch64_sve_ldr_operand"
Index: gcc/config/aarch64/constraints.md
===================================================================
--- gcc/config/aarch64/constraints.md	2019-08-13 10:49:16.395311626 +0100
+++ gcc/config/aarch64/constraints.md	2019-08-13 11:28:59.890111027 +0100
@@ -272,6 +272,12 @@  (define_memory_constraint "Utq"
        (match_test "aarch64_legitimate_address_p (V2DImode,
 						  XEXP (op, 0), 1)")))
 
+(define_memory_constraint "UtQ"
+  "@internal
+   An address valid for SVE LD1RQs."
+  (and (match_code "mem")
+       (match_test "aarch64_sve_ld1rq_operand_p (op)")))
+
 (define_memory_constraint "Uty"
   "@internal
    An address valid for SVE LD1Rs."
Index: gcc/config/aarch64/aarch64.md
===================================================================
--- gcc/config/aarch64/aarch64.md	2019-08-07 19:27:50.000000000 +0100
+++ gcc/config/aarch64/aarch64.md	2019-08-13 11:28:59.890111027 +0100
@@ -234,6 +234,7 @@  (define_c_enum "unspec" [
     UNSPEC_CLASTB
     UNSPEC_FADDA
     UNSPEC_REV_SUBREG
+    UNSPEC_REINTERPRET
     UNSPEC_SPECULATION_TRACKER
     UNSPEC_COPYSIGN
     UNSPEC_TTEST		; Represent transaction test.
Index: gcc/config/aarch64/aarch64-sve.md
===================================================================
--- gcc/config/aarch64/aarch64-sve.md	2019-08-13 11:21:04.501550177 +0100
+++ gcc/config/aarch64/aarch64-sve.md	2019-08-13 11:28:59.882111087 +0100
@@ -207,8 +207,7 @@  (define_expand "mov<mode>"
 
     if (CONSTANT_P (operands[1]))
       {
-	aarch64_expand_mov_immediate (operands[0], operands[1],
-				      gen_vec_duplicate<mode>);
+	aarch64_expand_mov_immediate (operands[0], operands[1]);
 	DONE;
       }
 
@@ -326,6 +325,39 @@  (define_insn_and_split "*aarch64_sve_mov
   }
 )
 
+;; Reinterpret operand 1 in operand 0's mode, without changing its contents.
+;; This is equivalent to a subreg on little-endian targets but not for
+;; big-endian; see the comment at the head of the file for details.
+(define_expand "@aarch64_sve_reinterpret<mode>"
+  [(set (match_operand:SVE_ALL 0 "register_operand")
+	(unspec:SVE_ALL [(match_operand 1 "aarch64_any_register_operand")]
+			UNSPEC_REINTERPRET))]
+  "TARGET_SVE"
+  {
+    if (!BYTES_BIG_ENDIAN)
+      {
+	emit_move_insn (operands[0], gen_lowpart (<MODE>mode, operands[1]));
+	DONE;
+      }
+  }
+)
+
+;; A pattern for handling type punning on big-endian targets.  We use a
+;; special predicate for operand 1 to reduce the number of patterns.
+(define_insn_and_split "*aarch64_sve_reinterpret<mode>"
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(unspec:SVE_ALL [(match_operand 1 "aarch64_any_register_operand" "0")]
+			UNSPEC_REINTERPRET))]
+  "TARGET_SVE"
+  "#"
+  "&& reload_completed"
+  [(set (match_dup 0) (match_dup 1))]
+  {
+    emit_note (NOTE_INSN_DELETED);
+    DONE;
+  }
+)
+
 ;; -------------------------------------------------------------------------
 ;; ---- Moves of multiple vectors
 ;; -------------------------------------------------------------------------
@@ -787,6 +819,39 @@  (define_insn_and_split "*vec_duplicate<m
   [(set_attr "length" "4,4,8")]
 )
 
+;; Duplicate an Advanced SIMD vector to fill an SVE vector (LE version).
+(define_insn "@aarch64_vec_duplicate_vq<mode>_le"
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(vec_duplicate:SVE_ALL
+	  (match_operand:<V128> 1 "register_operand" "w")))]
+  "TARGET_SVE && !BYTES_BIG_ENDIAN"
+  {
+    operands[1] = gen_rtx_REG (<MODE>mode, REGNO (operands[1]));
+    return "dup\t%0.q, %1.q[0]";
+  }
+)
+
+;; Duplicate an Advanced SIMD vector to fill an SVE vector (BE version).
+;; The SVE register layout puts memory lane N into (architectural)
+;; register lane N, whereas the Advanced SIMD layout puts the memory
+;; lsb into the register lsb.  We therefore have to describe this in rtl
+;; terms as a reverse of the V128 vector followed by a duplicate.
+(define_insn "@aarch64_vec_duplicate_vq<mode>_be"
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(vec_duplicate:SVE_ALL
+	  (vec_select:<V128>
+	    (match_operand:<V128> 1 "register_operand" "w")
+	    (match_operand 2 "descending_int_parallel"))))]
+  "TARGET_SVE
+   && BYTES_BIG_ENDIAN
+   && known_eq (INTVAL (XVECEXP (operands[2], 0, 0)),
+		GET_MODE_NUNITS (<V128>mode) - 1)"
+  {
+    operands[1] = gen_rtx_REG (<MODE>mode, REGNO (operands[1]));
+    return "dup\t%0.q, %1.q[0]";
+  }
+)
+
 ;; This is used for vec_duplicate<mode>s from memory, but can also
 ;; be used by combine to optimize selects of a a vec_duplicate<mode>
 ;; with zero.
@@ -802,17 +867,19 @@  (define_insn "sve_ld1r<mode>"
   "ld1r<Vesize>\t%0.<Vetype>, %1/z, %2"
 )
 
-;; Load 128 bits from memory and duplicate to fill a vector.  Since there
-;; are so few operations on 128-bit "elements", we don't define a VNx1TI
-;; and simply use vectors of bytes instead.
-(define_insn "*sve_ld1rq<Vesize>"
+;; Load 128 bits from memory under predicate control and duplicate to
+;; fill a vector.
+(define_insn "@aarch64_sve_ld1rq<mode>"
   [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
 	(unspec:SVE_ALL
-	  [(match_operand:<VPRED> 1 "register_operand" "Upl")
-	   (match_operand:TI 2 "aarch64_sve_ld1r_operand" "Uty")]
+	  [(match_operand:<VPRED> 2 "register_operand" "Upl")
+	   (match_operand:<V128> 1 "aarch64_sve_ld1rq_operand" "UtQ")]
 	  UNSPEC_LD1RQ))]
   "TARGET_SVE"
-  "ld1rq<Vesize>\t%0.<Vetype>, %1/z, %2"
+  {
+    operands[1] = gen_rtx_MEM (<VEL>mode, XEXP (operands[1], 0));
+    return "ld1rq<Vesize>\t%0.<Vetype>, %2/z, %1";
+  }
 )
 
 ;; -------------------------------------------------------------------------
Index: gcc/testsuite/gcc.target/aarch64/sve/init_2.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/init_2.c	2019-07-29 09:46:41.910859821 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/init_2.c	2019-08-13 11:28:59.890111027 +0100
@@ -11,9 +11,9 @@  typedef int32_t vnx4si __attribute__((ve
 /*
 ** foo:
 **	...
-**	ld1w	(z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]
-**	insr	\1, w1
-**	insr	\1, w0
+**	ld1rd	(z[0-9]+)\.d, p[0-9]+/z, \[x[0-9]+\]
+**	insr	\1\.s, w1
+**	insr	\1\.s, w0
 **	...
 */
 __attribute__((noipa))
Index: gcc/testsuite/gcc.target/aarch64/sve/init_4.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/init_4.c	2019-07-29 09:46:41.910859821 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/init_4.c	2019-08-13 11:28:59.890111027 +0100
@@ -11,10 +11,10 @@  typedef int32_t vnx4si __attribute__((ve
 /*
 ** foo:
 **	...
-**	ld1w	(z[0-9]+\.s), p[0-9]+/z, \[x[0-9]+\]
-**	insr	\1, w1
-**	insr	\1, w0
-**	rev	\1, \1
+**	ld1rd	(z[0-9]+)\.d, p[0-9]+/z, \[x[0-9]+\]
+**	insr	\1\.s, w1
+**	insr	\1\.s, w0
+**	rev	\1\.s, \1\.s
 **	...
 */
 __attribute__((noipa))
Index: gcc/testsuite/gcc.target/aarch64/sve/ld1r_2.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/ld1r_2.c	2019-03-08 18:14:29.788994704 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve/ld1r_2.c	2019-08-13 11:28:59.890111027 +0100
@@ -28,22 +28,6 @@  #define FOR_EACH_LOAD_BROADCAST(T)		\
   T (int64_t)
 
 #define FOR_EACH_LOAD_BROADCAST_IMM(T)					\
-  T (int16_t, 129, imm_129)						\
-  T (int32_t, 129, imm_129)						\
-  T (int64_t, 129, imm_129)						\
-									\
-  T (int16_t, -130, imm_m130)						\
-  T (int32_t, -130, imm_m130)						\
-  T (int64_t, -130, imm_m130)						\
-									\
-  T (int16_t, 0x1234, imm_0x1234)					\
-  T (int32_t, 0x1234, imm_0x1234)					\
-  T (int64_t, 0x1234, imm_0x1234)					\
-									\
-  T (int16_t, 0xFEDC, imm_0xFEDC)					\
-  T (int32_t, 0xFEDC, imm_0xFEDC)					\
-  T (int64_t, 0xFEDC, imm_0xFEDC)					\
-									\
   T (int32_t, 0x12345678, imm_0x12345678)				\
   T (int64_t, 0x12345678, imm_0x12345678)				\
 									\
@@ -56,6 +40,6 @@  FOR_EACH_LOAD_BROADCAST (DEF_LOAD_BROADC
 FOR_EACH_LOAD_BROADCAST_IMM (DEF_LOAD_BROADCAST_IMM)
 
 /* { dg-final { scan-assembler-times {\tld1rb\tz[0-9]+\.b, p[0-7]/z, } 1 } } */
-/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, p[0-7]/z, } 5 } } */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, p[0-7]/z, } 7 } } */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, p[0-7]/z, } 8 } } */
+/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, p[0-7]/z, } 1 } } */
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, p[0-7]/z, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, p[0-7]/z, } 4 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_2.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/slp_2.c	2019-03-08 18:14:29.764994797 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve/slp_2.c	2019-08-13 11:28:59.890111027 +0100
@@ -29,12 +29,9 @@  #define TEST_ALL(T)				\
 
 TEST_ALL (VEC_PERM)
 
-/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h, } 2 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 { target aarch64_big_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w[0-9]+\n} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 3 } } */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 3 } } */
 /* { dg-final { scan-assembler-times {\tld1rqd\tz[0-9]+\.d, } 3 } } */
 /* { dg-final { scan-assembler-not {\tzip1\t} } } */
 /* { dg-final { scan-assembler-not {\tzip2\t} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_3.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/slp_3.c	2019-03-08 18:14:29.768994780 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve/slp_3.c	2019-08-13 11:28:59.890111027 +0100
@@ -32,18 +32,17 @@  #define TEST_ALL(T)				\
 TEST_ALL (VEC_PERM)
 
 /* 1 for each 8-bit type.  */
-/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */
-/* 1 for each 16-bit type and 4 for double.  */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 7 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]+\.h, } 3 { target aarch64_big_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 4 { target aarch64_big_endian } } } */
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s, } 2 } } */
+/* 1 for each 16-bit type plus 1 for double.  */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 4 } } */
 /* 1 for each 32-bit type.  */
 /* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s, } 3 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #41\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #25\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #31\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #62\n} 2 } } */
+/* 3 for double.  */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 3 } } */
 /* The 64-bit types need:
 
       ZIP1 ZIP1 (2 ZIP2s optimized away)
Index: gcc/testsuite/gcc.target/aarch64/sve/slp_4.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/slp_4.c	2019-03-08 18:14:29.772994767 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve/slp_4.c	2019-08-13 11:28:59.890111027 +0100
@@ -35,10 +35,8 @@  #define TEST_ALL(T)				\
 
 TEST_ALL (VEC_PERM)
 
-/* 1 for each 8-bit type, 4 for each 32-bit type and 8 for double.  */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 22 { target aarch64_little_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rqb\tz[0-9]+\.b, } 2 { target aarch64_big_endian } } } */
-/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 20 { target aarch64_big_endian } } } */
+/* 1 for each 8-bit type, 4 for each 32-bit type and 4 for double.  */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d, } 18 } } */
 /* 1 for each 16-bit type.  */
 /* { dg-final { scan-assembler-times {\tld1rqh\tz[0-9]\.h, } 3 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #99\n} 2 } } */
@@ -49,6 +47,8 @@  TEST_ALL (VEC_PERM)
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #37\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #24\n} 2 } } */
 /* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, #81\n} 2 } } */
+/* 4 for double.  */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.d, x[0-9]+\n} 4 } } */
 /* The 32-bit types need:
 
       ZIP1 ZIP1 (2 ZIP2s optimized away)
Index: gcc/testsuite/gcc.target/aarch64/sve/spill_4.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/sve/spill_4.c	2019-03-08 18:14:29.776994751 +0000
+++ gcc/testsuite/gcc.target/aarch64/sve/spill_4.c	2019-08-13 11:28:59.890111027 +0100
@@ -24,10 +24,10 @@  TEST_LOOP (uint16_t, 0x1234);
 TEST_LOOP (uint32_t, 0x12345);
 TEST_LOOP (uint64_t, 0x123456);
 
-/* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.h,} 3 } } */
+/* { dg-final { scan-assembler-not {\tptrue\tp[0-9]+\.h,} } } */
 /* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.s,} 3 } } */
 /* { dg-final { scan-assembler-times {\tptrue\tp[0-9]+\.d,} 3 } } */
-/* { dg-final { scan-assembler-times {\tld1rh\tz[0-9]+\.h,} 3 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w[0-9]+\n} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s,} 3 } } */
 /* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d,} 3 } } */
 /* { dg-final { scan-assembler-not {\tldr\tz[0-9]} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/const_1.c
===================================================================
--- /dev/null	2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/const_1.c	2019-08-13 11:28:59.890111027 +0100
@@ -0,0 +1,13 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+void
+set (uint64_t *dst, int count)	/* Store one constant to DST[0..COUNT-1].  */
+{
+  for (int i = 0; i < count; ++i)
+    dst[i] = 0xffff00ff00ffff00ULL;	/* Each byte is 0x00 or 0xff.  */
+}
+
+/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.2d, 0xffff00ff00ffff00\n.*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/const_2.c
===================================================================
--- /dev/null	2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/const_2.c	2019-08-13 11:28:59.890111027 +0100
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+#define TEST(TYPE, CONST)			/* Define set_TYPE: store CONST to DST[0..COUNT-1].  */\
+  void						\
+  set_##TYPE (TYPE *dst, int count)		\
+  {						\
+    for (int i = 0; i < count; ++i)		\
+      dst[i] = CONST;				\
+  }
+
+TEST (uint16_t, 129)	/* 129 == 0x81, the MOVI immediate in the patterns below.  */
+TEST (uint32_t, 129)
+TEST (uint64_t, 129)	/* Presumably the case matched by the x-register MOV pattern below.  */
+
+/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.8h, 0x81\n[^:]*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */
+/* { dg-final { scan-assembler {\tmovi\tv([0-9]+)\.4s, 0x81\n[^:]*\tdup\tz[0-9]+\.q, z\1\.q\[0\]\n} } } */
+/* { dg-final { scan-assembler {\tmov\t(x[0-9]+), 129\n[^:]*\tmov\tz[0-9]+\.d, \1\n} } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/const_3.c
===================================================================
--- /dev/null	2019-07-30 08:53:31.317691683 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/const_3.c	2019-08-13 11:28:59.890111027 +0100
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+#include <stdint.h>
+
+#define TEST(TYPE, CONST)			/* Define set_TYPE: store CONST to DST[0..COUNT-1].  */\
+  void						\
+  set_##TYPE (TYPE *dst, int count)		\
+  {						\
+    for (int i = 0; i < count; ++i)		\
+      dst[i] = CONST;				\
+  }
+
+TEST (uint16_t, 0x1234)	/* 0x1234 == 4660, the decimal immediate in the patterns below.  */
+TEST (uint32_t, 0x1234)
+TEST (uint64_t, 0x1234)
+
+/* { dg-final { scan-assembler {\tmov\t(w[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.h, \1\n} } } */
+/* { dg-final { scan-assembler {\tmov\t(w[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.s, \1\n} } } */
+/* { dg-final { scan-assembler {\tmov\t(x[0-9]+), 4660\n[^:]*\tmov\tz[0-9]+\.d, \1\n} } } */