[committed,AArch64] Add autovec support for partial SVE vectors
diff mbox series

Message ID mptr22813ts.fsf@arm.com
State New
Headers show
Series
  • [committed,AArch64] Add autovec support for partial SVE vectors
Related show

Commit Message

Richard Sandiford Nov. 16, 2019, 11:02 a.m. UTC
This patch adds the bare minimum needed to support autovectorisation of
partial SVE vectors, namely moves and integer addition.  Later patches
add more interesting cases.

Tested on aarch64-linux-gnu and applied as r278341.

Richard


2019-11-16  Richard Sandiford  <richard.sandiford@arm.com>

gcc/
	* config/aarch64/aarch64-modes.def: Define partial SVE vector
	float modes.
	* config/aarch64/aarch64-protos.h (aarch64_sve_pred_mode): New
	function.
	* config/aarch64/aarch64.c (aarch64_classify_vector_mode): Handle the
	new vector float modes.
	(aarch64_sve_container_bits): New function.
	(aarch64_sve_pred_mode): Likewise.
	(aarch64_get_mask_mode): Use it.
	(aarch64_sve_element_int_mode): Handle structure modes and partial
	modes.
	(aarch64_sve_container_int_mode): New function.
	(aarch64_vectorize_related_mode): Return SVE modes when given
	SVE modes.  Handle partial modes, taking the preferred number
	of units from the size of the given mode.
	(aarch64_hard_regno_mode_ok): Allow partial modes to be stored
	in registers.
	(aarch64_expand_sve_ld1rq): Use the mode form of aarch64_sve_pred_mode.
	(aarch64_expand_sve_const_vector): Handle partial SVE vectors.
	(aarch64_split_sve_subreg_move): Use the mode form of
	aarch64_sve_pred_mode.
	(aarch64_secondary_reload): Handle partial modes in the same way
	as full big-endian vectors.
	(aarch64_vector_mode_supported_p): Allow partial SVE vectors.
	(aarch64_autovectorize_vector_modes): Try unpacked SVE vectors,
	merging with the Advanced SIMD modes.  If two modes have the
	same size, try the Advanced SIMD mode first.
	(aarch64_simd_valid_immediate): Use the container rather than
	the element mode for INDEX constants.
	(aarch64_simd_vector_alignment): Make the alignment of partial
	SVE vector modes the same as their minimum size.
	(aarch64_evpc_sel): Use the mode form of aarch64_sve_pred_mode.
	* config/aarch64/aarch64-sve.md (mov<SVE_FULL:mode>): Extend to...
	(mov<SVE_ALL:mode>): ...this.
	(movmisalign<SVE_FULL:mode>): Extend to...
	(movmisalign<SVE_ALL:mode>): ...this.
	(*aarch64_sve_mov<mode>_le): Rename to...
	(*aarch64_sve_mov<mode>_ldr_str): ...this.
	(*aarch64_sve_mov<SVE_FULL:mode>_be): Rename and extend to...
	(*aarch64_sve_mov<SVE_ALL:mode>_no_ldr_str): ...this.  Handle
	partial modes regardless of endianness.
	(aarch64_sve_reload_be): Rename to...
	(aarch64_sve_reload_mem): ...this and enable for little-endian.
	Use aarch64_sve_pred_mode to get the appropriate predicate mode.
	(@aarch64_pred_mov<SVE_FULL:mode>): Extend to...
	(@aarch64_pred_mov<SVE_ALL:mode>): ...this.
	(*aarch64_sve_mov<SVE_FULL:mode>_subreg_be): Extend to...
	(*aarch64_sve_mov<SVE_ALL:mode>_subreg_be): ...this.
	(@aarch64_sve_reinterpret<SVE_FULL:mode>): Extend to...
	(@aarch64_sve_reinterpret<SVE_ALL:mode>): ...this.
	(*aarch64_sve_reinterpret<SVE_FULL:mode>): Extend to...
	(*aarch64_sve_reinterpret<SVE_ALL:mode>): ...this.
	(maskload<SVE_FULL:mode><vpred>): Extend to...
	(maskload<SVE_ALL:mode><vpred>): ...this.
	(maskstore<SVE_FULL:mode><vpred>): Extend to...
	(maskstore<SVE_ALL:mode><vpred>): ...this.
	(vec_duplicate<SVE_FULL:mode>): Extend to...
	(vec_duplicate<SVE_ALL:mode>): ...this.
	(*vec_duplicate<SVE_FULL:mode>_reg): Extend to...
	(*vec_duplicate<SVE_ALL:mode>_reg): ...this.
	(sve_ld1r<SVE_FULL:mode>): Extend to...
	(sve_ld1r<SVE_ALL:mode>): ...this.
	(vec_series<SVE_FULL_I:mode>): Extend to...
	(vec_series<SVE_I:mode>): ...this.
	(*vec_series<SVE_FULL_I:mode>_plus): Extend to...
	(*vec_series<SVE_I:mode>_plus): ...this.
	(@aarch64_pred_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>): Avoid
	new VPRED ambiguity.
	(@aarch64_cond_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>): Likewise.
	(add<SVE_FULL_I:mode>3): Extend to...
	(add<SVE_I:mode>3): ...this.
	* config/aarch64/iterators.md (SVE_ALL, SVE_I): New mode iterators.
	(Vetype, Vesize, VEL, Vel, vwcore): Handle partial SVE vector modes.
	(VPRED, vpred): Likewise.
	(Vctype): New iterator.
	(vw): Remove SVE modes.

gcc/testsuite/
	* gcc.target/aarch64/sve/mixed_size_1.c: New test.
	* gcc.target/aarch64/sve/mixed_size_2.c: Likewise.
	* gcc.target/aarch64/sve/mixed_size_3.c: Likewise.
	* gcc.target/aarch64/sve/mixed_size_4.c: Likewise.
	* gcc.target/aarch64/sve/mixed_size_5.c: Likewise.

Patch
diff mbox series

Index: gcc/config/aarch64/aarch64-modes.def
===================================================================
--- gcc/config/aarch64/aarch64-modes.def	2019-10-16 11:53:03.681148277 +0100
+++ gcc/config/aarch64/aarch64-modes.def	2019-11-16 10:59:20.722514516 +0000
@@ -123,13 +123,18 @@  SVE_MODES (4, VNx64, VNx32, VNx16, VNx8)
 VECTOR_MODES_WITH_PREFIX (VNx, INT, 2, 1);
 VECTOR_MODES_WITH_PREFIX (VNx, INT, 4, 1);
 VECTOR_MODES_WITH_PREFIX (VNx, INT, 8, 1);
+VECTOR_MODES_WITH_PREFIX (VNx, FLOAT, 4, 1);
+VECTOR_MODES_WITH_PREFIX (VNx, FLOAT, 8, 1);
 
 ADJUST_NUNITS (VNx2QI, aarch64_sve_vg);
 ADJUST_NUNITS (VNx2HI, aarch64_sve_vg);
 ADJUST_NUNITS (VNx2SI, aarch64_sve_vg);
+ADJUST_NUNITS (VNx2HF, aarch64_sve_vg);
+ADJUST_NUNITS (VNx2SF, aarch64_sve_vg);
 
 ADJUST_NUNITS (VNx4QI, aarch64_sve_vg * 2);
 ADJUST_NUNITS (VNx4HI, aarch64_sve_vg * 2);
+ADJUST_NUNITS (VNx4HF, aarch64_sve_vg * 2);
 
 ADJUST_NUNITS (VNx8QI, aarch64_sve_vg * 4);
 
@@ -139,8 +144,11 @@  ADJUST_ALIGNMENT (VNx8QI, 1);
 
 ADJUST_ALIGNMENT (VNx2HI, 2);
 ADJUST_ALIGNMENT (VNx4HI, 2);
+ADJUST_ALIGNMENT (VNx2HF, 2);
+ADJUST_ALIGNMENT (VNx4HF, 2);
 
 ADJUST_ALIGNMENT (VNx2SI, 4);
+ADJUST_ALIGNMENT (VNx2SF, 4);
 
 /* Quad float: 128-bit floating mode for long doubles.  */
 FLOAT_MODE (TF, 16, ieee_quad_format);
Index: gcc/config/aarch64/aarch64-protos.h
===================================================================
--- gcc/config/aarch64/aarch64-protos.h	2019-11-13 08:39:21.000000000 +0000
+++ gcc/config/aarch64/aarch64-protos.h	2019-11-16 10:59:20.722514516 +0000
@@ -512,6 +512,7 @@  bool aarch64_zero_extend_const_eq (machi
 bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
 machine_mode aarch64_sve_int_mode (machine_mode);
 opt_machine_mode aarch64_sve_pred_mode (unsigned int);
+machine_mode aarch64_sve_pred_mode (machine_mode);
 opt_machine_mode aarch64_sve_data_mode (scalar_mode, poly_uint64);
 bool aarch64_sve_mode_p (machine_mode);
 HOST_WIDE_INT aarch64_fold_sve_cnt_pat (aarch64_svpattern, unsigned int);
Index: gcc/config/aarch64/aarch64.c
===================================================================
--- gcc/config/aarch64/aarch64.c	2019-11-16 10:43:45.589105879 +0000
+++ gcc/config/aarch64/aarch64.c	2019-11-16 10:59:20.730514460 +0000
@@ -1625,6 +1625,11 @@  aarch64_classify_vector_mode (machine_mo
     case E_VNx4HImode:
     /* Partial SVE SI vector.  */
     case E_VNx2SImode:
+    /* Partial SVE HF vectors.  */
+    case E_VNx2HFmode:
+    case E_VNx4HFmode:
+    /* Partial SVE SF vector.  */
+    case E_VNx2SFmode:
       return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
 
     case E_VNx16QImode:
@@ -1753,6 +1758,22 @@  aarch64_array_mode_supported_p (machine_
   return false;
 }
 
+/* MODE is some form of SVE vector mode.  For data modes, return the number
+   of vector register bits that each element of MODE occupies, such as 64
+   for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
+   in a 64-bit container).  For predicate modes, return the number of
+   data bits controlled by each significant predicate bit.  */
+
+static unsigned int
+aarch64_sve_container_bits (machine_mode mode)
+{
+  unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+  poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
+			     ? BITS_PER_SVE_VECTOR
+			     : GET_MODE_BITSIZE (mode));
+  return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
+}
+
 /* Return the SVE predicate mode to use for elements that have
    ELEM_NBYTES bytes, if such a mode exists.  */
 
@@ -1773,6 +1794,16 @@  aarch64_sve_pred_mode (unsigned int elem
   return opt_machine_mode ();
 }
 
+/* Return the SVE predicate mode that should be used to control
+   SVE mode MODE.  */
+
+machine_mode
+aarch64_sve_pred_mode (machine_mode mode)
+{
+  unsigned int bits = aarch64_sve_container_bits (mode);
+  return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
+}
+
 /* Implement TARGET_VECTORIZE_GET_MASK_MODE.  */
 
 static opt_machine_mode
@@ -1780,7 +1811,7 @@  aarch64_get_mask_mode (machine_mode mode
 {
   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
   if (vec_flags & VEC_SVE_DATA)
-    return aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (mode));
+    return aarch64_sve_pred_mode (mode);
 
   return default_get_mask_mode (mode);
 }
@@ -1806,11 +1837,25 @@  aarch64_sve_data_mode (scalar_mode inner
 static scalar_int_mode
 aarch64_sve_element_int_mode (machine_mode mode)
 {
-  unsigned int elt_bits = vector_element_size (BITS_PER_SVE_VECTOR,
+  poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
+			     ? BITS_PER_SVE_VECTOR
+			     : GET_MODE_BITSIZE (mode));
+  unsigned int elt_bits = vector_element_size (vector_bits,
 					       GET_MODE_NUNITS (mode));
   return int_mode_for_size (elt_bits, 0).require ();
 }
 
+/* Return an integer element mode that contains exactly
+   aarch64_sve_container_bits (MODE) bits.  This is wider than
+   aarch64_sve_element_int_mode if MODE is a partial vector,
+   otherwise it's the same.  */
+
+static scalar_int_mode
+aarch64_sve_container_int_mode (machine_mode mode)
+{
+  return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
+}
+
 /* Return the integer vector mode associated with SVE mode MODE.
    Unlike related_int_vector_mode, this can handle the case in which
    MODE is a predicate (and thus has a different total size).  */
@@ -1831,6 +1876,37 @@  aarch64_vectorize_related_mode (machine_
 {
   unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
 
+  /* If we're operating on SVE vectors, try to return an SVE mode.  */
+  poly_uint64 sve_nunits;
+  if ((vec_flags & VEC_SVE_DATA)
+      && multiple_p (BYTES_PER_SVE_VECTOR,
+		     GET_MODE_SIZE (element_mode), &sve_nunits))
+    {
+      machine_mode sve_mode;
+      if (maybe_ne (nunits, 0U))
+	{
+	  /* Try to find a full or partial SVE mode with exactly
+	     NUNITS units.  */
+	  if (multiple_p (sve_nunits, nunits)
+	      && aarch64_sve_data_mode (element_mode,
+					nunits).exists (&sve_mode))
+	    return sve_mode;
+	}
+      else
+	{
+	  /* Take the preferred number of units from the number of bytes
+	     that fit in VECTOR_MODE.  We always start by "autodetecting"
+	     a full vector mode with preferred_simd_mode, so vectors
+	     chosen here will also be full vector modes.  Then
+	     autovectorize_vector_modes tries smaller starting modes
+	     and thus smaller preferred numbers of units.  */
+	  sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
+	  if (aarch64_sve_data_mode (element_mode,
+				     sve_nunits).exists (&sve_mode))
+	    return sve_mode;
+	}
+    }
+
   /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors.  */
   if ((vec_flags & VEC_ADVSIMD)
       && known_eq (nunits, 0U)
@@ -1907,11 +1983,6 @@  aarch64_hard_regno_mode_ok (unsigned reg
     return mode == DImode;
 
   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
-  /* At the moment, partial vector modes are only useful for memory
-     references, but that could change in future.  */
-  if (vec_flags & VEC_PARTIAL)
-    return false;
-
   if (vec_flags & VEC_SVE_PRED)
     return pr_or_ffr_regnum_p (regno);
 
@@ -4015,8 +4086,7 @@  aarch64_expand_sve_ld1rq (rtx dest, rtx
     }
 
   machine_mode mode = GET_MODE (dest);
-  unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
-  machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
+  machine_mode pred_mode = aarch64_sve_pred_mode (mode);
   rtx ptrue = aarch64_ptrue_reg (pred_mode);
   emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
   return true;
@@ -4037,7 +4107,26 @@  aarch64_expand_sve_const_vector (rtx tar
   unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
   scalar_mode elt_mode = GET_MODE_INNER (mode);
   unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
-  unsigned int encoded_bits = npatterns * nelts_per_pattern * elt_bits;
+  unsigned int container_bits = aarch64_sve_container_bits (mode);
+  unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
+
+  if (nelts_per_pattern == 1
+      && encoded_bits <= 128
+      && container_bits != elt_bits)
+    {
+      /* We have a partial vector mode and a constant whose full-vector
+	 equivalent would occupy a repeating 128-bit sequence.  Build that
+	 full-vector equivalent instead, so that we have the option of
+	 using LD1RQ and Advanced SIMD operations.  */
+      unsigned int repeat = container_bits / elt_bits;
+      machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
+      rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
+      for (unsigned int i = 0; i < npatterns; ++i)
+	for (unsigned int j = 0; j < repeat; ++j)
+	  builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
+      target = aarch64_target_reg (target, full_mode);
+      return aarch64_expand_sve_const_vector (target, builder.build ());
+    }
 
   if (nelts_per_pattern == 1 && encoded_bits == 128)
     {
@@ -4730,8 +4819,7 @@  aarch64_split_sve_subreg_move (rtx dest,
     std::swap (mode_with_wider_elts, mode_with_narrower_elts);
 
   unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
-  unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
-  machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
+  machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
 
   /* Get the operands in the appropriate modes and emit the instruction.  */
   ptrue = gen_lowpart (pred_mode, ptrue);
@@ -9971,19 +10059,21 @@  aarch64_secondary_reload (bool in_p ATTR
 			  machine_mode mode,
 			  secondary_reload_info *sri)
 {
-  /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
-     directly by the *aarch64_sve_mov<mode>_[lb]e move patterns.  See the
-     comment at the head of aarch64-sve.md for more details about the
-     big-endian handling.  */
-  if (BYTES_BIG_ENDIAN
-      && reg_class_subset_p (rclass, FP_REGS)
+  /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
+     LDR and STR.  See the comment at the head of aarch64-sve.md for
+     more details about the big-endian handling.  */
+  if (reg_class_subset_p (rclass, FP_REGS)
       && !((REG_P (x) && HARD_REGISTER_P (x))
 	   || aarch64_simd_valid_immediate (x, NULL))
-      && mode != VNx16QImode
-      && aarch64_sve_data_mode_p (mode))
+      && mode != VNx16QImode)
     {
-      sri->icode = CODE_FOR_aarch64_sve_reload_be;
-      return NO_REGS;
+      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+      if ((vec_flags & VEC_SVE_DATA)
+	  && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
+	{
+	  sri->icode = CODE_FOR_aarch64_sve_reload_mem;
+	  return NO_REGS;
+	}
     }
 
   /* If we have to disable direct literal pool loads and stores because the
@@ -15837,7 +15927,7 @@  aarch64_struct_value_rtx (tree fndecl AT
 aarch64_vector_mode_supported_p (machine_mode mode)
 {
   unsigned int vec_flags = aarch64_classify_vector_mode (mode);
-  return vec_flags != 0 && (vec_flags & (VEC_STRUCT | VEC_PARTIAL)) == 0;
+  return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
 }
 
 /* Return the full-width SVE vector mode for element mode MODE, if one
@@ -15938,29 +16028,72 @@  aarch64_preferred_simd_mode (scalar_mode
 static unsigned int
 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
 {
-  if (TARGET_SVE)
-    modes->safe_push (VNx16QImode);
-
-  /* Try using 128-bit vectors for all element types.  */
-  modes->safe_push (V16QImode);
+  static const machine_mode sve_modes[] = {
+    /* Try using full vectors for all element types.  */
+    VNx16QImode,
+
+    /* Try using 16-bit containers for 8-bit elements and full vectors
+       for wider elements.  */
+    VNx8QImode,
+
+    /* Try using 32-bit containers for 8-bit and 16-bit elements and
+       full vectors for wider elements.  */
+    VNx4QImode,
 
-  /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
-     for wider elements.  */
-  modes->safe_push (V8QImode);
-
-  /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
-     for wider elements.
+    /* Try using 64-bit containers for all element types.  */
+    VNx2QImode
+  };
 
-     TODO: We could support a limited form of V4QImode too, so that
-     we use 32-bit vectors for 8-bit elements.  */
-  modes->safe_push (V4HImode);
+  static const machine_mode advsimd_modes[] = {
+    /* Try using 128-bit vectors for all element types.  */
+    V16QImode,
+
+    /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
+       for wider elements.  */
+    V8QImode,
+
+    /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
+       for wider elements.
+
+       TODO: We could support a limited form of V4QImode too, so that
+       we use 32-bit vectors for 8-bit elements.  */
+    V4HImode,
+
+    /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
+       for 64-bit elements.
+
+       TODO: We could similarly support limited forms of V2QImode and V2HImode
+       for this case.  */
+    V2SImode
+  };
 
-  /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
-     for 64-bit elements.
+  /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
+     This is because:
 
-     TODO: We could similarly support limited forms of V2QImode and V2HImode
-     for this case.  */
-  modes->safe_push (V2SImode);
+     - If we can't use N-byte Advanced SIMD vectors then the placement
+       doesn't matter; we'll just continue as though the Advanced SIMD
+       entry didn't exist.
+
+     - If an SVE main loop with N bytes ends up being cheaper than an
+       Advanced SIMD main loop with N bytes then by default we'll replace
+       the Advanced SIMD version with the SVE one.
+
+     - If an Advanced SIMD main loop with N bytes ends up being cheaper
+       than an SVE main loop with N bytes then by default we'll try to
+       use the SVE loop to vectorize the epilogue instead.  */
+  unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
+  unsigned int advsimd_i = 0;
+  while (advsimd_i < ARRAY_SIZE (advsimd_modes))
+    {
+      if (sve_i < ARRAY_SIZE (sve_modes)
+	  && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
+		       GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
+	modes->safe_push (sve_modes[sve_i++]);
+      else
+	modes->safe_push (advsimd_modes[advsimd_i++]);
+    }
+  while (sve_i < ARRAY_SIZE (sve_modes))
+    modes->safe_push (sve_modes[sve_i++]);
 
   unsigned int flags = 0;
   /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
@@ -16507,7 +16640,14 @@  aarch64_simd_valid_immediate (rtx op, si
 	return false;
 
       if (info)
-	*info = simd_immediate_info (elt_mode, base, step);
+	{
+	  /* Get the corresponding container mode.  E.g. an INDEX on V2SI
+	     should yield two integer values per 128-bit block, meaning
+	     that we need to treat it in the same way as V2DI and then
+	     ignore the upper 32 bits of each element.  */
+	  elt_mode = aarch64_sve_container_int_mode (mode);
+	  *info = simd_immediate_info (elt_mode, base, step);
+	}
       return true;
     }
   else if (GET_CODE (op) == CONST_VECTOR
@@ -16976,9 +17116,9 @@  aarch64_simd_vector_alignment (const_tre
      direct way we have of identifying real SVE predicate types.  */
   if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
     return 16;
-  if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
-    return 128;
-  return wi::umin (wi::to_wide (TYPE_SIZE (type)), 128).to_uhwi ();
+  widest_int min_size
+    = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
+  return wi::umin (min_size, 128).to_uhwi ();
 }
 
 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT.  */
@@ -19154,7 +19294,7 @@  aarch64_evpc_sel (struct expand_vec_perm
   if (d->testing_p)
     return true;
 
-  machine_mode pred_mode = aarch64_sve_pred_mode (unit_size).require ();
+  machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
 
   rtx_vector_builder builder (pred_mode, n_patterns, 2);
   for (int i = 0; i < n_patterns * 2; i++)
Index: gcc/config/aarch64/aarch64-sve.md
===================================================================
--- gcc/config/aarch64/aarch64-sve.md	2019-11-16 10:55:37.212088720 +0000
+++ gcc/config/aarch64/aarch64-sve.md	2019-11-16 10:59:20.726514488 +0000
@@ -546,8 +546,8 @@ 
 ;; -------------------------------------------------------------------------
 
 (define_expand "mov<mode>"
-  [(set (match_operand:SVE_FULL 0 "nonimmediate_operand")
-	(match_operand:SVE_FULL 1 "general_operand"))]
+  [(set (match_operand:SVE_ALL 0 "nonimmediate_operand")
+	(match_operand:SVE_ALL 1 "general_operand"))]
   "TARGET_SVE"
   {
     /* Use the predicated load and store patterns where possible.
@@ -576,8 +576,8 @@  (define_expand "mov<mode>"
 )
 
 (define_expand "movmisalign<mode>"
-  [(set (match_operand:SVE_FULL 0 "nonimmediate_operand")
-	(match_operand:SVE_FULL 1 "general_operand"))]
+  [(set (match_operand:SVE_ALL 0 "nonimmediate_operand")
+	(match_operand:SVE_ALL 1 "general_operand"))]
   "TARGET_SVE"
   {
    /* Equivalent to a normal move for our purposes.  */
@@ -586,10 +586,11 @@  (define_expand "movmisalign<mode>"
   }
 )
 
-;; Unpredicated moves (bytes or little-endian).  Only allow memory operations
-;; during and after RA; before RA we want the predicated load and store
-;; patterns to be used instead.
-(define_insn "*aarch64_sve_mov<mode>_le"
+;; Unpredicated moves that can use LDR and STR, i.e. full vectors for which
+;; little-endian ordering is acceptable.  Only allow memory operations during
+;; and after RA; before RA we want the predicated load and store patterns to
+;; be used instead.
+(define_insn "*aarch64_sve_mov<mode>_ldr_str"
   [(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand" "=w, Utr, w, w")
 	(match_operand:SVE_FULL 1 "aarch64_sve_general_operand" "Utr, w, w, Dn"))]
   "TARGET_SVE
@@ -604,35 +605,37 @@  (define_insn "*aarch64_sve_mov<mode>_le"
    * return aarch64_output_sve_mov_immediate (operands[1]);"
 )
 
-;; Unpredicated moves (non-byte big-endian).  Memory accesses require secondary
-;; reloads.
-(define_insn "*aarch64_sve_mov<mode>_be"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w")
-	(match_operand:SVE_FULL 1 "aarch64_nonmemory_operand" "w, Dn"))]
-  "TARGET_SVE && BYTES_BIG_ENDIAN && <MODE>mode != VNx16QImode"
+;; Unpredicated moves that cannot use LDR and STR, i.e. partial vectors
+;; or vectors for which little-endian ordering isn't acceptable.  Memory
+;; accesses require secondary reloads.
+(define_insn "*aarch64_sve_mov<mode>_no_ldr_str"
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w")
+	(match_operand:SVE_ALL 1 "aarch64_nonmemory_operand" "w, Dn"))]
+  "TARGET_SVE
+   && <MODE>mode != VNx16QImode
+   && (BYTES_BIG_ENDIAN
+       || maybe_ne (BYTES_PER_SVE_VECTOR, GET_MODE_SIZE (<MODE>mode)))"
   "@
    mov\t%0.d, %1.d
    * return aarch64_output_sve_mov_immediate (operands[1]);"
 )
 
-;; Handle big-endian memory reloads.  We use byte PTRUE for all modes
-;; to try to encourage reuse.
-;; This pattern needs constraints due to TARGET_SECONDARY_RELOAD hook.
-(define_expand "aarch64_sve_reload_be"
+;; Handle memory reloads for modes that can't use LDR and STR.  We use
+;; byte PTRUE for all modes to try to encourage reuse.  This pattern
+;; needs constraints because it is returned by TARGET_SECONDARY_RELOAD.
+(define_expand "aarch64_sve_reload_mem"
   [(parallel
      [(set (match_operand 0)
 	   (match_operand 1))
       (clobber (match_operand:VNx16BI 2 "register_operand" "=Upl"))])]
-  "TARGET_SVE && BYTES_BIG_ENDIAN"
+  "TARGET_SVE"
   {
     /* Create a PTRUE.  */
     emit_move_insn (operands[2], CONSTM1_RTX (VNx16BImode));
 
     /* Refer to the PTRUE in the appropriate mode for this move.  */
     machine_mode mode = GET_MODE (operands[0]);
-    machine_mode pred_mode
-      = aarch64_sve_pred_mode (GET_MODE_UNIT_SIZE (mode)).require ();
-    rtx pred = gen_lowpart (pred_mode, operands[2]);
+    rtx pred = gen_lowpart (aarch64_sve_pred_mode (mode), operands[2]);
 
     /* Emit a predicated load or store.  */
     aarch64_emit_sve_pred_move (operands[0], pred, operands[1]);
@@ -644,18 +647,18 @@  (define_expand "aarch64_sve_reload_be"
 ;; Note that this pattern is generated directly by aarch64_emit_sve_pred_move,
 ;; so changes to this pattern will need changes there as well.
 (define_insn_and_split "@aarch64_pred_mov<mode>"
-  [(set (match_operand:SVE_FULL 0 "nonimmediate_operand" "=w, w, m")
-	(unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "nonimmediate_operand" "=w, w, m")
+	(unspec:SVE_ALL
 	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl")
-	   (match_operand:SVE_FULL 2 "nonimmediate_operand" "w, m, w")]
+	   (match_operand:SVE_ALL 2 "nonimmediate_operand" "w, m, w")]
 	  UNSPEC_PRED_X))]
   "TARGET_SVE
    && (register_operand (operands[0], <MODE>mode)
        || register_operand (operands[2], <MODE>mode))"
   "@
    #
-   ld1<Vesize>\t%0.<Vetype>, %1/z, %2
-   st1<Vesize>\t%2.<Vetype>, %1, %0"
+   ld1<Vesize>\t%0.<Vctype>, %1/z, %2
+   st1<Vesize>\t%2.<Vctype>, %1, %0"
   "&& register_operand (operands[0], <MODE>mode)
    && register_operand (operands[2], <MODE>mode)"
   [(set (match_dup 0) (match_dup 2))]
@@ -666,8 +669,8 @@  (define_insn_and_split "@aarch64_pred_mo
 ;; for details.  We use a special predicate for operand 2 to reduce
 ;; the number of patterns.
 (define_insn_and_split "*aarch64_sve_mov<mode>_subreg_be"
-  [(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand" "=w")
-	(unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "aarch64_sve_nonimmediate_operand" "=w")
+	(unspec:SVE_ALL
 	  [(match_operand:VNx16BI 1 "register_operand" "Upl")
 	   (match_operand 2 "aarch64_any_register_operand" "w")]
 	  UNSPEC_REV_SUBREG))]
@@ -685,8 +688,8 @@  (define_insn_and_split "*aarch64_sve_mov
 ;; This is equivalent to a subreg on little-endian targets but not for
 ;; big-endian; see the comment at the head of the file for details.
 (define_expand "@aarch64_sve_reinterpret<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand")
-	(unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand")
+	(unspec:SVE_ALL
 	  [(match_operand 1 "aarch64_any_register_operand")]
 	  UNSPEC_REINTERPRET))]
   "TARGET_SVE"
@@ -702,8 +705,8 @@  (define_expand "@aarch64_sve_reinterpret
 ;; A pattern for handling type punning on big-endian targets.  We use a
 ;; special predicate for operand 1 to reduce the number of patterns.
 (define_insn_and_split "*aarch64_sve_reinterpret<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-	(unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(unspec:SVE_ALL
 	  [(match_operand 1 "aarch64_any_register_operand" "w")]
 	  UNSPEC_REINTERPRET))]
   "TARGET_SVE"
@@ -1141,13 +1144,13 @@  (define_insn "aarch64_update_ffrt"
 
 ;; Predicated LD1.
 (define_insn "maskload<mode><vpred>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-	(unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(unspec:SVE_ALL
 	  [(match_operand:<VPRED> 2 "register_operand" "Upl")
-	   (match_operand:SVE_FULL 1 "memory_operand" "m")]
+	   (match_operand:SVE_ALL 1 "memory_operand" "m")]
 	  UNSPEC_LD1_SVE))]
   "TARGET_SVE"
-  "ld1<Vesize>\t%0.<Vetype>, %2/z, %1"
+  "ld1<Vesize>\t%0.<Vctype>, %2/z, %1"
 )
 
 ;; Unpredicated LD[234].
@@ -1940,14 +1943,14 @@  (define_insn "*aarch64_sve_gather_prefet
 
 ;; Predicated ST1.
 (define_insn "maskstore<mode><vpred>"
-  [(set (match_operand:SVE_FULL 0 "memory_operand" "+m")
-	(unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "memory_operand" "+m")
+	(unspec:SVE_ALL
 	  [(match_operand:<VPRED> 2 "register_operand" "Upl")
-	   (match_operand:SVE_FULL 1 "register_operand" "w")
+	   (match_operand:SVE_ALL 1 "register_operand" "w")
 	   (match_dup 0)]
 	  UNSPEC_ST1_SVE))]
   "TARGET_SVE"
-  "st1<Vesize>\t%1.<Vetype>, %2, %0"
+  "st1<Vesize>\t%1.<Vctype>, %2, %0"
 )
 
 ;; Unpredicated ST[234].  This is always a full update, so the dependence
@@ -2283,8 +2286,8 @@  (define_insn "*aarch64_scatter_store_tru
 
 (define_expand "vec_duplicate<mode>"
   [(parallel
-    [(set (match_operand:SVE_FULL 0 "register_operand")
-	  (vec_duplicate:SVE_FULL
+    [(set (match_operand:SVE_ALL 0 "register_operand")
+	  (vec_duplicate:SVE_ALL
 	    (match_operand:<VEL> 1 "aarch64_sve_dup_operand")))
      (clobber (scratch:VNx16BI))])]
   "TARGET_SVE"
@@ -2304,8 +2307,8 @@  (define_expand "vec_duplicate<mode>"
 ;; the load at the first opportunity in order to allow the PTRUE to be
 ;; optimized with surrounding code.
 (define_insn_and_split "*vec_duplicate<mode>_reg"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w, w, w")
-	(vec_duplicate:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w, w, w")
+	(vec_duplicate:SVE_ALL
 	  (match_operand:<VEL> 1 "aarch64_sve_dup_operand" "r, w, Uty")))
    (clobber (match_scratch:VNx16BI 2 "=X, X, Upl"))]
   "TARGET_SVE"
@@ -2364,12 +2367,12 @@  (define_insn "@aarch64_vec_duplicate_vq<
 ;; be used by combine to optimize selects of a vec_duplicate<mode>
 ;; with zero.
 (define_insn "sve_ld1r<mode>"
-  [(set (match_operand:SVE_FULL 0 "register_operand" "=w")
-	(unspec:SVE_FULL
+  [(set (match_operand:SVE_ALL 0 "register_operand" "=w")
+	(unspec:SVE_ALL
 	  [(match_operand:<VPRED> 1 "register_operand" "Upl")
-	   (vec_duplicate:SVE_FULL
+	   (vec_duplicate:SVE_ALL
 	     (match_operand:<VEL> 2 "aarch64_sve_ld1r_operand" "Uty"))
-	   (match_operand:SVE_FULL 3 "aarch64_simd_imm_zero")]
+	   (match_operand:SVE_ALL 3 "aarch64_simd_imm_zero")]
 	  UNSPEC_SEL))]
   "TARGET_SVE"
   "ld1r<Vesize>\t%0.<Vetype>, %1/z, %2"
@@ -2431,29 +2434,29 @@  (define_insn "vec_shl_insert_<mode>"
 ;; -------------------------------------------------------------------------
 
 (define_insn "vec_series<mode>"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, w")
-	(vec_series:SVE_FULL_I
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w")
+	(vec_series:SVE_I
 	  (match_operand:<VEL> 1 "aarch64_sve_index_operand" "Usi, r, r")
 	  (match_operand:<VEL> 2 "aarch64_sve_index_operand" "r, Usi, r")))]
   "TARGET_SVE"
   "@
-   index\t%0.<Vetype>, #%1, %<vw>2
-   index\t%0.<Vetype>, %<vw>1, #%2
-   index\t%0.<Vetype>, %<vw>1, %<vw>2"
+   index\t%0.<Vctype>, #%1, %<vwcore>2
+   index\t%0.<Vctype>, %<vwcore>1, #%2
+   index\t%0.<Vctype>, %<vwcore>1, %<vwcore>2"
 )
 
 ;; Optimize {x, x, x, x, ...} + {0, n, 2*n, 3*n, ...} if n is in range
 ;; of an INDEX instruction.
 (define_insn "*vec_series<mode>_plus"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w")
-	(plus:SVE_FULL_I
-	  (vec_duplicate:SVE_FULL_I
+  [(set (match_operand:SVE_I 0 "register_operand" "=w")
+	(plus:SVE_I
+	  (vec_duplicate:SVE_I
 	    (match_operand:<VEL> 1 "register_operand" "r"))
-	  (match_operand:SVE_FULL_I 2 "immediate_operand")))]
+	  (match_operand:SVE_I 2 "immediate_operand")))]
   "TARGET_SVE && aarch64_check_zero_based_sve_index_immediate (operands[2])"
   {
     operands[2] = aarch64_check_zero_based_sve_index_immediate (operands[2]);
-    return "index\t%0.<Vetype>, %<vw>1, #%2";
+    return "index\t%0.<Vctype>, %<vwcore>1, #%2";
   }
 )
 
@@ -2821,7 +2824,7 @@  (define_insn "@cond_<optab><mode>"
 (define_insn "@aarch64_pred_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>"
   [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
 	(unspec:SVE_FULL_HSDI
-	  [(match_operand:<VPRED> 1 "register_operand" "Upl")
+	  [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl")
 	   (sign_extend:SVE_FULL_HSDI
 	     (truncate:SVE_PARTIAL_I
 	       (match_operand:SVE_FULL_HSDI 2 "register_operand" "w")))]
@@ -2834,7 +2837,7 @@  (define_insn "@aarch64_pred_sxt<SVE_FULL
 (define_insn "@aarch64_cond_sxt<SVE_FULL_HSDI:mode><SVE_PARTIAL_I:mode>"
   [(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w, ?&w, ?&w")
 	(unspec:SVE_FULL_HSDI
-	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl")
+	  [(match_operand:<SVE_FULL_HSDI:VPRED> 1 "register_operand" "Upl, Upl, Upl")
 	   (sign_extend:SVE_FULL_HSDI
 	     (truncate:SVE_PARTIAL_I
 	       (match_operand:SVE_FULL_HSDI 2 "register_operand" "w, w, w")))
@@ -3386,10 +3389,10 @@  (define_insn_and_rewrite "*cond_<optab><
 ;; -------------------------------------------------------------------------
 
 (define_insn "add<mode>3"
-  [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, w, w, ?w, ?w, w")
-	(plus:SVE_FULL_I
-	  (match_operand:SVE_FULL_I 1 "register_operand" "%0, 0, 0, w, w, w")
-	  (match_operand:SVE_FULL_I 2 "aarch64_sve_add_operand" "vsa, vsn, vsi, vsa, vsn, w")))]
+  [(set (match_operand:SVE_I 0 "register_operand" "=w, w, w, ?w, ?w, w")
+	(plus:SVE_I
+	  (match_operand:SVE_I 1 "register_operand" "%0, 0, 0, w, w, w")
+	  (match_operand:SVE_I 2 "aarch64_sve_add_operand" "vsa, vsn, vsi, vsa, vsn, w")))]
   "TARGET_SVE"
   "@
    add\t%0.<Vetype>, %0.<Vetype>, #%D2
Index: gcc/config/aarch64/iterators.md
===================================================================
--- gcc/config/aarch64/iterators.md	2019-11-16 10:55:37.212088720 +0000
+++ gcc/config/aarch64/iterators.md	2019-11-16 10:59:20.730514460 +0000
@@ -344,6 +344,21 @@  (define_mode_iterator SVE_PARTIAL_I [VNx
 				     VNx4HI VNx2HI
 				     VNx2SI])
 
+;; All SVE vector modes.
+(define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI
+			       VNx8HI VNx4HI VNx2HI
+			       VNx8HF VNx4HF VNx2HF
+			       VNx4SI VNx2SI
+			       VNx4SF VNx2SF
+			       VNx2DI
+			       VNx2DF])
+
+;; All SVE integer vector modes.
+(define_mode_iterator SVE_I [VNx16QI VNx8QI VNx4QI VNx2QI
+			     VNx8HI VNx4HI VNx2HI
+			     VNx4SI VNx2SI
+			     VNx2DI])
+
 ;; Modes involved in extending or truncating SVE data, for 8 elements per
 ;; 128-bit block.
 (define_mode_iterator VNx8_NARROW [VNx8QI])
@@ -776,28 +791,37 @@  (define_mode_attr Vmntype [(V8HI ".8b")
 			   (HI   "")])
 
 ;; Mode-to-individual element type mapping.
-(define_mode_attr Vetype [(V8QI "b") (V16QI "b") (VNx16QI "b") (VNx16BI "b")
-			  (V4HI "h") (V8HI  "h") (VNx8HI  "h") (VNx8BI  "h")
-			  (V2SI "s") (V4SI  "s") (VNx4SI  "s") (VNx4BI  "s")
-			  (V2DI "d")             (VNx2DI  "d") (VNx2BI  "d")
-			  (V4HF "h") (V8HF  "h") (VNx8HF  "h")
-			  (V2SF "s") (V4SF  "s") (VNx4SF  "s")
-			  (V2DF "d")             (VNx2DF  "d")
-			  (HF   "h")
-			  (SF   "s") (DF  "d")
-			  (QI "b")   (HI "h")
-			  (SI "s")   (DI "d")])
+(define_mode_attr Vetype [(V8QI "b") (V16QI "b")
+			  (V4HI "h") (V8HI  "h")
+			  (V2SI "s") (V4SI  "s")
+			  (V2DI "d")
+			  (V4HF "h") (V8HF  "h")
+			  (V2SF "s") (V4SF  "s")
+			  (V2DF "d")
+			  (VNx16BI "b") (VNx8BI "h") (VNx4BI "s") (VNx2BI "d")
+			  (VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
+			  (VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
+			  (VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
+			  (VNx4SI "s") (VNx2SI "s")
+			  (VNx4SF "s") (VNx2SF "s")
+			  (VNx2DI "d")
+			  (VNx2DF "d")
+			  (HF "h")
+			  (SF "s") (DF "d")
+			  (QI "b") (HI "h")
+			  (SI "s") (DI "d")])
 
 ;; Like Vetype, but map to types that are a quarter of the element size.
 (define_mode_attr Vetype_fourth [(VNx4SI "b") (VNx2DI "h")])
 
 ;; Equivalent of "size" for a vector element.
-(define_mode_attr Vesize [(VNx16QI "b") (VNx8QI  "b")
-			  (VNx4QI  "b") (VNx2QI  "b")
-			  (VNx8HI  "h") (VNx4HI  "h")
-			  (VNx2HI  "h") (VNx8HF  "h")
-			  (VNx4SI  "w") (VNx2SI  "w") (VNx4SF  "w")
-			  (VNx2DI  "d") (VNx2DF  "d")
+(define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b")
+			  (VNx8HI "h") (VNx4HI "h") (VNx2HI "h")
+			  (VNx8HF "h") (VNx4HF "h") (VNx2HF "h")
+			  (VNx4SI "w") (VNx2SI "w")
+			  (VNx4SF "w") (VNx2SF "w")
+			  (VNx2DI "d")
+			  (VNx2DF "d")
 			  (VNx32QI "b") (VNx48QI "b") (VNx64QI "b")
 			  (VNx16HI "h") (VNx24HI "h") (VNx32HI "h")
 			  (VNx16HF "h") (VNx24HF "h") (VNx32HF "h")
@@ -806,6 +830,16 @@  (define_mode_attr Vesize [(VNx16QI "b")
 			  (VNx4DI  "d") (VNx6DI  "d") (VNx8DI  "d")
 			  (VNx4DF  "d") (VNx6DF  "d") (VNx8DF  "d")])
 
+;; The Z register suffix for an SVE mode's element container, i.e. the
+;; Vetype of full SVE modes that have the same number of elements.
+(define_mode_attr Vctype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "s") (VNx2QI "d")
+			  (VNx8HI "h") (VNx4HI "s") (VNx2HI "d")
+			  (VNx8HF "h") (VNx4HF "s") (VNx2HF "d")
+			  (VNx4SI "s") (VNx2SI "d")
+			  (VNx4SF "s") (VNx2SF "d")
+			  (VNx2DI "d")
+			  (VNx2DF "d")])
+
 ;; Vetype is used everywhere in scheduling type and assembly output,
 ;; sometimes they are not the same, for example HF modes on some
 ;; instructions.  stype is defined to represent scheduling type
@@ -827,26 +861,40 @@  (define_mode_attr Vbtype [(V8QI "8b")  (
 			  (SI   "8b")  (SF    "8b")])
 
 ;; Define element mode for each vector mode.
-(define_mode_attr VEL [(V8QI  "QI") (V16QI "QI") (VNx16QI "QI")
-			(V4HI "HI") (V8HI  "HI") (VNx8HI  "HI")
-			(V2SI "SI") (V4SI  "SI") (VNx4SI  "SI")
-			(DI   "DI") (V2DI  "DI") (VNx2DI  "DI")
-			(V4HF "HF") (V8HF  "HF") (VNx8HF  "HF")
-			(V2SF "SF") (V4SF  "SF") (VNx4SF  "SF")
-			(DF   "DF") (V2DF  "DF") (VNx2DF  "DF")
-			(SI   "SI") (HI    "HI")
-			(QI   "QI")])
+(define_mode_attr VEL [(V8QI  "QI") (V16QI "QI")
+		       (V4HI "HI") (V8HI  "HI")
+		       (V2SI "SI") (V4SI  "SI")
+		       (DI   "DI") (V2DI  "DI")
+		       (V4HF "HF") (V8HF  "HF")
+		       (V2SF "SF") (V4SF  "SF")
+		       (DF   "DF") (V2DF  "DF")
+		       (SI   "SI") (HI    "HI")
+		       (QI   "QI")
+		       (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI")
+		       (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI")
+		       (VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF")
+		       (VNx4SI "SI") (VNx2SI "SI")
+		       (VNx4SF "SF") (VNx2SF "SF")
+		       (VNx2DI "DI")
+		       (VNx2DF "DF")])
 
 ;; Define element mode for each vector mode (lower case).
-(define_mode_attr Vel [(V8QI "qi") (V16QI "qi") (VNx16QI "qi")
-			(V4HI "hi") (V8HI "hi") (VNx8HI  "hi")
-			(V2SI "si") (V4SI "si") (VNx4SI  "si")
-			(DI "di")   (V2DI "di") (VNx2DI  "di")
-			(V4HF "hf") (V8HF "hf") (VNx8HF  "hf")
-			(V2SF "sf") (V4SF "sf") (VNx4SF  "sf")
-			(V2DF "df") (DF "df")   (VNx2DF  "df")
-			(SI   "si") (HI   "hi")
-			(QI   "qi")])
+(define_mode_attr Vel [(V8QI "qi") (V16QI "qi")
+		       (V4HI "hi") (V8HI "hi")
+		       (V2SI "si") (V4SI "si")
+		       (DI   "di") (V2DI "di")
+		       (V4HF "hf") (V8HF "hf")
+		       (V2SF "sf") (V4SF "sf")
+		       (V2DF "df") (DF   "df")
+		       (SI   "si") (HI   "hi")
+		       (QI   "qi")
+		       (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi")
+		       (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi")
+		       (VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf")
+		       (VNx4SI "si") (VNx2SI "si")
+		       (VNx4SF "sf") (VNx2SF "sf")
+		       (VNx2DI "di")
+		       (VNx2DF "df")])
 
 ;; Element mode with floating-point values replaced by like-sized integers.
 (define_mode_attr VEL_INT [(VNx16QI "QI")
@@ -994,23 +1042,29 @@  (define_mode_attr Vhalftype [(V16QI "8b"
 			     (V4SF "2s")])
 
 ;; Define corresponding core/FP element mode for each vector mode.
-(define_mode_attr vw [(V8QI "w") (V16QI "w") (VNx16QI "w")
-		      (V4HI "w") (V8HI "w") (VNx8HI "w")
-		      (V2SI "w") (V4SI "w") (VNx4SI "w")
-		      (DI   "x") (V2DI "x") (VNx2DI "x")
-		      (VNx8HF "h")
-		      (V2SF "s") (V4SF "s") (VNx4SF "s")
-		      (V2DF "d") (VNx2DF "d")])
+(define_mode_attr vw [(V8QI "w") (V16QI "w")
+		      (V4HI "w") (V8HI "w")
+		      (V2SI "w") (V4SI "w")
+		      (DI   "x") (V2DI "x")
+		      (V2SF "s") (V4SF "s")
+		      (V2DF "d")])
 
 ;; Corresponding core element mode for each vector mode.  This is a
 ;; variation on <vw> mapping FP modes to GP regs.
-(define_mode_attr vwcore [(V8QI "w") (V16QI "w") (VNx16QI "w")
-			  (V4HI "w") (V8HI "w") (VNx8HI "w")
-			  (V2SI "w") (V4SI "w") (VNx4SI "w")
-			  (DI   "x") (V2DI "x") (VNx2DI "x")
-			  (V4HF "w") (V8HF "w") (VNx8HF "w")
-			  (V2SF "w") (V4SF "w") (VNx4SF "w")
-			  (V2DF "x") (VNx2DF "x")])
+(define_mode_attr vwcore [(V8QI "w") (V16QI "w")
+			  (V4HI "w") (V8HI "w")
+			  (V2SI "w") (V4SI "w")
+			  (DI   "x") (V2DI "x")
+			  (V4HF "w") (V8HF "w")
+			  (V2SF "w") (V4SF "w")
+			  (V2DF "x")
+			  (VNx16QI "w") (VNx8QI "w") (VNx4QI "w") (VNx2QI "w")
+			  (VNx8HI "w") (VNx4HI "w") (VNx2HI "w")
+			  (VNx8HF "w") (VNx4HF "w") (VNx2HF "w")
+			  (VNx4SI "w") (VNx2SI "w")
+			  (VNx4SF "w") (VNx2SF "w")
+			  (VNx2DI "x")
+			  (VNx2DF "x")])
 
 ;; Double vector types for ALLX.
 (define_mode_attr Vallxd [(QI "8b") (HI "4h") (SI "2s")])
@@ -1248,10 +1302,14 @@  (define_mode_attr vsingle [(VNx32QI "vnx
 
 ;; The predicate mode associated with an SVE data mode.  For structure modes
 ;; this is equivalent to the <VPRED> of the subvector mode.
-(define_mode_attr VPRED [(VNx16QI "VNx16BI")
-			 (VNx8HI "VNx8BI") (VNx8HF "VNx8BI")
-			 (VNx4SI "VNx4BI") (VNx4SF "VNx4BI")
-			 (VNx2DI "VNx2BI") (VNx2DF "VNx2BI")
+(define_mode_attr VPRED [(VNx16QI "VNx16BI") (VNx8QI "VNx8BI")
+			 (VNx4QI "VNx4BI") (VNx2QI "VNx2BI")
+			 (VNx8HI "VNx8BI") (VNx4HI "VNx4BI") (VNx2HI "VNx2BI")
+			 (VNx8HF "VNx8BI") (VNx4HF "VNx4BI") (VNx2HF "VNx2BI")
+			 (VNx4SI "VNx4BI") (VNx2SI "VNx2BI")
+			 (VNx4SF "VNx4BI") (VNx2SF "VNx2BI")
+			 (VNx2DI "VNx2BI")
+			 (VNx2DF "VNx2BI")
 			 (VNx32QI "VNx16BI")
 			 (VNx16HI "VNx8BI") (VNx16HF "VNx8BI")
 			 (VNx8SI "VNx4BI") (VNx8SF "VNx4BI")
@@ -1266,10 +1324,14 @@  (define_mode_attr VPRED [(VNx16QI "VNx16
 			 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")])
 
 ;; ...and again in lower case.
-(define_mode_attr vpred [(VNx16QI "vnx16bi")
-			 (VNx8HI "vnx8bi") (VNx8HF "vnx8bi")
-			 (VNx4SI "vnx4bi") (VNx4SF "vnx4bi")
-			 (VNx2DI "vnx2bi") (VNx2DF "vnx2bi")
+(define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
+			 (VNx4QI "vnx4bi") (VNx2QI "vnx2bi")
+			 (VNx8HI "vnx8bi") (VNx4HI "vnx4bi") (VNx2HI "vnx2bi")
+			 (VNx8HF "vnx8bi") (VNx4HF "vnx4bi") (VNx2HF "vnx2bi")
+			 (VNx4SI "vnx4bi") (VNx2SI "vnx2bi")
+			 (VNx4SF "vnx4bi") (VNx2SF "vnx2bi")
+			 (VNx2DI "vnx2bi")
+			 (VNx2DF "vnx2bi")
 			 (VNx32QI "vnx16bi")
 			 (VNx16HI "vnx8bi") (VNx16HF "vnx8bi")
 			 (VNx8SI "vnx4bi") (VNx8SF "vnx4bi")
Index: gcc/testsuite/gcc.target/aarch64/sve/mixed_size_1.c
===================================================================
--- /dev/null	2019-09-17 11:41:18.176664108 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/mixed_size_1.c	2019-11-16 10:59:20.730514460 +0000
@@ -0,0 +1,39 @@ 
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)						\
+  void									\
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,	\
+		       TYPE2 *restrict dst2, TYPE2 *restrict src2,	\
+		       int n)						\
+  {									\
+    for (int i = 0; i < n; ++i)						\
+      {									\
+	dst1[i] += src1[i];						\
+	dst2[i] = src2[i];						\
+      }									\
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint32_t, _Float16) \
+  T (uint64_t, uint32_t) \
+  T (uint64_t, float)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/mixed_size_2.c
===================================================================
--- /dev/null	2019-09-17 11:41:18.176664108 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/mixed_size_2.c	2019-11-16 10:59:20.730514460 +0000
@@ -0,0 +1,41 @@ 
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)						\
+  void									\
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,	\
+		       TYPE2 *restrict dst2, int n)			\
+  {									\
+    for (int i = 0; i < n; ++i)						\
+      {									\
+	dst1[i] += src1[i];						\
+	dst2[i] = 1;							\
+      }									\
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint32_t, _Float16) \
+  T (uint64_t, uint32_t) \
+  T (uint64_t, float)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.b, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.h, #1\.0} 1 } } */
+/* { dg-final { scan-assembler-times {\tfmov\tz[0-9]+\.s, #1\.0} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/mixed_size_3.c
===================================================================
--- /dev/null	2019-09-17 11:41:18.176664108 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/mixed_size_3.c	2019-11-16 10:59:20.730514460 +0000
@@ -0,0 +1,41 @@ 
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)						\
+  void									\
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,	\
+		       TYPE2 *restrict dst2, TYPE2 src2, int n)		\
+  {									\
+    for (int i = 0; i < n; ++i)						\
+      {									\
+	dst1[i] += src1[i];						\
+	dst2[i] = src2;							\
+      }									\
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint32_t, _Float16) \
+  T (uint64_t, uint32_t) \
+  T (uint64_t, float)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.b, w3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, w3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, w3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.h, h0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tmov\tz[0-9]+\.s, s0\n} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 4 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/mixed_size_4.c
===================================================================
--- /dev/null	2019-09-17 11:41:18.176664108 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/mixed_size_4.c	2019-11-16 10:59:20.730514460 +0000
@@ -0,0 +1,43 @@ 
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)						\
+  void									\
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,	\
+		       TYPE2 *restrict dst2, TYPE2 n)			\
+  {									\
+    for (TYPE2 i = 0; i < n; ++i)					\
+      {									\
+	dst1[i] += src1[i];						\
+	dst2[i] = i;							\
+      }									\
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint64_t, uint32_t)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-not {\tindex\tz[0-9]+\.b,} } } */
+/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.h, #0, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.s, #0, #1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tindex\tz[0-9]+\.d, #0, #1\n} 1 } } */
+
+/* { dg-final { scan-assembler-not {\tcntb\t} } } */
+/* { dg-final { scan-assembler-times {\tcnth\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tcntw\t} 1 } } */
+/* { dg-final { scan-assembler-times {\tcntd\t} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 1 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 1 } } */
Index: gcc/testsuite/gcc.target/aarch64/sve/mixed_size_5.c
===================================================================
--- /dev/null	2019-09-17 11:41:18.176664108 +0100
+++ gcc/testsuite/gcc.target/aarch64/sve/mixed_size_5.c	2019-11-16 10:59:20.730514460 +0000
@@ -0,0 +1,42 @@ 
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns -msve-vector-bits=512" } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE1, TYPE2)						\
+  void									\
+  f_##TYPE1##_##TYPE2 (TYPE1 *restrict dst1, TYPE1 *restrict src1,	\
+		       TYPE2 *restrict dst2, TYPE2 *restrict src2,	\
+		       int n)						\
+  {									\
+    for (int i = 0; i < n; ++i)						\
+      {									\
+	dst1[i * 2] = src1[i * 2] + 1;					\
+	dst1[i * 2 + 1] = src1[i * 2 + 1] + 1;				\
+	dst2[i * 2] = 2;						\
+	dst2[i * 2 + 1] = 3;						\
+      }									\
+  }
+
+#define TEST_ALL(T) \
+  T (uint16_t, uint8_t) \
+  T (uint32_t, uint16_t) \
+  T (uint32_t, _Float16) \
+  T (uint64_t, uint32_t) \
+  T (uint64_t, float)
+
+TEST_ALL (TEST_LOOP)
+
+/* { dg-final { scan-assembler-times {\tld1rw\tz[0-9]+\.s,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1rd\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1rqw\tz[0-9]+\.s,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tst1b\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.d,} 2 } } */
+
+/* { dg-final { scan-assembler-times {\tld1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tst1h\tz[0-9]+\.h,} 1 } } */
+/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d,} 2 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d,} 2 } } */