===================================================================
@@ -4314,6 +4314,7 @@ extern tree build_vector_from_val (tree,
extern tree build_uniform_cst (tree, tree);
extern tree build_vec_series (tree, tree, tree);
extern tree build_index_vector (tree, poly_uint64, poly_uint64);
+extern tree build_vector_a_then_b (tree, unsigned int, tree, tree);
extern void recompute_constructor_flags (tree);
extern void verify_constructor_flags (tree);
extern tree build_constructor (tree, vec<constructor_elt, va_gc> * CXX_MEM_STAT_INFO);
===================================================================
@@ -1981,6 +1981,23 @@ build_index_vector (tree vec_type, poly_
return v.build ();
}
+/* Return a VECTOR_CST of type VEC_TYPE in which the first NUM_A
+   elements are A and the rest are B. */
+
+tree
+build_vector_a_then_b (tree vec_type, unsigned int num_a, tree a, tree b)
+{
+  gcc_assert (known_le (num_a, TYPE_VECTOR_SUBPARTS (vec_type)));
+  unsigned int count = constant_lower_bound (TYPE_VECTOR_SUBPARTS (vec_type));
+  /* Optimize the constant case: halve COUNT so the encoding is exact.  */
+  if ((count & 1) == 0 && TYPE_VECTOR_SUBPARTS (vec_type).is_constant ())
+    count /= 2;
+  tree_vector_builder builder (vec_type, count, 2);
+  for (unsigned int i = 0; i < count * 2; ++i)
+    builder.quick_push (i < num_a ? a : b);
+  return builder.build ();
+}
+
/* Something has messed with the elements of CONSTRUCTOR C after it was built;
calculate TREE_CONSTANT and TREE_SIDE_EFFECTS. */
===================================================================
@@ -691,6 +691,36 @@ fold_const_vec_convert (tree ret_type, t
/* Try to evaluate:
+ IFN_WHILE_ULT (ARG0, ARG1, (TYPE) { ... })
+
+ Return the value on success and null on failure. */
+
+static tree
+fold_while_ult (tree type, poly_uint64 arg0, poly_uint64 arg1)
+{
+ if (known_ge (arg0, arg1))
+ return build_zero_cst (type);
+
+ if (maybe_ge (arg0, arg1))
+ return NULL_TREE;
+
+ poly_uint64 diff = arg1 - arg0;
+ poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
+ if (known_ge (diff, nelts))
+ return build_all_ones_cst (type);
+
+ unsigned HOST_WIDE_INT const_diff;
+ if (known_le (diff, nelts) && diff.is_constant (&const_diff))
+ {
+ tree minus_one = build_minus_one_cst (TREE_TYPE (type));
+ tree zero = build_zero_cst (TREE_TYPE (type));
+ return build_vector_a_then_b (type, const_diff, minus_one, zero);
+ }
+ return NULL_TREE;
+}
+
+/* Try to evaluate:
+
*RESULT = FN (*ARG)
in format FORMAT. Return true on success. */
@@ -1782,6 +1812,14 @@ fold_const_call (combined_fn fn, tree ty
}
return NULL_TREE;
+ case CFN_WHILE_ULT:
+ {
+ poly_uint64 parg0, parg1;
+ if (poly_int_tree_p (arg0, &parg0) && poly_int_tree_p (arg1, &parg1))
+ return fold_while_ult (type, parg0, parg1);
+ return NULL_TREE;
+ }
+
default:
return fold_const_call_1 (fn, type, arg0, arg1, arg2);
}
===================================================================
@@ -406,6 +406,33 @@ enum aarch64_key_type {
extern struct tune_params aarch64_tune_params;
+/* The available SVE predicate patterns, known in the ACLE as "svpattern". */
+#define AARCH64_FOR_SVPATTERN(T) \
+ T (POW2, pow2, 0) \
+ T (VL1, vl1, 1) \
+ T (VL2, vl2, 2) \
+ T (VL3, vl3, 3) \
+ T (VL4, vl4, 4) \
+ T (VL5, vl5, 5) \
+ T (VL6, vl6, 6) \
+ T (VL7, vl7, 7) \
+ T (VL8, vl8, 8) \
+ T (VL16, vl16, 9) \
+ T (VL32, vl32, 10) \
+ T (VL64, vl64, 11) \
+ T (VL128, vl128, 12) \
+ T (VL256, vl256, 13) \
+ T (MUL4, mul4, 29) \
+ T (MUL3, mul3, 30) \
+ T (ALL, all, 31)
+
+#define AARCH64_SVENUM(UPPER, LOWER, VALUE) AARCH64_SV_##UPPER = VALUE,
+enum aarch64_svpattern {
+ AARCH64_FOR_SVPATTERN (AARCH64_SVENUM)
+ AARCH64_NUM_SVPATTERNS
+};
+#undef AARCH64_SVENUM
+
void aarch64_post_cfi_startproc (void);
poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned);
int aarch64_get_condition_code (rtx);
===================================================================
@@ -481,12 +481,18 @@ (define_expand "mov<mode>"
{
if (GET_CODE (operands[0]) == MEM)
operands[1] = force_reg (<MODE>mode, operands[1]);
+
+ if (CONSTANT_P (operands[1]))
+ {
+ aarch64_expand_mov_immediate (operands[0], operands[1]);
+ DONE;
+ }
}
)
(define_insn "*aarch64_sve_mov<mode>"
[(set (match_operand:PRED_ALL 0 "nonimmediate_operand" "=Upa, m, Upa, Upa")
- (match_operand:PRED_ALL 1 "general_operand" "Upa, Upa, m, Dn"))]
+ (match_operand:PRED_ALL 1 "aarch64_mov_operand" "Upa, Upa, m, Dn"))]
"TARGET_SVE
&& (register_operand (operands[0], <MODE>mode)
|| register_operand (operands[1], <MODE>mode))"
@@ -2923,7 +2929,7 @@ (define_insn "*pred_cmp<cmp_op><mode>"
;; Set element I of the result if operand1 + J < operand2 for all J in [0, I],
;; with the comparison being unsigned.
-(define_insn "while_ult<GPI:mode><PRED_ALL:mode>"
+(define_insn "@while_ult<GPI:mode><PRED_ALL:mode>"
[(set (match_operand:PRED_ALL 0 "register_operand" "=Upa")
(unspec:PRED_ALL [(match_operand:GPI 1 "aarch64_reg_or_zero" "rZ")
(match_operand:GPI 2 "aarch64_reg_or_zero" "rZ")]
===================================================================
@@ -83,7 +83,7 @@ #define POINTER_BYTES (POINTER_SIZE / BI
/* Information about a legitimate vector immediate operand. */
struct simd_immediate_info
{
- enum insn_type { MOV, MVN, INDEX };
+ enum insn_type { MOV, MVN, INDEX, PTRUE };
enum modifier_type { LSL, MSL };
simd_immediate_info () {}
@@ -92,6 +92,7 @@ struct simd_immediate_info
insn_type = MOV, modifier_type = LSL,
unsigned int = 0);
simd_immediate_info (scalar_mode, rtx, rtx);
+ simd_immediate_info (scalar_int_mode, aarch64_svpattern);
/* The mode of the elements. */
scalar_mode elt_mode;
@@ -120,6 +121,9 @@ struct simd_immediate_info
subsequent element. */
rtx base, step;
} index;
+
+  /* For PTRUE: the svpattern that the instruction should assemble.  */
+ aarch64_svpattern pattern;
} u;
};
@@ -159,6 +163,16 @@ struct simd_immediate_info
u.index.step = step_in;
}
+/* Construct a predicate that controls elements of mode ELT_MODE_IN
+ and has PTRUE pattern PATTERN_IN. */
+inline simd_immediate_info
+::simd_immediate_info (scalar_int_mode elt_mode_in,
+ aarch64_svpattern pattern_in)
+ : elt_mode (elt_mode_in), insn (PTRUE)
+{
+ u.pattern = pattern_in;
+}
+
/* The current code model. */
enum aarch64_code_model aarch64_cmodel;
@@ -1334,6 +1348,22 @@ static const char *const aarch64_sve_con
"pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
};
+/* Return the assembly token for svpattern value PATTERN. */
+
+static const char *
+svpattern_token (enum aarch64_svpattern pattern)
+{
+  switch (pattern)
+    {
+#define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
+    AARCH64_FOR_SVPATTERN (CASE)
+#undef CASE
+    case AARCH64_NUM_SVPATTERNS:
+      break;
+    }
+  gcc_unreachable ();
+}
+
/* Generate code to enable conditional branches in functions over 1 MiB. */
const char *
aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
@@ -2529,6 +2559,146 @@ aarch64_force_temporary (machine_mode mo
}
}
+/* Return true if predicate value X is a constant in which every element
+ is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
+ value, i.e. as a predicate in which all bits are significant. */
+
+static bool
+aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
+{
+ if (GET_CODE (x) != CONST_VECTOR)
+ return false;
+
+ unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
+ GET_MODE_NUNITS (GET_MODE (x)));
+ unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
+ unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
+ builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
+
+ unsigned int nelts = const_vector_encoded_nelts (x);
+ for (unsigned int i = 0; i < nelts; ++i)
+ {
+ rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
+ if (!CONST_INT_P (elt))
+ return false;
+
+ builder.quick_push (elt);
+ for (unsigned int j = 1; j < factor; ++j)
+ builder.quick_push (const0_rtx);
+ }
+ builder.finalize ();
+ return true;
+}
+
+/* BUILDER contains a predicate constant of mode VNx16BI. Return the
+ widest predicate element size it can have (that is, the largest size
+ for which each element would still be 0 or 1). */
+
+unsigned int
+aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
+{
+ /* Start with the most optimistic assumption: that we only need
+ one bit per pattern. This is what we will use if only the first
+ bit in each pattern is ever set. */
+ unsigned int mask = GET_MODE_SIZE (DImode);
+ mask |= builder.npatterns ();
+
+ /* Look for set bits. */
+ unsigned int nelts = builder.encoded_nelts ();
+ for (unsigned int i = 1; i < nelts; ++i)
+ if (INTVAL (builder.elt (i)) != 0)
+ {
+ if (i & 1)
+ return 1;
+ mask |= i;
+ }
+ return mask & -mask;
+}
+
+/* BUILDER is a predicate constant of mode VNx16BI. Consider the value
+   that the constant would have with predicate element size ELT_SIZE
+   (ignoring the upper bits in each element) and return:
+
+   * -1 if all bits are set
+   * N if the predicate has N leading set bits followed by all clear bits
+   * 0 if the predicate does not have any of these forms. */
+
+int
+aarch64_partial_ptrue_length (rtx_vector_builder &builder,
+   unsigned int elt_size)
+{
+  /* With 3 elements per pattern we may have set bits followed by clear
+     bits followed by set bits; conservatively give up on that case. */
+  if (builder.nelts_per_pattern () == 3)
+    return 0;
+
+  /* Skip over leading set bits. */
+  unsigned int nelts = builder.encoded_nelts ();
+  unsigned int i = 0;
+  for (; i < nelts; i += elt_size)
+    if (INTVAL (builder.elt (i)) == 0)
+      break;
+  unsigned int vl = i / elt_size;
+
+  /* Check for the all-true case. */
+  if (i == nelts)
+    return -1;
+
+  /* If nelts_per_pattern is 1, then either VL is zero, or we have a
+     repeating pattern of set bits followed by clear bits. */
+  if (builder.nelts_per_pattern () != 2)
+    return 0;
+
+  /* We have a "foreground" value and a duplicated "background" value.
+     If the background might repeat and the last set bit belongs to it,
+     we might have set bits followed by clear bits followed by set bits. */
+  if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
+    return 0;
+
+  /* Make sure that the rest are all clear. */
+  for (; i < nelts; i += elt_size)
+    if (INTVAL (builder.elt (i)) != 0)
+      return 0;
+
+  return vl;
+}
+
+/* See if there is an svpattern that encodes an SVE predicate of mode
+ PRED_MODE in which the first VL bits are set and the rest are clear.
+ Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
+ A VL of -1 indicates an all-true vector. */
+
+aarch64_svpattern
+aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
+{
+ if (vl < 0)
+ return AARCH64_SV_ALL;
+
+ if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
+ return AARCH64_NUM_SVPATTERNS;
+
+ if (vl >= 1 && vl <= 8)
+ return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
+
+ if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
+ return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
+
+ int max_vl;
+ if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
+ {
+ if (vl == (max_vl / 3) * 3)
+ return AARCH64_SV_MUL3;
+ /* These would only trigger for non-power-of-2 lengths. */
+ if (vl == (max_vl & -4))
+ return AARCH64_SV_MUL4;
+ if (vl == (1 << floor_log2 (max_vl)))
+ return AARCH64_SV_POW2;
+ if (vl == max_vl)
+ return AARCH64_SV_ALL;
+ }
+ return AARCH64_NUM_SVPATTERNS;
+}
+
/* Return an all-true predicate register of mode MODE. */
rtx
@@ -3447,6 +3617,17 @@ aarch64_expand_sve_const_vector (rtx tar
return target;
}
+/* Use WHILE to set predicate register DEST so that the first VL bits
+ are set and the rest are clear. */
+
+static void
+aarch64_sve_move_pred_via_while (rtx dest, unsigned int vl)
+{
+ rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
+ emit_insn (gen_while_ult (DImode, GET_MODE (dest),
+ dest, const0_rtx, limit));
+}
+
/* Set DEST to immediate IMM. */
void
@@ -3580,6 +3761,19 @@ aarch64_expand_mov_immediate (rtx dest,
return;
}
+ rtx_vector_builder builder;
+ if (GET_MODE_CLASS (GET_MODE (imm)) == MODE_VECTOR_BOOL
+ && aarch64_get_sve_pred_bits (builder, imm))
+ {
+ unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
+ int vl = aarch64_partial_ptrue_length (builder, elt_size);
+ if (vl > 0)
+ {
+ aarch64_sve_move_pred_via_while (dest, vl);
+ return;
+ }
+ }
+
if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
{
@@ -14776,6 +14970,44 @@ aarch64_sve_valid_immediate (unsigned HO
return false;
}
+/* Return true if X is a valid SVE predicate. If INFO is nonnull, use
+ it to describe valid immediates. */
+
+static bool
+aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
+{
+ if (x == CONST0_RTX (GET_MODE (x)))
+ {
+ if (info)
+ *info = simd_immediate_info (DImode, 0);
+ return true;
+ }
+
+ /* Analyze the value as a VNx16BImode. This should be relatively
+ efficient, since rtx_vector_builder has enough built-in capacity
+ to store all VLA predicate constants without needing the heap. */
+ rtx_vector_builder builder;
+ if (!aarch64_get_sve_pred_bits (builder, x))
+ return false;
+
+ unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
+ if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
+ {
+ machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
+ aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
+ if (pattern != AARCH64_NUM_SVPATTERNS)
+ {
+ if (info)
+ {
+ scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
+ *info = simd_immediate_info (int_mode, pattern);
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
/* Return true if OP is a valid SIMD immediate for the operation
described by WHICH. If INFO is nonnull, use it to describe valid
immediates. */
@@ -14788,6 +15020,9 @@ aarch64_simd_valid_immediate (rtx op, si
if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
return false;
+ if (vec_flags & VEC_SVE_PRED)
+ return aarch64_sve_pred_valid_immediate (op, info);
+
scalar_mode elt_mode = GET_MODE_INNER (mode);
rtx base, step;
unsigned int n_elts;
@@ -14812,21 +15047,6 @@ aarch64_simd_valid_immediate (rtx op, si
else
return false;
- /* Handle PFALSE and PTRUE. */
- if (vec_flags & VEC_SVE_PRED)
- {
- if (op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode))
- {
- if (info)
- {
- scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
- *info = simd_immediate_info (int_mode, op == CONSTM1_RTX (mode));
- }
- return true;
- }
- return false;
- }
-
scalar_float_mode elt_float_mode;
if (n_elts == 1
&& is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
@@ -16570,14 +16790,23 @@ aarch64_output_sve_mov_immediate (rtx co
if (aarch64_sve_pred_mode_p (vec_mode))
{
static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
- unsigned int total_bytes;
- if (info.u.mov.value == const0_rtx)
- snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
- else if (BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
- snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
- total_bytes / GET_MODE_SIZE (info.elt_mode));
+ if (info.insn == simd_immediate_info::MOV)
+ {
+ gcc_assert (info.u.mov.value == const0_rtx);
+ snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
+ }
else
- snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", element_char);
+ {
+ gcc_assert (info.insn == simd_immediate_info::PTRUE);
+ unsigned int total_bytes;
+ if (info.u.pattern == AARCH64_SV_ALL
+ && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
+ snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
+ total_bytes / GET_MODE_SIZE (info.elt_mode));
+ else
+ snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
+ svpattern_token (info.u.pattern));
+ }
return buf;
}
===================================================================
@@ -9,29 +9,30 @@ #define TEST_LOOP(TYPE) \
void \
multi_loop_##TYPE (TYPE *x, TYPE val) \
{ \
- for (int i = 0; i < 7; ++i) \
+ for (int i = 0; i < 9; ++i) \
x[i] += val; \
consumer (x); \
- for (int i = 0; i < 7; ++i) \
+ for (int i = 0; i < 9; ++i) \
x[i] += val; \
consumer (x); \
- for (int i = 0; i < 7; ++i) \
+ for (int i = 0; i < 9; ++i) \
x[i] += val; \
consumer (x); \
}
/* One iteration is enough. */
TEST_LOOP (uint8_t);
+/* Two iterations are enough.  The last two loops are specialized
+   according to whether the first loop executes once or twice. */
TEST_LOOP (uint16_t);
-/* Two iterations are enough. Complete unrolling makes sense
- even at -O2. */
+/* Three iterations are needed; ought to stay a loop. */
TEST_LOOP (uint32_t);
-/* Four iterations are needed; ought to stay a loop. */
+/* Five iterations are needed; ought to stay a loop. */
TEST_LOOP (uint64_t);
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.b} 3 } } */
-/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.h} 3 } } */
-/* { dg-final { scan-assembler {\twhilelo\tp[0-9]\.s} } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.h} 8 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.s} 6 } } */
/* { dg-final { scan-assembler-times {\twhilelo\tp[0-9]\.d} 6 } } */
/* { dg-final { scan-assembler-not {\tldr\tz[0-9]} } } */
/* { dg-final { scan-assembler-not {\tstr\tz[0-9]} } } */
===================================================================
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ vec_while_##TYPE (TYPE *restrict a) \
+ { \
+ for (int i = 0; i < 7; ++i) \
+ a[i] += 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (int16_t) \
+ T (int32_t) \
+ T (int64_t)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl7\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl7\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
===================================================================
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ vec_while_##TYPE (TYPE *restrict a) \
+ { \
+ for (int i = 0; i < 8; ++i) \
+ a[i] += 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (int16_t) \
+ T (int32_t) \
+ T (int64_t)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl8\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, vl8\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
===================================================================
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ vec_while_##TYPE (TYPE *restrict a) \
+ { \
+ for (int i = 0; i < 9; ++i) \
+ a[i] += 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (int16_t) \
+ T (int32_t) \
+ T (int64_t)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.b,} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
===================================================================
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=scalable" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ vec_while_##TYPE (TYPE *restrict a) \
+ { \
+ for (int i = 0; i < 16; ++i) \
+ a[i] += 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int8_t) \
+ T (int16_t) \
+ T (int32_t) \
+ T (int64_t)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, vl16\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.h,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.s,} 2 } } */
+/* { dg-final { scan-assembler-times {\twhilelo\tp[0-7]\.d,} 2 } } */
===================================================================
@@ -0,0 +1,25 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -msve-vector-bits=512" } */
+
+#include <stdint.h>
+
+#define ADD_LOOP(TYPE, COUNT) \
+ TYPE __attribute__ ((noinline, noclone)) \
+ vec_while_##TYPE (TYPE *restrict a) \
+ { \
+ for (int i = 0; i < COUNT; ++i) \
+ a[i] += 1; \
+ }
+
+#define TEST_ALL(T) \
+ T (int8_t, 63) \
+ T (int16_t, 30) \
+ T (int32_t, 15) \
+ T (int64_t, 6)
+
+TEST_ALL (ADD_LOOP)
+
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.b, mul3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.h, mul3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.s, mul3\n} 1 } } */
+/* { dg-final { scan-assembler-times {\tptrue\tp[0-7]\.d, vl6\n} 1 } } */