===================================================================
@@ -1487,6 +1487,10 @@ Target supports hardware vectors of @cod
@item vect_long_long
Target supports hardware vectors of @code{long long}.
+@item vect_check_ptrs
+Target supports the @code{check_raw_ptrs} and @code{check_war_ptrs}
+optabs on vectors.
+
@item vect_fully_masked
Target supports fully-masked (also known as fully-predicated) loops,
so that vector loops can handle partial as well as full vectors.
===================================================================
@@ -429,6 +429,9 @@ OPTAB_D (atomic_xor_optab, "atomic_xor$I
OPTAB_D (get_thread_pointer_optab, "get_thread_pointer$I$a")
OPTAB_D (set_thread_pointer_optab, "set_thread_pointer$I$a")
+OPTAB_D (check_raw_ptrs_optab, "check_raw_ptrs$a")
+OPTAB_D (check_war_ptrs_optab, "check_war_ptrs$a")
+
OPTAB_DC (vec_duplicate_optab, "vec_duplicate$a", VEC_DUPLICATE)
OPTAB_DC (vec_series_optab, "vec_series$a", VEC_SERIES)
OPTAB_D (vec_shl_insert_optab, "vec_shl_insert_$a")
===================================================================
@@ -5076,6 +5076,37 @@ for (i = 1; i < GET_MODE_NUNITS (@var{n}
operand0[i] = operand0[i - 1] && (operand1 + i < operand2);
@end smallexample
+@cindex @code{check_raw_ptrs@var{m}} instruction pattern
+@item @samp{check_raw_ptrs@var{m}}
+Check whether, given two pointers @var{a} and @var{b} and a length @var{len},
+a write of @var{len} bytes at @var{a} followed by a read of @var{len} bytes
+at @var{b} can be split into interleaved byte accesses
+@samp{@var{a}[0], @var{b}[0], @var{a}[1], @var{b}[1], @dots{}}
+without affecting the dependencies between the bytes. Set operand 0
+to true if the split is possible and false otherwise.
+
+Operands 1, 2 and 3 provide the values of @var{a}, @var{b} and @var{len}
+respectively. Operand 4 is a constant integer that provides the known
+common alignment of @var{a} and @var{b}. All inputs have mode @var{m}.
+
+This split is possible if:
+
+@smallexample
+@var{a} == @var{b} || @var{a} + @var{len} <= @var{b} || @var{b} + @var{len} <= @var{a}
+@end smallexample
+
+You should only define this pattern if the target has a way of accelerating
+the test without having to do the individual comparisons.
+
+@cindex @code{check_war_ptrs@var{m}} instruction pattern
+@item @samp{check_war_ptrs@var{m}}
+Like @samp{check_raw_ptrs@var{m}}, but with the read and write swapped round.
+The split is possible in this case if:
+
+@smallexample
+@var{b} <= @var{a} || @var{a} + @var{len} <= @var{b}
+@end smallexample
+
@cindex @code{vec_cmp@var{m}@var{n}} instruction pattern
@item @samp{vec_cmp@var{m}@var{n}}
Output a vector comparison. Operand 0 of mode @var{n} is the destination for
===================================================================
@@ -63,6 +63,7 @@ along with GCC; see the file COPYING3.
- cond_ternary: a conditional ternary optab, such as cond_fma_rev<mode>
- fold_left: for scalar = FN (scalar, vector), keyed off the vector mode
+ - check_ptrs: used for check_{raw,war}_ptrs
DEF_INTERNAL_SIGNED_OPTAB_FN defines an internal function that
maps to one of two optabs, depending on the signedness of an input.
@@ -136,6 +137,10 @@ DEF_INTERNAL_OPTAB_FN (MASK_STORE_LANES,
vec_mask_store_lanes, mask_store_lanes)
DEF_INTERNAL_OPTAB_FN (WHILE_ULT, ECF_CONST | ECF_NOTHROW, while_ult, while)
+DEF_INTERNAL_OPTAB_FN (CHECK_RAW_PTRS, ECF_CONST | ECF_NOTHROW,
+ check_raw_ptrs, check_ptrs)
+DEF_INTERNAL_OPTAB_FN (CHECK_WAR_PTRS, ECF_CONST | ECF_NOTHROW,
+ check_war_ptrs, check_ptrs)
DEF_INTERNAL_OPTAB_FN (VEC_SHL_INSERT, ECF_CONST | ECF_NOTHROW,
vec_shl_insert, binary)
===================================================================
@@ -221,6 +221,8 @@ extern int internal_fn_mask_index (inter
extern int internal_fn_stored_value_index (internal_fn);
extern bool internal_gather_scatter_fn_supported_p (internal_fn, tree,
tree, tree, int);
+extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
+ poly_uint64, unsigned int);
extern void expand_internal_call (gcall *);
extern void expand_internal_call (internal_fn, gcall *);
===================================================================
@@ -118,6 +118,7 @@ #define while_direct { 0, 2, false }
#define fold_extract_direct { 2, 2, false }
#define fold_left_direct { 1, 1, false }
#define mask_fold_left_direct { 1, 1, false }
+#define check_ptrs_direct { 0, 0, false }
const direct_internal_fn_info direct_internal_fn_array[IFN_LAST + 1] = {
#define DEF_INTERNAL_FN(CODE, FLAGS, FNSPEC) not_direct,
@@ -3006,6 +3007,9 @@ #define expand_fold_left_optab_fn(FN, ST
#define expand_mask_fold_left_optab_fn(FN, STMT, OPTAB) \
expand_direct_optab_fn (FN, STMT, OPTAB, 3)
+#define expand_check_ptrs_optab_fn(FN, STMT, OPTAB) \
+ expand_direct_optab_fn (FN, STMT, OPTAB, 4)
+
/* RETURN_TYPE and ARGS are a return type and argument list that are
in principle compatible with FN (which satisfies direct_internal_fn_p).
Return the types that should be used to determine whether the
@@ -3095,6 +3099,7 @@ #define direct_while_optab_supported_p c
#define direct_fold_extract_optab_supported_p direct_optab_supported_p
#define direct_fold_left_optab_supported_p direct_optab_supported_p
#define direct_mask_fold_left_optab_supported_p direct_optab_supported_p
+#define direct_check_ptrs_optab_supported_p direct_optab_supported_p
/* Return the optab used by internal function FN. */
@@ -3572,6 +3577,24 @@ internal_gather_scatter_fn_supported_p (
&& insn_operand_matches (icode, 3 + output_ops, GEN_INT (scale)));
}
+/* Return true if the target supports IFN_CHECK_{RAW,WAR}_PTRS function IFN
+ for pointers of type TYPE when the accesses have LENGTH bytes and their
+ common byte alignment is ALIGN. */
+
+bool
+internal_check_ptrs_fn_supported_p (internal_fn ifn, tree type,
+ poly_uint64 length, unsigned int align)
+{
+ machine_mode mode = TYPE_MODE (type);
+ optab optab = direct_internal_fn_optab (ifn);
+ insn_code icode = direct_optab_handler (optab, mode);
+ if (icode == CODE_FOR_nothing)
+ return false;
+ rtx length_rtx = immed_wide_int_const (length, mode);
+ return (insn_operand_matches (icode, 3, length_rtx)
+ && insn_operand_matches (icode, 4, GEN_INT (align)));
+}
+
/* Expand STMT as though it were a call to internal function FN. */
void
===================================================================
@@ -96,6 +96,7 @@ Software Foundation; either version 3, o
#include "builtins.h"
#include "tree-eh.h"
#include "ssa.h"
+#include "internal-fn.h"
static struct datadep_stats
{
@@ -1719,6 +1720,80 @@ prune_runtime_alias_test_list (vec<dr_wi
}
}
+/* A subroutine of create_intersect_range_checks, with a subset of the
+ same arguments. Try to use IFN_CHECK_RAW_PTRS and IFN_CHECK_WAR_PTRS
+ to optimize cases in which the references form a simple RAW, WAR or
+ WAW dependence. */
+
+static bool
+create_ifn_alias_checks (tree *cond_expr,
+ const dr_with_seg_len_pair_t &alias_pair)
+{
+ const dr_with_seg_len& dr_a = alias_pair.first;
+ const dr_with_seg_len& dr_b = alias_pair.second;
+
+ /* Check for cases in which:
+
+ (a) we have a known RAW, WAR or WAW dependence
+ (b) the accesses are well-ordered in both the original and new code
+ (see the comment above the DR_ALIAS_* flags for details); and
+ (c) the DR_STEPs describe all access pairs covered by ALIAS_PAIR. */
+ if (alias_pair.flags & ~(DR_ALIAS_RAW | DR_ALIAS_WAR | DR_ALIAS_WAW))
+ return false;
+
+ /* Make sure that both DRs access the same pattern of bytes,
+ with a constant length and step. */
+ poly_uint64 seg_len;
+ if (!operand_equal_p (dr_a.seg_len, dr_b.seg_len, 0)
+ || !poly_int_tree_p (dr_a.seg_len, &seg_len)
+ || maybe_ne (dr_a.access_size, dr_b.access_size)
+ || !operand_equal_p (DR_STEP (dr_a.dr), DR_STEP (dr_b.dr), 0)
+ || !tree_fits_uhwi_p (DR_STEP (dr_a.dr)))
+ return false;
+
+ unsigned HOST_WIDE_INT bytes = tree_to_uhwi (DR_STEP (dr_a.dr));
+ tree addr_a = DR_BASE_ADDRESS (dr_a.dr);
+ tree addr_b = DR_BASE_ADDRESS (dr_b.dr);
+
+ /* See whether the target supports what we want to do. WAW checks are
+ equivalent to WAR checks here. */
+ internal_fn ifn = (alias_pair.flags & DR_ALIAS_RAW
+ ? IFN_CHECK_RAW_PTRS
+ : IFN_CHECK_WAR_PTRS);
+ unsigned int align = MIN (dr_a.align, dr_b.align);
+ poly_uint64 full_length = seg_len + bytes;
+ if (!internal_check_ptrs_fn_supported_p (ifn, TREE_TYPE (addr_a),
+ full_length, align))
+ {
+ full_length = seg_len + dr_a.access_size;
+ if (!internal_check_ptrs_fn_supported_p (ifn, TREE_TYPE (addr_a),
+ full_length, align))
+ return false;
+ }
+
+ /* Commit to using this form of test. */
+ addr_a = fold_build_pointer_plus (addr_a, DR_OFFSET (dr_a.dr));
+ addr_a = fold_build_pointer_plus (addr_a, DR_INIT (dr_a.dr));
+
+ addr_b = fold_build_pointer_plus (addr_b, DR_OFFSET (dr_b.dr));
+ addr_b = fold_build_pointer_plus (addr_b, DR_INIT (dr_b.dr));
+
+ *cond_expr = build_call_expr_internal_loc (UNKNOWN_LOCATION,
+ ifn, boolean_type_node,
+ 4, addr_a, addr_b,
+ size_int (full_length),
+ size_int (align));
+
+ if (dump_enabled_p ())
+ {
+ if (ifn == IFN_CHECK_RAW_PTRS)
+ dump_printf (MSG_NOTE, "using an IFN_CHECK_RAW_PTRS test\n");
+ else
+ dump_printf (MSG_NOTE, "using an IFN_CHECK_WAR_PTRS test\n");
+ }
+ return true;
+}
+
/* Try to generate a runtime condition that is true if ALIAS_PAIR is
free of aliases, using a condition based on index values instead
of a condition based on addresses. Return true on success,
@@ -2240,6 +2315,9 @@ create_intersect_range_checks (class loo
if (create_intersect_range_checks_index (loop, cond_expr, alias_pair))
return;
+ if (create_ifn_alias_checks (cond_expr, alias_pair))
+ return;
+
if (create_waw_or_war_checks (cond_expr, alias_pair))
return;
===================================================================
@@ -2077,6 +2077,8 @@ (define_int_iterator SVE_CFP_TERNARY_LAN
(define_int_iterator SVE_WHILE [UNSPEC_WHILE_LE UNSPEC_WHILE_LO
UNSPEC_WHILE_LS UNSPEC_WHILE_LT])
+(define_int_iterator SVE2_WHILE_PTR [UNSPEC_WHILERW UNSPEC_WHILEWR])
+
(define_int_iterator SVE_SHIFT_WIDE [UNSPEC_ASHIFT_WIDE
UNSPEC_ASHIFTRT_WIDE
UNSPEC_LSHIFTRT_WIDE])
@@ -2157,6 +2159,8 @@ (define_int_attr optab [(UNSPEC_ANDF "an
(UNSPEC_FEXPA "fexpa")
(UNSPEC_FTSMUL "ftsmul")
(UNSPEC_FTSSEL "ftssel")
+ (UNSPEC_WHILERW "vec_check_raw_alias")
+ (UNSPEC_WHILEWR "vec_check_war_alias")
(UNSPEC_COND_FABS "abs")
(UNSPEC_COND_FADD "add")
(UNSPEC_COND_FCADD90 "cadd90")
@@ -2480,13 +2484,18 @@ (define_int_attr cmp_op [(UNSPEC_COND_CM
(UNSPEC_WHILE_LE "le")
(UNSPEC_WHILE_LO "lo")
(UNSPEC_WHILE_LS "ls")
- (UNSPEC_WHILE_LT "lt")])
+ (UNSPEC_WHILE_LT "lt")
+ (UNSPEC_WHILERW "rw")
+ (UNSPEC_WHILEWR "wr")])
(define_int_attr while_optab_cmp [(UNSPEC_WHILE_LE "le")
(UNSPEC_WHILE_LO "ult")
(UNSPEC_WHILE_LS "ule")
(UNSPEC_WHILE_LT "lt")])
+(define_int_attr raw_war [(UNSPEC_WHILERW "raw")
+ (UNSPEC_WHILEWR "war")])
+
(define_int_attr brk_op [(UNSPEC_BRKA "a") (UNSPEC_BRKB "b")
(UNSPEC_BRKN "n")
(UNSPEC_BRKPA "pa") (UNSPEC_BRKPB "pb")])
@@ -2630,3 +2639,6 @@ (define_int_attr min_elem_bits [(UNSPEC_
(UNSPEC_REVB "16")
(UNSPEC_REVH "32")
(UNSPEC_REVW "64")])
+
+(define_int_attr unspec [(UNSPEC_WHILERW "UNSPEC_WHILERW")
+ (UNSPEC_WHILEWR "UNSPEC_WHILEWR")])
===================================================================
@@ -245,6 +245,8 @@ (define_c_enum "unspec" [
UNSPEC_WHILE_LO
UNSPEC_WHILE_LS
UNSPEC_WHILE_LT
+ UNSPEC_WHILERW
+ UNSPEC_WHILEWR
UNSPEC_LDN
UNSPEC_STN
UNSPEC_INSR
===================================================================
@@ -869,3 +869,8 @@ (define_special_predicate "aarch64_any_r
(define_predicate "aarch64_sve_any_binary_operator"
(match_code "plus,minus,mult,div,udiv,smax,umax,smin,umin,and,ior,xor"))
+
+(define_predicate "aarch64_bytes_per_sve_vector_operand"
+ (and (match_code "const_int,const_poly_int")
+ (match_test "known_eq (wi::to_poly_wide (op, mode),
+ BYTES_PER_SVE_VECTOR)")))
===================================================================
@@ -331,3 +331,61 @@ (define_insn_and_rewrite "*aarch64_sve2_
}
[(set_attr "movprfx" "*,yes")]
)
+
+;; Use WHILERW and WHILEWR to accelerate alias checks. This is only
+;; possible if the accesses we're checking are exactly the same size
+;; as an SVE vector.
+(define_expand "check_<raw_war>_ptrs<mode>"
+ [(match_operand:GPI 0 "register_operand")
+ (unspec:VNx16BI
+ [(match_operand:GPI 1 "register_operand")
+ (match_operand:GPI 2 "register_operand")
+ (match_operand:GPI 3 "aarch64_bytes_per_sve_vector_operand")
+ (match_operand:GPI 4 "const_int_operand")]
+ SVE2_WHILE_PTR)]
+ "TARGET_SVE2"
+{
+ /* Use the widest predicate mode we can. */
+ unsigned int align = INTVAL (operands[4]);
+ if (align > 8)
+ align = 8;
+ machine_mode pred_mode = aarch64_sve_pred_mode (align).require ();
+
+ /* Emit a WHILERW or WHILEWR, setting the condition codes based on
+ the result. */
+ emit_insn (gen_aarch64_sve2_while_ptest
+ (<SVE2_WHILE_PTR:unspec>, <MODE>mode, pred_mode,
+ gen_rtx_SCRATCH (pred_mode), operands[1], operands[2],
+ CONSTM1_RTX (VNx16BImode), CONSTM1_RTX (pred_mode)));
+
+ /* Set operand 0 to true if the last bit of the predicate result is set,
+ i.e. if all elements are free of dependencies. */
+ rtx cc_reg = gen_rtx_REG (CC_NZCmode, CC_REGNUM);
+ rtx cmp = gen_rtx_LTU (<MODE>mode, cc_reg, const0_rtx);
+ emit_insn (gen_aarch64_cstore<mode> (operands[0], cmp, cc_reg));
+ DONE;
+})
+
+;; A WHILERW or WHILEWR in which only the flags result is interesting.
+(define_insn_and_rewrite "@aarch64_sve2_while<cmp_op><GPI:mode><PRED_ALL:mode>_ptest"
+ [(set (reg:CC_NZC CC_REGNUM)
+ (unspec:CC_NZC
+ [(match_operand 3)
+ (match_operand 4)
+ (const_int SVE_KNOWN_PTRUE)
+ (unspec:PRED_ALL
+ [(match_operand:GPI 1 "register_operand" "r")
+ (match_operand:GPI 2 "register_operand" "r")]
+ SVE2_WHILE_PTR)]
+ UNSPEC_PTEST))
+ (clobber (match_scratch:PRED_ALL 0 "=Upa"))]
+ "TARGET_SVE2"
+ "while<cmp_op>\t%0.<PRED_ALL:Vetype>, %x1, %x2"
+ ;; Force the compiler to drop the unused predicate operand, so that we
+ ;; don't have an unnecessary PTRUE.
+ "&& (!CONSTANT_P (operands[3]) || !CONSTANT_P (operands[4]))"
+ {
+ operands[3] = CONSTM1_RTX (VNx16BImode);
+ operands[4] = CONSTM1_RTX (<PRED_ALL:MODE>mode);
+ }
+)
===================================================================
@@ -6459,6 +6459,13 @@ proc check_effective_target_vect_natural
return $et_vect_natural_alignment
}
+# Return true if the target supports the check_raw_ptrs and check_war_ptrs
+# optabs on vectors.
+
+proc check_effective_target_vect_check_ptrs { } {
+ return [check_effective_target_aarch64_sve2]
+}
+
# Return true if fully-masked loops are supported.
proc check_effective_target_vect_fully_masked { } {
===================================================================
@@ -60,5 +60,6 @@ main (void)
/* { dg-final { scan-tree-dump {flags: *WAR\n} "vect" { target vect_int } } } */
/* { dg-final { scan-tree-dump-not {flags: [^\n]*ARBITRARY\n} "vect" } } */
-/* { dg-final { scan-tree-dump "using an address-based WAR/WAW test" "vect" } } */
+/* { dg-final { scan-tree-dump "using an address-based WAR/WAW test" "vect" { target { ! vect_check_ptrs } } } } */
+/* { dg-final { scan-tree-dump "using an IFN_CHECK_WAR_PTRS test" "vect" { target vect_check_ptrs } } } */
/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */
===================================================================
@@ -57,5 +57,6 @@ main (void)
}
/* { dg-final { scan-tree-dump {flags: *WAW\n} "vect" { target vect_int } } } */
-/* { dg-final { scan-tree-dump "using an address-based WAR/WAW test" "vect" } } */
+/* { dg-final { scan-tree-dump "using an address-based WAR/WAW test" "vect" { target { ! vect_check_ptrs } } } } */
+/* { dg-final { scan-tree-dump "using an IFN_CHECK_WAR_PTRS test" "vect" { target vect_check_ptrs } } } */
/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */
===================================================================
@@ -62,5 +62,6 @@ main (void)
}
/* { dg-final { scan-tree-dump {flags: *RAW\n} "vect" { target vect_int } } } */
-/* { dg-final { scan-tree-dump "using an address-based overlap test" "vect" } } */
+/* { dg-final { scan-tree-dump "using an address-based overlap test" "vect" { target { ! vect_check_ptrs } } } } */
+/* { dg-final { scan-tree-dump "using an IFN_CHECK_RAW_PTRS test" "vect" { target vect_check_ptrs } } } */
/* { dg-final { scan-tree-dump-not "using an index-based" "vect" } } */
===================================================================
@@ -0,0 +1,30 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-require-effective-target lp64 } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE) \
+ TYPE \
+ test_##TYPE (TYPE *dst, TYPE *src, int n) \
+ { \
+ TYPE res = 0; \
+ for (int i = 0; i < n; ++i) \
+ { \
+ dst[i] += 1; \
+ res += src[i]; \
+ } \
+ return res; \
+ }
+
+TEST_LOOP (int8_t);
+TEST_LOOP (int16_t);
+TEST_LOOP (int32_t);
+TEST_LOOP (int64_t);
+
+/* { dg-final { scan-assembler-times {\twhilerw\t} 4 } } */
+/* { dg-final { scan-assembler-times {\twhilerw\tp[0-9]+\.b, x0, x1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilerw\tp[0-9]+\.h, x0, x1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilerw\tp[0-9]+\.s, x0, x1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilerw\tp[0-9]+\.d, x[0-9]+, x1\n} 1 } } */
+/* { dg-final { scan-assembler-not {\twhilewr\t} } } */
===================================================================
@@ -0,0 +1,29 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize" } */
+/* { dg-require-effective-target lp64 } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE) \
+ void \
+ test_##TYPE (TYPE *dst, TYPE *src1, TYPE *src2, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ dst[i] = src1[i] + src2[i]; \
+ }
+
+TEST_LOOP (int8_t);
+TEST_LOOP (int16_t);
+TEST_LOOP (int32_t);
+TEST_LOOP (int64_t);
+
+/* { dg-final { scan-assembler-times {\twhilewr\t} 8 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.b, x1, x0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.b, x2, x0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.h, x1, x0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.h, x2, x0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.s, x1, x0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.s, x2, x0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.d, x1, x0\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.d, x2, x0\n} 1 } } */
+/* { dg-final { scan-assembler-not {\twhilerw\t} } } */
===================================================================
@@ -0,0 +1,37 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-vectorize -fno-tree-loop-distribute-patterns" } */
+/* { dg-require-effective-target lp64 } */
+
+#include <stdint.h>
+
+#define TEST_LOOP(TYPE) \
+ void \
+ test_##TYPE (TYPE *dst1, TYPE *dst2, TYPE *dst3, int n) \
+ { \
+ for (int i = 0; i < n; ++i) \
+ { \
+ dst1[i] = 1; \
+ dst2[i] = 2; \
+ dst3[i] = 3; \
+ } \
+ }
+
+TEST_LOOP (int8_t);
+TEST_LOOP (int16_t);
+TEST_LOOP (int32_t);
+TEST_LOOP (int64_t);
+
+/* { dg-final { scan-assembler-times {\twhilewr\t} 12 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.b, x0, x1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.b, x0, x2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.b, x1, x2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.h, x0, x1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.h, x0, x2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.h, x1, x2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.s, x0, x1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.s, x0, x2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.s, x1, x2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.d, x0, x1\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.d, x0, x2\n} 1 } } */
+/* { dg-final { scan-assembler-times {\twhilewr\tp[0-9]+\.d, x1, x2\n} 1 } } */
+/* { dg-final { scan-assembler-not {\twhilerw\t} } } */