@@ -197,6 +197,12 @@ rtl_opt_pass * make_pass_shorten_memrefs (gcc::context *ctxt);
rtl_opt_pass * make_pass_avlprop (gcc::context *ctxt);
rtl_opt_pass * make_pass_vsetvl (gcc::context *ctxt);
+/* Routines implemented in riscv-vsetvl.cc. */
+extern bool has_vtype_op (rtx_insn *);
+extern bool mask_agnostic_p (rtx_insn *);
+extern rtx get_avl (rtx_insn *);
+extern bool vsetvl_insn_p (rtx_insn *);
+
/* Routines implemented in riscv-string.c. */
extern bool riscv_expand_block_compare (rtx, rtx, rtx, rtx);
extern bool riscv_expand_block_move (rtx, rtx, rtx);
@@ -259,7 +259,7 @@ policy_to_str (bool agnostic_p)
/* Return true if it is an RVV instruction that depends on the VTYPE global
   status register. */
-static bool
+bool
has_vtype_op (rtx_insn *rinsn)
{
return recog_memoized (rinsn) >= 0 && get_attr_has_vtype_op (rinsn);
@@ -307,7 +307,7 @@ vector_config_insn_p (rtx_insn *rinsn)
}
/* Return true if it is vsetvldi or vsetvlsi. */
-static bool
+bool
vsetvl_insn_p (rtx_insn *rinsn)
{
if (!rinsn || !vector_config_insn_p (rinsn))
@@ -387,7 +387,7 @@ get_vl (rtx_insn *rinsn)
}
/* Helper function to get AVL operand. */
-static rtx
+rtx
get_avl (rtx_insn *rinsn)
{
if (vsetvl_insn_p (rinsn) || vsetvl_discard_result_insn_p (rinsn))
@@ -412,7 +412,7 @@ get_default_ma ()
}
/* Helper function to get MA operand. */
-static bool
+bool
mask_agnostic_p (rtx_insn *rinsn)
{
/* If it doesn't have MA, we return agnostic by default. */
@@ -9612,6 +9612,71 @@ riscv_issue_rate (void)
return tune_param->issue_rate;
}
+/* Structure for very basic vector configuration tracking in the scheduler. */
+struct last_vconfig
+{
+ bool valid;    /* True when the fields below describe a known configuration. */
+ bool ta;       /* Tail agnostic. */
+ bool ma;       /* Mask agnostic. */
+ uint8_t sew;   /* Selected element width. */
+ uint8_t vlmul; /* Vector register group multiplier (LMUL). */
+ rtx avl;       /* Application vector length; a CONST_INT, with VLMAX stored as const0_rtx. */
+} last_vconfig;
+
+/* Clear LAST_VCONFIG so we have no known state. */
+static void
+clear_vconfig (void)
+{
+ memset (&last_vconfig, 0, sizeof (last_vconfig));
+}
+
+/* Return TRUE if INSN is a vector insn whose required vector
+ configuration is trivially identical to that of the last vector
+ insn issued. Return FALSE otherwise. */
+static bool
+compatible_with_last_vconfig (rtx_insn *insn)
+{
+ /* We might be able to extract the configuration from a vsetvl itself; for now treat it as incompatible. */
+ if (vsetvl_insn_p (insn))
+ return false;
+
+ /* Nothing to do for these cases. */
+ if (!NONDEBUG_INSN_P (insn) || !has_vtype_op (insn))
+ return false;
+
+ extract_insn_cached (insn);
+
+ rtx avl = get_avl (insn);
+ if (avl != last_vconfig.avl)
+ return false;
+
+ if (get_sew (insn) != last_vconfig.sew)
+ return false;
+
+ if (get_vlmul (insn) != last_vconfig.vlmul)
+ return false;
+
+ if (tail_agnostic_p (insn) != last_vconfig.ta)
+ return false;
+
+ if (mask_agnostic_p (insn) != last_vconfig.ma)
+ return false;
+
+ /* No differences found, they're trivially compatible. */
+ return true;
+}
+
+/* Implement TARGET_SCHED_INIT. We use this to track the vector configuration
+ of the last issued vector instruction. We can then use that information
+ to potentially adjust the ready queue so that we issue instructions with a
+ compatible vector configuration instead of a conflicting one. That will
+ reduce the number of vsetvl instructions we ultimately emit. */
+static void
+riscv_sched_init (FILE *, int, int)
+{
+ clear_vconfig ();
+}
+
/* Implement TARGET_SCHED_VARIABLE_ISSUE. */
static int
riscv_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
@@ -9636,9 +9701,85 @@ riscv_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
an assert so we can find and fix this problem. */
gcc_assert (insn_has_dfa_reservation_p (insn));
+ /* If this is a vector insn with vl/vtype info, record its configuration;
+ an explicit vsetvl invalidates whatever state we were tracking. */
+ if (vsetvl_insn_p (insn))
+ clear_vconfig ();
+ else if (NONDEBUG_INSN_P (insn) && has_vtype_op (insn))
+ {
+ extract_insn_cached (insn);
+
+ rtx avl = get_avl (insn);
+ if (avl == RVV_VLMAX)
+ avl = const0_rtx;
+
+ if (!avl || !CONST_INT_P (avl))
+ clear_vconfig ();
+ else
+ {
+ last_vconfig.valid = true;
+ last_vconfig.avl = avl;
+ last_vconfig.sew = get_sew (insn);
+ last_vconfig.vlmul = get_vlmul (insn);
+ last_vconfig.ta = tail_agnostic_p (insn);
+ last_vconfig.ma = mask_agnostic_p (insn);
+ }
+ }
return more - 1;
}
+/* Implement TARGET_SCHED_REORDER. The goal here is to look at the ready
+ queue and reorder it ever so slightly to encourage issuing an insn with
+ the same vector configuration as the most recently issued vector
+ instruction. That will reduce the number of vsetvl instructions we need. */
+static int
+riscv_sched_reorder (FILE *, int, rtx_insn **ready, int *nreadyp, int)
+{
+ /* If we don't have a valid prior vector configuration, then there is
+ no point in reordering the ready queue; the same is true if there is
+ just one entry in the queue. */
+ if (!last_vconfig.valid || *nreadyp == 1)
+ return riscv_issue_rate ();
+
+ int nready = *nreadyp;
+ int priority = INSN_PRIORITY (ready[nready - 1]);
+ for (int i = nready - 1; i >= 0; i--)
+ {
+ rtx_insn *insn = ready[i];
+
+ /* On a high performance core, vsetvl instructions should be
+ inexpensive. Removing them is very much a secondary concern, so
+ be extremely conservative with reordering, essentially only
+ allowing reordering within the highest priority value.
+
+ Lower end cores may benefit from more flexibility here. That
+ tuning is left to those who understand their core's behavior
+ and can thoroughly benchmark the result. Assuming such
+ designs appear, we can probably put an entry in the tuning
+ structure to indicate how much difference in priority to allow. */
+ if (INSN_PRIORITY (insn) < priority)
+ break;
+
+ if (compatible_with_last_vconfig (insn))
+ {
+ /* This entry is compatible with the last vconfig and has
+ the same priority as the most important insn. So swap
+ it into the issue slot so that we keep the vector
+ configuration as-is and ultimately eliminate a vsetvl.
+
+ Note there is no need to swap if this entry is already
+ in the issue slot. */
+ if (i == nready - 1)
+ break;
+
+ std::swap (ready[i], ready[nready - 1]);
+ break;
+ }
+ }
+
+ return riscv_issue_rate ();
+}
+
/* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
instruction fusion of some sort. */
@@ -12610,9 +12751,15 @@ riscv_stack_clash_protection_alloca_probe_range (void)
#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P riscv_macro_fusion_pair_p
+#undef TARGET_SCHED_INIT
+#define TARGET_SCHED_INIT riscv_sched_init
+
#undef TARGET_SCHED_VARIABLE_ISSUE
#define TARGET_SCHED_VARIABLE_ISSUE riscv_sched_variable_issue
+#undef TARGET_SCHED_REORDER
+#define TARGET_SCHED_REORDER riscv_sched_reorder
+
#undef TARGET_SCHED_ADJUST_COST
#define TARGET_SCHED_ADJUST_COST riscv_sched_adjust_cost
@@ -11,5 +11,5 @@ void foo(_Float16 y, int64_t *i64p)
asm volatile ("# use %0 %1" : : "vr"(vx), "vr" (vy));
}
-/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*1,\s*e16,\s*mf4,\s*t[au],\s*m[au]} 1 } } */
-/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*zero,\s*e64,\s*m1,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*1,\s*e16,\s*m1,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*1,\s*e64,\s*m1,\s*t[au],\s*m[au]} 1 } } */
@@ -12,5 +12,5 @@ void foo(_Float16 y, int16_t z, int64_t *i64p)
asm volatile ("# use %0 %1" : : "vr"(vx), "vr" (vy), "vr" (vz));
}
-/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*1,\s*e16,\s*mf4,\s*t[au],\s*m[au]} 1 } } */
-/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*zero,\s*e64,\s*m1,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*1,\s*e64,\s*m1,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*1,\s*e16,\s*m1,\s*t[au],\s*m[au]} 1 } } */
@@ -11,5 +11,5 @@ void foo(_Float16 y, int64_t *i64p)
asm volatile ("# use %0 %1" : : "vr"(vx), "vr" (vy));
}
-/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*1,\s*e16,\s*mf4,\s*t[au],\s*m[au]} 1 } } */
-/* { dg-final { scan-assembler-times {vsetvli\s+zero,\s*zero,\s*e64,\s*m1,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*1,\s*e64,\s*m1,\s*t[au],\s*m[au]} 1 } } */
+/* { dg-final { scan-assembler-times {vsetivli\s+zero,\s*1,\s*e16,\s*m1,\s*t[au],\s*m[au]} 1 } } */
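
The bodies of the tests above fall outside the hunks shown. Purely for orientation, a hypothetical body consistent with the foo signature and asm statement in the context lines might look like the sketch below; the intrinsic calls are an assumption and are not taken from the actual test files.

#include <stdint.h>
#include <riscv_vector.h>

void foo(_Float16 y, int64_t *i64p)
{
  /* Hypothetical reconstruction: one e16 operation and one e64 operation,
     each with an AVL of 1, which is the mix the vsetvli scans check for.  */
  vfloat16mf4_t vx = __riscv_vfmv_v_f_f16mf4 (y, 1);
  vint64m1_t vy = __riscv_vle64_v_i64m1 (i64p, 1);
  asm volatile ("# use %0 %1" : : "vr"(vx), "vr" (vy));
}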