@@ -286,6 +286,7 @@ struct riscv_tune_param
unsigned short memory_cost;
unsigned short fmv_cost;
bool slow_unaligned_access;
+ bool overlap_op_by_pieces;
bool use_divmod_expansion;
unsigned int fusible_ops;
const struct cpu_vector_cost *vec_costs;
@@ -425,6 +426,7 @@ static const struct riscv_tune_param rocket_tune_info = {
5, /* memory_cost */
8, /* fmv_cost */
true, /* slow_unaligned_access */
+ false, /* overlap_op_by_pieces */
false, /* use_divmod_expansion */
RISCV_FUSE_NOTHING, /* fusible_ops */
NULL, /* vector cost */
@@ -442,6 +444,7 @@ static const struct riscv_tune_param sifive_7_tune_info = {
3, /* memory_cost */
8, /* fmv_cost */
true, /* slow_unaligned_access */
+ false, /* overlap_op_by_pieces */
false, /* use_divmod_expansion */
RISCV_FUSE_NOTHING, /* fusible_ops */
NULL, /* vector cost */
@@ -459,6 +462,7 @@ static const struct riscv_tune_param sifive_p400_tune_info = {
3, /* memory_cost */
4, /* fmv_cost */
true, /* slow_unaligned_access */
+ false, /* overlap_op_by_pieces */
false, /* use_divmod_expansion */
RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI, /* fusible_ops */
&generic_vector_cost, /* vector cost */
@@ -476,6 +480,7 @@ static const struct riscv_tune_param sifive_p600_tune_info = {
3, /* memory_cost */
4, /* fmv_cost */
true, /* slow_unaligned_access */
+ false, /* overlap_op_by_pieces */
false, /* use_divmod_expansion */
RISCV_FUSE_LUI_ADDI | RISCV_FUSE_AUIPC_ADDI, /* fusible_ops */
&generic_vector_cost, /* vector cost */
@@ -493,6 +498,7 @@ static const struct riscv_tune_param thead_c906_tune_info = {
5, /* memory_cost */
8, /* fmv_cost */
false, /* slow_unaligned_access */
+ false, /* overlap_op_by_pieces */
false, /* use_divmod_expansion */
RISCV_FUSE_NOTHING, /* fusible_ops */
NULL, /* vector cost */
@@ -510,6 +516,7 @@ static const struct riscv_tune_param xiangshan_nanhu_tune_info = {
3, /* memory_cost */
3, /* fmv_cost */
true, /* slow_unaligned_access */
+ false, /* overlap_op_by_pieces */
false, /* use_divmod_expansion */
RISCV_FUSE_ZEXTW | RISCV_FUSE_ZEXTH, /* fusible_ops */
NULL, /* vector cost */
@@ -527,6 +534,7 @@ static const struct riscv_tune_param generic_ooo_tune_info = {
4, /* memory_cost */
4, /* fmv_cost */
false, /* slow_unaligned_access */
+ true, /* overlap_op_by_pieces */
false, /* use_divmod_expansion */
RISCV_FUSE_NOTHING, /* fusible_ops */
&generic_vector_cost, /* vector cost */
@@ -544,6 +552,7 @@ static const struct riscv_tune_param optimize_size_tune_info = {
2, /* memory_cost */
8, /* fmv_cost */
false, /* slow_unaligned_access */
+ false, /* overlap_op_by_pieces */
false, /* use_divmod_expansion */
RISCV_FUSE_NOTHING, /* fusible_ops */
NULL, /* vector cost */
@@ -9923,6 +9932,14 @@ riscv_slow_unaligned_access (machine_mode, unsigned int)
return riscv_slow_unaligned_access_p;
}
+/* Implement TARGET_OVERLAP_OP_BY_PIECES_P: return tune_param's overlap_op_by_pieces flag. */
+
+static bool
+riscv_overlap_op_by_pieces (void)
+{
+ return tune_param->overlap_op_by_pieces;
+}
+
/* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
static bool
@@ -11340,6 +11357,9 @@ riscv_get_raw_result_mode (int regno)
#undef TARGET_SLOW_UNALIGNED_ACCESS
#define TARGET_SLOW_UNALIGNED_ACCESS riscv_slow_unaligned_access
+#undef TARGET_OVERLAP_OP_BY_PIECES_P
+#define TARGET_OVERLAP_OP_BY_PIECES_P riscv_overlap_op_by_pieces
+
#undef TARGET_SECONDARY_MEMORY_NEEDED
#define TARGET_SECONDARY_MEMORY_NEEDED riscv_secondary_memory_needed
@@ -24,9 +24,8 @@ void copy_aligned_##N (void *to, void *from) \
** ...
** lw\t[at][0-9],0\([at][0-9]\)
** sw\t[at][0-9],0\([at][0-9]\)
-** ...
-** lbu\t[at][0-9],6\([at][0-9]\)
-** sb\t[at][0-9],6\([at][0-9]\)
+** lw\t[at][0-9],3\([at][0-9]\)
+** sw\t[at][0-9],3\([at][0-9]\)
** ...
*/
COPY_N(7)
@@ -36,9 +35,8 @@ COPY_N(7)
** ...
** lw\t[at][0-9],0\([at][0-9]\)
** sw\t[at][0-9],0\([at][0-9]\)
-** ...
-** lbu\t[at][0-9],6\([at][0-9]\)
-** sb\t[at][0-9],6\([at][0-9]\)
+** lw\t[at][0-9],3\([at][0-9]\)
+** sw\t[at][0-9],3\([at][0-9]\)
** ...
*/
COPY_ALIGNED_N(7)
@@ -66,11 +64,10 @@ COPY_ALIGNED_N(8)
** ...
** ...
** lw\t[at][0-9],0\([at][0-9]\)
-** ...
** sw\t[at][0-9],0\([at][0-9]\)
** ...
-** lbu\t[at][0-9],10\([at][0-9]\)
-** sb\t[at][0-9],10\([at][0-9]\)
+** lw\t[at][0-9],7\([at][0-9]\)
+** sw\t[at][0-9],7\([at][0-9]\)
** ...
*/
COPY_N(11)
@@ -79,11 +76,10 @@ COPY_N(11)
**copy_aligned_11:
** ...
** lw\t[at][0-9],0\([at][0-9]\)
-** ...
** sw\t[at][0-9],0\([at][0-9]\)
** ...
-** lbu\t[at][0-9],10\([at][0-9]\)
-** sb\t[at][0-9],10\([at][0-9]\)
+** lw\t[at][0-9],7\([at][0-9]\)
+** sw\t[at][0-9],7\([at][0-9]\)
** ...
*/
COPY_ALIGNED_N(11)
@@ -24,9 +24,8 @@ void copy_aligned_##N (void *to, void *from) \
** ...
** lw\t[at][0-9],0\([at][0-9]\)
** sw\t[at][0-9],0\([at][0-9]\)
-** ...
-** lbu\t[at][0-9],6\([at][0-9]\)
-** sb\t[at][0-9],6\([at][0-9]\)
+** lw\t[at][0-9],3\([at][0-9]\)
+** sw\t[at][0-9],3\([at][0-9]\)
** ...
*/
COPY_N(7)
@@ -36,9 +35,8 @@ COPY_N(7)
** ...
** lw\t[at][0-9],0\([at][0-9]\)
** sw\t[at][0-9],0\([at][0-9]\)
-** ...
-** lbu\t[at][0-9],6\([at][0-9]\)
-** sb\t[at][0-9],6\([at][0-9]\)
+** lw\t[at][0-9],3\([at][0-9]\)
+** sw\t[at][0-9],3\([at][0-9]\)
** ...
*/
COPY_ALIGNED_N(7)
@@ -66,9 +64,8 @@ COPY_ALIGNED_N(8)
** ...
** ld\t[at][0-9],0\([at][0-9]\)
** sd\t[at][0-9],0\([at][0-9]\)
-** ...
-** lbu\t[at][0-9],10\([at][0-9]\)
-** sb\t[at][0-9],10\([at][0-9]\)
+** lw\t[at][0-9],7\([at][0-9]\)
+** sw\t[at][0-9],7\([at][0-9]\)
** ...
*/
COPY_N(11)
@@ -77,11 +74,9 @@ COPY_N(11)
**copy_aligned_11:
** ...
** ld\t[at][0-9],0\([at][0-9]\)
-** ...
** sd\t[at][0-9],0\([at][0-9]\)
-** ...
-** lbu\t[at][0-9],10\([at][0-9]\)
-** sb\t[at][0-9],10\([at][0-9]\)
+** lw\t[at][0-9],7\([at][0-9]\)
+** sw\t[at][0-9],7\([at][0-9]\)
** ...
*/
COPY_ALIGNED_N(11)
@@ -90,11 +85,9 @@ COPY_ALIGNED_N(11)
**copy_15:
** ...
** ld\t[at][0-9],0\([at][0-9]\)
-** ...
** sd\t[at][0-9],0\([at][0-9]\)
-** ...
-** lbu\t[at][0-9],14\([at][0-9]\)
-** sb\t[at][0-9],14\([at][0-9]\)
+** ld\t[at][0-9],7\([at][0-9]\)
+** sd\t[at][0-9],7\([at][0-9]\)
** ...
*/
COPY_N(15)
@@ -103,11 +96,9 @@ COPY_N(15)
**copy_aligned_15:
** ...
** ld\t[at][0-9],0\([at][0-9]\)
-** ...
** sd\t[at][0-9],0\([at][0-9]\)
-** ...
-** lbu\t[at][0-9],14\([at][0-9]\)
-** sb\t[at][0-9],14\([at][0-9]\)
+** ld\t[at][0-9],7\([at][0-9]\)
+** sd\t[at][0-9],7\([at][0-9]\)
** ...
*/
COPY_ALIGNED_N(15)
This patch adds the field overlap_op_by_pieces to the struct riscv_tune_param, which is used by the TARGET_OVERLAP_OP_BY_PIECES_P() hook. This hook is used by the by-pieces infrastructure to decide if overlapping memory accesses should be emitted. The new property is set to false in all tune structs except for generic-ooo. The changes in the expansion can be seen in the adjustments of the cpymem test cases. These tests also reveal a limitation in the RISC-V cpymem expansion: it prevents this optimization in some cases, because only by-pieces cpymem expansions emit overlapping memory accesses. gcc/ChangeLog: * config/riscv/riscv.cc (struct riscv_tune_param): New field overlap_op_by_pieces. (riscv_overlap_op_by_pieces): New function. (TARGET_OVERLAP_OP_BY_PIECES_P): Connect to riscv_overlap_op_by_pieces. gcc/testsuite/ChangeLog: * gcc.target/riscv/cpymem-32-ooo.c: Adjust for overlapping access. * gcc.target/riscv/cpymem-64-ooo.c: Likewise. Signed-off-by: Christoph Müllner <christoph.muellner@vrull.eu> --- gcc/config/riscv/riscv.cc | 20 +++++++++++ .../gcc.target/riscv/cpymem-32-ooo.c | 20 +++++------ .../gcc.target/riscv/cpymem-64-ooo.c | 33 +++++++------------ 3 files changed, 40 insertions(+), 33 deletions(-)