diff mbox series

[2/4] xtensa: Consider the Loop Option when setmemsi is expanded to small loop

Message ID 842bf876-2f00-7216-b6f4-e625a13868dc@yahoo.co.jp
State New
Headers show
Series [1/4] xtensa: Tweak some widen multiplications | expand

Commit Message

Takayuki 'January June' Suwa June 10, 2022, 4:18 a.m. UTC
When the Loop Option (zero-overhead looping) is configured and active, the small-loop expansion of setmemsi now applies to aligned blocks of almost any size.

gcc/ChangeLog:

	* config/xtensa/xtensa.cc (xtensa_expand_block_set_small_loop):
	Pass through the block length / loop count conditions if
	zero-overhead looping is configured and active.
---
  gcc/config/xtensa/xtensa.cc | 65 +++++++++++++++++++++++++------------
  1 file changed, 45 insertions(+), 20 deletions(-)

    /* Insn expansion: holding the init value.
       Either MOV(.N) or L32R w/litpool.  */
@@ -1523,16 +1531,33 @@ xtensa_expand_block_set_small_loop (rtx *operands)
      expand_len = TARGET_DENSITY ? 2 : 3;
    else
      expand_len = 3 + 4;
-  /* Insn expansion: Either ADDI(.N) or ADDMI for the end address.  */
-  expand_len += bytes > 127 ? 3
-			    : (TARGET_DENSITY && bytes <= 15) ? 2 : 3;
-
-  /* Insn expansion: the loop body and branch instruction.
-     For store, one of S8I, S16I or S32I(.N).
-     For advance, ADDI(.N).
-     For branch, BNE.  */
-  expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3)
-		+ (TARGET_DENSITY ? 2 : 3) + 3;
+  if (TARGET_LOOPS && optimize) /* zero-overhead looping */
+    {
+      /* Insn translation: Either MOV(.N) or L32R w/litpool for the
+	 loop count.  */
+      expand_len += xtensa_simm12b (count) ? xtensa_sizeof_MOVI (count)
+					   : 3 + 4;
+      /* Insn translation: LOOP, the zero-overhead looping setup
+	 instruction.  */
+      expand_len += 3;
+      /* Insn expansion: the loop body instructions.
+	For store, one of S8I, S16I or S32I(.N).
+	For advance, ADDI(.N).  */
+      expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3)
+		    + (TARGET_DENSITY ? 2 : 3);
+    }
+  else /* NO zero-overhead looping */
+    {
+      /* Insn expansion: Either ADDI(.N) or ADDMI for the end address.  */
+      expand_len += bytes > 127 ? 3
+				: (TARGET_DENSITY && bytes <= 15) ? 2 : 3;
+      /* Insn expansion: the loop body and branch instruction.
+	For store, one of S8I, S16I or S32I(.N).
+	For advance, ADDI(.N).
+	For branch, BNE.  */
+      expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3)
+		    + (TARGET_DENSITY ? 2 : 3) + 3;
+    }

    /* Function call: preparing two arguments.  */
    funccall_len = xtensa_sizeof_MOVI (value);

Comments

Max Filippov June 11, 2022, 12:12 a.m. UTC | #1
Hi Suwa-san,

On Thu, Jun 9, 2022 at 9:26 PM Takayuki 'January June' Suwa
<jjsuwa_sys3175@yahoo.co.jp> wrote:
>
> Now apply to almost any size of aligned block under such circumstances.
>
> gcc/ChangeLog:
>
>         * config/xtensa/xtensa.cc (xtensa_expand_block_set_small_loop):
>         Pass through the block length / loop count conditions if
>         zero-overhead looping is configured and active,
> ---
>   gcc/config/xtensa/xtensa.cc | 65 +++++++++++++++++++++++++------------
>   1 file changed, 45 insertions(+), 20 deletions(-)

This change results in a bunch of ICEs in tests that look like this:

gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c: In function 'main':
gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c:28:1: error:
unrecognizable insn:
(insn 7 6 8 2 (set (reg:SI 45)
       (plus:SI (reg:SI 44)
           (const_int 262144 [0x40000])))
"gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c":23:3 -1
    (nil))
during RTL pass: vregs
gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c:28:1: internal
compiler error: in extract_insn, at recog.cc:2791
0x6a21cf _fatal_insn(char const*, rtx_def const*, char const*, int, char const*)
       gcc/gcc/rtl-error.cc:108
0x6a2252 _fatal_insn_not_found(rtx_def const*, char const*, int, char const*)
       gcc/gcc/rtl-error.cc:116
0x693824 extract_insn(rtx_insn*)
       gcc/gcc/recog.cc:2791
0xb27647 instantiate_virtual_regs_in_insn
       gcc/gcc/function.cc:1611
0xb27647 instantiate_virtual_regs
       gcc/gcc/function.cc:1985
0xb27647 execute
       gcc/gcc/function.cc:2034
Takayuki 'January June' Suwa June 11, 2022, 3:31 a.m. UTC | #2
On 2022/06/11 9:12, Max Filippov wrote:
> Hi Suwa-san,
hi!

> This change results in a bunch of ICEs in tests that look like this:
> 
> gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c: In function 'main':
> gcc/gcc/testsuite/gcc.c-torture/compile/memtst.c:28:1: error:
> unrecognizable insn:
> (insn 7 6 8 2 (set (reg:SI 45)
>         (plus:SI (reg:SI 44)
>             (const_int 262144 [0x40000])))
oh, what a mistake of mine... it's so RISCy!

     int array[65535];
     void test(void) {
       __builtin_memset(array, 0, sizeof(array));
     }

	.literal_position
	.literal .LC0, array
	.literal .LC2, 65535
test:
	l32r	a3, .LC0
	l32r	a2, .LC2
	movi.n	a4, 0
	loop	a2, .L2_LEND
.L2:
	s32i.n	a4, a3, 0
	addi.n	a3, a3, 4
	.L2_LEND:
	ret.n
---
   gcc/config/xtensa/xtensa.cc | 71 ++++++++++++++++++++++++++-----------
   1 file changed, 50 insertions(+), 21 deletions(-)

diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index c7b54babc37..bc3330f836f 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -1483,7 +1483,7 @@ xtensa_expand_block_set_unrolled_loop (rtx *operands)
   int
   xtensa_expand_block_set_small_loop (rtx *operands)
   {
-  HOST_WIDE_INT bytes, value, align;
+  HOST_WIDE_INT bytes, value, align, count;
     int expand_len, funccall_len;
     rtx x, dst, end, reg;
     machine_mode unit_mode;
@@ -1503,17 +1503,25 @@ xtensa_expand_block_set_small_loop (rtx *operands)
     /* Totally-aligned block only.  */
     if (bytes % align != 0)
       return 0;
+  count = bytes / align;

-  /* If 4-byte aligned, small loop substitution is almost optimal, thus
-     limited to only offset to the end address for ADDI/ADDMI instruction.  */
-  if (align == 4
-      && ! (bytes <= 127 || (bytes <= 32512 && bytes % 256 == 0)))
-    return 0;
+  /* If the Loop Option (zero-overhead looping) is configured and active,
+     almost no restrictions about the length of the block.  */
+  if (! (TARGET_LOOPS && optimize))
+    {
+      /* If 4-byte aligned, small loop substitution is almost optimal,
+	 thus limited to only offset to the end address for ADDI/ADDMI
+	 instruction.  */
+      if (align == 4
+	  && ! (bytes <= 127 || (bytes <= 32512 && bytes % 256 == 0)))
+	return 0;

-  /* If no 4-byte aligned, loop count should be treated as the constraint.  */
-  if (align != 4
-      && bytes / align > ((optimize > 1 && !optimize_size) ? 8 : 15))
-    return 0;
+      /* If no 4-byte aligned, loop count should be treated as the
+	 constraint.  */
+      if (align != 4
+	  && count > ((optimize > 1 && !optimize_size) ? 8 : 15))
+	return 0;
+    }

     /* Insn expansion: holding the init value.
        Either MOV(.N) or L32R w/litpool.  */
@@ -1523,16 +1531,33 @@ xtensa_expand_block_set_small_loop (rtx *operands)
       expand_len = TARGET_DENSITY ? 2 : 3;
     else
       expand_len = 3 + 4;
-  /* Insn expansion: Either ADDI(.N) or ADDMI for the end address.  */
-  expand_len += bytes > 127 ? 3
-			    : (TARGET_DENSITY && bytes <= 15) ? 2 : 3;
-
-  /* Insn expansion: the loop body and branch instruction.
-     For store, one of S8I, S16I or S32I(.N).
-     For advance, ADDI(.N).
-     For branch, BNE.  */
-  expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3)
-		+ (TARGET_DENSITY ? 2 : 3) + 3;
+  if (TARGET_LOOPS && optimize) /* zero-overhead looping */
+    {
+      /* Insn translation: Either MOV(.N) or L32R w/litpool for the
+	 loop count.  */
+      expand_len += xtensa_simm12b (count) ? xtensa_sizeof_MOVI (count)
+					   : 3 + 4;
+      /* Insn translation: LOOP, the zero-overhead looping setup
+	 instruction.  */
+      expand_len += 3;
+      /* Insn expansion: the loop body instructions.
+	For store, one of S8I, S16I or S32I(.N).
+	For advance, ADDI(.N).  */
+      expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3)
+		    + (TARGET_DENSITY ? 2 : 3);
+    }
+  else /* NO zero-overhead looping */
+    {
+      /* Insn expansion: Either ADDI(.N) or ADDMI for the end address.  */
+      expand_len += bytes > 127 ? 3
+				: (TARGET_DENSITY && bytes <= 15) ? 2 : 3;
+      /* Insn expansion: the loop body and branch instruction.
+	For store, one of S8I, S16I or S32I(.N).
+	For advance, ADDI(.N).
+	For branch, BNE.  */
+      expand_len += (TARGET_DENSITY && align == 4 ? 2 : 3)
+		    + (TARGET_DENSITY ? 2 : 3) + 3;
+    }

     /* Function call: preparing two arguments.  */
     funccall_len = xtensa_sizeof_MOVI (value);
@@ -1555,7 +1580,11 @@ xtensa_expand_block_set_small_loop (rtx *operands)
     dst = gen_reg_rtx (SImode);
     emit_move_insn (dst, x);
     end = gen_reg_rtx (SImode);
-  emit_insn (gen_addsi3 (end, dst, operands[1] /* the length */));
+  if (TARGET_LOOPS && optimize)
+    x = force_reg (SImode, operands[1] /* the length */);
+  else
+    x = operands[1];
+  emit_insn (gen_addsi3 (end, dst, x));
     switch (align)
       {
       case 1:
diff mbox series

Patch

diff --git a/gcc/config/xtensa/xtensa.cc b/gcc/config/xtensa/xtensa.cc
index c7b54babc37..616ced3ed38 100644
--- a/gcc/config/xtensa/xtensa.cc
+++ b/gcc/config/xtensa/xtensa.cc
@@ -1483,7 +1483,7 @@  xtensa_expand_block_set_unrolled_loop (rtx *operands)
  int
  xtensa_expand_block_set_small_loop (rtx *operands)
  {
-  HOST_WIDE_INT bytes, value, align;
+  HOST_WIDE_INT bytes, value, align, count;
    int expand_len, funccall_len;
    rtx x, dst, end, reg;
    machine_mode unit_mode;
@@ -1503,17 +1503,25 @@  xtensa_expand_block_set_small_loop (rtx *operands)
    /* Totally-aligned block only.  */
    if (bytes % align != 0)
      return 0;
+  count = bytes / align;

-  /* If 4-byte aligned, small loop substitution is almost optimal, thus
-     limited to only offset to the end address for ADDI/ADDMI instruction.  */
-  if (align == 4
-      && ! (bytes <= 127 || (bytes <= 32512 && bytes % 256 == 0)))
-    return 0;
+  /* If the Loop Option (zero-overhead looping) is configured and active,
+     almost no restrictions about the length of the block.  */
+  if (! (TARGET_LOOPS && optimize))
+    {
+      /* If 4-byte aligned, small loop substitution is almost optimal,
+	 thus limited to only offset to the end address for ADDI/ADDMI
+	 instruction.  */
+      if (align == 4
+	  && ! (bytes <= 127 || (bytes <= 32512 && bytes % 256 == 0)))
+	return 0;

-  /* If no 4-byte aligned, loop count should be treated as the constraint.  */
-  if (align != 4
-      && bytes / align > ((optimize > 1 && !optimize_size) ? 8 : 15))
-    return 0;
+      /* If no 4-byte aligned, loop count should be treated as the
+	 constraint.  */
+      if (align != 4
+	  && count > ((optimize > 1 && !optimize_size) ? 8 : 15))
+	return 0;
+    }