
aarch64: Add extend-as-extract-with-shift pattern [PR96998]

Message ID 20200910144220.336upswkes5geylc@arm.com
State New

Commit Message

Alex Coplan Sept. 10, 2020, 2:42 p.m. UTC
Hello,

Since r11-2903-g6b3034eaba83935d9f6dfb20d2efbdb34b5b00bf introduced a
canonicalization from mult to shift on address reloads, a missing
pattern in the AArch64 backend was exposed.

In particular, we were missing the ashift variant of
*add_<optab><mode>_multp2 (this mult variant is redundant and was
removed in r11-3033-g2f8ae301f6a125f50b0a758047fcddae7b68daa8).

This patch adds that missing pattern (fixing PR96998), updates
aarch64_is_extend_from_extract() to work for the shift pattern (instead
of the redundant mult variant) and updates callers in the cost
calculations to apply the costing for the shift variant instead.
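For reference, the EXTRACT_IMM in these patterns packs the extend width and
the shift count into one constant: in the DImode case from the PR, an
immediate of 34 means a 32-bit extend shifted left by 2.  The following is
only a rough standalone sketch of that decoding (it mirrors the checks in
the updated aarch64_is_extend_from_extract (), but is not the GCC code
itself):

#include <assert.h>
#include <stdbool.h>

/* Sketch only: EXTRACT_IMM = width + shift, where width is a power of
   two (8, 16 or 32 for DImode) and shift is at most 4.  SHIFT_IMM is
   the constant from the inner ashift.  */
static bool
extend_from_extract_p (int mode_bits, long long shift_imm,
                       long long extract_imm)
{
  long long shift = extract_imm & 7;   /* Low bits: the shift count.  */
  long long width = extract_imm & ~7;  /* Remaining bits: the extend width.  */
  bool width_pow2 = width > 1 && (width & (width - 1)) == 0;

  return extract_imm > 8
         && extract_imm < mode_bits
         && width_pow2
         && shift <= 4
         && shift_imm == shift;
}

int
main (void)
{
  /* (sign_extract:DI (ashift:DI (reg) (const_int 2))
                      (const_int 34) (const_int 0))
     is a 32-bit sign-extend followed by a left shift of 2.  */
  assert (extend_from_extract_p (64, 2, 34));
  assert (!extend_from_extract_p (64, 3, 34));
  return 0;
}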

Testing:
 * Bootstrap and regtest on aarch64-none-linux-gnu, no regressions.
 * Checked new unit test passes on arm-none-linux-gnueabihf.
 * Checked new unit test isn't run on x86 (inline asm uses
   arm/aarch64-specific constraint).
 * Tested linux-next tree no longer encounters this ICE on arm64 with
   patched GCC (unfortunately still doesn't compile successfully due to
   a fix for PR96475 which introduces an ICE on arm64).

OK for trunk?

Thanks,
Alex

---

gcc/ChangeLog:

	PR target/96998
	* config/aarch64/aarch64.c (aarch64_is_extend_from_extract): Update to
	work for shift pattern instead of mult.
	(aarch64_strip_extend): Cost extract+shift pattern instead of
	now-removed extract+mult pattern.
	(aarch64_rtx_arith_op_extract_p): Likewise.
	* config/aarch64/aarch64.md (*add_<optab><mode>_shftex): New.

gcc/testsuite/ChangeLog:

	PR target/96998
	* gcc.c-torture/compile/pr96998.c: New test.

Comments

Richard Sandiford Sept. 10, 2020, 6:18 p.m. UTC | #1
Alex Coplan <alex.coplan@arm.com> writes:
> Hello,
>
> Since r11-2903-g6b3034eaba83935d9f6dfb20d2efbdb34b5b00bf introduced a
> canonicalization from mult to shift on address reloads, a missing
> pattern in the AArch64 backend was exposed.
>
> In particular, we were missing the ashift variant of
> *add_<optab><mode>_multp2 (this mult variant is redundant and was
> removed in r11-3033-g2f8ae301f6a125f50b0a758047fcddae7b68daa8).
>
> This patch adds that missing pattern (fixing PR96998), updates
> aarch64_is_extend_from_extract() to work for the shift pattern (instead
> of the redundant mult variant) and updates callers in the cost
> calculations to apply the costing for the shift variant instead.

Hmm.  I think we should instead fix this in combine.  For the related
testcase:

void g(void) {
  int g;
  for (;; g = h()) {
    asm volatile ("" :: "r"(&d.e[g]));
  }
}

combine tries:

Trying 9 -> 10:
    9: r99:DI=sign_extend(r93:SI)<<0x2
      REG_DEAD r93:SI
   10: r100:DI=r96:DI+r99:DI
      REG_DEAD r99:DI
Successfully matched this instruction:
(set (reg:DI 100)
    (plus:DI (ashift:DI (sign_extend:DI (reg/v:SI 93 [ g ]))
            (const_int 2 [0x2]))
        (reg/f:DI 96)))
allowing combination of insns 9 and 10
original costs 4 + 4 = 8
replacement cost 8
deferring deletion of insn with uid = 9.
modifying insn i3    10: r100:DI=sign_extend(r93:SI)<<0x2+r96:DI
      REG_DEAD r93:SI
deferring rescan insn with uid = 10.

where (shift (extend …) …) is IMO the correct representation of this
operation.  The new pattern comes from this code in make_extraction:

  else if (GET_CODE (inner) == ASHIFT
	   && CONST_INT_P (XEXP (inner, 1))
	   && pos_rtx == 0 && pos == 0
	   && len > UINTVAL (XEXP (inner, 1)))
    {
      /* We're extracting the least significant bits of an rtx
	 (ashift X (const_int C)), where LEN > C.  Extract the
	 least significant (LEN - C) bits of X, giving an rtx
	 whose mode is MODE, then shift it left C times.  */
      new_rtx = make_extraction (mode, XEXP (inner, 0),
			     0, 0, len - INTVAL (XEXP (inner, 1)),
			     unsignedp, in_dest, in_compare);
      if (new_rtx != 0)
	return gen_rtx_ASHIFT (mode, new_rtx, XEXP (inner, 1));
    }
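(As a quick standalone sanity check of that step -- nothing to do with
combine's own code -- the low LEN bits of (X << C) are indeed the low
(LEN - C) bits of X shifted left by C:)

#include <assert.h>
#include <stdint.h>

/* low_bits (x, n): the least significant N bits of X, for N < 64.  */
static uint64_t
low_bits (uint64_t x, int n)
{
  return x & ((UINT64_C (1) << n) - 1);
}

int
main (void)
{
  const int c = 2, len = 34;
  for (uint64_t x = 0; x < 100000; x += 37)
    assert (low_bits (x << c, len) == low_bits (x, len - c) << c);
  return 0;
}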

But because of the unfortunate special case that shifts need
to be MULTs inside MEMs, make_compound_operation_int has:

    case ASHIFT:
      /* Convert shifts by constants into multiplications if inside
	 an address.  */
      if (in_code == MEM && CONST_INT_P (XEXP (x, 1))
	  && INTVAL (XEXP (x, 1)) < HOST_BITS_PER_WIDE_INT
	  && INTVAL (XEXP (x, 1)) >= 0)
	{
	  HOST_WIDE_INT count = INTVAL (XEXP (x, 1));
	  HOST_WIDE_INT multval = HOST_WIDE_INT_1 << count;

	  new_rtx = make_compound_operation (XEXP (x, 0), next_code);
	  if (GET_CODE (new_rtx) == NEG)
	    {
	      new_rtx = XEXP (new_rtx, 0);
	      multval = -multval;
	    }
	  multval = trunc_int_for_mode (multval, mode);
	  new_rtx = gen_rtx_MULT (mode, new_rtx, gen_int_mode (multval, mode));
	}
      break;

Thus for:

    case ASHIFTRT:
      lhs = XEXP (x, 0);
      rhs = XEXP (x, 1);

      /* If we have (ashiftrt (ashift foo C1) C2) with C2 >= C1,
	 this is a SIGN_EXTRACT.  */
      if (CONST_INT_P (rhs)
	  && GET_CODE (lhs) == ASHIFT
	  && CONST_INT_P (XEXP (lhs, 1))
	  && INTVAL (rhs) >= INTVAL (XEXP (lhs, 1))
	  && INTVAL (XEXP (lhs, 1)) >= 0
	  && INTVAL (rhs) < mode_width)
	{
	  new_rtx = make_compound_operation (XEXP (lhs, 0), next_code);
	  new_rtx = make_extraction (mode, new_rtx,
				     INTVAL (rhs) - INTVAL (XEXP (lhs, 1)),
				     NULL_RTX, mode_width - INTVAL (rhs),
				     code == LSHIFTRT, 0, in_code == COMPARE);
	  break;
	}

the first new_rtx is a MULT rather than an ASHIFT when the operation
occurs inside a MEM.  make_extraction then doesn't do the same
canonicalisation as it does for ASHIFT above.  So for the original
testcase we get:

Trying 8 -> 9:
    8: r98:DI=sign_extend(r92:SI)
      REG_DEAD r92:SI
    9: [r98:DI*0x4+r96:DI]=asm_operands
      REG_DEAD r98:DI
Failed to match this instruction:
(set (mem:SI (plus:DI (sign_extract:DI (mult:DI (subreg:DI (reg/v:SI 92 [ g ]) 0)
                    (const_int 4 [0x4]))
                (const_int 34 [0x22])
                (const_int 0 [0]))
            (reg/f:DI 96)) [3 *i_5+0 S4 A32])
    (asm_operands:SI ("") ("=Q") 0 []
         []
         [] /tmp/foo.c:13))
allowing combination of insns 8 and 9
original costs 4 + 4 = 8
replacement cost 4
deferring deletion of insn with uid = 8.
modifying insn i3     9: [sign_extract(r92:SI#0*0x4,0x22,0)+r96:DI]=asm_operands
      REG_DEAD r92:SI
deferring rescan insn with uid = 9.
starting the processing of deferred insns
rescanning insn with uid = 9.
ending the processing of deferred insns

So IMO make_extraction should recognise and handle the “mem form”
of the shift too.
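(For illustration only -- this is not combine code -- the “mem form”
multiplier is always a power of two, so exact_log2 recovers the shift count
and the two forms describe the same operation:)

#include <assert.h>

/* Standalone stand-in for exact_log2 (): log2 (x) if X is a nonzero
   power of two, otherwise -1.  */
static int
exact_log2_sketch (unsigned long long x)
{
  if (x == 0 || (x & (x - 1)) != 0)
    return -1;
  int n = 0;
  while (x >>= 1)
    n++;
  return n;
}

int
main (void)
{
  const int count = 2;                         /* (ashift X (const_int 2))...  */
  unsigned long long multval = 1ULL << count;  /* ...is (mult X (const_int 4)) inside a MEM.  */
  assert (exact_log2_sketch (multval) == count);
  assert (exact_log2_sketch (6) == -1);        /* Non-power-of-two multipliers don't qualify.  */
  return 0;
}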

It's all a bit unfortunate really.  Having different representations
for shifts inside and outside MEMs is the Second Great RTL Mistake.
(The first was modeless const_ints. :-))

If we do that, we should be able to remove the handling of
extract-based addresses in aarch64_classify_index & co.

Thanks,
Richard

>
> Testing:
>  * Bootstrap and regtest on aarch64-none-linux-gnu, no regressions.
>  * Checked new unit test passes on arm-none-linux-gnueabihf.
>  * Checked new unit test isn't run on x86 (inline asm uses
>    arm/aarch64-specific constraint).
>  * Tested linux-next tree no longer encounters this ICE on arm64 with
>    patched GCC (unfortunately still doesn't compile successfully due to
>    a fix for PR96475 which introduces an ICE on arm64).
>
> OK for trunk?
>
> Thanks,
> Alex
>
> ---
>
> gcc/ChangeLog:
>
> 	PR target/96998
> 	* config/aarch64/aarch64.c (aarch64_is_extend_from_extract): Update to
> 	work for shift pattern instead of mult.
> 	(aarch64_strip_extend): Cost extract+shift pattern instead of
> 	now-removed extract+mult pattern.
> 	(aarch64_rtx_arith_op_extract_p): Likewise.
> 	* config/aarch64/aarch64.md (*add_<optab><mode>_shftex): New.
>
> gcc/testsuite/ChangeLog:
>
> 	PR target/96998
> 	* gcc.c-torture/compile/pr96998.c: New test.
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index b6d74496cd0..55e0fc4e683 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -2815,27 +2815,24 @@ aarch64_is_noplt_call_p (rtx sym)
>     represent an expression that matches an extend operation.  The
>     operands represent the parameters from
>  
> -   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
> +   (extract:MODE (ashift (reg) (SHIFT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
>  bool
> -aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
> +aarch64_is_extend_from_extract (scalar_int_mode mode, rtx shift_imm,
>  				rtx extract_imm)
>  {
> -  HOST_WIDE_INT mult_val, extract_val;
> +  HOST_WIDE_INT shift_val, extract_val;
>  
> -  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
> +  if (! CONST_INT_P (shift_imm) || ! CONST_INT_P (extract_imm))
>      return false;
>  
> -  mult_val = INTVAL (mult_imm);
> +  shift_val = INTVAL (shift_imm);
>    extract_val = INTVAL (extract_imm);
>  
> -  if (extract_val > 8
> -      && extract_val < GET_MODE_BITSIZE (mode)
> -      && exact_log2 (extract_val & ~7) > 0
> -      && (extract_val & 7) <= 4
> -      && mult_val == (1 << (extract_val & 7)))
> -    return true;
> -
> -  return false;
> +  return extract_val > 8
> +	  && extract_val < GET_MODE_BITSIZE (mode)
> +	  && exact_log2 (extract_val & ~7) > 0
> +	  && shift_val <= 4
> +	  && shift_val == (extract_val & 7);
>  }
>  
>  /* Emit an insn that's a simple single-set.  Both the operands must be
> @@ -11262,7 +11259,7 @@ aarch64_strip_extend (rtx x, bool strip_shift)
>    /* Zero and sign extraction of a widened value.  */
>    if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
>        && XEXP (op, 2) == const0_rtx
> -      && GET_CODE (XEXP (op, 0)) == MULT
> +      && GET_CODE (XEXP (op, 0)) == ASHIFT
>        && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
>  					 XEXP (op, 1)))
>      return XEXP (XEXP (op, 0), 0);
> @@ -11617,7 +11614,7 @@ aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
>        rtx op1 = XEXP (x, 1);
>        rtx op2 = XEXP (x, 2);
>  
> -      if (GET_CODE (op0) == MULT
> +      if (GET_CODE (op0) == ASHIFT
>  	  && CONST_INT_P (op1)
>  	  && op2 == const0_rtx
>  	  && CONST_INT_P (XEXP (op0, 1))
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index dbc6b1db176..4bb7b318b99 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -2471,6 +2471,19 @@ (define_insn "*add_<optab><ALLX:mode>_<GPI:mode>"
>    [(set_attr "type" "alu_ext")]
>  )
>  
> +(define_insn "*add_<optab><mode>_shftex"
> +  [(set (match_operand:GPI 0 "register_operand" "=rk")
> +	(plus:GPI (ANY_EXTRACT:GPI
> +		    (ashift:GPI (match_operand:GPI 1 "register_operand" "r")
> +				(match_operand 2 "aarch64_shift_imm_<mode>" "n"))
> +		    (match_operand 3 "const_int_operand" "n")
> +		    (const_int 0))
> +		  (match_operand:GPI 4 "register_operand" "r")))]
> +  "aarch64_is_extend_from_extract (<MODE>mode, operands[2], operands[3])"
> +  "add\\t%<w>0, %<w>4, %<w>1, <su>xt%e3 %2"
> +  [(set_attr "type" "alu_ext")]
> +)
> +
>  ;; zero_extend version of above
>  (define_insn "*add_<optab><SHORT:mode>_si_uxtw"
>    [(set (match_operand:DI 0 "register_operand" "=rk")
> diff --git a/gcc/testsuite/gcc.c-torture/compile/pr96998.c b/gcc/testsuite/gcc.c-torture/compile/pr96998.c
> new file mode 100644
> index 00000000000..a75d5dcfe08
> --- /dev/null
> +++ b/gcc/testsuite/gcc.c-torture/compile/pr96998.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile { target arm*-*-* aarch64*-*-* } } */
> +
> +int h(void);
> +struct c d;
> +struct c {
> +  int e[1];
> +};
> +
> +void f(void) {
> +  int g;
> +  for (;; g = h()) {
> +    int *i = &d.e[g];
> +    asm("" : "=Q"(*i));
> +  }
> +}
Alex Coplan Sept. 16, 2020, 5:08 p.m. UTC | #2
Hi Richard,

On 10/09/2020 19:18, Richard Sandiford wrote:
> Alex Coplan <alex.coplan@arm.com> writes:
> > Hello,
> >
> > Since r11-2903-g6b3034eaba83935d9f6dfb20d2efbdb34b5b00bf introduced a
> > canonicalization from mult to shift on address reloads, a missing
> > pattern in the AArch64 backend was exposed.
> >
> > In particular, we were missing the ashift variant of
> > *add_<optab><mode>_multp2 (this mult variant is redundant and was
> > removed in r11-3033-g2f8ae301f6a125f50b0a758047fcddae7b68daa8).
> >
> > This patch adds that missing pattern (fixing PR96998), updates
> > aarch64_is_extend_from_extract() to work for the shift pattern (instead
> > of the redundant mult variant) and updates callers in the cost
> > calculations to apply the costing for the shift variant instead.
> 
> Hmm.  I think we should instead fix this in combine.

Ok, thanks for the review. Please find a revised patch attached which
fixes this in combine instead.

> If we do that, we should be able to remove the handling of
> extract-based addresses in aarch64_classify_index & co.

Nice. I've tried to remove all the redundant extract-based address
handling in the AArch64 backend as a result.

The revised patch also fixes up a comment in combine which appears to
have suffered from bitrot.

Testing:
 * Bootstrap and regtest on aarch64-none-linux-gnu,
   arm-none-linux-gnueabihf, and x86_64-pc-linux-gnu.
 * Regtested an x64 -> aarch64-none-elf cross with -mabi=ilp32.
 * Checked that we no longer ICE when building linux-next on AArch64.

OK for trunk?

Thanks,
Alex

---

gcc/ChangeLog:

	* combine.c (expand_compound_operation): Tweak variable name in
	comment to match source.
	(make_extraction): Handle mult by power of two in addition to
	ashift.
	* config/aarch64/aarch64.c (aarch64_is_extend_from_extract): Delete.
	(aarch64_classify_index): Remove redundant extend-as-extract handling.
	(aarch64_strip_extend): Likewise.
	(aarch64_rtx_arith_op_extract_p): Likewise, remove now-unused
	parameter. Update callers...
	(aarch64_rtx_costs): ... here.

gcc/testsuite/ChangeLog:

	* gcc.c-torture/compile/pr96998.c: New test.
diff --git a/gcc/combine.c b/gcc/combine.c
index c88382e..cd8e544 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -7419,8 +7419,8 @@ expand_compound_operation (rtx x)
     }
 
   /* If we reach here, we want to return a pair of shifts.  The inner
-     shift is a left shift of BITSIZE - POS - LEN bits.  The outer
-     shift is a right shift of BITSIZE - LEN bits.  It is arithmetic or
+     shift is a left shift of MODEWIDTH - POS - LEN bits.  The outer
+     shift is a right shift of MODEWIDTH - LEN bits.  It is arithmetic or
      logical depending on the value of UNSIGNEDP.
 
      If this was a ZERO_EXTEND or ZERO_EXTRACT, this pair of shifts will be
@@ -7650,20 +7650,29 @@ make_extraction (machine_mode mode, rtx inner, HOST_WIDE_INT pos,
 	is_mode = GET_MODE (SUBREG_REG (inner));
       inner = SUBREG_REG (inner);
     }
-  else if (GET_CODE (inner) == ASHIFT
+  else if ((GET_CODE (inner) == ASHIFT || GET_CODE (inner) == MULT)
 	   && CONST_INT_P (XEXP (inner, 1))
-	   && pos_rtx == 0 && pos == 0
-	   && len > UINTVAL (XEXP (inner, 1)))
-    {
-      /* We're extracting the least significant bits of an rtx
-	 (ashift X (const_int C)), where LEN > C.  Extract the
-	 least significant (LEN - C) bits of X, giving an rtx
-	 whose mode is MODE, then shift it left C times.  */
-      new_rtx = make_extraction (mode, XEXP (inner, 0),
-			     0, 0, len - INTVAL (XEXP (inner, 1)),
-			     unsignedp, in_dest, in_compare);
-      if (new_rtx != 0)
-	return gen_rtx_ASHIFT (mode, new_rtx, XEXP (inner, 1));
+	   && pos_rtx == 0 && pos == 0)
+    {
+      const HOST_WIDE_INT ci = INTVAL (XEXP (inner, 1));
+      const bool mult = GET_CODE (inner) == MULT;
+      const int shift_amt = mult ? exact_log2 (ci) : ci;
+
+      if (shift_amt > 0 && len > (unsigned)shift_amt)
+	{
+	  /* We're extracting the least significant bits of an rtx
+	     (ashift X (const_int C)) or (mult X (const_int (2^C))),
+	     where LEN > C.  Extract the least significant (LEN - C) bits
+	     of X, giving an rtx whose mode is MODE, then shift it left
+	     C times.  */
+	  new_rtx = make_extraction (mode, XEXP (inner, 0),
+				 0, 0, len - shift_amt,
+				 unsignedp, in_dest, in_compare);
+	  if (new_rtx)
+	    return mult
+	    ? gen_rtx_MULT (mode, new_rtx, XEXP (inner, 1))
+	    : gen_rtx_ASHIFT (mode, new_rtx, XEXP (inner, 1));
+	}
     }
   else if (GET_CODE (inner) == TRUNCATE
 	   /* If trying or potentionally trying to extract
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b251f39..56738ae 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2811,33 +2811,6 @@ aarch64_is_noplt_call_p (rtx sym)
   return false;
 }
 
-/* Return true if the offsets to a zero/sign-extract operation
-   represent an expression that matches an extend operation.  The
-   operands represent the parameters from
-
-   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
-bool
-aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
-				rtx extract_imm)
-{
-  HOST_WIDE_INT mult_val, extract_val;
-
-  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
-    return false;
-
-  mult_val = INTVAL (mult_imm);
-  extract_val = INTVAL (extract_imm);
-
-  if (extract_val > 8
-      && extract_val < GET_MODE_BITSIZE (mode)
-      && exact_log2 (extract_val & ~7) > 0
-      && (extract_val & 7) <= 4
-      && mult_val == (1 << (extract_val & 7)))
-    return true;
-
-  return false;
-}
-
 /* Emit an insn that's a simple single-set.  Both the operands must be
    known to be valid.  */
 inline static rtx_insn *
@@ -8837,22 +8810,6 @@ aarch64_classify_index (struct aarch64_address_info *info, rtx x,
       index = XEXP (XEXP (x, 0), 0);
       shift = INTVAL (XEXP (x, 1));
     }
-  /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
-  else if ((GET_CODE (x) == SIGN_EXTRACT
-	    || GET_CODE (x) == ZERO_EXTRACT)
-	   && GET_MODE (x) == DImode
-	   && GET_CODE (XEXP (x, 0)) == MULT
-	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
-	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
-    {
-      type = (GET_CODE (x) == SIGN_EXTRACT)
-	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
-      index = XEXP (XEXP (x, 0), 0);
-      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
-      if (INTVAL (XEXP (x, 1)) != 32 + shift
-	  || INTVAL (XEXP (x, 2)) != 0)
-	shift = -1;
-    }
   /* (and:DI (mult:DI (reg:DI) (const_int scale))
      (const_int 0xffffffff<<shift)) */
   else if (GET_CODE (x) == AND
@@ -8868,22 +8825,6 @@ aarch64_classify_index (struct aarch64_address_info *info, rtx x,
       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
 	shift = -1;
     }
-  /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
-  else if ((GET_CODE (x) == SIGN_EXTRACT
-	    || GET_CODE (x) == ZERO_EXTRACT)
-	   && GET_MODE (x) == DImode
-	   && GET_CODE (XEXP (x, 0)) == ASHIFT
-	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
-	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
-    {
-      type = (GET_CODE (x) == SIGN_EXTRACT)
-	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
-      index = XEXP (XEXP (x, 0), 0);
-      shift = INTVAL (XEXP (XEXP (x, 0), 1));
-      if (INTVAL (XEXP (x, 1)) != 32 + shift
-	  || INTVAL (XEXP (x, 2)) != 0)
-	shift = -1;
-    }
   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
      (const_int 0xffffffff<<shift)) */
   else if (GET_CODE (x) == AND
@@ -11259,16 +11200,6 @@ aarch64_strip_extend (rtx x, bool strip_shift)
   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
     return op;
 
-  /* Zero and sign extraction of a widened value.  */
-  if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
-      && XEXP (op, 2) == const0_rtx
-      && GET_CODE (XEXP (op, 0)) == MULT
-      && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
-					 XEXP (op, 1)))
-    return XEXP (XEXP (op, 0), 0);
-
-  /* It can also be represented (for zero-extend) as an AND with an
-     immediate.  */
   if (GET_CODE (op) == AND
       && GET_CODE (XEXP (op, 0)) == MULT
       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
@@ -11606,31 +11537,11 @@ aarch64_branch_cost (bool speed_p, bool predictable_p)
 /* Return true if the RTX X in mode MODE is a zero or sign extract
    usable in an ADD or SUB (extended register) instruction.  */
 static bool
-aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
-{
-  /* Catch add with a sign extract.
-     This is add_<optab><mode>_multp2.  */
-  if (GET_CODE (x) == SIGN_EXTRACT
-      || GET_CODE (x) == ZERO_EXTRACT)
-    {
-      rtx op0 = XEXP (x, 0);
-      rtx op1 = XEXP (x, 1);
-      rtx op2 = XEXP (x, 2);
-
-      if (GET_CODE (op0) == MULT
-	  && CONST_INT_P (op1)
-	  && op2 == const0_rtx
-	  && CONST_INT_P (XEXP (op0, 1))
-	  && aarch64_is_extend_from_extract (mode,
-					     XEXP (op0, 1),
-					     op1))
-	{
-	  return true;
-	}
-    }
+aarch64_rtx_arith_op_extract_p (rtx x)
+{
   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
      No shift.  */
-  else if (GET_CODE (x) == SIGN_EXTEND
+  if (GET_CODE (x) == SIGN_EXTEND
 	   || GET_CODE (x) == ZERO_EXTEND)
     return REG_P (XEXP (x, 0));
 
@@ -12319,7 +12230,7 @@ cost_minus:
 
 	/* Look for SUB (extended register).  */
 	if (is_a <scalar_int_mode> (mode, &int_mode)
-	    && aarch64_rtx_arith_op_extract_p (op1, int_mode))
+	    && aarch64_rtx_arith_op_extract_p (op1))
 	  {
 	    if (speed)
 	      *cost += extra_cost->alu.extend_arith;
@@ -12399,7 +12310,7 @@ cost_plus:
 
 	/* Look for ADD (extended register).  */
 	if (is_a <scalar_int_mode> (mode, &int_mode)
-	    && aarch64_rtx_arith_op_extract_p (op0, int_mode))
+	    && aarch64_rtx_arith_op_extract_p (op0))
 	  {
 	    if (speed)
 	      *cost += extra_cost->alu.extend_arith;
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr96998.c b/gcc/testsuite/gcc.c-torture/compile/pr96998.c
new file mode 100644
index 0000000..a75d5dc
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/compile/pr96998.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target arm*-*-* aarch64*-*-* } } */
+
+int h(void);
+struct c d;
+struct c {
+  int e[1];
+};
+
+void f(void) {
+  int g;
+  for (;; g = h()) {
+    int *i = &d.e[g];
+    asm("" : "=Q"(*i));
+  }
+}
Segher Boessenkool Sept. 16, 2020, 8:32 p.m. UTC | #3
Hi!

On Thu, Sep 10, 2020 at 07:18:01PM +0100, Richard Sandiford wrote:

<huge snip>

> It's all a bit unfortunate really.  Having different representations
> for shifts inside and outside MEMs is the Second Great RTL Mistake.

Yes...  All targets with something that computes shifted addresses (like
in an LEA-style insn) need to have that insn in two representations.

Can this still be fixed?

> (The first was modeless const_ints. :-))

I *like* those.  Almost always.  Stockholm syndrome?

These days a mode on them should not cost much (in storage size).  It
should simplify quite some code, too.

> If we do that, we should be able to remove the handling of
> extract-based addresses in aarch64_classify_index & co.

If we do what?  I don't follow, sorry.

(Patch to combine sounds fine fwiw; patches welcome, as always.)


Segher
Richard Sandiford Sept. 17, 2020, 7:10 a.m. UTC | #4
Alex Coplan <alex.coplan@arm.com> writes:
> Hi Richard,
>
> On 10/09/2020 19:18, Richard Sandiford wrote:
>> Alex Coplan <alex.coplan@arm.com> writes:
>> > Hello,
>> >
>> > Since r11-2903-g6b3034eaba83935d9f6dfb20d2efbdb34b5b00bf introduced a
>> > canonicalization from mult to shift on address reloads, a missing
>> > pattern in the AArch64 backend was exposed.
>> >
>> > In particular, we were missing the ashift variant of
>> > *add_<optab><mode>_multp2 (this mult variant is redundant and was
>> > removed in r11-3033-g2f8ae301f6a125f50b0a758047fcddae7b68daa8).
>> >
>> > This patch adds that missing pattern (fixing PR96998), updates
>> > aarch64_is_extend_from_extract() to work for the shift pattern (instead
>> > of the redundant mult variant) and updates callers in the cost
>> > calculations to apply the costing for the shift variant instead.
>> 
>> Hmm.  I think we should instead fix this in combine.
>
> Ok, thanks for the review. Please find a revised patch attached which
> fixes this in combine instead.

Thanks for doing this, looks like a really nice clean-up.

> @@ -7650,20 +7650,29 @@ make_extraction (machine_mode mode, rtx inner, HOST_WIDE_INT pos,
>  	is_mode = GET_MODE (SUBREG_REG (inner));
>        inner = SUBREG_REG (inner);
>      }
> -  else if (GET_CODE (inner) == ASHIFT
> +  else if ((GET_CODE (inner) == ASHIFT || GET_CODE (inner) == MULT)
>  	   && CONST_INT_P (XEXP (inner, 1))
> -	   && pos_rtx == 0 && pos == 0
> -	   && len > UINTVAL (XEXP (inner, 1)))
> -    {
> -      /* We're extracting the least significant bits of an rtx
> -	 (ashift X (const_int C)), where LEN > C.  Extract the
> -	 least significant (LEN - C) bits of X, giving an rtx
> -	 whose mode is MODE, then shift it left C times.  */
> -      new_rtx = make_extraction (mode, XEXP (inner, 0),
> -			     0, 0, len - INTVAL (XEXP (inner, 1)),
> -			     unsignedp, in_dest, in_compare);
> -      if (new_rtx != 0)
> -	return gen_rtx_ASHIFT (mode, new_rtx, XEXP (inner, 1));
> +	   && pos_rtx == 0 && pos == 0)
> +    {
> +      const HOST_WIDE_INT ci = INTVAL (XEXP (inner, 1));
> +      const bool mult = GET_CODE (inner) == MULT;
> +      const int shift_amt = mult ? exact_log2 (ci) : ci;

Not going to be a problem in practice but: HOST_WIDE_INT would be better
than int here, so that we don't truncate the value for ASHIFT before it
has been tested.  Similarly:

> +
> +      if (shift_amt > 0 && len > (unsigned)shift_amt)

unsigned HOST_WIDE_INT here.

> +	{
> +	  /* We're extracting the least significant bits of an rtx
> +	     (ashift X (const_int C)) or (mult X (const_int (2^C))),
> +	     where LEN > C.  Extract the least significant (LEN - C) bits
> +	     of X, giving an rtx whose mode is MODE, then shift it left
> +	     C times.  */
> +	  new_rtx = make_extraction (mode, XEXP (inner, 0),
> +				 0, 0, len - shift_amt,
> +				 unsignedp, in_dest, in_compare);
> +	  if (new_rtx)
> +	    return mult
> +	    ? gen_rtx_MULT (mode, new_rtx, XEXP (inner, 1))
> +	    : gen_rtx_ASHIFT (mode, new_rtx, XEXP (inner, 1));

Easier as gen_rtx_fmt_ee (GET_CODE (inner), mode, …);

The combine parts LGTM otherwise, but Segher should have the
final say.

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index b251f39..56738ae 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -2811,33 +2811,6 @@ aarch64_is_noplt_call_p (rtx sym)
>    return false;
>  }
>  
> -/* Return true if the offsets to a zero/sign-extract operation
> -   represent an expression that matches an extend operation.  The
> -   operands represent the parameters from
> -
> -   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
> -bool
> -aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
> -				rtx extract_imm)
> -{
> -  HOST_WIDE_INT mult_val, extract_val;
> -
> -  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
> -    return false;
> -
> -  mult_val = INTVAL (mult_imm);
> -  extract_val = INTVAL (extract_imm);
> -
> -  if (extract_val > 8
> -      && extract_val < GET_MODE_BITSIZE (mode)
> -      && exact_log2 (extract_val & ~7) > 0
> -      && (extract_val & 7) <= 4
> -      && mult_val == (1 << (extract_val & 7)))
> -    return true;
> -
> -  return false;
> -}
> -
>  /* Emit an insn that's a simple single-set.  Both the operands must be
>     known to be valid.  */
>  inline static rtx_insn *
> @@ -8837,22 +8810,6 @@ aarch64_classify_index (struct aarch64_address_info *info, rtx x,
>        index = XEXP (XEXP (x, 0), 0);
>        shift = INTVAL (XEXP (x, 1));
>      }
> -  /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
> -  else if ((GET_CODE (x) == SIGN_EXTRACT
> -	    || GET_CODE (x) == ZERO_EXTRACT)
> -	   && GET_MODE (x) == DImode
> -	   && GET_CODE (XEXP (x, 0)) == MULT
> -	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
> -	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
> -    {
> -      type = (GET_CODE (x) == SIGN_EXTRACT)
> -	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
> -      index = XEXP (XEXP (x, 0), 0);
> -      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
> -      if (INTVAL (XEXP (x, 1)) != 32 + shift
> -	  || INTVAL (XEXP (x, 2)) != 0)
> -	shift = -1;
> -    }
>    /* (and:DI (mult:DI (reg:DI) (const_int scale))
>       (const_int 0xffffffff<<shift)) */
>    else if (GET_CODE (x) == AND
> @@ -8868,22 +8825,6 @@ aarch64_classify_index (struct aarch64_address_info *info, rtx x,
>        if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
>  	shift = -1;
>      }
> -  /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
> -  else if ((GET_CODE (x) == SIGN_EXTRACT
> -	    || GET_CODE (x) == ZERO_EXTRACT)
> -	   && GET_MODE (x) == DImode
> -	   && GET_CODE (XEXP (x, 0)) == ASHIFT
> -	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
> -	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
> -    {
> -      type = (GET_CODE (x) == SIGN_EXTRACT)
> -	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
> -      index = XEXP (XEXP (x, 0), 0);
> -      shift = INTVAL (XEXP (XEXP (x, 0), 1));
> -      if (INTVAL (XEXP (x, 1)) != 32 + shift
> -	  || INTVAL (XEXP (x, 2)) != 0)
> -	shift = -1;
> -    }
>    /* (and:DI (ashift:DI (reg:DI) (const_int shift))
>       (const_int 0xffffffff<<shift)) */
>    else if (GET_CODE (x) == AND
> @@ -11259,16 +11200,6 @@ aarch64_strip_extend (rtx x, bool strip_shift)
>    if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
>      return op;
>  
> -  /* Zero and sign extraction of a widened value.  */
> -  if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
> -      && XEXP (op, 2) == const0_rtx
> -      && GET_CODE (XEXP (op, 0)) == MULT
> -      && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
> -					 XEXP (op, 1)))
> -    return XEXP (XEXP (op, 0), 0);
> -
> -  /* It can also be represented (for zero-extend) as an AND with an
> -     immediate.  */
>    if (GET_CODE (op) == AND
>        && GET_CODE (XEXP (op, 0)) == MULT
>        && CONST_INT_P (XEXP (XEXP (op, 0), 1))
> @@ -11606,31 +11537,11 @@ aarch64_branch_cost (bool speed_p, bool predictable_p)
>  /* Return true if the RTX X in mode MODE is a zero or sign extract
>     usable in an ADD or SUB (extended register) instruction.  */
>  static bool
> -aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
> -{
> -  /* Catch add with a sign extract.
> -     This is add_<optab><mode>_multp2.  */
> -  if (GET_CODE (x) == SIGN_EXTRACT
> -      || GET_CODE (x) == ZERO_EXTRACT)
> -    {
> -      rtx op0 = XEXP (x, 0);
> -      rtx op1 = XEXP (x, 1);
> -      rtx op2 = XEXP (x, 2);
> -
> -      if (GET_CODE (op0) == MULT
> -	  && CONST_INT_P (op1)
> -	  && op2 == const0_rtx
> -	  && CONST_INT_P (XEXP (op0, 1))
> -	  && aarch64_is_extend_from_extract (mode,
> -					     XEXP (op0, 1),
> -					     op1))
> -	{
> -	  return true;
> -	}
> -    }
> +aarch64_rtx_arith_op_extract_p (rtx x)
> +{
>    /* The simple case <ARITH>, XD, XN, XM, [us]xt.
>       No shift.  */
> -  else if (GET_CODE (x) == SIGN_EXTEND
> +  if (GET_CODE (x) == SIGN_EXTEND
>  	   || GET_CODE (x) == ZERO_EXTEND)

Should reindent this line too.

>      return REG_P (XEXP (x, 0));
>  
> @@ -12319,7 +12230,7 @@ cost_minus:
>  
>  	/* Look for SUB (extended register).  */
>  	if (is_a <scalar_int_mode> (mode, &int_mode)
> -	    && aarch64_rtx_arith_op_extract_p (op1, int_mode))
> +	    && aarch64_rtx_arith_op_extract_p (op1))
>  	  {
>  	    if (speed)
>  	      *cost += extra_cost->alu.extend_arith;
> @@ -12399,7 +12310,7 @@ cost_plus:
>  
>  	/* Look for ADD (extended register).  */
>  	if (is_a <scalar_int_mode> (mode, &int_mode)
> -	    && aarch64_rtx_arith_op_extract_p (op0, int_mode))
> +	    && aarch64_rtx_arith_op_extract_p (op0))
>  	  {
>  	    if (speed)
>  	      *cost += extra_cost->alu.extend_arith;

int_mode is now unused in both hunks, so can just remove the “, &int_mode”s.

The aarch64 changes are OK with those (incredibly inconsequential)
comments fixed.

Thanks,
Richard
Segher Boessenkool Sept. 17, 2020, 7:40 p.m. UTC | #5
Hi!

On Thu, Sep 17, 2020 at 08:10:22AM +0100, Richard Sandiford wrote:
> Alex Coplan <alex.coplan@arm.com> writes:
> The combine parts LGTM otherwise, but Segher should have the
> final say.

I am doubtful this does not regress on many targets.

I'll test it, we'll see :-)


Segher
Alex Coplan Sept. 18, 2020, 12:57 p.m. UTC | #6
Hi Richard, Segher,

On 17/09/2020 08:10, Richard Sandiford wrote:
> Alex Coplan <alex.coplan@arm.com> writes:
> > Hi Richard,
> >
> > On 10/09/2020 19:18, Richard Sandiford wrote:
> >> Alex Coplan <alex.coplan@arm.com> writes:
> >> > Hello,
> >> >
> >> > Since r11-2903-g6b3034eaba83935d9f6dfb20d2efbdb34b5b00bf introduced a
> >> > canonicalization from mult to shift on address reloads, a missing
> >> > pattern in the AArch64 backend was exposed.
> >> >
> >> > In particular, we were missing the ashift variant of
> >> > *add_<optab><mode>_multp2 (this mult variant is redundant and was
> >> > removed in r11-3033-g2f8ae301f6a125f50b0a758047fcddae7b68daa8).
> >> >
> >> > This patch adds that missing pattern (fixing PR96998), updates
> >> > aarch64_is_extend_from_extract() to work for the shift pattern (instead
> >> > of the redundant mult variant) and updates callers in the cost
> >> > calculations to apply the costing for the shift variant instead.
> >> 
> >> Hmm.  I think we should instead fix this in combine.
> >
> > Ok, thanks for the review. Please find a revised patch attached which
> > fixes this in combine instead.
> 
> Thanks for doing this, looks like a really nice clean-up.
> 
> > @@ -7650,20 +7650,29 @@ make_extraction (machine_mode mode, rtx inner, HOST_WIDE_INT pos,
> >  	is_mode = GET_MODE (SUBREG_REG (inner));
> >        inner = SUBREG_REG (inner);
> >      }
> > -  else if (GET_CODE (inner) == ASHIFT
> > +  else if ((GET_CODE (inner) == ASHIFT || GET_CODE (inner) == MULT)
> >  	   && CONST_INT_P (XEXP (inner, 1))
> > -	   && pos_rtx == 0 && pos == 0
> > -	   && len > UINTVAL (XEXP (inner, 1)))
> > -    {
> > -      /* We're extracting the least significant bits of an rtx
> > -	 (ashift X (const_int C)), where LEN > C.  Extract the
> > -	 least significant (LEN - C) bits of X, giving an rtx
> > -	 whose mode is MODE, then shift it left C times.  */
> > -      new_rtx = make_extraction (mode, XEXP (inner, 0),
> > -			     0, 0, len - INTVAL (XEXP (inner, 1)),
> > -			     unsignedp, in_dest, in_compare);
> > -      if (new_rtx != 0)
> > -	return gen_rtx_ASHIFT (mode, new_rtx, XEXP (inner, 1));
> > +	   && pos_rtx == 0 && pos == 0)
> > +    {
> > +      const HOST_WIDE_INT ci = INTVAL (XEXP (inner, 1));
> > +      const bool mult = GET_CODE (inner) == MULT;
> > +      const int shift_amt = mult ? exact_log2 (ci) : ci;
> 
> Not going to be a problem in practice but: HOST_WIDE_INT would be better
> than int here, so that we don't truncate the value for ASHIFT before it
> has been tested.  Similarly:
> 
> > +
> > +      if (shift_amt > 0 && len > (unsigned)shift_amt)
> 
> unsigned HOST_WIDE_INT here.

Makes sense, thanks.
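
(A tiny standalone illustration of the truncation hazard, using a
deliberately bogus constant that wouldn't occur in practice:)

#include <stdio.h>

int
main (void)
{
  long long ci = 0x100000003LL; /* An out-of-range "shift amount".  */
  int narrowed = (int) ci;      /* Typically truncates to 3, which would
                                   then pass a range check like
                                   "shift_amt > 0 && len > shift_amt".  */
  long long kept = ci;          /* Keeps the full value, so the check
                                   rejects it.  */

  printf ("narrowed = %d, kept = %lld\n", narrowed, kept);
  return 0;
}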

> 
> > +	{
> > +	  /* We're extracting the least significant bits of an rtx
> > +	     (ashift X (const_int C)) or (mult X (const_int (2^C))),
> > +	     where LEN > C.  Extract the least significant (LEN - C) bits
> > +	     of X, giving an rtx whose mode is MODE, then shift it left
> > +	     C times.  */
> > +	  new_rtx = make_extraction (mode, XEXP (inner, 0),
> > +				 0, 0, len - shift_amt,
> > +				 unsignedp, in_dest, in_compare);
> > +	  if (new_rtx)
> > +	    return mult
> > +	    ? gen_rtx_MULT (mode, new_rtx, XEXP (inner, 1))
> > +	    : gen_rtx_ASHIFT (mode, new_rtx, XEXP (inner, 1));
> 
> Easier as gen_rtx_fmt_ee (GET_CODE (inner), mode, …);

Ah, I thought there must have been an easier way to do that!

> 
> The combine parts LGTM otherwise, but Segher should have the
> final say.

Great. FWIW, I tested the previous patch on aarch64, arm, and x86 and
there were no regressions there. I've attached a revised patch with
these fixes and that bootstraps/regtests fine on aarch64-none-linux-gnu.

<snip>

> The aarch64 changes are OK with those (incredibly inconsequential)
> comments fixed.

Great, thanks. I'll wait for Segher's verdict on the combine bits.

Alex

---

gcc/ChangeLog:

	* combine.c (expand_compound_operation): Tweak variable name in
	comment to match source.
	(make_extraction): Handle mult by power of two in addition to
	ashift.
	* config/aarch64/aarch64.c (aarch64_is_extend_from_extract): Delete.
	(aarch64_classify_index): Remove redundant extend-as-extract handling.
	(aarch64_strip_extend): Likewise.
	(aarch64_rtx_arith_op_extract_p): Likewise, remove now-unused
	parameter. Update callers...
	(aarch64_rtx_costs): ... here.

gcc/testsuite/ChangeLog:

	* gcc.c-torture/compile/pr96998.c: New test.
diff --git a/gcc/combine.c b/gcc/combine.c
index c88382efbd3..fe8eff2b464 100644
--- a/gcc/combine.c
+++ b/gcc/combine.c
@@ -7419,8 +7419,8 @@ expand_compound_operation (rtx x)
     }
 
   /* If we reach here, we want to return a pair of shifts.  The inner
-     shift is a left shift of BITSIZE - POS - LEN bits.  The outer
-     shift is a right shift of BITSIZE - LEN bits.  It is arithmetic or
+     shift is a left shift of MODEWIDTH - POS - LEN bits.  The outer
+     shift is a right shift of MODEWIDTH - LEN bits.  It is arithmetic or
      logical depending on the value of UNSIGNEDP.
 
      If this was a ZERO_EXTEND or ZERO_EXTRACT, this pair of shifts will be
@@ -7650,20 +7650,27 @@ make_extraction (machine_mode mode, rtx inner, HOST_WIDE_INT pos,
 	is_mode = GET_MODE (SUBREG_REG (inner));
       inner = SUBREG_REG (inner);
     }
-  else if (GET_CODE (inner) == ASHIFT
+  else if ((GET_CODE (inner) == ASHIFT || GET_CODE (inner) == MULT)
 	   && CONST_INT_P (XEXP (inner, 1))
-	   && pos_rtx == 0 && pos == 0
-	   && len > UINTVAL (XEXP (inner, 1)))
-    {
-      /* We're extracting the least significant bits of an rtx
-	 (ashift X (const_int C)), where LEN > C.  Extract the
-	 least significant (LEN - C) bits of X, giving an rtx
-	 whose mode is MODE, then shift it left C times.  */
-      new_rtx = make_extraction (mode, XEXP (inner, 0),
-			     0, 0, len - INTVAL (XEXP (inner, 1)),
-			     unsignedp, in_dest, in_compare);
-      if (new_rtx != 0)
-	return gen_rtx_ASHIFT (mode, new_rtx, XEXP (inner, 1));
+	   && pos_rtx == 0 && pos == 0)
+    {
+      const HOST_WIDE_INT ci = INTVAL (XEXP (inner, 1));
+      const auto code = GET_CODE (inner);
+      const HOST_WIDE_INT shift_amt = (code == MULT) ? exact_log2 (ci) : ci;
+
+      if (shift_amt > 0 && len > (unsigned HOST_WIDE_INT)shift_amt)
+	{
+	  /* We're extracting the least significant bits of an rtx
+	     (ashift X (const_int C)) or (mult X (const_int (2^C))),
+	     where LEN > C.  Extract the least significant (LEN - C) bits
+	     of X, giving an rtx whose mode is MODE, then shift it left
+	     C times.  */
+	  new_rtx = make_extraction (mode, XEXP (inner, 0),
+				 0, 0, len - shift_amt,
+				 unsignedp, in_dest, in_compare);
+	  if (new_rtx)
+	    return gen_rtx_fmt_ee (code, mode, new_rtx, XEXP (inner, 1));
+	}
     }
   else if (GET_CODE (inner) == TRUNCATE
 	   /* If trying or potentionally trying to extract
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b251f3947e2..8bb7c310a77 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2811,33 +2811,6 @@ aarch64_is_noplt_call_p (rtx sym)
   return false;
 }
 
-/* Return true if the offsets to a zero/sign-extract operation
-   represent an expression that matches an extend operation.  The
-   operands represent the parameters from
-
-   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
-bool
-aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
-				rtx extract_imm)
-{
-  HOST_WIDE_INT mult_val, extract_val;
-
-  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
-    return false;
-
-  mult_val = INTVAL (mult_imm);
-  extract_val = INTVAL (extract_imm);
-
-  if (extract_val > 8
-      && extract_val < GET_MODE_BITSIZE (mode)
-      && exact_log2 (extract_val & ~7) > 0
-      && (extract_val & 7) <= 4
-      && mult_val == (1 << (extract_val & 7)))
-    return true;
-
-  return false;
-}
-
 /* Emit an insn that's a simple single-set.  Both the operands must be
    known to be valid.  */
 inline static rtx_insn *
@@ -8837,22 +8810,6 @@ aarch64_classify_index (struct aarch64_address_info *info, rtx x,
       index = XEXP (XEXP (x, 0), 0);
       shift = INTVAL (XEXP (x, 1));
     }
-  /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
-  else if ((GET_CODE (x) == SIGN_EXTRACT
-	    || GET_CODE (x) == ZERO_EXTRACT)
-	   && GET_MODE (x) == DImode
-	   && GET_CODE (XEXP (x, 0)) == MULT
-	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
-	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
-    {
-      type = (GET_CODE (x) == SIGN_EXTRACT)
-	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
-      index = XEXP (XEXP (x, 0), 0);
-      shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
-      if (INTVAL (XEXP (x, 1)) != 32 + shift
-	  || INTVAL (XEXP (x, 2)) != 0)
-	shift = -1;
-    }
   /* (and:DI (mult:DI (reg:DI) (const_int scale))
      (const_int 0xffffffff<<shift)) */
   else if (GET_CODE (x) == AND
@@ -8868,22 +8825,6 @@ aarch64_classify_index (struct aarch64_address_info *info, rtx x,
       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
 	shift = -1;
     }
-  /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
-  else if ((GET_CODE (x) == SIGN_EXTRACT
-	    || GET_CODE (x) == ZERO_EXTRACT)
-	   && GET_MODE (x) == DImode
-	   && GET_CODE (XEXP (x, 0)) == ASHIFT
-	   && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
-	   && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
-    {
-      type = (GET_CODE (x) == SIGN_EXTRACT)
-	? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
-      index = XEXP (XEXP (x, 0), 0);
-      shift = INTVAL (XEXP (XEXP (x, 0), 1));
-      if (INTVAL (XEXP (x, 1)) != 32 + shift
-	  || INTVAL (XEXP (x, 2)) != 0)
-	shift = -1;
-    }
   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
      (const_int 0xffffffff<<shift)) */
   else if (GET_CODE (x) == AND
@@ -11259,16 +11200,6 @@ aarch64_strip_extend (rtx x, bool strip_shift)
   if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
     return op;
 
-  /* Zero and sign extraction of a widened value.  */
-  if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
-      && XEXP (op, 2) == const0_rtx
-      && GET_CODE (XEXP (op, 0)) == MULT
-      && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
-					 XEXP (op, 1)))
-    return XEXP (XEXP (op, 0), 0);
-
-  /* It can also be represented (for zero-extend) as an AND with an
-     immediate.  */
   if (GET_CODE (op) == AND
       && GET_CODE (XEXP (op, 0)) == MULT
       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
@@ -11606,32 +11537,12 @@ aarch64_branch_cost (bool speed_p, bool predictable_p)
 /* Return true if the RTX X in mode MODE is a zero or sign extract
    usable in an ADD or SUB (extended register) instruction.  */
 static bool
-aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
-{
-  /* Catch add with a sign extract.
-     This is add_<optab><mode>_multp2.  */
-  if (GET_CODE (x) == SIGN_EXTRACT
-      || GET_CODE (x) == ZERO_EXTRACT)
-    {
-      rtx op0 = XEXP (x, 0);
-      rtx op1 = XEXP (x, 1);
-      rtx op2 = XEXP (x, 2);
-
-      if (GET_CODE (op0) == MULT
-	  && CONST_INT_P (op1)
-	  && op2 == const0_rtx
-	  && CONST_INT_P (XEXP (op0, 1))
-	  && aarch64_is_extend_from_extract (mode,
-					     XEXP (op0, 1),
-					     op1))
-	{
-	  return true;
-	}
-    }
+aarch64_rtx_arith_op_extract_p (rtx x)
+{
   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
      No shift.  */
-  else if (GET_CODE (x) == SIGN_EXTEND
-	   || GET_CODE (x) == ZERO_EXTEND)
+  if (GET_CODE (x) == SIGN_EXTEND
+      || GET_CODE (x) == ZERO_EXTEND)
     return REG_P (XEXP (x, 0));
 
   return false;
@@ -12318,8 +12229,8 @@ cost_minus:
 	  }
 
 	/* Look for SUB (extended register).  */
-	if (is_a <scalar_int_mode> (mode, &int_mode)
-	    && aarch64_rtx_arith_op_extract_p (op1, int_mode))
+	if (is_a <scalar_int_mode> (mode)
+	    && aarch64_rtx_arith_op_extract_p (op1))
 	  {
 	    if (speed)
 	      *cost += extra_cost->alu.extend_arith;
@@ -12398,8 +12309,8 @@ cost_plus:
 	*cost += rtx_cost (op1, mode, PLUS, 1, speed);
 
 	/* Look for ADD (extended register).  */
-	if (is_a <scalar_int_mode> (mode, &int_mode)
-	    && aarch64_rtx_arith_op_extract_p (op0, int_mode))
+	if (is_a <scalar_int_mode> (mode)
+	    && aarch64_rtx_arith_op_extract_p (op0))
 	  {
 	    if (speed)
 	      *cost += extra_cost->alu.extend_arith;
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr96998.c b/gcc/testsuite/gcc.c-torture/compile/pr96998.c
new file mode 100644
index 00000000000..a75d5dcfe08
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/compile/pr96998.c
@@ -0,0 +1,15 @@
+/* { dg-do compile { target arm*-*-* aarch64*-*-* } } */
+
+int h(void);
+struct c d;
+struct c {
+  int e[1];
+};
+
+void f(void) {
+  int g;
+  for (;; g = h()) {
+    int *i = &d.e[g];
+    asm("" : "=Q"(*i));
+  }
+}
Segher Boessenkool Sept. 21, 2020, 11:35 p.m. UTC | #7
Hi!

So, I tested this patch.  The test builds Linux for all targets, and the
number reported here is just binary size (usually a good indicator for
combine effectiveness).  C0 is the unmodified compiler, C1 is with your
patch.  A size of 0 means it did not build.

                    C0        C1
       alpha   6403469  100.000%
         arc         0         0
         arm  10196358  100.000%
       arm64         0  20228766
       armhf  15042594  100.000%
         c6x   2496218  100.000%
        csky         0         0
       h8300   1217198  100.000%
        i386  11966700  100.000%
        ia64  18814277  100.000%
        m68k   3856350  100.000%
  microblaze   5864258  100.000%
        mips   9142108  100.000%
      mips64   7344744  100.000%
       nds32         0         0
       nios2   3909477  100.000%
    openrisc   4554446  100.000%
      parisc   7721195  100.000%
    parisc64         0         0
     powerpc  10447477  100.000%
   powerpc64  22257111  100.000%
 powerpc64le  19292786  100.000%
     riscv32   1630934  100.000%
     riscv64   7628058  100.000%
        s390  15173928  100.000%
          sh   3410671  100.000%
     shnommu   1685616  100.000%
       sparc   4737096  100.000%
     sparc64   7167122  100.000%
      x86_64  19718928  100.000%
      xtensa   2639363  100.000%

So, there is no difference for most targets (I checked some targets and
there really is no difference).  The only exception is aarch64 (which
the kernel calls "arm64"): the unpatched compiler ICEs!  (At least three
times, even).

during RTL pass: reload
/home/segher/src/kernel/kernel/cgroup/cgroup.c: In function 'rebind_subsystems':
/home/segher/src/kernel/kernel/cgroup/cgroup.c:1777:1: internal compiler error: in lra_set_insn_recog_data, at lra.c:1004
 1777 | }
      | ^
0x1096215f lra_set_insn_recog_data(rtx_insn*)
        /home/segher/src/gcc/gcc/lra.c:1004
0x109625d7 lra_get_insn_recog_data
        /home/segher/src/gcc/gcc/lra-int.h:488
0x109625d7 lra_update_insn_regno_info(rtx_insn*)
        /home/segher/src/gcc/gcc/lra.c:1625
0x10962d03 lra_update_insn_regno_info(rtx_insn*)
        /home/segher/src/gcc/gcc/lra.c:1623
0x10962d03 lra_push_insn_1
        /home/segher/src/gcc/gcc/lra.c:1780
[etc]

This means LRA found an unrecognised insn; and that insn is

(insn 817 804 818 21 (set (reg:DI 324)
        (sign_extract:DI (ashift:DI (subreg:DI (reg:SI 232) 0)
                (const_int 3 [0x3]))
            (const_int 35 [0x23])
            (const_int 0 [0]))) "/home/segher/src/kernel/kernel/cgroup/cgroup.c":1747:3 -1
     (nil))

LRA created that as a reload for

(insn 347 819 348 21 (parallel [
            (set (mem/f:DI (reg:DI 324) [233 *__p_84+0 S8 A64])
                (asm_operands/v:DI ("stlr %1, %0") ("=Q") 0 [
                        (reg:DI 325 [orig:106 prephitmp_18 ] [106])
                    ]
                     [
                        (asm_input:DI ("r") /home/segher/src/kernel/kernel/cgroup/cgroup.c:1747)
                    ]
                     [] /home/segher/src/kernel/kernel/cgroup/cgroup.c:1747))
            (clobber (mem:BLK (scratch) [0  A8]))
        ]) "/home/segher/src/kernel/kernel/cgroup/cgroup.c":1747:3 -1
     (expr_list:REG_DEAD (reg:SI 232)
        (expr_list:REG_DEAD (reg:DI 106 [ prephitmp_18 ])
            (nil))))

as

  347: {[r324:DI]=asm_operands;clobber [scratch];}
      REG_DEAD r232:SI
      REG_DEAD r106:DI
    Inserting insn reload before:
  817: r324:DI=sign_extract(r232:SI#0<<0x3,0x23,0)
  818: r324:DI=r324:DI+r284:DI
  819: r325:DI=r106:DI

(and then it died).


Can you fix this first?  There probably is something target-specific
wrong related to zero_extract.


Segher
Alex Coplan Sept. 22, 2020, 7:40 a.m. UTC | #8
Hi Segher,

On 21/09/2020 18:35, Segher Boessenkool wrote:
> Hi!
> 
> So, I tested this patch.  The test builds Linux for all targets, and the
> number reported here is just binary size (usually a good indicator for
> combine effectiveness).  C0 is the unmodified compiler, C1 is with your
> patch.  A size of 0 means it did not build.
> 
>                     C0        C1
>        alpha   6403469  100.000%
>          arc         0         0
>          arm  10196358  100.000%
>        arm64         0  20228766
>        armhf  15042594  100.000%
>          c6x   2496218  100.000%
>         csky         0         0
>        h8300   1217198  100.000%
>         i386  11966700  100.000%
>         ia64  18814277  100.000%
>         m68k   3856350  100.000%
>   microblaze   5864258  100.000%
>         mips   9142108  100.000%
>       mips64   7344744  100.000%
>        nds32         0         0
>        nios2   3909477  100.000%
>     openrisc   4554446  100.000%
>       parisc   7721195  100.000%
>     parisc64         0         0
>      powerpc  10447477  100.000%
>    powerpc64  22257111  100.000%
>  powerpc64le  19292786  100.000%
>      riscv32   1630934  100.000%
>      riscv64   7628058  100.000%
>         s390  15173928  100.000%
>           sh   3410671  100.000%
>      shnommu   1685616  100.000%
>        sparc   4737096  100.000%
>      sparc64   7167122  100.000%
>       x86_64  19718928  100.000%
>       xtensa   2639363  100.000%

Thanks for doing this testing. The results look good, then: no code size
changes and no build regressions.

> 
> So, there is no difference for most targets (I checked some targets and
> there really is no difference).  The only exception is aarch64 (which
> the kernel calls "arm64"): the unpatched compiler ICEs!  (At least three
> times, even).

Indeed, this is the intended purpose of the patch, see the PR (96998).

> 
> during RTL pass: reload
> /home/segher/src/kernel/kernel/cgroup/cgroup.c: In function 'rebind_subsystems':
> /home/segher/src/kernel/kernel/cgroup/cgroup.c:1777:1: internal compiler error: in lra_set_insn_recog_data, at lra.c:1004
>  1777 | }
>       | ^
> 0x1096215f lra_set_insn_recog_data(rtx_insn*)
>         /home/segher/src/gcc/gcc/lra.c:1004
> 0x109625d7 lra_get_insn_recog_data
>         /home/segher/src/gcc/gcc/lra-int.h:488
> 0x109625d7 lra_update_insn_regno_info(rtx_insn*)
>         /home/segher/src/gcc/gcc/lra.c:1625
> 0x10962d03 lra_update_insn_regno_info(rtx_insn*)
>         /home/segher/src/gcc/gcc/lra.c:1623
> 0x10962d03 lra_push_insn_1
>         /home/segher/src/gcc/gcc/lra.c:1780
> [etc]
> 
> This means LRA found an unrecognised insn; and that insn is
> 
> (insn 817 804 818 21 (set (reg:DI 324)
>         (sign_extract:DI (ashift:DI (subreg:DI (reg:SI 232) 0)
>                 (const_int 3 [0x3]))
>             (const_int 35 [0x23])
>             (const_int 0 [0]))) "/home/segher/src/kernel/kernel/cgroup/cgroup.c":1747:3 -1
>      (nil))
> 
> LRA created that as a reload for
> 
> (insn 347 819 348 21 (parallel [
>             (set (mem/f:DI (reg:DI 324) [233 *__p_84+0 S8 A64])
>                 (asm_operands/v:DI ("stlr %1, %0") ("=Q") 0 [
>                         (reg:DI 325 [orig:106 prephitmp_18 ] [106])
>                     ]
>                      [
>                         (asm_input:DI ("r") /home/segher/src/kernel/kernel/cgroup/cgroup.c:1747)
>                     ]
>                      [] /home/segher/src/kernel/kernel/cgroup/cgroup.c:1747))
>             (clobber (mem:BLK (scratch) [0  A8]))
>         ]) "/home/segher/src/kernel/kernel/cgroup/cgroup.c":1747:3 -1
>      (expr_list:REG_DEAD (reg:SI 232)
>         (expr_list:REG_DEAD (reg:DI 106 [ prephitmp_18 ])
>             (nil))))
> 
> as
> 
>   347: {[r324:DI]=asm_operands;clobber [scratch];}
>       REG_DEAD r232:SI
>       REG_DEAD r106:DI
>     Inserting insn reload before:
>   817: r324:DI=sign_extract(r232:SI#0<<0x3,0x23,0)
>   818: r324:DI=r324:DI+r284:DI
>   819: r325:DI=r106:DI
> 
> (and then it died).
> 
> 
> Can you fix this first?  There probably is something target-specific
> wrong related to zero_extract.

The intent is to fix this in combine here. See the earlier replies in
this thread.

> 
> 
> Segher

Thanks,
Alex
Segher Boessenkool Sept. 22, 2020, 3:26 p.m. UTC | #9
Hi Alex,

On Tue, Sep 22, 2020 at 08:40:07AM +0100, Alex Coplan wrote:
> On 21/09/2020 18:35, Segher Boessenkool wrote:
> Thanks for doing this testing. The results look good, then: no code size
> changes and no build regressions.

No *code* changes.  I cannot test aarch64 like this.

> > So, there is no difference for most targets (I checked some targets and
> > there really is no difference).  The only exception is aarch64 (which
> > the kernel calls "arm64"): the unpatched compiler ICEs!  (At least three
> > times, even).
> 
> Indeed, this is the intended purpose of the patch, see the PR (96998).

You want to fix an ICE in LRA caused by an instruction created by LRA,
with a patch to combine?!  That doesn't sound right.

If what you want to do is a) fix the backend bug, and then b) get some
extra performance, then do *that*, and keep the patches separate.


> > Can you fix this first?  There probably is something target-specific
> > wrong related to zero_extract.
> 
> The intent is to fix this in combine here. See the earlier replies in
> this thread.

But that is logically impossible.  The infringing insn does not *exist*
yet during combine.


Segher
Richard Sandiford Sept. 22, 2020, 4:08 p.m. UTC | #10
Segher Boessenkool <segher@kernel.crashing.org> writes:
> Hi Alex,
>
> On Tue, Sep 22, 2020 at 08:40:07AM +0100, Alex Coplan wrote:
>> On 21/09/2020 18:35, Segher Boessenkool wrote:
>> Thanks for doing this testing. The results look good, then: no code size
>> changes and no build regressions.
>
> No *code* changes.  I cannot test aarch64 likme this.
>
>> > So, there is no difference for most targets (I checked some targets and
>> > there really is no difference).  The only exception is aarch64 (which
>> > the kernel calls "arm64"): the unpatched compiler ICEs!  (At least three
>> > times, even).
>> 
>> Indeed, this is the intended purpose of the patch, see the PR (96998).
>
> You want to fix a ICE in LRA caused by an instruction created by LRA,
> with a patch to combine?!  That doesn't sound right.
>
> If what you want to do is a) fix the backend bug, and then b) get some
> extra performance, then do *that*, and keep the patches separate.

This patch isn't supposed to be a performance optimisation.  It's supposed
to be a canonicalisation improvement.

The situation as things stand is that aarch64 has a bug: it accepts
an odd sign_extract representation of addresses, but doesn't accept
that same odd form of address as an LEA.  We have two options:

(a) add back instructions that recognise the odd form of LEA, or
(b) remove the code that accepts the odd addresses

I think (b) is the way to go here.  But doing that on its own
would regress code quality.  The reason we recognised the odd
addresses in the first place was because that was the rtl that
combine happened to generate for an important case.

Normal operating procedure is to add patterns and address-matching
code that accepts whatever combine happens to throw at the target,
regardless of how sensible the representation is.  But sometimes I think
we should instead think about whether the representation that combine is
using is the right one.  And IMO this is one such case.

At the moment combine does this:

Trying 8 -> 9:
    8: r98:DI=sign_extend(r92:SI)
      REG_DEAD r92:SI
    9: [r98:DI*0x4+r96:DI]=asm_operands
      REG_DEAD r98:DI
Failed to match this instruction:
(set (mem:SI (plus:DI (sign_extract:DI (mult:DI (subreg:DI (reg/v:SI 92 [ g ]) 0)
                    (const_int 4 [0x4]))
                (const_int 34 [0x22])
                (const_int 0 [0]))
            (reg/f:DI 96)) [3 *i_5+0 S4 A32])
    (asm_operands:SI ("") ("=Q") 0 []
         []
         [] /tmp/foo.c:13))
allowing combination of insns 8 and 9
original costs 4 + 4 = 8
replacement cost 4

and so that's one of the forms that the aarch64 address code accepts.
But a natural substitution would simply replace r98 with the rhs of
the set:

  (set (mem:SI (plus:DI (mult:DI (sign_extend:DI (reg/v:SI 92))
                                 (const_int 4))
                        (reg:DI 96)))
       ...)

The only reason we don't do that is because the substitution
and simplification go through the expand_compound_operation/
make_compound_operation process.

The corresponding (ashift ... (const_int 2)) *does* end up using
the natural sign_extend form rather than sign_extract form.
The only reason we get this (IMO) weird behaviour for mult is
the rule that shifts have to be mults in addresses.  Some code
(like the code being patched) instead expects ashift to be the
canonical form in all situations.

If we make the substitution work “naturally” for mult as well as
ashift, we can remove the addressing-matching code that has no
corresponding LEA pattern, and make the aarch64 address code
self-consistent that way instead.

Thanks,
Richard
Alex Coplan Sept. 29, 2020, 10:36 a.m. UTC | #11
Hi Segher,

Gentle ping.

Is the combine change (a canonicalization fix, as described below) OK
for trunk in light of this info?

On 22/09/2020 17:08, Richard Sandiford wrote:
> Segher Boessenkool <segher@kernel.crashing.org> writes:
> > Hi Alex,
> >
> > On Tue, Sep 22, 2020 at 08:40:07AM +0100, Alex Coplan wrote:
> >> On 21/09/2020 18:35, Segher Boessenkool wrote:
> >> Thanks for doing this testing. The results look good, then: no code size
> >> changes and no build regressions.
> >
> > No *code* changes.  I cannot test aarch64 like this.
> >
> >> > So, there is no difference for most targets (I checked some targets and
> >> > there really is no difference).  The only exception is aarch64 (which
> >> > the kernel calls "arm64"): the unpatched compiler ICEs!  (At least three
> >> > times, even).
> >> 
> >> Indeed, this is the intended purpose of the patch, see the PR (96998).
> >
> > You want to fix an ICE in LRA caused by an instruction created by LRA,
> > with a patch to combine?!  That doesn't sound right.
> >
> > If what you want to do is a) fix the backend bug, and then b) get some
> > extra performance, then do *that*, and keep the patches separate.
> 
> This patch isn't supposed to be a performance optimisation.  It's supposed
> to be a canonicalisation improvement.
> 
> The situation as things stand is that aarch64 has a bug: it accepts
> an odd sign_extract representation of addresses, but doesn't accept
> that same odd form of address as an LEA.  We have two options:
> 
> (a) add back instructions that recognise the odd form of LEA, or
> (b) remove the code that accepts the odd addresses
> 
> I think (b) is the way to go here.  But doing that on its own
> would regress code quality.  The reason we recognised the odd
> addresses in the first place was because that was the rtl that
> combine happened to generate for an important case.
> 
> Normal operating procedure is to add patterns and address-matching
> code that accepts whatever combine happens to throw at the target,
> regardless of how sensible the representation is.  But sometimes I think
> we should instead think about whether the representation that combine is
> using is the right one.  And IMO this is one such case.
> 
> At the moment combine does this:
> 
> Trying 8 -> 9:
>     8: r98:DI=sign_extend(r92:SI)
>       REG_DEAD r92:SI
>     9: [r98:DI*0x4+r96:DI]=asm_operands
>       REG_DEAD r98:DI
> Failed to match this instruction:
> (set (mem:SI (plus:DI (sign_extract:DI (mult:DI (subreg:DI (reg/v:SI 92 [ g ]) 0)
>                     (const_int 4 [0x4]))
>                 (const_int 34 [0x22])
>                 (const_int 0 [0]))
>             (reg/f:DI 96)) [3 *i_5+0 S4 A32])
>     (asm_operands:SI ("") ("=Q") 0 []
>          []
>          [] /tmp/foo.c:13))
> allowing combination of insns 8 and 9
> original costs 4 + 4 = 8
> replacement cost 4
> 
> and so that's one of the forms that the aarch64 address code accepts.
> But a natural substitution would simply replace r98 with the rhs of
> the set:
> 
>   (set (mem:SI (plus:DI (mult:DI (sign_extend:DI (reg/v:SI 92))
>                                  (const_int 4))
>                         (reg:DI 96)))
>        ...)
> 
> The only reason we don't do that is because the substitution
> and simplification go through the expand_compound_operation/
> make_compound_operation process.
> 
> The corresponding (ashift ... (const_int 2)) *does* end up using
> the natural sign_extend form rather than sign_extract form.
> The only reason we get this (IMO) weird behaviour for mult is
> the rule that shifts have to be mults in addresses.  Some code
> (like the code being patched) instead expects ashift to be the
> canonical form in all situations.
> 
> If we make the substitution work “naturally” for mult as well as
> ashift, we can remove the addressing-matching code that has no
> corresponding LEA pattern, and make the aarch64 address code
> self-consistent that way instead.
> 
> Thanks,
> Richard

Thanks,
Alex
Segher Boessenkool Sept. 29, 2020, 7:20 p.m. UTC | #12
On Tue, Sep 29, 2020 at 11:36:12AM +0100, Alex Coplan wrote:
> Is the combine change (a canonicalization fix, as described below) OK
> for trunk in light of this info?

Can you please resend it with correct info and a corresponding commit
message?


Segher
Alex Coplan Sept. 30, 2020, 10:41 a.m. UTC | #13
On 29/09/2020 14:20, Segher Boessenkool wrote:
> On Tue, Sep 29, 2020 at 11:36:12AM +0100, Alex Coplan wrote:
> > Is the combine change (a canonicalization fix, as described below) OK
> > for trunk in light of this info?
> 
> Can you please resend it with correct info and a corresponding commit
> message?

Sure. I've sent just the combine patch with a proposed commit message:

https://gcc.gnu.org/pipermail/gcc-patches/2020-September/555158.html

Thanks,
Alex
diff mbox series

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b6d74496cd0..55e0fc4e683 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -2815,27 +2815,24 @@  aarch64_is_noplt_call_p (rtx sym)
    represent an expression that matches an extend operation.  The
    operands represent the parameters from
 
-   (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
+   (extract:MODE (ashift (reg) (SHIFT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
 bool
-aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
+aarch64_is_extend_from_extract (scalar_int_mode mode, rtx shift_imm,
 				rtx extract_imm)
 {
-  HOST_WIDE_INT mult_val, extract_val;
+  HOST_WIDE_INT shift_val, extract_val;
 
-  if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
+  if (! CONST_INT_P (shift_imm) || ! CONST_INT_P (extract_imm))
     return false;
 
-  mult_val = INTVAL (mult_imm);
+  shift_val = INTVAL (shift_imm);
   extract_val = INTVAL (extract_imm);
 
-  if (extract_val > 8
-      && extract_val < GET_MODE_BITSIZE (mode)
-      && exact_log2 (extract_val & ~7) > 0
-      && (extract_val & 7) <= 4
-      && mult_val == (1 << (extract_val & 7)))
-    return true;
-
-  return false;
+  return extract_val > 8
+	  && extract_val < GET_MODE_BITSIZE (mode)
+	  && exact_log2 (extract_val & ~7) > 0
+	  && shift_val <= 4
+	  && shift_val == (extract_val & 7);
 }
 
 /* Emit an insn that's a simple single-set.  Both the operands must be
@@ -11262,7 +11259,7 @@  aarch64_strip_extend (rtx x, bool strip_shift)
   /* Zero and sign extraction of a widened value.  */
   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
       && XEXP (op, 2) == const0_rtx
-      && GET_CODE (XEXP (op, 0)) == MULT
+      && GET_CODE (XEXP (op, 0)) == ASHIFT
       && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
 					 XEXP (op, 1)))
     return XEXP (XEXP (op, 0), 0);
@@ -11617,7 +11614,7 @@  aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
       rtx op1 = XEXP (x, 1);
       rtx op2 = XEXP (x, 2);
 
-      if (GET_CODE (op0) == MULT
+      if (GET_CODE (op0) == ASHIFT
 	  && CONST_INT_P (op1)
 	  && op2 == const0_rtx
 	  && CONST_INT_P (XEXP (op0, 1))
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index dbc6b1db176..4bb7b318b99 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -2471,6 +2471,19 @@  (define_insn "*add_<optab><ALLX:mode>_<GPI:mode>"
   [(set_attr "type" "alu_ext")]
 )
 
+(define_insn "*add_<optab><mode>_shftex"
+  [(set (match_operand:GPI 0 "register_operand" "=rk")
+	(plus:GPI (ANY_EXTRACT:GPI
+		    (ashift:GPI (match_operand:GPI 1 "register_operand" "r")
+				(match_operand 2 "aarch64_shift_imm_<mode>" "n"))
+		    (match_operand 3 "const_int_operand" "n")
+		    (const_int 0))
+		  (match_operand:GPI 4 "register_operand" "r")))]
+  "aarch64_is_extend_from_extract (<MODE>mode, operands[2], operands[3])"
+  "add\\t%<w>0, %<w>4, %<w>1, <su>xt%e3 %2"
+  [(set_attr "type" "alu_ext")]
+)
+
 ;; zero_extend version of above
 (define_insn "*add_<optab><SHORT:mode>_si_uxtw"
   [(set (match_operand:DI 0 "register_operand" "=rk")
diff --git a/gcc/testsuite/gcc.c-torture/compile/pr96998.c b/gcc/testsuite/gcc.c-torture/compile/pr96998.c
new file mode 100644
index 00000000000..a75d5dcfe08
--- /dev/null
+++ b/gcc/testsuite/gcc.c-torture/compile/pr96998.c
@@ -0,0 +1,15 @@ 
+/* { dg-do compile { target arm*-*-* aarch64*-*-* } } */
+
+int h(void);
+struct c d;
+struct c {
+  int e[1];
+};
+
+void f(void) {
+  int g;
+  for (;; g = h()) {
+    int *i = &d.e[g];
+    asm("" : "=Q"(*i));
+  }
+}
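
For context (and not part of the patch): the instruction the new
define_insn emits is the extended-register form of add, something like
"add x0, x0, w1, sxtw 2" for a sign-extended 32-bit index scaled by 4.
The plain-C shape of that address computation is below (illustrative
only; a straightforward function like this is normally handled by the
existing sign_extend patterns, whereas the new extract pattern is only
needed for the reloaded addresses the testcase above provokes):

/* Illustrative sketch, not part of the testsuite.  */
int *
element_addr (int *base, int idx)
{
  /* base + ((long long) idx << 2), i.e. base plus a sign-extended
     32-bit index scaled by the element size.  */
  return &base[idx];
}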