
[AARCH64] Bad code-gen for structure/block/unaligned memory access

Message ID VI1PR0801MB20313B74A43EAC4756E8C8BDFFC80@VI1PR0801MB2031.eurprd08.prod.outlook.com
State New

Commit Message

Tamar Christina June 7, 2017, 11:38 a.m. UTC
Hi All, 


This patch allows larger bitsizes to be used as the copy size
when the target does not have SLOW_UNALIGNED_ACCESS.

It also provides an optimized routine for MEM to REG
copying which avoids reconstructing the value piecewise on the stack
and instead uses a combination of shifts and ORs.
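
Conceptually, for the 3-byte case shown second below, the new routine does
the equivalent of the following C (an illustrative sketch only, assuming a
little-endian target; the expander works on RTL and the function name is
made up):

	#include <stdint.h>
	#include <string.h>

	/* Load a 2-byte and a 1-byte chunk and merge them with a shift and
	   an OR, instead of inserting each byte through a stack slot.  */
	static uint32_t
	load_3_bytes (const unsigned char *p)
	{
	  uint16_t lo;
	  memcpy (&lo, p, sizeof lo);	/* 2-byte chunk (ldrh).  */
	  uint32_t hi = p[2];		/* 1-byte chunk (ldrb).  */
	  return lo | (hi << 16);	/* merge with shift + OR.  */
	}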

This now generates

	adrp	x0, .LANCHOR0
	add	x0, x0, :lo12:.LANCHOR0
	sub	sp, sp, #16
	ldr	w1, [x0, 120]
	str	w1, [sp, 8]
	ldr	x0, [x0, 112]
	ldr	x1, [sp, 8]
	add	sp, sp, 16

instead of:

	adrp	x3, .LANCHOR0
	add	x3, x3, :lo12:.LANCHOR0
	mov	x0, 0
	mov	x1, 0
	sub	sp, sp, #16
	ldr	x2, [x3, 112]
	ldr	w3, [x3, 120]
	add	sp, sp, 16
	ubfx	x5, x2, 8, 8
	bfi	x0, x2, 0, 8
	ubfx	x4, x2, 16, 8
	lsr	w9, w2, 24
	bfi	x0, x5, 8, 8
	ubfx	x7, x2, 32, 8
	ubfx	x5, x2, 40, 8
	ubfx	x8, x3, 8, 8
	bfi	x0, x4, 16, 8
	bfi	x1, x3, 0, 8
	ubfx	x4, x2, 48, 8
	ubfx	x6, x3, 16, 8
	bfi	x0, x9, 24, 8
	bfi	x1, x8, 8, 8
	lsr	x2, x2, 56
	lsr	w3, w3, 24
	bfi	x0, x7, 32, 8
	bfi	x1, x6, 16, 8
	bfi	x0, x5, 40, 8
	bfi	x1, x3, 24, 8
	bfi	x0, x4, 48, 8
	bfi	x0, x2, 56, 8

to load a struct of 12 1-byte elements.

and

	adrp	x0, .LANCHOR0
	add	x0, x0, :lo12:.LANCHOR0
	sub	sp, sp, #16
	ldrb	w1, [x0, 18]
	ldrh	w0, [x0, 16]
	orr	w0, w0, w1, lsr 16
	str	w0, [sp, 8]
	add	sp, sp, 16

instead of

	adrp	x2, .LANCHOR0
	add	x2, x2, :lo12:.LANCHOR0
	mov	x0, 0
	sub	sp, sp, #16
	ldrh	w1, [x2, 16]
	ldrb	w2, [x2, 18]
	add	sp, sp, 16
	bfi	x0, x1, 0, 8
	ubfx	x1, x1, 8, 8
	bfi	x0, x1, 8, 8
	bfi	x0, x2, 16, 8

These changes only have an effect on structures smaller than 16 bytes.
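
A reproducer for the first case might look like the following (hypothetical;
the struct and function names are illustrative, not taken from the testsuite):

	/* A 12-byte all-char struct returned by value: a BLKmode copy
	   into the return registers x0/x1.  */
	struct struct12 { char a[12]; };
	struct struct12 L12;

	struct struct12
	Fun12 (void)
	{
	  return L12;
	}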

The remaining stack stores come from an existing, incomplete data-flow
analysis which thinks the value on the stack is still being used and
therefore doesn't mark it as dead.

Regression tested on aarch64-none-linux-gnu and x86_64-pc-linux-gnu with no regressions.

OK for trunk?

Thanks,
Tamar


gcc/
2017-06-07  Tamar Christina  <tamar.christina@arm.com>

	* expr.c (copy_blkmode_to_reg): Fix bitsize for targets
	with fast unaligned access.
	* config/aarch64/aarch64.c (aarch64_expand_movmem):
	Add MEM to REG optimized case.

Comments

Richard Sandiford June 8, 2017, 8:01 a.m. UTC | #1
Tamar Christina <Tamar.Christina@arm.com> writes:
> Hi All, 
>
> This patch allows larger bitsizes to be used as the copy size
> when the target does not have SLOW_UNALIGNED_ACCESS.
>
> It also provides an optimized routine for MEM to REG
> copying which avoids reconstructing the value piecewise on the stack
> and instead uses a combination of shifts and ORs.
>
> This now generates
>
> 	adrp	x0, .LANCHOR0
> 	add	x0, x0, :lo12:.LANCHOR0
> 	sub	sp, sp, #16
> 	ldr	w1, [x0, 120]
> 	str	w1, [sp, 8]
> 	ldr	x0, [x0, 112]
> 	ldr	x1, [sp, 8]
> 	add	sp, sp, 16
>
> instead of:
>
> 	adrp	x3, .LANCHOR0
> 	add	x3, x3, :lo12:.LANCHOR0
> 	mov	x0, 0
> 	mov	x1, 0
> 	sub	sp, sp, #16
> 	ldr	x2, [x3, 112]
> 	ldr	w3, [x3, 120]
> 	add	sp, sp, 16
> 	ubfx	x5, x2, 8, 8
> 	bfi	x0, x2, 0, 8
> 	ubfx	x4, x2, 16, 8
> 	lsr	w9, w2, 24
> 	bfi	x0, x5, 8, 8
> 	ubfx	x7, x2, 32, 8
> 	ubfx	x5, x2, 40, 8
> 	ubfx	x8, x3, 8, 8
> 	bfi	x0, x4, 16, 8
> 	bfi	x1, x3, 0, 8
> 	ubfx	x4, x2, 48, 8
> 	ubfx	x6, x3, 16, 8
> 	bfi	x0, x9, 24, 8
> 	bfi	x1, x8, 8, 8
> 	lsr	x2, x2, 56
> 	lsr	w3, w3, 24
> 	bfi	x0, x7, 32, 8
> 	bfi	x1, x6, 16, 8
> 	bfi	x0, x5, 40, 8
> 	bfi	x1, x3, 24, 8
> 	bfi	x0, x4, 48, 8
> 	bfi	x0, x2, 56, 8
>
> to load a struct of 12 1-byte elements.

Nice!

[...]

> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index 4f769a40a4e9de83cb5aacfd3ff58301c2feeb78..8906d9a9445ed36f43302708d1f6212bcf017bdc 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -13498,6 +13498,41 @@ aarch64_expand_movmem (rtx *operands)
>    base = copy_to_mode_reg (Pmode, XEXP (src, 0));
>    src = adjust_automodify_address (src, VOIDmode, base, 0);
>  
> +  /* Optimize routines for MEM to REG copies.  */
> +  if (n < 8 && !REG_P (XEXP (operands[0], 0)))

This seems to be checking that the address of the original destination
memory isn't a plain base register.  Why's it important to reject that
case but allow e.g. base+offset?

> +   {
> +     unsigned int max_align = UINTVAL (operands[2]);
> +     max_align = n < max_align ? max_align : n;

Might be misunderstanding, but isn't max_align always equal to n here,
since n was set by:

  n = UINTVAL (operands[2]);

Indentation of the enclosing { ... } is slightly off.

> +     machine_mode mov_mode, dest_mode
> +       = smallest_mode_for_size (max_align * BITS_PER_UNIT, MODE_INT);
> +     rtx result = gen_reg_rtx (dest_mode);
> +     emit_insn (gen_move_insn (result, GEN_INT (0)));
> +
> +     unsigned int shift_cnt = 0;
> +     for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
> +       {
> +	 int nearest = 0;
> +	 /* Find the mode to use, but limit the max to TI mode.  */
> +	 for (unsigned max = 1; max <= (n - shift_cnt) && max <= 16; max *= 2)
> +	      nearest = max;

In the if statement above, you required n < 8, so can max ever be > 16 here?

> +	  mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT);
> +	  rtx reg = gen_reg_rtx (mov_mode);
> +
> +	  src = adjust_address (src, mov_mode, 0);
> +	  emit_insn (gen_move_insn (reg, src));
> +	  src = aarch64_progress_pointer (src);
> +
> +	  reg = gen_rtx_ASHIFT (dest_mode, reg,
> +				GEN_INT (shift_cnt * BITS_PER_UNIT));

This seems to be mixing modes: reg has mode "mov_mode" but the result has
mode "dest_mode".  That isn't well-formed: the mode of a shift result needs
to be the same as the mode of the operand.  I think the load would need
to be a zero-extend of "src" into "reg", with "reg" having mode "dest_mode".
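
For example, something along these lines (completely untested, just to
illustrate the shape I mean):

	  src = adjust_address (src, mov_mode, 0);
	  rtx chunk = gen_reg_rtx (mov_mode);
	  emit_insn (gen_move_insn (chunk, src));

	  /* Zero-extend the chunk into DEST_MODE up front, so that the
	     shift and IOR operands all share a mode.  When mov_mode ==
	     dest_mode this is a plain move.  */
	  rtx wide = gen_reg_rtx (dest_mode);
	  convert_move (wide, chunk, 1);

	  wide = expand_shift (LSHIFT_EXPR, dest_mode, wide,
			       shift_cnt * BITS_PER_UNIT, NULL_RTX, 1);
	  result = expand_binop (dest_mode, ior_optab, wide, result,
				 NULL_RTX, 1, OPTAB_LIB_WIDEN);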

> +	  result = gen_rtx_IOR (dest_mode, reg, result);
> +      }
> +
> +    dst = adjust_address (dst, dest_mode, 0);
> +    emit_insn (gen_move_insn (dst, result));

dest_mode was chosen by smallest_mode_for_size, so can be bigger than n.
Doesn't that mean that we'll write beyond the end of the copy region
when n is an awkward number?
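(For example, with n == 3, dest_mode would be SImode, so the final store
would write four bytes into a three-byte region.)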

> diff --git a/gcc/expr.c b/gcc/expr.c
> index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644
> --- a/gcc/expr.c
> +++ b/gcc/expr.c
> @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
>  
>    n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
>    dst_words = XALLOCAVEC (rtx, n_regs);
> -  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> +  bitsize = BITS_PER_WORD;
> +  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src))))
> +    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);

I think this ought to be testing word_mode instead of BLKmode.
(Testing BLKmode doesn't really make sense in general, because the
mode doesn't have a meaningful alignment.)
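
I.e. something like (untested):

	  bitsize = BITS_PER_WORD;
	  if (SLOW_UNALIGNED_ACCESS (word_mode, TYPE_ALIGN (TREE_TYPE (src))))
	    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);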

Thanks,
Richard
Tamar Christina June 9, 2017, 9:04 a.m. UTC | #2
> > +  /* Optimize routines for MEM to REG copies.  */
> > +  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
> 
> This seems to be checking that the address of the original destination
> memory isn't a plain base register.  Why's it important to reject that case but
> allow e.g. base+offset?
> 

In the case of e.g.

void Fun3(struct struct3 foo3)
{
  L3 = foo3;
}


Tamar Christina June 9, 2017, 3:51 p.m. UTC | #3
> > +  /* Optimize routines for MEM to REG copies.  */
> > +  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
> 
> This seems to be checking that the address of the original destination
> memory isn't a plain base register.  Why's it important to reject that case but
> allow e.g. base+offset?

This function is (as far as I could tell) only called with two types of
destination: a location on the stack or a plain register.

When the destination is a register such as with

void Fun3(struct struct3 foo3)
{
  L3 = foo3;
}

You run into the issue you pointed out below, where we might write too much.
Ideally the constraint I would like is to check whether the destination is
either a fresh register (containing no prior data) or whether the structure
was padded.

I couldn't figure out how to do this, and the gains over the existing code
for this case were quite small, so I just disallowed it and left it to the
existing code, which isn't so bad: only one extra instruction.

> 
> > +   {
> > +     unsigned int max_align = UINTVAL (operands[2]);
> > +     max_align = n < max_align ? max_align : n;
> 
> Might be misunderstanding, but isn't max_align always equal to n here, since
> n was set by:

Correct; a previous version of this patch allowed n < 16, and these lines
were left over from the cleanup.  I'll correct it.

> 
> > +	  result = gen_rtx_IOR (dest_mode, reg, result);
> > +      }
> > +
> > +    dst = adjust_address (dst, dest_mode, 0);
> > +    emit_insn (gen_move_insn (dst, result));
> 
> dest_mode was chosen by smallest_mode_for_size, so can be bigger than n.
> Doesn't that mean that we'll write beyond the end of the copy region when
> n is an awkward number?

Yes, see my answer above.  For the other case, when we write to a location
on the stack, this is fine due to the alignment.

> 
> > diff --git a/gcc/expr.c b/gcc/expr.c
> > index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644
> > --- a/gcc/expr.c
> > +++ b/gcc/expr.c
> > @@ -2743,7 +2743,9 @@ copy_blkmode_to_reg (machine_mode mode, tree src)
> >
> >    n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
> >    dst_words = XALLOCAVEC (rtx, n_regs);
> > -  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> > +  bitsize = BITS_PER_WORD;
> > +  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src))))
> > +    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> 
> I think this ought to be testing word_mode instead of BLKmode.
> (Testing BLKmode doesn't really make sense in general, because the mode
> doesn't have a meaningful alignment.)

Ah, yes that makes sense. I'll update the patch.

The new patch is validating and I will submit it soon.

> 
> Thanks,
> Richard

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 4f769a40a4e9de83cb5aacfd3ff58301c2feeb78..8906d9a9445ed36f43302708d1f6212bcf017bdc 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -13498,6 +13498,41 @@  aarch64_expand_movmem (rtx *operands)
   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
   src = adjust_automodify_address (src, VOIDmode, base, 0);
 
+  /* Optimize routines for MEM to REG copies.  */
+  if (n < 8 && !REG_P (XEXP (operands[0], 0)))
+   {
+     unsigned int max_align = UINTVAL (operands[2]);
+     max_align = n < max_align ? max_align : n;
+     machine_mode mov_mode, dest_mode
+       = smallest_mode_for_size (max_align * BITS_PER_UNIT, MODE_INT);
+     rtx result = gen_reg_rtx (dest_mode);
+     emit_insn (gen_move_insn (result, GEN_INT (0)));
+
+     unsigned int shift_cnt = 0;
+     for (; n > shift_cnt; shift_cnt += GET_MODE_SIZE (mov_mode))
+       {
+	 int nearest = 0;
+	 /* Find the mode to use, but limit the max to TI mode.  */
+	 for (unsigned max = 1; max <= (n - shift_cnt) && max <= 16; max *= 2)
+	      nearest = max;
+
+	  mov_mode = smallest_mode_for_size (nearest * BITS_PER_UNIT, MODE_INT);
+	  rtx reg = gen_reg_rtx (mov_mode);
+
+	  src = adjust_address (src, mov_mode, 0);
+	  emit_insn (gen_move_insn (reg, src));
+	  src = aarch64_progress_pointer (src);
+
+	  reg = gen_rtx_ASHIFT (dest_mode, reg,
+				GEN_INT (shift_cnt * BITS_PER_UNIT));
+	  result = gen_rtx_IOR (dest_mode, reg, result);
+      }
+
+    dst = adjust_address (dst, dest_mode, 0);
+    emit_insn (gen_move_insn (dst, result));
+    return true;
+  }
+
   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
      1-byte chunk.  */
   if (n < 4)
diff --git a/gcc/expr.c b/gcc/expr.c
index 91d7ea217229fac62380b5d4b646961bf7c836c1..b1df4651e7942346007cda1cce8ee5a19297ab16 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -2743,7 +2743,9 @@  copy_blkmode_to_reg (machine_mode mode, tree src)
 
   n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
   dst_words = XALLOCAVEC (rtx, n_regs);
-  bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
+  bitsize = BITS_PER_WORD;
+  if (SLOW_UNALIGNED_ACCESS (BLKmode, TYPE_ALIGN (TREE_TYPE (src))))
+    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
 
   /* Copy the structure BITSIZE bits at a time.  */
   for (bitpos = 0, xbitpos = padding_correction;