[mid-end] Allow larger copies when not slow_unaligned_access and no padding.

Message ID 20180723170121.GA8189@arm.com
State New
Series [mid-end] Allow larger copies when not slow_unaligned_access and no padding.

Commit Message

Tamar Christina July 23, 2018, 5:01 p.m. UTC
Hi All,

This allows copy_blkmode_to_reg to perform larger copies when it is safe to do so,
by calculating the bitsize per iteration and doing the largest copy allowed that
does not read more bits than remain to be copied.

Strictly speaking, this copying is only done if:

  1. the target supports fast unaligned access
  2. no padding is being used.

This should avoid the issues of the first patch (PR85123) but still work for targets
where doing so is safe.
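
In rough terms, the per-iteration mode selection looks like this (a simplified
sketch of the new expr.c hunk, ignoring the padding and alignment guards; the
exact code is in the patch at the bottom):

      opt_scalar_int_mode mode_iter;
      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
        {
          /* Take the widest integer mode that still fits in a word and
             within the bits that remain to be copied.  */
          unsigned int msize = GET_MODE_BITSIZE (mode_iter.require ());
          if (msize <= (bytes * BITS_PER_UNIT) - bitpos
              && msize <= BITS_PER_WORD)
            bitsize = msize;
          else
            break;
        }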

Original patch https://gcc.gnu.org/ml/gcc-patches/2017-11/msg01088.html
Previous respin https://gcc.gnu.org/ml/gcc-patches/2018-04/msg00239.html


For the copy of a 3-byte structure this now produces:

fun3:
	adrp	x1, .LANCHOR0
	add	x1, x1, :lo12:.LANCHOR0
	mov	x0, 0
	sub	sp, sp, #16
	ldrh	w2, [x1, 16]
	ldrb	w1, [x1, 18]
	add	sp, sp, 16
	bfi	x0, x2, 0, 16
	bfi	x0, x1, 16, 8
	ret

whereas before it was producing

fun3:
	adrp	x0, .LANCHOR0
	add	x2, x0, :lo12:.LANCHOR0
	sub	sp, sp, #16
	ldrh	w1, [x0, #:lo12:.LANCHOR0]
	ldrb	w0, [x2, 2]
	strh	w1, [sp, 8]
	strb	w0, [sp, 10]
	ldr	w0, [sp, 8]
	add	sp, sp, 16
	ret
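
For reference, both dumps correspond to something like the following (a
hypothetical reconstruction of the testcase, not the exact file in the patch):
a 3-byte BLKmode struct loaded from a global and returned by value.

struct S3 { char a, b, c; };	/* 3 bytes, BLKmode.  */
struct S3 g3;

struct S3
fun3 (void)
{
  return g3;	/* Copied into the return register (x0 above).  */
}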

Cross-compiled and regtested on
  aarch64_be-none-elf
  armeb-none-eabi
with no issues.

Bootstrapped and regtested on
 aarch64-none-linux-gnu
 x86_64-pc-linux-gnu
 powerpc64-unknown-linux-gnu
 arm-none-linux-gnueabihf

and found no issues.

OK for trunk?

Thanks,
Tamar

gcc/
2018-07-23  Tamar Christina  <tamar.christina@arm.com>

	* expr.c (copy_blkmode_to_reg): Perform larger copies when safe.

--

Comments

Richard Biener July 23, 2018, 5:46 p.m. UTC | #1
On July 23, 2018 7:01:23 PM GMT+02:00, Tamar Christina <tamar.christina@arm.com> wrote:
>Hi All,
>
>This allows copy_blkmode_to_reg to perform larger copies when it is
>safe to do so by calculating
>the bitsize per iteration doing the maximum copy allowed that does not
>read more
>than the amount of bits left to copy.
>
>Strictly speaking, this copying is only done if:
>
>  1. the target supports fast unaligned access
>  2. no padding is being used.
>
>This should avoid the issues of the first patch (PR85123) but still
>work for targets that are safe
>to do so.
>
>Original patch https://gcc.gnu.org/ml/gcc-patches/2017-11/msg01088.html
>Previous respin
>https://gcc.gnu.org/ml/gcc-patches/2018-04/msg00239.html
>
>
>This produces for the copying of a 3 byte structure:
>
>fun3:
>	adrp	x1, .LANCHOR0
>	add	x1, x1, :lo12:.LANCHOR0
>	mov	x0, 0
>	sub	sp, sp, #16
>	ldrh	w2, [x1, 16]
>	ldrb	w1, [x1, 18]
>	add	sp, sp, 16
>	bfi	x0, x2, 0, 16
>	bfi	x0, x1, 16, 8
>	ret
>
>whereas before it was producing
>
>fun3:
>	adrp	x0, .LANCHOR0
>	add	x2, x0, :lo12:.LANCHOR0
>	sub	sp, sp, #16
>	ldrh	w1, [x0, #:lo12:.LANCHOR0]
>	ldrb	w0, [x2, 2]
>	strh	w1, [sp, 8]
>	strb	w0, [sp, 10]
>	ldr	w0, [sp, 8]
>	add	sp, sp, 16
>	ret
>
>Cross compiled and regtested on
>  aarch64_be-none-elf
>  armeb-none-eabi
>and no issues
>
>Boostrapped and regtested
> aarch64-none-linux-gnu
> x86_64-pc-linux-gnu
> powerpc64-unknown-linux-gnu
> arm-none-linux-gnueabihf
>
>and found no issues.
>
>OK for trunk?

How does this affect store-to-load forwarding when the source is initialized piecewise? IMHO we should avoid larger loads but generate larger stores when possible. 

How do non-x86 architectures behave with respect to STLF? 

Richard. 

>Thanks,
>Tamar
>
>gcc/
>2018-07-23  Tamar Christina  <tamar.christina@arm.com>
>
>	* expr.c (copy_blkmode_to_reg): Perform larger copies when safe.
Tamar Christina July 24, 2018, 4:33 p.m. UTC | #2
Hi Richard,

Thanks for the review!

The 07/23/2018 18:46, Richard Biener wrote:
> On July 23, 2018 7:01:23 PM GMT+02:00, Tamar Christina <tamar.christina@arm.com> wrote:
> >Hi All,
> >
> >This allows copy_blkmode_to_reg to perform larger copies when it is
> >safe to do so by calculating
> >the bitsize per iteration doing the maximum copy allowed that does not
> >read more
> >than the amount of bits left to copy.
> >
> >Strictly speaking, this copying is only done if:
> >
> >  1. the target supports fast unaligned access
> >  2. no padding is being used.
> >
> >This should avoid the issues of the first patch (PR85123) but still
> >work for targets that are safe
> >to do so.
> >
> >Original patch https://gcc.gnu.org/ml/gcc-patches/2017-11/msg01088.html
> >Previous respin
> >https://gcc.gnu.org/ml/gcc-patches/2018-04/msg00239.html
> >
> >
> >This produces for the copying of a 3 byte structure:
> >
> >fun3:
> >	adrp	x1, .LANCHOR0
> >	add	x1, x1, :lo12:.LANCHOR0
> >	mov	x0, 0
> >	sub	sp, sp, #16
> >	ldrh	w2, [x1, 16]
> >	ldrb	w1, [x1, 18]
> >	add	sp, sp, 16
> >	bfi	x0, x2, 0, 16
> >	bfi	x0, x1, 16, 8
> >	ret
> >
> >whereas before it was producing
> >
> >fun3:
> >	adrp	x0, .LANCHOR0
> >	add	x2, x0, :lo12:.LANCHOR0
> >	sub	sp, sp, #16
> >	ldrh	w1, [x0, #:lo12:.LANCHOR0]
> >	ldrb	w0, [x2, 2]
> >	strh	w1, [sp, 8]
> >	strb	w0, [sp, 10]
> >	ldr	w0, [sp, 8]
> >	add	sp, sp, 16
> >	ret
> >
> >Cross compiled and regtested on
> >  aarch64_be-none-elf
> >  armeb-none-eabi
> >and no issues
> >
> >Boostrapped and regtested
> > aarch64-none-linux-gnu
> > x86_64-pc-linux-gnu
> > powerpc64-unknown-linux-gnu
> > arm-none-linux-gnueabihf
> >
> >and found no issues.
> >
> >OK for trunk?
> 
> How does this affect store-to-load forwarding when the source is initialized piecewise? IMHO we should avoid larger loads but generate larger stores when possible. 
> 
> How do non-x86 architectures behave with respect to STLF? 
>

I should have made it more explicit in my cover letter, but this only covers reg-to-reg copies.
So store-to-load forwarding shouldn't really come into play here, unless I'm missing something.

The example in my patch shows that the loads from mem are mostly unaffected.

For x86 the change is also quite significant; e.g. for a 5-byte struct load it used to generate

fun5:
	movl	foo5(%rip), %eax
	movl	%eax, %edi
	movzbl	%al, %edx
	movzbl	%ah, %eax
	movb	%al, %dh
	movzbl	foo5+2(%rip), %eax
	shrl	$24, %edi
	salq	$16, %rax
	movq	%rax, %rsi
	movzbl	%dil, %eax
	salq	$24, %rax
	movq	%rax, %rcx
	movq	%rdx, %rax
	movzbl	foo5+4(%rip), %edx
	orq	%rsi, %rax
	salq	$32, %rdx
	orq	%rcx, %rax
	orq	%rdx, %rax
	ret

instead of

fun5:
        movzbl  foo5+4(%rip), %eax
        salq    $32, %rax
        movq    %rax, %rdx
        movl    foo5(%rip), %eax
        orq     %rdx, %rax
        ret

so the loads themselves are unaffected.
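
(For reference, fun5 corresponds to something like the following, a
hypothetical reconstruction rather than the exact testcase: a 5-byte BLKmode
struct loaded from the global foo5 and returned by value.)

struct S5 { char c[5]; };
struct S5 foo5;

struct S5
fun5 (void)
{
  return foo5;
}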

Thanks,
Tamar
 
> Richard. 
> 
> >Thanks,
> >Tamar
> >
> >gcc/
> >2018-07-23  Tamar Christina  <tamar.christina@arm.com>
> >
> >	* expr.c (copy_blkmode_to_reg): Perform larger copies when safe.
> 

--
Tamar Christina July 31, 2018, 9:47 a.m. UTC | #3
Ping 😊

> -----Original Message-----
> From: gcc-patches-owner@gcc.gnu.org <gcc-patches-owner@gcc.gnu.org>
> On Behalf Of Tamar Christina
> Sent: Tuesday, July 24, 2018 17:34
> To: Richard Biener <rguenther@suse.de>
> Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; law@redhat.com;
> ian@airs.com; amodra@gmail.com; bergner@vnet.ibm.com
> Subject: Re: [PATCH][GCC][mid-end] Allow larger copies when not
> slow_unaligned_access and no padding.
> 
> Hi Richard,
> 
> Thanks for the review!
> 
> The 07/23/2018 18:46, Richard Biener wrote:
> > On July 23, 2018 7:01:23 PM GMT+02:00, Tamar Christina
> <tamar.christina@arm.com> wrote:
> > >Hi All,
> > >
> > >This allows copy_blkmode_to_reg to perform larger copies when it is
> > >safe to do so by calculating the bitsize per iteration doing the
> > >maximum copy allowed that does not read more than the amount of bits
> > >left to copy.
> > >
> > >Strictly speaking, this copying is only done if:
> > >
> > >  1. the target supports fast unaligned access  2. no padding is
> > > being used.
> > >
> > >This should avoid the issues of the first patch (PR85123) but still
> > >work for targets that are safe to do so.
> > >
> > >Original patch
> > >https://gcc.gnu.org/ml/gcc-patches/2017-11/msg01088.html
> > >Previous respin
> > >https://gcc.gnu.org/ml/gcc-patches/2018-04/msg00239.html
> > >
> > >
> > >This produces for the copying of a 3 byte structure:
> > >
> > >fun3:
> > >	adrp	x1, .LANCHOR0
> > >	add	x1, x1, :lo12:.LANCHOR0
> > >	mov	x0, 0
> > >	sub	sp, sp, #16
> > >	ldrh	w2, [x1, 16]
> > >	ldrb	w1, [x1, 18]
> > >	add	sp, sp, 16
> > >	bfi	x0, x2, 0, 16
> > >	bfi	x0, x1, 16, 8
> > >	ret
> > >
> > >whereas before it was producing
> > >
> > >fun3:
> > >	adrp	x0, .LANCHOR0
> > >	add	x2, x0, :lo12:.LANCHOR0
> > >	sub	sp, sp, #16
> > >	ldrh	w1, [x0, #:lo12:.LANCHOR0]
> > >	ldrb	w0, [x2, 2]
> > >	strh	w1, [sp, 8]
> > >	strb	w0, [sp, 10]
> > >	ldr	w0, [sp, 8]
> > >	add	sp, sp, 16
> > >	ret
> > >
> > >Cross compiled and regtested on
> > >  aarch64_be-none-elf
> > >  armeb-none-eabi
> > >and no issues
> > >
> > >Boostrapped and regtested
> > > aarch64-none-linux-gnu
> > > x86_64-pc-linux-gnu
> > > powerpc64-unknown-linux-gnu
> > > arm-none-linux-gnueabihf
> > >
> > >and found no issues.
> > >
> > >OK for trunk?
> >
> > How does this affect store-to-load forwarding when the source is initialized
> piecewise? IMHO we should avoid larger loads but generate larger stores
> when possible.
> >
> > How do non-x86 architectures behave with respect to STLF?
> >
> 
> I should have made it more explicit in my cover letter, but this only covers reg
> to reg copies.
> So the store-t-load forwarding shouldn't really come to play here, unless I'm
> missing something
> 
> The example in my patch shows that the loads from mem are mostly
> unaffected.
> 
> For x86 the change is also quite significant, e.g for a 5 byte struct load it used
> to generate
> 
> fun5:
> 	movl	foo5(%rip), %eax
> 	movl	%eax, %edi
> 	movzbl	%al, %edx
> 	movzbl	%ah, %eax
> 	movb	%al, %dh
> 	movzbl	foo5+2(%rip), %eax
> 	shrl	$24, %edi
> 	salq	$16, %rax
> 	movq	%rax, %rsi
> 	movzbl	%dil, %eax
> 	salq	$24, %rax
> 	movq	%rax, %rcx
> 	movq	%rdx, %rax
> 	movzbl	foo5+4(%rip), %edx
> 	orq	%rsi, %rax
> 	salq	$32, %rdx
> 	orq	%rcx, %rax
> 	orq	%rdx, %rax
> 	ret
> 
> instead of
> 
> fun5:
>         movzbl  foo5+4(%rip), %eax
>         salq    $32, %rax
>         movq    %rax, %rdx
>         movl    foo5(%rip), %eax
>         orq     %rdx, %rax
>         ret
> 
> so the loads themselves are unaffected.
> 
> Thanks,
> Tamar
> 
> > Richard.
> >
> > >Thanks,
> > >Tamar
> > >
> > >gcc/
> > >2018-07-23  Tamar Christina  <tamar.christina@arm.com>
> > >
> > >	* expr.c (copy_blkmode_to_reg): Perform larger copies when safe.
> >
> 
> --
Richard Biener July 31, 2018, 10:21 a.m. UTC | #4
On Tue, 31 Jul 2018, Tamar Christina wrote:

> Ping 😊
> 
> > -----Original Message-----
> > From: gcc-patches-owner@gcc.gnu.org <gcc-patches-owner@gcc.gnu.org>
> > On Behalf Of Tamar Christina
> > Sent: Tuesday, July 24, 2018 17:34
> > To: Richard Biener <rguenther@suse.de>
> > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; law@redhat.com;
> > ian@airs.com; amodra@gmail.com; bergner@vnet.ibm.com
> > Subject: Re: [PATCH][GCC][mid-end] Allow larger copies when not
> > slow_unaligned_access and no padding.
> > 
> > Hi Richard,
> > 
> > Thanks for the review!
> > 
> > The 07/23/2018 18:46, Richard Biener wrote:
> > > On July 23, 2018 7:01:23 PM GMT+02:00, Tamar Christina
> > <tamar.christina@arm.com> wrote:
> > > >Hi All,
> > > >
> > > >This allows copy_blkmode_to_reg to perform larger copies when it is
> > > >safe to do so by calculating the bitsize per iteration doing the
> > > >maximum copy allowed that does not read more than the amount of bits
> > > >left to copy.
> > > >
> > > >Strictly speaking, this copying is only done if:
> > > >
> > > >  1. the target supports fast unaligned access  2. no padding is
> > > > being used.
> > > >
> > > >This should avoid the issues of the first patch (PR85123) but still
> > > >work for targets that are safe to do so.
> > > >
> > > >Original patch
> > > >https://gcc.gnu.org/ml/gcc-patches/2017-11/msg01088.html
> > > >Previous respin
> > > >https://gcc.gnu.org/ml/gcc-patches/2018-04/msg00239.html
> > > >
> > > >
> > > >This produces for the copying of a 3 byte structure:
> > > >
> > > >fun3:
> > > >	adrp	x1, .LANCHOR0
> > > >	add	x1, x1, :lo12:.LANCHOR0
> > > >	mov	x0, 0
> > > >	sub	sp, sp, #16
> > > >	ldrh	w2, [x1, 16]
> > > >	ldrb	w1, [x1, 18]
> > > >	add	sp, sp, 16
> > > >	bfi	x0, x2, 0, 16
> > > >	bfi	x0, x1, 16, 8
> > > >	ret
> > > >
> > > >whereas before it was producing
> > > >
> > > >fun3:
> > > >	adrp	x0, .LANCHOR0
> > > >	add	x2, x0, :lo12:.LANCHOR0
> > > >	sub	sp, sp, #16
> > > >	ldrh	w1, [x0, #:lo12:.LANCHOR0]
> > > >	ldrb	w0, [x2, 2]
> > > >	strh	w1, [sp, 8]
> > > >	strb	w0, [sp, 10]
> > > >	ldr	w0, [sp, 8]
> > > >	add	sp, sp, 16
> > > >	ret
> > > >
> > > >Cross compiled and regtested on
> > > >  aarch64_be-none-elf
> > > >  armeb-none-eabi
> > > >and no issues
> > > >
> > > >Boostrapped and regtested
> > > > aarch64-none-linux-gnu
> > > > x86_64-pc-linux-gnu
> > > > powerpc64-unknown-linux-gnu
> > > > arm-none-linux-gnueabihf
> > > >
> > > >and found no issues.
> > > >
> > > >OK for trunk?
> > >
> > > How does this affect store-to-load forwarding when the source is initialized
> > piecewise? IMHO we should avoid larger loads but generate larger stores
> > when possible.
> > >
> > > How do non-x86 architectures behave with respect to STLF?
> > >
> > 
> > I should have made it more explicit in my cover letter, but this only covers reg
> > to reg copies.
> > So the store-t-load forwarding shouldn't really come to play here, unless I'm
> > missing something
> > 
> > The example in my patch shows that the loads from mem are mostly
> > unaffected.
> > 
> > For x86 the change is also quite significant, e.g for a 5 byte struct load it used
> > to generate
> > 
> > fun5:
> > 	movl	foo5(%rip), %eax
> > 	movl	%eax, %edi
> > 	movzbl	%al, %edx
> > 	movzbl	%ah, %eax
> > 	movb	%al, %dh
> > 	movzbl	foo5+2(%rip), %eax
> > 	shrl	$24, %edi
> > 	salq	$16, %rax
> > 	movq	%rax, %rsi
> > 	movzbl	%dil, %eax
> > 	salq	$24, %rax
> > 	movq	%rax, %rcx
> > 	movq	%rdx, %rax
> > 	movzbl	foo5+4(%rip), %edx
> > 	orq	%rsi, %rax
> > 	salq	$32, %rdx
> > 	orq	%rcx, %rax
> > 	orq	%rdx, %rax
> > 	ret
> > 
> > instead of
> > 
> > fun5:
> >         movzbl  foo5+4(%rip), %eax
> >         salq    $32, %rax
> >         movq    %rax, %rdx
> >         movl    foo5(%rip), %eax
> >         orq     %rdx, %rax
> >         ret
> > 
> > so the loads themselves are unaffected.

I see.  A few things:

   dst_words = XALLOCAVEC (rtx, n_regs);
+
+  slow_unaligned_access
+    = targetm.slow_unaligned_access (word_mode, TYPE_ALIGN (TREE_TYPE (src)));
+
   bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);

please avoid the extra vertical space.


+
+      /* Find the largest integer mode that can be used to copy all or as
+        many bits as possible of the struct

likewise.

+      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
+       if (padding_correction == 0
+           && !slow_unaligned_access

These conditions are invariant so please hoist them out of the
loop.

+           && GET_MODE_BITSIZE (mode_iter.require ())
+                           <= ((bytes * BITS_PER_UNIT) - bitpos)
+           && GET_MODE_BITSIZE (mode_iter.require ()) <= BITS_PER_WORD)
+         bitsize = GET_MODE_BITSIZE (mode_iter.require ());
+       else
+         break;

The slow_unaligned_access target hook is passed a mode and an
alignment but you are assuming that a narrower mode behaves
the same with possibly different alignment.
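
For reference, the hook prototype (paraphrasing the target.def entry) takes
both together:

  bool slow_unaligned_access (machine_mode mode, unsigned int align);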

I'm also not sure this loop will never compute something that is
smaller than the original conservative bitsize based on
TYPE_ALIGN of src, so maybe you could instead use something like

  min_mode = get_mode_for_size (bitsize);
...

  if (padding_correction == 0
      && !STRICT_ALIGNMENT)
    FOR_EACH_MODE_FROM (mode_iter, min_mode)
      {
        unsigned msize = GET_MODE_BITSIZE (mode_iter.require ());
        if (msize <= ((bytes * BITS_PER_UNIT) - bitpos)
            && msize <= BITS_PER_WORD)
          bitsize = msize;
        else
          break;
      }
...

instead?

Thanks,
Richard.
Tamar Christina July 31, 2018, 3:52 p.m. UTC | #5
Hi Richard,

The 07/31/2018 11:21, Richard Biener wrote:
> On Tue, 31 Jul 2018, Tamar Christina wrote:
> 
> > Ping 😊
> > 
> > > -----Original Message-----
> > > From: gcc-patches-owner@gcc.gnu.org <gcc-patches-owner@gcc.gnu.org>
> > > On Behalf Of Tamar Christina
> > > Sent: Tuesday, July 24, 2018 17:34
> > > To: Richard Biener <rguenther@suse.de>
> > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; law@redhat.com;
> > > ian@airs.com; amodra@gmail.com; bergner@vnet.ibm.com
> > > Subject: Re: [PATCH][GCC][mid-end] Allow larger copies when not
> > > slow_unaligned_access and no padding.
> > > 
> > > Hi Richard,
> > > 
> > > Thanks for the review!
> > > 
> > > The 07/23/2018 18:46, Richard Biener wrote:
> > > > On July 23, 2018 7:01:23 PM GMT+02:00, Tamar Christina
> > > <tamar.christina@arm.com> wrote:
> > > > >Hi All,
> > > > >
> > > > >This allows copy_blkmode_to_reg to perform larger copies when it is
> > > > >safe to do so by calculating the bitsize per iteration doing the
> > > > >maximum copy allowed that does not read more than the amount of bits
> > > > >left to copy.
> > > > >
> > > > >Strictly speaking, this copying is only done if:
> > > > >
> > > > >  1. the target supports fast unaligned access  2. no padding is
> > > > > being used.
> > > > >
> > > > >This should avoid the issues of the first patch (PR85123) but still
> > > > >work for targets that are safe to do so.
> > > > >
> > > > >Original patch
> > > > >https://gcc.gnu.org/ml/gcc-patches/2017-11/msg01088.html
> > > > >Previous respin
> > > > >https://gcc.gnu.org/ml/gcc-patches/2018-04/msg00239.html
> > > > >
> > > > >
> > > > >This produces for the copying of a 3 byte structure:
> > > > >
> > > > >fun3:
> > > > >	adrp	x1, .LANCHOR0
> > > > >	add	x1, x1, :lo12:.LANCHOR0
> > > > >	mov	x0, 0
> > > > >	sub	sp, sp, #16
> > > > >	ldrh	w2, [x1, 16]
> > > > >	ldrb	w1, [x1, 18]
> > > > >	add	sp, sp, 16
> > > > >	bfi	x0, x2, 0, 16
> > > > >	bfi	x0, x1, 16, 8
> > > > >	ret
> > > > >
> > > > >whereas before it was producing
> > > > >
> > > > >fun3:
> > > > >	adrp	x0, .LANCHOR0
> > > > >	add	x2, x0, :lo12:.LANCHOR0
> > > > >	sub	sp, sp, #16
> > > > >	ldrh	w1, [x0, #:lo12:.LANCHOR0]
> > > > >	ldrb	w0, [x2, 2]
> > > > >	strh	w1, [sp, 8]
> > > > >	strb	w0, [sp, 10]
> > > > >	ldr	w0, [sp, 8]
> > > > >	add	sp, sp, 16
> > > > >	ret
> > > > >
> > > > >Cross compiled and regtested on
> > > > >  aarch64_be-none-elf
> > > > >  armeb-none-eabi
> > > > >and no issues
> > > > >
> > > > >Boostrapped and regtested
> > > > > aarch64-none-linux-gnu
> > > > > x86_64-pc-linux-gnu
> > > > > powerpc64-unknown-linux-gnu
> > > > > arm-none-linux-gnueabihf
> > > > >
> > > > >and found no issues.
> > > > >
> > > > >OK for trunk?
> > > >
> > > > How does this affect store-to-load forwarding when the source is initialized
> > > piecewise? IMHO we should avoid larger loads but generate larger stores
> > > when possible.
> > > >
> > > > How do non-x86 architectures behave with respect to STLF?
> > > >
> > > 
> > > I should have made it more explicit in my cover letter, but this only covers reg
> > > to reg copies.
> > > So the store-t-load forwarding shouldn't really come to play here, unless I'm
> > > missing something
> > > 
> > > The example in my patch shows that the loads from mem are mostly
> > > unaffected.
> > > 
> > > For x86 the change is also quite significant, e.g for a 5 byte struct load it used
> > > to generate
> > > 
> > > fun5:
> > > 	movl	foo5(%rip), %eax
> > > 	movl	%eax, %edi
> > > 	movzbl	%al, %edx
> > > 	movzbl	%ah, %eax
> > > 	movb	%al, %dh
> > > 	movzbl	foo5+2(%rip), %eax
> > > 	shrl	$24, %edi
> > > 	salq	$16, %rax
> > > 	movq	%rax, %rsi
> > > 	movzbl	%dil, %eax
> > > 	salq	$24, %rax
> > > 	movq	%rax, %rcx
> > > 	movq	%rdx, %rax
> > > 	movzbl	foo5+4(%rip), %edx
> > > 	orq	%rsi, %rax
> > > 	salq	$32, %rdx
> > > 	orq	%rcx, %rax
> > > 	orq	%rdx, %rax
> > > 	ret
> > > 
> > > instead of
> > > 
> > > fun5:
> > >         movzbl  foo5+4(%rip), %eax
> > >         salq    $32, %rax
> > >         movq    %rax, %rdx
> > >         movl    foo5(%rip), %eax
> > >         orq     %rdx, %rax
> > >         ret
> > > 
> > > so the loads themselves are unaffected.
> 
> I see.  Few things:
> 
>    dst_words = XALLOCAVEC (rtx, n_regs);
> +
> +  slow_unaligned_access
> +    = targetm.slow_unaligned_access (word_mode, TYPE_ALIGN (TREE_TYPE 
> (src)));
> +
>    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> 
> please avoid the extra vertical space.
> 
> 
> +
> +      /* Find the largest integer mode that can be used to copy all or as
> +        many bits as possible of the struct
> 
> likewise.

Done.

> 
> +      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> +       if (padding_correction == 0
> +           && !slow_unaligned_access
> 
> These conditions are invariant so please hoist them out of the
> loop.

Done.

> 
> +           && GET_MODE_BITSIZE (mode_iter.require ())
> +                           <= ((bytes * BITS_PER_UNIT) - bitpos)
> +           && GET_MODE_BITSIZE (mode_iter.require ()) <= BITS_PER_WORD)
> +         bitsize = GET_MODE_BITSIZE (mode_iter.require ());
> +       else
> +         break;
> 
> The slow_unaligned_access target hook is passed a mode and an
> alignment but you are assuming that a narrower mode behaves
> the same with possibly different alignment.
>

Ah, indeed, good point. Changing it to STRICT_ALIGNMENT would be a
better check. Done.
 
> I'm also not sure this loop will never compute to sth that is
> smaller than the original conservative bitsize based on
> TYPE_ALIGN of src, so maybe you could instead use sth like
> 
>   min_mode = get_mode_for_size (bitsize);
> ...
> 
>   if (padding_correction == 0
>       && !STRICT_ALIGNMENT)
>     FOR_EACH_MODE_FROM (mode_iter, min_mode)
>       {
>         unsigned msize = GET_MODE_BITSIZE (mode_iter.require ());
>         if (msize <= ((bytes * BITS_PER_UNIT) - bitpos)
>             && msize <= BITS_PER_WORD)
>           bitsize = msize;
>         else
>           break;
>       }
> ...
> 
> instead?

I've rewritten the loop as suggested.  It now doesn't pick a smaller mode
than the initial bitsize.

New patch attached, no change to changelog.

OK for trunk?

Thanks,
Tamar

gcc/
2018-07-31  Tamar Christina  <tamar.christina@arm.com>

	* expr.c (copy_blkmode_to_reg): Perform larger copies when safe.

> 
> Thanks,
> Richar.

--
diff --git a/gcc/expr.c b/gcc/expr.c
index f665e187ebbbc7874ec88e84ca47ed991491c3e5..c468b9419831b8baad3ab4230d8f02cb88a03880 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -2767,6 +2767,7 @@ copy_blkmode_to_reg (machine_mode mode_in, tree src)
   /* No current ABI uses variable-sized modes to pass a BLKmnode type.  */
   fixed_size_mode mode = as_a <fixed_size_mode> (mode_in);
   fixed_size_mode dst_mode;
+  opt_scalar_int_mode min_mode;
 
   gcc_assert (TYPE_MODE (TREE_TYPE (src)) == BLKmode);
 
@@ -2796,6 +2797,7 @@ copy_blkmode_to_reg (machine_mode mode_in, tree src)
   n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
   dst_words = XALLOCAVEC (rtx, n_regs);
   bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
+  min_mode = int_mode_for_mode (smallest_mode_for_size (bitsize, MODE_INT));
 
   /* Copy the structure BITSIZE bits at a time.  */
   for (bitpos = 0, xbitpos = padding_correction;
@@ -2816,6 +2818,25 @@ copy_blkmode_to_reg (machine_mode mode_in, tree src)
 	  emit_move_insn (dst_word, CONST0_RTX (word_mode));
 	}
 
+      /* Find the largest integer mode that can be used to copy all or as
+	 many bits as possible of the structure if the target supports larger
+	 copies.  There are too many corner cases here w.r.t to alignments on
+	 the read/writes.  So if there is any padding just use single byte
+	 operations.  */
+      opt_scalar_int_mode mode_iter;
+      if (padding_correction == 0 && !STRICT_ALIGNMENT && min_mode.exists ())
+	{
+	  FOR_EACH_MODE_FROM (mode_iter, min_mode.require ())
+	    {
+	      unsigned int msize = GET_MODE_BITSIZE (mode_iter.require ());
+	      if (msize <= ((bytes * BITS_PER_UNIT) - bitpos)
+		  && msize <= BITS_PER_WORD)
+		bitsize = msize;
+	      else
+		break;
+	    }
+	}
+
       /* We need a new source operand each time bitpos is on a word
 	 boundary.  */
       if (bitpos % BITS_PER_WORD == 0)
Richard Biener Aug. 1, 2018, 6:50 a.m. UTC | #6
On Tue, 31 Jul 2018, Tamar Christina wrote:

> Hi Richard,
> 
> The 07/31/2018 11:21, Richard Biener wrote:
> > On Tue, 31 Jul 2018, Tamar Christina wrote:
> > 
> > > Ping 😊
> > > 
> > > > -----Original Message-----
> > > > From: gcc-patches-owner@gcc.gnu.org <gcc-patches-owner@gcc.gnu.org>
> > > > On Behalf Of Tamar Christina
> > > > Sent: Tuesday, July 24, 2018 17:34
> > > > To: Richard Biener <rguenther@suse.de>
> > > > Cc: gcc-patches@gcc.gnu.org; nd <nd@arm.com>; law@redhat.com;
> > > > ian@airs.com; amodra@gmail.com; bergner@vnet.ibm.com
> > > > Subject: Re: [PATCH][GCC][mid-end] Allow larger copies when not
> > > > slow_unaligned_access and no padding.
> > > > 
> > > > Hi Richard,
> > > > 
> > > > Thanks for the review!
> > > > 
> > > > The 07/23/2018 18:46, Richard Biener wrote:
> > > > > On July 23, 2018 7:01:23 PM GMT+02:00, Tamar Christina
> > > > <tamar.christina@arm.com> wrote:
> > > > > >Hi All,
> > > > > >
> > > > > >This allows copy_blkmode_to_reg to perform larger copies when it is
> > > > > >safe to do so by calculating the bitsize per iteration doing the
> > > > > >maximum copy allowed that does not read more than the amount of bits
> > > > > >left to copy.
> > > > > >
> > > > > >Strictly speaking, this copying is only done if:
> > > > > >
> > > > > >  1. the target supports fast unaligned access  2. no padding is
> > > > > > being used.
> > > > > >
> > > > > >This should avoid the issues of the first patch (PR85123) but still
> > > > > >work for targets that are safe to do so.
> > > > > >
> > > > > >Original patch
> > > > > >https://gcc.gnu.org/ml/gcc-patches/2017-11/msg01088.html
> > > > > >Previous respin
> > > > > >https://gcc.gnu.org/ml/gcc-patches/2018-04/msg00239.html
> > > > > >
> > > > > >
> > > > > >This produces for the copying of a 3 byte structure:
> > > > > >
> > > > > >fun3:
> > > > > >	adrp	x1, .LANCHOR0
> > > > > >	add	x1, x1, :lo12:.LANCHOR0
> > > > > >	mov	x0, 0
> > > > > >	sub	sp, sp, #16
> > > > > >	ldrh	w2, [x1, 16]
> > > > > >	ldrb	w1, [x1, 18]
> > > > > >	add	sp, sp, 16
> > > > > >	bfi	x0, x2, 0, 16
> > > > > >	bfi	x0, x1, 16, 8
> > > > > >	ret
> > > > > >
> > > > > >whereas before it was producing
> > > > > >
> > > > > >fun3:
> > > > > >	adrp	x0, .LANCHOR0
> > > > > >	add	x2, x0, :lo12:.LANCHOR0
> > > > > >	sub	sp, sp, #16
> > > > > >	ldrh	w1, [x0, #:lo12:.LANCHOR0]
> > > > > >	ldrb	w0, [x2, 2]
> > > > > >	strh	w1, [sp, 8]
> > > > > >	strb	w0, [sp, 10]
> > > > > >	ldr	w0, [sp, 8]
> > > > > >	add	sp, sp, 16
> > > > > >	ret
> > > > > >
> > > > > >Cross compiled and regtested on
> > > > > >  aarch64_be-none-elf
> > > > > >  armeb-none-eabi
> > > > > >and no issues
> > > > > >
> > > > > >Boostrapped and regtested
> > > > > > aarch64-none-linux-gnu
> > > > > > x86_64-pc-linux-gnu
> > > > > > powerpc64-unknown-linux-gnu
> > > > > > arm-none-linux-gnueabihf
> > > > > >
> > > > > >and found no issues.
> > > > > >
> > > > > >OK for trunk?
> > > > >
> > > > > How does this affect store-to-load forwarding when the source is initialized
> > > > piecewise? IMHO we should avoid larger loads but generate larger stores
> > > > when possible.
> > > > >
> > > > > How do non-x86 architectures behave with respect to STLF?
> > > > >
> > > > 
> > > > I should have made it more explicit in my cover letter, but this only covers reg
> > > > to reg copies.
> > > > So the store-t-load forwarding shouldn't really come to play here, unless I'm
> > > > missing something
> > > > 
> > > > The example in my patch shows that the loads from mem are mostly
> > > > unaffected.
> > > > 
> > > > For x86 the change is also quite significant, e.g for a 5 byte struct load it used
> > > > to generate
> > > > 
> > > > fun5:
> > > > 	movl	foo5(%rip), %eax
> > > > 	movl	%eax, %edi
> > > > 	movzbl	%al, %edx
> > > > 	movzbl	%ah, %eax
> > > > 	movb	%al, %dh
> > > > 	movzbl	foo5+2(%rip), %eax
> > > > 	shrl	$24, %edi
> > > > 	salq	$16, %rax
> > > > 	movq	%rax, %rsi
> > > > 	movzbl	%dil, %eax
> > > > 	salq	$24, %rax
> > > > 	movq	%rax, %rcx
> > > > 	movq	%rdx, %rax
> > > > 	movzbl	foo5+4(%rip), %edx
> > > > 	orq	%rsi, %rax
> > > > 	salq	$32, %rdx
> > > > 	orq	%rcx, %rax
> > > > 	orq	%rdx, %rax
> > > > 	ret
> > > > 
> > > > instead of
> > > > 
> > > > fun5:
> > > >         movzbl  foo5+4(%rip), %eax
> > > >         salq    $32, %rax
> > > >         movq    %rax, %rdx
> > > >         movl    foo5(%rip), %eax
> > > >         orq     %rdx, %rax
> > > >         ret
> > > > 
> > > > so the loads themselves are unaffected.
> > 
> > I see.  Few things:
> > 
> >    dst_words = XALLOCAVEC (rtx, n_regs);
> > +
> > +  slow_unaligned_access
> > +    = targetm.slow_unaligned_access (word_mode, TYPE_ALIGN (TREE_TYPE 
> > (src)));
> > +
> >    bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
> > 
> > please avoid the extra vertical space.
> > 
> > 
> > +
> > +      /* Find the largest integer mode that can be used to copy all or as
> > +        many bits as possible of the struct
> > 
> > likewise.
> 
> Done.
> 
> > 
> > +      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
> > +       if (padding_correction == 0
> > +           && !slow_unaligned_access
> > 
> > These conditions are invariant so please hoist them out of the
> > loop.
> 
> Done.
> 
> > 
> > +           && GET_MODE_BITSIZE (mode_iter.require ())
> > +                           <= ((bytes * BITS_PER_UNIT) - bitpos)
> > +           && GET_MODE_BITSIZE (mode_iter.require ()) <= BITS_PER_WORD)
> > +         bitsize = GET_MODE_BITSIZE (mode_iter.require ());
> > +       else
> > +         break;
> > 
> > The slow_unaligned_access target hook is passed a mode and an
> > alignment but you are assuming that a narrower mode behaves
> > the same with possibly different alignment.
> >
> 
> Ah, indeed, good point. Changing it to STRICT_ALIGNMENT would indeed be
> a better check. Done.
>  
> > I'm also not sure this loop will never compute to sth that is
> > smaller than the original conservative bitsize based on
> > TYPE_ALIGN of src, so maybe you could instead use sth like
> > 
> >   min_mode = get_mode_for_size (bitsize);
> > ...
> > 
> >   if (padding_correction == 0
> >       && !STRICT_ALIGNMENT)
> >     FOR_EACH_MODE_FROM (mode_iter, min_mode)
> >       {
> >         unsigned msize = GET_MODE_BITSIZE (mode_iter.require ());
> >         if (msize <= ((bytes * BITS_PER_UNIT) - bitpos)
> >             && msize <= BITS_PER_WORD)
> >           bitsize = msize;
> >         else
> >           break;
> >       }
> > ...
> > 
> > instead?
> 
> I've rewritten the loop as suggested.  It now doesn't pick a smaller mode
> than the initial bitsize.
> 
> New patch attached, no change to changelog.

+  min_mode = int_mode_for_mode (smallest_mode_for_size (bitsize, MODE_INT));

I think you can use smallest_int_mode_for_size (bitsize); given
bitsize is less than or equal to BITS_PER_WORD, this mode should
exist (the result isn't an opt_mode but a scalar_int_mode).
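
Roughly (a sketch only; min_mode then becomes a plain scalar_int_mode
instead of an opt_scalar_int_mode, and the .exists () / .require ()
dance goes away):

  /* bitsize <= BITS_PER_WORD here, so an integer mode of at least
     bitsize bits is known to exist.  */
  scalar_int_mode min_mode = smallest_int_mode_for_size (bitsize);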

OK with that change.

Richard.

> OK for trunk?
> 
> Thanks,
> Tamar
> 
> gcc/
> 2018-07-31  Tamar Christina  <tamar.christina@arm.com>
> 
> 	* expr.c (copy_blkmode_to_reg): Perform larger copies when safe.
> 
> > 
> > Thanks,
> > Richar.
> 
>

Patch

diff --git a/gcc/expr.c b/gcc/expr.c
index f665e187ebbbc7874ec88e84ca47ed991491c3e5..17b580aabf761491d8003ac74daa014bc252ea9f 100644
--- a/gcc/expr.c
+++ b/gcc/expr.c
@@ -2763,6 +2763,7 @@  copy_blkmode_to_reg (machine_mode mode_in, tree src)
   int i, n_regs;
   unsigned HOST_WIDE_INT bitpos, xbitpos, padding_correction = 0, bytes;
   unsigned int bitsize;
+  bool slow_unaligned_access;
   rtx *dst_words, dst, x, src_word = NULL_RTX, dst_word = NULL_RTX;
   /* No current ABI uses variable-sized modes to pass a BLKmnode type.  */
   fixed_size_mode mode = as_a <fixed_size_mode> (mode_in);
@@ -2795,6 +2796,10 @@  copy_blkmode_to_reg (machine_mode mode_in, tree src)
 
   n_regs = (bytes + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
   dst_words = XALLOCAVEC (rtx, n_regs);
+
+  slow_unaligned_access
+    = targetm.slow_unaligned_access (word_mode, TYPE_ALIGN (TREE_TYPE (src)));
+
   bitsize = MIN (TYPE_ALIGN (TREE_TYPE (src)), BITS_PER_WORD);
 
   /* Copy the structure BITSIZE bits at a time.  */
@@ -2816,6 +2821,23 @@  copy_blkmode_to_reg (machine_mode mode_in, tree src)
 	  emit_move_insn (dst_word, CONST0_RTX (word_mode));
 	}
 
+
+      /* Find the largest integer mode that can be used to copy all or as
+	 many bits as possible of the structure if the target supports larger
+	 copies.  There are too many corner cases here w.r.t to alignments on
+	 the read/writes.  So if there is any padding just use single byte
+	 operations.  */
+      opt_scalar_int_mode mode_iter;
+      FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
+	if (padding_correction == 0
+	    && !slow_unaligned_access
+	    && GET_MODE_BITSIZE (mode_iter.require ())
+			    <= ((bytes * BITS_PER_UNIT) - bitpos)
+	    && GET_MODE_BITSIZE (mode_iter.require ()) <= BITS_PER_WORD)
+	  bitsize = GET_MODE_BITSIZE (mode_iter.require ());
+	else
+	  break;
+
       /* We need a new source operand each time bitpos is on a word
 	 boundary.  */
       if (bitpos % BITS_PER_WORD == 0)