diff mbox

[AArch64] Improve HFA code generation

Message ID 1497968507-24709-1-git-send-email-james.greenhalgh@arm.com
State New
Headers show

Commit Message

James Greenhalgh June 20, 2017, 2:21 p.m. UTC
Hi,

For this code:

  struct y {
    float x[4];
  };

  float
  bar3 (struct y x)
  {
    return x.x[3];
  }

GCC generates:

  bar3:
	fmov	x1, d2
	mov	x0, 0
	bfi	x0, x1, 0, 32
	fmov	x1, d3
	bfi	x0, x1, 32, 32
	sbfx	x0, x0, 32, 32
	fmov	s0, w0
	ret

If you can wrap your head around that, you'll spot that it could be
simplified to:

  bar3:
	fmov	s0, s3
	ret

Looking at it, I think the issue is the mode that we assign to the
PARALLEL we build for an HFA in registers. When we get in to
aarch64_layout_arg with a composite, MODE is set to the smallest integer
mode that would contain the size of the composite type. That is to say, in
the example above, MODE will be TImode.

Looking at the expansion path through assign_parms, we're going to go:

assign_parms
  assign_parm_setup_reg
    assign_parm_remove_parallels
      emit_group_store

assign_parm_remove_parallels is going to try to create a REG in MODE,
then construct that REG using the values in the HFA PARALLEL we created. So,
for the example above, we're going to try to create a packed TImode value
built up from each of the four "S" registers we've assigned for the
arguments. Using one of the struct elements is then a 32-bit extract from
the TImode value (then a move back to FP/SIMD registers). This explains
the code-gen in the example. Note that an extract from the TImode value
makes the whole TImode value live, so we can't optimize away the
construction in registers.

If instead we make the PARALLEL that we create in aarch64_layout_arg BLKmode
then our expansion path is through:

assign_parms
  assign_parm_setup_block

Which handles creating a stack slot of the right size for our HFA, and
copying it to there. We could then trust the usual optimisers to deal with
the object construction and eliminate it where possible.

However, we can't just return a BLKmode Parallel, as the mid-end was explictly
asking us to return in MODE, and will eventually ICE given the inconsistency.
One other way we can force these structures to be given BLKmode is through
TARGET_MEMBER_TYPE_FORCES_BLK. Which is what we do in this patch.

We're going to tell the mid-end that any structure of more than one element
which contains either floating-point or vector data should be set out in
BLKmode rather than a large-enough integer mode. In doing so, we implicitly
fix the issue with HFA layout above. But at what cost!

A long running deficiency in GCC's code-gen (doesn't clean up stack
allocations after stack uses have been eliminated) prevents us from
getting what we really wanted, but:

  bar3:
	sub	sp, sp, #16
	fmov	s0, s3
	add	sp, sp, 16
	ret

is pretty close, and a huge improvement over where we are today.

Note that we can still get some pretty bad code-generation out of the
compiler when passing and returning structs. I particularly like this
one:

  struct y {
    float x[4];
  };

  struct y
  bar (struct y x)
  {
    return x;
  }

  bar:
	sub	sp, sp, #48
	stp	s0, s1, [sp, 16]
	stp	s2, s3, [sp, 24]
	ldp	x0, x1, [sp, 16]
	stp	x0, x1, [sp, 32]
	ldp	s0, s1, [sp, 32]
	ldp	s2, s3, [sp, 40]
	add	sp, sp, 48
	ret

But that looks to be a seperate issue, and is not substantially worse tha
current trunk:

  bar:
	fmov	x2, d0
	mov	x1, 0
	mov	x0, 0
	bfi	x1, x2, 0, 32
	fmov	x2, d2
	bfi	x0, x2, 0, 32
	fmov	x2, d1
	bfi	x1, x2, 32, 32
	fmov	x2, d3
	bfi	x0, x2, 32, 32
	ubfx	x2, x1, 0, 32
	ubfx	x1, x1, 32, 32
	fmov	s0, w2
	ubfx	x3, x0, 0, 32
	fmov	s1, w1
	ubfx	x0, x0, 32, 32
	fmov	s2, w3
	fmov	s3, w0
	ret

I've benchamrked this with Spec2000 and found no performance differences.
And bootstrapped on aarch64-none-linux-gnu with no issues.

Does this look like a sensible approach and if so, is it OK for trunk?

Thanks,
James

---
gcc/

2017-06-20  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/aarch64.c (aarch64_layout_arg): Construct HFA
	PARALLELs in BLKmode.

gcc/testsuite/

2017-06-20  James Greenhalgh  <james.greenhalgh@arm.com>

	* gcc.target/aarch64/hfa_1.c: New.

Comments

Richard Sandiford June 26, 2017, 7:03 p.m. UTC | #1
James Greenhalgh <james.greenhalgh@arm.com> writes:
> Hi,
>
> For this code:
>
>   struct y {
>     float x[4];
>   };
>
>   float
>   bar3 (struct y x)
>   {
>     return x.x[3];
>   }
>
> GCC generates:
>
>   bar3:
> 	fmov	x1, d2
> 	mov	x0, 0
> 	bfi	x0, x1, 0, 32
> 	fmov	x1, d3
> 	bfi	x0, x1, 32, 32
> 	sbfx	x0, x0, 32, 32
> 	fmov	s0, w0
> 	ret
>
> If you can wrap your head around that, you'll spot that it could be
> simplified to:
>
>   bar3:
> 	fmov	s0, s3
> 	ret

I get the second version with current trunk, at -O and above.  Does this
only happen with some other modifications, or for certain subtargets?
Or maybe I'm testing the wrong thing.

> Looking at it, I think the issue is the mode that we assign to the
> PARALLEL we build for an HFA in registers. When we get in to
> aarch64_layout_arg with a composite, MODE is set to the smallest integer
> mode that would contain the size of the composite type. That is to say, in
> the example above, MODE will be TImode.
>
> Looking at the expansion path through assign_parms, we're going to go:
>
> assign_parms
>   assign_parm_setup_reg
>     assign_parm_remove_parallels
>       emit_group_store
>
> assign_parm_remove_parallels is going to try to create a REG in MODE,
> then construct that REG using the values in the HFA PARALLEL we created. So,
> for the example above, we're going to try to create a packed TImode value
> built up from each of the four "S" registers we've assigned for the
> arguments. Using one of the struct elements is then a 32-bit extract from
> the TImode value (then a move back to FP/SIMD registers). This explains
> the code-gen in the example. Note that an extract from the TImode value
> makes the whole TImode value live, so we can't optimize away the
> construction in registers.
>
> If instead we make the PARALLEL that we create in aarch64_layout_arg BLKmode
> then our expansion path is through:
>
> assign_parms
>   assign_parm_setup_block
>
> Which handles creating a stack slot of the right size for our HFA, and
> copying it to there. We could then trust the usual optimisers to deal with
> the object construction and eliminate it where possible.
>
> However, we can't just return a BLKmode Parallel, as the mid-end was explictly
> asking us to return in MODE, and will eventually ICE given the inconsistency.
> One other way we can force these structures to be given BLKmode is through
> TARGET_MEMBER_TYPE_FORCES_BLK. Which is what we do in this patch.
>
> We're going to tell the mid-end that any structure of more than one element
> which contains either floating-point or vector data should be set out in
> BLKmode rather than a large-enough integer mode. In doing so, we implicitly
> fix the issue with HFA layout above. But at what cost!

The patch only seems to handle scalar floats, not vectors.

Wasn't replying to this to nitpick though :-).  I just wondered whether
there would be any benefit to having an array_mode hook that returns
V4SF for float[4].  (We needed that hook to handle load/store lanes
for SVE, so see git branch linaro-dev/sve if you want to try it.)
Maybe alignment would be a problem though?

Thanks,
Richard
diff mbox

Patch

diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 04417dc..a147068 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -14925,6 +14925,32 @@  aarch64_sched_can_speculate_insn (rtx_insn *insn)
     }
 }
 
+/* We're an composite type, so MODE is the smallest integer mode
+   that can fit the total size of our aggregate.  However,
+   we're going to build a parallel that contains each of our
+   registers, and GCC is going to emit code to move them in
+   to a packed value in MODE.  As an example, for an HFA of
+   two floats, this means creating a packed DImode value from
+   which we can then extract the SFmode elements.  This adds
+   considerable inefficiency.  We can avoid that issue by
+   creating the parallel with BLKmode, which forces the
+   the structure to be constructed on the stack.
+   Optimisation passes may then elide the stack stores if
+   possible.  */
+
+static bool
+aarch64_member_type_forces_blk (const_tree field, machine_mode mode)
+{
+  machine_mode field_mode = TYPE_MODE (TREE_TYPE (field));
+  /* If we are a multi-element structure, and any of those elements are
+     floating-point or vector types then move the whole thing to BLKmode
+     to prevent the silly compiler from deciding to build great big stonking
+     integers full of HFA data.  Though presumably this is going to blow
+     copies of those structures, which is annoying.  */
+  return (mode == VOIDmode
+	  && (SCALAR_FLOAT_MODE_P (field_mode)));
+}
+
 /* Target-specific selftests.  */
 
 #if CHECKING_P
@@ -15115,6 +15141,9 @@  aarch64_libgcc_floating_mode_supported_p
 #undef TARGET_MANGLE_TYPE
 #define TARGET_MANGLE_TYPE aarch64_mangle_type
 
+#undef TARGET_MEMBER_TYPE_FORCES_BLK
+#define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
+
 #undef TARGET_MEMORY_MOVE_COST
 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
 
diff --git a/gcc/testsuite/gcc.target/aarch64/hfa_1.c b/gcc/testsuite/gcc.target/aarch64/hfa_1.c
new file mode 100644
index 0000000..55c22cf
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/hfa_1.c
@@ -0,0 +1,34 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O3" } */
+
+struct x {
+  float x[4];
+};
+
+float
+foo0 (struct x x)
+{
+  return x.x[0];
+}
+
+float
+foo1 (struct x x)
+{
+  return x.x[1];
+}
+
+float
+foo2 (struct x x)
+{
+  return x.x[2];
+}
+
+
+float
+foo3 (struct x x)
+{
+  return x.x[3];
+}
+
+/* We don't want to see any moves to the integer register bank.  */
+/* { dg-final { scan-assembler-not "fmov\\t\[wx\]\[0-9\]+"} } */