
[PATCH/AARCH64] Improve ThunderX code generation slightly with load/store pair

Message ID CA+=Sn1n_osNisfddZCgO14GSEWBCAPujbZhr43jz0Tqj9h8vmQ@mail.gmail.com
State New

Commit Message

Andrew Pinski Sept. 12, 2016, 9:28 p.m. UTC
On Tue, Aug 9, 2016 at 10:43 AM, Richard Earnshaw (lists)
<Richard.Earnshaw@arm.com> wrote:
> On 08/08/16 18:48, Andrew Pinski wrote:
>> On Fri, Aug 5, 2016 at 12:18 AM, Andrew Pinski <pinskia@gmail.com> wrote:
>>> Hi,
>>>   On ThunderX, a load (or store) pair of two 32-bit words is in some
>>> cases slower than doing two separate load/stores.  For some internal
>>> benchmarks, avoiding the pair provides a 2-5% improvement.
>>>
>>> This patch disables forming load/store pairs for SImode when tuning
>>> for ThunderX.  I used the tuning flags route so it can be overridden
>>> later if needed, or so someone else can use the same method for
>>> their core.
>>>
>>> OK?  Bootstrapped and tested on aarch64-linux-gnu with no regressions.
>>
>>
>> Here is a new version based on feedback both on and off the list.
>> I added a check that the alignment is at least 8 bytes, since it is
>> alignment < 8 that causes the slowdown.
>> I also added two new testcases to make sure the load pair
>> optimization is still done when it is profitable.
>>
>> OK?  Bootstrapped and tested on aarch64-linux-gnu with no regressions.
>>
>> Thanks,
>> Andrew Pinski
>>
>> ChangeLog:
>> * config/aarch64/aarch64-tuning-flags.def (slow_ldpw): New tuning option.
>> * config/aarch64/aarch64.c (thunderx_tunings): Enable
>> AARCH64_EXTRA_TUNE_SLOW_LDPW.
>> (aarch64_operands_ok_for_ldpstp): Return false if
>> AARCH64_EXTRA_TUNE_SLOW_LDPW and the mode was SImode
>> and the alignment is less than 8 byte.
>> (aarch64_operands_adjust_ok_for_ldpstp): Likewise.
>>
>> testsuite/ChangeLog:
>> * gcc.target/aarch64/thunderxloadpair.c: New testcase.
>> * gcc.target/aarch64/thunderxnoloadpair.c: New testcase.
>>
>>>
>>> Thanks,
>>> Andrew Pinski
>>>
>>> ChangeLog:
>>> * config/aarch64/aarch64-tuning-flags.def (slow_ldpw): New tuning option.
>>> * config/aarch64/aarch64.c (thunderx_tunings): Enable
>>> AARCH64_EXTRA_TUNE_SLOW_LDPW.
>>> (aarch64_operands_ok_for_ldpstp): Return false if
>>> AARCH64_EXTRA_TUNE_SLOW_LDPW and the mode was SImode.
>>> (aarch64_operands_adjust_ok_for_ldpstp): Likewise.
>>>
>
> OK with the changes noted below.

This is what I committed after a bootstrap/test.

2016-09-12  Andrew Pinski  <apinski@cavium.com>

        * config/aarch64/aarch64-tuning-flags.def (SLOW_UNALIGNED_LDPW):
        New tuning option.
        * config/aarch64/aarch64.c (thunderx_tunings): Enable
        AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW.
        (aarch64_operands_ok_for_ldpstp): Return false if
        AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW and the mode
        is SImode and the alignment is less than 8 bytes.
        (aarch64_operands_adjust_ok_for_ldpstp): Likewise.
2016-09-12  Andrew Pinski  <apinski@cavium.com>

        * gcc.target/aarch64/thunderxloadpair.c: New testcase.
        * gcc.target/aarch64/thunderxnoloadpair.c: New testcase.

Thanks,
Andrew
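
For reference, here is a minimal standalone sketch of the condition the
patch adds to aarch64_operands_ok_for_ldpstp and
aarch64_operands_adjust_ok_for_ldpstp.  This is not GCC code; the function
simode_pair_ok and its parameters are stand-ins used only for illustration.

#include <stdbool.h>
#include <stdio.h>

#define BITS_PER_UNIT 8

/* Model of the new check: with slow_unaligned_ldpw set and not optimizing
   for size, only allow an SImode ldp/stp when the first access is known
   to be at least 8 byte aligned.  */
static bool
simode_pair_ok (unsigned int mem_align_bits, bool slow_unaligned_ldpw,
                bool optimize_size)
{
  if (slow_unaligned_ldpw
      && !optimize_size
      && mem_align_bits < 8 * BITS_PER_UNIT)
    return false;
  return true;
}

int
main (void)
{
  /* 8 byte aligned, as in thunderxloadpair.c: the pair is allowed.  */
  printf ("align 64: %d\n", simode_pair_ok (64, true, false));
  /* Only 4 byte aligned, as in thunderxnoloadpair.c: the pair is rejected.  */
  printf ("align 32: %d\n", simode_pair_ok (32, true, false));
  return 0;
}

The two testcases in the patch below exercise exactly these two situations:
in thunderxloadpair.c the preceding long long makes a->a 8 byte aligned,
while in thunderxnoloadpair.c only 4 byte alignment is known.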

>
> R.
>
>>> stldpw.diff.txt
>>>
>>>
>>> Index: config/aarch64/aarch64-tuning-flags.def
>>> ===================================================================
>>> --- config/aarch64/aarch64-tuning-flags.def  (revision 239228)
>>> +++ config/aarch64/aarch64-tuning-flags.def  (working copy)
>>> @@ -29,3 +29,4 @@
>>>       AARCH64_TUNE_ to give an enum name. */
>>>
>>>  AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
>>> +AARCH64_EXTRA_TUNING_OPTION ("slow_ldpw", SLOW_LDPW)
>
> I think this should now be renamed SLOW_UNALIGNED_LDPW.  I think it also
> should be commented as to what it does at this point.
>
>>> Index: config/aarch64/aarch64.c
>>> ===================================================================
>>> --- config/aarch64/aarch64.c (revision 239228)
>>> +++ config/aarch64/aarch64.c (working copy)
>>> @@ -712,7 +712,7 @@
>>>    0,        /* max_case_values.  */
>>>    0,        /* cache_line_size.  */
>>>    tune_params::AUTOPREFETCHER_OFF,  /* autoprefetcher_model.  */
>>> -  (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
>>> +  (AARCH64_EXTRA_TUNE_SLOW_LDPW)    /* tune_flags.  */
>>>  };
>>>
>>>  static const struct tune_params xgene1_tunings =
>>> @@ -13593,6 +13593,15 @@
>>>    if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
>>>      return false;
>>>
>>> +  /* If we have SImode and slow ldp, check the alignment to be greater
>>> +     than 8 byte. */
>
> Comment doesn't match code.  Should be "at least 8 byte alignment".
>
>>> +  if (mode == SImode
>>> +      && (aarch64_tune_params.extra_tuning_flags
>>> +          & AARCH64_EXTRA_TUNE_SLOW_LDPW)
>>> +      && !optimize_size
>>> +      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
>>> +    return false;
>>> +
>>>    /* Check if the addresses are in the form of [base+offset].  */
>>>    extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
>>>    if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
>>> @@ -13752,6 +13761,15 @@
>>>      return false;
>>>      }
>>>
>>> +  /* If we have SImode and slow ldp, check the alignment to be greater
>>> +     than 8 byte. */
>
> Likewise.
>
>>> +  if (mode == SImode
>>> +      && (aarch64_tune_params.extra_tuning_flags
>>> +          & AARCH64_EXTRA_TUNE_SLOW_LDPW)
>>> +      && !optimize_size
>>> +      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
>>> +    return false;
>>> +
>>>    if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
>>>      rclass_1 = FP_REGS;
>>>    else
>>> Index: testsuite/gcc.target/aarch64/thunderxloadpair.c
>>> ===================================================================
>>> --- testsuite/gcc.target/aarch64/thunderxloadpair.c  (nonexistent)
>>> +++ testsuite/gcc.target/aarch64/thunderxloadpair.c  (working copy)
>>> @@ -0,0 +1,20 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -mcpu=thunderx" } */
>>> +
>>> +struct ldp
>>> +{
>>> +  long long c;
>>> +  int a, b;
>>> +};
>>> +
>>> +
>>> +int f(struct ldp *a)
>>> +{
>>> +  return a->a + a->b;
>>> +}
>>> +
>>> +
>>> +/* We know the alignment of a->a to be 8 bytes, so it is profitable
>>> +   to do ldp. */
>>> +/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 1 } } */
>>> +
>>> Index: testsuite/gcc.target/aarch64/thunderxnoloadpair.c
>>> ===================================================================
>>> --- testsuite/gcc.target/aarch64/thunderxnoloadpair.c        (nonexistent)
>>> +++ testsuite/gcc.target/aarch64/thunderxnoloadpair.c        (working copy)
>>> @@ -0,0 +1,17 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -mcpu=thunderx" } */
>>> +
>>> +struct noldp
>>> +{
>>> +  int a, b;
>>> +};
>>> +
>>> +
>>> +int f(struct noldp *a)
>>> +{
>>> +  return a->a + a->b;
>>> +}
>>> +
>>> +/* We know the alignment of a->a to be only 4 bytes, so it is not
>>> +   profitable to do ldp. */
>>> +/* { dg-final { scan-assembler-time "ldp\tw\[0-9\]+, w\[0-9\]" 1 } } */
>

Patch

Index: gcc/ChangeLog
===================================================================
--- gcc/ChangeLog	(revision 240100)
+++ gcc/ChangeLog	(working copy)
@@ -1,3 +1,14 @@ 
+2016-09-12  Andrew Pinski  <apinski@cavium.com>
+
+	* config/aarch64/aarch64-tuning-flags.def (SLOW_UNALIGNED_LDPW):
+	New tuning option.
+	* config/aarch64/aarch64.c (thunderx_tunings): Enable
+	AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW.
+	(aarch64_operands_ok_for_ldpstp): Return false if
+	AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW and the mode
+	is SImode and the alignment is less than 8 bytes.
+	(aarch64_operands_adjust_ok_for_ldpstp): Likewise.
+
 2016-09-12  Marek Polacek  <polacek@redhat.com>
 
 	* doc/extend.texi: Use lowercase "boolean".
Index: gcc/config/aarch64/aarch64-tuning-flags.def
===================================================================
--- gcc/config/aarch64/aarch64-tuning-flags.def	(revision 240100)
+++ gcc/config/aarch64/aarch64-tuning-flags.def	(working copy)
@@ -29,3 +29,8 @@ 
      AARCH64_TUNE_ to give an enum name. */
 
 AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs", RENAME_FMA_REGS)
+
+/* Don't create non-8-byte-aligned load/store pairs.  That is, if the
+   two load/stores are not at least 8 byte aligned, don't create
+   load/store pairs.  */
+AARCH64_EXTRA_TUNING_OPTION ("slow_unaligned_ldpw", SLOW_UNALIGNED_LDPW)
Index: gcc/config/aarch64/aarch64.c
===================================================================
--- gcc/config/aarch64/aarch64.c	(revision 240100)
+++ gcc/config/aarch64/aarch64.c	(working copy)
@@ -712,7 +712,7 @@ 
   0,	/* max_case_values.  */
   0,	/* cache_line_size.  */
   tune_params::AUTOPREFETCHER_OFF,	/* autoprefetcher_model.  */
-  (AARCH64_EXTRA_TUNE_NONE)	/* tune_flags.  */
+  (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)	/* tune_flags.  */
 };
 
 static const struct tune_params xgene1_tunings =
@@ -13629,6 +13629,15 @@ 
   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
     return false;
 
+  /* If we have SImode and slow unaligned ldp,
+     check the alignment to be at least 8 bytes.  */
+  if (mode == SImode
+      && (aarch64_tune_params.extra_tuning_flags
+          & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
+      && !optimize_size
+      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
+    return false;
+
   /* Check if the addresses are in the form of [base+offset].  */
   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
@@ -13788,6 +13797,15 @@ 
 	return false;
     }
 
+  /* If we have SImode and slow unaligned ldp,
+     check the alignment to be at least 8 bytes.  */
+  if (mode == SImode
+      && (aarch64_tune_params.extra_tuning_flags
+          & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
+      && !optimize_size
+      && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
+    return false;
+
   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
     rclass_1 = FP_REGS;
   else
Index: gcc/testsuite/ChangeLog
===================================================================
--- gcc/testsuite/ChangeLog	(revision 240100)
+++ gcc/testsuite/ChangeLog	(working copy)
@@ -1,3 +1,8 @@ 
+2016-09-12  Andrew Pinski  <apinski@cavium.com>
+
+	* gcc.target/aarch64/thunderxloadpair.c: New testcase.
+	* gcc.target/aarch64/thunderxnoloadpair.c: New testcase.
+
 2016-09-12  Uros Bizjak  <ubizjak@gmail.com>
 
 	* gcc.dg/compat/scalar-by-value-4_x.c: Also test passing of
Index: gcc/testsuite/gcc.target/aarch64/thunderxloadpair.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/thunderxloadpair.c	(nonexistent)
+++ gcc/testsuite/gcc.target/aarch64/thunderxloadpair.c	(working copy)
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=thunderx" } */
+
+struct ldp
+{
+  long long c;
+  int a, b;
+};
+
+
+int f(struct ldp *a)
+{
+  return a->a + a->b;
+}
+
+
+/* We know the alignment of a->a to be 8 bytes, so it is profitable
+   to do ldp. */
+/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 1 } } */
+
Index: gcc/testsuite/gcc.target/aarch64/thunderxnoloadpair.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/thunderxnoloadpair.c	(nonexistent)
+++ gcc/testsuite/gcc.target/aarch64/thunderxnoloadpair.c	(working copy)
@@ -0,0 +1,17 @@ 
+/* { dg-do compile } */
+/* { dg-options "-O2 -mcpu=thunderx" } */
+
+struct noldp
+{
+  int a, b;
+};
+
+
+int f(struct noldp *a)
+{
+  return a->a + a->b;
+}
+
+/* We know the alignment of a->a to be only 4 bytes, so it is not
+   profitable to do ldp. */
+/* { dg-final { scan-assembler-not "ldp\tw\[0-9\]+, w\[0-9\]" } } */
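
As an illustrative addendum, not part of the committed patch: if the
8 byte alignment is instead guaranteed with an aligned attribute, MEM_ALIGN
of a->a should again be 64 bits, so the pair is still expected to be formed
even though the struct holds nothing but the two ints.  A hypothetical
testcase along the same lines (the struct name and the expected scan are
assumptions; this variant was not tested here):

/* { dg-do compile } */
/* { dg-options "-O2 -mcpu=thunderx" } */

struct alignedldp
{
  int a, b;
} __attribute__((aligned(8)));

int f(struct alignedldp *a)
{
  return a->a + a->b;
}

/* The aligned attribute makes a->a known to be 8 byte aligned, so ldp
   should again be profitable.  */
/* { dg-final { scan-assembler-times "ldp\tw\[0-9\]+, w\[0-9\]" 1 } } */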