diff mbox

Honor ix86_accumulate_outgoing_args again

Message ID CAMe9rOopMxzT2T7e2ahQo=Av0bAFonKoyDjiR8y1cvmWQEB-yA@mail.gmail.com
State New
Headers show

Commit Message

H.J. Lu Nov. 12, 2013, 12:37 a.m. UTC
On Mon, Nov 11, 2013 at 4:18 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Thu, Oct 10, 2013 at 08:40:05PM +0200, Jan Hubicka wrote:
>> --- config/i386/x86-tune.def  (revision 203387)
>> +++ config/i386/x86-tune.def  (working copy)
>
>> +/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if true, unaligned loads are
>> +   split.  */
>> +DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
>> +          ~(m_COREI7 | m_GENERIC))
>> +
>> +/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if true, unaligned loads are
>
> s/loads/stores/
>
>> +   split.  */
>> +DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_load_optimal",
>> +          ~(m_COREI7 | m_BDVER | m_GENERIC))
>
> s/load/store/
>

We should exclude m_COREI7_AVX since Ivy Bridge needs to split
32-byte unaligned loads and stores.

Also, the comments should say "if false, unaligned loads are split."
instead of "if true, ...".

> Also, I wonder if we couldn't improve the generated code for
> -mavx2 -mtune=generic or -march=core-avx2 -mtune=generic etc.
> - m_GENERIC is included clearly because vmovup{s,d} was really bad
> on SandyBridge (am I right here?), but if the ISA includes AVX2, then
> the code will not run on that chip at all, so can't we override it?

Yes, we should do it, similar to

       if (TARGET_AVX
              || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
              || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
              || optimize_insn_for_size_p ())
            {
              /* We will eventually emit movups based on insn attributes.  */
              emit_insn (gen_sse2_loadupd (op0, op1));
              return;
            }

>> @@ -3946,10 +3933,10 @@ ix86_option_override_internal (bool main
>>        if (flag_expensive_optimizations
>>         && !(target_flags_explicit & MASK_VZEROUPPER))
>>       target_flags |= MASK_VZEROUPPER;
>> -      if ((x86_avx256_split_unaligned_load & ix86_tune_mask)
>> +      if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL]
>
> Didn't you mean to use X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL here?
>
>>         && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
>>       target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
>> -      if ((x86_avx256_split_unaligned_store & ix86_tune_mask)
>> +      if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL]
>
> And similarly for STORE here?
>
>>         && !(target_flags_explicit & MASK_AVX256_SPLIT_UNALIGNED_STORE))
>>       target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
>>        /* Enable 128-bit AVX instruction generation
>
>         Jakub

Something like this.

Comments

Jan Hubicka Nov. 12, 2013, 10:05 a.m. UTC | #1
This is OK, thanks for catching the pasto!
Only...
> diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
> index 430d562..b8cb871 100644
> --- a/gcc/config/i386/i386.c
> +++ b/gcc/config/i386/i386.c
> @@ -3974,10 +3974,10 @@ ix86_option_override_internal (bool main_args_p,
>        if (flag_expensive_optimizations
>        && !(opts_set->x_target_flags & MASK_VZEROUPPER))
>      opts->x_target_flags |= MASK_VZEROUPPER;
> -      if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL]
> +      if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
>        && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
>      opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
> -      if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL]
> +      if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
>        && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
>      opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
>        /* Enable 128-bit AVX instruction generation
> @@ -16576,7 +16576,7 @@ ix86_avx256_split_vector_move_misalign (rtx
> op0, rtx op1)
> 
>    if (MEM_P (op1))
>      {
> -      if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
> +      if (!TARGET_AVX2 && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
>      {
>        rtx r = gen_reg_rtx (mode);
>        m = adjust_address (op1, mode, 0);
> @@ -16596,7 +16596,7 @@ ix86_avx256_split_vector_move_misalign (rtx
> op0, rtx op1)
>      }
>    else if (MEM_P (op0))
>      {
> -      if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
> +      if (!TARGET_AVX2 && TARGET_AVX256_SPLIT_UNALIGNED_STORE)

I would add explanation comment on those two.

Shall we also disable argument accumulation for cores? It seems we won't solve the IRA issues, right?

Honza
Jakub Jelinek Nov. 12, 2013, 10:26 a.m. UTC | #2
On Tue, Nov 12, 2013 at 11:05:45AM +0100, Jan Hubicka wrote:
> > @@ -16576,7 +16576,7 @@ ix86_avx256_split_vector_move_misalign (rtx
> > op0, rtx op1)
> > 
> >    if (MEM_P (op1))
> >      {
> > -      if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
> > +      if (!TARGET_AVX2 && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
> >      {
> >        rtx r = gen_reg_rtx (mode);
> >        m = adjust_address (op1, mode, 0);
> > @@ -16596,7 +16596,7 @@ ix86_avx256_split_vector_move_misalign (rtx
> > op0, rtx op1)
> >      }
> >    else if (MEM_P (op0))
> >      {
> > -      if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
> > +      if (!TARGET_AVX2 && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
> 
> I would add explanation comment on those two.

Looking at http://gcc.gnu.org/ml/gcc-patches/2013-11/msg01235.html
we are going to have some AMD CPU with AVX2 support soon, the question is
if it will prefer 256-bit vmovups/vmovupd/vmovdqu or split, but even
if it will prefer split, the question is if like bdver{1,2,3} it will
be X86_TUNE_AVX128_OPTIMAL, because if yes, then how 256-bit unaligned
loads/stores are handled is much less important there.  Ganesh?

> Shall we also disable argument accumulation for cores? It seems we won't
> solve the IRA issues, right?

You mean LRA issues here, right?  If you are starting to use
no-accumulate-outgoing-args much more often than in the past, I think
the problem that LRA forces a frame pointer in that case is much more
important now (or has that been fixed in the mean time?).  Vlad?

	Jakub
Vladimir Makarov Nov. 12, 2013, 3:39 p.m. UTC | #3
On 11/12/2013 05:26 AM, Jakub Jelinek wrote:
> On Tue, Nov 12, 2013 at 11:05:45AM +0100, Jan Hubicka wrote:
>>> @@ -16576,7 +16576,7 @@ ix86_avx256_split_vector_move_misalign (rtx
>>> op0, rtx op1)
>>>
>>>    if (MEM_P (op1))
>>>      {
>>> -      if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
>>> +      if (!TARGET_AVX2 && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
>>>      {
>>>        rtx r = gen_reg_rtx (mode);
>>>        m = adjust_address (op1, mode, 0);
>>> @@ -16596,7 +16596,7 @@ ix86_avx256_split_vector_move_misalign (rtx
>>> op0, rtx op1)
>>>      }
>>>    else if (MEM_P (op0))
>>>      {
>>> -      if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
>>> +      if (!TARGET_AVX2 && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
>> I would add explanation comment on those two.
> Looking at http://gcc.gnu.org/ml/gcc-patches/2013-11/msg01235.html
> we are going to have some AMD CPU with AVX2 support soon, the question is
> if it will prefer 256-bit vmovups/vmovupd/vmovdqu or split, but even
> if it will prefer split, the question is if like bdver{1,2,3} it will
> be X86_TUNE_AVX128_OPTIMAL, because if yes, then how 256-bit unaligned
> loads/stores are handled is much less important there.  Ganesh?
>
>> Shall we also disable argument accumulation for cores? It seems we won't
>> solve the IRA issues, right?
> You mean LRA issues here, right?  If you are starting to use
> no-accumulate-outgoing-args much more often than in the past, I think
> the problem that LRA forces a frame pointer in that case is much more
> important now (or has that been fixed in the mean time?).  Vlad?
>
>
I guess it is serious.  So I should fix this for gcc-4.9 in any case.
I'd say it needs 1-2 weeks of my work.  Right now I am still thinking how
to best approach the implementation of it in LRA.
Jakub Jelinek Nov. 12, 2013, 3:49 p.m. UTC | #4
On Tue, Nov 12, 2013 at 10:39:28AM -0500, Vladimir Makarov wrote:
> >> Shall we also disable argument accumulation for cores? It seems we won't
> >> solve the IRA issues, right?
> > You mean LRA issues here, right?  If you are starting to use
> > no-accumulate-outgoing-args much more often than in the past, I think
> > the problem that LRA forces a frame pointer in that case is much more
> > important now (or has that been fixed in the mean time?).  Vlad?
> >
> >
> I guess it is serious.  So I should fix this for gcc-4.9 in any case.
> I'd say it needs 1-2 weeks of my work.  Right now I am still thinking how
> to best approach the implementation of it in LRA.

That would be nice.  Given that it is a regression from 4.7 anyway, it can
be fixed in stage3 too, but preferably sooner than later, so that there is
some time to tune the backend tunables.

	Jakub
Jan Hubicka Nov. 12, 2013, 6:51 p.m. UTC | #5
> On Tue, Nov 12, 2013 at 10:39:28AM -0500, Vladimir Makarov wrote:
> > >> Shall we also disable argument accumulation for cores? It seems we won't
> > >> solve the IRA issues, right?
> > > You mean LRA issues here, right?  If you are starting to use
> > > no-accumulate-outgoing-args much more often than in the past, I think
> > > the problem that LRA forces a frame pointer in that case is much more
> > > important now (or has that been fixed in the mean time?).  Vlad?
> > >
> > >
> > I guess it is serious.  So I should fix this for gcc-4.9 in any case.
> > I'd say it needs 1-2 weeks of my work.  Right now I am still thinking how
> > to best approach the implementation of it in LRA.
> 
> That would be nice.  Given that it is a regression from 4.7 anyway, it can
> be fixed in stage3 too, but preferably sooner than later, so that there is
> some time to tune the backend tunables.

Sounds good.  I do not think it is critical - we can always just keep argument
accumulation on as we did for 4.8 and probably few earlier releases, but it
would be really nice to fix this the correct way.

Thank you!
Honza
> 
> 	Jakub
Gopalasubramanian, Ganesh Nov. 13, 2013, 5:10 a.m. UTC | #6
> we are going to have some AMD CPU with AVX2 support soon, the question is
> if it will prefer 256-bit vmovups/vmovupd/vmovdqu or split, but even
> if it will prefer split, the question is if like bdver{1,2,3} it will
> be X86_TUNE_AVX128_OPTIMAL, because if yes, then how 256-bit unaligned
> loads/stores are handled is much less important there.  Ganesh?

256-bit is friendly on bdver4. 
But, 256 bit unaligned stores are micro-coded which we would like to avoid. So we require 128-bit MOVUPS.

-----Original Message-----
From: Jakub Jelinek [mailto:jakub@redhat.com] 
Sent: Tuesday, November 12, 2013 3:57 PM
To: Jan Hubicka
Cc: H.J. Lu; Vladimir Makarov; GCC Patches; Uros Bizjak; Richard Henderson; Gopalasubramanian, Ganesh
Subject: Re: Honnor ix86_accumulate_outgoing_args again

On Tue, Nov 12, 2013 at 11:05:45AM +0100, Jan Hubicka wrote:
> > @@ -16576,7 +16576,7 @@ ix86_avx256_split_vector_move_misalign (rtx 
> > op0, rtx op1)
> > 
> >    if (MEM_P (op1))
> >      {
> > -      if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
> > +      if (!TARGET_AVX2 && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
> >      {
> >        rtx r = gen_reg_rtx (mode);
> >        m = adjust_address (op1, mode, 0); @@ -16596,7 +16596,7 @@ 
> > ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
> >      }
> >    else if (MEM_P (op0))
> >      {
> > -      if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
> > +      if (!TARGET_AVX2 && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
> 
> I would add explanation comment on those two.

Looking at http://gcc.gnu.org/ml/gcc-patches/2013-11/msg01235.html
we are going to have some AMD CPU with AVX2 support soon, the question is
if it will prefer 256-bit vmovups/vmovupd/vmovdqu or split, but even
if it will prefer split, the question is if like bdver{1,2,3} it will
be X86_TUNE_AVX128_OPTIMAL, because if yes, then how 256-bit unaligned
loads/stores are handled is much less important there.  Ganesh?

> Shall we also disable argument accumulation for cores? It seems we won't
> solve the IRA issues, right?

You mean LRA issues here, right?  If you are starting to use
no-accumulate-outgoing-args much more often than in the past, I think
the problem that LRA forces a frame pointer in that case is much more
important now (or has that been fixed in the mean time?).  Vlad?

	Jakub
diff mbox

Patch

diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 430d562..b8cb871 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -3974,10 +3974,10 @@  ix86_option_override_internal (bool main_args_p,
       if (flag_expensive_optimizations
       && !(opts_set->x_target_flags & MASK_VZEROUPPER))
     opts->x_target_flags |= MASK_VZEROUPPER;
-      if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL]
+      if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL]
       && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_LOAD))
     opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_LOAD;
-      if (!ix86_tune_features[X86_TUNE_SSE_UNALIGNED_STORE_OPTIMAL]
+      if (!ix86_tune_features[X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL]
       && !(opts_set->x_target_flags & MASK_AVX256_SPLIT_UNALIGNED_STORE))
     opts->x_target_flags |= MASK_AVX256_SPLIT_UNALIGNED_STORE;
       /* Enable 128-bit AVX instruction generation
@@ -16576,7 +16576,7 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)

   if (MEM_P (op1))
     {
-      if (TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
+      if (!TARGET_AVX2 && TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
     {
       rtx r = gen_reg_rtx (mode);
       m = adjust_address (op1, mode, 0);
@@ -16596,7 +16596,7 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
     }
   else if (MEM_P (op0))
     {
-      if (TARGET_AVX256_SPLIT_UNALIGNED_STORE)
+      if (!TARGET_AVX2 && TARGET_AVX256_SPLIT_UNALIGNED_STORE)
     {
       m = adjust_address (op0, mode, 0);
       emit_insn (extract (m, op1, const0_rtx));
diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def
index 1a85ce2..6c7063a 100644
--- a/gcc/config/i386/x86-tune.def
+++ b/gcc/config/i386/x86-tune.def
@@ -376,15 +376,15 @@ DEF_TUNE (X86_TUNE_USE_VECTOR_CONVERTS, "use_vector_converts", m_AMDFAM10)
 /* AVX instruction selection tuning (some of SSE flags affects AVX, too)     */
 /*****************************************************************************/

-/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if true, unaligned loads are
+/* X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL: if false, unaligned loads are
    split.  */
 DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_LOAD_OPTIMAL, "256_unaligned_load_optimal",
-          ~(m_COREI7 | m_GENERIC))
+          ~(m_COREI7 | m_COREI7_AVX | m_GENERIC))

-/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if true, unaligned loads are
+/* X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL: if false, unaligned stores are
    split.  */
-DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_load_optimal",
-          ~(m_COREI7 | m_BDVER | m_GENERIC))
+DEF_TUNE (X86_TUNE_AVX256_UNALIGNED_STORE_OPTIMAL, "256_unaligned_store_optimal",
+          ~(m_COREI7 | m_COREI7_AVX | m_BDVER | m_GENERIC))