Patchwork [i386,google] With -mtune=core2, avoid generating the slow unaligned vector load/store (issue5488054)

login
register
mail settings
Submitter Sriraman Tallam
Date Dec. 13, 2011, 2:05 a.m.
Message ID <20111213020557.EA8F1B21AC@azwildcat.mtv.corp.google.com>
Download mbox | patch
Permalink /patch/130969/
State New
Headers show

Comments

Sriraman Tallam - Dec. 13, 2011, 2:05 a.m.
On core2, unaligned vector load/store using movdqu is a very slow operation.
Experiments show it is six times slower than movdqa (aligned) and this is
irrespective of whether the resulting data happens to be aligned or not. 
For Corei7, there is no performance difference between the two and on AMDs,
movdqu is only about 10% slower.  

This patch does not vectorize loops that need to generate the slow unaligned
memory load/stores on core2.


	Do not vectorize loops on Core2 that need to use unaligned
	vector load/stores.
	* tree-vect-stmts.c (is_slow_vect_unaligned_load_store): New function.
	(vect_analyze_stmt): Check if the vectorizable load/store is slow.
	* target.def (TARGET_SLOW_UNALIGNED_VECTOR_MEMOP): New target hook.
	* doc/tm.texi.in: Document new target hook:
	TARGET_SLOW_UNALIGNED_VECTOR_MEMOP.
	* doc/tm.texi: Regenerate.
	* config/i386/i386.c (ix86_slow_unaligned_vector_memop): New function.
	(TARGET_SLOW_UNALIGNED_VECTOR_MEMOP): New macro.

 
+#undef TARGET_SLOW_UNALIGNED_VECTOR_MEMOP
+#define TARGET_SLOW_UNALIGNED_VECTOR_MEMOP ix86_slow_unaligned_vector_memop
+
 #undef TARGET_ENUM_VA_LIST_P
 #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
 

--
This patch is available for review at http://codereview.appspot.com/5488054
Ira Rosen - Dec. 13, 2011, 5:54 a.m.
gcc-patches-owner@gcc.gnu.org wrote on 13/12/2011 04:05:57 AM:

> On core2, unaligned vector load/store using movdqu is a very slow
operation.
> Experiments show it is six times slower than movdqa (aligned) and this is
> irrespective of whether the resulting data happens to be aligned or not.
> For Corei7, there is no performance difference between the two and on
AMDs,
> movdqu is only about 10% slower.
>
> This patch does not vectorize loops that need to generate the slow
unaligned
> memory load/stores on core2.
>
>
>    Do not vectorize loops on Core2 that need to use unaligned
>    vector load/stores.
>    * tree-vect-stmts.c (is_slow_vect_unaligned_load_store): New function.
>    (vect_analyze_stmt): Check if the vectorizable load/store is slow.
>    * target.def (TARGET_SLOW_UNALIGNED_VECTOR_MEMOP): New target hook.
>    * doc/m.texi.in: Document new target hook:
>    TARGET_SLOW_UNALIGNED_VECTOR_MEMOP
>    * doc/m.texi: Regenerate.
>    * config/i386/i386.c (ix86_slow_unaligned_vector_memop): New function.
>    (TARGET_SLOW_UNALIGNED_VECTOR_MEMOP): New macro.
>


> @@ -5065,27 +5112,43 @@ vect_analyze_stmt (gimple stmt, bool
*need_to_vect
>     if (!bb_vinfo
>         && (STMT_VINFO_RELEVANT_P (stmt_info)
>             || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def))
> +    {
>        ok = (vectorizable_type_promotion (stmt, NULL, NULL, NULL)
>              || vectorizable_type_demotion (stmt, NULL, NULL, NULL)
>              || vectorizable_conversion (stmt, NULL, NULL, NULL)
>              || vectorizable_shift (stmt, NULL, NULL, NULL)
>              || vectorizable_operation (stmt, NULL, NULL, NULL)
>              || vectorizable_assignment (stmt, NULL, NULL, NULL)
> -            || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
>              || vectorizable_call (stmt, NULL, NULL)
> -            || vectorizable_store (stmt, NULL, NULL, NULL)
> -            || vectorizable_reduction (stmt, NULL, NULL, NULL)
> +       || vectorizable_reduction (stmt, NULL, NULL, NULL)
>              || vectorizable_condition (stmt, NULL, NULL, NULL, 0));
> +
> +      if (!ok)
> +   {
> +          ok = (vectorizable_load (stmt, NULL, NULL, NULL, NULL)
> +           || vectorizable_store (stmt, NULL, NULL, NULL));
> +
> +     if (ok && is_slow_vect_unaligned_load_store (stmt))
> +       ok = false;

Why not call is_slow_vect_unaligned_load_store from
vectorizable_load/store?

Ira


> +   }
> +    }
>      else
>        {
>          if (bb_vinfo)
> -          ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
> -                || vectorizable_type_demotion (stmt, NULL, NULL, node)
> -               || vectorizable_shift (stmt, NULL, NULL, node)
> -                || vectorizable_operation (stmt, NULL, NULL, node)
> -                || vectorizable_assignment (stmt, NULL, NULL, node)
> -                || vectorizable_load (stmt, NULL, NULL, node, NULL)
> -                || vectorizable_store (stmt, NULL, NULL, node));
> +     {
> +       ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
> +        || vectorizable_type_demotion (stmt, NULL, NULL, node)
> +        || vectorizable_shift (stmt, NULL, NULL, node)
> +                  || vectorizable_operation (stmt, NULL, NULL, node)
> +                  || vectorizable_assignment (stmt, NULL, NULL, node));
> +            if (!ok)
> +         {
> +                ok = (vectorizable_load (stmt, NULL, NULL, node, NULL)
> +            || vectorizable_store (stmt, NULL, NULL, node));
> +           if (ok && is_slow_vect_unaligned_load_store (stmt))
> +        ok = false;
> +         }
> +     }
>        }
Xinliang David Li - Dec. 13, 2011, 6:29 a.m.
> +/* Returns true if the vector load/store is unaligned and if
> +   unaligned vector load/stores are slow.  */

document STMT.

>
> +static bool
> +is_slow_vect_unaligned_load_store (gimple stmt)
> +{
> +  stmt_vec_info stmt_info;
> +  struct data_reference *dr = NULL;
> +
> +  /* Are unaligned load/stores slow for this target?  */
> +  if (!targetm.slow_unaligned_vector_memop
> +      || !targetm.slow_unaligned_vector_memop ())
> +    return false;
> +
> +  /* Harmful only if it is in a hot region of code when profiles are
> +     available.  */
> +  if (profile_status == PROFILE_READ
> +      && !maybe_hot_bb_p (gimple_bb (stmt)))
> +    return false;

Is this check necessary?


> +
> +  stmt_info = vinfo_for_stmt (stmt);
> +  if (!stmt_info)
> +    return false;
> +
> +  /* Check if access is aligned?.  */
> +  if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
> +    {
> +      gimple first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
> +      if (first_stmt
> +         && vinfo_for_stmt (first_stmt))
> +        dr =  STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
> +    }
> +  else
> +    {
> +      dr = STMT_VINFO_DATA_REF (stmt_info);
> +    }

Remove {}

> +
> +  if (!dr)
> +    return false;
> +
> +  if (!aligned_access_p (dr))
> +   {
> +     return true;
> +   }

Remove {}

> +
> +  return false;
> +}
> +
>  /* Make sure the statement is vectorizable.  */
>
>  bool
> @@ -5065,27 +5112,43 @@ vect_analyze_stmt (gimple stmt, bool *need_to_vect
>    if (!bb_vinfo
>        && (STMT_VINFO_RELEVANT_P (stmt_info)
>            || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def))
> +    {
>       ok = (vectorizable_type_promotion (stmt, NULL, NULL, NULL)
>             || vectorizable_type_demotion (stmt, NULL, NULL, NULL)
>             || vectorizable_conversion (stmt, NULL, NULL, NULL)
>             || vectorizable_shift (stmt, NULL, NULL, NULL)
>             || vectorizable_operation (stmt, NULL, NULL, NULL)
>             || vectorizable_assignment (stmt, NULL, NULL, NULL)
> -            || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
>             || vectorizable_call (stmt, NULL, NULL)
> -            || vectorizable_store (stmt, NULL, NULL, NULL)
> -            || vectorizable_reduction (stmt, NULL, NULL, NULL)
> +           || vectorizable_reduction (stmt, NULL, NULL, NULL)
>             || vectorizable_condition (stmt, NULL, NULL, NULL, 0));
> +
> +      if (!ok)
> +       {
> +          ok = (vectorizable_load (stmt, NULL, NULL, NULL, NULL)
> +               || vectorizable_store (stmt, NULL, NULL, NULL));
> +
> +         if (ok && is_slow_vect_unaligned_load_store (stmt))
> +           ok = false;
> +       }
> +    }
>     else
>       {
>         if (bb_vinfo)
> -          ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
> -                || vectorizable_type_demotion (stmt, NULL, NULL, node)
> -               || vectorizable_shift (stmt, NULL, NULL, node)
> -                || vectorizable_operation (stmt, NULL, NULL, node)
> -                || vectorizable_assignment (stmt, NULL, NULL, node)
> -                || vectorizable_load (stmt, NULL, NULL, node, NULL)
> -                || vectorizable_store (stmt, NULL, NULL, node));
> +         {
> +           ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
> +                 || vectorizable_type_demotion (stmt, NULL, NULL, node)
> +                 || vectorizable_shift (stmt, NULL, NULL, node)
> +                  || vectorizable_operation (stmt, NULL, NULL, node)
> +                  || vectorizable_assignment (stmt, NULL, NULL, node));
> +            if (!ok)
> +             {
> +                ok = (vectorizable_load (stmt, NULL, NULL, node, NULL)
> +                     || vectorizable_store (stmt, NULL, NULL, node));
> +               if (ok && is_slow_vect_unaligned_load_store (stmt))
> +                 ok = false;
> +             }
> +         }
>       }
>

Same question as Ira has asked -- why not doing the check in
vectorizable_load|store ?

David


>   if (!ok)
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c  (revision 182265)
> +++ config/i386/i386.c  (working copy)
> @@ -26464,6 +26464,24 @@ ix86_init_mmx_sse_builtins (void)
>     }
>  }
>
> +/* Detect if this unaligned vectorizable load/stores should be
> +   considered slow.  This is true for core2 where the movdqu insn
> +   is slow, ~5x slower than the movdqa.  */
> +
> +static bool
> +ix86_slow_unaligned_vector_memop (void)
> +{
> +  /* This is known to be slow on core2.  */
> +  if (ix86_tune == PROCESSOR_CORE2_64
> +      || ix86_tune == PROCESSOR_CORE2_32)
> +    return true;
> +
> +  return false;
> +}
> +
>  /* Internal method for ix86_init_builtins.  */
>
>  static void
> @@ -36624,6 +36642,9 @@ ix86_loop_unroll_adjust (unsigned nunroll, struct
>  #undef TARGET_BUILD_BUILTIN_VA_LIST
>  #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list
>
> +#undef TARGET_SLOW_UNALIGNED_VECTOR_MEMOP
> +#define TARGET_SLOW_UNALIGNED_VECTOR_MEMOP ix86_slow_unaligned_vector_memop
> +
>  #undef TARGET_ENUM_VA_LIST_P
>  #define TARGET_ENUM_VA_LIST_P ix86_enum_va_list
>
>
> --
> This patch is available for review at http://codereview.appspot.com/5488054
Jakub Jelinek - Dec. 13, 2011, 7:49 a.m.
On Mon, Dec 12, 2011 at 06:05:57PM -0800, Sriraman Tallam wrote:
> 	Do not vectorize loops on Core2 that need to use unaligned
> 	vector load/stores.
> 	* tree-vect-stmts.c (is_slow_vect_unaligned_load_store): New function.
> 	(vect_analyze_stmt): Check if the vectorizable load/store is slow.
> 	* target.def (TARGET_SLOW_UNALIGNED_VECTOR_MEMOP): New target hook.
> 	* doc/m.texi.in: Document new target hook:
> 	TARGET_SLOW_UNALIGNED_VECTOR_MEMOP
> 	* doc/m.texi: Regenerate.
> 	* config/i386/i386.c (ix86_slow_unaligned_vector_memop): New function.
> 	(TARGET_SLOW_UNALIGNED_VECTOR_MEMOP): New macro.

IMHO it would be better if it didn't prevent vectorization of the loops
altogether, but lead to using aligned stores with an alignment check
before the vectorized loop if possible.

Also, are unaligned loads equally expensive to unaligned stores?

See http://gcc.gnu.org/PR49442 for further info, this really should be done
using some cost model rather than a boolean hook.

	Jakub
Richard Henderson - Dec. 13, 2011, 5:58 p.m.
On 12/12/2011 06:05 PM, Sriraman Tallam wrote:
> On core2, unaligned vector load/store using movdqu is a very slow operation.
> Experiments show it is six times slower than movdqa (aligned) and this is
> irrespective of whether the resulting data happens to be aligned or not. 
> For Corei7, there is no performance difference between the two and on AMDs,
> movdqu is only about 10% slower.  
> 
> This patch does not vectorize loops that need to generate the slow unaligned
> memory load/stores on core2.

What happens if you temporarily disable

      /* ??? Similar to above, only less clear because of quote
         typeless stores unquote.  */
      if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
          && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
        {
          op0 = gen_lowpart (V16QImode, op0);
          op1 = gen_lowpart (V16QImode, op1);
          emit_insn (gen_sse2_movdqu (op0, op1));
          return;
        }

so that the unaligned store happens via movlps + movhps?


r~
Sriraman Tallam - Dec. 13, 2011, 6:26 p.m.
On Tue, Dec 13, 2011 at 9:58 AM, Richard Henderson <rth@redhat.com> wrote:
> On 12/12/2011 06:05 PM, Sriraman Tallam wrote:
>> On core2, unaligned vector load/store using movdqu is a very slow operation.
>> Experiments show it is six times slower than movdqa (aligned) and this is
>> irrespective of whether the resulting data happens to be aligned or not.
>> For Corei7, there is no performance difference between the two and on AMDs,
>> movdqu is only about 10% slower.
>>
>> This patch does not vectorize loops that need to generate the slow unaligned
>> memory load/stores on core2.
>
> What happens if you temporarily disable
>
>      /* ??? Similar to above, only less clear because of quote
>         typeless stores unquote.  */
>      if (TARGET_SSE2 && !TARGET_SSE_TYPELESS_STORES
>          && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
>        {
>          op0 = gen_lowpart (V16QImode, op0);
>          op1 = gen_lowpart (V16QImode, op1);
>          emit_insn (gen_sse2_movdqu (op0, op1));
>          return;
>        }
>
> so that the unaligned store happens via movlps + movhps?

Cool, this works for stores!  It generates the movlps + movhps. I have
to also make a similar change to another call to gen_sse2_movdqu for
loads. Would it be ok to not do this when tune=core2?

Thanks,
-Sri.


>
>
> r~
Sriraman Tallam - Dec. 13, 2011, 6:28 p.m.
On Mon, Dec 12, 2011 at 11:49 PM, Jakub Jelinek <jakub@redhat.com> wrote:
> On Mon, Dec 12, 2011 at 06:05:57PM -0800, Sriraman Tallam wrote:
>>       Do not vectorize loops on Core2 that need to use unaligned
>>       vector load/stores.
>>       * tree-vect-stmts.c (is_slow_vect_unaligned_load_store): New function.
>>       (vect_analyze_stmt): Check if the vectorizable load/store is slow.
>>       * target.def (TARGET_SLOW_UNALIGNED_VECTOR_MEMOP): New target hook.
>>       * doc/m.texi.in: Document new target hook:
>>       TARGET_SLOW_UNALIGNED_VECTOR_MEMOP
>>       * doc/m.texi: Regenerate.
>>       * config/i386/i386.c (ix86_slow_unaligned_vector_memop): New function.
>>       (TARGET_SLOW_UNALIGNED_VECTOR_MEMOP): New macro.
>
> IMHO it would be better if it didn't prevent vectorization of the loops
> altogether, but lead to using aligned stores with an alignment check
> before the vectorized loop if possible.
>
> Also, are unaligned loads equally expensive to unaligned stores?

Unaligned stores are always expensive, irrespective of whether the
data turns out to be aligned or not at run-time. Unaligned loads are
only as expensive when the data item is unaligned. For unaligned loads
of aligned data, the movdqu is still slow but by ~2x rather than 6x.

>
> See http://gcc.gnu.org/PR49442 for further info, this really should be done
> using some cost model rather than a boolean hook.
>
>        Jakub
Sriraman Tallam - Dec. 13, 2011, 6:29 p.m.
On Mon, Dec 12, 2011 at 9:54 PM, Ira Rosen <IRAR@il.ibm.com> wrote:
>
>
> gcc-patches-owner@gcc.gnu.org wrote on 13/12/2011 04:05:57 AM:
>
>> On core2, unaligned vector load/store using movdqu is a very slow
> operation.
>> Experiments show it is six times slower than movdqa (aligned) and this is
>> irrespective of whether the resulting data happens to be aligned or not.
>> For Corei7, there is no performance difference between the two and on
> AMDs,
>> movdqu is only about 10% slower.
>>
>> This patch does not vectorize loops that need to generate the slow
> unaligned
>> memory load/stores on core2.
>>
>>
>>    Do not vectorize loops on Core2 that need to use unaligned
>>    vector load/stores.
>>    * tree-vect-stmts.c (is_slow_vect_unaligned_load_store): New function.
>>    (vect_analyze_stmt): Check if the vectorizable load/store is slow.
>>    * target.def (TARGET_SLOW_UNALIGNED_VECTOR_MEMOP): New target hook.
>>    * doc/m.texi.in: Document new target hook:
>>    TARGET_SLOW_UNALIGNED_VECTOR_MEMOP
>>    * doc/m.texi: Regenerate.
>>    * config/i386/i386.c (ix86_slow_unaligned_vector_memop): New function.
>>    (TARGET_SLOW_UNALIGNED_VECTOR_MEMOP): New macro.
>>
>
>
>> @@ -5065,27 +5112,43 @@ vect_analyze_stmt (gimple stmt, bool
> *need_to_vect
>>     if (!bb_vinfo
>>         && (STMT_VINFO_RELEVANT_P (stmt_info)
>>             || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def))
>> +    {
>>        ok = (vectorizable_type_promotion (stmt, NULL, NULL, NULL)
>>              || vectorizable_type_demotion (stmt, NULL, NULL, NULL)
>>              || vectorizable_conversion (stmt, NULL, NULL, NULL)
>>              || vectorizable_shift (stmt, NULL, NULL, NULL)
>>              || vectorizable_operation (stmt, NULL, NULL, NULL)
>>              || vectorizable_assignment (stmt, NULL, NULL, NULL)
>> -            || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
>>              || vectorizable_call (stmt, NULL, NULL)
>> -            || vectorizable_store (stmt, NULL, NULL, NULL)
>> -            || vectorizable_reduction (stmt, NULL, NULL, NULL)
>> +       || vectorizable_reduction (stmt, NULL, NULL, NULL)
>>              || vectorizable_condition (stmt, NULL, NULL, NULL, 0));
>> +
>> +      if (!ok)
>> +   {
>> +          ok = (vectorizable_load (stmt, NULL, NULL, NULL, NULL)
>> +           || vectorizable_store (stmt, NULL, NULL, NULL));
>> +
>> +     if (ok && is_slow_vect_unaligned_load_store (stmt))
>> +       ok = false;
>
> Why not call is_slow_vect_unaligned_load_store from
> vectorizable_load/store?

Yes, I should have done that. Somehow, I missed that!

>
> Ira
>
>
>> +   }
>> +    }
>>      else
>>        {
>>          if (bb_vinfo)
>> -          ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
>> -                || vectorizable_type_demotion (stmt, NULL, NULL, node)
>> -               || vectorizable_shift (stmt, NULL, NULL, node)
>> -                || vectorizable_operation (stmt, NULL, NULL, node)
>> -                || vectorizable_assignment (stmt, NULL, NULL, node)
>> -                || vectorizable_load (stmt, NULL, NULL, node, NULL)
>> -                || vectorizable_store (stmt, NULL, NULL, node));
>> +     {
>> +       ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
>> +        || vectorizable_type_demotion (stmt, NULL, NULL, node)
>> +        || vectorizable_shift (stmt, NULL, NULL, node)
>> +                  || vectorizable_operation (stmt, NULL, NULL, node)
>> +                  || vectorizable_assignment (stmt, NULL, NULL, node));
>> +            if (!ok)
>> +         {
>> +                ok = (vectorizable_load (stmt, NULL, NULL, node, NULL)
>> +            || vectorizable_store (stmt, NULL, NULL, node));
>> +           if (ok && is_slow_vect_unaligned_load_store (stmt))
>> +        ok = false;
>> +         }
>> +     }
>>        }
>
Richard Henderson - Dec. 13, 2011, 6:56 p.m.
On 12/13/2011 10:26 AM, Sriraman Tallam wrote:
> Cool, this works for stores!  It generates the movlps + movhps. I have
> to also make a similar change to another call to gen_sse2_movdqu for
> loads. Would it be ok to not do this when tune=core2?

We can work something out.  

I'd like you to do the benchmarking to know if unaligned loads are really as expensive as unaligned stores, and whether there are reformatting penalties that make the movlps+movhps option for either load or store less attractive.


r~
Sriraman Tallam - Dec. 13, 2011, 7:14 p.m.
On Tue, Dec 13, 2011 at 10:56 AM, Richard Henderson <rth@redhat.com> wrote:
> On 12/13/2011 10:26 AM, Sriraman Tallam wrote:
>> Cool, this works for stores!  It generates the movlps + movhps. I have
>> to also make a similar change to another call to gen_sse2_movdqu for
>> loads. Would it be ok to not do this when tune=core2?
>
> We can work something out.
>
> I'd like you to do the benchmarking to know if unaligned loads are really as expensive as unaligned stores, and whether there are reformatting penalties that make the movlps+movhps option for either load or store less attractive.

Ok thanks, I will get back with the data.



>
>
> r~
Xinliang David Li - Dec. 13, 2011, 8:16 p.m.
See instruction tables here:
http://www.agner.org/optimize/instruction_tables.pdf

My brief reading of the table for core2 and corei7 suggest the following:

1. On core2

movdqu -- both load and store forms take up to 8 cycles to complete,
and store form produces 8 uops while load produces 4 uops

movsd load:  1 uop, 2 cycle latency
movsd store: 1 uop, 3 cycle latency

movhpd, movlpd load: 2 uops, 3 cycle latency
movhpd store: 2 uops, 5 cycle latency
movlpd store: 1uop, 3 cycle latency


2. Core i7

movdqu load: 1 uop, 2 cycle latency
movdqu store: 1 uop, 3 cycle latency

movsd load: 1 uop, 2 cycle latency
movsd store: 1uop, 3 cycle latency

movhpd, movlpd load: 2 uop, 3 cycle latency
movhpd, movlpd store: 2 uops, 5 cycle latency


From the above, it looks like Sri's original simple heuristic should work fine

1) for corei7, if the load and stores can not be proved to be 128 bit
aligned, always use movdqu

2) for core2,  experiment can be done to determine whether to look at
unaligned stores or both unaligned loads to disable vectorization.

Yes, for longer term, a more precise cost model is probably needed --
but require lots of work which may not work a lot better in practice.

What is more important is to beef up gcc infrastructure to allow more
aggressive alignment (info) propagation.

In 4.4, gcc does alignment (output array) based versioning -- Sri's
patch has the effect of doing the same thing but only for selected
targets.

thanks,

David

On Tue, Dec 13, 2011 at 10:56 AM, Richard Henderson <rth@redhat.com> wrote:
> On 12/13/2011 10:26 AM, Sriraman Tallam wrote:
>> Cool, this works for stores!  It generates the movlps + movhps. I have
>> to also make a similar change to another call to gen_sse2_movdqu for
>> loads. Would it be ok to not do this when tune=core2?
>
> We can work something out.
>
> I'd like you to do the benchmarking to know if unaligned loads are really as expensive as unaligned stores, and whether there are reformatting penalties that make the movlps+movhps option for either load or store less attractive.
>
>
> r~
Sriraman Tallam - Dec. 13, 2011, 8:29 p.m.
On Tue, Dec 13, 2011 at 10:56 AM, Richard Henderson <rth@redhat.com> wrote:
> On 12/13/2011 10:26 AM, Sriraman Tallam wrote:
>> Cool, this works for stores!  It generates the movlps + movhps. I have
>> to also make a similar change to another call to gen_sse2_movdqu for
>> loads. Would it be ok to not do this when tune=core2?
>
> We can work something out.
>
> I'd like you to do the benchmarking to know if unaligned loads are really as expensive as unaligned stores, and whether there are reformatting penalties that make the movlps+movhps option for either load or store less attractive.

I can confirm that movhps+movlps is *not at all* a good substitute for
movdqu on core2. It makes it much worse. MOVHPS/MOVLPS has a very high
penalty (~10x) for unaligned load/stores.

>
>
> r~
Sriraman Tallam - Dec. 13, 2011, 11:50 p.m.
I updated the patch to add the checks in vectorizable_load and
vectorizable_store itself.

Thanks,
-Sri.

On Tue, Dec 13, 2011 at 12:16 PM, Xinliang David Li <davidxl@google.com> wrote:
> See instruction tables here:
> http://www.agner.org/optimize/instruction_tables.pdf
>
> My brief reading of the table for core2 and corei7 suggest the following:
>
> 1. On core2
>
> movdqu -- both load and store forms take up to 8 cycles to complete,
> and store form produces 8 uops while load produces 4 uops
>
> movsd load:  1 uop, 2 cycle latency
> movsd store: 1 uop, 3 cycle latency
>
> movhpd, movlpd load: 2 uops, 3 cycle latency
> movhpd store: 2 uops, 5 cycle latency
> movlpd store: 1uop, 3 cycle latency
>
>
> 2. Core i7
>
> movdqu load: 1 uop, 2 cycle latency
> movdqu store: 1 uop, 3 cycle latency
>
> movsd load: 1 uop, 2 cycle latency
> movsd store: 1uop, 3 cycle latency
>
> movhpd, movlpd load: 2 uop, 3 cycle latency
> movhpd, movlpd sotre: 2 uop, 5 cycle latency
>
>
> From the above, looks like a Sri's original simple heuristic should work fine
>
> 1) for corei7, if the load and stores can not be proved to be 128 bit
> aligned, always use movdqu
>
> 2) for core2,  experiment can be done to determine whether to look at
> unaligned stores or both unaligned loads to disable vectorization.
>
> Yes, for longer term, a more precise cost model is probably needed --
> but require lots of work which may not work a lot better in practice.
>
> What is more important is to beef up gcc infrastructure to allow more
> aggressive alignment (info) propagation.
>
> In 4.4, gcc does alignment (output array) based versioning -- Sri's
> patch has the effect of doing the samething but only for selected
> targets.
>
> thanks,
>
> David
>
> On Tue, Dec 13, 2011 at 10:56 AM, Richard Henderson <rth@redhat.com> wrote:
>> On 12/13/2011 10:26 AM, Sriraman Tallam wrote:
>>> Cool, this works for stores!  It generates the movlps + movhps. I have
>>> to also make a similar change to another call to gen_sse2_movdqu for
>>> loads. Would it be ok to not do this when tune=core2?
>>
>> We can work something out.
>>
>> I'd like you to do the benchmarking to know if unaligned loads are really as expensive as unaligned stores, and whether there are reformatting penalties that make the movlps+movhps option for either load or store less attractive.
>>
>>
>> r~

Patch

Index: doc/tm.texi
===================================================================
--- doc/tm.texi	(revision 182265)
+++ doc/tm.texi	(working copy)
@@ -10984,6 +10984,11 @@  The result is another tree containing a simplified
 call's result.  If @var{ignore} is true the value will be ignored.
 @end deftypefn
 
+@deftypefn {Target Hook} bool TARGET_SLOW_UNALIGNED_VECTOR_MEMOP (void)
+Return true if unaligned vector memory load/store is a slow operation
+on this target.
+@end deftypefn
+
 @deftypefn {Target Hook} {const char *} TARGET_INVALID_WITHIN_DOLOOP (const_rtx @var{insn})
 
 Take an instruction in @var{insn} and return NULL if it is valid within a
Index: doc/tm.texi.in
===================================================================
--- doc/tm.texi.in	(revision 182265)
+++ doc/tm.texi.in	(working copy)
@@ -10875,6 +10875,11 @@  The result is another tree containing a simplified
 call's result.  If @var{ignore} is true the value will be ignored.
 @end deftypefn
 
+@hook TARGET_SLOW_UNALIGNED_VECTOR_MEMOP
+Return true if unaligned vector memory load/store is a slow operation
+on this target.
+@end deftypefn
+
 @hook TARGET_INVALID_WITHIN_DOLOOP
 
 Take an instruction in @var{insn} and return NULL if it is valid within a
Index: target.def
===================================================================
--- target.def	(revision 182265)
+++ target.def	(working copy)
@@ -1221,6 +1221,12 @@  DEFHOOK
  tree, (tree fndecl, int n_args, tree *argp, bool ignore),
  hook_tree_tree_int_treep_bool_null)
 
+/* Returns true if unaligned vector loads/stores are slow.  */
+DEFHOOK
+(slow_unaligned_vector_memop,
+ "",
+ bool, (void), NULL)
+
 /* Returns a code for a target-specific builtin that implements
    reciprocal of the function, or NULL_TREE if not available.  */
 DEFHOOK
Index: tree-vect-stmts.c
===================================================================
--- tree-vect-stmts.c	(revision 182265)
+++ tree-vect-stmts.c	(working copy)
@@ -4905,7 +4905,54 @@  vectorizable_condition (gimple stmt, gimple_stmt_i
   return true;
 }
 
+/* Returns true if STMT is a vector load/store whose data reference is
+   unaligned and unaligned vector load/stores are slow on this target.  */
 
+static bool
+is_slow_vect_unaligned_load_store (gimple stmt)
+{
+  stmt_vec_info stmt_info;
+  struct data_reference *dr = NULL;
+
+  /* Are unaligned load/stores slow for this target?  */
+  if (!targetm.slow_unaligned_vector_memop
+      || !targetm.slow_unaligned_vector_memop ())
+    return false;
+
+  /* Harmful only if it is in a hot region of code when profiles are
+     available.  */
+  if (profile_status == PROFILE_READ
+      && !maybe_hot_bb_p (gimple_bb (stmt)))
+    return false;
+  
+  stmt_info = vinfo_for_stmt (stmt);
+  if (!stmt_info)
+    return false;
+ 
+  /* Check if the access is aligned.  */
+  if (STMT_VINFO_STRIDED_ACCESS (stmt_info))
+    {
+      gimple first_stmt = GROUP_FIRST_ELEMENT (stmt_info);
+      if (first_stmt
+	  && vinfo_for_stmt (first_stmt))
+        dr =  STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
+    }
+  else
+    {
+      dr = STMT_VINFO_DATA_REF (stmt_info);
+    }
+ 
+  if (!dr)
+    return false;
+ 
+  if (!aligned_access_p (dr))
+   {
+     return true;
+   }
+    
+  return false;  
+}
+
 /* Make sure the statement is vectorizable.  */
 
 bool
@@ -5065,27 +5112,43 @@  vect_analyze_stmt (gimple stmt, bool *need_to_vect
    if (!bb_vinfo
        && (STMT_VINFO_RELEVANT_P (stmt_info)
            || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def))
+    {
       ok = (vectorizable_type_promotion (stmt, NULL, NULL, NULL)
             || vectorizable_type_demotion (stmt, NULL, NULL, NULL)
             || vectorizable_conversion (stmt, NULL, NULL, NULL)
             || vectorizable_shift (stmt, NULL, NULL, NULL)
             || vectorizable_operation (stmt, NULL, NULL, NULL)
             || vectorizable_assignment (stmt, NULL, NULL, NULL)
-            || vectorizable_load (stmt, NULL, NULL, NULL, NULL)
             || vectorizable_call (stmt, NULL, NULL)
-            || vectorizable_store (stmt, NULL, NULL, NULL)
-            || vectorizable_reduction (stmt, NULL, NULL, NULL)
+	    || vectorizable_reduction (stmt, NULL, NULL, NULL)
             || vectorizable_condition (stmt, NULL, NULL, NULL, 0));
+
+      if (!ok)
+	{
+          ok = (vectorizable_load (stmt, NULL, NULL, NULL, NULL)
+	        || vectorizable_store (stmt, NULL, NULL, NULL));
+
+	  if (ok && is_slow_vect_unaligned_load_store (stmt))
+	    ok = false;
+	}
+    }
     else
       {
         if (bb_vinfo)
-          ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
-                || vectorizable_type_demotion (stmt, NULL, NULL, node)
-               || vectorizable_shift (stmt, NULL, NULL, node)
-                || vectorizable_operation (stmt, NULL, NULL, node)
-                || vectorizable_assignment (stmt, NULL, NULL, node)
-                || vectorizable_load (stmt, NULL, NULL, node, NULL)
-                || vectorizable_store (stmt, NULL, NULL, node));
+	  {
+	    ok = (vectorizable_type_promotion (stmt, NULL, NULL, node)
+		  || vectorizable_type_demotion (stmt, NULL, NULL, node)
+		  || vectorizable_shift (stmt, NULL, NULL, node)
+                  || vectorizable_operation (stmt, NULL, NULL, node)
+                  || vectorizable_assignment (stmt, NULL, NULL, node));
+            if (!ok) 
+	      {
+                ok = (vectorizable_load (stmt, NULL, NULL, node, NULL)
+		      || vectorizable_store (stmt, NULL, NULL, node));
+	        if (ok && is_slow_vect_unaligned_load_store (stmt))
+		  ok = false;
+	      }
+	  }
       }
 
   if (!ok)
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 182265)
+++ config/i386/i386.c	(working copy)
@@ -26464,6 +26464,24 @@  ix86_init_mmx_sse_builtins (void)
     }
 }
 
+/* Detect whether unaligned vectorizable loads/stores should be
+   considered slow.  This is true for core2, where the movdqu insn
+   is slow, ~5x slower than movdqa.  */
+
+static bool
+ix86_slow_unaligned_vector_memop (void)
+{
+  /* This is known to be slow on core2.  */
+  if (ix86_tune == PROCESSOR_CORE2_64
+      || ix86_tune == PROCESSOR_CORE2_32)
+    return true;
+
+  return false;
+}
+
 /* Internal method for ix86_init_builtins.  */
 
 static void
@@ -36624,6 +36642,9 @@  ix86_loop_unroll_adjust (unsigned nunroll, struct
 #undef TARGET_BUILD_BUILTIN_VA_LIST
 #define TARGET_BUILD_BUILTIN_VA_LIST ix86_build_builtin_va_list