
[x86] merge movsd/movhpd pair in peephole

Message ID CA+4CFy4oGjcv36T3j_pmAe9zEMdZCDG7UTez1p+=Yz2eiKfQPw@mail.gmail.com
State New

Commit Message

Wei Mi April 10, 2014, 12:18 a.m. UTC
Hi,

For the testcase 1.c

#include <emmintrin.h>

double a[1000];

__m128d foo1() {
  __m128d res;
  res = _mm_load_sd(&a[1]);
  res = _mm_loadh_pd(res, &a[2]);
  return res;
}

LLVM merges the movsd/movhpd pair into a single movupd, while GCC does
not. The merge is beneficial on x86 machines starting from Nehalem.
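
For foo1, the two partial loads and the single unaligned load that could
replace them look roughly as follows (an illustrative AT&T-syntax sketch
for non-PIC x86-64; a[1] sits at offset 8 and a[2] at offset 16, so the
two accesses are adjacent):

        # before: two partial loads into %xmm0
        movsd   a+8(%rip), %xmm0      # low half  <- a[1]
        movhpd  a+16(%rip), %xmm0     # high half <- a[2]
        # after the peephole2: one unaligned 16-byte load
        movupd  a+8(%rip), %xmm0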

The patch adds the merge as a peephole2 optimization. Bootstrap and
regression tests pass. Is it OK for stage1?

Thanks,
Wei.

gcc/ChangeLog:

2014-04-09  Wei Mi  <wmi@google.com>

        * config/i386/i386.c (get_memref_parts): New function.
        (adjacent_mem_locations): Ditto.
        * config/i386/i386-protos.h: Add decl for adjacent_mem_locations.
        * config/i386/sse.md: Add define_peephole2 rules.

gcc/testsuite/ChangeLog:

2014-04-09  Wei Mi  <wmi@google.com>

        * gcc.target/i386/sse2-unaligned-mov.c: New test.

Comments

Bin.Cheng April 10, 2014, 2:27 a.m. UTC | #1

Hi Wei,
Just FYI, though I am not sure whether it can help.  I am working on
load/store pairing on ARM too.  On ARM (and maybe other machines like
MIPS), the problem is more general because we can pair memory accesses
involving both core registers and FPU registers.  The current
strategy taken by GCC is to use peephole2, but it's too weak: for
example, it can't handle pairs that are separated by other
instructions.  Right now I am working on a new pass in GCC to do that
work, and have already worked out a draft patch.  The instrumentation
data looks promising, with many opportunities captured beyond the
existing peephole2 pass.  Furthermore, the result could be even better
if register renaming is enabled.  Hopefully I will bring back
register renaming for this purpose later.
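
A minimal sketch of such an intervened pair (an invented example, not
from either patch): the two-instruction peephole2 window above only
matches when the loads are back to back, so a sequence like

        movsd   a+8(%rip), %xmm0
        addsd   %xmm3, %xmm2          # unrelated insn scheduled between the pair
        movhpd  a+16(%rip), %xmm0

is left alone even though the two loads still touch adjacent memory.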

Thanks,
bin
Wei Mi April 10, 2014, 4:07 a.m. UTC | #2
Hi Bin,

Yes, we have the same problem: if movsd and movhpd are separated,
the peephole cannot merge them. The patch could solve the motivating
performance issue we saw to a good extent, but there is still room
for improvement where the peephole misses some pairs. Glad to know you
are working on this; it is the same thing we want. Looking forward to
your patch.

Thanks,
Wei.

Wei Mi April 21, 2014, 6 p.m. UTC | #3
Ping.

Thanks,
Wei.

Uros Bizjak April 21, 2014, 6:39 p.m. UTC | #4
On Mon, Apr 21, 2014 at 8:00 PM, Wei Mi <wmi@google.com> wrote:

>> LLVM merges the movsd/movhpd pair into a single movupd, while GCC does
>> not. The merge is beneficial on x86 machines starting from Nehalem.
>>
>> The patch adds the merge as a peephole2 optimization. Bootstrap and
>> regression tests pass. Is it OK for stage1?

Let's wait for the generic pass proposed by Bin. I think that such a
pass will render the peephole2 approach obsolete.

Uros.
Xinliang David Li April 21, 2014, 6:59 p.m. UTC | #5
Bin, when will the patch for the generic pass be available for review?

David

Bin.Cheng April 22, 2014, 9:48 a.m. UTC | #6
On Tue, Apr 22, 2014 at 2:59 AM, Xinliang David Li <davidxl@google.com> wrote:
> Bin, when will the patch for the generic pass be available for review?
>
Hi,
The patch is still being worked on and reviewed.  For ARM we only need
to handle simple loads/stores, so it may need to be extended to handle
generic memory access instructions on x86.
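
For example (an illustrative case, not from the patch), on x86 a memory
access need not be a standalone load; it can be folded into another
instruction:

        addsd   a+8(%rip), %xmm0      # memory operand inside an arithmetic insn

so a pass that only scans plain load/store insns would miss such accesses.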

Thanks,
bin

Patch

diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index 6e32978..3ae0d6d 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -312,6 +312,7 @@  extern enum attr_cpu ix86_schedule;
 #endif

 extern const char * ix86_output_call_insn (rtx insn, rtx call_op);
+extern bool adjacent_mem_locations (rtx mem1, rtx mem2);

 #ifdef RTX_CODE
 /* Target data for multipass lookahead scheduling.
diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 3eefe4a..a330e84 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -46737,6 +46737,70 @@ ix86_atomic_assign_expand_fenv (tree *hold, tree *clear, tree *update)
                    atomic_feraiseexcept_call);
 }

+/* Try to determine BASE/OFFSET/SIZE parts of the given MEM.
+   Return true if successful, false if all the values couldn't
+   be determined.
+
+   This function only looks for REG/SYMBOL or REG/SYMBOL+CONST
+   address forms. */
+
+static bool
+get_memref_parts (rtx mem, rtx *base, HOST_WIDE_INT *offset,
+                 HOST_WIDE_INT *size)
+{
+  rtx addr_rtx;
+  if (MEM_SIZE_KNOWN_P (mem))
+    *size = MEM_SIZE (mem);
+  else
+    return false;
+
+  if (GET_CODE (XEXP (mem, 0)) == CONST)
+    addr_rtx = XEXP (XEXP (mem, 0), 0);
+  else
+    addr_rtx = (XEXP (mem, 0));
+
+  if (GET_CODE (addr_rtx) == REG
+      || GET_CODE (addr_rtx) == SYMBOL_REF)
+    {
+      *base = addr_rtx;
+      *offset = 0;
+    }
+  else if (GET_CODE (addr_rtx) == PLUS
+          && CONST_INT_P (XEXP (addr_rtx, 1)))
+    {
+      *base = XEXP (addr_rtx, 0);
+      *offset = INTVAL (XEXP (addr_rtx, 1));
+    }
+  else
+    return false;
+
+  return true;
+}
+
+/* If MEM1 is adjacent to MEM2 and MEM1 has lower address,
+   return true.  */
+
+extern bool
+adjacent_mem_locations (rtx mem1, rtx mem2)
+{
+  rtx base1, base2;
+  HOST_WIDE_INT off1, size1, off2, size2;
+
+  if (get_memref_parts (mem1, &base1, &off1, &size1)
+      && get_memref_parts (mem2, &base2, &off2, &size2))
+    {
+      if (GET_CODE (base1) == SYMBOL_REF
+         && GET_CODE (base2) == SYMBOL_REF
+         && SYMBOL_REF_DECL (base1) == SYMBOL_REF_DECL (base2))
+        return (off1 + size1 == off2);
+      else if (REG_P (base1)
+              && REG_P (base2)
+              && REGNO (base1) == REGNO (base2))
+        return (off1 + size1 == off2);
+    }
+  return false;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_RETURN_IN_MEMORY
 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 72a4d6d..4bf8461 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -15606,3 +15606,37 @@ 
   [(set_attr "type" "sselog1")
    (set_attr "length_immediate" "1")
    (set_attr "mode" "TI")])
+
+;; merge movsd/movhpd to movupd when TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+;; is true.
+(define_peephole2
+  [(set (match_operand:DF 0 "register_operand")
+       (match_operand:DF 1 "memory_operand"))
+   (set (match_operand:V2DF 2 "register_operand")
+       (vec_concat:V2DF (match_dup 0)
+        (match_operand:DF 3 "memory_operand")))]
+  "TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+   && REGNO (operands[0]) == REGNO (operands[2])
+   && adjacent_mem_locations (operands[1], operands[3])"
+  [(set (match_dup 2)
+       (unspec:V2DF [(match_dup 4)] UNSPEC_LOADU))]
+{
+  operands[4] = gen_rtx_MEM (V2DFmode, XEXP(operands[1], 0));
+})
+
+;; merge movsd/movhpd to movupd when TARGET_SSE_UNALIGNED_STORE_OPTIMAL
+;; is true.
+(define_peephole2
+  [(set (match_operand:DF 0 "memory_operand")
+        (vec_select:DF (match_operand:V2DF 1 "register_operand")
+                      (parallel [(const_int 0)])))
+   (set (match_operand:DF 2 "memory_operand")
+        (vec_select:DF (match_dup 1)
+                       (parallel [(const_int 1)])))]
+  "TARGET_SSE_UNALIGNED_STORE_OPTIMAL
+   && adjacent_mem_locations (operands[0], operands[2])"
+  [(set (match_dup 3)
+        (unspec:V2DF [(match_dup 1)] UNSPEC_STOREU))]
+{
+  operands[3] = gen_rtx_MEM (V2DFmode, XEXP(operands[0], 0));
+})
diff --git a/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c b/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
new file mode 100644
index 0000000..28470ce
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse2-unaligned-mov.c
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-mtune=corei7 -O2" } */
+
+#include <emmintrin.h>
+
+double a[1000];
+
+__m128d foo1() {
+  __m128d res;
+  res = _mm_load_sd(&a[1]);
+  res = _mm_loadh_pd(res, &a[2]);
+  return res;
+}
+
+void foo2(__m128d res) {
+  _mm_store_sd(&a[1], res);
+  _mm_storeh_pd(&a[2], res);
+}
+
+/* { dg-final { scan-assembler-times "movup" 2 } } */
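
With the peephole2 patterns applied, foo1 and foo2 above are expected to
compile to roughly the following (an illustrative non-PIC x86-64 sketch;
the test only checks for two occurrences of "movup"):

foo1:
        movupd  a+8(%rip), %xmm0
        ret
foo2:
        movupd  %xmm0, a+8(%rip)
        ret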