[i386]: Merge movlpd/movsd with movhpd to form movupd

Message ID CAFULd4ZFR9LiRSb9g7npKsL71pb3hBy7oE3FOSqUeF_j3j_4Jg@mail.gmail.com

Commit Message

Uros Bizjak April 24, 2015, 11:19 a.m. UTC
Hello!

This patch revives the old patch from Wei that merges movlpd/movsd
with movhpd into a single movupd. As the patch shows, only
instructions with simple memory references are merged, which should
IMO cover all the interesting cases (please see the included
testcases).
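
For illustration, here is a rough sketch of the load-side
transformation (the register and addresses are only an example, not
the exact compiler output):

    movsd   a+8(%rip), %xmm0     # load a[1] into the low half, zero the high half
    movhpd  a+16(%rip), %xmm0    # load a[2] into the high half

becomes a single unaligned load:

    movupd  a+8(%rip), %xmm0     # load a[1] and a[2] in one instruction

The store side is analogous: a movlpd/movhpd pair that writes both
halves of the same register to adjacent memory slots, with the low
half at the lower address, becomes one movupd store.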

I have played a bit with the load/store fusion infrastructure, but at
the end of the day decided that it isn't worth handling a relatively
rare transformation with such a costly approach.

2015-04-24  Uros Bizjak  <ubizjak@gmail.com>
        Wei Mi  <wmi@google.com>

    * config/i386/i386-protos.h (ix86_operands_ok_for_move_multiple): New.
    * config/i386/i386.c (extract_base_offset_in_addr): New function.
    (ix86_operands_ok_for_move_multiple): Ditto.
    * config/i386/sse.md (movsd/movhpd to movupd peephole2): New pattern.
    (movlpd/movhpd to movupd peephole2): Ditto.

testsuite/ChangeLog:

2015-04-24  Uros Bizjak  <ubizjak@gmail.com>
        Wei Mi  <wmi@google.com>

    * gcc.target/i386/sse2-load-multi.c: New test.
    * gcc.target/i386/sse2-store-multi.c: Ditto.

Tested on x86_64-linux-gnu {,-m32}. I will commit it to mainline SVN soon.

Uros.

Patch

Index: config/i386/i386-protos.h
===================================================================
--- config/i386/i386-protos.h	(revision 222384)
+++ config/i386/i386-protos.h	(working copy)
@@ -304,6 +304,8 @@ 
 #endif
 
 extern const char * ix86_output_call_insn (rtx_insn *insn, rtx call_op);
+extern bool ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
+						enum machine_mode mode);
 
 #ifdef RTX_CODE
 /* Target data for multipass lookahead scheduling.
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 222384)
+++ config/i386/i386.c	(working copy)
@@ -51726,6 +51726,92 @@ 
 }
 #endif
 
+/* If MEM is in the form of [base+offset], extract the two parts
+   of the address into BASE and OFFSET; otherwise return false.  */
+
+static bool
+extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
+{
+  rtx addr;
+
+  gcc_assert (MEM_P (mem));
+
+  addr = XEXP (mem, 0);
+
+  if (GET_CODE (addr) == CONST)
+    addr = XEXP (addr, 0);
+
+  if (REG_P (addr) || GET_CODE (addr) == SYMBOL_REF)
+    {
+      *base = addr;
+      *offset = const0_rtx;
+      return true;
+    }
+
+  if (GET_CODE (addr) == PLUS
+      && (REG_P (XEXP (addr, 0))
+	  || GET_CODE (XEXP (addr, 0)) == SYMBOL_REF)
+      && CONST_INT_P (XEXP (addr, 1)))
+    {
+      *base = XEXP (addr, 0);
+      *offset = XEXP (addr, 1);
+      return true;
+    }
+
+  return false;
+}
+
+/* Given OPERANDS of consecutive load/store instructions, check if we
+   can merge them into one move-multiple.  LOAD is true if they are
+   load instructions.  MODE is the mode of the memory operands.  */
+
+bool
+ix86_operands_ok_for_move_multiple (rtx *operands, bool load,
+				    enum machine_mode mode)
+{
+  HOST_WIDE_INT offval_1, offval_2, msize;
+  rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
+
+  if (load)
+    {
+      mem_1 = operands[1];
+      mem_2 = operands[3];
+      reg_1 = operands[0];
+      reg_2 = operands[2];
+    }
+  else
+    {
+      mem_1 = operands[0];
+      mem_2 = operands[2];
+      reg_1 = operands[1];
+      reg_2 = operands[3];
+    }
+
+  gcc_assert (REG_P (reg_1) && REG_P (reg_2));
+
+  if (REGNO (reg_1) != REGNO (reg_2))
+    return false;
+
+  /* Check if the addresses are in the form of [base+offset].  */
+  if (!extract_base_offset_in_addr (mem_1, &base_1, &offset_1))
+    return false;
+  if (!extract_base_offset_in_addr (mem_2, &base_2, &offset_2))
+    return false;
+
+  /* Check if the bases are the same.  */
+  if (!rtx_equal_p (base_1, base_2))
+    return false;
+
+  offval_1 = INTVAL (offset_1);
+  offval_2 = INTVAL (offset_2);
+  msize = GET_MODE_SIZE (mode);
+  /* Check if mem_1 is adjacent to mem_2 and mem_1 has the lower address.  */
+  if (offval_1 + msize != offval_2)
+    return false;
+
+  return true;
+}
+
 /* Initialize the GCC target structure.  */
 #undef TARGET_RETURN_IN_MEMORY
 #define TARGET_RETURN_IN_MEMORY ix86_return_in_memory
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md	(revision 222384)
+++ config/i386/sse.md	(working copy)
@@ -1183,6 +1183,21 @@ 
 	      ]
 	      (const_string "<MODE>")))])
 
+;; Merge movsd/movhpd to movupd for TARGET_SSE_UNALIGNED_LOAD_OPTIMAL targets.
+(define_peephole2
+  [(set (match_operand:V2DF 0 "register_operand")
+	(vec_concat:V2DF (match_operand:DF 1 "memory_operand")
+			 (match_operand:DF 4 "const0_operand")))
+   (set (match_operand:V2DF 2 "register_operand")
+	(vec_concat:V2DF (vec_select:DF (match_dup 0)
+					(parallel [(const_int 0)]))
+			 (match_operand:DF 3 "memory_operand")))]
+  "TARGET_SSE2 && TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
+   && ix86_operands_ok_for_move_multiple (operands, true, DFmode)"
+  [(set (match_dup 2)
+	(unspec:V2DF [(match_dup 4)] UNSPEC_LOADU))]
+  "operands[4] = adjust_address (operands[1], V2DFmode, 0);")
+
 (define_insn "<sse>_storeu<ssemodesuffix><avxsizesuffix>"
   [(set (match_operand:VF 0 "memory_operand" "=m")
 	(unspec:VF
@@ -1242,6 +1257,20 @@ 
    (set_attr "prefix" "evex")
    (set_attr "mode" "<sseinsnmode>")])
 
+;; Merge movlpd/movhpd to movupd for TARGET_SSE_UNALIGNED_STORE_OPTIMAL targets.
+(define_peephole2
+  [(set (match_operand:DF 0 "memory_operand")
+	(vec_select:DF (match_operand:V2DF 1 "register_operand")
+		       (parallel [(const_int 0)])))
+   (set (match_operand:DF 2 "memory_operand")
+	(vec_select:DF (match_operand:V2DF 3 "register_operand")
+		       (parallel [(const_int 1)])))]
+  "TARGET_SSE2 && TARGET_SSE_UNALIGNED_STORE_OPTIMAL
+   && ix86_operands_ok_for_move_multiple (operands, false, DFmode)"
+  [(set (match_dup 4)
+	(unspec:V2DF [(match_dup 1)] UNSPEC_STOREU))]
+  "operands[4] = adjust_address (operands[0], V2DFmode, 0);")
+
 /* For AVX, normal *mov<mode>_internal pattern will handle unaligned loads
    just fine if misaligned_operand is true, and without the UNSPEC it can
    be combined with arithmetic instructions.  If misaligned_operand is
Index: testsuite/gcc.target/i386/sse2-load-multi.c
===================================================================
--- testsuite/gcc.target/i386/sse2-load-multi.c	(revision 0)
+++ testsuite/gcc.target/i386/sse2-load-multi.c	(working copy)
@@ -0,0 +1,24 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=corei7 -O2" } */
+
+#include <emmintrin.h>
+
+double a[8];
+
+__m128d load_1 ()
+{
+  __m128d res;
+  res = _mm_load_sd (&a[1]);
+  res = _mm_loadh_pd (res, &a[2]);
+  return res;
+}
+
+__m128d load_2 (double *a)
+{
+  __m128d res;
+  res = _mm_load_sd (&a[1]);
+  res = _mm_loadh_pd (res, &a[2]);
+  return res;
+}
+
+/* { dg-final { scan-assembler-times "movup" 2 } } */
Index: testsuite/gcc.target/i386/sse2-store-multi.c
===================================================================
--- testsuite/gcc.target/i386/sse2-store-multi.c	(revision 0)
+++ testsuite/gcc.target/i386/sse2-store-multi.c	(working copy)
@@ -0,0 +1,20 @@ 
+/* { dg-do compile } */
+/* { dg-options "-march=corei7 -O2" } */
+
+#include <emmintrin.h>
+
+double a[8];
+
+void store_1 (__m128d val)
+{
+  _mm_store_sd (&a[1], val);
+  _mm_storeh_pd (&a[2], val);
+}
+
+void store_2 (__m128d val, double *a)
+{
+  _mm_store_sd (&a[1], val);
+  _mm_storeh_pd (&a[2], val);
+}
+
+/* { dg-final { scan-assembler-times "movup" 2 } } */