diff mbox

[AArch64] Fix size of memory store for the vst<n>_lane intrinsics

Message ID 1383048297-16706-1-git-send-email-james.greenhalgh@arm.com
State New
Headers show

Commit Message

James Greenhalgh Oct. 29, 2013, 12:04 p.m. UTC
Hi,

The vst<n>_lane_<lane_type> intrinsics should write
(sizeof (lane_type) * n) bytes to memory.

In their current form, their asm constraints suggest a write size of
(sizeof (vector_type) * n). This is anywhere from 1 to 16 times too
much data, which can cause huge headaches with dead store elimination.

This patch better models how much data we will be writing, which in
turn lets us eliminate the memory clobber. Together, these changes avoid
the problems with dead store elimination.

Tested with aarch64.exp and checked the C++ neon mangling test, which
often breaks when you do these ugly casts.

OK?

Thanks,
James

---
gcc/

2013-10-29  James Greenhalgh  <james.greenhalgh@arm.com>

	* config/aarch64/arm_neon.h
	(__ST2_LANE_FUNC): Better model data size.
	(__ST3_LANE_FUNC): Likewise.
	(__ST4_LANE_FUNC): Likewise.

Comments

Marcus Shawcroft Nov. 5, 2013, 5:56 p.m. UTC | #1
On 29 October 2013 12:04, James Greenhalgh <james.greenhalgh@arm.com> wrote:

> 2013-10-29  James Greenhalgh  <james.greenhalgh@arm.com>
>
>         * config/aarch64/arm_neon.h
>         (__ST2_LANE_FUNC): Better model data size.
>         (__ST3_LANE_FUNC): Likewise.
>         (__ST4_LANE_FUNC): Likewise.

OK /Marcus
diff mbox

Patch

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 787ff15..7a63ea1 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -14704,16 +14704,19 @@  __LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
 
 #define __ST2_LANE_FUNC(intype, ptrtype, regsuffix,			\
 			lnsuffix, funcsuffix, Q)			\
+  typedef struct { ptrtype __x[2]; } __ST2_LANE_STRUCTURE_##intype;	\
   __extension__ static __inline void					\
   __attribute__ ((__always_inline__))					\
-  vst2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
+  vst2 ## Q ## _lane_ ## funcsuffix (ptrtype *ptr,			\
 				     intype b, const int c)		\
   {									\
+    __ST2_LANE_STRUCTURE_##intype *__p =				\
+				(__ST2_LANE_STRUCTURE_##intype *)ptr;	\
     __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t"	\
 	     "st2 {v16." #lnsuffix ", v17." #lnsuffix "}[%2], %0\n\t"	\
-	     : "=Q"(*(intype *) ptr)					\
+	     : "=Q"(*__p)						\
 	     : "Q"(b), "i"(c)						\
-	     : "memory", "v16", "v17");					\
+	     : "v16", "v17");						\
   }
 
 __ST2_LANE_FUNC (int8x8x2_t, int8_t, 8b, b, s8,)
@@ -14743,16 +14746,19 @@  __ST2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
 
 #define __ST3_LANE_FUNC(intype, ptrtype, regsuffix,			\
 			lnsuffix, funcsuffix, Q)			\
+  typedef struct { ptrtype __x[3]; } __ST3_LANE_STRUCTURE_##intype;	\
   __extension__ static __inline void					\
   __attribute__ ((__always_inline__))					\
-  vst3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
+  vst3 ## Q ## _lane_ ## funcsuffix (ptrtype *ptr,			\
 				     intype b, const int c)		\
   {									\
+    __ST3_LANE_STRUCTURE_##intype *__p =				\
+				(__ST3_LANE_STRUCTURE_##intype *)ptr;	\
     __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t"	\
 	     "st3 {v16." #lnsuffix " - v18." #lnsuffix "}[%2], %0\n\t"	\
-	     : "=Q"(*(intype *) ptr)					\
+	     : "=Q"(*__p)						\
 	     : "Q"(b), "i"(c)						\
-	     : "memory", "v16", "v17", "v18");				\
+	     : "v16", "v17", "v18");					\
   }
 
 __ST3_LANE_FUNC (int8x8x3_t, int8_t, 8b, b, s8,)
@@ -14782,16 +14788,19 @@  __ST3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
 
 #define __ST4_LANE_FUNC(intype, ptrtype, regsuffix,			\
 			lnsuffix, funcsuffix, Q)			\
+  typedef struct { ptrtype __x[4]; } __ST4_LANE_STRUCTURE_##intype;	\
   __extension__ static __inline void					\
   __attribute__ ((__always_inline__))					\
-  vst4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr,		\
+  vst4 ## Q ## _lane_ ## funcsuffix (ptrtype *ptr,			\
 				     intype b, const int c)		\
   {									\
+    __ST4_LANE_STRUCTURE_##intype *__p =				\
+				(__ST4_LANE_STRUCTURE_##intype *)ptr;	\
     __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t"	\
 	     "st4 {v16." #lnsuffix " - v19." #lnsuffix "}[%2], %0\n\t"	\
-	     : "=Q"(*(intype *) ptr)					\
+	     : "=Q"(*__p)						\
 	     : "Q"(b), "i"(c)						\
-	     : "memory", "v16", "v17", "v18", "v19");			\
+	     : "v16", "v17", "v18", "v19");				\
   }
 
 __ST4_LANE_FUNC (int8x8x4_t, int8_t, 8b, b, s8,)