Patchwork [AARCH64] : Add missing AdvSIMD intrinsics - vmlsq_laneq_*.

login
register
mail settings
Submitter Tejas Belagod
Date June 14, 2012, 9:55 a.m.
Message ID <4FD9B50B.9050706@arm.com>
Download mbox | patch
Permalink /patch/164876/
State New
Headers show

Comments

Tejas Belagod - June 14, 2012, 9:55 a.m.
Hi,

This patch adds missing AdvSIMD intrinsics vmlsq_laneq_<suf><16,32>  to
arm_neon.h. OK?

Thanks,
Tejas Belagod
ARM.

Changelog:

2012-06-14  Tejas Belagod  <tejas.belagod@arm.com>

gcc/
	* config/aarch64/arm_neon.h (vmlsq_laneq_f32, vmlsq_laneq_s16,
	vmlsq_laneq_u16, vmlsq_laneq_s32, vmlsq_laneq_u32): New.

testsuite/
	* gcc.target/aarch64/vmlsq_laneq.c: New.

Patch

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 58976cc..3b581bd 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -11151,6 +11151,77 @@  vmlsl_u32 (uint64x2_t a, uint32x2_t b, uint32x2_t c)
        result;                                                          \
      })
 
+#define vmlsq_laneq_f32(__a, __b, __c, __d)				\
+  __extension__								\
+    ({									\
+       float32x4_t __c_ = (__c);					\
+       float32x4_t __b_ = (__b);					\
+       float32x4_t __a_ = (__a);					\
+       float32x4_t __result;						\
+       float32x4_t __t1;						\
+       __asm__ ("fmul %1.4s, %3.4s, %4.s[%5]; fsub %0.4s, %0.4s, %1.4s"	\
+                : "=w"(__result), "=w"(__t1)				\
+                : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d)		\
+                : /* No clobbers */);					\
+       __result;							\
+     })
+
+#define vmlsq_laneq_s16(__a, __b, __c, __d)				\
+  __extension__								\
+    ({									\
+       int16x8_t __c_ = (__c);						\
+       int16x8_t __b_ = (__b);						\
+       int16x8_t __a_ = (__a);						\
+       int16x8_t __result;						\
+       __asm__ ("mls %0.8h, %2.8h, %3.h[%4]"				\
+                : "=w"(__result)					\
+                : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d)		\
+                : /* No clobbers */);					\
+       __result;							\
+     })
+
+#define vmlsq_laneq_s32(__a, __b, __c, __d)				\
+  __extension__								\
+    ({									\
+       int32x4_t __c_ = (__c);						\
+       int32x4_t __b_ = (__b);						\
+       int32x4_t __a_ = (__a);						\
+       int32x4_t __result;						\
+       __asm__ ("mls %0.4s, %2.4s, %3.s[%4]"				\
+                : "=w"(__result)					\
+                : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d)		\
+                : /* No clobbers */);					\
+       __result;							\
+     })
+
+#define vmlsq_laneq_u16(__a, __b, __c, __d)				\
+  __extension__								\
+    ({									\
+       uint16x8_t __c_ = (__c);						\
+       uint16x8_t __b_ = (__b);						\
+       uint16x8_t __a_ = (__a);						\
+       uint16x8_t __result;						\
+       __asm__ ("mls %0.8h, %2.8h, %3.h[%4]"				\
+                : "=w"(__result)					\
+                : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d)		\
+                : /* No clobbers */);					\
+       __result;							\
+     })
+
+#define vmlsq_laneq_u32(__a, __b, __c, __d)				\
+  __extension__								\
+    ({									\
+       uint32x4_t __c_ = (__c);						\
+       uint32x4_t __b_ = (__b);						\
+       uint32x4_t __a_ = (__a);						\
+       uint32x4_t __result;						\
+       __asm__ ("mls %0.4s, %2.4s, %3.s[%4]"				\
+                : "=w"(__result)					\
+                : "0"(__a_), "w"(__b_), "w"(__c_), "i"(__d)		\
+                : /* No clobbers */);					\
+       __result;							\
+     })
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vmlsq_n_f32 (float32x4_t a, float32x4_t b, float32_t c)
 {
diff --git a/gcc/testsuite/gcc.target/aarch64/vmlsq_laneq.c b/gcc/testsuite/gcc.target/aarch64/vmlsq_laneq.c
new file mode 100644
index 0000000..dd3fb81
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/vmlsq_laneq.c
@@ -0,0 +1,158 @@ 
+
+/* { dg-do run } */
+/* { dg-options "-O3" } */
+
+#include "arm_neon.h"
+
+extern void abort (void);
+
+void
+test1 ()
+{
+  int16x8_t val1, val2, val3;
+  int16x8_t result;
+  uint64_t act, exp;
+
+  val1 = vcombine_s16 (vcreate_s16 (UINT64_C (0xffff9ab680000000)),
+		       vcreate_s16 (UINT64_C (0x00000000ffff0000)));
+  val2 = vcombine_s16 (vcreate_s16 (UINT64_C (0x32b77fffffff7fff)),
+		       vcreate_s16 (UINT64_C (0x0000ffff00007fff)));
+  val3 = vcombine_s16 (vcreate_s16 (UINT64_C (0x7fff00007fff0000)),
+		       vcreate_s16 (UINT64_C (0x80007fff00000000)));
+  result = vmlsq_laneq_s16 (val1, val2, val3, 6);
+
+  act = vgetq_lane_u64 (vreinterpretq_u64_s16 (result), 0);
+  exp = UINT64_C (0xb2b69ab5ffffffff);
+  if (act != exp)
+    abort ();
+
+  act = vgetq_lane_u64 (vreinterpretq_u64_s16 (result), 1);
+  exp = UINT64_C (0x00007fffffffffff);
+  if (act != exp)
+    abort ();
+}
+
+void
+test2 ()
+{
+  int32x4_t val1, val2, val3;
+  int32x4_t result;
+  uint64_t exp, act;
+
+  val1 = vcombine_s32 (vcreate_s32 (UINT64_C (0x00008000f46f7fff)),
+		       vcreate_s32 (UINT64_C (0x7fffffffffff8000)));
+  val2 = vcombine_s32 (vcreate_s32 (UINT64_C (0x7fff7fff0e700000)),
+		       vcreate_s32 (UINT64_C (0xffff000080000000)));
+  val3 = vcombine_s32 (vcreate_s32 (UINT64_C (0x00000000ffff0000)),
+		       vcreate_s32 (UINT64_C (0xd9edea1a8000fb28)));
+  result = vmlsq_laneq_s32 (val1, val2, val3, 3);
+
+  act = vgetq_lane_u64 (vreinterpretq_u64_s32 (result), 0);
+  exp = UINT64_C (0xcefb6a1a1d0f7fff);
+  if (act != exp)
+    abort ();
+
+  act = vgetq_lane_u64 (vreinterpretq_u64_s32 (result), 1);
+  exp = UINT64_C (0x6a19ffffffff8000);
+  if (act != exp)
+    abort ();
+}
+
+void
+test3 ()
+{
+  uint16x8_t val1, val2, val3;
+  uint16x8_t result;
+  uint64_t act, exp;
+
+  val1 = vcombine_u16 (vcreate_u16 (UINT64_C (0x000080008000802a)),
+		       vcreate_u16 (UINT64_C (0x7fffffff00007fff)));
+  val2 = vcombine_u16 (vcreate_u16 (UINT64_C (0x7fffcdf1ffff0000)),
+		       vcreate_u16 (UINT64_C (0xe2550000ffffffff)));
+  val3 = vcombine_u16 (vcreate_u16 (UINT64_C (0x80007fff80000000)),
+		       vcreate_u16 (UINT64_C (0xbe2100007fffffff)));
+
+  result = vmlsq_laneq_u16 (val1, val2, val3, 7);
+
+  act = vgetq_lane_u64 (vreinterpretq_u64_u16 (result), 0);
+  exp = UINT64_C (0x3e2115ef3e21802a);
+  if (act != exp)
+    abort ();
+
+  act = vgetq_lane_u64 (vreinterpretq_u64_u16 (result), 1);
+  exp = UINT64_C (0x3d0affffbe213e20);
+  if (act != exp)
+    abort ();
+}
+
+void
+test4 ()
+{
+  uint32x4_t val1, val2, val3;
+  uint32x4_t result;
+  uint64_t act, exp;
+
+  val1 = vcombine_u32 (vcreate_u32 (UINT64_C (0x3295fe3d7fff7fff)),
+		       vcreate_u32 (UINT64_C (0x7fff00007fff7fff)));
+  val2 = vcombine_u32 (vcreate_u32 (UINT64_C (0xffff7fff7fff8000)),
+		       vcreate_u32 (UINT64_C (0x7fff80008000ffff)));
+  val3 = vcombine_u32 (vcreate_u32 (UINT64_C (0x7fff7fff80008000)),
+		       vcreate_u32 (UINT64_C (0x0000800053ab7fff)));
+
+  result = vmlsq_laneq_u32 (val1, val2, val3, 2);
+
+  act = vgetq_lane_u64 (vreinterpretq_u64_u32 (result), 0);
+  exp = UINT64_C (0x4640fe3cbffeffff);
+  if (act != exp)
+    abort ();
+
+  act = vgetq_lane_u64 (vreinterpretq_u64_u32 (result), 1);
+  exp = UINT64_C (0xbffe8000d3abfffe);
+  if (act != exp)
+    abort ();
+}
+
+void
+test5 ()
+{
+  float32x4_t val1, val2, val3;
+  float32x4_t result;
+  float32_t act;
+
+  val1 = vcombine_f32 (vcreate_f32 (UINT64_C (0x3f49daf03ef3dc73)),
+		       vcreate_f32 (UINT64_C (0x3f5d467a3ef3dc73)));
+  val2 = vcombine_f32 (vcreate_f32 (UINT64_C (0x3d2064c83d10cd28)),
+		       vcreate_f32 (UINT64_C (0x3ea7d1a23d10cd28)));
+  val3 = vcombine_f32 (vcreate_f32 (UINT64_C (0x3f6131993edb1e04)),
+		       vcreate_f32 (UINT64_C (0x3f37f4bf3edb1e04)));
+
+  result = vmlsq_laneq_f32 (val1, val2, val3, 0);
+
+  act = vgetq_lane_f32 (result, 0);
+  if (act != 0.46116194128990173f)
+    abort ();
+
+  act = vgetq_lane_f32 (result, 1);
+  if (act != 0.7717385292053223f)
+    abort ();
+
+  act = vgetq_lane_f32 (result, 2);
+  if (act != 0.46116194128990173f)
+    abort ();
+
+  act = vgetq_lane_f32 (result, 3);
+  if (act != 0.7240825295448303f)
+    abort ();
+}
+
+int
+main (void)
+{
+  test1 ();
+  test2 ();
+  test3 ();
+  test4 ();
+  test5 ();
+
+  return 0;
+}