
[AArch64,6/7] Add NEON intrinsics vqrdmlah and vqrdmlsh.

Message ID 565589F2.8010208@foss.arm.com
State New

Commit Message

Matthew Wahab Nov. 25, 2015, 10:14 a.m. UTC
On 23/11/15 13:35, James Greenhalgh wrote:
> On Fri, Oct 23, 2015 at 01:26:11PM +0100, Matthew Wahab wrote:
>> The ARMv8.1 architecture extension adds two Adv.SIMD instructions,
>> sqrdmlah and sqrdmlsh. This patch adds the NEON intrinsics vqrdmlah and
>> vqrdmlsh for these instructions. The new intrinsics are of the form
>> vqrdml{as}h[q]_<type>.
>>

>> diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
>> index e186348..9e73809 100644
>> --- a/gcc/config/aarch64/arm_neon.h
>> +++ b/gcc/config/aarch64/arm_neon.h
>> @@ -2649,6 +2649,59 @@ vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b)
>>     return (int32x4_t) __builtin_aarch64_sqrdmulhv4si (__a, __b);
>>   }
>>
>> +#pragma GCC push_options
>> +#pragma GCC target ("arch=armv8.1-a")
>
> Can we please patch the documentation to make it clear that -march=armv8.1-a
> always implies -march=armv8.1-a+rdma ? The documentation around which
> feature modifiers are implied when leaves much to be desired.

I'll rework the documentation as part of the (separate) command-line clean-up patch.

>> +
>> +__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
>> +vqrdmlah_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
>> +{
>> +  return (int16x4_t) __builtin_aarch64_sqrdmlahv4hi (__a, __b, __c);
>
> We don't need this cast (likewise the other instances)?
>

Attached, a reworked patch that removes the casts from the new
intrinsics. It also moves the new intrinsics to before the crypto
intrinsics. The intention is that the intrinsics added in this and the
next patch in the set are put in the same place and bracketed by a
single target pragma.
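
For reference, the per-lane operation can be modelled in plain C. The
sketch below is purely illustrative (the helper name is invented here, not
part of the patch) and follows the ARMv8.1 pseudocode for 16-bit lanes:

#include <stdint.h>
#include <stdio.h>

/* One 16-bit lane of SQRDMLAH (add != 0) or SQRDMLSH (add == 0):
   sat16 (((acc << 16) +/- 2*b*c + (1 << 15)) >> 16), recording any
   saturation in *sat.  */
static int16_t
qrdml_h_lane_s16 (int16_t acc, int16_t b, int16_t c, int add, int *sat)
{
  int64_t wide = ((int64_t) acc << 16)
		 + (add ? 2 : -2) * (int64_t) b * (int64_t) c
		 + (1 << 15);		/* Rounding constant.  */
  wide >>= 16;				/* Arithmetic shift right.  */
  if (wide > INT16_MAX) { *sat = 1; return INT16_MAX; }
  if (wide < INT16_MIN) { *sat = 1; return INT16_MIN; }
  return (int16_t) wide;
}

int
main (void)
{
  int sat = 0;
  /* First lane of the vqrdmlah test below: acc = 0xfff0 (-16),
     b = c = 0x5555 gives 0x38d3 with no saturation.  */
  printf ("0x%04x sat=%d\n",
	  (uint16_t) qrdml_h_lane_s16 (-16, 0x5555, 0x5555, 1, &sat), sat);
  /* The "mul saturates" inputs (all lanes 0x8000): the accumulation
     cancels the product for vqrdmlah (result 0, flag clear), while
     vqrdmlsh saturates to 0x8000 and sets the flag.  */
  sat = 0;
  printf ("0x%04x sat=%d\n",
	  (uint16_t) qrdml_h_lane_s16 (-0x8000, -0x8000, -0x8000, 1, &sat),
	  sat);
  sat = 0;
  printf ("0x%04x sat=%d\n",
	  (uint16_t) qrdml_h_lane_s16 (-0x8000, -0x8000, -0x8000, 0, &sat),
	  sat);
  return 0;
}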

Retested aarch64-none-elf with cross-compiled check-gcc on an ARMv8.1
emulator. Also re-ran the cross-compiled
gcc.target/aarch64/advsimd-intrinsics tests for aarch64-none-elf on an
ARMv8 emulator.

Matthew

gcc/
2015-11-24  Matthew Wahab  <matthew.wahab@arm.com>

	* config/aarch64/arm_neon.h (vqrdmlah_s16, vqrdmlah_s32): New.
	(vqrdmlahq_s16, vqrdmlahq_s32): New.
	(vqrdmlsh_s16, vqrdmlsh_s32): New.
	(vqrdmlshq_s16, vqrdmlshq_s32): New.

gcc/testsuite
2015-11-24  Matthew Wahab  <matthew.wahab@arm.com>

	* gcc.target/aarch64/advsimd-intrinsics/vqrdmlXh.inc: New file,
	support code for vqrdml{as}h tests.
	* gcc.target/aarch64/advsimd-intrinsics/vqrdmlah.c: New.
	* gcc.target/aarch64/advsimd-intrinsics/vqrdmlsh.c: New.
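
For anyone trying the new intrinsics, a minimal usage example follows
(illustrative only, not part of the patch); compile with -march=armv8.1-a
or rely on an equivalent target pragma:

#include <arm_neon.h>

/* Per-lane saturating rounding doubling multiply-accumulate.  */
int16x4_t
mla_example (int16x4_t acc, int16x4_t b, int16x4_t c)
{
  return vqrdmlah_s16 (acc, b, c);
}

/* Per-lane saturating rounding doubling multiply-subtract.  */
int16x4_t
mls_example (int16x4_t acc, int16x4_t b, int16x4_t c)
{
  return vqrdmlsh_s16 (acc, b, c);
}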

Comments

James Greenhalgh Nov. 25, 2015, 10:57 a.m. UTC | #1
On Wed, Nov 25, 2015 at 10:14:10AM +0000, Matthew Wahab wrote:
> On 23/11/15 13:35, James Greenhalgh wrote:
> >On Fri, Oct 23, 2015 at 01:26:11PM +0100, Matthew Wahab wrote:
> >>The ARMv8.1 architecture extension adds two Adv.SIMD instructions,
> >>sqrdmlah and sqrdmlsh. This patch adds the NEON intrinsics vqrdmlah and
> >>vqrdmlsh for these instructions. The new intrinsics are of the form
> >>vqrdml{as}h[q]_<type>.
> >>
> 
> >>diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
> >>index e186348..9e73809 100644
> >>--- a/gcc/config/aarch64/arm_neon.h
> >>+++ b/gcc/config/aarch64/arm_neon.h
> >>@@ -2649,6 +2649,59 @@ vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b)
> >>    return (int32x4_t) __builtin_aarch64_sqrdmulhv4si (__a, __b);
> >>  }
> >>
> >>+#pragma GCC push_options
> >>+#pragma GCC target ("arch=armv8.1-a")
> >
> >Can we please patch the documentation to make it clear that -march=armv8.1-a
> >always implies -march=armv8.1-a+rdma ? The documentation around which
> >feature modifiers are implied when leaves much to be desired.
> 
> I'll rework the documentation as part of the (separate) command-line clean-up patch.
> 
> >>+
> >>+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
> >>+vqrdmlah_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
> >>+{
> >>+  return (int16x4_t) __builtin_aarch64_sqrdmlahv4hi (__a, __b, __c);
> >
> >We don't need this cast (likewise the other instances)?
> >
> 
> Attached, a reworked patch that removes the casts from the new
> intrinsics. It also moves the new intrinsics to before the crypto
> intrinsics. The intention is that the intrinsics added in this and the
> next patch in the set are put in the same place and bracketed by a
> single target pragma.
> 
> Retested aarch64-none-elf with cross-compiled check-gcc on an ARMv8.1
> emulator. Also re-ran the cross-compiled
> gcc.target/aarch64/advsimd-intrinsics tests for aarch64-none-elf on an
> ARMv8 emulator.

OK.

Thanks,
James

> gcc/
> 2015-11-24  Matthew Wahab  <matthew.wahab@arm.com>
> 
> 	* config/aarch64/arm_neon.h (vqrdmlah_s16, vqrdmlah_s32): New.
> 	(vqrdmlahq_s16, vqrdmlahq_s32): New.
> 	(vqrdmlsh_s16, vqrdmlsh_s32): New.
> 	(vqrdmlshq_s16, vqrdmlshq_s32): New.
> 
> gcc/testsuite
> 2015-11-24  Matthew Wahab  <matthew.wahab@arm.com>
> 
> 	* gcc.target/aarch64/advsimd-intrinsics/vqrdmlXh.inc: New file,
> 	support code for vqrdml{as}h tests.
> 	* gcc.target/aarch64/advsimd-intrinsics/vqrdmlah.c: New.
> 	* gcc.target/aarch64/advsimd-intrinsics/vqrdmlsh.c: New.

Patch

From e623828ac2d033a9a51766d9843a650aab9f42e9 Mon Sep 17 00:00:00 2001
From: Matthew Wahab <matthew.wahab@arm.com>
Date: Thu, 27 Aug 2015 13:22:41 +0100
Subject: [PATCH 6/7] Add neon intrinsics: vqrdmlah, vqrdmlsh.

Change-Id: I5c7f8d36ee980d280c1d50f6f212b286084c5acf
---
 gcc/config/aarch64/arm_neon.h                      |  53 ++++++++
 .../aarch64/advsimd-intrinsics/vqrdmlXh.inc        | 138 +++++++++++++++++++++
 .../aarch64/advsimd-intrinsics/vqrdmlah.c          |  57 +++++++++
 .../aarch64/advsimd-intrinsics/vqrdmlsh.c          |  61 +++++++++
 4 files changed, 309 insertions(+)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlXh.inc
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlah.c
 create mode 100644 gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlsh.c

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 138b108..63f1627 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -11213,6 +11213,59 @@  vbslq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c)
   return __builtin_aarch64_simd_bslv2di_uuuu (__a, __b, __c);
 }
 
+/* ARMv8.1 intrinsics.  */
+#pragma GCC push_options
+#pragma GCC target ("arch=armv8.1-a")
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrdmlah_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+  return __builtin_aarch64_sqrdmlahv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrdmlah_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+  return __builtin_aarch64_sqrdmlahv2si (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqrdmlahq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+  return __builtin_aarch64_sqrdmlahv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqrdmlahq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+  return __builtin_aarch64_sqrdmlahv4si (__a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrdmlsh_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+  return __builtin_aarch64_sqrdmlshv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrdmlsh_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+  return __builtin_aarch64_sqrdmlshv2si (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqrdmlshq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+  return __builtin_aarch64_sqrdmlshv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqrdmlshq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+  return __builtin_aarch64_sqrdmlshv4si (__a, __b, __c);
+}
+#pragma GCC pop_options
+
 #pragma GCC push_options
 #pragma GCC target ("+nothing+crypto")
 /* vaes  */
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlXh.inc b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlXh.inc
new file mode 100644
index 0000000..a504ca6
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlXh.inc
@@ -0,0 +1,138 @@ 
+#define FNNAME1(NAME) exec_ ## NAME
+#define FNNAME(NAME) FNNAME1 (NAME)
+
+void FNNAME (INSN) (void)
+{
+  /* vector_res = vqrdmlah (vector, vector2, vector3),
+     then store the result.  */
+#define TEST_VQRDMLAH2(INSN, Q, T1, T2, W, N, EXPECTED_CUMULATIVE_SAT, CMT) \
+  Set_Neon_Cumulative_Sat (0, VECT_VAR (vector_res, T1, W, N));		\
+  VECT_VAR (vector_res, T1, W, N) =					\
+    INSN##Q##_##T2##W (VECT_VAR (vector, T1, W, N),			\
+		       VECT_VAR (vector2, T1, W, N),			\
+		       VECT_VAR (vector3, T1, W, N));			\
+  vst1##Q##_##T2##W (VECT_VAR (result, T1, W, N),			\
+		     VECT_VAR (vector_res, T1, W, N));			\
+  CHECK_CUMULATIVE_SAT (TEST_MSG, T1, W, N,				\
+			EXPECTED_CUMULATIVE_SAT, CMT)
+
+  /* Two auxiliary macros are necessary to expand INSN.  */
+#define TEST_VQRDMLAH1(INSN, Q, T1, T2, W, N, EXPECTED_CUMULATIVE_SAT, CMT) \
+  TEST_VQRDMLAH2 (INSN, Q, T1, T2, W, N, EXPECTED_CUMULATIVE_SAT, CMT)
+
+#define TEST_VQRDMLAH(Q, T1, T2, W, N, EXPECTED_CUMULATIVE_SAT, CMT)	\
+  TEST_VQRDMLAH1 (INSN, Q, T1, T2, W, N, EXPECTED_CUMULATIVE_SAT, CMT)
+
+  DECL_VARIABLE (vector, int, 16, 4);
+  DECL_VARIABLE (vector, int, 32, 2);
+  DECL_VARIABLE (vector, int, 16, 8);
+  DECL_VARIABLE (vector, int, 32, 4);
+
+  DECL_VARIABLE (vector_res, int, 16, 4);
+  DECL_VARIABLE (vector_res, int, 32, 2);
+  DECL_VARIABLE (vector_res, int, 16, 8);
+  DECL_VARIABLE (vector_res, int, 32, 4);
+
+  DECL_VARIABLE (vector2, int, 16, 4);
+  DECL_VARIABLE (vector2, int, 32, 2);
+  DECL_VARIABLE (vector2, int, 16, 8);
+  DECL_VARIABLE (vector2, int, 32, 4);
+
+  DECL_VARIABLE (vector3, int, 16, 4);
+  DECL_VARIABLE (vector3, int, 32, 2);
+  DECL_VARIABLE (vector3, int, 16, 8);
+  DECL_VARIABLE (vector3, int, 32, 4);
+
+  clean_results ();
+
+  VLOAD (vector, buffer, , int, s, 16, 4);
+  VLOAD (vector, buffer, , int, s, 32, 2);
+  VLOAD (vector, buffer, q, int, s, 16, 8);
+  VLOAD (vector, buffer, q, int, s, 32, 4);
+
+  /* Initialize vector2.  */
+  VDUP (vector2, , int, s, 16, 4, 0x5555);
+  VDUP (vector2, , int, s, 32, 2, 0xBB);
+  VDUP (vector2, q, int, s, 16, 8, 0xBB);
+  VDUP (vector2, q, int, s, 32, 4, 0x22);
+
+  /* Initialize vector3.  */
+  VDUP (vector3, , int, s, 16, 4, 0x5555);
+  VDUP (vector3, , int, s, 32, 2, 0xBB);
+  VDUP (vector3, q, int, s, 16, 8, 0x33);
+  VDUP (vector3, q, int, s, 32, 4, 0x22);
+
+#define CMT ""
+  TEST_VQRDMLAH ( , int, s, 16, 4, expected_cumulative_sat, CMT);
+  TEST_VQRDMLAH ( , int, s, 32, 2, expected_cumulative_sat, CMT);
+  TEST_VQRDMLAH (q, int, s, 16, 8, expected_cumulative_sat, CMT);
+  TEST_VQRDMLAH (q, int, s, 32, 4, expected_cumulative_sat, CMT);
+
+  CHECK (TEST_MSG, int, 16, 4, PRIx16, expected, CMT);
+  CHECK (TEST_MSG, int, 32, 2, PRIx32, expected, CMT);
+  CHECK (TEST_MSG, int, 16, 8, PRIx16, expected, CMT);
+  CHECK (TEST_MSG, int, 32, 4, PRIx32, expected, CMT);
+
+  /* Now use input values such that the multiplication causes
+     saturation.  */
+#define TEST_MSG_MUL " (check mul cumulative saturation)"
+  VDUP (vector, , int, s, 16, 4, 0x8000);
+  VDUP (vector, , int, s, 32, 2, 0x80000000);
+  VDUP (vector, q, int, s, 16, 8, 0x8000);
+  VDUP (vector, q, int, s, 32, 4, 0x80000000);
+  VDUP (vector2, , int, s, 16, 4, 0x8000);
+  VDUP (vector2, , int, s, 32, 2, 0x80000000);
+  VDUP (vector2, q, int, s, 16, 8, 0x8000);
+  VDUP (vector2, q, int, s, 32, 4, 0x80000000);
+  VDUP (vector3, , int, s, 16, 4, 0x8000);
+  VDUP (vector3, , int, s, 32, 2, 0x80000000);
+  VDUP (vector3, q, int, s, 16, 8, 0x8000);
+  VDUP (vector3, q, int, s, 32, 4, 0x80000000);
+
+  TEST_VQRDMLAH ( , int, s, 16, 4, expected_cumulative_sat_mul, TEST_MSG_MUL);
+  TEST_VQRDMLAH ( , int, s, 32, 2, expected_cumulative_sat_mul, TEST_MSG_MUL);
+  TEST_VQRDMLAH (q, int, s, 16, 8, expected_cumulative_sat_mul, TEST_MSG_MUL);
+  TEST_VQRDMLAH (q, int, s, 32, 4, expected_cumulative_sat_mul, TEST_MSG_MUL);
+
+  CHECK (TEST_MSG, int, 16, 4, PRIx16, expected_mul, TEST_MSG_MUL);
+  CHECK (TEST_MSG, int, 32, 2, PRIx32, expected_mul, TEST_MSG_MUL);
+  CHECK (TEST_MSG, int, 16, 8, PRIx16, expected_mul, TEST_MSG_MUL);
+  CHECK (TEST_MSG, int, 32, 4, PRIx32, expected_mul, TEST_MSG_MUL);
+
+  /* Use input values where rounding interacts with the saturation
+     boundary; whether the flag is set depends on the operation.  */
+#define TEST_MSG_ROUND " (check rounding)"
+  VDUP (vector, , int, s, 16, 4, 0x8000);
+  VDUP (vector, , int, s, 32, 2, 0x80000000);
+  VDUP (vector, q, int, s, 16, 8, 0x8000);
+  VDUP (vector, q, int, s, 32, 4, 0x80000000);
+  VDUP (vector2, , int, s, 16, 4, 0x8001);
+  VDUP (vector2, , int, s, 32, 2, 0x80000001);
+  VDUP (vector2, q, int, s, 16, 8, 0x8001);
+  VDUP (vector2, q, int, s, 32, 4, 0x80000001);
+  VDUP (vector3, , int, s, 16, 4, 0x8001);
+  VDUP (vector3, , int, s, 32, 2, 0x80000001);
+  VDUP (vector3, q, int, s, 16, 8, 0x8001);
+  VDUP (vector3, q, int, s, 32, 4, 0x80000001);
+
+  TEST_VQRDMLAH ( , int, s, 16, 4, expected_cumulative_sat_round, \
+		 TEST_MSG_ROUND);
+  TEST_VQRDMLAH ( , int, s, 32, 2, expected_cumulative_sat_round, \
+		 TEST_MSG_ROUND);
+  TEST_VQRDMLAH (q, int, s, 16, 8, expected_cumulative_sat_round, \
+		 TEST_MSG_ROUND);
+  TEST_VQRDMLAH (q, int, s, 32, 4, expected_cumulative_sat_round, \
+		 TEST_MSG_ROUND);
+
+  CHECK (TEST_MSG, int, 16, 4, PRIx16, expected_round, TEST_MSG_ROUND);
+  CHECK (TEST_MSG, int, 32, 2, PRIx32, expected_round, TEST_MSG_ROUND);
+  CHECK (TEST_MSG, int, 16, 8, PRIx16, expected_round, TEST_MSG_ROUND);
+  CHECK (TEST_MSG, int, 32, 4, PRIx32, expected_round, TEST_MSG_ROUND);
+}
+
+int
+main (void)
+{
+  FNNAME (INSN) ();
+  return 0;
+}
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlah.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlah.c
new file mode 100644
index 0000000..148d94c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlah.c
@@ -0,0 +1,57 @@ 
+/* { dg-require-effective-target arm_v8_1a_neon_hw } */
+/* { dg-add-options arm_v8_1a_neon } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected values of cumulative_saturation flag.  */
+int VECT_VAR (expected_cumulative_sat, int, 16, 4) = 0;
+int VECT_VAR (expected_cumulative_sat, int, 32, 2) = 0;
+int VECT_VAR (expected_cumulative_sat, int, 16, 8) = 0;
+int VECT_VAR (expected_cumulative_sat, int, 32, 4) = 0;
+
+/* Expected results.  */
+VECT_VAR_DECL (expected, int, 16, 4) [] = { 0x38d3, 0x38d4, 0x38d5, 0x38d6 };
+VECT_VAR_DECL (expected, int, 32, 2) [] = { 0xfffffff0, 0xfffffff1 };
+VECT_VAR_DECL (expected, int, 16, 8) [] = { 0xfff0,  0xfff1, 0xfff2,  0xfff3,
+					    0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL (expected, int, 32, 4) [] = { 0xfffffff0, 0xfffffff1,
+					    0xfffffff2, 0xfffffff3 };
+
+/* Expected values of cumulative_saturation flag when multiplication
+   saturates.  */
+int VECT_VAR (expected_cumulative_sat_mul, int, 16, 4) = 0;
+int VECT_VAR (expected_cumulative_sat_mul, int, 32, 2) = 0;
+int VECT_VAR (expected_cumulative_sat_mul, int, 16, 8) = 0;
+int VECT_VAR (expected_cumulative_sat_mul, int, 32, 4) = 0;
+
+/* Expected results when multiplication saturates.  */
+VECT_VAR_DECL (expected_mul, int, 16, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mul, int, 32, 2) [] = { 0x0, 0x0 };
+VECT_VAR_DECL (expected_mul, int, 16, 8) [] = { 0x0, 0x0, 0x0, 0x0,
+						0x0, 0x0, 0x0, 0x0 };
+VECT_VAR_DECL (expected_mul, int, 32, 4) [] = { 0x0, 0x0, 0x0, 0x0 };
+
+/* Expected values of cumulative_saturation flag when rounding
+   should not cause saturation.  */
+int VECT_VAR (expected_cumulative_sat_round, int, 16, 4) = 0;
+int VECT_VAR (expected_cumulative_sat_round, int, 32, 2) = 0;
+int VECT_VAR (expected_cumulative_sat_round, int, 16, 8) = 0;
+int VECT_VAR (expected_cumulative_sat_round, int, 32, 4) = 0;
+
+/* Expected results when rounding should not cause saturation.  */
+VECT_VAR_DECL (expected_round, int, 16, 4) [] = { 0xfffe, 0xfffe,
+						  0xfffe, 0xfffe };
+VECT_VAR_DECL (expected_round, int, 32, 2) [] = { 0xfffffffe, 0xfffffffe };
+VECT_VAR_DECL (expected_round, int, 16, 8) [] = { 0xfffe, 0xfffe,
+						  0xfffe, 0xfffe,
+						  0xfffe, 0xfffe,
+						  0xfffe, 0xfffe };
+VECT_VAR_DECL (expected_round, int, 32, 4) [] = { 0xfffffffe, 0xfffffffe,
+						  0xfffffffe, 0xfffffffe };
+
+#define INSN vqrdmlah
+#define TEST_MSG "VQRDMLAH"
+
+#include "vqrdmlXh.inc"
diff --git a/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlsh.c b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlsh.c
new file mode 100644
index 0000000..91c3b34
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/advsimd-intrinsics/vqrdmlsh.c
@@ -0,0 +1,61 @@ 
+/* { dg-require-effective-target arm_v8_1a_neon_hw } */
+/* { dg-add-options arm_v8_1a_neon } */
+
+#include <arm_neon.h>
+#include "arm-neon-ref.h"
+#include "compute-ref-data.h"
+
+/* Expected values of cumulative_saturation flag.  */
+int VECT_VAR (expected_cumulative_sat, int, 16, 4) = 0;
+int VECT_VAR (expected_cumulative_sat, int, 32, 2) = 0;
+int VECT_VAR (expected_cumulative_sat, int, 16, 8) = 0;
+int VECT_VAR (expected_cumulative_sat, int, 32, 4) = 0;
+
+/* Expected results.  */
+VECT_VAR_DECL (expected, int, 16, 4) [] = { 0xc70d, 0xc70e, 0xc70f, 0xc710 };
+VECT_VAR_DECL (expected, int, 32, 2) [] = { 0xfffffff0, 0xfffffff1 };
+VECT_VAR_DECL (expected, int, 16, 8) [] = { 0xfff0, 0xfff1, 0xfff2, 0xfff3,
+					    0xfff4, 0xfff5, 0xfff6, 0xfff7 };
+VECT_VAR_DECL (expected, int, 32, 4) [] = { 0xfffffff0, 0xfffffff1,
+					    0xfffffff2, 0xfffffff3 };
+
+/* Expected values of cumulative_saturation flag when multiplication
+   saturates.  */
+int VECT_VAR (expected_cumulative_sat_mul, int, 16, 4) = 1;
+int VECT_VAR (expected_cumulative_sat_mul, int, 32, 2) = 1;
+int VECT_VAR (expected_cumulative_sat_mul, int, 16, 8) = 1;
+int VECT_VAR (expected_cumulative_sat_mul, int, 32, 4) = 1;
+
+/* Expected results when multiplication saturates.  */
+VECT_VAR_DECL (expected_mul, int, 16, 4) [] = { 0x8000, 0x8000,
+						0x8000, 0x8000 };
+VECT_VAR_DECL (expected_mul, int, 32, 2) [] = { 0x80000000, 0x80000000 };
+VECT_VAR_DECL (expected_mul, int, 16, 8) [] = { 0x8000, 0x8000,
+						0x8000, 0x8000,
+						0x8000, 0x8000,
+						0x8000, 0x8000 };
+VECT_VAR_DECL (expected_mul, int, 32, 4) [] = { 0x80000000, 0x80000000,
+						0x80000000, 0x80000000 };
+
+/* Expected values of cumulative_saturation flag when rounding
+   should not cause saturation.  */
+int VECT_VAR (expected_cumulative_sat_round, int, 16, 4) = 1;
+int VECT_VAR (expected_cumulative_sat_round, int, 32, 2) = 1;
+int VECT_VAR (expected_cumulative_sat_round, int, 16, 8) = 1;
+int VECT_VAR (expected_cumulative_sat_round, int, 32, 4) = 1;
+
+/* Expected results when rounding should not cause saturation.  */
+VECT_VAR_DECL (expected_round, int, 16, 4) [] = { 0x8000, 0x8000,
+						  0x8000, 0x8000 };
+VECT_VAR_DECL (expected_round, int, 32, 2) [] = { 0x80000000, 0x80000000 };
+VECT_VAR_DECL (expected_round, int, 16, 8) [] = { 0x8000, 0x8000,
+						  0x8000, 0x8000,
+						  0x8000, 0x8000,
+						  0x8000, 0x8000 };
+VECT_VAR_DECL (expected_round, int, 32, 4) [] = { 0x80000000, 0x80000000,
+						  0x80000000, 0x80000000 };
+
+#define INSN vqrdmlsh
+#define TEST_MSG "VQRDMLSH"
+
+#include "vqrdmlXh.inc"
-- 
2.1.4
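
A note on the two-level TEST_VQRDMLAH1/TEST_VQRDMLAH2 macros in the test
support file: arguments used with the ## paste operator are not
macro-expanded, so an extra level of indirection is needed to expand INSN
before pasting.  A standalone illustration (names invented here, not from
the harness):

#include <stdio.h>

#define INSN vqrdmlah

#define STR1(s) #s
#define STR(s) STR1 (s)			/* Expands s, then stringizes.  */

#define PASTE_DIRECT(x) x ## _s16	/* Pastes the unexpanded token.  */
#define PASTE1(x) x ## _s16
#define PASTE(x) PASTE1 (x)		/* Expands x, then pastes.  */

int
main (void)
{
  printf ("%s\n", STR (PASTE_DIRECT (INSN)));	/* Prints "INSN_s16".  */
  printf ("%s\n", STR (PASTE (INSN)));		/* Prints "vqrdmlah_s16".  */
  return 0;
}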