
[AArch64,1/14] ARMv8.2-A FP16 data processing intrinsics

Message ID d5e925b4-5255-df3a-26ab-e6cf6aba8164@foss.arm.com
State New

Commit Message

Jiong Wang July 7, 2016, 4:13 p.m. UTC
Several data-processing instructions are agnostic to the type of their
operands. This patch adds the mapping between the new FP16 vector types and
those bit- and lane-manipulation instructions.

No ARMv8.2-A FP16 extension hardware support is required for these
intrinsics.
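
For illustration only (not part of the patch): a minimal sketch of how a
couple of the new intrinsics compose.  The function name and constants are
hypothetical; only baseline Advanced SIMD support is assumed, since these
intrinsics are pure data movement.

  #include <arm_neon.h>

  /* Bit-select the 16-bit lanes of A and B under MASK (BSL), then shift the
     result down by one lane, filling with zero (EXT).  */
  float16x4_t
  blend_then_shift (uint16x4_t mask, float16x4_t a, float16x4_t b)
  {
    float16x4_t sel = vbsl_f16 (mask, a, b);
    return vext_f16 (sel, vdup_n_f16 ((float16_t) 0), 1);
  }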

gcc/
2016-07-07  Jiong Wang <jiong.wang@arm.com>

         * config/aarch64/aarch64-simd.md
         (aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>): Use VALL_F16.
         (aarch64_ext<mode>): Likewise.
         (aarch64_rev<REVERSE:rev_op><mode>): Likewise.
         * config/aarch64/aarch64.c (aarch64_evpc_trn): Support V4HFmode
         and V8HFmode.
         (aarch64_evpc_uzp): Likewise.
         (aarch64_evpc_zip): Likewise.
         (aarch64_evpc_ext): Likewise.
         (aarch64_evpc_rev): Likewise.
         * config/aarch64/arm_neon.h (__aarch64_vdup_lane_f16): New.
         (__aarch64_vdup_laneq_f16): New.
         (__aarch64_vdupq_lane_f16): New.
         (__aarch64_vdupq_laneq_f16): New.
         (vbsl_f16): New.
         (vbslq_f16): New.
         (vdup_n_f16): New.
         (vdupq_n_f16): New.
         (vdup_lane_f16): New.
         (vdup_laneq_f16): New.
         (vdupq_lane_f16): New.
         (vdupq_laneq_f16): New.
         (vduph_lane_f16): New.
         (vduph_laneq_f16): New.
         (vext_f16): New.
         (vextq_f16): New.
         (vmov_n_f16): New.
         (vmovq_n_f16): New.
         (vrev64_f16): New.
         (vrev64q_f16): New.
         (vtrn1_f16): New.
         (vtrn1q_f16): New.
         (vtrn2_f16): New.
         (vtrn2q_f16): New.
         (vtrn_f16): New.
         (vtrnq_f16): New.
         (__INTERLEAVE_LIST): Support float16x4_t, float16x8_t.
         (vuzp1_f16): New.
         (vuzp1q_f16): New.
         (vuzp2_f16): New.
         (vuzp2q_f16): New.
         (vzip1_f16): New.
         (vzip1q_f16): New.
         (vzip2_f16): New.
         (vzip2q_f16): New.
         (vld1_dup_f16): Reimplement using vdup_n_f16.
         (vld1q_dup_f16): Reimplement using vdupq_n_f16.

Comments

James Greenhalgh July 8, 2016, 2:07 p.m. UTC | #1
On Thu, Jul 07, 2016 at 05:13:56PM +0100, Jiong Wang wrote:
> Several data-processing instructions are agnostic to the type of their
> operands. This patch adds the mapping between the new FP16 vector types
> and those bit- and lane-manipulation instructions.
> 
> No ARMv8.2-A FP16 extension hardware support is required for these
> intrinsics.

These intrinsics are independent of the ARMv8.2-A implementation,
and are proposed to be added in a future ACLE specification. I've
checked that the intrinsics added here match those proposed.

OK for trunk.

Thanks,
James

> gcc/
> 2016-07-07  Jiong Wang <jiong.wang@arm.com>
> 
>         * config/aarch64/aarch64-simd.md
>         (aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>): Use VALL_F16.
>         (aarch64_ext<mode>): Likewise.
>         (aarch64_rev<REVERSE:rev_op><mode>): Likewise.
>         * config/aarch64/aarch64.c (aarch64_evpc_trn): Support V4HFmode
>         and V8HFmode.
>         (aarch64_evpc_uzp): Likewise.
>         (aarch64_evpc_zip): Likewise.
>         (aarch64_evpc_ext): Likewise.
>         (aarch64_evpc_rev): Likewise.
>         * config/aarch64/arm_neon.h (__aarch64_vdup_lane_f16): New.
>         (__aarch64_vdup_laneq_f16): New.
>         (__aarch64_vdupq_lane_f16): New.
>         (__aarch64_vdupq_laneq_f16): New.
>         (vbsl_f16): New.
>         (vbslq_f16): New.
>         (vdup_n_f16): New.
>         (vdupq_n_f16): New.
>         (vdup_lane_f16): New.
>         (vdup_laneq_f16): New.
>         (vdupq_lane_f16): New.
>         (vdupq_laneq_f16): New.
>         (vduph_lane_f16): New.
>         (vduph_laneq_f16): New.
>         (vext_f16): New.
>         (vextq_f16): New.
>         (vmov_n_f16): New.
>         (vmovq_n_f16): New.
>         (vrev64_f16): New.
>         (vrev64q_f16): New.
>         (vtrn1_f16): New.
>         (vtrn1q_f16): New.
>         (vtrn2_f16): New.
>         (vtrn2q_f16): New.
>         (vtrn_f16): New.
>         (vtrnq_f16): New.
>         (__INTERLEAVE_LIST): Support float16x4_t, float16x8_t.
>         (vuzp1_f16): New.
>         (vuzp1q_f16): New.
>         (vuzp2_f16): New.
>         (vuzp2q_f16): New.
>         (vzip1_f16): New.
>         (vzip1q_f16): New.
>         (vzip2_f16): New.
>         (vzip2q_f16): New.
>         (vld1_dup_f16): Reimplement using vdup_n_f16.
>         (vld1q_dup_f16): Reimplement using vdupq_n_f16.

Patch

From b12677052e69b67310c1d63360db2793354414cb Mon Sep 17 00:00:00 2001
From: Jiong Wang <jiong.wang@arm.com>
Date: Tue, 7 Jun 2016 17:01:22 +0100
Subject: [PATCH 01/14] [1/14] ARMv8.2 FP16 data processing intrinsics

---
 gcc/config/aarch64/aarch64-simd.md |  22 +--
 gcc/config/aarch64/aarch64.c       |  16 +++
 gcc/config/aarch64/arm_neon.h      | 275 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 298 insertions(+), 15 deletions(-)

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c8a5e3e..74dfe28 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -5161,10 +5161,10 @@ 
 )
 
 (define_insn "aarch64_<PERMUTE:perm_insn><PERMUTE:perm_hilo><mode>"
-  [(set (match_operand:VALL 0 "register_operand" "=w")
-	(unspec:VALL [(match_operand:VALL 1 "register_operand" "w")
-		      (match_operand:VALL 2 "register_operand" "w")]
-		       PERMUTE))]
+  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+	(unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")
+			  (match_operand:VALL_F16 2 "register_operand" "w")]
+	 PERMUTE))]
   "TARGET_SIMD"
   "<PERMUTE:perm_insn><PERMUTE:perm_hilo>\\t%0.<Vtype>, %1.<Vtype>, %2.<Vtype>"
   [(set_attr "type" "neon_permute<q>")]
@@ -5172,11 +5172,11 @@ 
 
 ;; Note immediate (third) operand is lane index not byte index.
 (define_insn "aarch64_ext<mode>"
-  [(set (match_operand:VALL 0 "register_operand" "=w")
-        (unspec:VALL [(match_operand:VALL 1 "register_operand" "w")
-                      (match_operand:VALL 2 "register_operand" "w")
-                      (match_operand:SI 3 "immediate_operand" "i")]
-                     UNSPEC_EXT))]
+  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+        (unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")
+			  (match_operand:VALL_F16 2 "register_operand" "w")
+			  (match_operand:SI 3 "immediate_operand" "i")]
+	 UNSPEC_EXT))]
   "TARGET_SIMD"
 {
   operands[3] = GEN_INT (INTVAL (operands[3])
@@ -5187,8 +5187,8 @@ 
 )
 
 (define_insn "aarch64_rev<REVERSE:rev_op><mode>"
-  [(set (match_operand:VALL 0 "register_operand" "=w")
-	(unspec:VALL [(match_operand:VALL 1 "register_operand" "w")]
+  [(set (match_operand:VALL_F16 0 "register_operand" "=w")
+	(unspec:VALL_F16 [(match_operand:VALL_F16 1 "register_operand" "w")]
                     REVERSE))]
   "TARGET_SIMD"
   "rev<REVERSE:rev_op>\\t%0.<Vtype>, %1.<Vtype>"
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index b60e5c5..358d35c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -12053,6 +12053,8 @@  aarch64_evpc_trn (struct expand_vec_perm_d *d)
 	case V4SImode: gen = gen_aarch64_trn2v4si; break;
 	case V2SImode: gen = gen_aarch64_trn2v2si; break;
 	case V2DImode: gen = gen_aarch64_trn2v2di; break;
+	case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
+	case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
 	case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
 	case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
 	case V2DFmode: gen = gen_aarch64_trn2v2df; break;
@@ -12071,6 +12073,8 @@  aarch64_evpc_trn (struct expand_vec_perm_d *d)
 	case V4SImode: gen = gen_aarch64_trn1v4si; break;
 	case V2SImode: gen = gen_aarch64_trn1v2si; break;
 	case V2DImode: gen = gen_aarch64_trn1v2di; break;
+	case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
+	case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
 	case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
 	case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
 	case V2DFmode: gen = gen_aarch64_trn1v2df; break;
@@ -12136,6 +12140,8 @@  aarch64_evpc_uzp (struct expand_vec_perm_d *d)
 	case V4SImode: gen = gen_aarch64_uzp2v4si; break;
 	case V2SImode: gen = gen_aarch64_uzp2v2si; break;
 	case V2DImode: gen = gen_aarch64_uzp2v2di; break;
+	case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
+	case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
 	case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
 	case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
 	case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
@@ -12154,6 +12160,8 @@  aarch64_evpc_uzp (struct expand_vec_perm_d *d)
 	case V4SImode: gen = gen_aarch64_uzp1v4si; break;
 	case V2SImode: gen = gen_aarch64_uzp1v2si; break;
 	case V2DImode: gen = gen_aarch64_uzp1v2di; break;
+	case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
+	case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
 	case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
 	case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
 	case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
@@ -12224,6 +12232,8 @@  aarch64_evpc_zip (struct expand_vec_perm_d *d)
 	case V4SImode: gen = gen_aarch64_zip2v4si; break;
 	case V2SImode: gen = gen_aarch64_zip2v2si; break;
 	case V2DImode: gen = gen_aarch64_zip2v2di; break;
+	case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
+	case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
 	case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
 	case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
 	case V2DFmode: gen = gen_aarch64_zip2v2df; break;
@@ -12242,6 +12252,8 @@  aarch64_evpc_zip (struct expand_vec_perm_d *d)
 	case V4SImode: gen = gen_aarch64_zip1v4si; break;
 	case V2SImode: gen = gen_aarch64_zip1v2si; break;
 	case V2DImode: gen = gen_aarch64_zip1v2di; break;
+	case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
+	case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
 	case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
 	case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
 	case V2DFmode: gen = gen_aarch64_zip1v2df; break;
@@ -12286,6 +12298,8 @@  aarch64_evpc_ext (struct expand_vec_perm_d *d)
     case V8HImode: gen = gen_aarch64_extv8hi; break;
     case V2SImode: gen = gen_aarch64_extv2si; break;
     case V4SImode: gen = gen_aarch64_extv4si; break;
+    case V4HFmode: gen = gen_aarch64_extv4hf; break;
+    case V8HFmode: gen = gen_aarch64_extv8hf; break;
     case V2SFmode: gen = gen_aarch64_extv2sf; break;
     case V4SFmode: gen = gen_aarch64_extv4sf; break;
     case V2DImode: gen = gen_aarch64_extv2di; break;
@@ -12361,6 +12375,8 @@  aarch64_evpc_rev (struct expand_vec_perm_d *d)
 	case V2SImode: gen = gen_aarch64_rev64v2si;  break;
 	case V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
 	case V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
+	case V8HFmode: gen = gen_aarch64_rev64v8hf;  break;
+	case V4HFmode: gen = gen_aarch64_rev64v4hf;  break;
 	default:
 	  return false;
 	}
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 4e36c6a..b7b1eb8 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -466,6 +466,8 @@  typedef struct poly16x8x4_t
 #define __aarch64_vdup_lane_any(__size, __q, __a, __b) \
   vdup##__q##_n_##__size (__aarch64_vget_lane_any (__a, __b))
 
+#define __aarch64_vdup_lane_f16(__a, __b) \
+   __aarch64_vdup_lane_any (f16, , __a, __b)
 #define __aarch64_vdup_lane_f32(__a, __b) \
    __aarch64_vdup_lane_any (f32, , __a, __b)
 #define __aarch64_vdup_lane_f64(__a, __b) \
@@ -492,6 +494,8 @@  typedef struct poly16x8x4_t
    __aarch64_vdup_lane_any (u64, , __a, __b)
 
 /* __aarch64_vdup_laneq internal macros.  */
+#define __aarch64_vdup_laneq_f16(__a, __b) \
+   __aarch64_vdup_lane_any (f16, , __a, __b)
 #define __aarch64_vdup_laneq_f32(__a, __b) \
    __aarch64_vdup_lane_any (f32, , __a, __b)
 #define __aarch64_vdup_laneq_f64(__a, __b) \
@@ -518,6 +522,8 @@  typedef struct poly16x8x4_t
    __aarch64_vdup_lane_any (u64, , __a, __b)
 
 /* __aarch64_vdupq_lane internal macros.  */
+#define __aarch64_vdupq_lane_f16(__a, __b) \
+   __aarch64_vdup_lane_any (f16, q, __a, __b)
 #define __aarch64_vdupq_lane_f32(__a, __b) \
    __aarch64_vdup_lane_any (f32, q, __a, __b)
 #define __aarch64_vdupq_lane_f64(__a, __b) \
@@ -544,6 +550,8 @@  typedef struct poly16x8x4_t
    __aarch64_vdup_lane_any (u64, q, __a, __b)
 
 /* __aarch64_vdupq_laneq internal macros.  */
+#define __aarch64_vdupq_laneq_f16(__a, __b) \
+   __aarch64_vdup_lane_any (f16, q, __a, __b)
 #define __aarch64_vdupq_laneq_f32(__a, __b) \
    __aarch64_vdup_lane_any (f32, q, __a, __b)
 #define __aarch64_vdupq_laneq_f64(__a, __b) \
@@ -10369,6 +10377,12 @@  vaddvq_f64 (float64x2_t __a)
 
 /* vbsl  */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vbsl_f16 (uint16x4_t __a, float16x4_t __b, float16x4_t __c)
+{
+  return __builtin_aarch64_simd_bslv4hf_suss (__a, __b, __c);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c)
 {
@@ -10444,6 +10458,12 @@  vbsl_u64 (uint64x1_t __a, uint64x1_t __b, uint64x1_t __c)
       {__builtin_aarch64_simd_bsldi_uuuu (__a[0], __b[0], __c[0])};
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vbslq_f16 (uint16x8_t __a, float16x8_t __b, float16x8_t __c)
+{
+  return __builtin_aarch64_simd_bslv8hf_suss (__a, __b, __c);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c)
 {
@@ -12967,6 +12987,12 @@  vcvtpq_u64_f64 (float64x2_t __a)
 
 /* vdup_n  */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vdup_n_f16 (float16_t __a)
+{
+  return (float16x4_t) {__a, __a, __a, __a};
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vdup_n_f32 (float32_t __a)
 {
@@ -13041,6 +13067,12 @@  vdup_n_u64 (uint64_t __a)
 
 /* vdupq_n  */
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vdupq_n_f16 (float16_t __a)
+{
+  return (float16x8_t) {__a, __a, __a, __a, __a, __a, __a, __a};
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vdupq_n_f32 (float32_t __a)
 {
@@ -13118,6 +13150,12 @@  vdupq_n_u64 (uint64_t __a)
 
 /* vdup_lane  */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vdup_lane_f16 (float16x4_t __a, const int __b)
+{
+  return __aarch64_vdup_lane_f16 (__a, __b);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vdup_lane_f32 (float32x2_t __a, const int __b)
 {
@@ -13192,6 +13230,12 @@  vdup_lane_u64 (uint64x1_t __a, const int __b)
 
 /* vdup_laneq  */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vdup_laneq_f16 (float16x8_t __a, const int __b)
+{
+  return __aarch64_vdup_laneq_f16 (__a, __b);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vdup_laneq_f32 (float32x4_t __a, const int __b)
 {
@@ -13265,6 +13309,13 @@  vdup_laneq_u64 (uint64x2_t __a, const int __b)
 }
 
 /* vdupq_lane  */
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vdupq_lane_f16 (float16x4_t __a, const int __b)
+{
+  return __aarch64_vdupq_lane_f16 (__a, __b);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vdupq_lane_f32 (float32x2_t __a, const int __b)
 {
@@ -13338,6 +13389,13 @@  vdupq_lane_u64 (uint64x1_t __a, const int __b)
 }
 
 /* vdupq_laneq  */
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vdupq_laneq_f16 (float16x8_t __a, const int __b)
+{
+  return __aarch64_vdupq_laneq_f16 (__a, __b);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vdupq_laneq_f32 (float32x4_t __a, const int __b)
 {
@@ -13430,6 +13488,13 @@  vdupb_lane_u8 (uint8x8_t __a, const int __b)
 }
 
 /* vduph_lane  */
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vduph_lane_f16 (float16x4_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
 __extension__ static __inline poly16_t __attribute__ ((__always_inline__))
 vduph_lane_p16 (poly16x4_t __a, const int __b)
 {
@@ -13449,6 +13514,7 @@  vduph_lane_u16 (uint16x4_t __a, const int __b)
 }
 
 /* vdups_lane  */
+
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vdups_lane_f32 (float32x2_t __a, const int __b)
 {
@@ -13509,6 +13575,13 @@  vdupb_laneq_u8 (uint8x16_t __a, const int __b)
 }
 
 /* vduph_laneq  */
+
+__extension__ static __inline float16_t __attribute__ ((__always_inline__))
+vduph_laneq_f16 (float16x8_t __a, const int __b)
+{
+  return __aarch64_vget_lane_any (__a, __b);
+}
+
 __extension__ static __inline poly16_t __attribute__ ((__always_inline__))
 vduph_laneq_p16 (poly16x8_t __a, const int __b)
 {
@@ -13528,6 +13601,7 @@  vduph_laneq_u16 (uint16x8_t __a, const int __b)
 }
 
 /* vdups_laneq  */
+
 __extension__ static __inline float32_t __attribute__ ((__always_inline__))
 vdups_laneq_f32 (float32x4_t __a, const int __b)
 {
@@ -13567,6 +13641,19 @@  vdupd_laneq_u64 (uint64x2_t __a, const int __b)
 
 /* vext  */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vext_f16 (float16x4_t __a, float16x4_t __b, __const int __c)
+{
+  __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a,
+			    (uint16x4_t) {4 - __c, 5 - __c, 6 - __c, 7 - __c});
+#else
+  return __builtin_shuffle (__a, __b,
+			    (uint16x4_t) {__c, __c + 1, __c + 2, __c + 3});
+#endif
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vext_f32 (float32x2_t __a, float32x2_t __b, __const int __c)
 {
@@ -13698,6 +13785,22 @@  vext_u64 (uint64x1_t __a, uint64x1_t __b, __const int __c)
   return __a;
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vextq_f16 (float16x8_t __a, float16x8_t __b, __const int __c)
+{
+  __AARCH64_LANE_CHECK (__a, __c);
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__b, __a,
+			    (uint16x8_t) {8 - __c, 9 - __c, 10 - __c, 11 - __c,
+					  12 - __c, 13 - __c, 14 - __c,
+					  15 - __c});
+#else
+  return __builtin_shuffle (__a, __b,
+			    (uint16x8_t) {__c, __c + 1, __c + 2, __c + 3,
+					  __c + 4, __c + 5, __c + 6, __c + 7});
+#endif
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vextq_f32 (float32x4_t __a, float32x4_t __b, __const int __c)
 {
@@ -14333,8 +14436,7 @@  vld1q_u64 (const uint64_t *a)
 __extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
 vld1_dup_f16 (const float16_t* __a)
 {
-  float16_t __f = *__a;
-  return (float16x4_t) { __f, __f, __f, __f };
+  return vdup_n_f16 (*__a);
 }
 
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
@@ -14414,8 +14516,7 @@  vld1_dup_u64 (const uint64_t* __a)
 __extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
 vld1q_dup_f16 (const float16_t* __a)
 {
-  float16_t __f = *__a;
-  return (float16x8_t) { __f, __f, __f, __f, __f, __f, __f, __f };
+  return vdupq_n_f16 (*__a);
 }
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
@@ -18018,6 +18119,12 @@  vmlsq_laneq_u32 (uint32x4_t __a, uint32x4_t __b,
 
 /* vmov_n_  */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmov_n_f16 (float16_t __a)
+{
+  return vdup_n_f16 (__a);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vmov_n_f32 (float32_t __a)
 {
@@ -18090,6 +18197,12 @@  vmov_n_u64 (uint64_t __a)
   return (uint64x1_t) {__a};
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vmovq_n_f16 (float16_t __a)
+{
+  return vdupq_n_f16 (__a);
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vmovq_n_f32 (float32_t __a)
 {
@@ -20834,6 +20947,12 @@  vrev32q_u16 (uint16x8_t a)
   return __builtin_shuffle (a, (uint16x8_t) { 1, 0, 3, 2, 5, 4, 7, 6 });
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrev64_f16 (float16x4_t __a)
+{
+  return __builtin_shuffle (__a, (uint16x4_t) { 3, 2, 1, 0 });
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vrev64_f32 (float32x2_t a)
 {
@@ -20888,6 +21007,12 @@  vrev64_u32 (uint32x2_t a)
   return __builtin_shuffle (a, (uint32x2_t) { 1, 0 });
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrev64q_f16 (float16x8_t __a)
+{
+  return __builtin_shuffle (__a, (uint16x8_t) { 3, 2, 1, 0, 7, 6, 5, 4 });
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vrev64q_f32 (float32x4_t a)
 {
@@ -23840,6 +23965,16 @@  vtbx4_p8 (poly8x8_t __r, poly8x8x4_t __tab, uint8x8_t __idx)
 
 /* vtrn */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vtrn1_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 1, 7, 3});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 2, 6});
+#endif
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vtrn1_f32 (float32x2_t __a, float32x2_t __b)
 {
@@ -23930,6 +24065,16 @@  vtrn1_u32 (uint32x2_t __a, uint32x2_t __b)
 #endif
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vtrn1q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 1, 11, 3, 13, 5, 15, 7});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 2, 10, 4, 12, 6, 14});
+#endif
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vtrn1q_f32 (float32x4_t __a, float32x4_t __b)
 {
@@ -24056,6 +24201,16 @@  vtrn1q_u64 (uint64x2_t __a, uint64x2_t __b)
 #endif
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vtrn2_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 6, 2});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 5, 3, 7});
+#endif
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vtrn2_f32 (float32x2_t __a, float32x2_t __b)
 {
@@ -24146,6 +24301,16 @@  vtrn2_u32 (uint32x2_t __a, uint32x2_t __b)
 #endif
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vtrn2q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 10, 2, 12, 4, 14, 6});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 9, 3, 11, 5, 13, 7, 15});
+#endif
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vtrn2q_f32 (float32x4_t __a, float32x4_t __b)
 {
@@ -24272,6 +24437,12 @@  vtrn2q_u64 (uint64x2_t __a, uint64x2_t __b)
 #endif
 }
 
+__extension__ static __inline float16x4x2_t __attribute__ ((__always_inline__))
+vtrn_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return (float16x4x2_t) {vtrn1_f16 (__a, __b), vtrn2_f16 (__a, __b)};
+}
+
 __extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
 vtrn_f32 (float32x2_t a, float32x2_t b)
 {
@@ -24326,6 +24497,12 @@  vtrn_u32 (uint32x2_t a, uint32x2_t b)
   return (uint32x2x2_t) {vtrn1_u32 (a, b), vtrn2_u32 (a, b)};
 }
 
+__extension__ static __inline float16x8x2_t __attribute__ ((__always_inline__))
+vtrnq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return (float16x8x2_t) {vtrn1q_f16 (__a, __b), vtrn2q_f16 (__a, __b)};
+}
+
 __extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
 vtrnq_f32 (float32x4_t a, float32x4_t b)
 {
@@ -24574,6 +24751,7 @@  vuqaddd_s64 (int64_t __a, uint64_t __b)
   }
 
 #define __INTERLEAVE_LIST(op)					\
+  __DEFINTERLEAVE (op, float16x4x2_t, float16x4_t, f16,)	\
   __DEFINTERLEAVE (op, float32x2x2_t, float32x2_t, f32,)	\
   __DEFINTERLEAVE (op, poly8x8x2_t, poly8x8_t, p8,)		\
   __DEFINTERLEAVE (op, poly16x4x2_t, poly16x4_t, p16,)		\
@@ -24583,6 +24761,7 @@  vuqaddd_s64 (int64_t __a, uint64_t __b)
   __DEFINTERLEAVE (op, uint8x8x2_t, uint8x8_t, u8,)		\
   __DEFINTERLEAVE (op, uint16x4x2_t, uint16x4_t, u16,)		\
   __DEFINTERLEAVE (op, uint32x2x2_t, uint32x2_t, u32,)		\
+  __DEFINTERLEAVE (op, float16x8x2_t, float16x8_t, f16, q)	\
   __DEFINTERLEAVE (op, float32x4x2_t, float32x4_t, f32, q)	\
   __DEFINTERLEAVE (op, poly8x16x2_t, poly8x16_t, p8, q)		\
   __DEFINTERLEAVE (op, poly16x8x2_t, poly16x8_t, p16, q)	\
@@ -24595,6 +24774,16 @@  vuqaddd_s64 (int64_t __a, uint64_t __b)
 
 /* vuzp */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vuzp1_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
+#endif
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vuzp1_f32 (float32x2_t __a, float32x2_t __b)
 {
@@ -24685,6 +24874,16 @@  vuzp1_u32 (uint32x2_t __a, uint32x2_t __b)
 #endif
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vuzp1q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
+#endif
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vuzp1q_f32 (float32x4_t __a, float32x4_t __b)
 {
@@ -24811,6 +25010,16 @@  vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b)
 #endif
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vuzp2_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
+#endif
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vuzp2_f32 (float32x2_t __a, float32x2_t __b)
 {
@@ -24901,6 +25110,16 @@  vuzp2_u32 (uint32x2_t __a, uint32x2_t __b)
 #endif
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vuzp2q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
+#endif
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vuzp2q_f32 (float32x4_t __a, float32x4_t __b)
 {
@@ -25031,6 +25250,16 @@  __INTERLEAVE_LIST (uzp)
 
 /* vzip */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vzip1_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
+#endif
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vzip1_f32 (float32x2_t __a, float32x2_t __b)
 {
@@ -25121,6 +25350,18 @@  vzip1_u32 (uint32x2_t __a, uint32x2_t __b)
 #endif
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vzip1q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b,
+			    (uint16x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
+#else
+  return __builtin_shuffle (__a, __b,
+			    (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
+#endif
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vzip1q_f32 (float32x4_t __a, float32x4_t __b)
 {
@@ -25250,6 +25491,16 @@  vzip1q_u64 (uint64x2_t __a, uint64x2_t __b)
 #endif
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vzip2_f16 (float16x4_t __a, float16x4_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
+#else
+  return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
+#endif
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vzip2_f32 (float32x2_t __a, float32x2_t __b)
 {
@@ -25340,6 +25591,18 @@  vzip2_u32 (uint32x2_t __a, uint32x2_t __b)
 #endif
 }
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vzip2q_f16 (float16x8_t __a, float16x8_t __b)
+{
+#ifdef __AARCH64EB__
+  return __builtin_shuffle (__a, __b,
+			    (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
+#else
+  return __builtin_shuffle (__a, __b,
+			    (uint16x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
+#endif
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vzip2q_f32 (float32x4_t __a, float32x4_t __b)
 {
@@ -25479,6 +25742,7 @@  __INTERLEAVE_LIST (zip)
 #undef __aarch64_vget_lane_any
 
 #undef __aarch64_vdup_lane_any
+#undef __aarch64_vdup_lane_f16
 #undef __aarch64_vdup_lane_f32
 #undef __aarch64_vdup_lane_f64
 #undef __aarch64_vdup_lane_p8
@@ -25491,6 +25755,7 @@  __INTERLEAVE_LIST (zip)
 #undef __aarch64_vdup_lane_u16
 #undef __aarch64_vdup_lane_u32
 #undef __aarch64_vdup_lane_u64
+#undef __aarch64_vdup_laneq_f16
 #undef __aarch64_vdup_laneq_f32
 #undef __aarch64_vdup_laneq_f64
 #undef __aarch64_vdup_laneq_p8
@@ -25503,6 +25768,7 @@  __INTERLEAVE_LIST (zip)
 #undef __aarch64_vdup_laneq_u16
 #undef __aarch64_vdup_laneq_u32
 #undef __aarch64_vdup_laneq_u64
+#undef __aarch64_vdupq_lane_f16
 #undef __aarch64_vdupq_lane_f32
 #undef __aarch64_vdupq_lane_f64
 #undef __aarch64_vdupq_lane_p8
@@ -25515,6 +25781,7 @@  __INTERLEAVE_LIST (zip)
 #undef __aarch64_vdupq_lane_u16
 #undef __aarch64_vdupq_lane_u32
 #undef __aarch64_vdupq_lane_u64
+#undef __aarch64_vdupq_laneq_f16
 #undef __aarch64_vdupq_laneq_f32
 #undef __aarch64_vdupq_laneq_f64
 #undef __aarch64_vdupq_laneq_p8
-- 
2.5.0
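
For reference only, not part of the patch: a small self-check of the lane
ordering expected from the new permute intrinsics on little-endian AArch64.
The test values are hypothetical.

  #include <arm_neon.h>
  #include <assert.h>

  int
  main (void)
  {
    float16x4_t a = { 1, 2, 3, 4 };
    float16x4_t b = { 5, 6, 7, 8 };

    /* TRN1 interleaves the even lanes, ZIP1 interleaves the low halves, and
       UZP1 keeps the even-indexed lanes of the concatenation of A and B.  */
    float16x4_t t1 = vtrn1_f16 (a, b);
    float16x4_t z1 = vzip1_f16 (a, b);
    float16x4_t u1 = vuzp1_f16 (a, b);

    assert (t1[0] == 1 && t1[1] == 5 && t1[2] == 3 && t1[3] == 7);
    assert (z1[0] == 1 && z1[1] == 5 && z1[2] == 2 && z1[3] == 6);
    assert (u1[0] == 1 && u1[1] == 3 && u1[2] == 5 && u1[3] == 7);
    return 0;
  }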