diff mbox

[14/17,ARM] Add NEON FP16 instrinsics.

Message ID 573B2EDF.8020406@foss.arm.com
State New
Headers show

Commit Message

Matthew Wahab May 17, 2016, 2:46 p.m. UTC
The ARMv8.2-A architecture introduces an optional FP16 extension adding
half-precision floating point data processing instructions to the
existing Adv.SIMD (NEON) support. A future version of the ACLE will add
support for these instructions and this patch implements that support.

The ACLE will introduce new intrinsics for the Adv.SIMD instructions
together and will require that these intrinsics are available when both
the header file arm_neon.h is included and the ACLE feature macro
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC is defined. (The new ACLE feature
macro is dealt with in an earlier patch.)

The patch adds the following new intrinsics to arm_neon.h:
----
float16x4_t vabs_f16 (float16x4_t __a)
float16x8_t vabsq_f16 (float16x8_t __a)
uint16x4_t vceqz_f16 (float16x4_t __a)
uint16x8_t vceqzq_f16 (float16x8_t __a)
uint16x4_t vcgez_f16 (float16x4_t __a)
uint16x8_t vcgezq_f16 (float16x8_t __a)
uint16x4_t vcgtz_f16 (float16x4_t __a)
uint16x8_t vcgtzq_f16 (float16x8_t __a)
uint16x4_t vclez_f16 (float16x4_t __a)
uint16x8_t vclezq_f16 (float16x8_t __a)
uint16x4_t vcltz_f16 (float16x4_t __a)
uint16x8_t vcltzq_f16 (float16x8_t __a)
float16x4_t vcvt_f16_s16 (int16x4_t __a)
float16x4_t vcvt_f16_u16 (uint16x4_t __a)
int16x4_t vcvt_s16_f16 (float16x4_t __a)
uint16x4_t vcvt_u16_f16 (float16x4_t __a)
float16x8_t vcvtq_f16_s16 (int16x8_t __a)
float16x8_t vcvtq_f16_u16 (uint16x8_t __a)
int16x8_t vcvtq_s16_f16 (float16x8_t __a)
uint16x8_t vcvtq_u16_f16 (float16x8_t __a)
int16x4_t vcvta_s16_f16 (float16x4_t __a)
uint16x4_t vcvta_u16_f16 (float16x4_t __a)
int16x8_t vcvtaq_s16_f16 (float16x8_t __a)
uint16x8_t vcvtaq_u16_f16 (float16x8_t __a)
int16x4_t vcvtm_s16_f16 (float16x4_t __a)
uint16x4_t vcvtm_u16_f16 (float16x4_t __a)
int16x8_t vcvtmq_s16_f16 (float16x8_t __a)
uint16x8_t vcvtmq_u16_f16 (float16x8_t __a)
int16x4_t vcvtn_s16_f16 (float16x4_t __a)
uint16x4_t vcvtn_u16_f16 (float16x4_t __a)
int16x8_t vcvtnq_s16_f16 (float16x8_t __a)
uint16x8_t vcvtnq_u16_f16 (float16x8_t __a)
int16x4_t vcvtp_s16_f16 (float16x4_t __a)
uint16x4_t vcvtp_u16_f16 (float16x4_t __a)
int16x8_t vcvtpq_s16_f16 (float16x8_t __a)
uint16x8_t vcvtpq_u16_f16 (float16x8_t __a)
float16x4_t vneg_f16 (float16x4_t __a)
float16x8_t vnegq_f16 (float16x8_t __a)
float16x4_t vrecpe_f16 (float16x4_t __a)
float16x8_t vrecpeq_f16 (float16x8_t __a)
float16x4_t vrnd_f16 (float16x4_t __a)
float16x8_t vrndq_f16 (float16x8_t __a)
float16x4_t vrnda_f16 (float16x4_t __a)
float16x8_t vrndaq_f16 (float16x8_t __a)
float16x4_t vrndm_f16 (float16x4_t __a)
float16x8_t vrndmq_f16 (float16x8_t __a)
float16x4_t vrndn_f16 (float16x4_t __a)
float16x8_t vrndnq_f16 (float16x8_t __a)
float16x4_t vrndp_f16 (float16x4_t __a)
float16x8_t vrndpq_f16 (float16x8_t __a)
float16x4_t vrndx_f16 (float16x4_t __a)
float16x8_t vrndxq_f16 (float16x8_t __a)
float16x4_t vsqrte_f16 (float16x4_t __a)
float16x8_t vsqrteq_f16 (float16x8_t __a)

float16x4_t vabd_f16 (float16x4_t __a, float16x4_t __b)
float16x8_t vabdq_f16 (float16x8_t __a, float16x8_t __b)
float16x4_t vadd_f16 (float16x4_t __a, float16x4_t __b)
float16x8_t vaddq_f16 (float16x8_t __a, float16x8_t __b)
uint16x4_t vcage_f16 (float16x4_t __a, float16x4_t __b)
uint16x8_t vcageq_f16 (float16x8_t __a, float16x8_t __b)
uint16x4_t vcagt_f16 (float16x4_t __a, float16x4_t __b)
uint16x8_t vcagtq_f16 (float16x8_t __a, float16x8_t __b)
uint16x4_t vcale_f16 (float16x4_t __a, float16x4_t __b)
uint16x8_t vcaleq_f16 (float16x8_t __a, float16x8_t __b)
uint16x4_t vcalt_f16 (float16x4_t __a, float16x4_t __b)
uint16x8_t vcaltq_f16 (float16x8_t __a, float16x8_t __b)
uint16x4_t vceq_f16 (float16x4_t __a, float16x4_t __b)
uint16x8_t vceqq_f16 (float16x8_t __a, float16x8_t __b)
uint16x4_t vcge_f16 (float16x4_t __a, float16x4_t __b)
uint16x8_t vcgeq_f16 (float16x8_t __a, float16x8_t __b)
uint16x4_t vcgt_f16 (float16x4_t __a, float16x4_t __b)
uint16x8_t vcgtq_f16 (float16x8_t __a, float16x8_t __b)
uint16x4_t vcle_f16 (float16x4_t __a, float16x4_t __b)
uint16x8_t vcleq_f16 (float16x8_t __a, float16x8_t __b)
uint16x4_t vclt_f16 (float16x4_t __a, float16x4_t __b)
uint16x8_t vcltq_f16 (float16x8_t __a, float16x8_t __b)
float16x4_t vcvt_n_f16_s16 (int16x4_t __a, const int __b)
float16x4_t vcvt_n_f16_u16 (uint16x4_t __a, const int __b)
float16x8_t vcvtq_n_f16_s16 (int16x8_t __a, const int __b)
float16x8_t vcvtq_n_f16_u16 (uint16x8_t __a, const int __b)
int16x4_t vcvt_n_s16_f16 (float16x4_t __a, const int __b)
uint16x4_t vcvt_n_u16_f16 (float16x4_t __a, const int __b)
int16x8_t vcvtq_n_s16_f16 (float16x8_t __a, const int __b)
uint16x8_t vcvtq_n_u16_f16 (float16x8_t __a, const int __b)
float16x4_t vmax_f16 (float16x4_t __a, float16x4_t __b)
float16x8_t vmaxq_f16 (float16x8_t __a, float16x8_t __b)
float16x4_t vmaxnm_f16 (float16x4_t __a, float16x4_t __b)
float16x8_t vmaxnmq_f16 (float16x8_t __a, float16x8_t __b)
float16x4_t vmin_f16 (float16x4_t __a, float16x4_t __b)
float16x8_t vminq_f16 (float16x8_t __a, float16x8_t __b)
float16x4_t vminnm_f16 (float16x4_t __a, float16x4_t __b)
float16x8_t vminnmq_f16 (float16x8_t __a, float16x8_t __b)
float16x4_t vmul_f16 (float16x4_t __a, float16x4_t __b)
float16x4_t vmul_n_f16 (float16x4_t __a, float16_t __b)
float16x8_t vmulq_f16 (float16x8_t __a, float16x8_t __b)
float16x8_t vmulq_n_f16 (float16x8_t __a, float16_t __b)
float16x4_t vpadd_f16 (float16x4_t __a, float16x4_t __b)
float16x4_t vpmax_f16 (float16x4_t __a, float16x4_t __b)
float16x4_t vpmin_f16 (float16x4_t __a, float16x4_t __b)
float16x4_t vrecps_f16 (float16x4_t __a, float16x4_t __b)
float16x8_t vrecpsq_f16 (float16x8_t __a, float16x8_t __b)
float16x4_t vrsqrts_f16 (float16x4_t __a, float16x4_t __b)
float16x8_t vrsqrtsq_f16 (float16x8_t __a, float16x8_t __b)
float16x4_t vsub_f16 (float16x4_t __a, float16x4_t __b)
float16x8_t vsubq_f16 (float16x8_t __a, float16x8_t __b)

float16x4_t vfma_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c)
float16x8_t vfmaq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
float16x4_t vfms_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c)
float16x8_t vfmsq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
float16x4_t vmul_lane_f16 (float16x4_t __a, float16x4_t __b, const int __c)
float16x8_t vmulq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __c)
----

Tested the series for arm-none-linux-gnueabihf with native bootstrap and
make check and for arm-none-eabi and armeb-none-eabi with make check on
an ARMv8.2-A emulator.

Ok for trunk?
Matthew

2016-05-17  Matthew Wahab  <matthew.wahab@arm.com>

	* config/arm/arm_neon.h: Include arm_fp16.h.
	(vabd_f16): New.
	(vabdq_f16): New.
	(vabs_f16): New.
	(vabsq_f16): New.
	(vadd_f16): New.
	(vaddq_f16): New.
	(vcage_f16): New.
	(vcageq_f16): New.
	(vcagt_f16): New.
	(vcagtq_f16): New.
	(vcale_f16): New.
	(vcaleq_f16): New.
	(vcalt_f16): New.
	(vcaltq_f16): New.
	(vceq_f16): New.
	(vceqq_f16): New.
	(vceqz_f16): New.
	(vceqzq_f16): New.
	(vcge_f16): New.
	(vcgeq_f16): New.
	(vcgez_f16): New.
	(vcgezq_f16): New.
	(vcgt_f16): New.
	(vcgtq_f16): New.
	(vcgtz_f16): New.
	(vcgtzq_f16): New.
	(vcle_f16): New.
	(vcleq_f16): New.
	(vclez_f16): New.
	(vclezq_f16): New.
	(vclt_f16): New.
	(vcltq_f16): New.
	(vcltz_f16): New.
	(vcltzq_f16): New.
	(vcvt_f16_s16): New.
	(vcvt_f16_u16): New.
	(vcvt_s16_f16): New.
	(vcvt_u16_f16): New.
	(vcvtq_f16_s16): New.
	(vcvtq_f16_u16): New.
	(vcvtq_s16_f16): New.
	(vcvtq_u16_f16): New.
	(vcvta_s16_f16): New.
	(vcvta_u16_f16): New.
	(vcvtaq_s16_f16): New.
	(vcvtaq_u16_f16): New.
	(vcvtm_s16_f16): New.
	(vcvtm_u16_f16): New.
	(vcvtmq_s16_f16): New.
	(vcvtmq_u16_f16): New.
	(vcvtn_s16_f16): New.
	(vcvtn_u16_f16): New.
	(vcvtnq_s16_f16): New.
	(vcvtnq_u16_f16): New.
	(vcvtp_s16_f16): New.
	(vcvtp_u16_f16): New.
	(vcvtpq_s16_f16): New.
	(vcvtpq_u16_f16): New.
	(vcvt_n_f16_s16): New.
	(vcvt_n_f16_u16): New.
	(vcvtq_n_f16_s16): New.
	(vcvtq_n_f16_u16): New.
	(vcvt_n_s16_f16): New.
	(vcvt_n_u16_f16): New.
	(vcvtq_n_s16_f16): New.
	(vcvtq_n_u16_f16): New.
	(vfma_f16): New.
	(vfmaq_f16): New.
	(vfms_f16): New.
	(vfmsq_f16): New.
	(vmax_f16): New.
	(vmaxq_f16): New.
	(vmaxnm_f16): New.
	(vmaxnmq_f16): New.
	(vmin_f16): New.
	(vminq_f16): New.
	(vminnm_f16): New.
	(vminnmq_f16): New.
	(vmul_f16): New.
	(vmul_lane_f16): New.
	(vmul_n_f16): New.
	(vmulq_f16): New.
	(vmulq_lane_f16): New.
	(vmulq_n_f16): New.
	(vneg_f16): New.
	(vnegq_f16): New.
	(vpadd_f16): New.
	(vpmax_f16): New.
	(vpmin_f16): New.
	(vrecpe_f16): New.
	(vrecpeq_f16): New.
	(vrnd_f16): New.
	(vrndq_f16): New.
	(vrnda_f16): New.
	(vrndaq_f16): New.
	(vrndm_f16): New.
	(vrndmq_f16): New.
	(vrndn_f16): New.
	(vrndnq_f16): New.
	(vrndp_f16): New.
	(vrndpq_f16): New.
	(vrndx_f16): New.
	(vrndxq_f16): New.
	(vsqrte_f16): New.
	(vsqrteq_f16): New.
	(vrecps_f16): New.
	(vrecpsq_f16): New.
	(vrsqrts_f16): New.
	(vrsqrtsq_f16): New.
	(vsub_f16): New.
	(vsubq_f16): New.
diff mbox

Patch

From 3f8692f5849049af0db05d1cc3b4cda80ae131e0 Mon Sep 17 00:00:00 2001
From: Matthew Wahab <matthew.wahab@arm.com>
Date: Thu, 7 Apr 2016 15:36:34 +0100
Subject: [PATCH 14/17] [PATCH 14/17][ARM] Add NEON FP16 instrinsics.

2016-05-17  Matthew Wahab  <matthew.wahab@arm.com>

	* config/arm/arm_neon.h (vabd_f16): New.
	(vabdq_f16): New.
	(vabs_f16): New.
	(vabsq_f16): New.
	(vadd_f16): New.
	(vaddq_f16): New.
	(vcage_f16): New.
	(vcageq_f16): New.
	(vcagt_f16): New.
	(vcagtq_f16): New.
	(vcale_f16): New.
	(vcaleq_f16): New.
	(vcalt_f16): New.
	(vcaltq_f16): New.
	(vceq_f16): New.
	(vceqq_f16): New.
	(vceqz_f16): New.
	(vceqzq_f16): New.
	(vcge_f16): New.
	(vcgeq_f16): New.
	(vcgez_f16): New.
	(vcgezq_f16): New.
	(vcgt_f16): New.
	(vcgtq_f16): New.
	(vcgtz_f16): New.
	(vcgtzq_f16): New.
	(vcle_f16): New.
	(vcleq_f16): New.
	(vclez_f16): New.
	(vclezq_f16): New.
	(vclt_f16): New.
	(vcltq_f16): New.
	(vcltz_f16): New.
	(vcltzq_f16): New.
	(vcvt_f16_s16): New.
	(vcvt_f16_u16): New.
	(vcvt_s16_f16): New.
	(vcvt_u16_f16): New.
	(vcvtq_f16_s16): New.
	(vcvtq_f16_u16): New.
	(vcvtq_s16_f16): New.
	(vcvtq_u16_f16): New.
	(vcvta_s16_f16): New.
	(vcvta_u16_f16): New.
	(vcvtaq_s16_f16): New.
	(vcvtaq_u16_f16): New.
	(vcvtm_s16_f16): New.
	(vcvtm_u16_f16): New.
	(vcvtmq_s16_f16): New.
	(vcvtmq_u16_f16): New.
	(vcvtn_s16_f16): New.
	(vcvtn_u16_f16): New.
	(vcvtnq_s16_f16): New.
	(vcvtnq_u16_f16): New.
	(vcvtp_s16_f16): New.
	(vcvtp_u16_f16): New.
	(vcvtpq_s16_f16): New.
	(vcvtpq_u16_f16): New.
	(vcvt_n_f16_s16): New.
	(vcvt_n_f16_u16): New.
	(vcvtq_n_f16_s16): New.
	(vcvtq_n_f16_u16): New.
	(vcvt_n_s16_f16): New.
	(vcvt_n_u16_f16): New.
	(vcvtq_n_s16_f16): New.
	(vcvtq_n_u16_f16): New.
	(vfma_f16): New.
	(vfmaq_f16): New.
	(vfms_f16): New.
	(vfmsq_f16): New.
	(vmax_f16): New.
	(vmaxq_f16): New.
	(vmaxnm_f16): New.
	(vmaxnmq_f16): New.
	(vmin_f16): New.
	(vminq_f16): New.
	(vminnm_f16): New.
	(vminnmq_f16): New.
	(vmul_f16): New.
	(vmul_lane_f16): New.
	(vmul_n_f16): New.
	(vmulq_f16): New.
	(vmulq_lane_f16): New.
	(vmulq_n_f16): New.
	(vneg_f16): New.
	(vnegq_f16): New.
	(vpadd_f16): New.
	(vpmax_f16): New.
	(vpmin_f16): New.
	(vrecpe_f16): New.
	(vrecpeq_f16): New.
	(vrnd_f16): New.
	(vrndq_f16): New.
	(vrnda_f16): New.
	(vrndaq_f16): New.
	(vrndm_f16): New.
	(vrndmq_f16): New.
	(vrndn_f16): New.
	(vrndnq_f16): New.
	(vrndp_f16): New.
	(vrndpq_f16): New.
	(vrndx_f16): New.
	(vrndxq_f16): New.
	(vsqrte_f16): New.
	(vsqrteq_f16): New.
	(vrecps_f16): New.
	(vrecpsq_f16): New.
	(vrsqrts_f16): New.
	(vrsqrtsq_f16): New.
	(vsub_f16): New.
	(vsubq_f16): New.
---
 gcc/config/arm/arm_neon.h | 675 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 675 insertions(+)

diff --git a/gcc/config/arm/arm_neon.h b/gcc/config/arm/arm_neon.h
index 5b433b4..4075ff8 100644
--- a/gcc/config/arm/arm_neon.h
+++ b/gcc/config/arm/arm_neon.h
@@ -38,6 +38,7 @@ 
 extern "C" {
 #endif
 
+#include <arm_fp16.h>
 #include <stdint.h>
 
 typedef __simd64_int8_t int8x8_t;
@@ -14830,6 +14831,680 @@  vmull_high_p64 (poly64x2_t __a, poly64x2_t __b)
 
 #pragma GCC pop_options
 
+  /* Intrinsics for FP16 instructions.  */
+#pragma GCC push_options
+#pragma GCC target ("fpu=neon-fp-armv8")
+#if defined (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vabd_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vabdv4hf (__a, __b);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vabdq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vabdv8hf (__a, __b);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vabs_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vabsv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vabsq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vabsv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vadd_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vaddv4hf (__a, __b);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vaddq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vaddv8hf (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcage_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcagev4hf (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcageq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcagev8hf (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcagt_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcagtv4hf (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcagtq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcagtv8hf (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcale_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcalev4hf (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcaleq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcalev8hf (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcalt_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcaltv4hf (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcaltq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcaltv8hf (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vceq_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vceqv4hf (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vceqq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vceqv8hf (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vceqz_f16 (float16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vceqzv4hf (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vceqzq_f16 (float16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vceqzv8hf (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcge_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcgev4hf (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgeq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcgev8hf (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcgez_f16 (float16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vcgezv4hf (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgezq_f16 (float16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vcgezv8hf (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcgt_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcgtv4hf (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgtq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcgtv8hf (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcgtz_f16 (float16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vcgtzv4hf (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgtzq_f16 (float16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vcgtzv8hf (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcle_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vclev4hf (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcleq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vclev8hf (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vclez_f16 (float16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vclezv4hf (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vclezq_f16 (float16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vclezv8hf (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vclt_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return (uint16x4_t)__builtin_neon_vcltv4hf (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcltq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return (uint16x8_t)__builtin_neon_vcltv8hf (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcltz_f16 (float16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vcltzv4hf (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcltzq_f16 (float16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vcltzv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vcvt_f16_s16 (int16x4_t __a)
+{
+  return (float16x4_t)__builtin_neon_vcvtsv4hi (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vcvt_f16_u16 (uint16x4_t __a)
+{
+  return (float16x4_t)__builtin_neon_vcvtuv4hi ((int16x4_t)__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvt_s16_f16 (float16x4_t __a)
+{
+  return (int16x4_t)__builtin_neon_vcvtsv4hf (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvt_u16_f16 (float16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vcvtuv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vcvtq_f16_s16 (int16x8_t __a)
+{
+  return (float16x8_t)__builtin_neon_vcvtsv8hi (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vcvtq_f16_u16 (uint16x8_t __a)
+{
+  return (float16x8_t)__builtin_neon_vcvtuv8hi ((int16x8_t)__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtq_s16_f16 (float16x8_t __a)
+{
+  return (int16x8_t)__builtin_neon_vcvtsv8hf (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtq_u16_f16 (float16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vcvtuv8hf (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvta_s16_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vcvtasv4hf (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvta_u16_f16 (float16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vcvtauv4hf (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtaq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vcvtasv8hf (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtaq_u16_f16 (float16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vcvtauv8hf (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvtm_s16_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vcvtmsv4hf (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvtm_u16_f16 (float16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vcvtmuv4hf (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtmq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vcvtmsv8hf (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtmq_u16_f16 (float16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vcvtmuv8hf (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvtn_s16_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vcvtnsv4hf (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvtn_u16_f16 (float16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vcvtnuv4hf (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtnq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vcvtnsv8hf (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtnq_u16_f16 (float16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vcvtnuv8hf (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvtp_s16_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vcvtpsv4hf (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvtp_u16_f16 (float16x4_t __a)
+{
+  return (uint16x4_t)__builtin_neon_vcvtpuv4hf (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtpq_s16_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vcvtpsv8hf (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtpq_u16_f16 (float16x8_t __a)
+{
+  return (uint16x8_t)__builtin_neon_vcvtpuv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vcvt_n_f16_s16 (int16x4_t __a, const int __b)
+{
+  return __builtin_neon_vcvts_nv4hi (__a, __b);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vcvt_n_f16_u16 (uint16x4_t __a, const int __b)
+{
+  return __builtin_neon_vcvtu_nv4hi ((int16x4_t)__a, __b);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vcvtq_n_f16_s16 (int16x8_t __a, const int __b)
+{
+  return __builtin_neon_vcvts_nv8hi (__a, __b);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vcvtq_n_f16_u16 (uint16x8_t __a, const int __b)
+{
+  return __builtin_neon_vcvtu_nv8hi ((int16x8_t)__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcvt_n_s16_f16 (float16x4_t __a, const int __b)
+{
+  return __builtin_neon_vcvts_nv4hf (__a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcvt_n_u16_f16 (float16x4_t __a, const int __b)
+{
+  return (uint16x4_t)__builtin_neon_vcvtu_nv4hf (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcvtq_n_s16_f16 (float16x8_t __a, const int __b)
+{
+  return __builtin_neon_vcvts_nv8hf (__a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcvtq_n_u16_f16 (float16x8_t __a, const int __b)
+{
+  return (uint16x8_t)__builtin_neon_vcvtu_nv8hf (__a, __b);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vfma_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c)
+{
+  return __builtin_neon_vfmav4hf (__a, __b, __c);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vfmaq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
+{
+  return __builtin_neon_vfmav8hf (__a, __b, __c);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vfms_f16 (float16x4_t __a, float16x4_t __b, float16x4_t __c)
+{
+  return __builtin_neon_vfmsv4hf (__a, __b, __c);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vfmsq_f16 (float16x8_t __a, float16x8_t __b, float16x8_t __c)
+{
+  return __builtin_neon_vfmsv8hf (__a, __b, __c);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmax_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vmaxfv4hf (__a, __b);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vmaxq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vmaxfv8hf (__a, __b);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmaxnm_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vmaxnmv4hf (__a, __b);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vmaxnmq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vmaxnmv8hf (__a, __b);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmin_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vminfv4hf (__a, __b);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vminq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vminfv8hf (__a, __b);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vminnm_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vminnmv4hf (__a, __b);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vminnmq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vminnmv8hf (__a, __b);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmul_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vmulfv4hf (__a, __b);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmul_lane_f16 (float16x4_t __a, float16x4_t __b, const int __c)
+{
+  return __builtin_neon_vmul_lanev4hf (__a, __b, __c);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vmul_n_f16 (float16x4_t __a, float16_t __b)
+{
+  return __builtin_neon_vmul_nv4hf (__a, __b);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vmulq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vmulfv8hf (__a, __b);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vmulq_lane_f16 (float16x8_t __a, float16x4_t __b, const int __c)
+{
+  return __builtin_neon_vmul_lanev8hf (__a, __b, __c);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vmulq_n_f16 (float16x8_t __a, float16_t __b)
+{
+  return __builtin_neon_vmul_nv8hf (__a, __b);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vneg_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vnegv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vnegq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vnegv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vpadd_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vpaddv4hf (__a, __b);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vpmax_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vpmaxfv4hf (__a, __b);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vpmin_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vpminfv4hf (__a, __b);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrecpe_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrecpev4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrecpeq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrecpev8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrnd_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrndv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrndv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrnda_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrndav4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndaq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrndav8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrndm_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrndmv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndmq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrndmv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrndn_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrndnv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndnq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrndnv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrndp_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrndpv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndpq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrndpv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrndx_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vrndxv4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrndxq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vrndxv8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vsqrte_f16 (float16x4_t __a)
+{
+  return __builtin_neon_vsqrtev4hf (__a);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vsqrteq_f16 (float16x8_t __a)
+{
+  return __builtin_neon_vsqrtev8hf (__a);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrecps_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vrecpsv4hf (__a, __b);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrecpsq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vrecpsv8hf (__a, __b);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vrsqrts_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vrsqrtsv4hf (__a, __b);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vrsqrtsq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vrsqrtsv8hf (__a, __b);
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vsub_f16 (float16x4_t __a, float16x4_t __b)
+{
+  return __builtin_neon_vsubv4hf (__a, __b);
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vsubq_f16 (float16x8_t __a, float16x8_t __b)
+{
+  return __builtin_neon_vsubv8hf (__a, __b);
+}
+
+#endif /* __ARM_FEATURE_VECTOR_FP16_ARITHMETIC.  */
+#pragma GCC pop_options
+
   /* Half-precision data processing intrinsics.  */
 #if defined (__ARM_FP16_FORMAT_IEEE) || defined (__ARM_FP16_FORMAT_ALTERNATIVE)
 
-- 
2.1.4