diff mbox

[11/15,AArch64] vreinterpret(q?), vget_(low|high), vld1(q?)_dup

Message ID 55B766DE.80008@arm.com
State New
Headers show

Commit Message

Alan Lawrence July 28, 2015, 11:26 a.m. UTC
gcc/ChangeLog:

	* config/aarch64/arm_neon.h (vreinterpret_p8_f16, vreinterpret_p16_f16,
	vreinterpret_f16_f64, vreinterpret_f16_s8, vreinterpret_f16_s16,
	vreinterpret_f16_s32, vreinterpret_f16_s64, vreinterpret_f16_f32,
	vreinterpret_f16_u8, vreinterpret_f16_u16, vreinterpret_f16_u32,
	vreinterpret_f16_u64, vreinterpret_f16_p8, vreinterpret_f16_p16,
	vreinterpretq_f16_f64, vreinterpretq_f16_s8, vreinterpretq_f16_s16,
	vreinterpretq_f16_s32, vreinterpretq_f16_s64, vreinterpretq_f16_f32,
	vreinterpretq_f16_u8, vreinterpretq_f16_u16, vreinterpretq_f16_u32,
	vreinterpretq_f16_u64, vreinterpretq_f16_p8, vreinterpretq_f16_p16,
	vreinterpret_f32_f16, vreinterpret_f64_f16, vreinterpret_s64_f16,
	vreinterpret_u64_f16, vreinterpretq_u64_f16, vreinterpret_s8_f16,
	vreinterpret_s16_f16, vreinterpret_s32_f16, vreinterpret_u8_f16,
	vreinterpret_u16_f16, vreinterpret_u32_f16, vreinterpretq_p8_f16,
	vreinterpretq_p16_f16, vreinterpretq_f32_f16, vreinterpretq_f64_f16,
	vreinterpretq_s64_f16, vreinterpretq_s8_f16, vreinterpretq_s16_f16,
	vreinterpretq_s32_f16, vreinterpretq_u8_f16, vreinterpretq_u16_f16,
	vreinterpretq_u32_f16, vget_low_f16, vget_high_f16, vld1_dup_f16,
	vld1q_dup_f16): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/vget_high_1.c: Add float16x8->float16x4 case.
	* gcc.target/aarch64/vget_low_1.c: Likewise.

Comments

James Greenhalgh July 29, 2015, 12:32 p.m. UTC | #1
On Tue, Jul 28, 2015 at 12:26:22PM +0100, Alan Lawrence wrote:
> gcc/ChangeLog:
> 
> 	* config/aarch64/arm_neon.h (vreinterpret_p8_f16, vreinterpret_p16_f16,
> 	vreinterpret_f16_f64, vreinterpret_f16_s8, vreinterpret_f16_s16,
> 	vreinterpret_f16_s32, vreinterpret_f16_s64, vreinterpret_f16_f32,
> 	vreinterpret_f16_u8, vreinterpret_f16_u16, vreinterpret_f16_u32,
> 	vreinterpret_f16_u64, vreinterpret_f16_p8, vreinterpret_f16_p16,
> 	vreinterpretq_f16_f64, vreinterpretq_f16_s8, vreinterpretq_f16_s16,
> 	vreinterpretq_f16_s32, vreinterpretq_f16_s64, vreinterpretq_f16_f32,
> 	vreinterpretq_f16_u8, vreinterpretq_f16_u16, vreinterpretq_f16_u32,
> 	vreinterpretq_f16_u64, vreinterpretq_f16_p8, vreinterpretq_f16_p16,
> 	vreinterpret_f32_f16, vreinterpret_f64_f16, vreinterpret_s64_f16,
> 	vreinterpret_u64_f16, vreinterpretq_u64_f16, vreinterpret_s8_f16,
> 	vreinterpret_s16_f16, vreinterpret_s32_f16, vreinterpret_u8_f16,
> 	vreinterpret_u16_f16, vreinterpret_u32_f16, vreinterpretq_p8_f16,
> 	vreinterpretq_p16_f16, vreinterpretq_f32_f16, vreinterpretq_f64_f16,
> 	vreinterpretq_s64_f16, vreinterpretq_s8_f16, vreinterpretq_s16_f16,
> 	vreinterpretq_s32_f16, vreinterpretq_u8_f16, vreinterpretq_u16_f16,
> 	vreinterpretq_u32_f16, vget_low_f16, vget_high_f16, vld1_dup_f16,
> 	vld1q_dup_f16): New.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/aarch64/vget_high_1.c: Add float16x8->float16x4 case.
> 	* gcc.target/aarch64/vget_low_1.c: Likewise.

> @@ -14871,6 +15171,13 @@ vld1q_u64 (const uint64_t *a)
>  
>  /* vld1_dup  */
>  
> +__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
> +vld1_dup_f16 (const float16_t* __a)
> +{
> +  float16_t __f = *__a;
> +  return (float16x4_t) { __f, __f, __f, __f };
> +}
> +
>  __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
>  vld1_dup_f32 (const float32_t* __a)
>  {
> @@ -14945,6 +15252,13 @@ vld1_dup_u64 (const uint64_t* __a)
>  
>  /* vld1q_dup  */
>  
> +__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
> +vld1q_dup_f16 (const float16_t* __a)
> +{
> +  float16_t __f = *__a;
> +  return (float16x8_t) { __f, __f, __f, __f, __f, __f, __f, __f };
> +}
> +

Did you check that these actually emit the expected instruction?

Applying your patch set I see some fairly unpleasant code generation,
but I might have made an error, or perhaps you have another patch in
waiting?

Thanks,
James
Alan Lawrence Aug. 24, 2015, 9:17 a.m. UTC | #2
James Greenhalgh wrote:
> 
> Did you check that these actually emit the expected instruction?
> 
> Applying your patch set I see some fairly unpleasant code generation,
> but I might have made an error, or perhaps you have another patch in
> waiting?
> 
> Thanks,
> James
> 

Yes, you are right, some of the code generation here is a bit grotty. However I 
think we should go for correctness and tests first, performance second. (There 
will be some tweaks to the iterators in aarch64-simd.md.)

--Alan
James Greenhalgh Sept. 4, 2015, 9:54 a.m. UTC | #3
On Mon, Aug 24, 2015 at 10:17:19AM +0100, Alan Lawrence wrote:
> James Greenhalgh wrote:
> > 
> > Did you check that these actually emit the expected instruction?
> > 
> > Applying your patch set I see some fairly unpleasant code generation,
> > but I might have made an error, or perhaps you have another patch in
> > waiting?
> > 
> > Thanks,
> > James
> > 
> 
> Yes, you are right, some of the code generation here is a bit grotty. However
> I think we should go for correctness and tests first, performance second.
> (There will be some tweaks to the iterators in aarch64-simd.md.)

In that case, these should be implemented as inline assembly blocks. As it
stands, the code generation for these intrinsics will be very poor with this
patch applied.

I'm going to hold off OKing this until I see a follow-up to fix the code
generation, either replacing those particular intrinsics with inline asm,
or doing the more comprehensive fix in the back-end.

Thanks,
James
James Greenhalgh Sept. 8, 2015, 8:26 a.m. UTC | #4
On Tue, Jul 28, 2015 at 12:26:22PM +0100, Alan Lawrence wrote:
> gcc/ChangeLog:
> 
> 	* config/aarch64/arm_neon.h (vreinterpret_p8_f16, vreinterpret_p16_f16,
> 	vreinterpret_f16_f64, vreinterpret_f16_s8, vreinterpret_f16_s16,
> 	vreinterpret_f16_s32, vreinterpret_f16_s64, vreinterpret_f16_f32,
> 	vreinterpret_f16_u8, vreinterpret_f16_u16, vreinterpret_f16_u32,
> 	vreinterpret_f16_u64, vreinterpret_f16_p8, vreinterpret_f16_p16,
> 	vreinterpretq_f16_f64, vreinterpretq_f16_s8, vreinterpretq_f16_s16,
> 	vreinterpretq_f16_s32, vreinterpretq_f16_s64, vreinterpretq_f16_f32,
> 	vreinterpretq_f16_u8, vreinterpretq_f16_u16, vreinterpretq_f16_u32,
> 	vreinterpretq_f16_u64, vreinterpretq_f16_p8, vreinterpretq_f16_p16,
> 	vreinterpret_f32_f16, vreinterpret_f64_f16, vreinterpret_s64_f16,
> 	vreinterpret_u64_f16, vreinterpretq_u64_f16, vreinterpret_s8_f16,
> 	vreinterpret_s16_f16, vreinterpret_s32_f16, vreinterpret_u8_f16,
> 	vreinterpret_u16_f16, vreinterpret_u32_f16, vreinterpretq_p8_f16,
> 	vreinterpretq_p16_f16, vreinterpretq_f32_f16, vreinterpretq_f64_f16,
> 	vreinterpretq_s64_f16, vreinterpretq_s8_f16, vreinterpretq_s16_f16,
> 	vreinterpretq_s32_f16, vreinterpretq_u8_f16, vreinterpretq_u16_f16,
> 	vreinterpretq_u32_f16, vget_low_f16, vget_high_f16, vld1_dup_f16,
> 	vld1q_dup_f16): New.
> 
> gcc/testsuite/ChangeLog:
> 
> 	* gcc.target/aarch64/vget_high_1.c: Add float16x8->float16x4 case.
> 	* gcc.target/aarch64/vget_low_1.c: Likewise.


OK,

Thanks,
James
diff mbox

Patch

commit beb21a6bce76d4fbedb13fcf25796563b27f6bae
Author: Alan Lawrence <alan.lawrence@arm.com>
Date:   Mon Jun 29 18:46:49 2015 +0100

    [AArch64 5/N v2] vreinterpret, vget_(low|high), vld1(q?)_dup. update tests for vget_low/high

diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index b915754..ff1a45c 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -2891,6 +2891,12 @@  vgetq_lane_u64 (uint64x2_t __a, const int __b)
 /* vreinterpret  */
 
 __extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_f16 (float16x4_t __a)
+{
+  return (poly8x8_t) __a;
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
 vreinterpret_p8_f64 (float64x1_t __a)
 {
   return (poly8x8_t) __a;
@@ -2987,6 +2993,12 @@  vreinterpretq_p8_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_f16 (float16x8_t __a)
+{
+  return (poly8x16_t) __a;
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_p8_f32 (float32x4_t __a)
 {
   return (poly8x16_t) __a;
@@ -3023,6 +3035,12 @@  vreinterpretq_p8_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_f16 (float16x4_t __a)
+{
+  return (poly16x4_t) __a;
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
 vreinterpret_p16_f64 (float64x1_t __a)
 {
   return (poly16x4_t) __a;
@@ -3119,6 +3137,12 @@  vreinterpretq_p16_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_f16 (float16x8_t __a)
+{
+  return (poly16x8_t) __a;
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_p16_f32 (float32x4_t __a)
 {
   return (poly16x8_t) __a;
@@ -3154,6 +3178,156 @@  vreinterpretq_p16_p8 (poly8x16_t __a)
   return (poly16x8_t) __a;
 }
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_f64 (float64x1_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s8 (int8x8_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s16 (int16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s32 (int32x2_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_s64 (int64x1_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_f32 (float32x2_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u8 (uint8x8_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u16 (uint16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u32 (uint32x2_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_u64 (uint64x1_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_p8 (poly8x8_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vreinterpret_f16_p16 (poly16x4_t __a)
+{
+  return (float16x4_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_f64 (float64x2_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s8 (int8x16_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s16 (int16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s32 (int32x4_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_s64 (int64x2_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_f32 (float32x4_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u8 (uint8x16_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u16 (uint16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u32 (uint32x4_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_u64 (uint64x2_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p8 (poly8x16_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_f16_p16 (poly16x8_t __a)
+{
+  return (float16x8_t) __a;
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_f16 (float16x4_t __a)
+{
+  return (float32x2_t) __a;
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vreinterpret_f32_f64 (float64x1_t __a)
 {
@@ -3221,6 +3395,12 @@  vreinterpret_f32_p16 (poly16x4_t __a)
 }
 
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_f16 (float16x8_t __a)
+{
+  return (float32x4_t) __a;
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_f32_f64 (float64x2_t __a)
 {
   return (float32x4_t) __a;
@@ -3287,6 +3467,12 @@  vreinterpretq_f32_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline float64x1_t __attribute__((__always_inline__))
+vreinterpret_f64_f16 (float16x4_t __a)
+{
+  return (float64x1_t) __a;
+}
+
+__extension__ static __inline float64x1_t __attribute__((__always_inline__))
 vreinterpret_f64_f32 (float32x2_t __a)
 {
   return (float64x1_t) __a;
@@ -3353,6 +3539,12 @@  vreinterpret_f64_u64 (uint64x1_t __a)
 }
 
 __extension__ static __inline float64x2_t __attribute__((__always_inline__))
+vreinterpretq_f64_f16 (float16x8_t __a)
+{
+  return (float64x2_t) __a;
+}
+
+__extension__ static __inline float64x2_t __attribute__((__always_inline__))
 vreinterpretq_f64_f32 (float32x4_t __a)
 {
   return (float64x2_t) __a;
@@ -3419,6 +3611,12 @@  vreinterpretq_f64_u64 (uint64x2_t __a)
 }
 
 __extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_f16 (float16x4_t __a)
+{
+  return (int64x1_t) __a;
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
 vreinterpret_s64_f64 (float64x1_t __a)
 {
   return (int64x1_t) __a;
@@ -3509,6 +3707,12 @@  vreinterpretq_s64_s32 (int32x4_t __a)
 }
 
 __extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_f16 (float16x8_t __a)
+{
+  return (int64x2_t) __a;
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
 vreinterpretq_s64_f32 (float32x4_t __a)
 {
   return (int64x2_t) __a;
@@ -3551,6 +3755,12 @@  vreinterpretq_s64_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_f16 (float16x4_t __a)
+{
+  return (uint64x1_t) __a;
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
 vreinterpret_u64_f64 (float64x1_t __a)
 {
   return (uint64x1_t) __a;
@@ -3647,6 +3857,12 @@  vreinterpretq_u64_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_f16 (float16x8_t __a)
+{
+  return (uint64x2_t) __a;
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
 vreinterpretq_u64_f32 (float32x4_t __a)
 {
   return (uint64x2_t) __a;
@@ -3683,6 +3899,12 @@  vreinterpretq_u64_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_f16 (float16x4_t __a)
+{
+  return (int8x8_t) __a;
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
 vreinterpret_s8_f64 (float64x1_t __a)
 {
   return (int8x8_t) __a;
@@ -3773,6 +3995,12 @@  vreinterpretq_s8_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_f16 (float16x8_t __a)
+{
+  return (int8x16_t) __a;
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_s8_f32 (float32x4_t __a)
 {
   return (int8x16_t) __a;
@@ -3815,6 +4043,12 @@  vreinterpretq_s8_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_f16 (float16x4_t __a)
+{
+  return (int16x4_t) __a;
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
 vreinterpret_s16_f64 (float64x1_t __a)
 {
   return (int16x4_t) __a;
@@ -3905,6 +4139,12 @@  vreinterpretq_s16_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_f16 (float16x8_t __a)
+{
+  return (int16x8_t) __a;
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_s16_f32 (float32x4_t __a)
 {
   return (int16x8_t) __a;
@@ -3947,6 +4187,12 @@  vreinterpretq_s16_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_f16 (float16x4_t __a)
+{
+  return (int32x2_t) __a;
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
 vreinterpret_s32_f64 (float64x1_t __a)
 {
   return (int32x2_t) __a;
@@ -4037,6 +4283,12 @@  vreinterpretq_s32_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_f16 (float16x8_t __a)
+{
+  return (int32x4_t) __a;
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_s32_f32 (float32x4_t __a)
 {
   return (int32x4_t) __a;
@@ -4079,6 +4331,12 @@  vreinterpretq_s32_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_f16 (float16x4_t __a)
+{
+  return (uint8x8_t) __a;
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
 vreinterpret_u8_f64 (float64x1_t __a)
 {
   return (uint8x8_t) __a;
@@ -4175,6 +4433,12 @@  vreinterpretq_u8_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_f16 (float16x8_t __a)
+{
+  return (uint8x16_t) __a;
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
 vreinterpretq_u8_f32 (float32x4_t __a)
 {
   return (uint8x16_t) __a;
@@ -4211,6 +4475,12 @@  vreinterpretq_u8_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_f16 (float16x4_t __a)
+{
+  return (uint16x4_t) __a;
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
 vreinterpret_u16_f64 (float64x1_t __a)
 {
   return (uint16x4_t) __a;
@@ -4307,6 +4577,12 @@  vreinterpretq_u16_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_f16 (float16x8_t __a)
+{
+  return (uint16x8_t) __a;
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
 vreinterpretq_u16_f32 (float32x4_t __a)
 {
   return (uint16x8_t) __a;
@@ -4343,6 +4619,12 @@  vreinterpretq_u16_p16 (poly16x8_t __a)
 }
 
 __extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_f16 (float16x4_t __a)
+{
+  return (uint32x2_t) __a;
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
 vreinterpret_u32_f64 (float64x1_t __a)
 {
   return (uint32x2_t) __a;
@@ -4439,6 +4721,12 @@  vreinterpretq_u32_s64 (int64x2_t __a)
 }
 
 __extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_f16 (float16x8_t __a)
+{
+  return (uint32x4_t) __a;
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
 vreinterpretq_u32_f32 (float32x4_t __a)
 {
   return (uint32x4_t) __a;
@@ -4639,6 +4927,12 @@  vsetq_lane_u64 (uint64_t __elem, uint64x2_t __vec, const int __index)
   uint64x1_t lo = vcreate_u64 (vgetq_lane_u64 (tmp, 0));  \
   return vreinterpret_##__TYPE##_u64 (lo);
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vget_low_f16 (float16x8_t __a)
+{
+  __GET_LOW (f16);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vget_low_f32 (float32x4_t __a)
 {
@@ -4718,6 +5012,12 @@  vget_low_u64 (uint64x2_t __a)
   uint64x1_t hi = vcreate_u64 (vgetq_lane_u64 (tmp, 1));	\
   return vreinterpret_##__TYPE##_u64 (hi);
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vget_high_f16 (float16x8_t __a)
+{
+  __GET_HIGH (f16);
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vget_high_f32 (float32x4_t __a)
 {
@@ -14871,6 +15171,13 @@  vld1q_u64 (const uint64_t *a)
 
 /* vld1_dup  */
 
+__extension__ static __inline float16x4_t __attribute__ ((__always_inline__))
+vld1_dup_f16 (const float16_t* __a)
+{
+  float16_t __f = *__a;
+  return (float16x4_t) { __f, __f, __f, __f };
+}
+
 __extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
 vld1_dup_f32 (const float32_t* __a)
 {
@@ -14945,6 +15252,13 @@  vld1_dup_u64 (const uint64_t* __a)
 
 /* vld1q_dup  */
 
+__extension__ static __inline float16x8_t __attribute__ ((__always_inline__))
+vld1q_dup_f16 (const float16_t* __a)
+{
+  float16_t __f = *__a;
+  return (float16x8_t) { __f, __f, __f, __f, __f, __f, __f, __f };
+}
+
 __extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
 vld1q_dup_f32 (const float32_t* __a)
 {
diff --git a/gcc/testsuite/gcc.target/aarch64/vget_high_1.c b/gcc/testsuite/gcc.target/aarch64/vget_high_1.c
index 4cb872d..b6b57e0 100644
--- a/gcc/testsuite/gcc.target/aarch64/vget_high_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vget_high_1.c
@@ -14,6 +14,7 @@  VARIANT (int8_t, 8, int8x8_t, int8x16_t, s8)		\
 VARIANT (int16_t, 4, int16x4_t, int16x8_t, s16)		\
 VARIANT (int32_t, 2, int32x2_t, int32x4_t, s32)		\
 VARIANT (int64_t, 1, int64x1_t, int64x2_t, s64)		\
+VARIANT (float16_t, 4, float16x4_t, float16x8_t, f16)	\
 VARIANT (float32_t, 2, float32x2_t, float32x4_t, f32)	\
 VARIANT (float64_t, 1, float64x1_t, float64x2_t, f64)
 
@@ -51,6 +52,8 @@  main (int argc, char **argv)
   int16_t int16_t_data[8] = { -17, 19, 3, -999, 44048, 505, 9999, 1000};
   int32_t int32_t_data[4] = { 123456789, -987654321, -135792468, 975318642 };
   int64_t int64_t_data[2] = {0xfedcba9876543210LL, 0xdeadbabecafebeefLL };
+  float16_t float16_t_data[8] = { 1.25, 4.5, 7.875, 2.3125, 5.675, 8.875,
+      3.6875, 6.75};
   float32_t float32_t_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
   float64_t float64_t_data[2] = { 1.01001000100001, 12345.6789 };
 
diff --git a/gcc/testsuite/gcc.target/aarch64/vget_low_1.c b/gcc/testsuite/gcc.target/aarch64/vget_low_1.c
index f8016ef..2223676 100644
--- a/gcc/testsuite/gcc.target/aarch64/vget_low_1.c
+++ b/gcc/testsuite/gcc.target/aarch64/vget_low_1.c
@@ -14,6 +14,7 @@  VARIANT (int8_t, 8, int8x8_t, int8x16_t, s8)		\
 VARIANT (int16_t, 4, int16x4_t, int16x8_t, s16)		\
 VARIANT (int32_t, 2, int32x2_t, int32x4_t, s32)		\
 VARIANT (int64_t, 1, int64x1_t, int64x2_t, s64)		\
+VARIANT (float16_t, 4, float16x4_t, float16x8_t, f16)	\
 VARIANT (float32_t, 2, float32x2_t, float32x4_t, f32)	\
 VARIANT (float64_t, 1, float64x1_t, float64x2_t, f64)
 
@@ -51,6 +52,8 @@  main (int argc, char **argv)
   int16_t int16_t_data[8] = { -17, 19, 3, -999, 44048, 505, 9999, 1000};
   int32_t int32_t_data[4] = { 123456789, -987654321, -135792468, 975318642 };
   int64_t int64_t_data[2] = {0xfedcba9876543210LL, 0xdeadbabecafebeefLL };
+  float16_t float16_t_data[8] = { 1.25, 4.5, 7.875, 2.3125, 5.675, 8.875,
+      3.6875, 6.75};
   float32_t float32_t_data[4] = { 3.14159, 2.718, 1.414, 100.0 };
   float64_t float64_t_data[2] = { 1.01001000100001, 12345.6789 };